Web Scraping Google Finance Markets in Python

Web Scraping Google Finance Markets in Python

What will be scraped

image

Full Code

import requests
import json
import re
import argparse
from parsel import Selector

# Command-line flags: each boolean switch selects one Google Finance
# markets page to scrape. action="store_true" stores True when the flag
# is present and False otherwise, so no value needs to follow the flag.
parser = argparse.ArgumentParser(prog="Google Finance Markets Options")

_FLAGS = (
    ("-i", "--indexes"),
    ("-ma", "--most-active"),
    ("-g", "--gainers"),
    ("-l", "--losers"),
    ("-cl", "--climate-leaders"),
    ("-cc", "--crypto"),
    ("-c", "--currency"),
)
for short_flag, long_flag in _FLAGS:
    parser.add_argument(short_flag, long_flag, action="store_true")

args = parser.parse_args()

def main():
    """Fetch the Google Finance markets page selected by the command-line
    flag and return the data extracted by ``parser()``.

    Returns None when no flag was supplied.
    """
    # https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
    # https://www.whatismybrowser.com/detect/what-is-my-user-agent
    # A browser-like User-Agent reduces the chance of Google blocking the
    # default "python-requests" UA.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36"
    }

    # argparse dest -> URL path segment. Replaces seven copy-pasted
    # if-blocks that differed only in the attribute name and URL suffix.
    # Dict order preserves the original first-match-wins priority.
    pages = {
        "indexes": "indexes",
        "most_active": "most-active",
        "gainers": "gainers",
        "losers": "losers",
        "climate_leaders": "climate-leaders",
        "crypto": "cryptocurrencies",
        "currency": "currencies",
    }

    for flag, slug in pages.items():
        if getattr(args, flag):
            html = requests.get(
                f"https://www.google.com/finance/markets/{slug}",
                headers=headers,
                timeout=30,  # stop waiting for a response after 30 seconds
            )
            # NOTE: the module-level name "parser" (the ArgumentParser) is
            # rebound by the "def parser" below, so this calls the parsing
            # function, not the ArgumentParser object.
            return parser(html=html)

    return None  # no flag supplied


def parser(html):
    selector = Selector(text=html.text)
    stock_topic = selector.css(".Mrksgc::text").get().split("on ")[1].replace(" ", "_")

    data = {
        f"{stock_topic}_trends": [],
        f"{stock_topic}_discover_more": [],
        f"{stock_topic}_news": []
    }

    # news ressults
    for index, news_results in enumerate(selector.css(".yY3Lee"), start=1):
        data[f"{stock_topic}_news"].append({
            "position": index,
            "title": news_results.css(".mRjSYb::text").get(),
            "source": news_results.css(".sfyJob::text").get(),
            "date": news_results.css(".Adak::text").get(),
            "image": news_results.css("img::attr(src)").get(),
        })

    # stocks table
    for index, stock_results in enumerate(selector.css("li a"), start=1):
        current_percent_change_raw_value = stock_results.css("[jsname=Fe7oBc]::attr(aria-label)").get()
        current_percent_change = re.search(r"\d+\.\d+%", stock_results.css("[jsname=Fe7oBc]::attr(aria-label)").get()).group()

        # ./quote/SNAP:NASDAQ -> SNAP:NASDAQ
        quote = stock_results.attrib["href"].replace("./quote/", "")

        data[f"{stock_topic}_trends"].append({
            "position": index,
            "title": stock_results.css(".ZvmM7::text").get(),
            "quote": stock_results.css(".COaKTb::text").get(),
            # "https://www.google.com/finance/MSFT:NASDAQ"
            "quote_link": f"https://www.google.com/finance/{quote}",
            "price_change": stock_results.css(".SEGxAb .P2Luy::text").get(),
            "percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
        })

    # "you may be interested in" at the bottom of the page
    for index, interested_bottom in enumerate(selector.css(".HDXgAf .tOzDHb"), start=1):
        current_percent_change_raw_value = interested_bottom.css("[jsname=Fe7oBc]::attr(aria-label)").get()
        current_percent_change = re.search(r"\d+\.\d+%", interested_bottom.css("[jsname=Fe7oBc]::attr(aria-label)").get()).group()

        quote = stock_results.attrib["href"].replace("./quote/", "")

        data[f"{stock_topic}_discover_more"].append({
            "position": index,
            "quote": interested_bottom.css(".COaKTb::text").get(),
            "quote_link": f"https://www.google.com/finance{quote}",
            "title": interested_bottom.css(".RwFyvf::text").get(),
            "price": interested_bottom.css(".YMlKec::text").get(),
            "percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
        })

    return data



# Script entry point: scrape the page selected by the CLI flag and
# pretty-print the extracted data as JSON (non-ASCII kept readable).
if __name__ == "__main__":
    results = main()
    print(json.dumps(results, indent=2, ensure_ascii=False))

Prerequisites

Install libraries:

pip install requests parsel

Basic knowledge scraping with CSS selectors

CSS selectors declare which part of the markup a style applies to, thus allowing you to extract data from matching tags and attributes.

If you haven't scraped with CSS selectors, there's a dedicated blog post of mine about how to use CSS selectors when web-scraping that covers what it is, pros and cons, and why they matter from a web-scraping perspective.

Separate virtual environment

In short, it's a thing that creates an independent set of installed libraries including different Python versions that can coexist with each other in the same system thus preventing libraries or Python version conflicts.

If you didn't work with a virtual environment before, have a look at the dedicated Python virtual environments tutorial using Virtualenv and Poetry blog post of mine to get a little bit more familiar.

📌Note: this is not a strict requirement for this blog.

Reduce the chance of being blocked

There's a chance that a request might be blocked. Have a look at how to reduce the chance of being blocked while web-scraping, there are eleven methods to bypass blocks from most websites.

Code Explanation

Import libraries:

import requests
import json
import re
import argparse
from parsel import Selector
LibraryPurpose
requeststo make a request to the website.
jsonto convert extracted data to a JSON object.
reto extract parts of the data via regular expression.
argparseto create and parse command-line arguments.
parselto parse data from HTML/XML documents. Similar to BeautifulSoup but supports XPath.

Firstly, if we need to parse data by typing command-line arguments without the need to activate certain functions in the code in order to extract specific types of results, for example crypto, gainers or losers, we can do it with argparse built-in library by creating command-line arguments:

parser = argparse.ArgumentParser(prog="Google Finance Markets Options")
parser.add_argument('-i','--indexes', action="store_true")
parser.add_argument('-ma','--most-active', action="store_true")
parser.add_argument('-g','--gainers', action="store_true")
parser.add_argument('-l','--losers', action="store_true")
parser.add_argument('-cl','--climate-leaders', action="store_true")
parser.add_argument('-cc','--crypto', action="store_true")
parser.add_argument('-c','--currency', action="store_true")

args = parser.parse_args()

Then we can run the script something like so:

$ python main.py -cc # will parse crypto results

Note if action="store_true" is not used, the result will be an error:

$ python main.py -cc

Google Finance Markets Options: error: argument -cc/--crypto: expected one argument

The action set to store_true will store the argument as True, if present. So if the argument is present, it will return some output.

We can also mark parameters as required, which means that a certain parameter must be passed when the script is run.

CodeExplanation
add_argumentDefines how a single command-line argument should be parsed.
parse_argsdetermines what objects are created by add_argument and how they are assigned. Returns the populated namespace.

The next step is to create function with all the command-line logic. You can access command-line arguments with dot notation:

def main():

    # https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
    # https://www.whatismybrowser.com/detect/what-is-my-user-agent
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36"
    }

    if args.indexes:
        html = requests.get("https://www.google.com/finance/markets/indexes", headers=headers, timeout=30)
        return parser(html=html)

    # ... other arguments logic
CodeExplanation
user-agentto act as a "real" user request from the browser by passing it to request headers. This is used to bypass blocks from Google as default requests user-agent is python-requests and websites understand that it's a bot that sends a request and might block it. Check what's your user-agent.
if args.indexeswill check if certain command-line argument is being passed.
timeout=30to tell requests to stop waiting for response after 30 seconds.
return parser(html=html)to return data from the parser() function to reduce the code size since every selector is identical to extract the data.

The next step is to make a parser function that will extract all the needed data from the page. The function requires an html argument that will be passed to parsel, then we need to define how the data will be structured once it's parsed:

def parser(html):
    selector = Selector(text=html.text)
    stock_topic = selector.css(".Mrksgc::text").get().split("on ")[1].replace(" ", "_")

    data = {
        f"{stock_topic}_trends": [],
        f"{stock_topic}_discover_more": [],
        f"{stock_topic}_news": []
    }
CodeExplanation
Selector(text=html.text)where passed HTML from the response will be processed by parsel.
text=is a parsel argument that accepts str object from where HTML nodes will be extracted.
css()to parse data from the passed CSS selector(s). Every CSS query translates to XPath using the cssselect package under the hood.
::text or ::attr(<attribute>)to extract textual or attribute data from the node.
get()to get actual data returned from parsel
split()to split a string into a list where each word is a list item
replace("<something>", "<with_something>")to replace something old with something new in a string.

After creating an empty dictionary structure, we need to fill it with news, stocks and other data by appending it, as a dict in this case:

# news results
for index, news_results in enumerate(selector.css(".yY3Lee"), start=1):
    data[f"{stock_topic}_news"].append({
        "position": index,
        "title": news_results.css(".mRjSYb::text").get(),
        "source": news_results.css(".sfyJob::text").get(),
        "date": news_results.css(".Adak::text").get(),
        "image": news_results.css("img::attr(src)").get(),
    })

# stocks table
for index, stock_results in enumerate(selector.css("li a"), start=1):
    current_percent_change_raw_value = stock_results.css("[jsname=Fe7oBc]::attr(aria-label)").get()
    current_percent_change = re.search(r"\d+\.\d+%", stock_results.css("[jsname=Fe7oBc]::attr(aria-label)").get()).group()

    # ./quote/SNAP:NASDAQ -> SNAP:NASDAQ
    quote = stock_results.attrib["href"].replace("./quote/", "")

    data[f"{stock_topic}_trends"].append({
        "position": index,
        "title": stock_results.css(".ZvmM7::text").get(),
        "quote": stock_results.css(".COaKTb::text").get(),
        # "https://www.google.com/finance/MSFT:NASDAQ"
        "quote_link": f"https://www.google.com/finance/{quote}",
        "price_change": stock_results.css(".SEGxAb .P2Luy::text").get(),
        "percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
    })

# "you may be interested in" at the bottom of the page
for index, interested_bottom in enumerate(selector.css(".HDXgAf .tOzDHb"), start=1):
    current_percent_change_raw_value = interested_bottom.css("[jsname=Fe7oBc]::attr(aria-label)").get()
    current_percent_change = re.search(r"\d+\.\d+%", interested_bottom.css("[jsname=Fe7oBc]::attr(aria-label)").get()).group()

    # ./quote/SNAP:NASDAQ -> SNAP:NASDAQ
    quote = stock_results.attrib["href"].replace("./quote/", "")

    data[f"{stock_topic}_discover_more"].append({
        "position": index,
        "quote": interested_bottom.css(".COaKTb::text").get(),
        "quote_link": f"https://www.google.com/finance{quote}",
        "title": interested_bottom.css(".RwFyvf::text").get(),
        "price": interested_bottom.css(".YMlKec::text").get(),
        "percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
    })
CodeExplanation
enumerate()to add a counter to an iterable and return it. start=1 will start counting from 1, instead from the default value of 0.
::text or ::attr(<attribute>)to extract textual or attribute data from the node.
data[f"{stock_topic}_news"]dynamically appends data as a dict to whatever value is being extracted by a stock_topic variable.
append()to append extracted data to the list as dictionary.
css()to parse data from the passed CSS selector(s). Every CSS query translates to XPath using the cssselect package under the hood.
get()to get actual data.
getall()to get a list of all matches.
[jsname=Fe7oBc]is a CSS selector that used to select elements with the specified attribute and value e.g. [attribute=value].
attrib["href"]is a parsel method of accessing node attribute. It returns dict for the first matched element. None if dict is empty.
replace("<something>", "<with_something>")to replace something old with something new in a string.
re.search()to match parts of the string and grab only digit values. group() to return matched string by a regular expression.

Return the data:

# data = {
#     f"{stock_topic}_trends": [],
#     f"{stock_topic}_discover_more": [],
#     f"{stock_topic}_news": []
# }

# extraction code...

return data

And finally, we need to specify that this is a runnable script for code readers:

if __name__ == "__main__":
    print(json.dumps(main(), indent=2, ensure_ascii=False))

Now you can run your script from the command-line:

$ python main.py -ma # most-active

You can also access help command -h to see available arguments like so:

$ python main.py -h
usage: Google Finance Markets Options [-h] [-i] [-ma] [-g] [-l]
                                      [-cl] [-cc] [-c]

optional arguments:
  -h, --help            show this help message and exit
  -i, --indexes
  -ma, --most-active
  -g, --gainers
  -l, --losers
  -cl, --climate-leaders
  -cc, --crypto
  -c, --currency

Full output:

{
  "most_active_trends": [
    {
      "position": 1,
      "title": "Advanced Micro Devices, Inc.",
      "quote": "AMD",
      "quote_link": "https://www.google.com/finance/AMD:NASDAQ",
      "price_change": "+$3.04",
      "percent_price_change": "+3.22%"
    }, ... other results
    {
      "position": 50,
      "title": "Freeport-McMoRan Inc",
      "quote": "FCX",
      "quote_link": "https://www.google.com/finance/FCX:NYSE",
      "price_change": "-$1.15",
      "percent_price_change": "-3.66%"
    }
  ],
  "most_active_discover_more": [
    {
      "position": 1,
      "quote": "Index",
      "quote_link": "https://www.google.com/financeFCX:NYSE",
      "title": "Dow Jones Industrial Average",
      "price": "32,772.36",
      "percent_price_change": "-0.22%"
    }, ... other results
    {
      "position": 18,
      "quote": "NFLX",
      "quote_link": "https://www.google.com/financeFCX:NYSE",
      "title": "Netflix Inc",
      "price": "$226.14",
      "percent_price_change": "+0.55%"
    }
  ],
  "most_active_news": [
    {
      "position": 1,
      "title": "Alibaba says will work to keep trading in U.S., Hong Kong after being added \nto SEC delisting risk list",
      "source": "CNBC",
      "date": "7 hours ago",
      "image": "https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcRMBjVDpAgK8AJP6gxfd89Kb5rz7th_s3ntTLA_WYWnVWT3Q05aQJTWpMpjcOg"
    }, ... other news results
    {
      "position": 6,
      "title": "Intel CEO: 'This is a time for a bit of austerity'",
      "source": "Yahoo Finance",
      "date": "4 hours ago",
      "image": "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcTxkwNmmHmcXqkF3-pa2Bl0SsCzdIJyB0jPdutL0vw9pV4sRkgy8BKemYIkEeg"
    }
  ]
}

Join us on Twitter | YouTube

Originally published at SerpApi: serpapi.com/blog/scrape-google-finance-mark..