Web Scraping Google Finance Markets in Python

Web Scraping Google Finance Markets in Python

What will be scraped

image

Full Code

import requests
import json
import re
import argparse
from parsel import Selector

# Command-line flags: each boolean switch selects one Google Finance
# markets page to scrape. action="store_true" stores True when the flag
# is present and False otherwise, so no value needs to follow the flag.
parser = argparse.ArgumentParser(prog="Google Finance Markets Options")

_FLAGS = (
    ("-i", "--indexes"),
    ("-ma", "--most-active"),
    ("-g", "--gainers"),
    ("-l", "--losers"),
    ("-cl", "--climate-leaders"),
    ("-cc", "--crypto"),
    ("-c", "--currency"),
)
for short_flag, long_flag in _FLAGS:
    parser.add_argument(short_flag, long_flag, action="store_true")

args = parser.parse_args()

def main():
    """Fetch the Google Finance markets page selected by the command-line
    flag and return the data extracted by ``parser()``.

    Returns None when no flag was supplied.
    """
    # https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
    # https://www.whatismybrowser.com/detect/what-is-my-user-agent
    # A browser-like User-Agent reduces the chance of Google blocking the
    # default "python-requests" UA.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36"
    }

    # argparse dest -> URL path segment. Replaces seven copy-pasted
    # if-blocks that differed only in the attribute name and URL suffix.
    # Dict order preserves the original first-match-wins priority.
    pages = {
        "indexes": "indexes",
        "most_active": "most-active",
        "gainers": "gainers",
        "losers": "losers",
        "climate_leaders": "climate-leaders",
        "crypto": "cryptocurrencies",
        "currency": "currencies",
    }

    for flag, slug in pages.items():
        if getattr(args, flag):
            html = requests.get(
                f"https://www.google.com/finance/markets/{slug}",
                headers=headers,
                timeout=30,  # stop waiting for a response after 30 seconds
            )
            # NOTE: the module-level name "parser" (the ArgumentParser) is
            # rebound by the "def parser" below, so this calls the parsing
            # function, not the ArgumentParser object.
            return parser(html=html)

    return None  # no flag supplied


def parser(html):
    selector = Selector(text=html.text)
    stock_topic = selector.css(".Mrksgc::text").get().split("on ")[1].replace(" ", "_")

    data = {
        f"{stock_topic}_trends": [],
        f"{stock_topic}_discover_more": [],
        f"{stock_topic}_news": []
    }

    # news ressults
    for index, news_results in enumerate(selector.css(".yY3Lee"), start=1):
        data[f"{stock_topic}_news"].append({
            "position": index,
            "title": news_results.css(".mRjSYb::text").get(),
            "source": news_results.css(".sfyJob::text").get(),
            "date": news_results.css(".Adak::text").get(),
            "image": news_results.css("img::attr(src)").get(),
        })

    # stocks table
    for index, stock_results in enumerate(selector.css("li a"), start=1):
        current_percent_change_raw_value = stock_results.css("[jsname=Fe7oBc]::attr(aria-label)").get()
        current_percent_change = re.search(r"\d+\.\d+%", stock_results.css("[jsname=Fe7oBc]::attr(aria-label)").get()).group()

        # ./quote/SNAP:NASDAQ -> SNAP:NASDAQ
        quote = stock_results.attrib["href"].replace("./quote/", "")

        data[f"{stock_topic}_trends"].append({
            "position": index,
            "title": stock_results.css(".ZvmM7::text").get(),
            "quote": stock_results.css(".COaKTb::text").get(),
            # "https://www.google.com/finance/MSFT:NASDAQ"
            "quote_link": f"https://www.google.com/finance/{quote}",
            "price_change": stock_results.css(".SEGxAb .P2Luy::text").get(),
            "percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
        })

    # "you may be interested in" at the bottom of the page
    for index, interested_bottom in enumerate(selector.css(".HDXgAf .tOzDHb"), start=1):
        current_percent_change_raw_value = interested_bottom.css("[jsname=Fe7oBc]::attr(aria-label)").get()
        current_percent_change = re.search(r"\d+\.\d+%", interested_bottom.css("[jsname=Fe7oBc]::attr(aria-label)").get()).group()

        quote = stock_results.attrib["href"].replace("./quote/", "")

        data[f"{stock_topic}_discover_more"].append({
            "position": index,
            "quote": interested_bottom.css(".COaKTb::text").get(),
            "quote_link": f"https://www.google.com/finance{quote}",
            "title": interested_bottom.css(".RwFyvf::text").get(),
            "price": interested_bottom.css(".YMlKec::text").get(),
            "percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
        })

    return data



# Script entry point: scrape the page selected by the CLI flag and
# pretty-print the extracted data as JSON (non-ASCII kept readable).
if __name__ == "__main__":
    results = main()
    print(json.dumps(results, indent=2, ensure_ascii=False))

Prerequisites

Install libraries:

pip install requests parsel

Basic knowledge scraping with CSS selectors

CSS selectors declare which part of the markup a style applies to, thus allowing you to extract data from matching tags and attributes.

If you haven't scraped with CSS selectors, there's a dedicated blog post of mine about how to use CSS selectors when web-scraping that covers what it is, pros and cons, and why they matter from a web-scraping perspective.

Separate virtual environment

In short, it's a thing that creates an independent set of installed libraries including different Python versions that can coexist with each other in the same system thus preventing libraries or Python version conflicts.

If you didn't work with a virtual environment before, have a look at the dedicated Python virtual environments tutorial using Virtualenv and Poetry blog post of mine to get a little bit more familiar.

📌Note: this is not a strict requirement for this blog.

Reduce the chance of being blocked

There's a chance that a request might be blocked. Have a look at how to reduce the chance of being blocked while web-scraping, there are eleven methods to bypass blocks from most websites.

Code Explanation

Import libraries:

import requests
import json
import re
import argparse
from parsel import Selector
LibraryPurpose
requeststo make a request to the website.
jsonto convert extracted data to a JSON object.
reto extract parts of the data via regular expression.
argparseto create and parse command-line arguments.
parselto parse data from HTML/XML documents. Similar to BeautifulSoup but supports XPath.

Firstly, if we need to parse data by typing command-line arguments without the need to activate certain functions in the code in order to extract specific types of results, for example crypto, gainers or losers, we can do it with argparse built-in library by creating command-line arguments:

parser = argparse.ArgumentParser(prog="Google Finance Markets Options")
parser.add_argument('-i','--indexes', action="store_true")
parser.add_argument('-ma','--most-active', action="store_true")
parser.add_argument('-g','--gainers', action="store_true")
parser.add_argument('-l','--losers', action="store_true")
parser.add_argument('-cl','--climate-leaders', action="store_true")
parser.add_argument('-cc','--crypto', action="store_true")
parser.add_argument('-c','--currency', action="store_true")

args = parser.parse_args()

Then we can run the script something like so:

$ python main.py -cc # will parse crypto results

Note if action="store_true" is not used, the result will be an error:

$ python main.py -cc

Google Finance Markets Options: error: argument -cc/--crypto: expected one argument

The action set to store_true will store the argument as True, if present. So if the argument is present, it will return some output.

We can also mark parameters as required, which means that a certain parameter must be passed when the script is run.

CodeExplanation
add_argumentDefines how a single command-line argument should be parsed.
parse_argsdetermines what objects are created by add_argument and how they are assigned. Returns the populated namespace.

The next step is to create function with all the command-line logic. You can access command-line arguments with dot notation:

def main():

    # https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
    # https://www.whatismybrowser.com/detect/what-is-my-user-agent
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36"
    }

    if args.indexes:
        html = requests.get("https://www.google.com/finance/markets/indexes", headers=headers, timeout=30)
        return parser(html=html)

    # ... other arguments logic
CodeExplanation
user-agentto act as a "real" user request from the browser by passing it to request headers. This is used to bypass blocks from Google as default requests user-agent is python-requests and websites understand that it's a bot that sends a request and might block it. Check what's your user-agent.
if args.indexeswill check if certain command-line argument is being passed.
timeout=30to tell requests to stop waiting for response after 30 seconds.
return parser(html=html)to return data from the parser() function to reduce the code size since every selector is identical to extract the data.

The next step is to make a parser function that will extract all the needed data from the page. The function requires an html argument that will be passed to parsel, then we need to define how the data will be structured once it's parsed:

def parser(html):
    selector = Selector(text=html.text)
    stock_topic = selector.css(".Mrksgc::text").get().split("on ")[1].replace(" ", "_")

    data = {
        f"{stock_topic}_trends": [],
        f"{stock_topic}_discover_more": [],
        f"{stock_topic}_news": []
    }
CodeExplanation
Selector(text=html.text)where passed HTML from the response will be processed by parsel.
text=is a parsel argument that accepts str object from where HTML nodes will be extracted.
css()to parse data from the passed CSS selector(s). Every CSS query translates to XPath using the cssselect package under the hood.
::text or ::attr(<attribute>)to extract textual or attribute data from the node.
get()to get actual data returned from parsel
split()to split a string into a list where each word is a list item
replace("<something>", "<with_something>")to replace something old with something new in a string.

After creating an empty dictionary structure, we need to fill it with news, stocks and other data by appending it, as a dict in this case:

# news results
for index, news_results in enumerate(selector.css(".yY3Lee"), start=1):
    data[f"{stock_topic}_news"].append({
        "position": index,
        "title": news_results.css(".mRjSYb::text").get(),
        "source": news_results.css(".sfyJob::text").get(),
        "date": news_results.css(".Adak::text").get(),
        "image": news_results.css("img::attr(src)").get(),
    })

# stocks table
for index, stock_results in enumerate(selector.css("li a"), start=1):
    current_percent_change_raw_value = stock_results.css("[jsname=Fe7oBc]::attr(aria-label)").get()
    current_percent_change = re.search(r"\d+\.\d+%", stock_results.css("[jsname=Fe7oBc]::attr(aria-label)").get()).group()

    # ./quote/SNAP:NASDAQ -> SNAP:NASDAQ
    quote = stock_results.attrib["href"].replace("./quote/", "")

    data[f"{stock_topic}_trends"].append({
        "position": index,
        "title": stock_results.css(".ZvmM7::text").get(),
        "quote": stock_results.css(".COaKTb::text").get(),
        # "https://www.google.com/finance/MSFT:NASDAQ"
        "quote_link": f"https://www.google.com/finance/{quote}",
        "price_change": stock_results.css(".SEGxAb .P2Luy::text").get(),
        "percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
    })

# "you may be interested in" at the bottom of the page
for index, interested_bottom in enumerate(selector.css(".HDXgAf .tOzDHb"), start=1):
    current_percent_change_raw_value = interested_bottom.css("[jsname=Fe7oBc]::attr(aria-label)").get()
    current_percent_change = re.search(r"\d+\.\d+%", interested_bottom.css("[jsname=Fe7oBc]::attr(aria-label)").get()).group()

    # ./quote/SNAP:NASDAQ -> SNAP:NASDAQ
    quote = stock_results.attrib["href"].replace("./quote/", "")

    data[f"{stock_topic}_discover_more"].append({
        "position": index,
        "quote": interested_bottom.css(".COaKTb::text").get(),
        "quote_link": f"https://www.google.com/finance{quote}",
        "title": interested_bottom.css(".RwFyvf::text").get(),
        "price": interested_bottom.css(".YMlKec::text").get(),
        "percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
    })
CodeExplanation
enumerate()to add a counter to an iterable and return it. start=1 will start counting from 1, instead from the default value of 0.
::text or ::attr(<attribute>)to extract textual or attribute data from the node.
data[f"{stock_topic}_news"]dynamically appends data as a dict to whatever value is being extracted by a stock_topic variable.
append()to append extracted data to the list as dictionary.
css()to parse data from the passed CSS selector(s). Every CSS query translates to XPath using the cssselect package under the hood.
get()to get actual data.
getall()to get a list of all matches.
[jsname=Fe7oBc]is a CSS selector that used to select elements with the specified attribute and value e.g. [attribute=value].
attrib["href"]is a parsel method of accessing node attribute. It returns dict for the first matched element. None if dict is empty.
replace("<something>", "<with_something>")to replace something old with something new in a string.
re.search()to match parts of the string and grab only digit values. group() to return matched string by a regular expression.

Return the data:

# data = {
#     f"{stock_topic}_trends": [],
#     f"{stock_topic}_discover_more": [],
#     f"{stock_topic}_news": []
# }

# extraction code...

return data

And finally, we need to specify that this is a runnable script for code readers:

if __name__ == "__main__":
    print(json.dumps(main(), indent=2, ensure_ascii=False))

Now you can run your script from the command-line:

$ python main.py -ma # most-active

You can also access help command -h to see available arguments like so:

$ python main.py -h
usage: Google Finance Markets Options [-h] [-i] [-ma] [-g] [-l]
                                      [-cl] [-cc] [-c]

optional arguments:
  -h, --help            show this help message and exit
  -i, --indexes
  -ma, --most-active
  -g, --gainers
  -l, --losers
  -cl, --climate-leaders
  -cc, --crypto
  -c, --currency

Full output:

{
  "most_active_trends": [
    {
      "position": 1,
      "title": "Advanced Micro Devices, Inc.",
      "quote": "AMD",
      "quote_link": "https://www.google.com/finance/AMD:NASDAQ",
      "price_change": "+$3.04",
      "percent_price_change": "+3.22%"
    }, ... other results
    {
      "position": 50,
      "title": "Freeport-McMoRan Inc",
      "quote": "FCX",
      "quote_link": "https://www.google.com/finance/FCX:NYSE",
      "price_change": "-$1.15",
      "percent_price_change": "-3.66%"
    }
  ],
  "most_active_discover_more": [
    {
      "position": 1,
      "quote": "Index",
      "quote_link": "https://www.google.com/financeFCX:NYSE",
      "title": "Dow Jones Industrial Average",
      "price": "32,772.36",
      "percent_price_change": "-0.22%"
    }, ... other results
    {
      "position": 18,
      "quote": "NFLX",
      "quote_link": "https://www.google.com/financeFCX:NYSE",
      "title": "Netflix Inc",
      "price": "$226.14",
      "percent_price_change": "+0.55%"
    }
  ],
  "most_active_news": [
    {
      "position": 1,
      "title": "Alibaba says will work to keep trading in U.S., Hong Kong after being added \nto SEC delisting risk list",
      "source": "CNBC",
      "date": "7 hours ago",
      "image": "https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcRMBjVDpAgK8AJP6gxfd89Kb5rz7th_s3ntTLA_WYWnVWT3Q05aQJTWpMpjcOg"
    }, ... other news results
    {
      "position": 6,
      "title": "Intel CEO: 'This is a time for a bit of austerity'",
      "source": "Yahoo Finance",
      "date": "4 hours ago",
      "image": "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcTxkwNmmHmcXqkF3-pa2Bl0SsCzdIJyB0jPdutL0vw9pV4sRkgy8BKemYIkEeg"
    }
  ]
}

Join us on Twitter | YouTube

Originally published at SerpApi: serpapi.com/blog/scrape-google-finance-mark..