Table of contents
What will be scraped
Full Code
import requests
import json
import re
import argparse
from parsel import Selector
# Command-line interface: each boolean flag selects one Google Finance
# markets page to scrape (e.g. `python main.py -cc` for cryptocurrencies).
parser = argparse.ArgumentParser(prog="Google Finance Markets Options")
for short_flag, long_flag in (
    ("-i", "--indexes"),
    ("-ma", "--most-active"),
    ("-g", "--gainers"),
    ("-l", "--losers"),
    ("-cl", "--climate-leaders"),
    ("-cc", "--crypto"),
    ("-c", "--currency"),
):
    # store_true -> attribute is True when the flag is present, False otherwise
    parser.add_argument(short_flag, long_flag, action="store_true")
args = parser.parse_args()
def main():
    """Fetch the Google Finance markets page selected via a CLI flag and parse it.

    Reads the module-level ``args`` namespace; flags are checked in the
    order they were declared and the first one set wins.

    Returns:
        The data dict produced by ``parser()``, or ``None`` when no flag is set.
    """
    # https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
    # https://www.whatismybrowser.com/detect/what-is-my-user-agent
    # A browser-like User-Agent reduces the chance Google blocks the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36"
    }
    # Map each CLI flag (attribute name on `args`) to its URL path segment.
    # Replaces seven copy-pasted if-blocks; dict order preserves the
    # original first-match precedence.
    pages = {
        "indexes": "indexes",
        "most_active": "most-active",
        "gainers": "gainers",
        "losers": "losers",
        "climate_leaders": "climate-leaders",
        "crypto": "cryptocurrencies",
        "currency": "currencies",
    }
    for flag, slug in pages.items():
        if getattr(args, flag):
            html = requests.get(
                f"https://www.google.com/finance/markets/{slug}",
                headers=headers,
                timeout=30,
            )
            # NOTE(review): `parser` here is the parsing *function* defined
            # below, which shadows the module-level argparse object of the
            # same name — confusing but works because the def runs first.
            return parser(html=html)
    return None
def parser(html):
    """Extract trends, "discover more" and news data from a Google Finance markets page.

    Args:
        html: a ``requests.Response`` whose ``.text`` holds the page HTML.

    Returns:
        A dict with ``<topic>_trends``, ``<topic>_discover_more`` and
        ``<topic>_news`` lists of per-item dicts.
    """
    selector = Selector(text=html.text)
    # Header text looks like "... on most active" -> "most_active"
    stock_topic = selector.css(".Mrksgc::text").get().split("on ")[1].replace(" ", "_")
    data = {
        f"{stock_topic}_trends": [],
        f"{stock_topic}_discover_more": [],
        f"{stock_topic}_news": []
    }
    # news results
    for index, news_results in enumerate(selector.css(".yY3Lee"), start=1):
        data[f"{stock_topic}_news"].append({
            "position": index,
            "title": news_results.css(".mRjSYb::text").get(),
            "source": news_results.css(".sfyJob::text").get(),
            "date": news_results.css(".Adak::text").get(),
            "image": news_results.css("img::attr(src)").get(),
        })
    # stocks table
    for index, stock_results in enumerate(selector.css("li a"), start=1):
        # aria-label is e.g. "Up by 3.22%" / "Down by 3.66%"; fetch it once
        # and reuse it instead of querying the same node twice.
        current_percent_change_raw_value = stock_results.css("[jsname=Fe7oBc]::attr(aria-label)").get()
        current_percent_change = re.search(r"\d+\.\d+%", current_percent_change_raw_value).group()
        # ./quote/SNAP:NASDAQ -> SNAP:NASDAQ
        quote = stock_results.attrib["href"].replace("./quote/", "")
        data[f"{stock_topic}_trends"].append({
            "position": index,
            "title": stock_results.css(".ZvmM7::text").get(),
            "quote": stock_results.css(".COaKTb::text").get(),
            # "https://www.google.com/finance/MSFT:NASDAQ"
            "quote_link": f"https://www.google.com/finance/{quote}",
            "price_change": stock_results.css(".SEGxAb .P2Luy::text").get(),
            "percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
        })
    # "you may be interested in" at the bottom of the page
    for index, interested_bottom in enumerate(selector.css(".HDXgAf .tOzDHb"), start=1):
        current_percent_change_raw_value = interested_bottom.css("[jsname=Fe7oBc]::attr(aria-label)").get()
        current_percent_change = re.search(r"\d+\.\d+%", current_percent_change_raw_value).group()
        # BUG FIX: read the href from the current node, not from the leftover
        # `stock_results` variable of the previous loop — the old code gave
        # every "discover more" item the same quote_link.
        quote = interested_bottom.attrib["href"].replace("./quote/", "")
        data[f"{stock_topic}_discover_more"].append({
            "position": index,
            "quote": interested_bottom.css(".COaKTb::text").get(),
            # BUG FIX: added the missing "/" after "finance" so the link is
            # valid (old output was "https://www.google.com/financeFCX:NYSE").
            "quote_link": f"https://www.google.com/finance/{quote}",
            "title": interested_bottom.css(".RwFyvf::text").get(),
            "price": interested_bottom.css(".YMlKec::text").get(),
            "percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
        })
    return data
if __name__ == "__main__":
    # Run the CLI scraper and pretty-print the resulting dict as JSON.
    scraped = main()
    print(json.dumps(scraped, indent=2, ensure_ascii=False))
Prerequisites
Install libraries:
pip install requests parsel
Basic knowledge scraping with CSS selectors
CSS selectors declare which part of the markup a style applies to thus allowing to extract data from matching tags and attributes.
If you haven't scraped with CSS selectors, there's a dedicated blog post of mine about how to use CSS selectors when web-scraping that covers what it is, pros and cons, and why they matter from a web-scraping perspective.
Separate virtual environment
In short, it's a thing that creates an independent set of installed libraries including different Python versions that can coexist with each other in the same system thus preventing libraries or Python version conflicts.
If you didn't work with a virtual environment before, have a look at the dedicated Python virtual environments tutorial using Virtualenv and Poetry blog post of mine to get a little bit more familiar.
📌Note: this is not a strict requirement for this blog.
Reduce the chance of being blocked
There's a chance that a request might be blocked. Have a look at how to reduce the chance of being blocked while web-scraping, there are eleven methods to bypass blocks from most websites.
Code Explanation
Import libraries:
import requests
import json
import re
import argparse
from parsel import Selector
Library | Purpose |
requests | to make a request to the website. |
json | to convert extracted data to a JSON object. |
re | to extract parts of the data via regular expression. |
argparse | to handle command-line arguments passed to the script. |
parsel | to parse data from HTML/XML documents. Similar to BeautifulSoup but supports XPath. |
Firstly, if we need to parse data by typing command-line arguments without the need to activate certain functions in the code in order to extract specific types of results, for example crypto, gainers or losers, we can do it with argparse
built-in library by creating command-line arguments:
parser = argparse.ArgumentParser(prog="Google Finance Markets Options")
parser.add_argument('-i','--indexes', action="store_true")
parser.add_argument('-ma','--most-active', action="store_true")
parser.add_argument('-g','--gainers', action="store_true")
parser.add_argument('-l','--losers', action="store_true")
parser.add_argument('-cl','--climate-leaders', action="store_true")
parser.add_argument('-cc','--crypto', action="store_true")
parser.add_argument('-c','--currency', action="store_true")
args = parser.parse_args()
Then we can run the script something like so:
$ python main.py -cc # will parse crypto results
Note if action="store_true"
is not used, the result will be an error:
$ python main.py -cc
Google Finance Markets Options: error: argument -cc/--crypto: expected one argument
The action
set to store_true
will store the argument as True
, if present. So if the argument is present, it will return some output.
We can also make parameters required
, which means that a certain parameter must be supplied when the script is run.
Code | Explanation |
add_argument | Defines how a single command-line argument should be parsed. |
parse_args | determines what objects are created by add_argument and how they are assigned. Returns the populated namespace. |
The next step is to create function with all the command-line logic. You can access command-line arguments with dot notation:
def main():
# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
# https://www.whatismybrowser.com/detect/what-is-my-user-agent
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.134 Safari/537.36"
}
if args.indexes:
html = requests.get("https://www.google.com/finance/markets/indexes", headers=headers, timeout=30)
return parser(html=html)
# ... other arguments logic
Code | Explanation |
user-agent | to act as a "real" user request from the browser by passing it to request headers. This is used to bypass blocks from Google as default requests user-agent is python-requests and websites understand that it's a bot that sends a request and might block it. Check what's your user-agent . |
if args.indexes | will check if certain command-line argument is being passed. |
timeout=30 | to tell requests to stop waiting for response after 30 seconds. |
return parser(html=html) | to return data from the parser() function to reduce the code size since every selector is identical to extract the data. |
The next step is to make a parser function that will extract all the needed data from the page. The function requires html
argument that will be passed to parsel
, then we need to create how the data
will be structured once its parsed:
def parser(html):
selector = Selector(text=html.text)
stock_topic = selector.css(".Mrksgc::text").get().split("on ")[1].replace(" ", "_")
data = {
f"{stock_topic}_trends": [],
f"{stock_topic}_discover_more": [],
f"{stock_topic}_news": []
}
Code | Explanation |
Selector(text=html.text) | where passed HTML from the response will be processed by parsel . |
text= | is a parsel argument that accepts str object from where HTML nodes will be extracted. |
css() | to parse data from the passed CSS selector(s). Every CSS query translates to XPath using the cssselect package under the hood. |
::text or ::attr(<attribute>) | to extract textual or attribute data from the node. |
get() | to get actual data returned from parsel |
split() | to split a string into a list where each word is a list item |
replace("<something>", "<with_something>") | to replace something old with something new in a string. |
After creating an empty dictionary structure, we need to fill it with news, stocks and other data by appending
it, as a dict
in this case:
# news results
for index, news_results in enumerate(selector.css(".yY3Lee"), start=1):
data[f"{stock_topic}_news"].append({
"position": index,
"title": news_results.css(".mRjSYb::text").get(),
"source": news_results.css(".sfyJob::text").get(),
"date": news_results.css(".Adak::text").get(),
"image": news_results.css("img::attr(src)").get(),
})
# stocks table
for index, stock_results in enumerate(selector.css("li a"), start=1):
current_percent_change_raw_value = stock_results.css("[jsname=Fe7oBc]::attr(aria-label)").get()
current_percent_change = re.search(r"\d+\.\d+%", stock_results.css("[jsname=Fe7oBc]::attr(aria-label)").get()).group()
# ./quote/SNAP:NASDAQ -> SNAP:NASDAQ
quote = stock_results.attrib["href"].replace("./quote/", "")
data[f"{stock_topic}_trends"].append({
"position": index,
"title": stock_results.css(".ZvmM7::text").get(),
"quote": stock_results.css(".COaKTb::text").get(),
# "https://www.google.com/finance/MSFT:NASDAQ"
"quote_link": f"https://www.google.com/finance/{quote}",
"price_change": stock_results.css(".SEGxAb .P2Luy::text").get(),
"percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
})
# "you may be interested in" at the bottom of the page
for index, interested_bottom in enumerate(selector.css(".HDXgAf .tOzDHb"), start=1):
current_percent_change_raw_value = interested_bottom.css("[jsname=Fe7oBc]::attr(aria-label)").get()
current_percent_change = re.search(r"\d+\.\d+%", interested_bottom.css("[jsname=Fe7oBc]::attr(aria-label)").get()).group()
# ./quote/SNAP:NASDAQ -> SNAP:NASDAQ
quote = interested_bottom.attrib["href"].replace("./quote/", "")
data[f"{stock_topic}_discover_more"].append({
"position": index,
"quote": interested_bottom.css(".COaKTb::text").get(),
"quote_link": f"https://www.google.com/finance/{quote}",
"title": interested_bottom.css(".RwFyvf::text").get(),
"price": interested_bottom.css(".YMlKec::text").get(),
"percent_price_change": f"+{current_percent_change}" if "Up" in current_percent_change_raw_value else f"-{current_percent_change}"
})
Code | Explanation |
enumerate() | to add a counter to an iterable and return it. start=1 will start counting from 1, instead from the default value of 0. |
::text or ::attr(<attribute>) | to extract textual or attribute data from the node. |
data[f"{stock_topic}_news"] | dynamically appends data as a dict to whatever value is being extracted by a stock_topic variable. |
append() | to append extracted data to the list as dictionary. |
css() | to parse data from the passed CSS selector(s). Every CSS query translates to XPath using the cssselect package under the hood. |
get() | to get actual data. |
getall() | to get a list of all matches. |
[jsname=Fe7oBc] | is a CSS selector that used to select elements with the specified attribute and value e.g. [attribute=value] . |
attrib["href"] | is a parsel method of accessing node attribute. It returns dict for the first matched element. None if dict is empty. |
replace("<something>", "<with_something>") | to replace something old with something new in a string. |
re.search() | to match parts of the string and grab only digit values. group() to return matched string by a regular expression. |
Return the data:
# data = {
# f"{stock_topic}_trends": [],
# f"{stock_topic}_discover_more": [],
# f"{stock_topic}_news": []
# }
# extraction code...
return data
And finally, we need to specify that this is a runnable script for code readers:
if __name__ == "__main__":
print(json.dumps(main(), indent=2, ensure_ascii=False))
Now you can run your script from the command-line:
$ python main.py -ma # most-active
You can also access help command -h
to see available arguments like so:
$ python main.py -h
usage: Google Finance Markets Options [-h] [-i] [-ma] [-g] [-l]
[-cl] [-cc] [-c]
optional arguments:
-h, --help show this help message and exit
-i, --indexes
-ma, --most-active
-g, --gainers
-l, --losers
-cl, --climate-leaders
-cc, --crypto
-c, --currency
Full output:
{
"most_active_trends": [
{
"position": 1,
"title": "Advanced Micro Devices, Inc.",
"quote": "AMD",
"quote_link": "https://www.google.com/finance/AMD:NASDAQ",
"price_change": "+$3.04",
"percent_price_change": "+3.22%"
}, ... other results
{
"position": 50,
"title": "Freeport-McMoRan Inc",
"quote": "FCX",
"quote_link": "https://www.google.com/finance/FCX:NYSE",
"price_change": "-$1.15",
"percent_price_change": "-3.66%"
}
],
"most_active_discover_more": [
{
"position": 1,
"quote": "Index",
"quote_link": "https://www.google.com/financeFCX:NYSE",
"title": "Dow Jones Industrial Average",
"price": "32,772.36",
"percent_price_change": "-0.22%"
}, ... other results
{
"position": 18,
"quote": "NFLX",
"quote_link": "https://www.google.com/financeFCX:NYSE",
"title": "Netflix Inc",
"price": "$226.14",
"percent_price_change": "+0.55%"
}
],
"most_active_news": [
{
"position": 1,
"title": "Alibaba says will work to keep trading in U.S., Hong Kong after being added \nto SEC delisting risk list",
"source": "CNBC",
"date": "7 hours ago",
"image": "https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcRMBjVDpAgK8AJP6gxfd89Kb5rz7th_s3ntTLA_WYWnVWT3Q05aQJTWpMpjcOg"
}, ... other news results
{
"position": 6,
"title": "Intel CEO: 'This is a time for a bit of austerity'",
"source": "Yahoo Finance",
"date": "4 hours ago",
"image": "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcTxkwNmmHmcXqkF3-pa2Bl0SsCzdIJyB0jPdutL0vw9pV4sRkgy8BKemYIkEeg"
}
]
}
Links
Originally published at SerpApi: serpapi.com/blog/scrape-google-finance-mark..