Spaces:

tensora
/

webcrawler

Running

App Files Files Community

webcrawler / pytrends /request.py

Add1E

Upload 8 files

ff01b82 verified almost 2 years ago

raw

history blame

25.2 kB

	import json

	import pandas as pd
	import requests

	from requests.adapters import HTTPAdapter
	from requests.packages.urllib3.util.retry import Retry
	from requests import status_codes

	from pytrends import exceptions

	from urllib.parse import quote


	# Root of every Google Trends endpoint used by TrendReq.
	BASE_TRENDS_URL = 'https://trends.google.com/trends'


	class TrendReq(object):
	    """
	    Google Trends API

	    Client for the Google Trends web endpoints listed below. Typical use is
	    to call :meth:`build_payload` (which fetches the widget tokens) and then
	    one of the data methods such as :meth:`interest_over_time`.
	    """
	    GET_METHOD = 'get'
	    POST_METHOD = 'post'
	    GENERAL_URL = f'{BASE_TRENDS_URL}/api/explore'
	    INTEREST_OVER_TIME_URL = f'{BASE_TRENDS_URL}/api/widgetdata/multiline'
	    MULTIRANGE_INTEREST_OVER_TIME_URL = f'{BASE_TRENDS_URL}/api/widgetdata/multirange'
	    INTEREST_BY_REGION_URL = f'{BASE_TRENDS_URL}/api/widgetdata/comparedgeo'
	    RELATED_QUERIES_URL = f'{BASE_TRENDS_URL}/api/widgetdata/relatedsearches'
	    TRENDING_SEARCHES_URL = f'{BASE_TRENDS_URL}/hottrends/visualize/internal/data'
	    TOP_CHARTS_URL = f'{BASE_TRENDS_URL}/api/topcharts'
	    SUGGESTIONS_URL = f'{BASE_TRENDS_URL}/api/autocomplete/'
	    CATEGORIES_URL = f'{BASE_TRENDS_URL}/api/explore/pickers/category'
	    TODAY_SEARCHES_URL = f'{BASE_TRENDS_URL}/api/dailytrends'
	    REALTIME_TRENDING_SEARCHES_URL = f'{BASE_TRENDS_URL}/api/realtimetrends'
	    TRENDS_URL = f'{BASE_TRENDS_URL}/api/trends'
	    # HTTP status codes that trigger a retry when retries are enabled
	    ERROR_CODES = (500, 502, 504, 429)

	    def __init__(self, hl='en-US', tz=360, geo='', timeout=(2, 5), proxies='',
	                 retries=0, backoff_factor=0, requests_args=None):
	        """
	        Initialize default values for params

	        Constructing an instance performs a network request, because the
	        Google cookie is fetched through :meth:`GetGoogleCookie`.

	        :param hl: language, sent as the ``accept-language`` header and as the ``hl`` parameter
	        :param tz: value sent as the ``tz`` parameter of the payloads
	        :param geo: default geo used by :meth:`build_payload` when none is given
	        :param timeout: timeout forwarded to every request
	        :param proxies: sequence of https proxy URLs (empty means no proxy)
	        :param retries: retry count for the urllib3 ``Retry`` policy
	        :param backoff_factor: backoff factor for the urllib3 ``Retry`` policy
	        :param requests_args: extra keyword arguments forwarded to ``requests``;
	            a ``headers`` entry is merged into the session headers instead
	        """
	        # google rate limit
	        self.google_rl = 'You have reached your quota limit. Please try again later.'
	        self.results = None
	        # set user defined options used globally
	        self.tz = tz
	        self.hl = hl
	        self.geo = geo
	        self.kw_list = list()
	        self.timeout = timeout
	        self.proxies = proxies  # add a proxy option
	        self.retries = retries
	        self.backoff_factor = backoff_factor
	        self.proxy_index = 0
	        # copy so that popping 'headers' below never mutates the caller's dict
	        self.requests_args = dict(requests_args) if requests_args else {}
	        self.cookies = self.GetGoogleCookie()
	        # initialize widget payloads
	        self.token_payload = dict()
	        self.interest_over_time_widget = dict()
	        self.interest_by_region_widget = dict()
	        self.related_topics_widget_list = list()
	        self.related_queries_widget_list = list()

	        self.headers = {'accept-language': self.hl}
	        self.headers.update(self.requests_args.pop('headers', {}))

	    def GetGoogleCookie(self):
	        """
	        Gets google cookie (used for each and every proxy; once on init otherwise)
	        Removes proxy from the list on proxy error

	        :return: dict holding only the ``NID`` cookie (empty if it was not set)
	        :raises requests.exceptions.ProxyError: when the last remaining proxy fails
	        """
	        cookie_url = f'{BASE_TRENDS_URL}/explore/?geo={self.hl[-2:]}'
	        while True:
	            if "proxies" in self.requests_args:
	                # the caller manages proxies through requests_args; keep
	                # retrying on request failures only, so that programming
	                # errors and KeyboardInterrupt are not swallowed forever
	                try:
	                    response = requests.get(
	                        cookie_url,
	                        timeout=self.timeout,
	                        **self.requests_args
	                    )
	                except requests.exceptions.RequestException:
	                    continue
	            else:
	                if len(self.proxies) > 0:
	                    proxy = {'https': self.proxies[self.proxy_index]}
	                else:
	                    proxy = None
	                try:
	                    response = requests.get(
	                        cookie_url,
	                        timeout=self.timeout,
	                        proxies=proxy,
	                        **self.requests_args
	                    )
	                except requests.exceptions.ProxyError:
	                    print('Proxy error. Changing IP')
	                    if len(self.proxies) > 1:
	                        del self.proxies[self.proxy_index]
	                        # the removed proxy may have been the last entry;
	                        # wrap around so the index stays valid
	                        if self.proxy_index >= len(self.proxies):
	                            self.proxy_index = 0
	                    else:
	                        print('No more proxies available. Bye!')
	                        raise
	                    continue
	            return {name: value
	                    for name, value in response.cookies.items()
	                    if name == 'NID'}

	    def GetNewProxy(self):
	        """
	        Increment proxy INDEX; zero on overflow
	        """
	        if self.proxy_index < (len(self.proxies) - 1):
	            self.proxy_index += 1
	        else:
	            self.proxy_index = 0

	    def _get_data(self, url, method=GET_METHOD, trim_chars=0, **kwargs):
	        """Send a request to Google and return the JSON response as a Python object

	        :param url: the url to which the request will be sent
	        :param method: the HTTP method ('get' or 'post')
	        :param trim_chars: how many characters should be trimmed off the beginning of the content of the response
	            before this is passed to the JSON parser
	        :param kwargs: any extra key arguments passed to the request builder (usually query parameters or data)
	        :return: the decoded JSON document
	        :raises exceptions.TooManyRequestsError: when the response status is 429
	        :raises exceptions.ResponseError: for any other non-JSON or non-200 response
	        """
	        # the context manager closes the session (and its connections)
	        with requests.session() as s:
	            # Retries mechanism. Activated when one of statements >0 (best used for proxy)
	            if self.retries > 0 or self.backoff_factor > 0:
	                retry_options = dict(total=self.retries, read=self.retries,
	                                     connect=self.retries,
	                                     backoff_factor=self.backoff_factor,
	                                     status_forcelist=TrendReq.ERROR_CODES)
	                retried_methods = frozenset(['GET', 'POST'])
	                try:
	                    # name used by urllib3 >= 1.26
	                    retry = Retry(allowed_methods=retried_methods,
	                                  **retry_options)
	                except TypeError:
	                    # older urllib3 only knows the previous keyword
	                    retry = Retry(method_whitelist=retried_methods,
	                                  **retry_options)
	                s.mount('https://', HTTPAdapter(max_retries=retry))

	            s.headers.update(self.headers)
	            if len(self.proxies) > 0:
	                self.cookies = self.GetGoogleCookie()
	                s.proxies.update({'https': self.proxies[self.proxy_index]})
	            if method == TrendReq.POST_METHOD:
	                response = s.post(url, timeout=self.timeout,
	                                  cookies=self.cookies, **kwargs,
	                                  **self.requests_args)  # DO NOT USE retries or backoff_factor here
	            else:
	                response = s.get(url, timeout=self.timeout,
	                                 cookies=self.cookies, **kwargs,
	                                 **self.requests_args)  # DO NOT USE retries or backoff_factor here
	            # check if the response contains json and throw an exception otherwise
	            # Google mostly sends 'application/json' in the Content-Type header,
	            # but occasionally it sends 'application/javascript'
	            # and sometimes even 'text/javascript'
	            content_type = response.headers.get('Content-Type', '')
	            looks_like_json = any(
	                accepted in content_type
	                for accepted in ('application/json',
	                                 'application/javascript',
	                                 'text/javascript'))
	            # the status check must apply to every accepted content type
	            if response.status_code == 200 and looks_like_json:
	                # trim initial characters
	                # some responses start with garbage characters, like ")]}',"
	                # these have to be cleaned before being passed to the json parser
	                content = response.text[trim_chars:]
	                # parse json
	                self.GetNewProxy()
	                return json.loads(content)
	            if response.status_code == status_codes.codes.too_many_requests:
	                raise exceptions.TooManyRequestsError.from_response(response)
	            raise exceptions.ResponseError.from_response(response)

	    def build_payload(self, kw_list, cat=0, timeframe='today 5-y', geo='',
	                      gprop=''):
	        """Create the payload for related queries, interest over time and interest by region

	        :param kw_list: keywords to query
	        :param cat: category id placed in the request
	        :param timeframe: one timeframe string for all keywords, or a list
	            holding one timeframe per keyword (same order as ``kw_list``)
	        :param geo: geo for this payload; falls back to the instance default when empty
	        :param gprop: '', 'images', 'news', 'youtube' or 'froogle'
	        :raises ValueError: when ``gprop`` is not one of the accepted values
	        """
	        if gprop not in ['', 'images', 'news', 'youtube', 'froogle']:
	            raise ValueError('gprop must be empty (to indicate web), images, news, youtube, or froogle')
	        self.kw_list = kw_list
	        self.geo = geo or self.geo
	        self.token_payload = {
	            'hl': self.hl,
	            'tz': self.tz,
	            'req': {'comparisonItem': [], 'category': cat, 'property': gprop}
	        }

	        comparison_items = self.token_payload['req']['comparisonItem']
	        # Check if timeframe is a list
	        if isinstance(timeframe, list):
	            for index, kw in enumerate(self.kw_list):
	                comparison_items.append(
	                    {'keyword': kw, 'time': timeframe[index], 'geo': self.geo})
	        else:
	            # build out json for each keyword with the shared timeframe
	            for kw in self.kw_list:
	                comparison_items.append(
	                    {'keyword': kw, 'time': timeframe, 'geo': self.geo})

	        # requests will mangle this if it is not a string
	        self.token_payload['req'] = json.dumps(self.token_payload['req'])
	        # get tokens
	        self._tokens()
	        return

	    def _tokens(self):
	        """Makes request to Google to get API tokens for interest over time, interest by region and related queries"""
	        # make the request and parse the returned json
	        widget_dicts = self._get_data(
	            url=TrendReq.GENERAL_URL,
	            method=TrendReq.POST_METHOD,
	            params=self.token_payload,
	            trim_chars=4,
	        )['widgets']
	        # order of the json matters...
	        first_region_token = True
	        # clear self.related_queries_widget_list and self.related_topics_widget_list
	        # of old keywords' widgets (in place, so existing references stay valid)
	        self.related_queries_widget_list[:] = []
	        self.related_topics_widget_list[:] = []
	        # assign requests
	        for widget in widget_dicts:
	            if widget['id'] == 'TIMESERIES':
	                self.interest_over_time_widget = widget
	            if widget['id'] == 'GEO_MAP' and first_region_token:
	                self.interest_by_region_widget = widget
	                first_region_token = False
	            # response for each term, put into a list
	            if 'RELATED_TOPICS' in widget['id']:
	                self.related_topics_widget_list.append(widget)
	            if 'RELATED_QUERIES' in widget['id']:
	                self.related_queries_widget_list.append(widget)
	        return

	    def interest_over_time(self):
	        """Request data from Google's Interest Over Time section and return a dataframe

	        :return: dataframe indexed by ``date`` with one integer column per
	            keyword plus a boolean ``isPartial`` column; an empty dataframe
	            when Google returns no timeline data
	        """

	        over_time_payload = {
	            # convert to string as requests will mangle
	            'req': json.dumps(self.interest_over_time_widget['request']),
	            'token': self.interest_over_time_widget['token'],
	            'tz': self.tz
	        }

	        # make the request and parse the returned json
	        req_json = self._get_data(
	            url=TrendReq.INTEREST_OVER_TIME_URL,
	            method=TrendReq.GET_METHOD,
	            trim_chars=5,
	            params=over_time_payload,
	        )

	        df = pd.DataFrame(req_json['default']['timelineData'])
	        if (df.empty):
	            return df

	        df['date'] = pd.to_datetime(df['time'].astype(dtype='float64'),
	                                    unit='s')
	        df = df.set_index(['date']).sort_index()
	        # split list columns into separate ones, remove brackets and split on comma
	        result_df = df['value'].apply(lambda x: pd.Series(
	            str(x).replace('[', '').replace(']', '').split(',')))
	        # rename each column with its search term, relying on order that google provides...
	        for idx, kw in enumerate(self.kw_list):
	            # there is currently a bug with assigning columns that may be
	            # parsed as a date in pandas: use explicit insert column method
	            result_df.insert(len(result_df.columns), kw,
	                             result_df[idx].astype('int'))
	            del result_df[idx]

	        if 'isPartial' in df:
	            # make other dataframe from isPartial key data
	            # split list columns into separate ones, remove brackets and split on comma
	            df = df.fillna(False)
	            result_df2 = df['isPartial'].apply(lambda x: pd.Series(
	                str(x).replace('[', '').replace(']', '').split(',')))
	            result_df2.columns = ['isPartial']
	            # Change to a bool type.
	            result_df2.isPartial = result_df2.isPartial == 'True'
	            # concatenate the two dataframes
	            final = pd.concat([result_df, result_df2], axis=1)
	        else:
	            final = result_df
	            final['isPartial'] = False

	        return final

	    def multirange_interest_over_time(self):
	        """Request data from Google's Interest Over Time section across different time ranges and return a dataframe

	        :return: dataframe with a "[i] <keyword> date" and a "[i] <keyword> value"
	            column per keyword, preceded by a first row holding the averages;
	            an empty dataframe when Google returns no timeline data
	        """

	        over_time_payload = {
	            # convert to string as requests will mangle
	            'req': json.dumps(self.interest_over_time_widget['request']),
	            'token': self.interest_over_time_widget['token'],
	            'tz': self.tz
	        }

	        # make the request and parse the returned json
	        req_json = self._get_data(
	            url=TrendReq.MULTIRANGE_INTEREST_OVER_TIME_URL,
	            method=TrendReq.GET_METHOD,
	            trim_chars=5,
	            params=over_time_payload,
	        )

	        df = pd.DataFrame(req_json['default']['timelineData'])
	        if (df.empty):
	            return df

	        result_df = pd.json_normalize(df['columnData'])

	        # Split dictionary columns into separate ones
	        # (iterate over a snapshot: the frame gains and loses columns below)
	        for i, _column in enumerate(list(result_df.columns)):
	            label = f"[{i}] {self.kw_list[i]!s}"
	            expanded = result_df[i].apply(pd.Series)
	            result_df[label + " date"] = expanded["formattedTime"]
	            result_df[label + " value"] = expanded["value"]
	            result_df = result_df.drop([i], axis=1)

	        # Adds a row with the averages at the top of the dataframe
	        avg_row = {}
	        for i, avg in enumerate(req_json['default']['averages']):
	            label = f"[{i}] {self.kw_list[i]!s}"
	            avg_row[label + " date"] = "Average"
	            avg_row[label + " value"] = avg

	        # insert at index -1, shift everything by one, then sort so the
	        # averages row ends up first
	        result_df.loc[-1] = avg_row
	        result_df.index = result_df.index + 1
	        result_df = result_df.sort_index()

	        return result_df

	    def interest_by_region(self, resolution='COUNTRY', inc_low_vol=False,
	                           inc_geo_code=False):
	        """Request data from Google's Interest by Region section and return a dataframe

	        :param resolution: applied only when no geo is set, or when geo is
	            'US' and the value is 'DMA', 'CITY' or 'REGION'
	        :param inc_low_vol: sent as ``includeLowSearchVolumeGeos``
	        :param inc_geo_code: also include the geo code / coordinates column
	        :return: dataframe indexed by ``geoName`` with one integer column per
	            keyword; an empty dataframe when Google returns no data
	        """

	        # make the request
	        region_payload = dict()
	        widget_request = self.interest_by_region_widget['request']
	        if self.geo == '':
	            widget_request['resolution'] = resolution
	        elif self.geo == 'US' and resolution in ['DMA', 'CITY', 'REGION']:
	            widget_request['resolution'] = resolution

	        widget_request['includeLowSearchVolumeGeos'] = inc_low_vol

	        # convert to string as requests will mangle
	        region_payload['req'] = json.dumps(widget_request)
	        region_payload['token'] = self.interest_by_region_widget['token']
	        region_payload['tz'] = self.tz

	        # parse returned json
	        req_json = self._get_data(
	            url=TrendReq.INTEREST_BY_REGION_URL,
	            method=TrendReq.GET_METHOD,
	            trim_chars=5,
	            params=region_payload,
	        )
	        df = pd.DataFrame(req_json['default']['geoMapData'])
	        if (df.empty):
	            return df

	        # rename the column with the search keyword
	        geo_column = 'geoCode' if 'geoCode' in df.columns else 'coordinates'
	        columns = ['geoName', geo_column, 'value']
	        df = df[columns].set_index(['geoName']).sort_index()
	        # split list columns into separate ones, remove brackets and split on comma
	        result_df = df['value'].apply(lambda x: pd.Series(
	            str(x).replace('[', '').replace(']', '').split(',')))
	        if inc_geo_code:
	            if geo_column in df.columns:
	                result_df[geo_column] = df[geo_column]
	            else:
	                print('Could not find geo_code column; Skipping')

	        # rename each column with its search term
	        for idx, kw in enumerate(self.kw_list):
	            result_df[kw] = result_df[idx].astype('int')
	            del result_df[idx]

	        return result_df

	    def related_topics(self):
	        """Request data from Google's Related Topics section and return a dictionary of dataframes

	        If no top and/or rising related topics are found, the value for the key "top" and/or "rising" will be None

	        :return: ``{keyword: {'rising': dataframe or None, 'top': dataframe or None}}``
	        """

	        # make the request
	        related_payload = dict()
	        result_dict = dict()
	        for request_json in self.related_topics_widget_list:
	            # ensure we know which keyword we are looking at rather than relying on order
	            try:
	                kw = request_json['request']['restriction'][
	                    'complexKeywordsRestriction']['keyword'][0]['value']
	            except (KeyError, IndexError):
	                kw = ''
	            # convert to string as requests will mangle
	            related_payload['req'] = json.dumps(request_json['request'])
	            related_payload['token'] = request_json['token']
	            related_payload['tz'] = self.tz

	            # parse the returned json
	            req_json = self._get_data(
	                url=TrendReq.RELATED_QUERIES_URL,
	                method=TrendReq.GET_METHOD,
	                trim_chars=5,
	                params=related_payload,
	            )

	            # top topics
	            try:
	                top_list = req_json['default']['rankedList'][0]['rankedKeyword']
	                df_top = pd.json_normalize(top_list, sep='_')
	            except (KeyError, IndexError):
	                # no top topics: a missing key raises KeyError, an empty
	                # rankedList raises IndexError
	                df_top = None

	            # rising topics
	            try:
	                rising_list = req_json['default']['rankedList'][1]['rankedKeyword']
	                df_rising = pd.json_normalize(rising_list, sep='_')
	            except (KeyError, IndexError):
	                # no rising topics: same two failure modes as above
	                df_rising = None

	            result_dict[kw] = {'rising': df_rising, 'top': df_top}
	        return result_dict

	    def related_queries(self):
	        """Request data from Google's Related Queries section and return a dictionary of dataframes

	        If no top and/or rising related queries are found, the value for the key "top" and/or "rising" will be None

	        :return: ``{keyword: {'top': dataframe or None, 'rising': dataframe or None}}``
	        """

	        # make the request
	        related_payload = dict()
	        result_dict = dict()
	        for request_json in self.related_queries_widget_list:
	            # ensure we know which keyword we are looking at rather than relying on order
	            try:
	                kw = request_json['request']['restriction'][
	                    'complexKeywordsRestriction']['keyword'][0]['value']
	            except (KeyError, IndexError):
	                kw = ''
	            # convert to string as requests will mangle
	            related_payload['req'] = json.dumps(request_json['request'])
	            related_payload['token'] = request_json['token']
	            related_payload['tz'] = self.tz

	            # parse the returned json
	            req_json = self._get_data(
	                url=TrendReq.RELATED_QUERIES_URL,
	                method=TrendReq.GET_METHOD,
	                trim_chars=5,
	                params=related_payload,
	            )

	            # top queries
	            try:
	                top_df = pd.DataFrame(
	                    req_json['default']['rankedList'][0]['rankedKeyword'])
	                top_df = top_df[['query', 'value']]
	            except (KeyError, IndexError):
	                # no top queries: a missing key/column raises KeyError, an
	                # empty rankedList raises IndexError
	                top_df = None

	            # rising queries
	            try:
	                rising_df = pd.DataFrame(
	                    req_json['default']['rankedList'][1]['rankedKeyword'])
	                rising_df = rising_df[['query', 'value']]
	            except (KeyError, IndexError):
	                # no rising queries: same two failure modes as above
	                rising_df = None

	            result_dict[kw] = {'top': top_df, 'rising': rising_df}
	        return result_dict

	    def trending_searches(self, pn='united_states'):
	        """Request data from Google's Hot Searches section and return a dataframe

	        :param pn: key of the region to read from the response
	        """

	        # make the request
	        # forms become obsolete due to the new TRENDING_SEARCHES_URL
	        # forms = {'ajax': 1, 'pn': pn, 'htd': '', 'htv': 'l'}
	        req_json = self._get_data(
	            url=TrendReq.TRENDING_SEARCHES_URL,
	            method=TrendReq.GET_METHOD
	        )[pn]
	        result_df = pd.DataFrame(req_json)
	        return result_df

	    def today_searches(self, pn='US'):
	        """Request data from Google Daily Trends section and return the raw list of trending searches

	        :param pn: geo code sent as the ``geo`` parameter
	        :return: the ``trendingSearches`` list of the first day in the response
	        """
	        forms = {'ns': 15, 'geo': pn, 'tz': '-180', 'hl': self.hl}
	        # requests_args is added by _get_data itself; passing it here as well
	        # would supply the same keyword arguments twice
	        req_json = self._get_data(
	            url=TrendReq.TODAY_SEARCHES_URL,
	            method=TrendReq.GET_METHOD,
	            trim_chars=5,
	            params=forms,
	        )['default']['trendingSearchesDays'][0]['trendingSearches']

	        return req_json

	    def realtime_trending_searches(self, pn='US', cat='all', count=300):
	        """Request data from Google Realtime Search Trends section and return the raw list of trending stories

	        :param pn: geo code sent as the ``geo`` parameter
	        :param cat: category sent as the ``cat`` parameter
	        :param count: requested number of stories; caps ``ri`` at 300 and ``rs`` at 200
	        :return: the ``trendingStories`` list of the response
	        """
	        # Don't know what some of the params mean here, followed the nodejs library
	        # https://github.com/pat310/google-trends-api/ 's implementation

	        # sort: api accepts only 0 as the value, optional parameter

	        # ri: number of trending stories IDs returned,
	        # max value of ri supported is 300, based on empirical evidence
	        ri_value = 300
	        if count < ri_value:
	            ri_value = count

	        # rs : don't know what it does but its max value is never more than the ri_value based on empirical evidence
	        # max value of rs supported is 200, based on empirical evidence
	        rs_value = 200
	        if count < rs_value:
	            rs_value = count - 1

	        forms = {'ns': 15, 'geo': pn, 'tz': '300', 'hl': self.hl, 'cat': cat,
	                 'fi': '0', 'fs': '0', 'ri': ri_value, 'rs': rs_value,
	                 'sort': 0}
	        req_json = self._get_data(
	            url=TrendReq.REALTIME_TRENDING_SEARCHES_URL,
	            method=TrendReq.GET_METHOD,
	            trim_chars=5,
	            params=forms
	        )['storySummaries']['trendingStories']

	        return req_json

	    def top_charts(self, date, hl='en-US', tz=300, geo='GLOBAL'):
	        """Request data from Google's Top Charts section and return a dataframe

	        :param date: year, anything ``int()`` accepts
	        :return: dataframe of the first chart's items, or None when the
	            response holds no chart
	        :raises ValueError: when ``date`` cannot be converted to an integer
	        """

	        try:
	            date = int(date)
	        except (TypeError, ValueError):
	            raise ValueError(
	                'The date must be a year with format YYYY. See https://github.com/GeneralMills/pytrends/issues/355')

	        # create the payload
	        chart_payload = {'hl': hl, 'tz': tz, 'date': date, 'geo': geo,
	                         'isMobile': False}

	        # make the request and parse the returned json
	        req_json = self._get_data(
	            url=TrendReq.TOP_CHARTS_URL,
	            method=TrendReq.GET_METHOD,
	            trim_chars=5,
	            params=chart_payload
	        )
	        try:
	            df = pd.DataFrame(req_json['topCharts'][0]['listItems'])
	        except IndexError:
	            df = None
	        return df

	    def trends(self, date, hl='en-US', tz=300, geo='GLOBAL'):
	        """Request chart data from the explore endpoint and return a dataframe

	        Builds the same payload as :meth:`top_charts` (without validating
	        ``date``) and reads ``topCharts`` from the response.

	        :return: dataframe of the first chart's items, or None when the
	            response holds no chart
	        """
	        # NOTE(review): this queries GENERAL_URL while the TRENDS_URL constant
	        # is never used anywhere in the class - confirm the intended endpoint.

	        # create the payload
	        chart_payload = {'hl': hl, 'tz': tz, 'date': date, 'geo': geo,
	                         'isMobile': False}

	        # make the request and parse the returned json
	        req_json = self._get_data(
	            url=TrendReq.GENERAL_URL,
	            method=TrendReq.GET_METHOD,
	            trim_chars=5,
	            params=chart_payload
	        )
	        try:
	            df = pd.DataFrame(req_json['topCharts'][0]['listItems'])
	        except IndexError:
	            df = None
	        return df

	    def suggestions(self, keyword):
	        """Request data from Google's Keyword Suggestion dropdown and return a dictionary

	        :param keyword: text to complete; it is URL-quoted into the request path
	        :return: the ``topics`` entry of the response
	        """

	        # make the request
	        kw_param = quote(keyword)
	        parameters = {'hl': self.hl}

	        req_json = self._get_data(
	            url=TrendReq.SUGGESTIONS_URL + kw_param,
	            params=parameters,
	            method=TrendReq.GET_METHOD,
	            trim_chars=5
	        )['default']['topics']
	        return req_json

	    def categories(self):
	        """Request available categories data from Google's API and return a dictionary"""

	        params = {'hl': self.hl}

	        req_json = self._get_data(
	            url=TrendReq.CATEGORIES_URL,
	            params=params,
	            method=TrendReq.GET_METHOD,
	            trim_chars=5
	        )
	        return req_json

	    def get_historical_interest(self, *args, **kwargs):
	        """Removed method; accepts any arguments and always raises NotImplementedError"""
	        raise NotImplementedError(
	            """This method has been removed for incorrectness. It will be removed completely in v5.
	If you'd like similar functionality, please try implementing it yourself and consider submitting a pull request to add it to pytrends.

	There is discussion at:
	https://github.com/GeneralMills/pytrends/pull/542"""
	        )