# Scraped from a Hugging Face Space (status: Running) — Google News scraper.
import json
import random
import time
from datetime import datetime
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
from tenacity import (
    retry,
    retry_if_exception_type,
    retry_if_result,
    stop_after_attempt,
    wait_exponential,
)
def is_rate_limited(response):
    """Return True when *response* carries HTTP 429 (Too Many Requests)."""
    return 429 == response.status_code
def make_request(url, headers, max_attempts=5):
    """GET *url* with polite jitter and basic retry on HTTP 429.

    The original docstring promised "retry logic for rate limiting" but
    none was implemented; this adds a bounded retry loop with exponential
    backoff, and a request timeout so a hung connection cannot block the
    scraper forever.

    Args:
        url: fully-formed request URL.
        headers: dict of HTTP headers (User-Agent etc.).
        max_attempts: attempts before giving up (default 5).

    Returns:
        requests.Response: the first non-429 response, or the last 429
        response if every attempt was rate-limited.
    """
    response = None
    for attempt in range(max_attempts):
        # Random delay before each request to avoid bot detection.
        time.sleep(random.uniform(2, 6))
        # timeout prevents an unresponsive server from hanging indefinitely
        response = requests.get(url, headers=headers, timeout=30)
        if not is_rate_limited(response):
            return response
        # Exponential backoff (capped) before retrying a rate-limited call.
        time.sleep(min(2 ** attempt, 60))
    return response
def getNewsData(query, start_date, end_date):
    """
    Scrape Google News search results for a given query and date range.

    Args:
        query: search query string; URL-encoded internally, so multi-word
            queries and special characters are safe.
        start_date: start date, "yyyy-mm-dd" or "mm/dd/yyyy".
        end_date: end date, "yyyy-mm-dd" or "mm/dd/yyyy".

    Returns:
        list[dict]: one dict per result with keys
        "link", "title", "snippet", "date", "source".
    """
    # Normalize ISO dates to the mm/dd/yyyy form Google's cdr filter expects.
    if "-" in start_date:
        start_date = datetime.strptime(start_date, "%Y-%m-%d").strftime("%m/%d/%Y")
    if "-" in end_date:
        end_date = datetime.strptime(end_date, "%Y-%m-%d").strftime("%m/%d/%Y")

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/101.0.4951.54 Safari/537.36"
        )
    }

    news_results = []
    page = 0
    while True:
        offset = page * 10
        # Fix: URL-encode the query — interpolating it raw produced a
        # malformed URL for any query containing spaces or "&" etc.
        url = (
            f"https://www.google.com/search?q={quote_plus(query)}"
            f"&tbs=cdr:1,cd_min:{start_date},cd_max:{end_date}"
            f"&tbm=nws&start={offset}"
        )
        try:
            response = make_request(url, headers)
            soup = BeautifulSoup(response.content, "html.parser")
            results_on_page = soup.select("div.SoaBEf")
            if not results_on_page:
                break  # No more results found
            for el in results_on_page:
                try:
                    news_results.append(
                        {
                            "link": el.find("a")["href"],
                            "title": el.select_one("div.MBeuO").get_text(),
                            "snippet": el.select_one(".GI74Re").get_text(),
                            "date": el.select_one(".LfVVr").get_text(),
                            "source": el.select_one(".NUnG9d span").get_text(),
                        }
                    )
                except (AttributeError, TypeError, KeyError) as e:
                    # A selector missed (layout change, ad card, etc.):
                    # skip this result and keep going.
                    print(f"Error processing result: {e}")
                    continue
            # Follow pagination: stop when there is no "Next" link.
            if not soup.find("a", id="pnnext"):
                break
            page += 1
        except Exception as e:
            # Network failure or exhausted retries — return what we have.
            print(f"Failed after multiple retries: {e}")
            break
    return news_results