# NOTE(review): the three lines that were here ("Spaces:" / "Runtime error" /
# "Runtime error") were non-code paste residue from the page this file was
# copied from, not Python — converted to a comment so the module parses.
import os
from typing import TypedDict
from urllib.parse import parse_qs, quote_plus, urlparse

import requests
from bs4 import BeautifulSoup
# Feature flag: when the SIMULATE_BROWSER_SEARCH env var is the literal string
# "true", requests carry browser-like headers and the "full" Google markup
# selector set is used; any other value (or absence) disables simulation.
SIMULATE_BROWSER = os.environ.get("SIMULATE_BROWSER_SEARCH") == "true"
class SelectorsDict(TypedDict, total=False):
    """CSS selectors for the pieces scraped out of a Google results page.

    ``total=False`` because the selector sets are not uniform: the
    non-browser set defines no "answer" selector, and callers probe for it
    with ``SELECTORS.get("answer")``.  (The original declaration required
    every key, which the non-browser literal did not satisfy.)
    """

    # Featured/verified answer headline.
    answer: str
    # Supporting description under the answer.
    answer_desc: str
    # Citation/date element inside the answer description.
    answer_citation: str
    # One selector per organic search-result container.
    search_results: str
    # Description snippet paired with each search result.
    search_results_desc: str
# SELECTORS maps logical page regions to CSS selectors.  Google serves very
# different markup depending on whether the request looks like it came from a
# real browser, so a selector set is chosen once, at import time.
SELECTORS: SelectorsDict
if SIMULATE_BROWSER:
    # Markup served to full (JavaScript-capable) browsers.
    SELECTORS = {
        "answer": ".IZ6rdc",
        "answer_desc": ".LGOjhe",
        "answer_citation": ".kX21rb.ZYHQ7e",
        "search_results": ".Ww4FFb",
        "search_results_desc": ".VwiC3b.yXK7lf",
    }
else:
    # Markup served to non-browser clients.  There is deliberately no
    # "answer" entry here — consumers probe with SELECTORS.get("answer").
    SELECTORS = {
        "answer_desc": "div.PqksIc",
        "answer_citation": "sub.gMUaMb.r0bn4c.rQMQod",
        "search_results": "div.egMi0.kCrYT",
        "search_results_desc": "div.BNeawe.s3v9rd.AP7Wnd .BNeawe.s3v9rd.AP7Wnd:last-child",
    }
def scrapeGoogleSearch(query):
    """Scrape a Google results page for *query* and format it as text.

    Returns a newline-joined string containing an optional "Verified Answer"
    section (when the page exposes one) followed by up to ten results, each
    as Title / Description / URL lines.  On fetch or parse failure a message
    is printed and whatever was collected so far (possibly "") is returned.
    """
    final_response = []
    # Browser-like headers; only sent when SIMULATE_BROWSER is on, so the
    # markup Google serves matches the selector set chosen at import time.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.google.com/",
        "DNT": "1",  # Do Not Track Request Header
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
    # quote_plus: the raw query may contain spaces, '&', '#', '+', etc.,
    # which would otherwise corrupt the request URL.
    search_url = f"https://www.google.com/search?q={quote_plus(query)}"
    # Use a session to maintain cookies across any redirects.
    with requests.Session() as session:
        if SIMULATE_BROWSER:
            session.headers.update(headers)
        # Timeout prevents the scraper hanging forever on a stalled socket.
        response = session.get(search_url, timeout=30)
        if response.status_code != 200:
            print("Failed to retrieve search results.")
            return "\n".join(final_response)
        soup = BeautifulSoup(response.text, "html.parser")
        # Debug artifact: dump the prettified page for selector tuning.
        with open("soup_dump.html", "w", encoding="utf-8") as file:
            file.write(soup.prettify())
        main_div = soup.find("div", attrs={"id": "main"})
        if main_div is None:
            # Google sometimes serves pages without an id="main" wrapper
            # (e.g. a CAPTCHA/consent page); previously this crashed with
            # AttributeError on the next line.
            print("Failed to retrieve search results.")
            return "\n".join(final_response)
        answer_text = ""
        if SELECTORS.get("answer"):
            main_answer_div = main_div.select_one(SELECTORS["answer"])
            if main_answer_div:
                answer_text = f"**{main_answer_div.text.strip()}**. "
        answer_desc_div = main_div.select_one(SELECTORS["answer_desc"])
        if answer_desc_div:
            citation_div = answer_desc_div.select_one(SELECTORS["answer_citation"])
            citation_date = citation_div.text if citation_div else ""
            # Strip the citation date out of the description body so it only
            # appears in the dedicated citation line below.
            answer_text += answer_desc_div.text.replace(citation_date, "").strip()
            citation_text = f"Citation Date: {citation_date}" if citation_date else ""
            final_response.append(f"Verified Answer:\n{answer_text}\n{citation_text}\n\n\n")
        results = main_div.select(SELECTORS["search_results"])
        results_desc = main_div.select(SELECTORS["search_results_desc"])
        # Pad with None so every result indexes a (possibly absent) description.
        results_desc += [None] * (len(results) - len(results_desc))
        if results:
            final_response.append("Search Results:\n")
            for i, result in enumerate(results[:10]):
                title_tag = result.find("h3")
                link_tag = result.find("a")
                if title_tag is None or link_tag is None:
                    # Ad/layout nodes matched by the selector can lack a real
                    # title or link; previously these raised AttributeError.
                    continue
                title = title_tag.text
                link = link_tag["href"]
                if not SIMULATE_BROWSER:
                    # Non-browser results wrap the target as /url?q=<real>&...;
                    # unwrap the real destination from the q parameter.
                    link = parse_qs(urlparse(link).query).get("q", [None])[0]
                desc = results_desc[i].text if results_desc[i] else ""
                final_response.append(f"Title: {title}")
                final_response.append(f"Description: {desc}")
                final_response.append(f"URL: {link}\n")
    return "\n".join(final_response)