"""Search Google for a topic via SerpApi and scrape each result page to JSON."""

import json
import time

import requests
from bs4 import BeautifulSoup
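
# NOTE: a hardcoded API key ships with the source; in practice this is usually
# loaded from an environment variable (e.g. os.environ["SERPAPI_KEY"]).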
SERPAPI_KEY = "c3e3e8fd8d12ca55d8a8954a14bf827f2d4261ef55373b381661f23b1440a2af"


def google_search(query, num_results=100):
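    """Return organic result links for `query` from SerpApi's Google engine."""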
    url = "https://serpapi.com/search"
    params = {
        "engine": "google",
        "q": query,
        "api_key": SERPAPI_KEY,
        "num": num_results,  # SerpApi's Google engine returns at most 100 results per page
    }
    resp = requests.get(url, params=params)
    resp.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error body
    data = resp.json()
    links = []
    for result in data.get("organic_results", []):
        link = result.get("link")
        if link:
            links.append(link)
    return links


def scrape_page(url):
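    """Download `url` and return its visible text, truncated to 20,000 characters."""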
    try:
        resp = requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(resp.content, "html.parser")
        text = soup.get_text(separator="\n", strip=True)
        return text[:20000]
    except Exception as e:
        return f"[SCRAPE ERROR] {e}"


def scrape_topic_and_save(topic, filename="results.json"):
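    """Search for `topic`, scrape every result page, and write the output to JSON."""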
    links = google_search(topic)
    results = []
    for url in links:
        print(f"Scraping: {url}")
        content = scrape_page(url)
        results.append({"url": url, "content": content})
        time.sleep(1)  # pause between requests so target servers aren't hammered
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"Saved {len(results)} results to {filename}")


if __name__ == "__main__":
    topic = input("Enter topic to search: ")
    scrape_topic_and_save(topic)