# Hugging Face Space: Google CSE search-and-scrape demo (Selenium + Playwright)
"""Gradio app: Google Custom Search + parallel page scraping."""

# Standard library
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

# Third-party
import gradio as gr
import requests
from playwright.sync_api import sync_playwright
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Google Custom Search JSON API credentials, supplied via environment variables.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
GOOGLE_CSE_ID = os.environ.get("GOOGLE_CSE_ID")

# Fail fast at import time when credentials are missing.
if not GOOGLE_API_KEY or not GOOGLE_CSE_ID:
    raise ValueError("Please set GOOGLE_API_KEY and GOOGLE_CSE_ID in the environment")
def get_google_search_links(query, num=5):
    """Return up to ``num`` result URLs from the Google Custom Search API.

    Args:
        query: Raw search terms; passed via ``params=`` so requests
            URL-encodes them correctly.
        num: Maximum number of results to request (the API caps this at 10).

    Returns:
        A list of result URLs; empty on a non-200 response or no hits.
    """
    # Use `params=` instead of f-string interpolation: the original built the
    # URL by hand, so queries containing spaces, '&', '#', or '+' were sent
    # un-encoded and silently corrupted the request.
    response = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params={
            "q": query,
            "key": GOOGLE_API_KEY,
            "cx": GOOGLE_CSE_ID,
            "num": num,
        },
        timeout=10,  # never let a stalled API call hang the UI thread
    )
    links = []
    if response.status_code == 200:
        for item in response.json().get("items", []):
            link = item.get("link")
            if link:
                links.append(link)
    return links
def scrape_with_selenium(url):
    """Fetch ``url`` with headless Chrome via Selenium.

    Returns:
        ``{"url": ..., "content": <first 1000 chars of page source>}`` on
        success, or ``{"url": ..., "error": <message>}`` on any failure.
    """
    try:
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        driver = webdriver.Chrome(options=options)
        try:
            driver.get(url)
            content = driver.page_source
        finally:
            # Always release the browser process: the original only called
            # quit() on the success path, leaking a Chrome instance whenever
            # driver.get() raised (timeout, DNS failure, etc.).
            driver.quit()
        return {"url": url, "content": content[:1000]}  # Limit content size
    except Exception as e:
        return {"url": url, "error": str(e)}
def scrape_with_playwright(url):
    """Fetch ``url`` with headless Chromium via Playwright.

    Returns:
        ``{"url": ..., "content": <first 1000 chars of page HTML>}`` on
        success, or ``{"url": ..., "error": <message>}`` on any failure.
    """
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(url)
                content = page.content()
            finally:
                # Close the browser even when goto()/content() raises; the
                # original skipped close() on error and relied on the
                # sync_playwright teardown to reap the process.
                browser.close()
        return {"url": url, "content": content[:1000]}  # Limit content size
    except Exception as e:
        return {"url": url, "error": str(e)}
def parallel_scrape(urls):
    """Scrape every URL concurrently with BOTH engines.

    Each URL is fetched twice — once through Selenium and once through
    Playwright — and the per-fetch result dicts are collected in whatever
    order the fetches complete.
    """
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Fan out: one future per (url, engine) pair, Selenium first for
        # each URL, matching the original submission order.
        futures = [
            executor.submit(scraper, url)
            for url in urls
            for scraper in (scrape_with_selenium, scrape_with_playwright)
        ]
        return [future.result() for future in as_completed(futures)]
def process_query(query):
    """Search Google for ``query``, scrape the hits, and return pretty JSON.

    Returns an indented JSON string: either the list of per-fetch result
    dicts, or ``{"error": "No links found"}`` when the search yields nothing.
    """
    links = get_google_search_links(query)
    payload = {"error": "No links found"} if not links else parallel_scrape(links)
    return json.dumps(payload, indent=2)
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## Google CSE Scraper using Selenium & Playwright")
    query_input = gr.Textbox(label="Enter search query")
    output = gr.Textbox(label="Scraped Results", lines=20)
    submit_btn = gr.Button("Search and Scrape")
    # Wire the button: search, scrape in parallel, show the JSON result.
    submit_btn.click(fn=process_query, inputs=query_input, outputs=output)

if __name__ == "__main__":
    demo.launch()