"""Gradio app: search Google CSE for a query, then scrape each result URL
with both Selenium (headless Chrome) and Playwright (headless Chromium)
in parallel, returning truncated page HTML as a JSON report."""

import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import gradio as gr
import requests
from playwright.sync_api import sync_playwright
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Credentials come from the environment; fail fast at import if missing.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
GOOGLE_CSE_ID = os.environ.get("GOOGLE_CSE_ID")

if not GOOGLE_API_KEY or not GOOGLE_CSE_ID:
    raise ValueError("Please set GOOGLE_API_KEY and GOOGLE_CSE_ID in the environment")

# Each scraped page's HTML is truncated to this many characters.
_CONTENT_LIMIT = 1000


def get_google_search_links(query, num=5):
    """Return up to *num* result URLs from the Google Custom Search API.

    FIX: the original interpolated the raw query straight into the URL
    string, which produces a malformed request for queries containing
    spaces, '&', '#', etc.  Passing ``params=`` lets requests URL-encode
    every value correctly.  A timeout is added so a hung API call cannot
    block the Gradio handler forever, and network errors now yield an
    empty list (matching the original's behavior on a non-200 status)
    instead of propagating and crashing the UI callback.

    Args:
        query: Search query text (any characters; encoded automatically).
        num: Maximum number of links to request (API allows 1-10).

    Returns:
        List of result URL strings; empty on any API or network failure.
    """
    try:
        response = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params={
                "q": query,
                "key": GOOGLE_API_KEY,
                "cx": GOOGLE_CSE_ID,
                "num": num,
            },
            timeout=15,
        )
    except requests.RequestException:
        return []
    if response.status_code != 200:
        return []
    items = response.json().get("items", [])
    return [item["link"] for item in items if item.get("link")]


def scrape_with_selenium(url):
    """Fetch *url* with headless Chrome and return its (truncated) HTML.

    FIX: ``driver.quit()`` now runs in a ``finally`` block, so a failed
    page load no longer leaks a Chrome/chromedriver process (the original
    only quit on the success path).

    Returns:
        ``{"url": ..., "content": ...}`` on success,
        ``{"url": ..., "error": ...}`` on failure.
    """
    driver = None
    try:
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        content = driver.page_source
        return {"url": url, "content": content[:_CONTENT_LIMIT]}  # Limit content size
    except Exception as e:
        return {"url": url, "error": str(e)}
    finally:
        if driver is not None:
            driver.quit()


def scrape_with_playwright(url):
    """Fetch *url* with headless Chromium via Playwright.

    A fresh ``sync_playwright()`` is started per call so each worker
    thread owns its driver (the sync API cannot be shared across
    threads).  FIX: ``browser.close()`` now runs in a ``finally`` block
    so a navigation failure no longer leaks a Chromium process.

    Returns:
        ``{"url": ..., "content": ...}`` on success,
        ``{"url": ..., "error": ...}`` on failure.
    """
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(url)
                content = page.content()
            finally:
                browser.close()
        return {"url": url, "content": content[:_CONTENT_LIMIT]}  # Limit content size
    except Exception as e:
        return {"url": url, "error": str(e)}


def parallel_scrape(urls):
    """Scrape every URL with BOTH engines concurrently.

    NOTE(review): each URL is deliberately submitted twice — once to
    Selenium and once to Playwright — so the result list holds
    ``2 * len(urls)`` entries, in completion order. Confirm the duplicate
    scraping is intended rather than an engine-fallback that should pick
    one.

    Args:
        urls: Iterable of URL strings.

    Returns:
        List of per-URL result dicts from both scrapers.
    """
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for url in urls:
            futures.append(executor.submit(scrape_with_selenium, url))
            futures.append(executor.submit(scrape_with_playwright, url))
        # Workers catch their own exceptions, so .result() never raises here.
        return [future.result() for future in as_completed(futures)]


def process_query(query):
    """Gradio callback: search for *query*, scrape hits, return JSON text."""
    links = get_google_search_links(query)
    if not links:
        return json.dumps({"error": "No links found"}, indent=2)
    return json.dumps(parallel_scrape(links), indent=2)


with gr.Blocks() as demo:
    gr.Markdown("## Google CSE Scraper using Selenium & Playwright")
    query_input = gr.Textbox(label="Enter search query")
    output = gr.Textbox(label="Scraped Results", lines=20)
    submit_btn = gr.Button("Search and Scrape")
    submit_btn.click(fn=process_query, inputs=query_input, outputs=output)

if __name__ == "__main__":
    demo.launch()