# WebKnitter / app.py
# Author: Dedeep Vasireddy
# Commit: "Create app.py" (7af6504, verified)
import gradio as gr
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from playwright.sync_api import sync_playwright
# Credentials come from the environment so they never land in source control.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
GOOGLE_CSE_ID = os.environ.get("GOOGLE_CSE_ID")

# Fail fast at import time when either credential is missing.
if not (GOOGLE_API_KEY and GOOGLE_CSE_ID):
    raise ValueError("Please set GOOGLE_API_KEY and GOOGLE_CSE_ID in the environment")
def get_google_search_links(query, num=5):
    """Return up to *num* result URLs from the Google Custom Search API.

    Args:
        query: Free-text search query.
        num: Maximum number of results to request (the API caps this at 10).

    Returns:
        A list of result URLs; empty when the request fails, times out, or
        yields no items.
    """
    # Pass the query via ``params`` so requests URL-encodes it — the original
    # f-string interpolation produced a malformed request URL for queries
    # containing spaces, '&', '#', etc.
    params = {
        "q": query,
        "key": GOOGLE_API_KEY,
        "cx": GOOGLE_CSE_ID,
        "num": num,
    }
    links = []
    try:
        # A timeout keeps the Gradio handler from hanging forever when the
        # API endpoint is slow or unreachable.
        response = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params=params,
            timeout=10,
        )
    except requests.RequestException:
        return links  # best-effort: a network failure just yields no links
    if response.status_code == 200:
        for item in response.json().get('items', []):
            link = item.get('link')
            if link:
                links.append(link)
    return links
def scrape_with_selenium(url):
    """Fetch *url* with headless Chrome and return its truncated page source.

    Args:
        url: The page to load.

    Returns:
        ``{"url": ..., "content": ...}`` (content capped at 1000 chars) on
        success, or ``{"url": ..., "error": ...}`` on any failure.
    """
    driver = None
    try:
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")  # needed in containerized hosts
        options.add_argument("--disable-dev-shm-usage")
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        content = driver.page_source
        return {"url": url, "content": content[:1000]}  # Limit content size
    except Exception as e:
        return {"url": url, "error": str(e)}
    finally:
        # The original only called quit() on the success path, leaking a
        # headless Chrome process every time driver.get() raised.
        if driver is not None:
            driver.quit()
def scrape_with_playwright(url):
    """Fetch *url* with headless Chromium (Playwright) and return truncated HTML.

    Args:
        url: The page to load.

    Returns:
        ``{"url": ..., "content": ...}`` (content capped at 1000 chars) on
        success, or ``{"url": ..., "error": ...}`` on any failure.
    """
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(url)
                content = page.content()
            finally:
                # The original skipped close() when page.goto() raised; close
                # explicitly so the browser doesn't outlive the request.
                browser.close()
        return {"url": url, "content": content[:1000]}  # Limit content size
    except Exception as e:
        return {"url": url, "error": str(e)}
def parallel_scrape(urls):
    """Scrape every URL concurrently with both Selenium and Playwright.

    Each URL is submitted twice — once per scraping backend — so the returned
    list holds two result dicts per input URL, in completion order.
    """
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(scraper, url)
            for url in urls
            for scraper in (scrape_with_selenium, scrape_with_playwright)
        ]
        return [done.result() for done in as_completed(futures)]
def process_query(query):
    """Search Google CSE for *query*, scrape the hits, and return a JSON string."""
    links = get_google_search_links(query)
    if links:
        return json.dumps(parallel_scrape(links), indent=2)
    return json.dumps({"error": "No links found"}, indent=2)
# --- Gradio UI: one query box in, scraped JSON out -------------------------
with gr.Blocks() as demo:
    gr.Markdown("## Google CSE Scraper using Selenium & Playwright")
    search_box = gr.Textbox(label="Enter search query")
    results_box = gr.Textbox(label="Scraped Results", lines=20)
    scrape_button = gr.Button("Search and Scrape")
    # Wire the button to the search-and-scrape pipeline.
    scrape_button.click(fn=process_query, inputs=search_box, outputs=results_box)

if __name__ == "__main__":
    demo.launch()