Dedeep Vasireddy committed on
Commit
7af6504
·
verified ·
1 Parent(s): f87176f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -0
app.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ from concurrent.futures import ThreadPoolExecutor, as_completed
4
+ import json
5
+ import os
6
+
7
+ from selenium import webdriver
8
+ from selenium.webdriver.chrome.options import Options
9
+
10
+ from playwright.sync_api import sync_playwright
11
+
12
# Load required credentials from environment variables.
# GOOGLE_API_KEY: API key for the Google Custom Search JSON API.
# GOOGLE_CSE_ID: identifier of the Programmable Search Engine (cx parameter).
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
GOOGLE_CSE_ID = os.environ.get("GOOGLE_CSE_ID")

# Fail fast at import time: the app cannot search without both values.
if not GOOGLE_API_KEY or not GOOGLE_CSE_ID:
    raise ValueError("Please set GOOGLE_API_KEY and GOOGLE_CSE_ID in the environment")
18
+
19
def get_google_search_links(query, num=5):
    """Return up to *num* result URLs from the Google Custom Search API.

    Args:
        query: Search string; sent URL-encoded via ``params``.
        num: Maximum number of results to request (the API allows 1-10).

    Returns:
        A list of result link strings, empty on any non-200 response
        or when the response carries no items.
    """
    # Let requests URL-encode the parameters. The original f-string
    # interpolation broke on queries containing spaces, '&', '#', etc.
    params = {
        "q": query,
        "key": GOOGLE_API_KEY,
        "cx": GOOGLE_CSE_ID,
        "num": num,
    }
    response = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params=params,
        timeout=15,  # don't hang the UI thread indefinitely on a stalled request
    )
    links = []
    if response.status_code == 200:
        for item in response.json().get("items", []):
            link = item.get("link")
            if link:
                links.append(link)
    return links
30
+
31
def scrape_with_selenium(url):
    """Fetch *url* with headless Chrome via Selenium.

    Args:
        url: Absolute URL to load.

    Returns:
        ``{"url": url, "content": <first 1000 chars of page source>}`` on
        success, or ``{"url": url, "error": <message>}`` on any failure.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = None
    try:
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        content = driver.page_source
        return {"url": url, "content": content[:1000]}  # limit content size
    except Exception as e:
        return {"url": url, "error": str(e)}
    finally:
        # The original only quit the driver on the success path, leaking a
        # Chrome process whenever driver.get() raised. Always clean up.
        if driver is not None:
            driver.quit()
44
+
45
def scrape_with_playwright(url):
    """Fetch *url* with headless Chromium via Playwright's sync API.

    Args:
        url: Absolute URL to load.

    Returns:
        ``{"url": url, "content": <first 1000 chars of page HTML>}`` on
        success, or ``{"url": url, "error": <message>}`` on any failure.
    """
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(url)
                content = page.content()
            finally:
                # The original skipped browser.close() when goto()/content()
                # raised, leaving the Chromium process running. Always close.
                browser.close()
        return {"url": url, "content": content[:1000]}  # limit content size
    except Exception as e:
        return {"url": url, "error": str(e)}
56
+
57
def parallel_scrape(urls):
    """Scrape every URL concurrently with both backends.

    Each URL is submitted twice — once to the Selenium scraper and once to
    the Playwright scraper — so the returned list contains two result dicts
    per input URL, ordered by completion time.
    """
    scrapers = (scrape_with_selenium, scrape_with_playwright)
    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = [pool.submit(fn, target) for target in urls for fn in scrapers]
        return [task.result() for task in as_completed(pending)]
68
+
69
def process_query(query):
    """Run the full pipeline for *query* and return pretty-printed JSON.

    Searches Google CSE for the query, scrapes each result URL with both
    backends, and serializes the scraped records; returns an error object
    as JSON when the search yields no links.
    """
    links = get_google_search_links(query)
    if links:
        return json.dumps(parallel_scrape(links), indent=2)
    return json.dumps({"error": "No links found"}, indent=2)
75
+
76
# --- Gradio UI: a single query box, a results box, and a trigger button. ---
with gr.Blocks() as demo:
    gr.Markdown("## Google CSE Scraper using Selenium & Playwright")
    query_input = gr.Textbox(label="Enter search query")
    output = gr.Textbox(label="Scraped Results", lines=20)
    submit_btn = gr.Button("Search and Scrape")

    # Clicking the button runs the search-and-scrape pipeline and writes
    # the JSON string it returns into the results textbox.
    submit_btn.click(fn=process_query, inputs=query_input, outputs=output)
83
+
84
if __name__ == "__main__":
    # Start the Gradio web server only when executed as a script.
    demo.launch()