Spaces:
Sleeping
Sleeping
Dedeep Vasireddy
committed on
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import requests
|
| 3 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
from selenium import webdriver
|
| 8 |
+
from selenium.webdriver.chrome.options import Options
|
| 9 |
+
|
| 10 |
+
from playwright.sync_api import sync_playwright
|
| 11 |
+
|
| 12 |
+
# Credentials for the Google Custom Search call are read from the process
# environment; os.environ.get yields None for a variable that is not set.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
GOOGLE_CSE_ID = os.environ.get("GOOGLE_CSE_ID")

# Fail fast at import time: both values must be present and non-empty.
if not (GOOGLE_API_KEY and GOOGLE_CSE_ID):
    raise ValueError("Please set GOOGLE_API_KEY and GOOGLE_CSE_ID in the environment")
|
| 18 |
+
|
| 19 |
+
def get_google_search_links(query, num=5, timeout=10):
    """Return the result URLs for *query* from the Google Custom Search API.

    Args:
        query: Free-text search string. It is sent through ``params`` so
            that requests URL-encodes it; characters such as ``&``, ``#``
            or spaces can no longer corrupt or inject query parameters.
        num: Value sent as the ``num`` request parameter.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests.get`` may block indefinitely.

    Returns:
        A list with the ``link`` value of every returned item that has one.
        The list is empty when the response status is not 200 or the
        response contains no ``items``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out (not caught here, as in the original code).
    """
    response = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params={
            "q": query,
            "key": GOOGLE_API_KEY,
            "cx": GOOGLE_CSE_ID,
            "num": num,
        },
        timeout=timeout,
    )
    if response.status_code != 200:
        return []

    items = response.json().get("items", [])
    # Skip items whose "link" is missing or empty.
    return [item["link"] for item in items if item.get("link")]
|
| 30 |
+
|
| 31 |
+
def scrape_with_selenium(url):
    """Fetch *url* with headless Chrome via Selenium.

    Args:
        url: Address of the page to load.

    Returns:
        ``{"url": url, "content": <first 1000 characters of the page source>}``
        on success, or ``{"url": url, "error": <exception text>}`` if any
        step raises. Exceptions are never propagated to the caller.
    """
    try:
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        driver = webdriver.Chrome(options=options)
        try:
            driver.get(url)
            content = driver.page_source
        finally:
            # Always shut the browser down, even when navigation fails;
            # otherwise every failed URL leaks a Chrome process.
            driver.quit()
        return {"url": url, "content": content[:1000]}  # Limit content size
    except Exception as e:
        return {"url": url, "error": str(e)}
|
| 44 |
+
|
| 45 |
+
def scrape_with_playwright(url):
    """Fetch *url* with headless Chromium via Playwright's sync API.

    Returns ``{"url": url, "content": ...}`` holding the first 1000
    characters of the page HTML, or ``{"url": url, "error": ...}`` with the
    exception text if anything raises along the way.
    """
    try:
        with sync_playwright() as playwright:
            chromium = playwright.chromium.launch(headless=True)
            tab = chromium.new_page()
            tab.goto(url)
            html = tab.content()
            chromium.close()
    except Exception as exc:
        return {"url": url, "error": str(exc)}
    # Keep only the head of the document to limit content size.
    return {"url": url, "content": html[:1000]}
|
| 56 |
+
|
| 57 |
+
def parallel_scrape(urls):
    """Scrape every URL with both scrapers on a pool of up to 10 threads.

    For each entry of *urls*, one Selenium task and one Playwright task are
    submitted (in that order). The returned list holds each task's result
    in completion order, so it is not aligned with the order of *urls*.
    """
    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = [
            pool.submit(scraper, target)
            for target in urls
            for scraper in (scrape_with_selenium, scrape_with_playwright)
        ]
        return [finished.result() for finished in as_completed(pending)]
|
| 68 |
+
|
| 69 |
+
def process_query(query):
    """Search for *query*, scrape the hits, and return the outcome as JSON.

    The return value is a JSON string indented by two spaces: the list
    produced by ``parallel_scrape`` when the search yields links, otherwise
    an object with a single ``"error"`` key.
    """
    found = get_google_search_links(query)
    payload = parallel_scrape(found) if found else {"error": "No links found"}
    return json.dumps(payload, indent=2)
|
| 75 |
+
|
| 76 |
+
# Gradio UI: a heading, a query box, a 20-line results box and a button,
# created in this order inside a single Blocks container.
with gr.Blocks() as demo:
    gr.Markdown("## Google CSE Scraper using Selenium & Playwright")
    query_input = gr.Textbox(label="Enter search query")
    output = gr.Textbox(lines=20, label="Scraped Results")
    submit_btn = gr.Button("Search and Scrape")

    # A click passes the query text to process_query and writes the JSON
    # string it returns into the results box.
    submit_btn.click(process_query, inputs=[query_input], outputs=[output])

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|