nakas commited on
Commit
57dd157
·
verified ·
1 Parent(s): 13b5c09

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -0
app.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from playwright.sync_api import sync_playwright
3
+ import time
4
+ import json
5
+
6
def scrape_website(url, wait_time=5):
    """
    Scrape a website using a Playwright headless browser.

    Args:
        url (str): The URL to scrape.
        wait_time (int | float): Extra seconds to wait after the network
            goes idle, so client-side scripts can render dynamic content.

    Returns:
        dict: On success: "title", "meta_description", "text_content"
            (truncated to 1000 characters), "links" (first 10),
            "images" (first 5), and "status" == "Success".
            On failure: "status" == "Error" and "error_message".
    """
    try:
        with sync_playwright() as p:
            # Launch browser in headless mode
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()

                # Go to URL and wait for network to be idle, then give
                # dynamic content extra time to render.
                page.goto(url, wait_until="networkidle")
                time.sleep(wait_time)

                # Get basic page information
                title = page.title()

                # text_content() can return None for an empty match, which
                # would crash the slicing below — normalise to a string.
                text_content = page.text_content('body') or ''

                # Extract all link and image URLs via in-page JavaScript.
                links = page.eval_on_selector_all(
                    'a[href]', 'elements => elements.map(el => el.href)')
                images = page.eval_on_selector_all(
                    'img[src]', 'elements => elements.map(el => el.src)')

                # Meta description is optional; query once and read the
                # attribute instead of querying the selector twice.
                meta_el = page.query_selector('meta[name="description"]')
                meta_description = (meta_el.get_attribute('content') or '') if meta_el else ''
            finally:
                # Always release the browser, even if scraping fails mid-way.
                browser.close()

        return {
            "title": title,
            "meta_description": meta_description,
            # Truncate for display; only append an ellipsis when we cut.
            "text_content": text_content[:1000] + ("..." if len(text_content) > 1000 else ""),
            "links": links[:10],    # Show first 10 links
            "images": images[:5],   # Show first 5 images
            "status": "Success"
        }

    except Exception as e:
        # Surface the failure to the UI instead of crashing the app.
        return {
            "status": "Error",
            "error_message": str(e)
        }
58
+
59
def format_output(result):
    """
    Format scrape results as Markdown for the Gradio output pane.

    Args:
        result (dict): Output of ``scrape_website`` — either a success
            dict ("title", "meta_description", "text_content", "links",
            "images", "status") or an error dict ("status",
            "error_message").

    Returns:
        str: Markdown text, or a plain "Error: ..." line on failure.
    """
    if result["status"] == "Error":
        return f"Error: {result['error_message']}"

    # Keep every line at column 0: indented lines inside the f-string
    # would be rendered as a Markdown code block, breaking the headers.
    return (
        "### Page Title\n"
        f"{result['title']}\n"
        "\n"
        "### Meta Description\n"
        f"{result['meta_description']}\n"
        "\n"
        "### First 1000 characters of content\n"
        f"{result['text_content']}\n"
        "\n"
        "### First 10 Links\n"
        f"{json.dumps(result['links'], indent=2)}\n"
        "\n"
        "### First 5 Images\n"
        f"{json.dumps(result['images'], indent=2)}\n"
    )
81
+
82
# Create Gradio interface: a URL textbox plus a wait-time slider feeding
# the scraper, with the formatted Markdown shown as the output.
iface = gr.Interface(
    # Compose the two helpers: scrape first, then render as Markdown.
    fn=lambda url, wait_time: format_output(scrape_website(url, wait_time)),
    inputs=[
        gr.Textbox(label="URL to scrape", placeholder="https://example.com"),
        # Extra wait (1-15 s) for pages that render content client-side.
        gr.Slider(minimum=1, maximum=15, value=5, step=1, label="Wait time (seconds)")
    ],
    outputs=gr.Markdown(),
    title="Web Scraper with Headless Browser",
    description="""
    Enter a URL to scrape its content using a headless browser.
    The tool will extract the title, meta description, text content, links, and images.
    Please use responsibly and respect websites' terms of service and robots.txt files.
    """,
    # Clickable example inputs shown below the form.
    examples=[
        ["https://example.com", 5],
        ["https://news.ycombinator.com", 8]
    ]
)

# Launch the interface only when run as a script (not on import).
if __name__ == "__main__":
    iface.launch()