Spaces:

nakas
/

TimberLine_Snow_History

Sleeping

App Files Files Community

nakas committed on Feb 10, 2025

Commit

57dd157

verified ·

1 Parent(s): 13b5c09

Create app.py

Browse files

Files changed (1) hide show

app.py +104 -0

app.py ADDED Viewed

	@@ -0,0 +1,104 @@

+import gradio as gr
+from playwright.sync_api import sync_playwright
+import time
+import json
def scrape_website(url, wait_time=5):
    """
    Scrape a website using a Playwright headless Chromium browser.

    Args:
        url (str): The URL to scrape. Surrounding whitespace is ignored.
        wait_time (int | float): Seconds to keep waiting after the network
            has gone idle, so late-rendering dynamic content can load.

    Returns:
        dict: On success, a dictionary with the keys ``title``,
        ``meta_description``, ``text_content`` (at most 1000 characters,
        followed by ``"..."`` only when it was actually cut), ``links``
        (first 10), ``images`` (first 5) and ``status`` set to ``"Success"``.
        On failure, a dictionary with ``status`` set to ``"Error"`` and an
        ``error_message`` string. This function does not raise.
    """
    max_text_chars = 1000
    max_links = 10
    max_images = 5

    # Reject an empty/blank URL up front instead of starting a browser
    # just to have the navigation fail.
    if not url or not str(url).strip():
        return {
            "status": "Error",
            "error_message": "No URL provided."
        }
    url = str(url).strip()

    # The broad except is deliberate: this is the boundary with the UI,
    # which shows the failure as text rather than crashing the app.
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                # Navigate and wait until the network is idle.
                page.goto(url, wait_until="networkidle")
                # Use Playwright's own timer (milliseconds) rather than
                # time.sleep so the sync API keeps processing events.
                page.wait_for_timeout(wait_time * 1000)

                title = page.title()
                # text_content() may return None; normalise to a string.
                text_content = page.text_content('body') or ""
                links = page.eval_on_selector_all(
                    'a[href]', 'elements => elements.map(el => el.href)')
                images = page.eval_on_selector_all(
                    'img[src]', 'elements => elements.map(el => el.src)')

                # Single lookup: the tag is optional, and so is its
                # content attribute.
                meta_tag = page.query_selector('meta[name="description"]')
                meta_description = ''
                if meta_tag is not None:
                    meta_description = meta_tag.get_attribute('content') or ''
            finally:
                # Always release the browser, even when a step above failed.
                browser.close()

        # Only mark the text as truncated when something was cut off.
        if len(text_content) > max_text_chars:
            text_content = text_content[:max_text_chars] + "..."

        return {
            "title": title,
            "meta_description": meta_description,
            "text_content": text_content,
            "links": links[:max_links],
            "images": images[:max_images],
            "status": "Success"
        }
    except Exception as e:
        return {
            "status": "Error",
            "error_message": str(e)
        }
def format_output(result):
    """
    Render a scrape result dictionary as Markdown text for Gradio.

    Args:
        result (dict): A dictionary with a ``status`` key. When the status
            is ``"Error"`` it must also carry ``error_message``; otherwise
            it must carry ``title``, ``meta_description``, ``text_content``,
            ``links`` and ``images``.

    Returns:
        str: ``"Error: <message>"`` for an error result, otherwise a
        Markdown document with one ``###`` heading per field; links and
        images are rendered as indented JSON.
    """
    if result["status"] == "Error":
        return "Error: {}".format(result["error_message"])

    # (heading, body) pairs, in the order they appear in the output.
    sections = (
        ("### Page Title", result["title"]),
        ("### Meta Description", result["meta_description"]),
        ("### First 1000 characters of content", result["text_content"]),
        ("### First 10 Links", json.dumps(result["links"], indent=2)),
        ("### First 5 Images", json.dumps(result["images"], indent=2)),
    )

    # The document starts with an empty line and ends with a line holding
    # four spaces, so the joined text has a leading and trailing newline.
    pieces = [""]
    for heading, body in sections:
        pieces.append(heading)
        pieces.append(f"{body}")
    pieces.append("    ")
    return "\n".join(pieces)
# Build the Gradio UI: a URL box and a wait-time slider feed the scraper,
# and the formatted result is shown as Markdown.
url_input = gr.Textbox(label="URL to scrape", placeholder="https://example.com")
wait_input = gr.Slider(minimum=1, maximum=15, value=5, step=1, label="Wait time (seconds)")
markdown_output = gr.Markdown()

# Text shown under the title of the app.
app_description = """
    Enter a URL to scrape its content using a headless browser.
    The tool will extract the title, meta description, text content, links, and images.
    Please use responsibly and respect websites' terms of service and robots.txt files.
    """

# Sample (url, wait_time) rows offered as examples in the UI.
example_rows = [
    ["https://example.com", 5],
    ["https://news.ycombinator.com", 8]
]

iface = gr.Interface(
    fn=lambda url, wait_time: format_output(scrape_website(url, wait_time)),
    inputs=[url_input, wait_input],
    outputs=markdown_output,
    title="Web Scraper with Headless Browser",
    description=app_description,
    examples=example_rows
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()