Spaces:

WD101
/

OneServerToRuleThemAll

Runtime error

App Files Community

etukurudinesh committed on Jun 8, 2025

Commit

200310b

1 Parent(s): d298527

docstrings

Browse files

Files changed (1) hide show

server.py +35 -0

server.py CHANGED Viewed

@@ -21,6 +21,41 @@ async def scrape_async(url):
     }
 def scrape(url):
     return asyncio.run(scrape_async(url))
 with gr.Blocks(title="MCP Web Scraper") as demo:

     }
 def scrape(url):
+    """
+    Synchronously scrapes a webpage using Playwright, extracts content, formats it as JSON
+    with a specific structure, and stores the result in a MongoDB database.
+    Args:
+        url (str): The URL of the webpage to scrape (e.g., 'https://example.com').
+    Returns:
+        dict: A JSON-compatible dictionary containing the scraped content in the following format:
+            - URL (str): The scraped webpage URL.
+            - Title (str): The title of the webpage.
+            - Text Length (int): The length of the extracted text.
+            - Headings (list): Key headings extracted from the webpage.
+            - Main Topics (list): Main topics identified in the content.
+            - Summary (Short) (str): A short summary of the text, truncated to 800 characters with '...' appended.
+    Notes:
+        - Utilizes Playwright for browser automation to fetch and render the webpage.
+        - The scraped data is processed into a structured JSON format suitable for LLM processing.
+        - The resulting JSON is stored in a MongoDB collection for persistence.
+        - This function wraps an asynchronous `scrape_async` function and runs it synchronously
+          using `asyncio.run`.
+    Example:
+        >>> result = scrape("https://example.com")
+        >>> print(result)
+        {
+            "URL": "https://example.com",
+            "Title": "Example Page",
+            "Text Length": 1234,
+            "Headings": ["Heading 1", "Heading 2"],
+            "Main Topics": ["Topic 1", "Topic 2"],
+            "Summary (Short)": "This is a summary of the webpage content..."
+        }
+    """
     return asyncio.run(scrape_async(url))
 with gr.Blocks(title="MCP Web Scraper") as demo: