File size: 2,872 Bytes
feea636
 
 
f96237c
feea636
 
 
 
 
 
e8d6ee4
 
 
feea636
 
 
 
 
 
 
 
 
 
200310b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
feea636
 
 
 
 
 
 
 
 
 
 
8b441ff
269dc01
2c7c6e8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import gradio as gr
import asyncio
from main import WebScrapingOrchestrator
import os

# Single shared orchestrator instance, reused for every scrape request.
# NOTE(review): assumed safe to share across requests — confirm that
# WebScrapingOrchestrator.process_url holds no per-call mutable state.
orchestrator = WebScrapingOrchestrator()

async def scrape_async(url):
    """Scrape *url* via the shared orchestrator and shape the result for display.

    Args:
        url (str): The webpage URL to scrape.

    Returns:
        dict: ``{"Error": <message>}`` when the orchestrator reports a
        failure, otherwise a flat dict of scraped fields suitable for a
        JSON viewer (URL, title, text length, headings, topics, summary).
    """
    result = await orchestrator.process_url(url)
    if "error" in result:
        # Bug fix: the original wrapped the message in `{...}`, which is a
        # *set* literal in Python — sets are not JSON-serializable, so the
        # error path would break the gr.JSON output component.
        return {"Error": result["error"]}

    summary = result["llm_ready_data"]["text_summary"]
    # Only append an ellipsis when the summary was actually truncated.
    short_summary = summary[:800] + "..." if len(summary) > 800 else summary
    return {
        "URL": result.get("url"),
        "Title": result.get("title"),
        "Text Length": result["summary"]["text_length"],
        "Headings": result["llm_ready_data"]["key_headings"],
        "Main Topics": result["llm_ready_data"]["main_topics"],
        "Summary (Short)": short_summary,
    }

def scrape(url):
    """Synchronous entry point used by the Gradio UI.

    Runs the ``scrape_async`` coroutine to completion on a fresh event
    loop (``asyncio.run``) and returns its JSON-compatible dictionary.

    Args:
        url (str): The URL of the webpage to scrape
            (e.g. ``'https://example.com'``).

    Returns:
        dict: A JSON-compatible dictionary with the keys:
            - ``URL`` (str): The scraped webpage URL.
            - ``Title`` (str): The page title.
            - ``Text Length`` (int): Length of the extracted text.
            - ``Headings`` (list): Key headings found on the page.
            - ``Main Topics`` (list): Main topics identified in the content.
            - ``Summary (Short)`` (str): A short summary of the page text.
        On failure, a dict with a single ``Error`` entry instead.

    Notes:
        - NOTE(review): the original docstring stated that scraping uses
          Playwright and that results are persisted to MongoDB — that
          happens (if at all) inside ``WebScrapingOrchestrator`` in
          ``main.py`` and cannot be confirmed from this file; verify there.
        - ``asyncio.run`` creates and tears down an event loop per call,
          which is fine for a one-shot UI callback.

    Example:
        >>> scrape("https://example.com")  # doctest: +SKIP
        {'URL': 'https://example.com', 'Title': 'Example Page', ...}
    """
    coroutine = scrape_async(url)
    return asyncio.run(coroutine)

# --- Gradio UI layout: one URL input, one JSON output, one trigger button.
with gr.Blocks(title="MCP Web Scraper") as demo:
    gr.Markdown("### 🔍 MCP LLM Web Scraper")

    page_url = gr.Textbox(label="Enter a webpage URL", placeholder="https://...")
    scraped_view = gr.JSON(label="Scraped & LLM-ready Content")
    run_button = gr.Button("Scrape Page")

    # Wire the button to the synchronous scrape() wrapper.
    run_button.click(scrape, inputs=page_url, outputs=scraped_view)

if __name__ == "__main__":
    # Alternative local-network launch kept for reference (disabled):
    #os.environ['no_proxy'] = 'localhost, 127.0.0.1, ::1'
    #demo.launch(server_name="0.0.0.0", server_port=7860)
    # Launch the Gradio app with its MCP server endpoint enabled.
    demo.launch(mcp_server=True)