etukurudinesh committed on
Commit
200310b
·
1 Parent(s): d298527

docstrings

Browse files
Files changed (1) hide show
  1. server.py +35 -0
server.py CHANGED
@@ -21,6 +21,41 @@ async def scrape_async(url):
21
  }
22
 
23
  def scrape(url):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  return asyncio.run(scrape_async(url))
25
 
26
  with gr.Blocks(title="MCP Web Scraper") as demo:
 
21
  }
22
 
23
  def scrape(url):
24
+ """
25
+ Synchronously scrapes a webpage using Playwright, extracts content, formats it as JSON
26
+ with a specific structure, and stores the result in a MongoDB database.
27
+
28
+ Args:
29
+ url (str): The URL of the webpage to scrape (e.g., 'https://example.com').
30
+
31
+ Returns:
32
+ dict: A JSON-compatible dictionary containing the scraped content in the following format:
33
+ - URL (str): The scraped webpage URL.
34
+ - Title (str): The title of the webpage.
35
+ - Text Length (int): The length of the extracted text.
36
+ - Headings (list): Key headings extracted from the webpage.
37
+ - Main Topics (list): Main topics identified in the content.
38
+ - Summary (Short) (str): A short summary of the text, truncated to 800 characters with '...' appended.
39
+
40
+ Notes:
41
+ - Utilizes Playwright for browser automation to fetch and render the webpage.
42
+ - The scraped data is processed into a structured JSON format suitable for LLM processing.
43
+ - The resulting JSON is stored in a MongoDB collection for persistence.
44
+ - This function wraps an asynchronous `scrape_async` function and runs it synchronously
45
+ using `asyncio.run`.
46
+
47
+ Example:
48
+ >>> result = scrape("https://example.com")
49
+ >>> import json; print(json.dumps(result, indent=2))
50
+ {
51
+ "URL": "https://example.com",
52
+ "Title": "Example Page",
53
+ "Text Length": 1234,
54
+ "Headings": ["Heading 1", "Heading 2"],
55
+ "Main Topics": ["Topic 1", "Topic 2"],
56
+ "Summary (Short)": "This is a summary of the webpage content..."
57
+ }
58
+ """
59
  return asyncio.run(scrape_async(url))
60
 
61
  with gr.Blocks(title="MCP Web Scraper") as demo: