CultriX committed on
Commit
23a446a
·
verified ·
1 Parent(s): 2893e36

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -127
app.py CHANGED
@@ -2,13 +2,12 @@
2
  """
3
  RAG-Ready Content Scraper — Gradio + MCP (SSE)
4
 
5
- This Gradio app runs on Hugging Face Spaces and automatically exposes an MCP SSE
6
- endpoint at `/gradio_api/mcp/sse` (when the environment supports it).
7
 
8
- MCP SSE client config examples:
9
 
10
  1) Direct SSE (Cursor, Windsurf, Cline, etc.)
11
-
12
  {
13
  "mcpServers": {
14
  "gradio": {
@@ -17,8 +16,7 @@ MCP SSE client config examples:
17
  }
18
  }
19
 
20
- 2) Experimental stdio via Node (for clients that only support stdio):
21
-
22
  {
23
  "mcpServers": {
24
  "gradio": {
@@ -57,18 +55,19 @@ from rag_scraper.utils import URLUtils
57
 
58
 
59
  # -----------------------------
60
- # Utility / helper functions
61
  # -----------------------------
62
 
63
  def is_github_repo(url_or_id: str) -> bool:
64
- """Return True if the string looks like a GitHub repository reference.
65
-
66
- Args:
67
- url_or_id: Either a full GitHub URL (containing 'github.com') or an
68
- "owner/repo" identifier (alphanumeric/._-).
69
-
70
- Returns:
71
- bool: True if value matches a GitHub repo URL or "owner/repo" pattern.
 
72
  """
73
  if "github.com" in url_or_id:
74
  return True
@@ -76,10 +75,11 @@ def is_github_repo(url_or_id: str) -> bool:
76
 
77
 
78
  def check_repomix_installed() -> bool:
79
- """Check whether `repomix` is available on PATH.
 
80
 
81
- Returns:
82
- bool: True if `repomix --version` executes successfully, else False.
83
  """
84
  try:
85
  result = subprocess.run(
@@ -97,22 +97,18 @@ def run_repomix(
97
  repo_url_or_id: str,
98
  progress: gr.Progress = gr.Progress(track_tqdm=True),
99
  ) -> Tuple[str, Optional[str]]:
100
- """Run Repomix on a GitHub repository and return its combined Markdown corpus.
101
-
102
- Args:
103
- repo_url_or_id: GitHub repo as full URL or "owner/repo".
104
- progress: Gradio progress object (auto-provided in UI; ignored by MCP).
105
-
106
- Returns:
107
- (content, output_path):
108
- content (str): Combined Markdown content, or an error message that
109
- starts with "Error".
110
- output_path (Optional[str]): Path to the temp file created by Repomix,
111
- or None if not applicable.
112
-
113
- Notes:
114
- - Requires `repomix` installed in the environment.
115
- - If `repo_url_or_id` is "owner/repo" it is expanded to a full GitHub URL.
116
  """
117
  progress(0, desc="Starting Repomix processing...")
118
  try:
@@ -128,15 +124,11 @@ def run_repomix(
128
  progress(0.2, desc=f"Running Repomix on {repo_url}...")
129
  cmd = [
130
  "repomix",
131
- "--remote",
132
- repo_url,
133
- "--output",
134
- output_file_path,
135
- "--style",
136
- "markdown",
137
  "--compress",
138
  ]
139
-
140
  process = subprocess.run(
141
  cmd, capture_output=True, text=True, check=False, encoding="utf-8"
142
  )
@@ -177,22 +169,17 @@ def scrape_and_convert_website(
177
  depth: int,
178
  progress: gr.Progress = gr.Progress(track_tqdm=True),
179
  ) -> Tuple[str, str]:
180
- """Recursively scrape a website and convert each visited page to Markdown.
181
-
182
- Args:
183
- url: Starting URL to scrape.
184
- depth: Crawl depth (0 = only the main page, 1..3 will follow internal links).
185
- progress: Gradio progress object (auto-provided in UI; ignored by MCP).
186
-
187
- Returns:
188
- (combined_markdown, temp_markdown_filepath):
189
- combined_markdown (str): All pages concatenated with headings.
190
- temp_markdown_filepath (str): Path to a temp file containing the same
191
- combined Markdown content.
192
-
193
- Notes:
194
- - Only internal links are visited.
195
- - Link extraction uses `LinkExtractor` with `LinkType.INTERNAL`.
196
  """
197
  progress(0, desc=f"Starting web scrape for {url}...")
198
  visited_urls = set()
@@ -212,7 +199,10 @@ def scrape_and_convert_website(
212
  progress_val = (
213
  link_index / total_links_estimate if total_links_estimate > 0 else 0
214
  )
215
- progress(progress_val, desc=f"Scraping: {current_url} (Depth used: {depth - current_depth})")
 
 
 
216
  html_content = Scraper.fetch_html(current_url)
217
  except Exception as e:
218
  return f"Error fetching {current_url}: {str(e)}\n"
@@ -254,28 +244,30 @@ def scrape_and_convert_website(
254
 
255
 
256
  def convert_to_json(markdown_content: str, source_url_or_id: str) -> str:
257
- """Wrap Markdown text in a simple JSON object with 'source' and 'content' keys.
258
-
259
- Args:
260
- markdown_content: The Markdown body to embed.
261
- source_url_or_id: The original input string identifying the source.
262
-
263
- Returns:
264
- str: Pretty-printed JSON string.
 
265
  """
266
  data = {"source": source_url_or_id, "content": markdown_content}
267
  return json.dumps(data, indent=2)
268
 
269
 
270
  def convert_to_csv(markdown_content: str, source_url_or_id: str) -> str:
271
- """Write a simple CSV file with columns ['source','content'].
272
-
273
- Args:
274
- markdown_content: The Markdown body to store in CSV.
275
- source_url_or_id: The original input string identifying the source.
276
-
277
- Returns:
278
- str: Path to the created CSV file.
 
279
  """
280
  output = tempfile.NamedTemporaryFile(
281
  mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
@@ -292,19 +284,21 @@ def save_output_to_file(
292
  output_format: str,
293
  source_url_or_id: str,
294
  ) -> str:
295
- """Persist processed content in the selected output format and return a filepath.
296
-
297
- Args:
298
- content: The raw Markdown to save or convert.
299
- output_format: One of {"Markdown","JSON","CSV","Text","PDF"}.
300
- source_url_or_id: The original input string identifying the source.
301
-
302
- Returns:
303
- str: Path to a temporary file holding the artifact (may be a PDF, CSV, etc.).
304
-
305
- Notes:
306
- - PDF uses `markdown_pdf` and writes directly to a temporary `.pdf` file.
307
- - CSV uses a 2-column schema: ['source','content'].
 
 
308
  """
309
  processed_content = content # default for Markdown/Text
310
 
@@ -316,7 +310,6 @@ def save_output_to_file(
316
  elif output_format == "Text":
317
  suffix = ".txt"
318
  elif output_format == "PDF":
319
- # Write PDF directly and return the path.
320
  try:
321
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
322
  pdf_output_path = tmp_pdf.name
@@ -327,7 +320,6 @@ def save_output_to_file(
327
  # Fallback: persist as Markdown with .pdf.md suffix.
328
  print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
329
  suffix = ".pdf.md"
330
- # For the fallback, continue below and write Markdown.
331
  else:
332
  suffix = ".md"
333
 
@@ -339,7 +331,7 @@ def save_output_to_file(
339
 
340
 
341
  # ----------------------------------------------------------
342
- # Main tool function (this is what MCP exposes via SSE)
343
  # ----------------------------------------------------------
344
 
345
  def process_input_updated(
@@ -349,39 +341,38 @@ def process_input_updated(
349
  output_format_selection: str,
350
  progress: gr.Progress = gr.Progress(track_tqdm=True),
351
  ) -> Tuple[str, str, Optional[str]]:
352
- """Scrape or repo-dump content and export it as Markdown/JSON/CSV/Text/PDF.
353
-
354
- This function is exposed to MCP clients at the Spaces SSE endpoint.
355
-
356
- Args:
357
- url_or_id:
358
- For webpages: a full URL (e.g. "https://example.com").
359
- For GitHub: either "owner/repo" or a full GitHub URL.
360
- source_type:
361
- One of {"Webpage", "GitHub Repository"}.
362
- Selects whether to crawl HTML pages or run Repomix.
363
- depth:
364
- Crawl depth for webpages (0–3). Ignored when source_type="GitHub Repository".
365
- output_format_selection:
366
- One of {"Markdown","JSON","CSV","Text","PDF"} specifying the output format.
367
- progress:
368
- Gradio progress tracker (in UI). MCP callers can omit it.
369
-
370
- Returns:
371
- (status, preview, file_path):
372
- status (str): Human-readable status message.
373
- preview (str): Short preview or full text (Markdown/JSON/Text). For CSV/PDF,
374
- a helpful note is returned with the generated filename.
375
- file_path (Optional[str]): Path to the generated file (for download),
376
- or None if an error occurred.
377
-
378
- Behavior:
379
- - For "GitHub Repository", requires `repomix` to be installed on PATH.
380
- - For "Webpage", recursively scrapes internal links up to `depth`.
381
- - Converts to the requested format and saves a temp file for download.
382
-
383
- Errors:
384
- - Any internal exception is caught and returned as a user-facing status + preview.
385
  """
386
  progress(0, desc="Initializing...")
387
  raw_content = ""
@@ -422,7 +413,7 @@ def process_input_updated(
422
  if output_format_selection == "JSON":
423
  preview_content = convert_to_json(raw_content, url_or_id)
424
  elif output_format_selection == "CSV" and output_file_path:
425
- # Read only a few lines for preview
426
  try:
427
  with open(output_file_path, "r", encoding="utf-8") as f_csv:
428
  csv_preview_lines = [next(f_csv) for _ in range(5)]
@@ -435,7 +426,7 @@ def process_input_updated(
435
  elif output_format_selection == "CSV" and not output_file_path:
436
  preview_content = "[CSV file path not available for preview]"
437
  elif output_format_selection == "PDF":
438
- # PDF cannot be previewed as text; provide a helpful note.
439
  preview_content = (
440
  f"[PDF generated. Download to view: "
441
  f"{os.path.basename(output_file_path) if output_file_path else 'file.pdf'}]"
@@ -529,7 +520,7 @@ Markdown, JSON, CSV, Text, PDF.
529
 
530
  **Notes**
531
  - PDF generation requires the `markdown-pdf` library.
532
- - This app is designed for Docker/Hugging Face Spaces.
533
  - MCP SSE endpoint is available at: `/gradio_api/mcp/sse`.
534
  """
535
  )
@@ -541,5 +532,5 @@ Markdown, JSON, CSV, Text, PDF.
541
  )
542
 
543
  if __name__ == "__main__":
544
- # Spaces typically set up their own server. queue() is safe for concurrency.
545
  iface.queue().launch(share=True)
 
2
  """
3
  RAG-Ready Content Scraper — Gradio + MCP (SSE)
4
 
5
+ This app runs on Hugging Face Spaces and exposes an MCP SSE endpoint at:
6
+ /gradio_api/mcp/sse
7
 
8
+ Example MCP configs:
9
 
10
  1) Direct SSE (Cursor, Windsurf, Cline, etc.)
 
11
  {
12
  "mcpServers": {
13
  "gradio": {
 
16
  }
17
  }
18
 
19
+ 2) Experimental stdio via Node:
 
20
  {
21
  "mcpServers": {
22
  "gradio": {
 
55
 
56
 
57
  # -----------------------------
58
+ # Helper utilities
59
  # -----------------------------
60
 
61
  def is_github_repo(url_or_id: str) -> bool:
62
+ """
63
+ Determine whether the string looks like a GitHub repository reference.
64
+
65
+ :param url_or_id: Full GitHub URL containing ``github.com`` or an
66
+ ``owner/repo`` identifier.
67
+ :type url_or_id: str
68
+ :return: ``True`` if it matches a GitHub URL or ``owner/repo`` pattern,
69
+ otherwise ``False``.
70
+ :rtype: bool
71
  """
72
  if "github.com" in url_or_id:
73
  return True
 
75
 
76
 
77
  def check_repomix_installed() -> bool:
78
+ """
79
+ Check if the ``repomix`` CLI is available on PATH.
80
 
81
+ :return: ``True`` if ``repomix --version`` succeeds, else ``False``.
82
+ :rtype: bool
83
  """
84
  try:
85
  result = subprocess.run(
 
97
  repo_url_or_id: str,
98
  progress: gr.Progress = gr.Progress(track_tqdm=True),
99
  ) -> Tuple[str, Optional[str]]:
100
+ """
101
+ Run Repomix on a GitHub repository and return combined Markdown.
102
+
103
+ :param repo_url_or_id: GitHub repo as full URL (``https://github.com/...``)
104
+ or in the form ``owner/repo``.
105
+ :type repo_url_or_id: str
106
+ :param progress: Gradio progress tracker (UI only).
107
+ :type progress: gr.Progress
108
+ :return: A tuple ``(content, output_path)`` where ``content`` is the
109
+ combined Markdown or an error string starting with ``"Error"``, and
110
+ ``output_path`` is the temp file path (or ``None``).
111
+ :rtype: Tuple[str, Optional[str]]
 
 
 
 
112
  """
113
  progress(0, desc="Starting Repomix processing...")
114
  try:
 
124
  progress(0.2, desc=f"Running Repomix on {repo_url}...")
125
  cmd = [
126
  "repomix",
127
+ "--remote", repo_url,
128
+ "--output", output_file_path,
129
+ "--style", "markdown",
 
 
 
130
  "--compress",
131
  ]
 
132
  process = subprocess.run(
133
  cmd, capture_output=True, text=True, check=False, encoding="utf-8"
134
  )
 
169
  depth: int,
170
  progress: gr.Progress = gr.Progress(track_tqdm=True),
171
  ) -> Tuple[str, str]:
172
+ """
173
+ Recursively scrape a website and convert pages to Markdown.
174
+
175
+ :param url: Starting URL to scrape.
176
+ :type url: str
177
+ :param depth: Crawl depth where 0 = only the main page (1..3 follow internal links).
178
+ :type depth: int
179
+ :param progress: Gradio progress tracker (UI only).
180
+ :type progress: gr.Progress
181
+ :return: A tuple ``(combined_markdown, tmp_md_path)``.
182
+ :rtype: Tuple[str, str]
 
 
 
 
 
183
  """
184
  progress(0, desc=f"Starting web scrape for {url}...")
185
  visited_urls = set()
 
199
  progress_val = (
200
  link_index / total_links_estimate if total_links_estimate > 0 else 0
201
  )
202
+ progress(
203
+ progress_val,
204
+ desc=f"Scraping: {current_url} (Depth used: {depth - current_depth})",
205
+ )
206
  html_content = Scraper.fetch_html(current_url)
207
  except Exception as e:
208
  return f"Error fetching {current_url}: {str(e)}\n"
 
244
 
245
 
246
  def convert_to_json(markdown_content: str, source_url_or_id: str) -> str:
247
+ """
248
+ Wrap Markdown text in a JSON object with ``source`` and ``content`` keys.
249
+
250
+ :param markdown_content: The Markdown body to embed.
251
+ :type markdown_content: str
252
+ :param source_url_or_id: Original input string identifying the source.
253
+ :type source_url_or_id: str
254
+ :return: Pretty-printed JSON string.
255
+ :rtype: str
256
  """
257
  data = {"source": source_url_or_id, "content": markdown_content}
258
  return json.dumps(data, indent=2)
259
 
260
 
261
  def convert_to_csv(markdown_content: str, source_url_or_id: str) -> str:
262
+ """
263
+ Persist Markdown as a simple CSV with two columns: ``source``, ``content``.
264
+
265
+ :param markdown_content: The Markdown body to store.
266
+ :type markdown_content: str
267
+ :param source_url_or_id: Original input string identifying the source.
268
+ :type source_url_or_id: str
269
+ :return: Path to the created CSV file.
270
+ :rtype: str
271
  """
272
  output = tempfile.NamedTemporaryFile(
273
  mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
 
284
  output_format: str,
285
  source_url_or_id: str,
286
  ) -> str:
287
+ """
288
+ Save processed content in the selected format and return a file path.
289
+
290
+ :param content: The raw Markdown to save or convert.
291
+ :type content: str
292
+ :param output_format: One of {``"Markdown"``, ``"JSON"``, ``"CSV"``, ``"Text"``, ``"PDF"``}.
293
+ :type output_format: str
294
+ :param source_url_or_id: Original input string identifying the source.
295
+ :type source_url_or_id: str
296
+ :return: Path to a temporary file holding the artifact.
297
+ :rtype: str
298
+
299
+ .. note::
300
+ PDF uses ``markdown_pdf`` and writes directly to a temporary ``.pdf`` file.
301
+ CSV uses a 2-column schema: ``['source','content']``.
302
  """
303
  processed_content = content # default for Markdown/Text
304
 
 
310
  elif output_format == "Text":
311
  suffix = ".txt"
312
  elif output_format == "PDF":
 
313
  try:
314
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
315
  pdf_output_path = tmp_pdf.name
 
320
  # Fallback: persist as Markdown with .pdf.md suffix.
321
  print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
322
  suffix = ".pdf.md"
 
323
  else:
324
  suffix = ".md"
325
 
 
331
 
332
 
333
  # ----------------------------------------------------------
334
+ # Main tool function (exposed to MCP via SSE)
335
  # ----------------------------------------------------------
336
 
337
  def process_input_updated(
 
341
  output_format_selection: str,
342
  progress: gr.Progress = gr.Progress(track_tqdm=True),
343
  ) -> Tuple[str, str, Optional[str]]:
344
+ """
345
+ Scrape or repo-dump content and export it as Markdown/JSON/CSV/Text/PDF.
346
+
347
+ This function is exposed to MCP clients via the Spaces SSE endpoint
348
+ ``/gradio_api/mcp/sse``.
349
+
350
+ :param url_or_id: For webpages, a full URL (e.g., ``https://example.com``).
351
+ For GitHub, either ``owner/repo`` or a full GitHub URL
352
+ (e.g., ``https://github.com/owner/repo``).
353
+ :type url_or_id: str
354
+ :param source_type: Select the content source. One of
355
+ {``"Webpage"``, ``"GitHub Repository"``}.
356
+ :type source_type: str
357
+ :param depth: Crawl depth for webpages. Integer in the range 0–3 where
358
+ 0 = only the main page. **Ignored** when ``source_type`` is
359
+ ``"GitHub Repository"``.
360
+ :type depth: int
361
+ :param output_format_selection: Desired output format. One of
362
+ {``"Markdown"``, ``"JSON"``, ``"CSV"``, ``"Text"``, ``"PDF"``}.
363
+ :type output_format_selection: str
364
+ :param progress: (UI only) Gradio progress tracker. MCP callers can omit this.
365
+ :type progress: gr.Progress
366
+
367
+ :returns: A 3-tuple:
368
+ - **status** (*str*): Human-readable status line.
369
+ - **preview** (*str*): Text preview (full Markdown/JSON/Text, or a note for CSV/PDF).
370
+ - **file_path** (*Optional[str]*): Path to the generated artifact for download,
371
+ or ``None`` on error.
372
+ :rtype: Tuple[str, str, Optional[str]]
373
+
374
+ :raises Exception: (caught internally) Unexpected processing errors are surfaced
375
+ as a user-facing status with details in the preview.
 
376
  """
377
  progress(0, desc="Initializing...")
378
  raw_content = ""
 
413
  if output_format_selection == "JSON":
414
  preview_content = convert_to_json(raw_content, url_or_id)
415
  elif output_format_selection == "CSV" and output_file_path:
416
+ # Show a small preview of the CSV
417
  try:
418
  with open(output_file_path, "r", encoding="utf-8") as f_csv:
419
  csv_preview_lines = [next(f_csv) for _ in range(5)]
 
426
  elif output_format_selection == "CSV" and not output_file_path:
427
  preview_content = "[CSV file path not available for preview]"
428
  elif output_format_selection == "PDF":
429
+ # Can't render PDF in text preview
430
  preview_content = (
431
  f"[PDF generated. Download to view: "
432
  f"{os.path.basename(output_file_path) if output_file_path else 'file.pdf'}]"
 
520
 
521
  **Notes**
522
  - PDF generation requires the `markdown-pdf` library.
523
+ - Designed for Docker/Hugging Face Spaces.
524
  - MCP SSE endpoint is available at: `/gradio_api/mcp/sse`.
525
  """
526
  )
 
532
  )
533
 
534
  if __name__ == "__main__":
535
+ # Enable queuing for concurrency; Spaces generally manage hosting.
536
  iface.queue().launch(share=True)