CultriX committed on
Commit
315dac2
·
verified ·
1 Parent(s): 23a446a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -114
app.py CHANGED
@@ -2,7 +2,7 @@
2
  """
3
  RAG-Ready Content Scraper — Gradio + MCP (SSE)
4
 
5
- This app runs on Hugging Face Spaces and exposes an MCP SSE endpoint at:
6
  /gradio_api/mcp/sse
7
 
8
  Example MCP configs:
@@ -43,7 +43,10 @@ import json
43
  import re
44
  import subprocess
45
  import tempfile
46
- from typing import Optional, Tuple
 
 
 
47
 
48
  import gradio as gr
49
  import markdown_pdf
@@ -59,28 +62,14 @@ from rag_scraper.utils import URLUtils
59
  # -----------------------------
60
 
61
  def is_github_repo(url_or_id: str) -> bool:
62
- """
63
- Determine whether the string looks like a GitHub repository reference.
64
-
65
- :param url_or_id: Full GitHub URL containing ``github.com`` or an
66
- ``owner/repo`` identifier.
67
- :type url_or_id: str
68
- :return: ``True`` if it matches a GitHub URL or ``owner/repo`` pattern,
69
- otherwise ``False``.
70
- :rtype: bool
71
- """
72
  if "github.com" in url_or_id:
73
  return True
74
  return bool(re.match(r"^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$", url_or_id))
75
 
76
 
77
  def check_repomix_installed() -> bool:
78
- """
79
- Check if the ``repomix`` CLI is available on PATH.
80
-
81
- :return: ``True`` if ``repomix --version`` succeeds, else ``False``.
82
- :rtype: bool
83
- """
84
  try:
85
  result = subprocess.run(
86
  ["repomix", "--version"],
@@ -97,19 +86,7 @@ def run_repomix(
97
  repo_url_or_id: str,
98
  progress: gr.Progress = gr.Progress(track_tqdm=True),
99
  ) -> Tuple[str, Optional[str]]:
100
- """
101
- Run Repomix on a GitHub repository and return combined Markdown.
102
-
103
- :param repo_url_or_id: GitHub repo as full URL (``https://github.com/...``)
104
- or in the form ``owner/repo``.
105
- :type repo_url_or_id: str
106
- :param progress: Gradio progress tracker (UI only).
107
- :type progress: gr.Progress
108
- :return: A tuple ``(content, output_path)`` where ``content`` is the
109
- combined Markdown or an error string starting with ``"Error"``, and
110
- ``output_path`` is the temp file path (or ``None``).
111
- :rtype: Tuple[str, Optional[str]]
112
- """
113
  progress(0, desc="Starting Repomix processing...")
114
  try:
115
  with tempfile.TemporaryDirectory() as temp_dir:
@@ -169,18 +146,7 @@ def scrape_and_convert_website(
169
  depth: int,
170
  progress: gr.Progress = gr.Progress(track_tqdm=True),
171
  ) -> Tuple[str, str]:
172
- """
173
- Recursively scrape a website and convert pages to Markdown.
174
-
175
- :param url: Starting URL to scrape.
176
- :type url: str
177
- :param depth: Crawl depth where 0 = only the main page (1..3 follow internal links).
178
- :type depth: int
179
- :param progress: Gradio progress tracker (UI only).
180
- :type progress: gr.Progress
181
- :return: A tuple ``(combined_markdown, tmp_md_path)``.
182
- :rtype: Tuple[str, str]
183
- """
184
  progress(0, desc=f"Starting web scrape for {url}...")
185
  visited_urls = set()
186
 
@@ -244,31 +210,13 @@ def scrape_and_convert_website(
244
 
245
 
246
  def convert_to_json(markdown_content: str, source_url_or_id: str) -> str:
247
- """
248
- Wrap Markdown text in a JSON object with ``source`` and ``content`` keys.
249
-
250
- :param markdown_content: The Markdown body to embed.
251
- :type markdown_content: str
252
- :param source_url_or_id: Original input string identifying the source.
253
- :type source_url_or_id: str
254
- :return: Pretty-printed JSON string.
255
- :rtype: str
256
- """
257
  data = {"source": source_url_or_id, "content": markdown_content}
258
  return json.dumps(data, indent=2)
259
 
260
 
261
  def convert_to_csv(markdown_content: str, source_url_or_id: str) -> str:
262
- """
263
- Persist Markdown as a simple CSV with two columns: ``source``, ``content``.
264
-
265
- :param markdown_content: The Markdown body to store.
266
- :type markdown_content: str
267
- :param source_url_or_id: Original input string identifying the source.
268
- :type source_url_or_id: str
269
- :return: Path to the created CSV file.
270
- :rtype: str
271
- """
272
  output = tempfile.NamedTemporaryFile(
273
  mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
274
  )
@@ -284,22 +232,7 @@ def save_output_to_file(
284
  output_format: str,
285
  source_url_or_id: str,
286
  ) -> str:
287
- """
288
- Save processed content in the selected format and return a file path.
289
-
290
- :param content: The raw Markdown to save or convert.
291
- :type content: str
292
- :param output_format: One of {``"Markdown"``, ``"JSON"``, ``"CSV"``, ``"Text"``, ``"PDF"``}.
293
- :type output_format: str
294
- :param source_url_or_id: Original input string identifying the source.
295
- :type source_url_or_id: str
296
- :return: Path to a temporary file holding the artifact.
297
- :rtype: str
298
-
299
- .. note::
300
- PDF uses ``markdown_pdf`` and writes directly to a temporary ``.pdf`` file.
301
- CSV uses a 2-column schema: ``['source','content']``.
302
- """
303
  processed_content = content # default for Markdown/Text
304
 
305
  if output_format == "JSON":
@@ -335,44 +268,29 @@ def save_output_to_file(
335
  # ----------------------------------------------------------
336
 
337
  def process_input_updated(
338
- url_or_id: str,
339
- source_type: str,
340
- depth: int,
341
- output_format_selection: str,
 
 
 
 
 
 
 
 
 
 
 
 
342
  progress: gr.Progress = gr.Progress(track_tqdm=True),
343
  ) -> Tuple[str, str, Optional[str]]:
344
  """
345
- Scrape or repo-dump content and export it as Markdown/JSON/CSV/Text/PDF.
346
-
347
- This function is exposed to MCP clients via the Spaces SSE endpoint
348
- ``/gradio_api/mcp/sse``.
349
-
350
- :param url_or_id: For webpages, a full URL (e.g., ``https://example.com``).
351
- For GitHub, either ``owner/repo`` or a full GitHub URL
352
- (e.g., ``https://github.com/owner/repo``).
353
- :type url_or_id: str
354
- :param source_type: Select the content source. One of
355
- {``"Webpage"``, ``"GitHub Repository"``}.
356
- :type source_type: str
357
- :param depth: Crawl depth for webpages. Integer in the range 0–3 where
358
- 0 = only the main page. **Ignored** when ``source_type`` is
359
- ``"GitHub Repository"``.
360
- :type depth: int
361
- :param output_format_selection: Desired output format. One of
362
- {``"Markdown"``, ``"JSON"``, ``"CSV"``, ``"Text"``, ``"PDF"``}.
363
- :type output_format_selection: str
364
- :param progress: (UI only) Gradio progress tracker. MCP callers can omit this.
365
- :type progress: gr.Progress
366
-
367
- :returns: A 3-tuple:
368
- - **status** (*str*): Human-readable status line.
369
- - **preview** (*str*): Text preview (full Markdown/JSON/Text, or a note for CSV/PDF).
370
- - **file_path** (*Optional[str]*): Path to the generated artifact for download,
371
- or ``None`` on error.
372
- :rtype: Tuple[str, str, Optional[str]]
373
-
374
- :raises Exception: (caught internally) Unexpected processing errors are surfaced
375
- as a user-facing status with details in the preview.
376
  """
377
  progress(0, desc="Initializing...")
378
  raw_content = ""
 
2
  """
3
  RAG-Ready Content Scraper — Gradio + MCP (SSE)
4
 
5
+ Exposes an MCP SSE endpoint on Hugging Face Spaces at:
6
  /gradio_api/mcp/sse
7
 
8
  Example MCP configs:
 
43
  import re
44
  import subprocess
45
  import tempfile
46
+ from typing import Optional, Tuple, Literal
47
+
48
+ # NEW: use Annotated+Doc so MCP can show per-parameter descriptions
49
+ from typing_extensions import Annotated, Doc
50
 
51
  import gradio as gr
52
  import markdown_pdf
 
62
  # -----------------------------
63
 
64
  def is_github_repo(url_or_id: str) -> bool:
65
+ """Return True if the string looks like a GitHub repository reference."""
 
 
 
 
 
 
 
 
 
66
  if "github.com" in url_or_id:
67
  return True
68
  return bool(re.match(r"^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$", url_or_id))
69
 
70
 
71
  def check_repomix_installed() -> bool:
72
+ """Check if the `repomix` CLI is available on PATH."""
 
 
 
 
 
73
  try:
74
  result = subprocess.run(
75
  ["repomix", "--version"],
 
86
  repo_url_or_id: str,
87
  progress: gr.Progress = gr.Progress(track_tqdm=True),
88
  ) -> Tuple[str, Optional[str]]:
89
+ """Run Repomix on a GitHub repository and return combined Markdown."""
 
 
 
 
 
 
 
 
 
 
 
 
90
  progress(0, desc="Starting Repomix processing...")
91
  try:
92
  with tempfile.TemporaryDirectory() as temp_dir:
 
146
  depth: int,
147
  progress: gr.Progress = gr.Progress(track_tqdm=True),
148
  ) -> Tuple[str, str]:
149
+ """Recursively scrape a website and convert pages to Markdown."""
 
 
 
 
 
 
 
 
 
 
 
150
  progress(0, desc=f"Starting web scrape for {url}...")
151
  visited_urls = set()
152
 
 
210
 
211
 
212
  def convert_to_json(markdown_content: str, source_url_or_id: str) -> str:
213
+ """Wrap Markdown text in a JSON object with `source` and `content` keys."""
 
 
 
 
 
 
 
 
 
214
  data = {"source": source_url_or_id, "content": markdown_content}
215
  return json.dumps(data, indent=2)
216
 
217
 
218
  def convert_to_csv(markdown_content: str, source_url_or_id: str) -> str:
219
+ """Persist Markdown as a simple CSV with two columns: `source`, `content`."""
 
 
 
 
 
 
 
 
 
220
  output = tempfile.NamedTemporaryFile(
221
  mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
222
  )
 
232
  output_format: str,
233
  source_url_or_id: str,
234
  ) -> str:
235
+ """Save processed content in the selected format and return a file path."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  processed_content = content # default for Markdown/Text
237
 
238
  if output_format == "JSON":
 
268
  # ----------------------------------------------------------
269
 
270
  def process_input_updated(
271
+ url_or_id: Annotated[
272
+ str,
273
+ Doc("For webpages, a full URL (e.g., https://example.com). For GitHub, either owner/repo or a full GitHub URL (https://github.com/owner/repo)."),
274
+ ],
275
+ source_type: Annotated[
276
+ Literal["Webpage", "GitHub Repository"],
277
+ Doc('Select the content source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.'),
278
+ ],
279
+ depth: Annotated[
280
+ int,
281
+ Doc("Crawl depth for webpages (0–3). 0 = only the main page. Ignored for GitHub repositories."),
282
+ ],
283
+ output_format_selection: Annotated[
284
+ Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
285
+ Doc("Desired output format for the processed content."),
286
+ ],
287
  progress: gr.Progress = gr.Progress(track_tqdm=True),
288
  ) -> Tuple[str, str, Optional[str]]:
289
  """
290
+ Scrape a webpage (with depth) or dump a GitHub repo (Repomix), then export as Markdown/JSON/CSV/Text/PDF.
291
+
292
+ Returns:
293
+ Tuple[str, str, Optional[str]]: (status, preview, file_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  """
295
  progress(0, desc="Initializing...")
296
  raw_content = ""