CultriX committed on
Commit
20dc7c9
·
verified ·
1 Parent(s): 88150f9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -32
app.py CHANGED
@@ -116,74 +116,105 @@ def save_output_to_file(content: str, fmt: str, source: str) -> str:
116
  return tmp.name
117
 
118
  # ---------- MCP-exposed tool ----------
 
 
 
 
 
119
def process_input_updated(
    url_or_id: Annotated[
        str,
        Doc("For webpages, a full URL (e.g., https://example.com). For GitHub, either owner/repo or the full GitHub URL."),
    ],
    source_type: Annotated[
        Literal["Webpage", "GitHub Repository"],
        Doc('Choose the source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.'),
    ],
    depth: Annotated[
        int,
        Doc("Crawl depth for webpages (0–3). 0 = only the main page. Ignored for GitHub."),
    ],
    output_format_selection: Annotated[
        Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
        Doc("Desired output format for the processed content."),
    ],
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, str, Optional[str]]:
    """
    Scrape a webpage (with configurable depth) or dump a GitHub repo (Repomix),
    then export as Markdown/JSON/CSV/Text/PDF.

    Returns:
        (status, preview, file_path):
            status    -- human-readable status line.
            preview   -- text preview of the result (full text for
                         Markdown/JSON/Text, a short note for CSV/PDF).
            file_path -- path to the generated artifact, or ``None`` on error.
    """
    progress(0, desc="Initializing…")
    raw = ""
    out_path: Optional[str] = None

    if source_type == "GitHub Repository":
        if not check_repomix_installed():
            return "Repomix is not installed or not accessible.", "", None
        raw, _ = run_repomix(url_or_id, progress=progress)
        # Helpers signal failure via an "Error..." prefix rather than raising.
        if raw.startswith("Error"):
            return raw, "", None
    elif source_type == "Webpage":
        raw, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
        if raw.startswith("Error"):
            return raw, "", None
    else:
        return "Invalid source type selected.", "", None

    try:
        progress(0.9, desc=f"Converting to {output_format_selection}…")
        out_path = save_output_to_file(raw, output_format_selection, url_or_id)

        preview = raw
        if output_format_selection == "JSON":
            preview = convert_to_json(raw, url_or_id)
        elif output_format_selection == "CSV":
            # Preview at most the first 5 lines. islice stops cleanly when the
            # file is shorter, so no StopIteration handling / re-open is needed.
            from itertools import islice
            try:
                with open(out_path, "r", encoding="utf-8") as f:
                    preview = "".join(islice(f, 5)) or "[CSV content is empty or very short]"
            except Exception as e:
                preview = f"[Error reading CSV for preview: {e}]"
        elif output_format_selection == "PDF":
            # PDF bytes are not text-previewable; point at the download instead.
            from os.path import basename
            preview = f"[PDF generated. Download to view: {basename(out_path) if out_path else 'file.pdf'}]"

        progress(1, desc="Done.")
        return f"Successfully processed: {url_or_id}", preview, out_path

    except Exception as e:
        # Scraping succeeded but conversion/saving failed: surface the raw text.
        return f"Error during conversion: {e}", raw, None
 
187
 
188
  # ---------- UI ----------
189
  with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as iface:
 
116
  return tmp.name
117
 
118
  # ---------- MCP-exposed tool ----------
119
from typing import Optional, Tuple, Literal
from itertools import islice

from typing_extensions import Annotated, Doc
import os
import gradio as gr


def process_input_updated(
    url_or_id: Annotated[
        str,
        Doc("For webpages: full URL (e.g. https://example.com). For GitHub: owner/repo or a full GitHub URL (https://github.com/owner/repo)."),
    ],
    source_type: Annotated[
        Literal["Webpage", "GitHub Repository"],
        Doc('Choose source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.'),
    ],
    depth: Annotated[
        int,
        Doc("Crawl depth for webpages (0–3). 0 = main page only. Ignored for GitHub repositories."),
    ],
    output_format_selection: Annotated[
        Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
        Doc("Output format for the processed content."),
    ],
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, str, Optional[str]]:
    """
    Scrape a webpage (with depth) or dump a GitHub repo (Repomix), then export
    as Markdown/JSON/CSV/Text/PDF.

    Parameters
    ----------
    url_or_id : str
        For webpages: full URL (e.g. ``https://example.com``).
        For GitHub: either ``owner/repo`` or a full GitHub URL
        (e.g. ``https://github.com/owner/repo``).
    source_type : {"Webpage", "GitHub Repository"}
        Choose the content source. Use **Webpage** to crawl HTML; use
        **GitHub Repository** to run Repomix.
    depth : int
        Crawl depth for webpages in the range 0–3 where 0 = only the main page.
        Ignored when ``source_type`` is ``"GitHub Repository"``.
    output_format_selection : {"Markdown", "JSON", "CSV", "Text", "PDF"}
        Desired output format for the processed content.
    progress : gr.Progress, optional
        (UI only) Gradio progress tracker. MCP callers can omit this.

    Returns
    -------
    (status, preview, file_path) : tuple[str, str, Optional[str]]
        - **status**: Human-readable status line.
        - **preview**: Text preview (full Markdown/JSON/Text, or a short note for CSV/PDF).
        - **file_path**: Path to the generated artifact, or ``None`` on error.
    """
    progress(0, desc="Initializing…")
    raw_content = ""
    output_file_path: Optional[str] = None

    if source_type == "GitHub Repository":
        if not check_repomix_installed():
            return "Repomix is not installed or not accessible. Please install it.", "", None
        raw_content, _ = run_repomix(url_or_id, progress=progress)
        # Helpers report failure via an "Error..." prefix rather than raising.
        if raw_content.startswith("Error"):
            return raw_content, "", None
    elif source_type == "Webpage":
        raw_content, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
        if raw_content.startswith("Error"):
            return raw_content, "", None
    else:
        return "Invalid source type selected.", "", None

    try:
        progress(0.9, desc=f"Converting to {output_format_selection}…")
        output_file_path = save_output_to_file(raw_content, output_format_selection, url_or_id)

        preview_content = raw_content
        if output_format_selection == "JSON":
            preview_content = convert_to_json(raw_content, url_or_id)
        elif output_format_selection == "CSV":
            if output_file_path:
                # Preview at most the first 5 lines. islice stops cleanly on
                # short files, so no StopIteration fallback / re-open is needed.
                try:
                    with open(output_file_path, "r", encoding="utf-8") as f_csv:
                        preview_content = (
                            "".join(islice(f_csv, 5)) or "[CSV content is empty or very short]"
                        )
                except Exception as e_csv_preview:
                    preview_content = f"[Error reading CSV for preview: {e_csv_preview}]"
            else:
                preview_content = "[CSV file path not available for preview]"
        elif output_format_selection == "PDF":
            # PDF bytes are not text-previewable; point at the download instead.
            preview_content = (
                f"[PDF generated. Download to view: "
                f"{os.path.basename(output_file_path) if output_file_path else 'file.pdf'}]"
            )

        progress(1, desc="Processing complete.")
        return f"Successfully processed: {url_or_id}", preview_content, output_file_path

    except Exception as e:
        # Scraping succeeded but conversion/saving failed: surface the raw text.
        return f"Error during file conversion/saving: {e}", raw_content, None
217
+
218
 
219
  # ---------- UI ----------
220
  with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as iface: