Files changed (4)
  1. Dockerfile +27 -34
  2. README.md +0 -3
  3. app.py +226 -291
  4. requirements.txt +6 -71
Dockerfile CHANGED
@@ -1,55 +1,48 @@
-# Pin to Debian 12 so wkhtmltox bookworm package exists
-FROM python:3.10-bookworm
-
-ENV DEBIAN_FRONTEND=noninteractive \
-    PYTHONDONTWRITEBYTECODE=1 \
-    PYTHONUNBUFFERED=1
+# Use an official Python runtime as a parent image
+FROM python:3.10-slim
 
+# Set the working directory in the container
 WORKDIR /app
 
-# OS deps + fonts + X libs required by wkhtmltopdf
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    ca-certificates curl gnupg git xz-utils \
-    fontconfig fonts-dejavu-core \
-    libfreetype6 libjpeg62-turbo libpng16-16 \
-    libx11-6 libxext6 libxrender1 libxcb1 \
+# Install system dependencies for Node.js installation, Git, and wkhtmltopdf (for PDF generation)
+RUN apt-get update && apt-get install -y \
+    curl \
+    gnupg \
+    git \
+    wkhtmltopdf \
     && rm -rf /var/lib/apt/lists/*
 
-# Install wkhtmltopdf (bookworm build)
-ARG WKHTML_VER=0.12.6.1-3
-RUN curl -fsSL -o /tmp/wkhtml.deb \
-    "https://github.com/wkhtmltopdf/packaging/releases/download/${WKHTML_VER}/wkhtmltox_${WKHTML_VER}.bookworm_amd64.deb" \
-    && apt-get update \
-    && apt-get install -y --no-install-recommends /tmp/wkhtml.deb \
-    && rm -f /tmp/wkhtml.deb \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN wkhtmltopdf --version
-
-# Node.js LTS (for repomix)
+# Add Node.js LTS repository and install Node.js and npm
 RUN curl -fsSL https://deb.nodesource.com/setup_lts.x | bash - \
-    && apt-get update && apt-get install -y --no-install-recommends nodejs \
-    && rm -rf /var/lib/apt/lists/*
+    && apt-get install -y nodejs
 
-# repomix
+# Install repomix globally using npm
 RUN npm install -g repomix
 
-# Poetry
+# Install Poetry
 RUN curl -sSL https://install.python-poetry.org | python3 -
+
+# Add Poetry to PATH
 ENV PATH="/root/.local/bin:$PATH"
+
+# Configure Poetry to not create virtual environments
 RUN poetry config virtualenvs.create false
 
-# deps first for better layer caching
+# Copy poetry.lock and pyproject.toml
 COPY poetry.lock pyproject.toml /app/
+
+# Install project dependencies using Poetry
 RUN poetry install --no-root --no-interaction --no-ansi
-RUN pip install gradio[mcp]
 
-# app
+# Copy the rest of the application code into the container
 COPY . .
 
+# Make port 7860 available to the world outside this container
 EXPOSE 7860
-ENV GRADIO_SERVER_NAME=0.0.0.0 \
-    GRADIO_SERVER_PORT=7860 \
-    GRADIO_MCP_SERVER=True
+
+# Define environment variable for Gradio server
+ENV GRADIO_SERVER_NAME="0.0.0.0"
+ENV GRADIO_SERVER_PORT="7860"
+
+# Run app.py when the container launches
 CMD ["python", "app.py"]
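
Note the PDF toolchain change: the pinned bookworm `.deb` and its X/font libraries are gone, and the slim image now takes whatever `wkhtmltopdf` Debian ships, with apt resolving its dependencies. The old image verified the binary with `RUN wkhtmltopdf --version`; the new one does not, so a runtime check in the spirit of `check_repomix_installed` from app.py may be worth keeping around. A minimal sketch (the function name is illustrative, not part of this PR):

```python
import subprocess

def check_wkhtmltopdf_installed() -> bool:
    """Return True if wkhtmltopdf is on PATH, mirroring check_repomix_installed."""
    try:
        result = subprocess.run(["wkhtmltopdf", "--version"],
                                capture_output=True, text=True, check=False)
        return result.returncode == 0
    except Exception:
        return False
```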
README.md CHANGED
@@ -8,9 +8,6 @@ app_file: app.py
 pinned: false
 license: mit
 short_description: Scrape web/GitHub for RAG-ready datasets.
-tags:
-  - anycoder
-sdk_version: 6.0.2
 ---
 
 # RAG-Ready Content Scraper
app.py CHANGED
@@ -1,295 +1,239 @@
-# app.py
 from __future__ import annotations
-
 import os
-import csv
-import json
-import re
+os.environ['HF_HOME'] = '/tmp/hf_cache'
+os.makedirs(os.environ['HF_HOME'], exist_ok=True) # Ensure the directory exists
+import gradio as gr
 import subprocess
+import os
+import re
 import tempfile
-from typing import Optional, Tuple, Literal
-
-import gradio as gr
-import markdown_pdf
-from typing_extensions import Annotated, Doc
-
-from pydantic import BaseModel, Field, conint
-
+import json
+import csv
+# Removed: from typing import Iterable # Added for Theme
 from rag_scraper.scraper import Scraper
 from rag_scraper.converter import Converter
 from rag_scraper.link_extractor import LinkExtractor, LinkType
 from rag_scraper.utils import URLUtils
-
-# -----------------------------
-# Environment (HF cache dir)
-# -----------------------------
-os.environ["HF_HOME"] = "/tmp/hf_cache"
-os.makedirs(os.environ["HF_HOME"], exist_ok=True)
-
-
-# -----------------------------
-# Helper utilities
-# -----------------------------
-def check_repomix_installed() -> bool:
-    """Return True if `repomix` is available on PATH."""
+# Removed: from gradio.themes.base import Base # Added for Theme
+# Removed: from gradio.themes.utils import colors, fonts, sizes # Added for Theme
+import markdown_pdf # Added for PDF conversion
+
+# --- Custom Theme Definition --- (REMOVED Seafoam class and instance)
+
+def is_github_repo(url_or_id):
+    """Check if the input is a GitHub repository URL or ID."""
+    if "github.com" in url_or_id:
+        return True
+    if re.match(r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$', url_or_id):
+        return True
+    return False
+
+def check_repomix_installed():
+    """Check if Repomix is installed."""
     try:
-        r = subprocess.run(
-            ["repomix", "--version"],
-            capture_output=True,
-            text=True,
-            check=False,
-        )
-        return r.returncode == 0
+        result = subprocess.run(["repomix", "--version"],
+                                capture_output=True, text=True, check=False)
+        return result.returncode == 0
     except Exception:
         return False
 
-
-def run_repomix(
-    repo_url_or_id: str,
-    progress: gr.Progress = gr.Progress(track_tqdm=True),
-) -> Tuple[str, Optional[str]]:
-    """Run Repomix on a GitHub repo and return combined Markdown (or an Error string)."""
-    progress(0, desc="Starting Repomix…")
+def run_repomix(repo_url_or_id, progress=gr.Progress(track_tqdm=True)):
+    """Run Repomix on the GitHub repository and return the content."""
+    progress(0, desc="Starting Repomix processing...")
     try:
-        with tempfile.TemporaryDirectory() as td:
-            out_path = os.path.join(td, "repomix-output.md")
-            repo_url = (
-                f"https://github.com/{repo_url_or_id}"
-                if ("/" in repo_url_or_id and not repo_url_or_id.startswith("http"))
-                else repo_url_or_id
-            )
+        with tempfile.TemporaryDirectory() as temp_dir:
+            output_file_name = "repomix-output.md"
+            output_file_path = os.path.join(temp_dir, output_file_name)
+
+            if '/' in repo_url_or_id and not repo_url_or_id.startswith('http'):
+                repo_url = f"https://github.com/{repo_url_or_id}"
+            else:
+                repo_url = repo_url_or_id
+
+            progress(0.2, desc=f"Running Repomix on {repo_url}...")
             cmd = [
                 "repomix",
-                "--remote",
-                repo_url,
-                "--output",
-                out_path,
-                "--style",
-                "markdown",
-                "--compress",
+                "--remote", repo_url,
+                "--output", output_file_path,
+                "--style", "markdown",
+                "--compress"
             ]
-            p = subprocess.run(
-                cmd, capture_output=True, text=True, check=False, encoding="utf-8"
-            )
-            progress(0.8, desc="Repomix done.")
-            if p.returncode != 0:
-                err = (
-                    f"Return Code: {p.returncode}\nStderr: {p.stderr}\nStdout: {p.stdout}"
-                )
-                return f"Error running Repomix:\n{err}", None
-            if os.path.exists(out_path):
-                with open(out_path, "r", encoding="utf-8") as f:
-                    return f.read(), out_path
-            return "Error: Repomix did not produce an output file.", None
+
+            process = subprocess.run(cmd, capture_output=True, text=True, check=False, encoding='utf-8')
+            progress(0.8, desc="Repomix command executed.")
+
+            if process.returncode != 0:
+                error_details = f"Return Code: {process.returncode}\nStderr: {process.stderr}\nStdout: {process.stdout}"
+                return f"Error running Repomix:\n{error_details}", None
+
+            if os.path.exists(output_file_path):
+                with open(output_file_path, 'r', encoding='utf-8') as f:
+                    content = f.read()
+                progress(1, desc="Repomix output processed.")
+                return content, output_file_path
+            else:
+                error_details = f"Return Code: {process.returncode}\nStderr: {process.stderr}\nStdout: {process.stdout}"
+                return f"Error: Repomix did not generate an output file at '{output_file_path}'.\nRepomix Output:\n{error_details}", None
+
     except Exception as e:
-        progress(1, desc="Error")
-        return f"Error processing GitHub repository: {e}", None
-
+        progress(1, desc="Error during Repomix processing.")
+        return f"Error processing GitHub repository: {str(e)}", None
 
-def scrape_and_convert_website(
-    url: str,
-    depth: int,
-    progress: gr.Progress = gr.Progress(track_tqdm=True),
-) -> Tuple[str, str]:
-    """Recursively scrape a website and convert visited pages to Markdown."""
-    progress(0, desc=f"Scraping {url}…")
-    visited = set()
+def scrape_and_convert_website(url, depth, progress=gr.Progress(track_tqdm=True)):
+    """Fetch HTML, extract links, convert to Markdown."""
+    progress(0, desc=f"Starting web scrape for {url}...")
+    visited_urls = set()
+    all_markdown_content = ""
 
-    def rec(u: str, d: int, n: int = 1, i: int = 0) -> str:
-        if u in visited or d < 0:
+    def recursive_scrape(current_url, current_depth, total_links_estimate=1, link_index=0):
+        if current_url in visited_urls or current_depth < 0:
             return ""
-        visited.add(u)
+
+        visited_urls.add(current_url)
+
         try:
-            progress(i / n if n > 0 else 0, desc=f"Scraping: {u}")
-            html = Scraper.fetch_html(u)
+            progress_val = link_index / total_links_estimate if total_links_estimate > 0 else 0
+            progress(progress_val, desc=f"Scraping: {current_url} (Depth: {depth - current_depth})")
+            html_content = Scraper.fetch_html(current_url)
         except Exception as e:
-            return f"Error fetching {u}: {e}\n"
-        md = (
-            f"## Extracted from: {u}\n\n"
-            + Converter.html_to_markdown(
-                html=html, base_url=u, parser_features="html.parser", ignore_links=True
-            )
-            + "\n\n"
+            return f"Error fetching {current_url}: {str(e)}\n"
+
+        markdown_content = f"## Extracted from: {current_url}\n\n"
+        markdown_content += Converter.html_to_markdown(
+            html=html_content,
+            base_url=current_url,
+            parser_features='html.parser',
+            ignore_links=True
         )
-        if d > 0:
+
+        page_content = markdown_content + "\n\n"
+
+        if current_depth > 0:
             try:
-                links = LinkExtractor.scrape_url(u, link_type=LinkType.INTERNAL)
-                valid = [
-                    l
-                    for l in links
-                    if URLUtils.is_internal(l, u) and l not in visited
+                links = LinkExtractor.scrape_url(current_url, link_type=LinkType.INTERNAL)
+                valid_links = [
+                    link for link in links
+                    if URLUtils.is_internal(link, current_url) and link not in visited_urls
                 ]
-                for j, nxt in enumerate(valid):
-                    md += rec(nxt, d - 1, len(valid), j)
+
+                num_links = len(valid_links)
+                for i, link_url in enumerate(valid_links):
+                    page_content += recursive_scrape(link_url, current_depth - 1, num_links, i)
             except Exception as e:
-                md += f"Error extracting links from {u}: {e}\n"
-        return md
-
-    all_md = rec(url, depth)
-    with tempfile.NamedTemporaryFile(
-        mode="w+", delete=False, suffix=".md", encoding="utf-8"
-    ) as tmp:
-        tmp.write(all_md)
-    return all_md, tmp.name
-
-
-def convert_to_json(markdown_content: str, source: str) -> str:
-    """Wrap Markdown in a tiny JSON schema."""
-    return json.dumps({"source": source, "content": markdown_content}, indent=2)
-
-
-def convert_to_csv(markdown_content: str, source: str) -> str:
-    """Write a simple 2-column CSV and return its path."""
-    f = tempfile.NamedTemporaryFile(
-        mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
-    )
-    w = csv.writer(f)
-    w.writerow(["source", "content"])
-    w.writerow([source, markdown_content])
-    f.close()
-    return f.name
-
-
-def save_output_to_file(content: str, fmt: str, source: str) -> str:
-    """Persist content in the selected format (Markdown/JSON/CSV/Text/PDF) and return file path."""
-    if fmt == "JSON":
-        data = convert_to_json(content, source)
+                page_content += f"Error extracting links from {current_url}: {str(e)}\n"
+        return page_content
+
+    all_markdown_content = recursive_scrape(url, depth)
+    progress(1, desc="Web scraping complete.")
+
+    with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".md", encoding="utf-8") as tmp_file:
+        tmp_file.write(all_markdown_content)
+    return all_markdown_content, tmp_file.name
+
+def convert_to_json(markdown_content, source_url_or_id):
+    data = {"source": source_url_or_id, "content": markdown_content}
+    return json.dumps(data, indent=2)
+
+def convert_to_csv(markdown_content, source_url_or_id):
+    output = tempfile.NamedTemporaryFile(mode='w+', delete=False, newline='', suffix=".csv", encoding="utf-8")
+    writer = csv.writer(output)
+    writer.writerow(["source", "content"])
+    writer.writerow([source_url_or_id, markdown_content])
+    output.close()
+    return output.name
+
+def save_output_to_file(content, output_format, source_url_or_id):
+    """Saves content to a temporary file based on format and returns its path."""
+    processed_content = content # Default for Markdown and Text
+
+    if output_format == "JSON":
         suffix = ".json"
-    elif fmt == "CSV":
-        return convert_to_csv(content, source)
-    elif fmt == "Text":
-        data, suffix = content, ".txt"
-    elif fmt == "PDF":
+        processed_content = convert_to_json(content, source_url_or_id)
+    elif output_format == "CSV":
+        # convert_to_csv returns a path directly
+        return convert_to_csv(content, source_url_or_id)
+    elif output_format == "Text":
+        suffix = ".txt"
+    elif output_format == "PDF":
+        suffix = ".pdf"
+        # PDF conversion happens differently, creates file directly
+        pdf_output_path = ""
         try:
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
-                path = tmp_pdf.name
-            markdown_pdf.MarkdownPdf(toc_level=2).convert_from_string(content, path)
-            return path
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf_file:
+                pdf_output_path = tmp_pdf_file.name
+
+            md_pdf = markdown_pdf.MarkdownPdf(toc_level=2)
+            md_pdf.convert_from_string(content, pdf_output_path)
+            return pdf_output_path
         except Exception as e:
             print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
-            data, suffix = content, ".pdf.md"
-    else:
-        data, suffix = content, ".md"
+            suffix = ".pdf.md"
+            # No processed_content change needed, it's already markdown
+    else: # Default to Markdown
+        suffix = ".md"
 
-    with tempfile.NamedTemporaryFile(
-        mode="w+", delete=False, suffix=suffix, encoding="utf-8"
-    ) as tmp:
-        tmp.write(data)
-        return tmp.name
+    with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=suffix, encoding="utf-8") as tmp_file:
+        tmp_file.write(processed_content)
+        return tmp_file.name
 
-
-# -----------------------------
-# Core UI-bound function
-# -----------------------------
-def process_input_updated(
-    url_or_id: str,
-    source_type: Literal["Webpage", "GitHub Repository"],
-    depth: int,
-    output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
-    progress: gr.Progress = gr.Progress(track_tqdm=True),
-) -> Tuple[str, str, Optional[str]]:
-    """
-    UI function: scrape a webpage (with depth) or dump a GitHub repo (Repomix),
-    then export as Markdown/JSON/CSV/Text/PDF.
-    """
-    progress(0, desc="Initializing…")
-    out_path: Optional[str] = None
+def process_input_updated(url_or_id, source_type, depth, output_format_selection, progress=gr.Progress(track_tqdm=True)):
+    progress(0, desc="Initializing...")
+    raw_content = ""
+    error_message = ""
+    output_file_path = None
 
     if source_type == "GitHub Repository":
         if not check_repomix_installed():
-            return "Repomix is not installed or not accessible.", "", None
-        raw, _ = run_repomix(url_or_id, progress=progress)
-        if raw.startswith("Error"):
-            return raw, "", None
+            error_message = "Repomix is not installed or not accessible. Please ensure it's installed globally."
+            return error_message, None, None
+        raw_content, _ = run_repomix(url_or_id, progress=progress)
+        if "Error" in raw_content:
+            error_message = raw_content
+            raw_content = ""
     elif source_type == "Webpage":
-        raw, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
-        if raw.startswith("Error"):
-            return raw, "", None
+        raw_content, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
+        if "Error" in raw_content:
+            error_message = raw_content
+            raw_content = ""
     else:
-        return "Invalid source type selected.", "", None
+        error_message = "Invalid source type selected."
+        return error_message, None, None
 
-    try:
-        progress(0.9, desc=f"Converting to {output_format_selection}…")
-        out_path = save_output_to_file(raw, output_format_selection, url_or_id)
+    if error_message:
+        return error_message, None, None
 
-        preview = raw
+    try:
+        progress(0.9, desc=f"Converting to {output_format_selection}...")
+        output_file_path = save_output_to_file(raw_content, output_format_selection, url_or_id)
+
+        preview_content = raw_content
         if output_format_selection == "JSON":
-            preview = convert_to_json(raw, url_or_id)
-        elif output_format_selection == "CSV":
+            preview_content = convert_to_json(raw_content, url_or_id)
+        elif output_format_selection == "CSV" and output_file_path:
             try:
-                with open(out_path, "r", encoding="utf-8") as f:
-                    first_lines = [next(f) for _ in range(5)]
-                preview = "".join(first_lines) or "[CSV content is empty or very short]"
+                with open(output_file_path, 'r', encoding='utf-8') as f_csv:
+                    csv_preview_lines = [next(f_csv) for _ in range(5)]
+                preview_content = "".join(csv_preview_lines)
+                if not preview_content: preview_content = "[CSV content is empty or very short]"
             except StopIteration:
-                with open(out_path, "r", encoding="utf-8") as f:
-                    preview = f.read() or "[CSV content is empty]"
-            except Exception as e:
-                preview = f"[Error reading CSV for preview: {e}]"
+                with open(output_file_path, 'r', encoding='utf-8') as f_csv:
+                    preview_content = f_csv.read()
+                if not preview_content: preview_content = "[CSV content is empty]"
+            except Exception as e_csv_preview:
+                preview_content = f"[Error reading CSV for preview: {str(e_csv_preview)}]"
+        elif output_format_selection == "CSV" and not output_file_path:
+            preview_content = "[CSV file path not available for preview]"
         elif output_format_selection == "PDF":
-            from os.path import basename
-
-            preview = (
-                f"[PDF generated. Download to view: "
-                f"{basename(out_path) if out_path else 'file.pdf'}]"
-            )
-
-        progress(1, desc="Done.")
-        return f"Successfully processed: {url_or_id}", preview, out_path
+            preview_content = f"[PDF generated. Download to view: {os.path.basename(output_file_path if output_file_path else 'file.pdf')}]"
+            if "Saving as Markdown instead" in (output_file_path or ""):
+                preview_content = raw_content + f"\n\n[Note: PDF conversion failed, showing Markdown. File saved as .pdf.md]"
 
+        progress(1, desc="Processing complete.")
+        return f"Successfully processed: {url_or_id}", preview_content, output_file_path
     except Exception as e:
-        return f"Error during conversion: {e}", "", None
-
-
-# -----------------------------
-# Pydantic models for MCP tool
-# -----------------------------
-class ProcessArgs(BaseModel):
-    url_or_id: str = Field(
-        ...,
-        description=(
-            "For webpages, a full URL (e.g., https://example.com). "
-            "For GitHub, either owner/repo or a full GitHub URL (https://github.com/owner/repo)."
-        ),
-    )
-    source_type: Literal["Webpage", "GitHub Repository"] = Field(
-        ...,
-        description='Choose the source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.',
-    )
-    depth: conint(ge=0, le=3) = Field(
-        ...,
-        description="Crawl depth for webpages (0–3). 0 = only the main page. Ignored for GitHub.",
-    )
-    output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"] = Field(
-        ...,
-        description="Desired output format for the processed content.",
-    )
-
-
-class ProcessResult(BaseModel):
-    status: str = Field(..., description="Human-readable status line.")
-    preview: str = Field(
-        ...,
-        description="Preview text (Markdown/JSON/Text), or a short note for CSV/PDF.",
-    )
-    file_path: Optional[str] = Field(
-        None, description="Temp file path for the artifact, or null if not created."
-    )
-
-
-def process_input_mcp(args: ProcessArgs) -> ProcessResult:
-    """
-    MCP-friendly tool that accepts/returns Pydantic models (schema carries field descriptions).
-    """
-    status, preview, path = process_input_updated(
-        args.url_or_id, args.source_type, int(args.depth), args.output_format_selection
-    )
-    return ProcessResult(status=status, preview=preview, file_path=path)
-
-
-# -----------------------------
-# Gradio UI
-# -----------------------------
-with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as ui_iface:
+        return f"Error during file conversion/saving: {str(e)}", raw_content, None
+
+-with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as ui_iface:
+with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as iface:
     gr.Markdown("# RAG-Ready Content Scraper")
     gr.Markdown(
         "Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
@@ -299,52 +243,64 @@ with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme")
         with gr.Column(scale=2):
             url_input = gr.Textbox(
                 label="Enter URL or GitHub Repository ID",
-                placeholder="https://example.com or owner/repo",
+                placeholder="e.g., https://example.com OR username/repo"
             )
             source_type_input = gr.Radio(
                 choices=["Webpage", "GitHub Repository"],
                 value="Webpage",
-                label="Select Source Type",
+                label="Select Source Type"
            )
            depth_input = gr.Slider(
-                minimum=0,
-                maximum=3,
-                step=1,
-                value=0,
+                minimum=0, maximum=3, step=1, value=0,
                label="Scraping Depth (for Webpages)",
-                info="0 = only main page. Ignored for GitHub.",
+                info="0: Only main page. Ignored for GitHub repos."
            )
            output_format_input = gr.Dropdown(
-                choices=["Markdown", "JSON", "CSV", "Text", "PDF"],
+                choices=["Markdown", "JSON", "CSV", "Text", "PDF"],
                value="Markdown",
-                label="Select Output Format",
+                label="Select Output Format"
            )
            submit_button = gr.Button("Process Content", variant="primary")
+
        with gr.Column(scale=3):
            status_output = gr.Textbox(label="Status", interactive=False)
-            preview_output = gr.Code(
-                label="Preview Content", language="markdown", interactive=False
-            )
-            file_download_output = gr.File(
-                label="Download Processed File", interactive=False
-            )
+            preview_output = gr.Code(label="Preview Content", language="markdown", interactive=False)
+            file_download_output = gr.File(label="Download Processed File", interactive=False)
 
    gr.Examples(
        examples=[
            ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
-            ["gradio-app/gradio", "GitHub Repository", 0, "Text"],
-            [
-                "https://en.wikipedia.org/wiki/Retrieval-augmented_generation",
-                "Webpage",
-                0,
-                "JSON",
-            ],
+            ["gradio-app/gradio", "GitHub Repository", 0, "Text"],
+            ["https://en.wikipedia.org/wiki/Retrieval-augmented_generation", "Webpage", 0, "JSON"],
        ],
        inputs=[url_input, source_type_input, depth_input, output_format_input],
        outputs=[status_output, preview_output, file_download_output],
        fn=process_input_updated,
-        cache_examples=False,
+        cache_examples=False
    )
+
+    with gr.Accordion("How it Works & More Info", open=False):
+        gr.Markdown(
+            """
+            **Webpage Scraping:**
+            1. Enter a full URL (e.g., `https://example.com`).
+            2. Select "Webpage" as the source type.
+            3. Set the desired scraping depth.
+            4. Choose your output format.
+
+            **GitHub Repository Processing:**
+            1. Enter a GitHub repository URL or ID (e.g., `username/repo`).
+            2. Select "GitHub Repository". (Depth is ignored).
+            3. Choose your output format. Uses **RepoMix**.
+
+            **Output Formats:** Markdown, JSON, CSV, Text, PDF.
+
+            **Note:** PDF generation requires `markdown-pdf` library.
+            This app is designed for Docker/HuggingFace Spaces.
+
+            [View Source Code on HuggingFace Spaces](https://huggingface.co/spaces/CultriX/RAG-Scraper)
+            """
+        )
 
    submit_button.click(
        fn=process_input_updated,
@@ -352,26 +308,5 @@ with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme")
        outputs=[status_output, preview_output, file_download_output],
    )
 
-# -----------------------------
-# MCP-only Interface (Pydantic tool)
-# -----------------------------
-# We expose a second interface whose *function signature* uses Pydantic models.
-# MCP reads this signature to build a JSON Schema with rich field descriptions.
-mcp_iface = gr.Interface(
-    fn=process_input_mcp,
-    # Components are placeholders; MCP ignores them and reads the Python types.
-    # Keep them simple so the tab is usable if someone clicks it.
-    inputs=gr.JSON(label="ProcessArgs (JSON)"),
-    outputs=gr.JSON(label="ProcessResult (JSON)"),
-    title="MCP Tool: process_input_mcp",
-    description="Pydantic-typed MCP tool exposing rich parameter descriptions.",
-    allow_flagging="never",
-)
-
-# Combine the user UI and the MCP tool as two tabs (the second can be ignored by users).
-app = gr.TabbedInterface([ui_iface, mcp_iface], tab_names=["App", "MCP"])
-
-
 if __name__ == "__main__":
-    # IMPORTANT: enable MCP on launch so Spaces exposes /gradio_api/mcp/sse
-    app.queue().launch(share=True, mcp_server=True)
+    iface.launch()
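With the MCP tab and the Pydantic wrappers removed, `process_input_updated` is the app's single entry point; it returns a `(status, preview, file_path)` triple. A minimal headless driver sketch (hypothetical, not part of this PR; it assumes the app's dependencies are installed and it runs from the repository root):

```python
# Importing app builds the Blocks UI but does not launch it,
# since iface.launch() sits behind the __main__ guard.
from app import process_input_updated

status, preview, file_path = process_input_updated(
    "https://example.com",          # full URL, or owner/repo for GitHub
    "Webpage",                      # source_type: "Webpage" or "GitHub Repository"
    0,                              # depth: 0 = main page only (ignored for GitHub)
    "Markdown",                     # output format: Markdown/JSON/CSV/Text/PDF
    progress=lambda *a, **k: None,  # stand-in: gr.Progress only tracks inside a Gradio event
)
print(status)
if preview:
    print(preview[:200])
if file_path:
    print("artifact:", file_path)
```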
requirements.txt CHANGED
@@ -1,71 +1,6 @@
-rag-scraper
-pydantic
-markdown-pdf
-typing-extensions
-gradio>=6.0
-requests
-Pillow
-markdown
-beautifulsoup4
-lxml
-aiohttp
-fake-useragent
-urllib3
-html5lib
-chardet
-tqdm
-python-dateutil
-pytz
-click
-nltk
-spacy
-scrapy
-selenium
-webdriver-manager
-pandas
-numpy
-openpyxl
-PyPDF2
-python-docx
-python-pptx
-reportlab
-pdfkit
-weasyprint
-cssutils
-tinycss2
-cchardet
-idna
-certifi
-charset-normalizer
-httpx
-httpcore
-anyio
-sniffio
-pysocks
-win-inet-pton
-deprecation
-docstring-parser
-rich
-typer
-pyyaml
-toml
-tomli
-packaging
-filelock
-huggingface-hub
-safetensors
-regex
-tokenizers
-sentencepiece
-accelerate
-torch
-torchvision
-torchaudio
-transformers
-diffusers
-datasets
-evaluate
-scipy
-scikit-learn
-joblib
-threadpoolctl
+html2text
+gradio>=4.44.1
+requests>=2.31.0
+beautifulsoup4>=4.12.3
+lxml>=4.9.3
+markdown>=3.5.2
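
The trimmed requirements keep only the scraping and UI basics. Note that app.py still imports `rag_scraper` and `markdown_pdf`, which no longer appear here; in the Docker build they would have to come in via `poetry install` against pyproject.toml. A small environment sanity check (a sketch; the module list is an assumption read off app.py's imports, with bs4/lxml backing the HTML parsing):

```python
import importlib

# Modules the app needs at runtime; missing ones are reported rather than raising.
for module in ("gradio", "rag_scraper", "markdown_pdf", "bs4", "lxml", "markdown"):
    try:
        importlib.import_module(module)
        print(f"OK:      {module}")
    except ImportError as exc:
        print(f"MISSING: {module} ({exc})")
```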