Files changed (4)
  1. Dockerfile +27 -34
  2. README.md +0 -3
  3. app.py +226 -291
  4. requirements.txt +6 -71
Dockerfile CHANGED
@@ -1,55 +1,48 @@
-# Pin to Debian 12 so wkhtmltox bookworm package exists
-FROM python:3.10-bookworm
-
-ENV DEBIAN_FRONTEND=noninteractive \
-    PYTHONDONTWRITEBYTECODE=1 \
-    PYTHONUNBUFFERED=1
+# Use an official Python runtime as a parent image
+FROM python:3.10-slim
 
+# Set the working directory in the container
 WORKDIR /app
 
-# OS deps + fonts + X libs required by wkhtmltopdf
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    ca-certificates curl gnupg git xz-utils \
-    fontconfig fonts-dejavu-core \
-    libfreetype6 libjpeg62-turbo libpng16-16 \
-    libx11-6 libxext6 libxrender1 libxcb1 \
+# Install system dependencies for Node.js installation, Git, and wkhtmltopdf (for PDF generation)
+RUN apt-get update && apt-get install -y \
+    curl \
+    gnupg \
+    git \
+    wkhtmltopdf \
     && rm -rf /var/lib/apt/lists/*
 
-# Install wkhtmltopdf (bookworm build)
-ARG WKHTML_VER=0.12.6.1-3
-RUN curl -fsSL -o /tmp/wkhtml.deb \
-    "https://github.com/wkhtmltopdf/packaging/releases/download/${WKHTML_VER}/wkhtmltox_${WKHTML_VER}.bookworm_amd64.deb" \
-    && apt-get update \
-    && apt-get install -y --no-install-recommends /tmp/wkhtml.deb \
-    && rm -f /tmp/wkhtml.deb \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN wkhtmltopdf --version
-
-# Node.js LTS (for repomix)
+# Add Node.js LTS repository and install Node.js and npm
 RUN curl -fsSL https://deb.nodesource.com/setup_lts.x | bash - \
-    && apt-get update && apt-get install -y --no-install-recommends nodejs \
-    && rm -rf /var/lib/apt/lists/*
+    && apt-get install -y nodejs
 
-# repomix
+# Install repomix globally using npm
 RUN npm install -g repomix
 
-# Poetry
+# Install Poetry
 RUN curl -sSL https://install.python-poetry.org | python3 -
+
+# Add Poetry to PATH
 ENV PATH="/root/.local/bin:$PATH"
+
+# Configure Poetry to not create virtual environments
 RUN poetry config virtualenvs.create false
 
-# deps first for better layer caching
+# Copy poetry.lock and pyproject.toml
 COPY poetry.lock pyproject.toml /app/
+
+# Install project dependencies using Poetry
 RUN poetry install --no-root --no-interaction --no-ansi
-RUN pip install gradio[mcp]
 
-# app
+# Copy the rest of the application code into the container
 COPY . .
 
+# Make port 7860 available to the world outside this container
 EXPOSE 7860
-ENV GRADIO_SERVER_NAME=0.0.0.0 \
-    GRADIO_SERVER_PORT=7860 \
-    GRADIO_MCP_SERVER=True
+
+# Define environment variable for Gradio server
+ENV GRADIO_SERVER_NAME="0.0.0.0"
+ENV GRADIO_SERVER_PORT="7860"
+
+# Run app.py when the container launches
 CMD ["python", "app.py"]
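
Note the PDF toolchain change: the pinned bookworm `.deb` and its X/font libraries are gone, and the slim image now takes whatever `wkhtmltopdf` Debian ships, with apt resolving its dependencies. The old image verified the binary with `RUN wkhtmltopdf --version`; the new one does not, so a runtime check in the spirit of `check_repomix_installed` from app.py may be worth keeping around. A minimal sketch (the function name is illustrative, not part of this PR):

```python
import subprocess

def check_wkhtmltopdf_installed() -> bool:
    """Return True if wkhtmltopdf is on PATH, mirroring check_repomix_installed."""
    try:
        result = subprocess.run(["wkhtmltopdf", "--version"],
                                capture_output=True, text=True, check=False)
        return result.returncode == 0
    except Exception:
        return False
```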
README.md CHANGED
@@ -8,9 +8,6 @@ app_file: app.py
 pinned: false
 license: mit
 short_description: Scrape web/GitHub for RAG-ready datasets.
-tags:
-  - anycoder
-sdk_version: 6.0.2
 ---
 
 # RAG-Ready Content Scraper
app.py CHANGED
@@ -1,295 +1,239 @@
-# app.py
 from __future__ import annotations
-
 import os
-import csv
-import json
-import re
+os.environ['HF_HOME'] = '/tmp/hf_cache'
+os.makedirs(os.environ['HF_HOME'], exist_ok=True) # Ensure the directory exists
+import gradio as gr
 import subprocess
+import os
+import re
 import tempfile
-from typing import Optional, Tuple, Literal
-
-import gradio as gr
-import markdown_pdf
-from typing_extensions import Annotated, Doc
-
-from pydantic import BaseModel, Field, conint
-
+import json
+import csv
+# Removed: from typing import Iterable # Added for Theme
 from rag_scraper.scraper import Scraper
 from rag_scraper.converter import Converter
 from rag_scraper.link_extractor import LinkExtractor, LinkType
 from rag_scraper.utils import URLUtils
-
-# -----------------------------
-# Environment (HF cache dir)
-# -----------------------------
-os.environ["HF_HOME"] = "/tmp/hf_cache"
-os.makedirs(os.environ["HF_HOME"], exist_ok=True)
-
-
-# -----------------------------
-# Helper utilities
-# -----------------------------
-def check_repomix_installed() -> bool:
-    """Return True if `repomix` is available on PATH."""
+# Removed: from gradio.themes.base import Base # Added for Theme
+# Removed: from gradio.themes.utils import colors, fonts, sizes # Added for Theme
+import markdown_pdf # Added for PDF conversion
+
+# --- Custom Theme Definition --- (REMOVED Seafoam class and instance)
+
+def is_github_repo(url_or_id):
+    """Check if the input is a GitHub repository URL or ID."""
+    if "github.com" in url_or_id:
+        return True
+    if re.match(r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$', url_or_id):
+        return True
+    return False
+
+def check_repomix_installed():
+    """Check if Repomix is installed."""
     try:
-        r = subprocess.run(
-            ["repomix", "--version"],
-            capture_output=True,
-            text=True,
-            check=False,
-        )
-        return r.returncode == 0
+        result = subprocess.run(["repomix", "--version"],
+                                capture_output=True, text=True, check=False)
+        return result.returncode == 0
     except Exception:
         return False
 
-
-def run_repomix(
-    repo_url_or_id: str,
-    progress: gr.Progress = gr.Progress(track_tqdm=True),
-) -> Tuple[str, Optional[str]]:
-    """Run Repomix on a GitHub repo and return combined Markdown (or an Error string)."""
-    progress(0, desc="Starting Repomix…")
+def run_repomix(repo_url_or_id, progress=gr.Progress(track_tqdm=True)):
+    """Run Repomix on the GitHub repository and return the content."""
+    progress(0, desc="Starting Repomix processing...")
     try:
-        with tempfile.TemporaryDirectory() as td:
-            out_path = os.path.join(td, "repomix-output.md")
-            repo_url = (
-                f"https://github.com/{repo_url_or_id}"
-                if ("/" in repo_url_or_id and not repo_url_or_id.startswith("http"))
-                else repo_url_or_id
-            )
+        with tempfile.TemporaryDirectory() as temp_dir:
+            output_file_name = "repomix-output.md"
+            output_file_path = os.path.join(temp_dir, output_file_name)
+
+            if '/' in repo_url_or_id and not repo_url_or_id.startswith('http'):
+                repo_url = f"https://github.com/{repo_url_or_id}"
+            else:
+                repo_url = repo_url_or_id
+
+            progress(0.2, desc=f"Running Repomix on {repo_url}...")
             cmd = [
                 "repomix",
-                "--remote",
-                repo_url,
-                "--output",
-                out_path,
-                "--style",
-                "markdown",
-                "--compress",
+                "--remote", repo_url,
+                "--output", output_file_path,
+                "--style", "markdown",
+                "--compress"
             ]
-            p = subprocess.run(
-                cmd, capture_output=True, text=True, check=False, encoding="utf-8"
-            )
-            progress(0.8, desc="Repomix done.")
-            if p.returncode != 0:
-                err = (
-                    f"Return Code: {p.returncode}\nStderr: {p.stderr}\nStdout: {p.stdout}"
-                )
-                return f"Error running Repomix:\n{err}", None
-            if os.path.exists(out_path):
-                with open(out_path, "r", encoding="utf-8") as f:
-                    return f.read(), out_path
-            return "Error: Repomix did not produce an output file.", None
+
+            process = subprocess.run(cmd, capture_output=True, text=True, check=False, encoding='utf-8')
+            progress(0.8, desc="Repomix command executed.")
+
+            if process.returncode != 0:
+                error_details = f"Return Code: {process.returncode}\nStderr: {process.stderr}\nStdout: {process.stdout}"
+                return f"Error running Repomix:\n{error_details}", None
+
+            if os.path.exists(output_file_path):
+                with open(output_file_path, 'r', encoding='utf-8') as f:
+                    content = f.read()
+                progress(1, desc="Repomix output processed.")
+                return content, output_file_path
+            else:
+                error_details = f"Return Code: {process.returncode}\nStderr: {process.stderr}\nStdout: {process.stdout}"
+                return f"Error: Repomix did not generate an output file at '{output_file_path}'.\nRepomix Output:\n{error_details}", None
+
     except Exception as e:
-        progress(1, desc="Error")
-        return f"Error processing GitHub repository: {e}", None
-
+        progress(1, desc="Error during Repomix processing.")
+        return f"Error processing GitHub repository: {str(e)}", None
 
-def scrape_and_convert_website(
-    url: str,
-    depth: int,
-    progress: gr.Progress = gr.Progress(track_tqdm=True),
-) -> Tuple[str, str]:
-    """Recursively scrape a website and convert visited pages to Markdown."""
-    progress(0, desc=f"Scraping {url}…")
-    visited = set()
+def scrape_and_convert_website(url, depth, progress=gr.Progress(track_tqdm=True)):
+    """Fetch HTML, extract links, convert to Markdown."""
+    progress(0, desc=f"Starting web scrape for {url}...")
+    visited_urls = set()
+    all_markdown_content = ""
 
-    def rec(u: str, d: int, n: int = 1, i: int = 0) -> str:
-        if u in visited or d < 0:
+    def recursive_scrape(current_url, current_depth, total_links_estimate=1, link_index=0):
+        if current_url in visited_urls or current_depth < 0:
             return ""
-        visited.add(u)
+
+        visited_urls.add(current_url)
+
         try:
-            progress(i / n if n > 0 else 0, desc=f"Scraping: {u}")
-            html = Scraper.fetch_html(u)
+            progress_val = link_index / total_links_estimate if total_links_estimate > 0 else 0
+            progress(progress_val, desc=f"Scraping: {current_url} (Depth: {depth - current_depth})")
+            html_content = Scraper.fetch_html(current_url)
         except Exception as e:
-            return f"Error fetching {u}: {e}\n"
-        md = (
-            f"## Extracted from: {u}\n\n"
-            + Converter.html_to_markdown(
-                html=html, base_url=u, parser_features="html.parser", ignore_links=True
-            )
-            + "\n\n"
+            return f"Error fetching {current_url}: {str(e)}\n"
+
+        markdown_content = f"## Extracted from: {current_url}\n\n"
+        markdown_content += Converter.html_to_markdown(
+            html=html_content,
+            base_url=current_url,
+            parser_features='html.parser',
+            ignore_links=True
         )
-        if d > 0:
+
+        page_content = markdown_content + "\n\n"
+
+        if current_depth > 0:
             try:
-                links = LinkExtractor.scrape_url(u, link_type=LinkType.INTERNAL)
-                valid = [
-                    l
-                    for l in links
-                    if URLUtils.is_internal(l, u) and l not in visited
+                links = LinkExtractor.scrape_url(current_url, link_type=LinkType.INTERNAL)
+                valid_links = [
+                    link for link in links
+                    if URLUtils.is_internal(link, current_url) and link not in visited_urls
                 ]
-                for j, nxt in enumerate(valid):
-                    md += rec(nxt, d - 1, len(valid), j)
+
+                num_links = len(valid_links)
+                for i, link_url in enumerate(valid_links):
+                    page_content += recursive_scrape(link_url, current_depth - 1, num_links, i)
             except Exception as e:
-                md += f"Error extracting links from {u}: {e}\n"
-        return md
-
-    all_md = rec(url, depth)
-    with tempfile.NamedTemporaryFile(
-        mode="w+", delete=False, suffix=".md", encoding="utf-8"
-    ) as tmp:
-        tmp.write(all_md)
-    return all_md, tmp.name
-
-
-def convert_to_json(markdown_content: str, source: str) -> str:
-    """Wrap Markdown in a tiny JSON schema."""
-    return json.dumps({"source": source, "content": markdown_content}, indent=2)
-
-
-def convert_to_csv(markdown_content: str, source: str) -> str:
-    """Write a simple 2-column CSV and return its path."""
-    f = tempfile.NamedTemporaryFile(
-        mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
-    )
-    w = csv.writer(f)
-    w.writerow(["source", "content"])
-    w.writerow([source, markdown_content])
-    f.close()
-    return f.name
-
-
-def save_output_to_file(content: str, fmt: str, source: str) -> str:
-    """Persist content in the selected format (Markdown/JSON/CSV/Text/PDF) and return file path."""
-    if fmt == "JSON":
-        data = convert_to_json(content, source)
+                page_content += f"Error extracting links from {current_url}: {str(e)}\n"
+        return page_content
+
+    all_markdown_content = recursive_scrape(url, depth)
+    progress(1, desc="Web scraping complete.")
+
+    with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".md", encoding="utf-8") as tmp_file:
+        tmp_file.write(all_markdown_content)
+    return all_markdown_content, tmp_file.name
+
+def convert_to_json(markdown_content, source_url_or_id):
+    data = {"source": source_url_or_id, "content": markdown_content}
+    return json.dumps(data, indent=2)
+
+def convert_to_csv(markdown_content, source_url_or_id):
+    output = tempfile.NamedTemporaryFile(mode='w+', delete=False, newline='', suffix=".csv", encoding="utf-8")
+    writer = csv.writer(output)
+    writer.writerow(["source", "content"])
+    writer.writerow([source_url_or_id, markdown_content])
+    output.close()
+    return output.name
+
+def save_output_to_file(content, output_format, source_url_or_id):
+    """Saves content to a temporary file based on format and returns its path."""
+    processed_content = content # Default for Markdown and Text
+
+    if output_format == "JSON":
         suffix = ".json"
-    elif fmt == "CSV":
-        return convert_to_csv(content, source)
-    elif fmt == "Text":
-        data, suffix = content, ".txt"
-    elif fmt == "PDF":
+        processed_content = convert_to_json(content, source_url_or_id)
+    elif output_format == "CSV":
+        # convert_to_csv returns a path directly
+        return convert_to_csv(content, source_url_or_id)
+    elif output_format == "Text":
+        suffix = ".txt"
+    elif output_format == "PDF":
+        suffix = ".pdf"
+        # PDF conversion happens differently, creates file directly
+        pdf_output_path = ""
         try:
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
-                path = tmp_pdf.name
-            markdown_pdf.MarkdownPdf(toc_level=2).convert_from_string(content, path)
-            return path
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf_file:
+                pdf_output_path = tmp_pdf_file.name
+
+            md_pdf = markdown_pdf.MarkdownPdf(toc_level=2)
+            md_pdf.convert_from_string(content, pdf_output_path)
+            return pdf_output_path
         except Exception as e:
             print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
-            data, suffix = content, ".pdf.md"
-    else:
-        data, suffix = content, ".md"
+            suffix = ".pdf.md"
+            # No processed_content change needed, it's already markdown
+    else: # Default to Markdown
+        suffix = ".md"
 
-    with tempfile.NamedTemporaryFile(
-        mode="w+", delete=False, suffix=suffix, encoding="utf-8"
-    ) as tmp:
-        tmp.write(data)
-        return tmp.name
+    with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=suffix, encoding="utf-8") as tmp_file:
+        tmp_file.write(processed_content)
+        return tmp_file.name
 
-
-# -----------------------------
-# Core UI-bound function
-# -----------------------------
-def process_input_updated(
-    url_or_id: str,
-    source_type: Literal["Webpage", "GitHub Repository"],
-    depth: int,
-    output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
-    progress: gr.Progress = gr.Progress(track_tqdm=True),
-) -> Tuple[str, str, Optional[str]]:
-    """
-    UI function: scrape a webpage (with depth) or dump a GitHub repo (Repomix),
-    then export as Markdown/JSON/CSV/Text/PDF.
-    """
-    progress(0, desc="Initializing…")
-    out_path: Optional[str] = None
+def process_input_updated(url_or_id, source_type, depth, output_format_selection, progress=gr.Progress(track_tqdm=True)):
+    progress(0, desc="Initializing...")
+    raw_content = ""
+    error_message = ""
+    output_file_path = None
 
     if source_type == "GitHub Repository":
         if not check_repomix_installed():
-            return "Repomix is not installed or not accessible.", "", None
-        raw, _ = run_repomix(url_or_id, progress=progress)
-        if raw.startswith("Error"):
-            return raw, "", None
+            error_message = "Repomix is not installed or not accessible. Please ensure it's installed globally."
+            return error_message, None, None
+        raw_content, _ = run_repomix(url_or_id, progress=progress)
+        if "Error" in raw_content:
+            error_message = raw_content
+            raw_content = ""
     elif source_type == "Webpage":
-        raw, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
-        if raw.startswith("Error"):
-            return raw, "", None
+        raw_content, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
+        if "Error" in raw_content:
+            error_message = raw_content
+            raw_content = ""
     else:
-        return "Invalid source type selected.", "", None
+        error_message = "Invalid source type selected."
+        return error_message, None, None
 
-    try:
-        progress(0.9, desc=f"Converting to {output_format_selection}…")
-        out_path = save_output_to_file(raw, output_format_selection, url_or_id)
+    if error_message:
+        return error_message, None, None
 
-        preview = raw
+    try:
+        progress(0.9, desc=f"Converting to {output_format_selection}...")
+        output_file_path = save_output_to_file(raw_content, output_format_selection, url_or_id)
+
+        preview_content = raw_content
         if output_format_selection == "JSON":
-            preview = convert_to_json(raw, url_or_id)
-        elif output_format_selection == "CSV":
+            preview_content = convert_to_json(raw_content, url_or_id)
+        elif output_format_selection == "CSV" and output_file_path:
             try:
-                with open(out_path, "r", encoding="utf-8") as f:
-                    first_lines = [next(f) for _ in range(5)]
-                preview = "".join(first_lines) or "[CSV content is empty or very short]"
+                with open(output_file_path, 'r', encoding='utf-8') as f_csv:
+                    csv_preview_lines = [next(f_csv) for _ in range(5)]
+                preview_content = "".join(csv_preview_lines)
+                if not preview_content: preview_content = "[CSV content is empty or very short]"
             except StopIteration:
-                with open(out_path, "r", encoding="utf-8") as f:
-                    preview = f.read() or "[CSV content is empty]"
-            except Exception as e:
-                preview = f"[Error reading CSV for preview: {e}]"
+                with open(output_file_path, 'r', encoding='utf-8') as f_csv:
+                    preview_content = f_csv.read()
+                if not preview_content: preview_content = "[CSV content is empty]"
+            except Exception as e_csv_preview:
+                preview_content = f"[Error reading CSV for preview: {str(e_csv_preview)}]"
+        elif output_format_selection == "CSV" and not output_file_path:
+            preview_content = "[CSV file path not available for preview]"
         elif output_format_selection == "PDF":
-            from os.path import basename
-
-            preview = (
-                f"[PDF generated. Download to view: "
-                f"{basename(out_path) if out_path else 'file.pdf'}]"
-            )
-
-        progress(1, desc="Done.")
-        return f"Successfully processed: {url_or_id}", preview, out_path
+            preview_content = f"[PDF generated. Download to view: {os.path.basename(output_file_path if output_file_path else 'file.pdf')}]"
+            if "Saving as Markdown instead" in (output_file_path or ""):
+                preview_content = raw_content + f"\n\n[Note: PDF conversion failed, showing Markdown. File saved as .pdf.md]"
 
+        progress(1, desc="Processing complete.")
+        return f"Successfully processed: {url_or_id}", preview_content, output_file_path
     except Exception as e:
-        return f"Error during conversion: {e}", "", None
-
-
-# -----------------------------
-# Pydantic models for MCP tool
-# -----------------------------
-class ProcessArgs(BaseModel):
-    url_or_id: str = Field(
-        ...,
-        description=(
-            "For webpages, a full URL (e.g., https://example.com). "
-            "For GitHub, either owner/repo or a full GitHub URL (https://github.com/owner/repo)."
-        ),
-    )
-    source_type: Literal["Webpage", "GitHub Repository"] = Field(
-        ...,
-        description='Choose the source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.',
-    )
-    depth: conint(ge=0, le=3) = Field(
-        ...,
-        description="Crawl depth for webpages (0–3). 0 = only the main page. Ignored for GitHub.",
-    )
-    output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"] = Field(
-        ...,
-        description="Desired output format for the processed content.",
-    )
-
-
-class ProcessResult(BaseModel):
-    status: str = Field(..., description="Human-readable status line.")
-    preview: str = Field(
-        ...,
-        description="Preview text (Markdown/JSON/Text), or a short note for CSV/PDF.",
-    )
-    file_path: Optional[str] = Field(
-        None, description="Temp file path for the artifact, or null if not created."
-    )
-
-
-def process_input_mcp(args: ProcessArgs) -> ProcessResult:
-    """
-    MCP-friendly tool that accepts/returns Pydantic models (schema carries field descriptions).
-    """
-    status, preview, path = process_input_updated(
-        args.url_or_id, args.source_type, int(args.depth), args.output_format_selection
-    )
-    return ProcessResult(status=status, preview=preview, file_path=path)
-
-
-# -----------------------------
-# Gradio UI
-# -----------------------------
-with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as ui_iface:
+        return f"Error during file conversion/saving: {str(e)}", raw_content, None
+
+-with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as ui_iface:
+with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as iface:
     gr.Markdown("# RAG-Ready Content Scraper")
     gr.Markdown(
         "Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
@@ -299,52 +243,64 @@ with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme")
         with gr.Column(scale=2):
             url_input = gr.Textbox(
                 label="Enter URL or GitHub Repository ID",
-                placeholder="https://example.com or owner/repo",
+                placeholder="e.g., https://example.com OR username/repo"
             )
             source_type_input = gr.Radio(
                 choices=["Webpage", "GitHub Repository"],
                 value="Webpage",
-                label="Select Source Type",
+                label="Select Source Type"
            )
            depth_input = gr.Slider(
-                minimum=0,
-                maximum=3,
-                step=1,
-                value=0,
+                minimum=0, maximum=3, step=1, value=0,
                label="Scraping Depth (for Webpages)",
-                info="0 = only main page. Ignored for GitHub.",
+                info="0: Only main page. Ignored for GitHub repos."
            )
            output_format_input = gr.Dropdown(
-                choices=["Markdown", "JSON", "CSV", "Text", "PDF"],
+                choices=["Markdown", "JSON", "CSV", "Text", "PDF"],
                value="Markdown",
-                label="Select Output Format",
+                label="Select Output Format"
            )
            submit_button = gr.Button("Process Content", variant="primary")
+
        with gr.Column(scale=3):
            status_output = gr.Textbox(label="Status", interactive=False)
-            preview_output = gr.Code(
-                label="Preview Content", language="markdown", interactive=False
-            )
-            file_download_output = gr.File(
-                label="Download Processed File", interactive=False
-            )
+            preview_output = gr.Code(label="Preview Content", language="markdown", interactive=False)
+            file_download_output = gr.File(label="Download Processed File", interactive=False)
 
    gr.Examples(
        examples=[
            ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
-            ["gradio-app/gradio", "GitHub Repository", 0, "Text"],
-            [
-                "https://en.wikipedia.org/wiki/Retrieval-augmented_generation",
-                "Webpage",
-                0,
-                "JSON",
-            ],
+            ["gradio-app/gradio", "GitHub Repository", 0, "Text"],
+            ["https://en.wikipedia.org/wiki/Retrieval-augmented_generation", "Webpage", 0, "JSON"],
        ],
        inputs=[url_input, source_type_input, depth_input, output_format_input],
        outputs=[status_output, preview_output, file_download_output],
        fn=process_input_updated,
-        cache_examples=False,
+        cache_examples=False
    )
+
+    with gr.Accordion("How it Works & More Info", open=False):
+        gr.Markdown(
+            """
+            **Webpage Scraping:**
+            1. Enter a full URL (e.g., `https://example.com`).
+            2. Select "Webpage" as the source type.
+            3. Set the desired scraping depth.
+            4. Choose your output format.
+
+            **GitHub Repository Processing:**
+            1. Enter a GitHub repository URL or ID (e.g., `username/repo`).
+            2. Select "GitHub Repository". (Depth is ignored).
+            3. Choose your output format. Uses **RepoMix**.
+
+            **Output Formats:** Markdown, JSON, CSV, Text, PDF.
+
+            **Note:** PDF generation requires `markdown-pdf` library.
+            This app is designed for Docker/HuggingFace Spaces.
+
+            [View Source Code on HuggingFace Spaces](https://huggingface.co/spaces/CultriX/RAG-Scraper)
+            """
+        )
 
    submit_button.click(
        fn=process_input_updated,
@@ -352,26 +308,5 @@ with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme")
        outputs=[status_output, preview_output, file_download_output],
    )
 
-# -----------------------------
-# MCP-only Interface (Pydantic tool)
-# -----------------------------
-# We expose a second interface whose *function signature* uses Pydantic models.
-# MCP reads this signature to build a JSON Schema with rich field descriptions.
-mcp_iface = gr.Interface(
-    fn=process_input_mcp,
-    # Components are placeholders; MCP ignores them and reads the Python types.
-    # Keep them simple so the tab is usable if someone clicks it.
-    inputs=gr.JSON(label="ProcessArgs (JSON)"),
-    outputs=gr.JSON(label="ProcessResult (JSON)"),
-    title="MCP Tool: process_input_mcp",
-    description="Pydantic-typed MCP tool exposing rich parameter descriptions.",
-    allow_flagging="never",
-)
-
-# Combine the user UI and the MCP tool as two tabs (the second can be ignored by users).
-app = gr.TabbedInterface([ui_iface, mcp_iface], tab_names=["App", "MCP"])
-
-
 if __name__ == "__main__":
-    # IMPORTANT: enable MCP on launch so Spaces exposes /gradio_api/mcp/sse
-    app.queue().launch(share=True, mcp_server=True)
+    iface.launch()
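With the MCP tab and the Pydantic wrappers removed, `process_input_updated` is the app's single entry point; it returns a `(status, preview, file_path)` triple. A minimal headless driver sketch (hypothetical, not part of this PR; it assumes the app's dependencies are installed and it runs from the repository root):

```python
# Importing app builds the Blocks UI but does not launch it,
# since iface.launch() sits behind the __main__ guard.
from app import process_input_updated

status, preview, file_path = process_input_updated(
    "https://example.com",          # full URL, or owner/repo for GitHub
    "Webpage",                      # source_type: "Webpage" or "GitHub Repository"
    0,                              # depth: 0 = main page only (ignored for GitHub)
    "Markdown",                     # output format: Markdown/JSON/CSV/Text/PDF
    progress=lambda *a, **k: None,  # stand-in: gr.Progress only tracks inside a Gradio event
)
print(status)
if preview:
    print(preview[:200])
if file_path:
    print("artifact:", file_path)
```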
requirements.txt CHANGED
@@ -1,71 +1,6 @@
-rag-scraper
-pydantic
-markdown-pdf
-typing-extensions
-gradio>=6.0
-requests
-Pillow
-markdown
-beautifulsoup4
-lxml
-aiohttp
-fake-useragent
-urllib3
-html5lib
-chardet
-tqdm
-python-dateutil
-pytz
-click
-nltk
-spacy
-scrapy
-selenium
-webdriver-manager
-pandas
-numpy
-openpyxl
-PyPDF2
-python-docx
-python-pptx
-reportlab
-pdfkit
-weasyprint
-cssutils
-tinycss2
-cchardet
-idna
-certifi
-charset-normalizer
-httpx
-httpcore
-anyio
-sniffio
-pysocks
-win-inet-pton
-deprecation
-docstring-parser
-rich
-typer
-pyyaml
-toml
-tomli
-packaging
-filelock
-huggingface-hub
-safetensors
-regex
-tokenizers
-sentencepiece
-accelerate
-torch
-torchvision
-torchaudio
-transformers
-diffusers
-datasets
-evaluate
-scipy
-scikit-learn
-joblib
-threadpoolctl
+html2text
+gradio>=4.44.1
+requests>=2.31.0
+beautifulsoup4>=4.12.3
+lxml>=4.9.3
+markdown>=3.5.2
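
The trimmed requirements keep only the scraping and UI basics. Note that app.py still imports `rag_scraper` and `markdown_pdf`, which no longer appear here; in the Docker build they would have to come in via `poetry install` against pyproject.toml. A small environment sanity check (a sketch; the module list is an assumption read off app.py's imports, with bs4/lxml backing the HTML parsing):

```python
import importlib

# Modules the app needs at runtime; missing ones are reported rather than raising.
for module in ("gradio", "rag_scraper", "markdown_pdf", "bs4", "lxml", "markdown"):
    try:
        importlib.import_module(module)
        print(f"OK:      {module}")
    except ImportError as exc:
        print(f"MISSING: {module} ({exc})")
```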