Spaces:

CultriX
/

RAG-Scraper

Sleeping

App Files Files Community

CultriX committed on May 29, 2025

Commit

2471025

1 Parent(s): 2c85e25

feat: Overhaul WebUI, add PDF/Text export, use Poetry in Docker

Browse files

Files changed (3) hide show

Dockerfile +15 -5
app.py +153 -82
pyproject.toml +1 -0

Dockerfile CHANGED Viewed

@@ -4,11 +4,12 @@ FROM python:3.10-slim
 # Set the working directory in the container
 WORKDIR /app
-# Install system dependencies for Node.js installation and Git
 RUN apt-get update && apt-get install -y \
     curl \
     gnupg \
     git \
     && rm -rf /var/lib/apt/lists/*
 # Add Node.js LTS repository and install Node.js and npm
@@ -18,11 +19,20 @@ RUN curl -fsSL https://deb.nodesource.com/setup_lts.x | bash - \
 # Install repomix globally using npm
 RUN npm install -g repomix
-# Copy the requirements file into the container
-COPY requirements.txt .
-# Install any needed packages specified in requirements.txt
-RUN pip install --no-cache-dir -r requirements.txt
 # Copy the rest of the application code into the container
 COPY . .

 # Set the working directory in the container
 WORKDIR /app
+# Install system dependencies for Node.js installation, Git, and wkhtmltopdf (for PDF generation)
 RUN apt-get update && apt-get install -y \
     curl \
     gnupg \
     git \
+    wkhtmltopdf \
     && rm -rf /var/lib/apt/lists/*
 # Add Node.js LTS repository and install Node.js and npm
 # Install repomix globally using npm
 RUN npm install -g repomix
+# Install Poetry (the official installer fetches the latest release)
+RUN curl -sSL https://install.python-poetry.org | python3 -
+# Add Poetry to PATH
+ENV PATH="/root/.local/bin:$PATH"
+# Configure Poetry to not create virtual environments
+RUN poetry config virtualenvs.create false
+# Copy poetry.lock and pyproject.toml
+COPY poetry.lock pyproject.toml /app/
+# Install only the main (runtime) dependency group using Poetry.
+# `--no-dev` was deprecated in Poetry 1.2 and removed in 2.0; `--only main` is its replacement.
+RUN poetry install --no-root --only main --no-interaction --no-ansi
 # Copy the rest of the application code into the container
 COPY . .

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import gradio as gr
 import subprocess
 import os
@@ -5,10 +6,95 @@ import re
 import tempfile
 import json
 import csv
 from rag_scraper.scraper import Scraper
 from rag_scraper.converter import Converter
 from rag_scraper.link_extractor import LinkExtractor, LinkType
 from rag_scraper.utils import URLUtils
 def is_github_repo(url_or_id):
     """Check if the input is a GitHub repository URL or ID."""
@@ -32,11 +118,7 @@ def run_repomix(repo_url_or_id, progress=gr.Progress(track_tqdm=True)):
     progress(0, desc="Starting Repomix processing...")
     try:
         with tempfile.TemporaryDirectory() as temp_dir:
-            # RepoMix typically outputs a zip file if not specifying a single output style,
-            # or a specific file if --style is used.
-            # For simplicity, let's assume we want markdown and it outputs to a known file or stdout.
-            # The current repomix command in the original script uses --style markdown and --output.
-            output_file_name = "repomix-output.md" # Assuming markdown output
             output_file_path = os.path.join(temp_dir, output_file_name)
             if '/' in repo_url_or_id and not repo_url_or_id.startswith('http'):
@@ -48,12 +130,12 @@ def run_repomix(repo_url_or_id, progress=gr.Progress(track_tqdm=True)):
             cmd = [
                 "repomix",
                 "--remote", repo_url,
-                "--output", output_file_path, # Direct output to a file
-                "--style", "markdown", # Explicitly request markdown
                 "--compress"
             ]
-            process = subprocess.run(cmd, capture_output=True, text=True, check=False, encoding='utf-8') # Added encoding
             progress(0.8, desc="Repomix command executed.")
             if process.returncode != 0:
@@ -64,7 +146,7 @@ def run_repomix(repo_url_or_id, progress=gr.Progress(track_tqdm=True)):
                 with open(output_file_path, 'r', encoding='utf-8') as f:
                     content = f.read()
                 progress(1, desc="Repomix output processed.")
-                return content, output_file_path # Return content and path for potential download
             else:
                 error_details = f"Return Code: {process.returncode}\nStderr: {process.stderr}\nStdout: {process.stdout}"
                 return f"Error: Repomix did not generate an output file at '{output_file_path}'.\nRepomix Output:\n{error_details}", None
@@ -105,7 +187,6 @@ def scrape_and_convert_website(url, depth, progress=gr.Progress(track_tqdm=True)
         if current_depth > 0:
             try:
                 links = LinkExtractor.scrape_url(current_url, link_type=LinkType.INTERNAL)
-                # Filter out already visited links and external links more carefully
                 valid_links = [
                     link for link in links
                     if URLUtils.is_internal(link, current_url) and link not in visited_urls
@@ -121,53 +202,63 @@ def scrape_and_convert_website(url, depth, progress=gr.Progress(track_tqdm=True)
     all_markdown_content = recursive_scrape(url, depth)
     progress(1, desc="Web scraping complete.")
-    # For web scraping, we create a temporary file with the content for download
     with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".md", encoding="utf-8") as tmp_file:
         tmp_file.write(all_markdown_content)
         return all_markdown_content, tmp_file.name
-# --- Data Conversion Functions (Stubs for now) ---
 def convert_to_json(markdown_content, source_url_or_id):
-    """Converts markdown content to a JSON string."""
-    # Basic implementation: create a JSON object with source and content
-    # More sophisticated parsing can be added later
     data = {"source": source_url_or_id, "content": markdown_content}
     return json.dumps(data, indent=2)
 def convert_to_csv(markdown_content, source_url_or_id):
-    """Converts markdown content to a CSV string."""
-    # Basic implementation: create a CSV with source and content
-    # This is a simplified CSV; real CSVs might need more structure
     output = tempfile.NamedTemporaryFile(mode='w+', delete=False, newline='', suffix=".csv", encoding="utf-8")
     writer = csv.writer(output)
-    writer.writerow(["source", "content"]) # Header
-    # Split content into manageable chunks or lines if necessary for CSV
-    # For now, putting all content in one cell.
     writer.writerow([source_url_or_id, markdown_content])
     output.close()
-    return output.name # Return path to the CSV file
 def save_output_to_file(content, output_format, source_url_or_id):
     """Saves content to a temporary file based on format and returns its path."""
-    suffix = f".{output_format.lower()}"
     if output_format == "JSON":
         processed_content = convert_to_json(content, source_url_or_id)
     elif output_format == "CSV":
-        # convert_to_csv now returns a path directly
         return convert_to_csv(content, source_url_or_id)
-    else: # Markdown/Text
-        processed_content = content
         suffix = ".md"
     with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=suffix, encoding="utf-8") as tmp_file:
         tmp_file.write(processed_content)
         return tmp_file.name
-# --- Main Processing Function ---
 def process_input_updated(url_or_id, source_type, depth, output_format_selection, progress=gr.Progress(track_tqdm=True)):
-    """Main function to process URL or GitHub repo based on selected type and format."""
     progress(0, desc="Initializing...")
     raw_content = ""
     error_message = ""
@@ -175,17 +266,15 @@ def process_input_updated(url_or_id, source_type, depth, output_format_selection
     if source_type == "GitHub Repository":
         if not check_repomix_installed():
-            error_message = "Repomix is not installed or not accessible. Please ensure it's installed globally in your Docker environment."
-            return error_message, None, None # Text output, Preview, File output
-        raw_content, _ = run_repomix(url_or_id, progress=progress) # Repomix returns content and its original path
-        if "Error" in raw_content: # Simple error check
             error_message = raw_content
             raw_content = ""
     elif source_type == "Webpage":
         raw_content, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
-        if "Error" in raw_content: # Simple error check
             error_message = raw_content
             raw_content = ""
     else:
@@ -193,54 +282,44 @@ def process_input_updated(url_or_id, source_type, depth, output_format_selection
         return error_message, None, None
     if error_message:
-        print(f"Error before file generation: {error_message}") # DEBUGGING
-        return error_message, None, None # Error text, no preview, no file
-    # Save raw_content (which is markdown) to a file of the chosen output_format
-    # This will handle conversion if necessary
     try:
         progress(0.9, desc=f"Converting to {output_format_selection}...")
         output_file_path = save_output_to_file(raw_content, output_format_selection, url_or_id)
-        # For preview, we'll show the raw markdown, or a snippet of JSON/CSV
-        preview_content = raw_content # Default to markdown
         if output_format_selection == "JSON":
             preview_content = convert_to_json(raw_content, url_or_id)
         elif output_format_selection == "CSV" and output_file_path:
             try:
                 with open(output_file_path, 'r', encoding='utf-8') as f_csv:
-                    # Read the first 5 lines for preview
                     csv_preview_lines = [next(f_csv) for _ in range(5)]
                 preview_content = "".join(csv_preview_lines)
-                if not preview_content: # Handle empty or very short CSV
-                    preview_content = "[CSV content is empty or very short]"
-            except StopIteration: # Handle files with less than 5 lines
-                 # If StopIteration occurs, it means we've read all lines.
-                 # We need to re-open and read all lines if it was less than 5.
                 with open(output_file_path, 'r', encoding='utf-8') as f_csv:
                     preview_content = f_csv.read()
-                if not preview_content:
-                    preview_content = "[CSV content is empty]"
             except Exception as e_csv_preview:
                 preview_content = f"[Error reading CSV for preview: {str(e_csv_preview)}]"
         elif output_format_selection == "CSV" and not output_file_path:
              preview_content = "[CSV file path not available for preview]"
-        print(f"Generated output file path for download: {output_file_path}") # DEBUGGING
         progress(1, desc="Processing complete.")
         return f"Successfully processed: {url_or_id}", preview_content, output_file_path
     except Exception as e:
-        print(f"Exception during file conversion/saving: {str(e)}") # DEBUGGING
         return f"Error during file conversion/saving: {str(e)}", raw_content, None
-# --- Gradio Interface Definition ---
-with gr.Blocks(theme=gr.themes.Soft()) as iface:
     gr.Markdown("# RAG-Ready Content Scraper")
     gr.Markdown(
-        "Scrape webpage content (using RAG-scraper) or GitHub repositories (using RepoMix) "
-        "to generate RAG-ready datasets. Uses Docker for full functionality on HuggingFace Spaces."
     )
     with gr.Row():
@@ -260,7 +339,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
                 info="0: Only main page. Ignored for GitHub repos."
             )
             output_format_input = gr.Dropdown(
-                choices=["Markdown", "JSON", "CSV"], # Markdown is like text file
                 value="Markdown",
                 label="Select Output Format"
             )
@@ -268,58 +347,50 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
         with gr.Column(scale=3):
             status_output = gr.Textbox(label="Status", interactive=False)
-            preview_output = gr.Code(label="Preview Content", language="markdown", interactive=False) # Default to markdown, can show JSON too
             file_download_output = gr.File(label="Download Processed File", interactive=False)
-    progress_bar = gr.Progress(track_tqdm=True)
-    # --- Examples ---
     gr.Examples(
         examples=[
             ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
-            ["gradio-app/gradio", "GitHub Repository", 0, "Markdown"],
             ["https://en.wikipedia.org/wiki/Retrieval-augmented_generation", "Webpage", 0, "JSON"],
         ],
         inputs=[url_input, source_type_input, depth_input, output_format_input],
-        outputs=[status_output, preview_output, file_download_output], # Function needs to match this
-        fn=process_input_updated, # Make sure the function signature matches
-        cache_examples=False # For development, disable caching
     )
-    # --- How it Works & GitHub Link ---
     with gr.Accordion("How it Works & More Info", open=False):
         gr.Markdown(
             """
             **Webpage Scraping:**
             1. Enter a full URL (e.g., `https://example.com`).
             2. Select "Webpage" as the source type.
-            3. Set the desired scraping depth (how many levels of internal links to follow).
             4. Choose your output format.
-            5. The tool fetches HTML, converts it to Markdown, and follows internal links up to the specified depth.
             **GitHub Repository Processing:**
-            1. Enter a GitHub repository URL (e.g., `https://github.com/username/repo`) or shorthand ID (e.g., `username/repo`).
-            2. Select "GitHub Repository" as the source type. (Scraping depth is ignored).
-            3. Choose your output format.
-            4. The tool uses **RepoMix** to fetch and process the repository into a structured Markdown format.
-            **Output Formats:**
-            - **Markdown:** Plain text Markdown file, suitable for direct reading or further processing.
-            - **JSON:** Structured JSON output, typically with fields like `source` and `content`.
-            - **CSV:** Comma-Separated Values file, useful for tabular data or importing into spreadsheets.
-            **Note on HuggingFace Spaces:** This application is designed to run in a Docker-based HuggingFace Space,
-            which allows the use of `RepoMix` for GitHub repositories.
-            [View Source Code on HuggingFace Spaces](https://huggingface.co/spaces/CultriX/RAG-Scraper)
             """
         )
     submit_button.click(
         fn=process_input_updated,
-        inputs=[url_input, source_type_input, depth_input, output_format_input], # Removed progress_bar
         outputs=[status_output, preview_output, file_download_output],
-        # The progress instance is passed to the function via its signature's default or if explicitly managed
     )
 if __name__ == "__main__":

+from __future__ import annotations
 import gradio as gr
 import subprocess
 import os
 import tempfile
 import json
 import csv
+from typing import Iterable # Added for Theme
 from rag_scraper.scraper import Scraper
 from rag_scraper.converter import Converter
 from rag_scraper.link_extractor import LinkExtractor, LinkType
 from rag_scraper.utils import URLUtils
+from gradio.themes.base import Base # Added for Theme
+from gradio.themes.utils import colors, fonts, sizes # Added for Theme
+import markdown_pdf # Added for PDF conversion
+# --- Custom Theme Definition ---
+class Seafoam(Base):
+    """Custom Gradio theme with a teal/cyan palette and explicit dark- and light-mode overrides.
+
+    The constructor forwards the hue, size and font choices to ``Base`` and then
+    overrides individual theme variables with ``set()``.
+    """
+    def __init__(
+        self,
+        *,
+        primary_hue: colors.Color | str = colors.teal,
+        secondary_hue: colors.Color | str = colors.cyan,
+        neutral_hue: colors.Color | str = colors.gray,
+        spacing_size: sizes.Size | str = sizes.spacing_md,
+        radius_size: sizes.Size | str = sizes.radius_md,
+        text_size: sizes.Size | str = sizes.text_md, # Adjusted from lg for a more professional feel
+        font: fonts.Font
+        | str
+        | Iterable[fonts.Font | str] = (
+            fonts.GoogleFont("Inter"), # Modern sans-serif
+            "ui-sans-serif",
+            "system-ui",
+            "sans-serif",
+        ),
+        font_mono: fonts.Font
+        | str
+        | Iterable[fonts.Font | str] = (
+            fonts.GoogleFont("IBM Plex Mono"),
+            "ui-monospace",
+            "monospace",
+        ),
+    ):
+        super().__init__(
+            primary_hue=primary_hue,
+            secondary_hue=secondary_hue,
+            neutral_hue=neutral_hue,
+            spacing_size=spacing_size,
+            radius_size=radius_size,
+            text_size=text_size,
+            font=font,
+            font_mono=font_mono,
+        )
+        # Shades are attributes of the palette objects (colors.gray.c200), not
+        # module-level names such as colors.gray_200; plain CSS names are passed as strings.
+        # NOTE(review): confirm every keyword below (e.g. input_text_color, button_shadow)
+        # is accepted by Base.set() in the installed Gradio version.
+        # Dark Mode First
+        super().set(
+            # Core Colors
+            body_background_fill_dark="black", # True black
+            body_text_color_dark=colors.gray.c200,
+            block_background_fill_dark=colors.gray.c900,
+            block_border_color_dark=colors.gray.c700,
+            block_label_background_fill_dark=colors.gray.c800,
+            block_label_text_color_dark=colors.gray.c200,
+            input_background_fill_dark=colors.gray.c800,
+            input_border_color_dark=colors.gray.c600,
+            input_text_color_dark=colors.gray.c50,
+            button_primary_background_fill_dark=colors.teal.c600,
+            button_primary_background_fill_hover_dark=colors.teal.c500,
+            button_primary_text_color_dark="white",
+            button_secondary_background_fill_dark=colors.gray.c700,
+            button_secondary_background_fill_hover_dark=colors.gray.c600,
+            button_secondary_text_color_dark="white",
+            slider_color_dark=colors.teal.c500,
+            # Light Mode
+            body_background_fill="white",
+            body_text_color=colors.gray.c800,
+            block_background_fill=colors.gray.c50,
+            block_border_color=colors.gray.c300,
+            block_label_background_fill=colors.gray.c200,
+            block_label_text_color=colors.gray.c700,
+            input_background_fill="white",
+            input_border_color=colors.gray.c300,
+            input_text_color=colors.gray.c900,
+            button_primary_background_fill=colors.teal.c500,
+            button_primary_background_fill_hover=colors.teal.c600,
+            button_primary_text_color="white",
+            button_secondary_background_fill=colors.gray.c200,
+            button_secondary_background_fill_hover=colors.gray.c300,
+            button_secondary_text_color=colors.gray.c800,
+            slider_color=colors.teal.c500,
+            # General
+            block_title_text_weight="600",
+            block_shadow="*shadow_drop_lg",
+            button_shadow="*shadow_drop"
+        )
+seafoam_theme = Seafoam()
 def is_github_repo(url_or_id):
     """Check if the input is a GitHub repository URL or ID."""
     progress(0, desc="Starting Repomix processing...")
     try:
         with tempfile.TemporaryDirectory() as temp_dir:
+            output_file_name = "repomix-output.md"
             output_file_path = os.path.join(temp_dir, output_file_name)
             if '/' in repo_url_or_id and not repo_url_or_id.startswith('http'):
             cmd = [
                 "repomix",
                 "--remote", repo_url,
+                "--output", output_file_path,
+                "--style", "markdown",
                 "--compress"
             ]
+            process = subprocess.run(cmd, capture_output=True, text=True, check=False, encoding='utf-8')
             progress(0.8, desc="Repomix command executed.")
             if process.returncode != 0:
                 with open(output_file_path, 'r', encoding='utf-8') as f:
                     content = f.read()
                 progress(1, desc="Repomix output processed.")
+                return content, output_file_path
             else:
                 error_details = f"Return Code: {process.returncode}\nStderr: {process.stderr}\nStdout: {process.stdout}"
                 return f"Error: Repomix did not generate an output file at '{output_file_path}'.\nRepomix Output:\n{error_details}", None
         if current_depth > 0:
             try:
                 links = LinkExtractor.scrape_url(current_url, link_type=LinkType.INTERNAL)
                 valid_links = [
                     link for link in links
                     if URLUtils.is_internal(link, current_url) and link not in visited_urls
     all_markdown_content = recursive_scrape(url, depth)
     progress(1, desc="Web scraping complete.")
     with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".md", encoding="utf-8") as tmp_file:
         tmp_file.write(all_markdown_content)
         return all_markdown_content, tmp_file.name
 def convert_to_json(markdown_content, source_url_or_id):
+    """Return a JSON string (2-space indent) holding the source and its Markdown content."""
+    return json.dumps(
+        {"source": source_url_or_id, "content": markdown_content},
+        indent=2,
+    )
 def convert_to_csv(markdown_content, source_url_or_id):
+    """Write a two-row CSV (header + one data row) to a temporary file and return its path.
+
+    The whole Markdown document is stored in a single ``content`` cell. The file
+    is created with ``delete=False``, so it outlives this call; removing it is
+    left to the caller.
+    """
+    # The context manager closes the handle even if a write raises.
+    with tempfile.NamedTemporaryFile(mode='w+', delete=False, newline='', suffix=".csv", encoding="utf-8") as output:
+        writer = csv.writer(output)
+        writer.writerow(["source", "content"])
+        writer.writerow([source_url_or_id, markdown_content])
+    return output.name
 def save_output_to_file(content, output_format, source_url_or_id):
+    """Write ``content`` to a temporary file in the requested format and return its path.
+
+    "JSON" is serialized with ``convert_to_json``; "CSV" is delegated to
+    ``convert_to_csv``, which creates its own file. "Text" and any unrecognized
+    format write the content unchanged (``.txt`` / ``.md``). "PDF" renders the
+    Markdown with ``markdown_pdf``; if rendering fails, the content is saved
+    unchanged with a ``.pdf.md`` suffix so the fallback is visible in the file
+    name. Files are created with ``delete=False``; removing them is left to
+    the caller.
+    """
+    processed_content = content # Default for Markdown and Text
     if output_format == "JSON":
+        suffix = ".json"
         processed_content = convert_to_json(content, source_url_or_id)
     elif output_format == "CSV":
+        # convert_to_csv returns a path directly
         return convert_to_csv(content, source_url_or_id)
+    elif output_format == "Text":
+        suffix = ".txt"
+    elif output_format == "PDF":
+        # Reserve a path first; the renderer writes the PDF to it directly.
+        pdf_output_path = ""
+        try:
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf_file:
+                pdf_output_path = tmp_pdf_file.name
+            # markdown-pdf builds a document from sections and writes it with save().
+            md_pdf = markdown_pdf.MarkdownPdf(toc_level=2)
+            md_pdf.add_section(markdown_pdf.Section(content))
+            md_pdf.save(pdf_output_path)
+            return pdf_output_path
+        except Exception as e:
+            # Fallback: save as markdown with .pdf.md suffix if PDF fails
+            print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
+            # Remove the empty or partial placeholder so it is not left behind.
+            if pdf_output_path and os.path.exists(pdf_output_path):
+                os.remove(pdf_output_path)
+            suffix = ".pdf.md" # Indicate it's markdown intended for PDF
+    else: # Default to Markdown
         suffix = ".md"
+    # For formats that don't return early (JSON, Text, Markdown, PDF fallback)
     with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=suffix, encoding="utf-8") as tmp_file:
         tmp_file.write(processed_content)
         return tmp_file.name
 def process_input_updated(url_or_id, source_type, depth, output_format_selection, progress=gr.Progress(track_tqdm=True)):
+    """Fetch content for a webpage or GitHub repository and export it in the chosen format.
+
+    Returns a ``(status_message, preview_content, output_file_path)`` tuple.
+    On failure the first element carries the error text and the file path is ``None``.
+    """
     progress(0, desc="Initializing...")
     raw_content = ""
     error_message = ""
     if source_type == "GitHub Repository":
         if not check_repomix_installed():
+            error_message = "Repomix is not installed or not accessible. Please ensure it's installed globally."
+            return error_message, None, None
+        raw_content, _ = run_repomix(url_or_id, progress=progress)
+        # NOTE(review): a substring test also matches successful output that merely
+        # contains the word "Error"; consider a dedicated failure signal.
+        if "Error" in raw_content:
             error_message = raw_content
             raw_content = ""
     elif source_type == "Webpage":
         raw_content, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
+        # NOTE(review): same substring caveat as the repository branch.
+        if "Error" in raw_content:
             error_message = raw_content
             raw_content = ""
     else:
         return error_message, None, None
     if error_message:
+        return error_message, None, None
     try:
         progress(0.9, desc=f"Converting to {output_format_selection}...")
         output_file_path = save_output_to_file(raw_content, output_format_selection, url_or_id)
+        preview_content = raw_content # Default for Markdown, Text
         if output_format_selection == "JSON":
             preview_content = convert_to_json(raw_content, url_or_id)
         elif output_format_selection == "CSV" and output_file_path:
             try:
                 with open(output_file_path, 'r', encoding='utf-8') as f_csv:
                     csv_preview_lines = [next(f_csv) for _ in range(5)]
                 preview_content = "".join(csv_preview_lines)
+                if not preview_content: preview_content = "[CSV content is empty or very short]"
+            except StopIteration:
+                # Fewer than five lines: show the whole file instead.
                 with open(output_file_path, 'r', encoding='utf-8') as f_csv:
                     preview_content = f_csv.read()
+                if not preview_content: preview_content = "[CSV content is empty]"
             except Exception as e_csv_preview:
                 preview_content = f"[Error reading CSV for preview: {str(e_csv_preview)}]"
         elif output_format_selection == "CSV" and not output_file_path:
              preview_content = "[CSV file path not available for preview]"
+        elif output_format_selection == "PDF":
+            # save_output_to_file signals a failed PDF render by returning a ".pdf.md" path;
+            # the failure message is only printed, so it never appears in the path itself.
+            if (output_file_path or "").endswith(".pdf.md"):
+                preview_content = raw_content + "\n\n[Note: PDF conversion failed, showing Markdown. File saved as .pdf.md]"
+            else:
+                preview_content = f"[PDF generated. Download to view: {os.path.basename(output_file_path if output_file_path else 'file.pdf')}]"
         progress(1, desc="Processing complete.")
         return f"Successfully processed: {url_or_id}", preview_content, output_file_path
     except Exception as e:
         return f"Error during file conversion/saving: {str(e)}", raw_content, None
+with gr.Blocks(theme=seafoam_theme) as iface: # Applied custom theme
     gr.Markdown("# RAG-Ready Content Scraper")
     gr.Markdown(
+        "Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
     )
     with gr.Row():
                 info="0: Only main page. Ignored for GitHub repos."
             )
             output_format_input = gr.Dropdown(
+                choices=["Markdown", "JSON", "CSV", "Text", "PDF"], # Added Text and PDF
                 value="Markdown",
                 label="Select Output Format"
             )
         with gr.Column(scale=3):
             status_output = gr.Textbox(label="Status", interactive=False)
+            preview_output = gr.Code(label="Preview Content", language="markdown", interactive=False)
             file_download_output = gr.File(label="Download Processed File", interactive=False)
+    # Removed progress_bar = gr.Progress(track_tqdm=True) as it's passed directly
     gr.Examples(
         examples=[
             ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
+            ["gradio-app/gradio", "GitHub Repository", 0, "Text"], # Changed to Text
             ["https://en.wikipedia.org/wiki/Retrieval-augmented_generation", "Webpage", 0, "JSON"],
         ],
         inputs=[url_input, source_type_input, depth_input, output_format_input],
+        outputs=[status_output, preview_output, file_download_output],
+        fn=process_input_updated,
+        cache_examples=False
     )
     with gr.Accordion("How it Works & More Info", open=False):
         gr.Markdown(
             """
             **Webpage Scraping:**
             1. Enter a full URL (e.g., `https://example.com`).
             2. Select "Webpage" as the source type.
+            3. Set the desired scraping depth.
             4. Choose your output format.
             **GitHub Repository Processing:**
+            1. Enter a GitHub repository URL or ID (e.g., `username/repo`).
+            2. Select "GitHub Repository". (Depth is ignored).
+            3. Choose your output format. Uses **RepoMix**.
+            **Output Formats:** Markdown, JSON, CSV, Text, PDF.
+            **Note:** PDF generation requires `markdown-pdf` library.
+            This app is designed for Docker/HuggingFace Spaces.
+            [View Source Code on HuggingFace Spaces](https://huggingface.co/spaces/CultriX/RAG-Scraper)
             """
         )
     submit_button.click(
         fn=process_input_updated,
+        inputs=[url_input, source_type_input, depth_input, output_format_input],
         outputs=[status_output, preview_output, file_download_output],
     )
 if __name__ == "__main__":

pyproject.toml CHANGED Viewed

@@ -12,6 +12,7 @@ python = "^3.10"
 requests = "^2.31.0"
 beautifulsoup4 = "^4.12.2"
 html2text = "^2020.1.16"
 [tool.poetry.group.dev]
 optional = true

 requests = "^2.31.0"
 beautifulsoup4 = "^4.12.2"
 html2text = "^2020.1.16"
+markdown-pdf = "^0.2.1"
 [tool.poetry.group.dev]
 optional = true