Spaces:

AiCoderv2
/

app-ernmjp-79

Build error

App Files Files Community

AiCoderv2 committed on Oct 4, 2025

Commit

de9e2bd

verified ·

1 Parent(s): 947ec04

Deploy Gradio app with multiple files

Browse files

Files changed (2) hide show

app.py +441 -0
requirements.txt +10 -0

app.py ADDED Viewed

	@@ -0,0 +1,441 @@

+import base64
+import io
+import json
+import math
+import os
+import re
+import zipfile
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, Iterator, List, Optional, Tuple
+
+import gradio as gr
+import requests
+
+from utils import (
+    clean_code_content,
+    get_file_language,
+    estimate_tokens,
+    create_chunked_output
+)
+from models import (
+    process_github_repo,
+    process_huggingface_repo,
+    download_repo_as_zip
+)
+from config import (
+    SUPPORTED_EXTENSIONS,
+    MAX_FILE_SIZE,
+    MAX_TOTAL_SIZE,
+    CHUNK_SIZE,
+    GITHUB_API_BASE,
+    HF_API_BASE
+)
+# Custom stylesheet handed to gr.Blocks(css=...) in create_interface().
+# Only the .file-stats class is referenced by markup built in this file
+# (see generate_stats); the remaining classes are not used by any code
+# visible here.
+css = """
+.container {
+    max-width: 1200px;
+    margin: 0 auto;
+}
+.progress-bar {
+    height: 20px;
+    background: linear-gradient(90deg, #4CAF50, #45a049);
+    border-radius: 10px;
+    transition: width 0.3s ease;
+}
+.file-stats {
+    background: #f0f0f0;
+    padding: 10px;
+    border-radius: 5px;
+    margin: 10px 0;
+}
+.warning {
+    background: #fff3cd;
+    border: 1px solid #ffeaa7;
+    padding: 10px;
+    border-radius: 5px;
+    color: #856404;
+}
+.error {
+    background: #f8d7da;
+    border: 1px solid #f5c6cb;
+    padding: 10px;
+    border-radius: 5px;
+    color: #721c24;
+}
+.success {
+    background: #d4edda;
+    border: 1px solid #c3e6cb;
+    padding: 10px;
+    border-radius: 5px;
+    color: #155724;
+}
+"""
+def validate_repo_url(url: str) -> Tuple[str, str]:
+    """Validate and determine repository type and owner/name"""
+    url = url.strip()
+    # GitHub URL patterns
+    github_patterns = [
+        r'github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$',
+        r'api\.github\.com/repos/([^/]+)/([^/]+)'
+    ]
+    # Hugging Face URL patterns
+    hf_patterns = [
+        r'huggingface\.co/([^/]+)/([^/]+?)(?:\.git)?/?$',
+        r'hf\.co/([^/]+)/([^/]+?)(?:\.git)?/?$'
+    ]
+    for pattern in github_patterns:
+        match = re.search(pattern, url)
+        if match:
+            return "github", f"{match.group(1)}/{match.group(2)}"
+    for pattern in hf_patterns:
+        match = re.search(pattern, url)
+        if match:
+            return "huggingface", f"{match.group(1)}/{match.group(2)}"
+    raise ValueError("Invalid repository URL. Please provide a valid GitHub or Hugging Face repository URL.")
+def process_repository(
+    repo_url: str,
+    token: str = "",
+    include_patterns: str = "",
+    exclude_patterns: str = "",
+    max_file_size_mb: int = 10,
+    chunk_size: int = 50000,
+    include_metadata: bool = True,
+    remove_comments: bool = False,
+    progress=gr.Progress()
+) -> Tuple[str, str, str]:
+    """Main function to process repository and generate text file"""
+    try:
+        # Validate URL and get repo info
+        repo_type, repo_path = validate_repo_url(repo_url)
+        # Parse include/exclude patterns
+        include_list = [p.strip() for p in include_patterns.split(",") if p.strip()] if include_patterns else []
+        exclude_list = [p.strip() for p in exclude_patterns.split(",") if p.strip()] if exclude_patterns else []
+        progress(0.1, desc="Fetching repository information...")
+        # Process repository based on type
+        if repo_type == "github":
+            files_data, repo_info = process_github_repo(
+                repo_path,
+                token,
+                include_list,
+                exclude_list,
+                max_file_size_mb * 1024 * 1024
+            )
+        else:  # huggingface
+            files_data, repo_info = process_huggingface_repo(
+                repo_path,
+                token,
+                include_list,
+                exclude_list,
+                max_file_size_mb * 1024 * 1024
+            )
+        if not files_data:
+            return "", "⚠️ No files found matching the criteria.", ""
+        progress(0.3, desc="Processing files...")
+        # Generate consolidated text
+        total_files = len(files_data)
+        processed_files = 0
+        total_tokens = 0
+        total_chars = 0
+        # Create header
+        header_lines = []
+        if include_metadata:
+            header_lines.append("=" * 80)
+            header_lines.append(f"REPOSITORY: {repo_info.get('full_name', repo_path)}")
+            header_lines.append(f"DESCRIPTION: {repo_info.get('description', 'No description')}")
+            header_lines.append(f"URL: {repo_url}")
+            header_lines.append(f"PROCESSED: {datetime.now().isoformat()}")
+            header_lines.append(f"TOTAL FILES: {total_files}")
+            header_lines.append("=" * 80)
+            header_lines.append("")
+        content_parts = ["\n".join(header_lines)]
+        # Process each file
+        for i, (file_path, content, file_size) in enumerate(files_data):
+            progress(0.3 + (0.5 * i / total_files), desc=f"Processing file {i+1}/{total_files}")
+            # Clean content if requested
+            if remove_comments:
+                content = clean_code_content(content, file_path)
+            # Add file header
+            file_header = f"\n{'-' * 60}\n"
+            file_header += f"FILE: {file_path}\n"
+            file_header += f"SIZE: {file_size:,} bytes\n"
+            file_header += f"LANGUAGE: {get_file_language(file_path)}\n"
+            file_header += f"{'-' * 60}\n\n"
+            # Add content
+            file_content = file_header + content + "\n\n"
+            # Check if adding this file would exceed chunk size
+            if len("\n".join(content_parts + [file_content])) > chunk_size:
+                # Save current chunk
+                yield "\n".join(content_parts), generate_stats(processed_files, total_tokens, total_chars, total_files), "success"
+                # Start new chunk
+                content_parts = [file_header + "\n".join(header_lines)]
+            content_parts.append(file_content)
+            processed_files += 1
+            total_chars += len(content)
+            total_tokens += estimate_tokens(content)
+        progress(0.9, desc="Finalizing...")
+        # Final content
+        final_content = "\n".join(content_parts)
+        # Add footer
+        if include_metadata:
+            footer = f"\n{'=' * 80}\n"
+            footer += f"SUMMARY:\n"
+            footer += f"- Files processed: {processed_files}\n"
+            footer += f"- Total characters: {total_chars:,}\n"
+            footer += f"- Estimated tokens: {total_tokens:,}\n"
+            footer += f"- Repository: {repo_info.get('full_name', repo_path)}\n"
+            footer += f"{'=' * 80}\n"
+            final_content += footer
+        progress(1.0, desc="Complete!")
+        return final_content, generate_stats(processed_files, total_tokens, total_chars, total_files), "success"
+    except Exception as e:
+        error_msg = f"❌ Error: {str(e)}"
+        return "", error_msg, "error"
+def generate_stats(files_processed: int, tokens: int, chars: int, total_files: int) -> str:
+    """Generate statistics HTML"""
+    stats_html = f"""
+    <div class="file-stats">
+        <h3>📊 Processing Statistics</h3>
+        <p><strong>Files Processed:</strong> {files_processed:,} / {total_files:,}</p>
+        <p><strong>Total Characters:</strong> {chars:,}</p>
+        <p><strong>Estimated Tokens:</strong> {tokens:,}</p>
+        <p><strong>Average Tokens per File:</strong> {tokens // max(files_processed, 1):,}</p>
+    </div>
+    """
+    return stats_html
+def download_repo_locally(repo_url: str, token: str = "") -> str:
+    """Download repository as ZIP for local processing"""
+    try:
+        repo_type, repo_path = validate_repo_url(repo_url)
+        if repo_type == "github":
+            return download_repo_as_zip(f"github.com/{repo_path}", token)
+        else:
+            return download_repo_as_zip(f"huggingface.co/{repo_path}", token)
+    except Exception as e:
+        return f"Error downloading repository: {str(e)}"
+# Create Gradio interface
+def create_interface():
+    """Build the Gradio Blocks UI and wire up its event handlers.
+
+    The layout is a two-column input/info row followed by the output area
+    and a set of examples. The interface is only constructed here; launching
+    it is left to the caller.
+
+    Returns:
+        The configured ``gr.Blocks`` instance.
+    """
+    with gr.Blocks(
+        title="Repo-to-Text Converter",
+        theme=gr.themes.Soft(),
+        css=css
+    ) as demo:
+        gr.Markdown("""
+        # 📚 Repository to Text Converter
+        Convert GitHub or Hugging Face repositories into formatted text files perfect for LLM training.
+        **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)**
+        """)
+        with gr.Row():
+            with gr.Column(scale=2):
+                # Input section
+                gr.Markdown("## 📥 Repository Input")
+                repo_url = gr.Textbox(
+                    label="Repository URL",
+                    placeholder="https://github.com/username/repo or https://huggingface.co/username/repo",
+                    lines=2
+                )
+                token = gr.Textbox(
+                    label="Access Token (Optional)",
+                    placeholder="GitHub token or Hugging Face token for private repos",
+                    type="password"
+                )
+                # Optional tuning knobs, collapsed by default.
+                with gr.Accordion("🔧 Advanced Options", open=False):
+                    include_patterns = gr.Textbox(
+                        label="Include Patterns (comma-separated)",
+                        placeholder="*.py,*.md,src/**/*.py",
+                        info="Only include files matching these patterns"
+                    )
+                    exclude_patterns = gr.Textbox(
+                        label="Exclude Patterns (comma-separated)",
+                        placeholder="*.git*,*.log,node_modules/**",
+                        value="*.git*,*.log,node_modules/**,__pycache__/**,.DS_Store"
+                    )
+                    max_file_size = gr.Slider(
+                        minimum=1,
+                        maximum=100,
+                        value=10,
+                        step=1,
+                        label="Max File Size (MB)",
+                        info="Files larger than this will be skipped"
+                    )
+                    chunk_size = gr.Slider(
+                        minimum=1000,
+                        maximum=100000,
+                        value=50000,
+                        step=1000,
+                        label="Chunk Size (characters)",
+                        info="Split output into chunks of this size"
+                    )
+                    include_metadata = gr.Checkbox(
+                        value=True,
+                        label="Include Metadata",
+                        info="Add repository information and statistics"
+                    )
+                    remove_comments = gr.Checkbox(
+                        value=False,
+                        label="Remove Comments",
+                        info="Strip comments from code files (experimental)"
+                    )
+                process_btn = gr.Button(
+                    "🚀 Process Repository",
+                    variant="primary",
+                    size="lg"
+                )
+                download_btn = gr.Button(
+                    "⬇️ Download as ZIP",
+                    variant="secondary"
+                )
+            with gr.Column(scale=1):
+                # Info section
+                gr.Markdown("## ℹ️ Information")
+                gr.Markdown("""
+                ### Supported Platforms:
+                - ✅ GitHub (public and private)
+                - ✅ Hugging Face (public and private)
+                ### Supported File Types:
+                - Code files (.py, .js, .java, .cpp, etc.)
+                - Documentation (.md, .txt, .rst)
+                - Configuration files (.json, .yaml, .toml)
+                - And many more!
+                ### Features:
+                - 🔄 Chunked output for large repos
+                - 📊 Token estimation
+                - 🎯 Pattern-based file filtering
+                - 🧹 Optional comment removal
+                """)
+        # Output section
+        gr.Markdown("## 📤 Output")
+        with gr.Row():
+            stats_display = gr.HTML(label="Statistics")
+        output_text = gr.Textbox(
+            label="Generated Text",
+            lines=20,
+            max_lines=50,
+            show_copy_button=True,
+            interactive=True
+        )
+        status_display = gr.HTML()
+        # Event handlers
+        # The inputs list follows the positional parameter order of
+        # process_repository; the three outputs receive its
+        # (text, stats, status) values.
+        process_btn.click(
+            fn=process_repository,
+            inputs=[
+                repo_url,
+                token,
+                include_patterns,
+                exclude_patterns,
+                max_file_size,
+                chunk_size,
+                include_metadata,
+                remove_comments
+            ],
+            outputs=[output_text, stats_display, status_display]
+        )
+        # The File output component is created inline at this point rather
+        # than alongside the other output widgets above.
+        download_btn.click(
+            fn=download_repo_locally,
+            inputs=[repo_url, token],
+            outputs=gr.File(label="Downloaded Repository")
+        )
+        # Examples
+        # Each example row lists its values in the same order as `inputs` below.
+        gr.Markdown("## 🎯 Examples")
+        gr.Examples(
+            examples=[
+                [
+                    "https://github.com/gradio-app/gradio",
+                    "",
+                    "*.py,*.md",
+                    "",
+                    10,
+                    50000,
+                    True,
+                    False
+                ],
+                [
+                    "https://huggingface.co/huggingface/transformers",
+                    "",
+                    "*.py,*.md,*.rst",
+                    "tests/**,docs/**",
+                    5,
+                    30000,
+                    True,
+                    False
+                ]
+            ],
+            inputs=[
+                repo_url,
+                token,
+                include_patterns,
+                exclude_patterns,
+                max_file_size,
+                chunk_size,
+                include_metadata,
+                remove_comments
+            ]
+        )
+    return demo
+if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch(
+        share=True,
+        show_error=True,
+        show_tips=True
+    )

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+# requirements.txt
+requests
+gradio
+git+https://github.com/huggingface/transformers
+torch
+tokenizers
+accelerate
+sentencepiece
+numpy
+Pillow