Spaces:

Asish22
/

code-crawler

Sleeping

App Files Community

Asish Karthikeya Gogineni committed on Jan 28

Commit

7dec411

1 Parent(s): 039d022

chore: Remove unused files (rate_limit_config, setup.py, ingestor, cli)

Browse files

Files changed (4) hide show

code_chatbot/cli.py +0 -298
code_chatbot/ingestor.py +0 -103
rate_limit_config.py +0 -63
setup.py +0 -15

code_chatbot/cli.py DELETED Viewed

@@ -1,298 +0,0 @@
-#!/usr/bin/env python3
-"""
-🕷️ Code Crawler CLI
-Command-line interface for the Code Crawler engine.
-"""
-import argparse
-import os
-import sys
-import logging
-import shutil
-import json
-from dotenv import load_dotenv
-# Rich Imports
-from rich.console import Console
-from rich.markdown import Markdown
-from rich.panel import Panel
-from rich.prompt import Prompt
-from rich.progress import Progress, SpinnerColumn, TextColumn
-# Local Imports
-from .indexer import Indexer
-from .rag import ChatEngine
-from .ast_analysis import ASTGraphBuilder
-from .graph_rag import GraphEnhancedRetriever
-from .universal_ingestor import process_source
-from .agent_workflow import create_agent_graph
-# Configure Console
-console = Console()
-logging.basicConfig(level=logging.ERROR)
-# Suppress noisy libraries
-logging.getLogger("httpx").setLevel(logging.WARNING)
-logging.getLogger("httpcore").setLevel(logging.WARNING)
-logging.getLogger("chromadb").setLevel(logging.ERROR)
-logging.getLogger("google_genai").setLevel(logging.ERROR)
-logging.getLogger("google.genai").setLevel(logging.ERROR)
-logging.getLogger("code_chatbot.chunker").setLevel(logging.ERROR)
-logger = logging.getLogger("CodeCrawlerCLI")
-logger.setLevel(logging.INFO)
-BANNER = """
-[bold cyan]    🕷️  Code Crawler CLI  🕷️[/bold cyan]
-[dim]    Index. Chat. Understand.[/dim]
-"""
-def setup_env():
-    load_dotenv()
-def print_banner():
-    console.print(Panel(BANNER, subtitle="v2.0", border_style="cyan"))
-def handle_index(args):
-    """
-    Handles the indexing command.
-    """
-    console.print(f"[bold blue][INFO][/bold blue] Starting indexing for source: [green]{args.source}[/green]")
-    # 1. Setup Environment
-    if args.provider == "gemini":
-        api_key = os.getenv("GOOGLE_API_KEY")
-        if not api_key:
-            console.print("[bold red][ERROR][/bold red] GOOGLE_API_KEY not found in .env")
-            sys.exit(1)
-        embedding_provider = "gemini"
-        embedding_api_key = api_key
-    elif args.provider == "groq":
-        api_key = os.getenv("GROQ_API_KEY")
-        embedding_api_key = os.getenv("GOOGLE_API_KEY")
-        if not api_key:
-            console.print("[bold red][ERROR][/bold red] GROQ_API_KEY not found in .env")
-            sys.exit(1)
-        if not embedding_api_key:
-            console.print("[bold red][ERROR][/bold red] GOOGLE_API_KEY (for embeddings) not found in .env")
-            sys.exit(1)
-        embedding_provider = "gemini"
-    else:
-        console.print(f"[bold red]Unknown provider:[/bold red] {args.provider}")
-        sys.exit(1)
-    try:
-        # 2. Extract & Ingest
-        extract_to = "data/extracted"
-        # Optional: Clean previous data
-        if args.clean and os.path.exists(extract_to):
-            console.print("[bold yellow][WARN][/bold yellow] Cleaning previous data...")
-            shutil.rmtree(extract_to)
-        with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
-            task = progress.add_task("Processing source...", total=None)
-            documents, local_path = process_source(args.source, extract_to)
-            progress.update(task, completed=True, description="[bold green]Source Processed[/bold green]")
-        console.print(f"[bold green][SUCCESS][/bold green] Ingested {len(documents)} documents.")
-        # Save metadata for Chat to find the path
-        os.makedirs("data", exist_ok=True)
-        with open("data/cli_meta.json", "w") as f:
-            json.dump({"repo_path": local_path}, f)
-        # 3. AST Analysis
-        with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
-            task = progress.add_task("Building AST Knowledge Graph...", total=None)
-            ast_builder = ASTGraphBuilder()
-            for doc in documents:
-                # doc.metadata['file_path'] is absolute
-                ast_builder.add_file(doc.metadata['file_path'], doc.page_content)
-            # Web sources might not create the directory
-            os.makedirs(local_path, exist_ok=True)
-            graph_path = os.path.join(local_path, "ast_graph.graphml")
-            ast_builder.save_graph(graph_path)
-            progress.update(task, completed=True, description="[bold green]AST Graph Built[/bold green]")
-        console.print(f"[bold green][SUCCESS][/bold green] AST Graph ready ({ast_builder.graph.number_of_nodes()} nodes).")
-        # 4. Vector Indexing
-        with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
-            task = progress.add_task(f"Indexing into {args.vector_db}...", total=None)
-            indexer = Indexer(
-                provider=embedding_provider,
-                api_key=embedding_api_key
-            )
-            # Clear old data if requested
-            if args.clean:
-                indexer.clear_collection()
-            indexer.index_documents(documents, vector_db_type=args.vector_db)
-            progress.update(task, completed=True, description=f"[bold green]Indexed into {args.vector_db}[/bold green]")
-        console.print(f"[bold green][SUCCESS][/bold green] Indexing Complete! You can now run `code-crawler chat`.")
-    except Exception as e:
-        console.print(f"[bold red][ERROR][/bold red] Indexing failed: {e}")
-        # import traceback
-        # traceback.print_exc()
-def handle_chat(args):
-    """
-    Handles the chat command.
-    """
-    console.print(f"[bold blue][INFO][/bold blue] Initializing Chat Engine ({args.provider})...")
-    # Setup Env & Keys
-    if args.provider == "gemini":
-        api_key = os.getenv("GOOGLE_API_KEY")
-        embedding_api_key = api_key
-        embedding_provider = "gemini"
-        model_name = "gemini-2.5-flash"
-        llm_provider_lib = "google_genai"
-    elif args.provider == "groq":
-        api_key = os.getenv("GROQ_API_KEY")
-        embedding_api_key = os.getenv("GOOGLE_API_KEY")
-        embedding_provider = "gemini"
-        model_name = "llama-3.3-70b-versatile"
-        llm_provider_lib = "groq"
-    if not api_key:
-        console.print("[bold red][ERROR][/bold red] API Keys missing. Check .env")
-        sys.exit(1)
-    try:
-        # Load Resources
-        meta_file = "data/cli_meta.json"
-        if os.path.exists(meta_file):
-            with open(meta_file, "r") as f:
-                meta = json.load(f)
-                local_path = meta.get("repo_path")
-        else:
-             # Fallback Heuristic
-             extract_root = "data/extracted"
-             if not os.path.exists(extract_root):
-                 console.print("[bold red][ERROR][/bold red] No index info found. Run 'code-crawler index' first.")
-                 sys.exit(1)
-             subdirs = [f.path for f in os.scandir(extract_root) if f.is_dir()]
-             if not subdirs:
-                 local_path = extract_root
-             else:
-                 subdirs.sort(key=lambda x: os.path.getmtime(x), reverse=True)
-                 local_path = subdirs[0]
-        if not local_path or not os.path.exists(local_path):
-             console.print(f"[bold red][ERROR][/bold red] Codebase path not found: {local_path}")
-             sys.exit(1)
-        console.print(f"[dim]Using codebase at: {local_path}[/dim]")
-        # Initialize Components
-        with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
-            task = progress.add_task("Loading resources...", total=None)
-            indexer = Indexer(provider=embedding_provider, api_key=embedding_api_key)
-            base_retriever = indexer.get_retriever(vector_db_type=args.vector_db)
-            graph_retriever = GraphEnhancedRetriever(
-                base_retriever=base_retriever,
-                repo_dir=local_path
-            )
-            repo_files = []
-            for root, _, files in os.walk(local_path):
-                for file in files:
-                    repo_files.append(os.path.join(root, file))
-            progress.update(task, completed=True, description="[bold green]Resources Loaded[/bold green]")
-        # Initialize ChatEngine
-        if args.agent:
-            console.print("[bold purple]🤖 Agent Mode Enabled[/bold purple]")
-        chat_engine = ChatEngine(
-            retriever=graph_retriever,
-            provider=args.provider,
-            model_name=model_name,
-            api_key=api_key,
-            repo_files=repo_files,
-            repo_name=os.path.basename(local_path),
-            use_agent=args.agent,
-            repo_dir=local_path
-        )
-        console.print("\n[bold green]Ready![/bold green] chat initialized. Type 'exit' to quit.\n")
-        while True:
-            try:
-                query = Prompt.ask("[bold cyan]User[/bold cyan]")
-                if query.strip().lower() in ['exit', 'quit', ':q']:
-                    break
-                if not query.strip():
-                    continue
-                console.print("[dim]🕷️  Thinking...[/dim]")
-                # Unified Chat Call (Handles Agent & Standard + Fallback)
-                response = chat_engine.chat(query)
-                if isinstance(response, tuple):
-                    answer, sources = response
-                else:
-                    answer = response
-                    sources = []
-                # Render Response
-                console.print(Panel(Markdown(answer), title="Spider", border_style="magenta", expand=False))
-                if sources:
-                    console.print("[dim]Sources:[/dim]")
-                    seen = set()
-                    for s in sources:
-                        fp = s.get('file_path', 'unknown')
-                        if fp not in seen:
-                            console.print(f" - [underline]{os.path.basename(fp)}[/underline]")
-                            seen.add(fp)
-                console.print("")
-            except KeyboardInterrupt:
-                break
-            except Exception as e:
-                console.print(f"[bold red][ERROR][/bold red] {e}")
-    except Exception as e:
-        console.print(f"[bold red][ERROR][/bold red] Chat failed to start: {e}")
-        # import traceback
-        # traceback.print_exc()
-def main():
-    setup_env()
-    print_banner()
-    parser = argparse.ArgumentParser(description="Code Crawler CLI")
-    subparsers = parser.add_subparsers(dest="command", required=True)
-    # Index Command
-    index_parser = subparsers.add_parser("index", help="Index a codebase (ZIP, URL, or Path)")
-    index_parser.add_argument("--source", "-s", required=True, help="Path to ZIP, Folder, or GitHub URL")
-    index_parser.add_argument("--provider", "-p", default="gemini", choices=["gemini", "groq"], help="LLM Provider")
-    index_parser.add_argument("--vector-db", "-v", default="chroma", choices=["chroma", "faiss"], help="Vector Database")
-    index_parser.add_argument("--clean", action="store_true", help="Clean previous index before running")
-    # Chat Command
-    chat_parser = subparsers.add_parser("chat", help="Chat with the indexed codebase")
-    chat_parser.add_argument("--provider", "-p", default="gemini", choices=["gemini", "groq"], help="LLM Provider")
-    chat_parser.add_argument("--vector-db", "-v", default="chroma", choices=["chroma", "faiss"], help="Vector Database type used during index")
-    chat_parser.add_argument("--agent", "-a", action="store_true", help="Enable Agentic Reasoning (LangGraph)")
-    args = parser.parse_args()
-    if args.command == "index":
-        handle_index(args)
-    elif args.command == "chat":
-        handle_chat(args)
-if __name__ == "__main__":
-    main()

code_chatbot/ingestor.py DELETED Viewed

@@ -1,103 +0,0 @@
-import os
-import zipfile
-import tempfile
-import shutil
-from typing import List, Optional
-from langchain_core.documents import Document
-import logging
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# Extensions to ignore (binaries, images, etc.)
-IGNORE_EXTENSIONS = {
-    '.pyc', '.git', '.github', '.idea', '.vscode', '.DS_Store',
-    '.png', '.jpg', '.jpeg', '.gif', '.ico', '.svg',
-    '.mp4', '.mov', '.mp3', '.wav',
-    '.zip', '.tar', '.gz', '.pkl', '.bin', '.exe', '.dll', '.so', '.dylib',
-    '.pdf', '.docx', '.xlsx', '.pptx'
-}
-# Directories to ignore
-IGNORE_DIRS = {
-    '__pycache__', '.git', '.github', '.idea', '.vscode', 'node_modules', 'venv', '.venv', 'env', '.env', 'dist', 'build', 'target'
-}
-def is_text_file(file_path: str) -> bool:
-    """Check if a file is likely a text file based on extension and content."""
-    _, ext = os.path.splitext(file_path)
-    if ext.lower() in IGNORE_EXTENSIONS:
-        return False
-    try:
-        with open(file_path, 'r', encoding='utf-8') as f:
-            f.read(1024)
-        return True
-    except UnicodeDecodeError:
-        return False
-    except Exception:
-        return False
-def process_zip(zip_path: str, extract_to: str) -> List[Document]:
-    """
-    Extracts a ZIP file and returns a list of LangChain Documents.
-    Args:
-        zip_path: Path to the uploaded ZIP file.
-        extract_to: Directory to extract files to.
-    Returns:
-        List[Document]: List of documents with content and metadata.
-    """
-    documents = []
-    if not os.path.exists(extract_to):
-        os.makedirs(extract_to)
-    try:
-        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-            zip_ref.extractall(extract_to)
-        logger.info(f"Extracted {zip_path} to {extract_to}")
-        # Walk through the extracted files
-        for root, dirs, files in os.walk(extract_to):
-            # Modify dirs in-place to skip ignored directories
-            dirs[:] = [d for d in dirs if d not in IGNORE_DIRS and not d.startswith('.')]
-            for file in files:
-                if file.startswith('.'):
-                    continue
-                file_path = os.path.join(root, file)
-                if is_text_file(file_path):
-                    try:
-                        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
-                            content = f.read()
-                        # Create relative path for metadata
-                        rel_path = os.path.relpath(file_path, extract_to)
-                        doc = Document(
-                            page_content=content,
-                            metadata={
-                                "source": rel_path,
-                                "file_path": file_path,
-                                "file_name": file
-                            }
-                        )
-                        documents.append(doc)
-                    except Exception as e:
-                        logger.warning(f"Failed to read {file_path}: {e}")
-        logger.info(f"Processed {len(documents)} documents from {zip_path}")
-        return documents
-    except zipfile.BadZipFile:
-        logger.error(f"Invalid ZIP file: {zip_path}")
-        raise ValueError("The provided file is not a valid ZIP archive.")
-    except Exception as e:
-        logger.error(f"Error processing ZIP: {e}")
-        raise e

rate_limit_config.py DELETED Viewed

@@ -1,63 +0,0 @@
-# Rate Limit Configuration
-# Customize these settings to control API usage and maximize chat availability
-# ============================================================================
-# PROVIDER LIMITS (Free Tier Defaults)
-# ============================================================================
-# Gemini 2.0 Flash Experimental (Latest Model)
-GEMINI_RPM = 15  # Requests per minute
-GEMINI_TPM = 1000000  # Tokens per minute (1 million)
-GEMINI_MIN_DELAY = 4.0  # Minimum seconds between requests (60s / 15 RPM = 4s)
-GEMINI_BURST_DELAY = 10.0  # Delay when approaching limit
-# Groq Free Tier (Increased delays to prevent rate limits)
-GROQ_RPM = 30  # Requests per minute
-GROQ_TPM = 20000  # Conservative daily token estimate
-GROQ_MIN_DELAY = 8.0  # Minimum 8 seconds between requests (was 1s)
-GROQ_BURST_DELAY = 20.0  # Delay when approaching limit (was 10s)
-# ============================================================================
-# OPTIMIZATION SETTINGS
-# ============================================================================
-# Response Caching
-ENABLE_CACHE = True  # Cache identical queries to save API calls
-CACHE_TTL = 300  # Cache lifetime in seconds (5 minutes)
-MAX_CACHE_SIZE = 100  # Maximum number of cached responses
-# Adaptive Delays
-USE_ADAPTIVE_DELAYS = True  # Dynamically adjust delays based on usage
-RATE_LIMIT_THRESHOLD = 0.7  # Trigger longer delays at 70% of limit (0.0-1.0)
-# Context Optimization
-MAX_AGENT_TOOL_RESULTS = 5  # Number of search results per tool call
-MAX_AGENT_CONTENT_LENGTH = 2000  # Characters per search result
-MAX_LINEAR_DOCS = 8  # Number of documents for linear RAG
-MAX_LINEAR_CONTENT_LENGTH = 1500  # Characters per document
-# ============================================================================
-# ADVANCED SETTINGS
-# ============================================================================
-# Fallback Behavior
-AUTO_FALLBACK_TO_LINEAR = True  # Fall back to linear RAG on agent rate limits
-MAX_AGENT_RETRIES = 2  # Number of retries on rate limit errors
-# Statistics & Monitoring
-SHOW_USAGE_STATS = True  # Display usage stats in sidebar
-LOG_RATE_LIMIT_WARNINGS = True  # Log when approaching limits
-# Token Budget (Optional - set to 0 to disable)
-# Stop making requests after hitting daily token budget
-DAILY_TOKEN_BUDGET_GEMINI = 0  # 0 = unlimited (within API limits)
-DAILY_TOKEN_BUDGET_GROQ = 0  # 0 = unlimited (within API limits)
-# ============================================================================
-# TIPS FOR MAXIMIZING USAGE
-# ============================================================================
-# 1. Set lower MIN_DELAY values for faster responses (but higher risk)
-# 2. Enable CACHE to avoid repeat API calls
-# 3. Reduce MAX_AGENT_TOOL_RESULTS if hitting rate limits frequently
-# 4. Use linear RAG mode for simpler questions (faster, fewer API calls)
-# 5. Switch providers if one is exhausted (Gemini <-> Groq)

setup.py DELETED Viewed

@@ -1,15 +0,0 @@
-# Work-around the fact that `pip install -e .` doesn't work with `pyproject.toml` files.
-from setuptools import setup
-setup(
-    name="code_chatbot",
-    version="0.1.0",
-    packages=["code_chatbot", "api"],
-    install_requires=[
-        "streamlit",
-        "langchain",
-        "chromadb",
-        "networkx",
-        "tree-sitter",
-    ],
-)