Spaces:
Running
Running
Asish Karthikeya Gogineni
Refactor: Upgraded to Agentic Chatbot with AST & Call Graph support
5b89d45 | #!/usr/bin/env python3 | |
| """ | |
| 🕷️ Code Crawler CLI | |
| Command-line interface for the Code Crawler engine. | |
| """ | |
| import argparse | |
| import os | |
| import sys | |
| import logging | |
| import shutil | |
| import json | |
| from dotenv import load_dotenv | |
| # Rich Imports | |
| from rich.console import Console | |
| from rich.markdown import Markdown | |
| from rich.panel import Panel | |
| from rich.prompt import Prompt | |
| from rich.progress import Progress, SpinnerColumn, TextColumn | |
| # Local Imports | |
| from .indexer import Indexer | |
| from .rag import ChatEngine | |
| from .ast_analysis import ASTGraphBuilder | |
| from .graph_rag import GraphEnhancedRetriever | |
| from .universal_ingestor import process_source | |
| from .agent_workflow import create_agent_graph | |
# --- Console & logging configuration ---
console = Console()
logging.basicConfig(level=logging.ERROR)

# Silence chatty third-party libraries; table keeps the list in one place.
_NOISY_LOGGERS = {
    "httpx": logging.WARNING,
    "httpcore": logging.WARNING,
    "chromadb": logging.ERROR,
    "google_genai": logging.ERROR,
    "google.genai": logging.ERROR,
    "code_chatbot.chunker": logging.ERROR,
}
for _noisy_name, _noisy_level in _NOISY_LOGGERS.items():
    logging.getLogger(_noisy_name).setLevel(_noisy_level)

# Application logger stays at INFO despite the ERROR-level root config.
logger = logging.getLogger("CodeCrawlerCLI")
logger.setLevel(logging.INFO)

BANNER = """
[bold cyan] 🕷️ Code Crawler CLI 🕷️[/bold cyan]
[dim] Index. Chat. Understand.[/dim]
"""
def setup_env():
    """Load environment variables (API keys, etc.) from a local .env file."""
    load_dotenv()
def print_banner():
    """Render the startup banner inside a cyan-bordered rich Panel."""
    console.print(Panel(BANNER, subtitle="v2.0", border_style="cyan"))
def handle_index(args):
    """
    Handle the `index` sub-command.

    Pipeline: resolve API keys -> extract/ingest the source -> build the
    AST knowledge graph -> index documents into the vector database.

    Args:
        args: argparse namespace with `source`, `provider`, `vector_db`,
              and `clean` attributes.

    Exits with status 1 on missing API keys or any pipeline failure.
    """
    console.print(f"[bold blue][INFO][/bold blue] Starting indexing for source: [green]{args.source}[/green]")

    # 1. Resolve API keys for the selected provider.
    if args.provider == "gemini":
        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            console.print("[bold red][ERROR][/bold red] GOOGLE_API_KEY not found in .env")
            sys.exit(1)
        embedding_provider = "gemini"
        embedding_api_key = api_key
    elif args.provider == "groq":
        api_key = os.getenv("GROQ_API_KEY")
        # Embeddings are always produced via Gemini, even when Groq is the LLM.
        embedding_api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            console.print("[bold red][ERROR][/bold red] GROQ_API_KEY not found in .env")
            sys.exit(1)
        if not embedding_api_key:
            console.print("[bold red][ERROR][/bold red] GOOGLE_API_KEY (for embeddings) not found in .env")
            sys.exit(1)
        embedding_provider = "gemini"
    else:
        console.print(f"[bold red]Unknown provider:[/bold red] {args.provider}")
        sys.exit(1)

    try:
        # 2. Extract & ingest the source (ZIP, folder, or URL).
        extract_to = "data/extracted"
        if args.clean and os.path.exists(extract_to):
            console.print("[bold yellow][WARN][/bold yellow] Cleaning previous data...")
            shutil.rmtree(extract_to)
        with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
            task = progress.add_task("Processing source...", total=None)
            documents, local_path = process_source(args.source, extract_to)
            progress.update(task, completed=True, description="[bold green]Source Processed[/bold green]")
        console.print(f"[bold green][SUCCESS][/bold green] Ingested {len(documents)} documents.")

        # Persist the resolved repo path so the `chat` command can find it later.
        os.makedirs("data", exist_ok=True)
        with open("data/cli_meta.json", "w") as f:
            json.dump({"repo_path": local_path}, f)

        # 3. AST analysis: build the knowledge graph over all ingested files.
        with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
            task = progress.add_task("Building AST Knowledge Graph...", total=None)
            ast_builder = ASTGraphBuilder()
            for doc in documents:
                # doc.metadata['file_path'] is absolute
                ast_builder.add_file(doc.metadata['file_path'], doc.page_content)
            # Web sources might not create the directory
            os.makedirs(local_path, exist_ok=True)
            graph_path = os.path.join(local_path, "ast_graph.graphml")
            ast_builder.save_graph(graph_path)
            progress.update(task, completed=True, description="[bold green]AST Graph Built[/bold green]")
        console.print(f"[bold green][SUCCESS][/bold green] AST Graph ready ({ast_builder.graph.number_of_nodes()} nodes).")

        # 4. Vector indexing into the configured backend (chroma/faiss).
        with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
            task = progress.add_task(f"Indexing into {args.vector_db}...", total=None)
            indexer = Indexer(
                provider=embedding_provider,
                api_key=embedding_api_key
            )
            # Clear old data if requested
            if args.clean:
                indexer.clear_collection()
            indexer.index_documents(documents, vector_db_type=args.vector_db)
            progress.update(task, completed=True, description=f"[bold green]Indexed into {args.vector_db}[/bold green]")
        console.print("[bold green][SUCCESS][/bold green] Indexing Complete! You can now run `code-crawler chat`.")
    except Exception as e:
        # Keep the full traceback in the log, print a friendly message, and
        # exit non-zero so shell scripts can detect the failure (previously
        # the process exited 0 even when indexing failed).
        logger.exception("Indexing failed")
        console.print(f"[bold red][ERROR][/bold red] Indexing failed: {e}")
        sys.exit(1)
def handle_chat(args):
    """
    Handle the `chat` sub-command.

    Loads the previously indexed codebase, builds the graph-enhanced
    retriever and chat engine, then runs an interactive REPL until the
    user types exit/quit/:q or presses Ctrl+C.

    Args:
        args: argparse namespace with `provider`, `vector_db`, and `agent`
              attributes.

    Exits with status 1 on missing keys, missing index, or startup failure.
    """
    console.print(f"[bold blue][INFO][/bold blue] Initializing Chat Engine ({args.provider})...")

    # Resolve provider-specific model name and API keys.
    if args.provider == "gemini":
        api_key = os.getenv("GOOGLE_API_KEY")
        embedding_api_key = api_key
        embedding_provider = "gemini"
        model_name = "gemini-2.5-flash"
    elif args.provider == "groq":
        api_key = os.getenv("GROQ_API_KEY")
        # Embeddings still come from Gemini even when Groq serves the LLM.
        embedding_api_key = os.getenv("GOOGLE_API_KEY")
        embedding_provider = "gemini"
        model_name = "llama-3.3-70b-versatile"
    else:
        # Defensive: argparse restricts --provider, but guard direct calls so
        # we fail cleanly instead of hitting UnboundLocalError below.
        console.print(f"[bold red]Unknown provider:[/bold red] {args.provider}")
        sys.exit(1)
    # Both the LLM key and the embedding key are required (the embedding key
    # was previously unchecked for groq, producing an Indexer with api_key=None).
    if not api_key or not embedding_api_key:
        console.print("[bold red][ERROR][/bold red] API Keys missing. Check .env")
        sys.exit(1)

    try:
        # Locate the indexed codebase: prefer the metadata written by `index`.
        meta_file = "data/cli_meta.json"
        if os.path.exists(meta_file):
            with open(meta_file, "r") as f:
                meta = json.load(f)
            local_path = meta.get("repo_path")
        else:
            # Fallback heuristic: most recently modified dir under data/extracted.
            extract_root = "data/extracted"
            if not os.path.exists(extract_root):
                console.print("[bold red][ERROR][/bold red] No index info found. Run 'code-crawler index' first.")
                sys.exit(1)
            subdirs = [f.path for f in os.scandir(extract_root) if f.is_dir()]
            if not subdirs:
                local_path = extract_root
            else:
                subdirs.sort(key=lambda x: os.path.getmtime(x), reverse=True)
                local_path = subdirs[0]
        if not local_path or not os.path.exists(local_path):
            console.print(f"[bold red][ERROR][/bold red] Codebase path not found: {local_path}")
            sys.exit(1)
        console.print(f"[dim]Using codebase at: {local_path}[/dim]")

        # Initialize retrieval components (vector retriever + AST graph layer).
        with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
            task = progress.add_task("Loading resources...", total=None)
            indexer = Indexer(provider=embedding_provider, api_key=embedding_api_key)
            base_retriever = indexer.get_retriever(vector_db_type=args.vector_db)
            graph_retriever = GraphEnhancedRetriever(
                base_retriever=base_retriever,
                repo_dir=local_path
            )
            repo_files = []
            for root, _, files in os.walk(local_path):
                for file in files:
                    repo_files.append(os.path.join(root, file))
            progress.update(task, completed=True, description="[bold green]Resources Loaded[/bold green]")

        # Initialize the chat engine (agentic or standard mode).
        if args.agent:
            console.print("[bold purple]🤖 Agent Mode Enabled[/bold purple]")
        chat_engine = ChatEngine(
            retriever=graph_retriever,
            provider=args.provider,
            model_name=model_name,
            api_key=api_key,
            repo_files=repo_files,
            repo_name=os.path.basename(local_path),
            use_agent=args.agent,
            repo_dir=local_path
        )
        console.print("\n[bold green]Ready![/bold green] chat initialized. Type 'exit' to quit.\n")

        # Interactive REPL loop.
        while True:
            try:
                query = Prompt.ask("[bold cyan]User[/bold cyan]")
                if query.strip().lower() in ['exit', 'quit', ':q']:
                    break
                if not query.strip():
                    continue
                console.print("[dim]🕷️ Thinking...[/dim]")
                # Unified Chat Call (Handles Agent & Standard + Fallback)
                response = chat_engine.chat(query)
                if isinstance(response, tuple):
                    answer, sources = response
                else:
                    answer = response
                    sources = []
                # Render the answer, then a de-duplicated list of source files.
                console.print(Panel(Markdown(answer), title="Spider", border_style="magenta", expand=False))
                if sources:
                    console.print("[dim]Sources:[/dim]")
                    seen = set()
                    for s in sources:
                        fp = s.get('file_path', 'unknown')
                        if fp not in seen:
                            console.print(f" - [underline]{os.path.basename(fp)}[/underline]")
                            seen.add(fp)
                    console.print("")
            except KeyboardInterrupt:
                break
            except Exception as e:
                # Per-query failures should not kill the whole session.
                console.print(f"[bold red][ERROR][/bold red] {e}")
    except Exception as e:
        # Startup failure: log the traceback and exit non-zero so callers
        # can detect it (previously the process exited 0).
        logger.exception("Chat failed to start")
        console.print(f"[bold red][ERROR][/bold red] Chat failed to start: {e}")
        sys.exit(1)
def main():
    """CLI entry point: parse arguments and dispatch to the sub-command handler."""
    setup_env()
    print_banner()

    parser = argparse.ArgumentParser(description="Code Crawler CLI")
    subparsers = parser.add_subparsers(dest="command", required=True)

    # "index" sub-command: ingest and index a codebase.
    index_cmd = subparsers.add_parser("index", help="Index a codebase (ZIP, URL, or Path)")
    index_cmd.add_argument("--source", "-s", required=True, help="Path to ZIP, Folder, or GitHub URL")
    index_cmd.add_argument("--provider", "-p", default="gemini", choices=["gemini", "groq"], help="LLM Provider")
    index_cmd.add_argument("--vector-db", "-v", default="chroma", choices=["chroma", "faiss"], help="Vector Database")
    index_cmd.add_argument("--clean", action="store_true", help="Clean previous index before running")

    # "chat" sub-command: interactive Q&A over the indexed codebase.
    chat_cmd = subparsers.add_parser("chat", help="Chat with the indexed codebase")
    chat_cmd.add_argument("--provider", "-p", default="gemini", choices=["gemini", "groq"], help="LLM Provider")
    chat_cmd.add_argument("--vector-db", "-v", default="chroma", choices=["chroma", "faiss"], help="Vector Database type used during index")
    chat_cmd.add_argument("--agent", "-a", action="store_true", help="Enable Agentic Reasoning (LangGraph)")

    args = parser.parse_args()

    # Dispatch table keyed by sub-command name; `required=True` above
    # guarantees args.command is one of these keys.
    handlers = {"index": handle_index, "chat": handle_chat}
    handlers[args.command](args)


if __name__ == "__main__":
    main()