Spaces:
Running
Running
Asish Karthikeya Gogineni committed on
Commit ·
7dec411
1
Parent(s): 039d022
chore: Remove unused files (rate_limit_config, setup.py, ingestor, cli)
Browse files- code_chatbot/cli.py +0 -298
- code_chatbot/ingestor.py +0 -103
- rate_limit_config.py +0 -63
- setup.py +0 -15
code_chatbot/cli.py
DELETED
|
@@ -1,298 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
🕷️ Code Crawler CLI
|
| 4 |
-
Command-line interface for the Code Crawler engine.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import argparse
|
| 8 |
-
import os
|
| 9 |
-
import sys
|
| 10 |
-
import logging
|
| 11 |
-
import shutil
|
| 12 |
-
import json
|
| 13 |
-
from dotenv import load_dotenv
|
| 14 |
-
|
| 15 |
-
# Rich Imports
|
| 16 |
-
from rich.console import Console
|
| 17 |
-
from rich.markdown import Markdown
|
| 18 |
-
from rich.panel import Panel
|
| 19 |
-
from rich.prompt import Prompt
|
| 20 |
-
from rich.progress import Progress, SpinnerColumn, TextColumn
|
| 21 |
-
|
| 22 |
-
# Local Imports
|
| 23 |
-
from .indexer import Indexer
|
| 24 |
-
from .rag import ChatEngine
|
| 25 |
-
from .ast_analysis import ASTGraphBuilder
|
| 26 |
-
from .graph_rag import GraphEnhancedRetriever
|
| 27 |
-
from .universal_ingestor import process_source
|
| 28 |
-
from .agent_workflow import create_agent_graph
|
| 29 |
-
|
| 30 |
-
# Shared Rich console used for every message the CLI prints.
console = Console()

# Root logging only surfaces errors; the loggers listed here get an explicit
# level so chatty dependencies stay quiet.
logging.basicConfig(level=logging.ERROR)
_QUIET_LOGGERS = {
    "httpx": logging.WARNING,
    "httpcore": logging.WARNING,
    "chromadb": logging.ERROR,
    "google_genai": logging.ERROR,
    "google.genai": logging.ERROR,
    "code_chatbot.chunker": logging.ERROR,
}
for _logger_name, _logger_level in _QUIET_LOGGERS.items():
    logging.getLogger(_logger_name).setLevel(_logger_level)

# The CLI's own logger is more verbose than the root logger.
logger = logging.getLogger("CodeCrawlerCLI")
logger.setLevel(logging.INFO)

# Rich-markup banner rendered inside a Panel by print_banner().
# NOTE(review): any leading spaces used to centre these lines may have been
# lost before this text was captured — confirm against the original file.
BANNER = """
[bold cyan] 🕷️ Code Crawler CLI 🕷️[/bold cyan]
[dim] Index. Chat. Understand.[/dim]
"""
|
| 48 |
-
|
| 49 |
-
def setup_env():
    """Load environment variables by calling ``load_dotenv()``."""
    load_dotenv()
|
| 51 |
-
|
| 52 |
-
def print_banner():
    """Print ``BANNER`` inside a cyan-bordered Rich panel subtitled "v2.0"."""
    console.print(Panel(BANNER, subtitle="v2.0", border_style="cyan"))
|
| 54 |
-
|
| 55 |
-
def handle_index(args):
    """
    Handles the indexing command.

    Resolves API keys for the chosen provider, ingests ``args.source`` into
    ``data/extracted``, records the resulting repository path in
    ``data/cli_meta.json``, builds and saves an AST graph, and finally indexes
    the documents into the vector store selected by ``args.vector_db``.

    Args:
        args: Parsed ``index`` sub-command namespace; reads ``source``,
            ``provider``, ``vector_db`` and ``clean``.

    Raises:
        SystemExit: With status 1 when a required API key is missing, the
            provider is unknown, or any indexing step fails.
    """
    # Local import: user-controlled text (paths, exception messages) must be
    # escaped before being embedded in Rich markup, otherwise a stray
    # "[...]" sequence is parsed as a tag and can garble or break the output.
    from rich.markup import escape

    console.print(
        "[bold blue][INFO][/bold blue] Starting indexing for source: "
        f"[green]{escape(str(args.source))}[/green]"
    )

    # 1. Setup Environment
    # Embeddings always use Gemini, so GOOGLE_API_KEY is needed for both providers.
    if args.provider == "gemini":
        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            console.print("[bold red][ERROR][/bold red] GOOGLE_API_KEY not found in .env")
            sys.exit(1)
        embedding_provider = "gemini"
        embedding_api_key = api_key
    elif args.provider == "groq":
        api_key = os.getenv("GROQ_API_KEY")
        embedding_api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            console.print("[bold red][ERROR][/bold red] GROQ_API_KEY not found in .env")
            sys.exit(1)
        if not embedding_api_key:
            console.print("[bold red][ERROR][/bold red] GOOGLE_API_KEY (for embeddings) not found in .env")
            sys.exit(1)
        embedding_provider = "gemini"
    else:
        console.print(f"[bold red]Unknown provider:[/bold red] {escape(str(args.provider))}")
        sys.exit(1)

    try:
        # 2. Extract & Ingest
        extract_to = "data/extracted"
        # Optional: Clean previous data
        if args.clean and os.path.exists(extract_to):
            console.print("[bold yellow][WARN][/bold yellow] Cleaning previous data...")
            shutil.rmtree(extract_to)

        with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
            task = progress.add_task("Processing source...", total=None)
            documents, local_path = process_source(args.source, extract_to)
            progress.update(task, completed=True, description="[bold green]Source Processed[/bold green]")

        console.print(f"[bold green][SUCCESS][/bold green] Ingested {len(documents)} documents.")

        # Save metadata so the chat command can find the repository path later.
        os.makedirs("data", exist_ok=True)
        with open("data/cli_meta.json", "w", encoding="utf-8") as f:
            json.dump({"repo_path": local_path}, f)

        # 3. AST Analysis
        with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
            task = progress.add_task("Building AST Knowledge Graph...", total=None)
            ast_builder = ASTGraphBuilder()
            for doc in documents:
                # NOTE(review): the original comment says file_path is absolute — confirm.
                ast_builder.add_file(doc.metadata['file_path'], doc.page_content)

            # Web sources might not create the directory
            os.makedirs(local_path, exist_ok=True)
            graph_path = os.path.join(local_path, "ast_graph.graphml")
            ast_builder.save_graph(graph_path)
            progress.update(task, completed=True, description="[bold green]AST Graph Built[/bold green]")

        console.print(f"[bold green][SUCCESS][/bold green] AST Graph ready ({ast_builder.graph.number_of_nodes()} nodes).")

        # 4. Vector Indexing
        with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
            task = progress.add_task(f"Indexing into {args.vector_db}...", total=None)
            indexer = Indexer(
                provider=embedding_provider,
                api_key=embedding_api_key,
            )
            # Clear old data if requested
            if args.clean:
                indexer.clear_collection()

            indexer.index_documents(documents, vector_db_type=args.vector_db)
            progress.update(task, completed=True, description=f"[bold green]Indexed into {args.vector_db}[/bold green]")

        console.print("[bold green][SUCCESS][/bold green] Indexing Complete! You can now run `code-crawler chat`.")

    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero so scripts
        # and CI can detect that indexing did not complete.
        console.print(f"[bold red][ERROR][/bold red] Indexing failed: {escape(str(e))}")
        sys.exit(1)
|
| 139 |
-
|
| 140 |
-
def handle_chat(args):
    """
    Handles the chat command.

    Resolves API keys, locates the previously indexed codebase (from
    ``data/cli_meta.json`` or, failing that, the most recently modified
    sub-directory of ``data/extracted``), builds the retriever and chat
    engine, and then runs an interactive prompt loop until the user types
    ``exit``/``quit``/``:q`` or interrupts input.

    Args:
        args: Parsed ``chat`` sub-command namespace; reads ``provider``,
            ``vector_db`` and ``agent``.

    Raises:
        SystemExit: With status 1 when a key is missing, the provider is
            unknown, no indexed codebase is found, or start-up fails.
    """
    # Local import: exception messages and paths are escaped before being
    # embedded in Rich markup so "[...]" in them is not parsed as a tag.
    from rich.markup import escape

    console.print(f"[bold blue][INFO][/bold blue] Initializing Chat Engine ({escape(str(args.provider))})...")

    # Setup Env & Keys
    # Embeddings always use Gemini, so GOOGLE_API_KEY is required for both providers.
    if args.provider == "gemini":
        api_key = os.getenv("GOOGLE_API_KEY")
        embedding_api_key = api_key
        embedding_provider = "gemini"
        model_name = "gemini-2.5-flash"
    elif args.provider == "groq":
        api_key = os.getenv("GROQ_API_KEY")
        embedding_api_key = os.getenv("GOOGLE_API_KEY")
        embedding_provider = "gemini"
        model_name = "llama-3.3-70b-versatile"
    else:
        # Without this branch api_key would be undefined below.
        console.print(f"[bold red]Unknown provider:[/bold red] {escape(str(args.provider))}")
        sys.exit(1)

    # Both the chat key and the embedding key must be present; the index
    # command performs the same check.
    if not api_key or not embedding_api_key:
        console.print("[bold red][ERROR][/bold red] API Keys missing. Check .env")
        sys.exit(1)

    try:
        # Load Resources
        meta_file = "data/cli_meta.json"
        if os.path.exists(meta_file):
            with open(meta_file, "r", encoding="utf-8") as f:
                meta = json.load(f)
            local_path = meta.get("repo_path")
        else:
            # Fallback Heuristic: newest sub-directory of the extraction root.
            extract_root = "data/extracted"
            if not os.path.exists(extract_root):
                console.print("[bold red][ERROR][/bold red] No index info found. Run 'code-crawler index' first.")
                sys.exit(1)

            subdirs = [entry.path for entry in os.scandir(extract_root) if entry.is_dir()]
            local_path = max(subdirs, key=os.path.getmtime) if subdirs else extract_root

        if not local_path or not os.path.exists(local_path):
            console.print(f"[bold red][ERROR][/bold red] Codebase path not found: {escape(str(local_path))}")
            sys.exit(1)

        console.print(f"[dim]Using codebase at: {escape(str(local_path))}[/dim]")

        # Initialize Components
        with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
            task = progress.add_task("Loading resources...", total=None)

            indexer = Indexer(provider=embedding_provider, api_key=embedding_api_key)
            base_retriever = indexer.get_retriever(vector_db_type=args.vector_db)

            graph_retriever = GraphEnhancedRetriever(
                base_retriever=base_retriever,
                repo_dir=local_path,
            )

            # Every file under the codebase root, as a flat list of paths.
            repo_files = [
                os.path.join(root, name)
                for root, _, files in os.walk(local_path)
                for name in files
            ]

            progress.update(task, completed=True, description="[bold green]Resources Loaded[/bold green]")

        # Initialize ChatEngine
        if args.agent:
            console.print("[bold purple]🤖 Agent Mode Enabled[/bold purple]")

        chat_engine = ChatEngine(
            retriever=graph_retriever,
            provider=args.provider,
            model_name=model_name,
            api_key=api_key,
            repo_files=repo_files,
            repo_name=os.path.basename(local_path),
            use_agent=args.agent,
            repo_dir=local_path,
        )

        console.print("\n[bold green]Ready![/bold green] chat initialized. Type 'exit' to quit.\n")

        while True:
            try:
                query = Prompt.ask("[bold cyan]User[/bold cyan]")
                if query.strip().lower() in ['exit', 'quit', ':q']:
                    break

                if not query.strip():
                    continue

                console.print("[dim]🕷️ Thinking...[/dim]")

                # Unified Chat Call (Handles Agent & Standard + Fallback)
                response = chat_engine.chat(query)

                # The engine may return either (answer, sources) or just the answer.
                if isinstance(response, tuple):
                    answer, sources = response
                else:
                    answer = response
                    sources = []

                # Render Response
                console.print(Panel(Markdown(answer), title="Spider", border_style="magenta", expand=False))

                if sources:
                    console.print("[dim]Sources:[/dim]")
                    seen = set()
                    for s in sources:
                        fp = s.get('file_path', 'unknown')
                        if fp not in seen:
                            console.print(f" - [underline]{escape(os.path.basename(fp))}[/underline]")
                            seen.add(fp)
                    console.print("")

            except (KeyboardInterrupt, EOFError):
                # EOFError means stdin is closed (Ctrl-D or piped input ran out).
                # Treating it like any other error would re-prompt forever.
                break
            except Exception as e:
                # A failed turn should not end the session; report and keep going.
                console.print(f"[bold red][ERROR][/bold red] {escape(str(e))}")

    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero.
        console.print(f"[bold red][ERROR][/bold red] Chat failed to start: {escape(str(e))}")
        sys.exit(1)
|
| 269 |
-
|
| 270 |
-
def main():
    """Entry point: load the environment, show the banner, parse arguments and dispatch.

    The banner is printed before argument parsing. Two sub-commands are
    registered, ``index`` and ``chat``, and one of them is required.
    """
    setup_env()
    print_banner()

    provider_choices = ["gemini", "groq"]
    vector_db_choices = ["chroma", "faiss"]

    parser = argparse.ArgumentParser(description="Code Crawler CLI")
    commands = parser.add_subparsers(dest="command", required=True)

    # index: ingest a codebase and build the indexes
    index_cmd = commands.add_parser("index", help="Index a codebase (ZIP, URL, or Path)")
    index_cmd.add_argument("--source", "-s", required=True, help="Path to ZIP, Folder, or GitHub URL")
    index_cmd.add_argument("--provider", "-p", default="gemini", choices=provider_choices, help="LLM Provider")
    index_cmd.add_argument("--vector-db", "-v", default="chroma", choices=vector_db_choices, help="Vector Database")
    index_cmd.add_argument("--clean", action="store_true", help="Clean previous index before running")

    # chat: interactive session over a previously indexed codebase
    chat_cmd = commands.add_parser("chat", help="Chat with the indexed codebase")
    chat_cmd.add_argument("--provider", "-p", default="gemini", choices=provider_choices, help="LLM Provider")
    chat_cmd.add_argument("--vector-db", "-v", default="chroma", choices=vector_db_choices, help="Vector Database type used during index")
    chat_cmd.add_argument("--agent", "-a", action="store_true", help="Enable Agentic Reasoning (LangGraph)")

    args = parser.parse_args()

    # Map each sub-command name to its handler; unknown names are ignored.
    handlers = {"index": handle_index, "chat": handle_chat}
    handler = handlers.get(args.command)
    if handler is not None:
        handler(args)


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
code_chatbot/ingestor.py
DELETED
|
@@ -1,103 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import zipfile
|
| 3 |
-
import tempfile
|
| 4 |
-
import shutil
|
| 5 |
-
from typing import List, Optional
|
| 6 |
-
from langchain_core.documents import Document
|
| 7 |
-
import logging
|
| 8 |
-
|
| 9 |
-
# Logging: configure the root logger at INFO and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# File suffixes that are never ingested, grouped by kind and merged below.
_TOOLING_SUFFIXES = {'.pyc', '.git', '.github', '.idea', '.vscode', '.DS_Store'}
_IMAGE_SUFFIXES = {'.png', '.jpg', '.jpeg', '.gif', '.ico', '.svg'}
_MEDIA_SUFFIXES = {'.mp4', '.mov', '.mp3', '.wav'}
_ARCHIVE_AND_BINARY_SUFFIXES = {
    '.zip', '.tar', '.gz', '.pkl', '.bin', '.exe', '.dll', '.so', '.dylib',
}
_OFFICE_SUFFIXES = {'.pdf', '.docx', '.xlsx', '.pptx'}

# Extensions to ignore (binaries, images, etc.)
IGNORE_EXTENSIONS = (
    _TOOLING_SUFFIXES
    | _IMAGE_SUFFIXES
    | _MEDIA_SUFFIXES
    | _ARCHIVE_AND_BINARY_SUFFIXES
    | _OFFICE_SUFFIXES
)

# Directory names that are skipped entirely while walking extracted files.
IGNORE_DIRS = {
    '__pycache__',
    '.git',
    '.github',
    '.idea',
    '.vscode',
    'node_modules',
    'venv',
    '.venv',
    'env',
    '.env',
    'dist',
    'build',
    'target',
}
|
| 26 |
-
|
| 27 |
-
def is_text_file(file_path: str) -> bool:
    """Check if a file is likely a text file based on extension and content.

    A file is rejected when its lower-cased extension is in
    ``IGNORE_EXTENSIONS``, when it cannot be opened, when its first 1024
    characters do not decode as UTF-8, or when that sample contains a NUL
    character.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        True if the file looks like UTF-8 text, False otherwise.
    """
    _, ext = os.path.splitext(file_path)
    if ext.lower() in IGNORE_EXTENSIONS:
        return False

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            sample = f.read(1024)
    except (OSError, ValueError):
        # OSError: unreadable or missing file.
        # ValueError: includes UnicodeDecodeError for content that is not UTF-8.
        return False

    # NUL bytes are valid UTF-8 but do not occur in ordinary text, so a file
    # that decodes cleanly yet contains them is treated as binary.
    return '\x00' not in sample
|
| 41 |
-
|
| 42 |
-
def process_zip(zip_path: str, extract_to: str) -> List[Document]:
    """
    Extracts a ZIP file and returns a list of LangChain Documents.

    After extraction the target directory is walked. Directories named in
    ``IGNORE_DIRS`` or starting with a dot are skipped, as are files starting
    with a dot and files that ``is_text_file`` rejects. Files that cannot be
    read are logged and skipped.

    Args:
        zip_path: Path to the uploaded ZIP file.
        extract_to: Directory to extract files to (created if missing).

    Returns:
        List[Document]: One document per accepted file. Metadata holds
        ``source`` (path relative to ``extract_to``), ``file_path`` and
        ``file_name``.

    Raises:
        ValueError: If ``zip_path`` is not a valid ZIP archive.
        Exception: Any other error is logged and re-raised unchanged.
    """
    documents = []

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(extract_to, exist_ok=True)

    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)

        logger.info("Extracted %s to %s", zip_path, extract_to)

        # Walk through the extracted files
        for root, dirs, files in os.walk(extract_to):
            # Modify dirs in-place so os.walk does not descend into ignored directories
            dirs[:] = [d for d in dirs if d not in IGNORE_DIRS and not d.startswith('.')]

            for file in files:
                if file.startswith('.'):
                    continue

                file_path = os.path.join(root, file)
                if not is_text_file(file_path):
                    continue

                try:
                    # errors='ignore' drops undecodable bytes beyond the sample
                    # that is_text_file() inspected.
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                except OSError as e:
                    logger.warning("Failed to read %s: %s", file_path, e)
                    continue

                # Create relative path for metadata
                rel_path = os.path.relpath(file_path, extract_to)
                documents.append(
                    Document(
                        page_content=content,
                        metadata={
                            "source": rel_path,
                            "file_path": file_path,
                            "file_name": file,
                        },
                    )
                )

        logger.info("Processed %d documents from %s", len(documents), zip_path)
        return documents

    except zipfile.BadZipFile as err:
        logger.error("Invalid ZIP file: %s", zip_path)
        # Chain the cause so the underlying zipfile error stays visible.
        raise ValueError("The provided file is not a valid ZIP archive.") from err
    except Exception as e:
        logger.error("Error processing ZIP: %s", e)
        # Bare raise preserves the original traceback.
        raise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
rate_limit_config.py
DELETED
|
@@ -1,63 +0,0 @@
|
|
| 1 |
-
# Rate Limit Configuration
# Tune these values to control API usage and keep the chat available.

# ============================================================================
# PROVIDER LIMITS (Free Tier Defaults)
# ============================================================================

# Gemini 2.0 Flash Experimental (Latest Model)
GEMINI_RPM = 15             # requests per minute
GEMINI_TPM = 1_000_000      # tokens per minute (1 million)
GEMINI_MIN_DELAY = 4.0      # minimum seconds between requests (60s / 15 RPM = 4s)
GEMINI_BURST_DELAY = 10.0   # seconds to wait when approaching the limit

# Groq Free Tier (delays increased to prevent rate limits)
GROQ_RPM = 30               # requests per minute
# NOTE(review): the name says tokens per minute, but the original comment
# called this a "conservative daily token estimate" — confirm the unit.
GROQ_TPM = 20_000
GROQ_MIN_DELAY = 8.0        # minimum seconds between requests (was 1s)
GROQ_BURST_DELAY = 20.0     # seconds to wait when approaching the limit (was 10s)

# ============================================================================
# OPTIMIZATION SETTINGS
# ============================================================================

# Response Caching
ENABLE_CACHE = True         # cache identical queries to save API calls
CACHE_TTL = 300             # cache lifetime in seconds (5 minutes)
MAX_CACHE_SIZE = 100        # maximum number of cached responses

# Adaptive Delays
USE_ADAPTIVE_DELAYS = True  # adjust delays dynamically based on usage
RATE_LIMIT_THRESHOLD = 0.7  # fraction of the limit (0.0-1.0) that triggers longer delays

# Context Optimization
MAX_AGENT_TOOL_RESULTS = 5          # search results per tool call
MAX_AGENT_CONTENT_LENGTH = 2_000    # characters per search result
MAX_LINEAR_DOCS = 8                 # documents used for linear RAG
MAX_LINEAR_CONTENT_LENGTH = 1_500   # characters per document

# ============================================================================
# ADVANCED SETTINGS
# ============================================================================

# Fallback Behavior
AUTO_FALLBACK_TO_LINEAR = True      # fall back to linear RAG on agent rate limits
MAX_AGENT_RETRIES = 2               # retries on rate limit errors

# Statistics & Monitoring
SHOW_USAGE_STATS = True             # display usage stats in the sidebar
LOG_RATE_LIMIT_WARNINGS = True      # log when approaching limits

# Token Budget (optional - set to 0 to disable)
# Requests stop once the daily token budget has been reached.
DAILY_TOKEN_BUDGET_GEMINI = 0       # 0 = unlimited (within API limits)
DAILY_TOKEN_BUDGET_GROQ = 0         # 0 = unlimited (within API limits)

# ============================================================================
# TIPS FOR MAXIMIZING USAGE
# ============================================================================
# 1. Set lower MIN_DELAY values for faster responses (but higher risk)
# 2. Enable CACHE to avoid repeat API calls
# 3. Reduce MAX_AGENT_TOOL_RESULTS if hitting rate limits frequently
# 4. Use linear RAG mode for simpler questions (faster, fewer API calls)
# 5. Switch providers if one is exhausted (Gemini <-> Groq)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
setup.py
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
# Work-around the fact that `pip install -e .` doesn't work with `pyproject.toml` files.
|
| 2 |
-
from setuptools import setup
|
| 3 |
-
|
| 4 |
-
setup(
    # Distribution name and version.
    name="code_chatbot",
    version="0.1.0",
    # Packages are listed explicitly; no automatic package discovery is used.
    packages=["code_chatbot", "api"],
    # Dependencies are declared without version constraints.
    install_requires=[
        "streamlit",
        "langchain",
        "chromadb",
        "networkx",
        "tree-sitter",
    ],
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|