Asish Karthikeya Gogineni committed on
Commit
7dec411
·
1 Parent(s): 039d022

chore: Remove unused files (rate_limit_config, setup.py, ingestor, cli)

Browse files
Files changed (4) hide show
  1. code_chatbot/cli.py +0 -298
  2. code_chatbot/ingestor.py +0 -103
  3. rate_limit_config.py +0 -63
  4. setup.py +0 -15
code_chatbot/cli.py DELETED
@@ -1,298 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- 🕷️ Code Crawler CLI
4
- Command-line interface for the Code Crawler engine.
5
- """
6
-
7
- import argparse
8
- import os
9
- import sys
10
- import logging
11
- import shutil
12
- import json
13
- from dotenv import load_dotenv
14
-
15
- # Rich Imports
16
- from rich.console import Console
17
- from rich.markdown import Markdown
18
- from rich.panel import Panel
19
- from rich.prompt import Prompt
20
- from rich.progress import Progress, SpinnerColumn, TextColumn
21
-
22
- # Local Imports
23
- from .indexer import Indexer
24
- from .rag import ChatEngine
25
- from .ast_analysis import ASTGraphBuilder
26
- from .graph_rag import GraphEnhancedRetriever
27
- from .universal_ingestor import process_source
28
- from .agent_workflow import create_agent_graph
29
-
30
- # Configure Console
31
- console = Console()
32
- logging.basicConfig(level=logging.ERROR)
33
- # Suppress noisy libraries
34
- logging.getLogger("httpx").setLevel(logging.WARNING)
35
- logging.getLogger("httpcore").setLevel(logging.WARNING)
36
- logging.getLogger("chromadb").setLevel(logging.ERROR)
37
- logging.getLogger("google_genai").setLevel(logging.ERROR)
38
- logging.getLogger("google.genai").setLevel(logging.ERROR)
39
- logging.getLogger("code_chatbot.chunker").setLevel(logging.ERROR)
40
-
41
- logger = logging.getLogger("CodeCrawlerCLI")
42
- logger.setLevel(logging.INFO)
43
-
44
- BANNER = """
45
- [bold cyan] 🕷️ Code Crawler CLI 🕷️[/bold cyan]
46
- [dim] Index. Chat. Understand.[/dim]
47
- """
48
-
49
- def setup_env():
50
- load_dotenv()
51
-
52
- def print_banner():
53
- console.print(Panel(BANNER, subtitle="v2.0", border_style="cyan"))
54
-
55
- def handle_index(args):
56
- """
57
- Handles the indexing command.
58
- """
59
- console.print(f"[bold blue][INFO][/bold blue] Starting indexing for source: [green]{args.source}[/green]")
60
-
61
- # 1. Setup Environment
62
- if args.provider == "gemini":
63
- api_key = os.getenv("GOOGLE_API_KEY")
64
- if not api_key:
65
- console.print("[bold red][ERROR][/bold red] GOOGLE_API_KEY not found in .env")
66
- sys.exit(1)
67
- embedding_provider = "gemini"
68
- embedding_api_key = api_key
69
- elif args.provider == "groq":
70
- api_key = os.getenv("GROQ_API_KEY")
71
- embedding_api_key = os.getenv("GOOGLE_API_KEY")
72
- if not api_key:
73
- console.print("[bold red][ERROR][/bold red] GROQ_API_KEY not found in .env")
74
- sys.exit(1)
75
- if not embedding_api_key:
76
- console.print("[bold red][ERROR][/bold red] GOOGLE_API_KEY (for embeddings) not found in .env")
77
- sys.exit(1)
78
- embedding_provider = "gemini"
79
- else:
80
- console.print(f"[bold red]Unknown provider:[/bold red] {args.provider}")
81
- sys.exit(1)
82
-
83
- try:
84
- # 2. Extract & Ingest
85
- extract_to = "data/extracted"
86
- # Optional: Clean previous data
87
- if args.clean and os.path.exists(extract_to):
88
- console.print("[bold yellow][WARN][/bold yellow] Cleaning previous data...")
89
- shutil.rmtree(extract_to)
90
-
91
- with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
92
- task = progress.add_task("Processing source...", total=None)
93
- documents, local_path = process_source(args.source, extract_to)
94
- progress.update(task, completed=True, description="[bold green]Source Processed[/bold green]")
95
-
96
- console.print(f"[bold green][SUCCESS][/bold green] Ingested {len(documents)} documents.")
97
-
98
- # Save metadata for Chat to find the path
99
- os.makedirs("data", exist_ok=True)
100
- with open("data/cli_meta.json", "w") as f:
101
- json.dump({"repo_path": local_path}, f)
102
-
103
- # 3. AST Analysis
104
- with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
105
- task = progress.add_task("Building AST Knowledge Graph...", total=None)
106
- ast_builder = ASTGraphBuilder()
107
- for doc in documents:
108
- # doc.metadata['file_path'] is absolute
109
- ast_builder.add_file(doc.metadata['file_path'], doc.page_content)
110
-
111
- # Web sources might not create the directory
112
- os.makedirs(local_path, exist_ok=True)
113
- graph_path = os.path.join(local_path, "ast_graph.graphml")
114
- ast_builder.save_graph(graph_path)
115
- progress.update(task, completed=True, description="[bold green]AST Graph Built[/bold green]")
116
-
117
- console.print(f"[bold green][SUCCESS][/bold green] AST Graph ready ({ast_builder.graph.number_of_nodes()} nodes).")
118
-
119
- # 4. Vector Indexing
120
- with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
121
- task = progress.add_task(f"Indexing into {args.vector_db}...", total=None)
122
- indexer = Indexer(
123
- provider=embedding_provider,
124
- api_key=embedding_api_key
125
- )
126
- # Clear old data if requested
127
- if args.clean:
128
- indexer.clear_collection()
129
-
130
- indexer.index_documents(documents, vector_db_type=args.vector_db)
131
- progress.update(task, completed=True, description=f"[bold green]Indexed into {args.vector_db}[/bold green]")
132
-
133
- console.print(f"[bold green][SUCCESS][/bold green] Indexing Complete! You can now run `code-crawler chat`.")
134
-
135
- except Exception as e:
136
- console.print(f"[bold red][ERROR][/bold red] Indexing failed: {e}")
137
- # import traceback
138
- # traceback.print_exc()
139
-
140
- def handle_chat(args):
141
- """
142
- Handles the chat command.
143
- """
144
- console.print(f"[bold blue][INFO][/bold blue] Initializing Chat Engine ({args.provider})...")
145
-
146
- # Setup Env & Keys
147
- if args.provider == "gemini":
148
- api_key = os.getenv("GOOGLE_API_KEY")
149
- embedding_api_key = api_key
150
- embedding_provider = "gemini"
151
- model_name = "gemini-2.5-flash"
152
- llm_provider_lib = "google_genai"
153
- elif args.provider == "groq":
154
- api_key = os.getenv("GROQ_API_KEY")
155
- embedding_api_key = os.getenv("GOOGLE_API_KEY")
156
- embedding_provider = "gemini"
157
- model_name = "llama-3.3-70b-versatile"
158
- llm_provider_lib = "groq"
159
-
160
- if not api_key:
161
- console.print("[bold red][ERROR][/bold red] API Keys missing. Check .env")
162
- sys.exit(1)
163
-
164
- try:
165
- # Load Resources
166
- meta_file = "data/cli_meta.json"
167
- if os.path.exists(meta_file):
168
- with open(meta_file, "r") as f:
169
- meta = json.load(f)
170
- local_path = meta.get("repo_path")
171
- else:
172
- # Fallback Heuristic
173
- extract_root = "data/extracted"
174
- if not os.path.exists(extract_root):
175
- console.print("[bold red][ERROR][/bold red] No index info found. Run 'code-crawler index' first.")
176
- sys.exit(1)
177
-
178
- subdirs = [f.path for f in os.scandir(extract_root) if f.is_dir()]
179
- if not subdirs:
180
- local_path = extract_root
181
- else:
182
- subdirs.sort(key=lambda x: os.path.getmtime(x), reverse=True)
183
- local_path = subdirs[0]
184
-
185
- if not local_path or not os.path.exists(local_path):
186
- console.print(f"[bold red][ERROR][/bold red] Codebase path not found: {local_path}")
187
- sys.exit(1)
188
-
189
- console.print(f"[dim]Using codebase at: {local_path}[/dim]")
190
-
191
- # Initialize Components
192
- with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
193
- task = progress.add_task("Loading resources...", total=None)
194
-
195
- indexer = Indexer(provider=embedding_provider, api_key=embedding_api_key)
196
- base_retriever = indexer.get_retriever(vector_db_type=args.vector_db)
197
-
198
- graph_retriever = GraphEnhancedRetriever(
199
- base_retriever=base_retriever,
200
- repo_dir=local_path
201
- )
202
-
203
- repo_files = []
204
- for root, _, files in os.walk(local_path):
205
- for file in files:
206
- repo_files.append(os.path.join(root, file))
207
-
208
- progress.update(task, completed=True, description="[bold green]Resources Loaded[/bold green]")
209
-
210
- # Initialize ChatEngine
211
- if args.agent:
212
- console.print("[bold purple]🤖 Agent Mode Enabled[/bold purple]")
213
-
214
- chat_engine = ChatEngine(
215
- retriever=graph_retriever,
216
- provider=args.provider,
217
- model_name=model_name,
218
- api_key=api_key,
219
- repo_files=repo_files,
220
- repo_name=os.path.basename(local_path),
221
- use_agent=args.agent,
222
- repo_dir=local_path
223
- )
224
-
225
- console.print("\n[bold green]Ready![/bold green] chat initialized. Type 'exit' to quit.\n")
226
-
227
- while True:
228
- try:
229
- query = Prompt.ask("[bold cyan]User[/bold cyan]")
230
- if query.strip().lower() in ['exit', 'quit', ':q']:
231
- break
232
-
233
- if not query.strip():
234
- continue
235
-
236
- console.print("[dim]🕷️ Thinking...[/dim]")
237
-
238
- # Unified Chat Call (Handles Agent & Standard + Fallback)
239
- response = chat_engine.chat(query)
240
-
241
- if isinstance(response, tuple):
242
- answer, sources = response
243
- else:
244
- answer = response
245
- sources = []
246
-
247
- # Render Response
248
- console.print(Panel(Markdown(answer), title="Spider", border_style="magenta", expand=False))
249
-
250
- if sources:
251
- console.print("[dim]Sources:[/dim]")
252
- seen = set()
253
- for s in sources:
254
- fp = s.get('file_path', 'unknown')
255
- if fp not in seen:
256
- console.print(f" - [underline]{os.path.basename(fp)}[/underline]")
257
- seen.add(fp)
258
- console.print("")
259
-
260
- except KeyboardInterrupt:
261
- break
262
- except Exception as e:
263
- console.print(f"[bold red][ERROR][/bold red] {e}")
264
-
265
- except Exception as e:
266
- console.print(f"[bold red][ERROR][/bold red] Chat failed to start: {e}")
267
- # import traceback
268
- # traceback.print_exc()
269
-
270
- def main():
271
- setup_env()
272
- print_banner()
273
-
274
- parser = argparse.ArgumentParser(description="Code Crawler CLI")
275
- subparsers = parser.add_subparsers(dest="command", required=True)
276
-
277
- # Index Command
278
- index_parser = subparsers.add_parser("index", help="Index a codebase (ZIP, URL, or Path)")
279
- index_parser.add_argument("--source", "-s", required=True, help="Path to ZIP, Folder, or GitHub URL")
280
- index_parser.add_argument("--provider", "-p", default="gemini", choices=["gemini", "groq"], help="LLM Provider")
281
- index_parser.add_argument("--vector-db", "-v", default="chroma", choices=["chroma", "faiss"], help="Vector Database")
282
- index_parser.add_argument("--clean", action="store_true", help="Clean previous index before running")
283
-
284
- # Chat Command
285
- chat_parser = subparsers.add_parser("chat", help="Chat with the indexed codebase")
286
- chat_parser.add_argument("--provider", "-p", default="gemini", choices=["gemini", "groq"], help="LLM Provider")
287
- chat_parser.add_argument("--vector-db", "-v", default="chroma", choices=["chroma", "faiss"], help="Vector Database type used during index")
288
- chat_parser.add_argument("--agent", "-a", action="store_true", help="Enable Agentic Reasoning (LangGraph)")
289
-
290
- args = parser.parse_args()
291
-
292
- if args.command == "index":
293
- handle_index(args)
294
- elif args.command == "chat":
295
- handle_chat(args)
296
-
297
- if __name__ == "__main__":
298
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code_chatbot/ingestor.py DELETED
@@ -1,103 +0,0 @@
1
- import os
2
- import zipfile
3
- import tempfile
4
- import shutil
5
- from typing import List, Optional
6
- from langchain_core.documents import Document
7
- import logging
8
-
9
- # Configure logging
10
- logging.basicConfig(level=logging.INFO)
11
- logger = logging.getLogger(__name__)
12
-
13
- # Extensions to ignore (binaries, images, etc.)
14
- IGNORE_EXTENSIONS = {
15
- '.pyc', '.git', '.github', '.idea', '.vscode', '.DS_Store',
16
- '.png', '.jpg', '.jpeg', '.gif', '.ico', '.svg',
17
- '.mp4', '.mov', '.mp3', '.wav',
18
- '.zip', '.tar', '.gz', '.pkl', '.bin', '.exe', '.dll', '.so', '.dylib',
19
- '.pdf', '.docx', '.xlsx', '.pptx'
20
- }
21
-
22
- # Directories to ignore
23
- IGNORE_DIRS = {
24
- '__pycache__', '.git', '.github', '.idea', '.vscode', 'node_modules', 'venv', '.venv', 'env', '.env', 'dist', 'build', 'target'
25
- }
26
-
27
- def is_text_file(file_path: str) -> bool:
28
- """Check if a file is likely a text file based on extension and content."""
29
- _, ext = os.path.splitext(file_path)
30
- if ext.lower() in IGNORE_EXTENSIONS:
31
- return False
32
-
33
- try:
34
- with open(file_path, 'r', encoding='utf-8') as f:
35
- f.read(1024)
36
- return True
37
- except UnicodeDecodeError:
38
- return False
39
- except Exception:
40
- return False
41
-
42
- def process_zip(zip_path: str, extract_to: str) -> List[Document]:
43
- """
44
- Extracts a ZIP file and returns a list of LangChain Documents.
45
-
46
- Args:
47
- zip_path: Path to the uploaded ZIP file.
48
- extract_to: Directory to extract files to.
49
-
50
- Returns:
51
- List[Document]: List of documents with content and metadata.
52
- """
53
- documents = []
54
-
55
- if not os.path.exists(extract_to):
56
- os.makedirs(extract_to)
57
-
58
- try:
59
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
60
- zip_ref.extractall(extract_to)
61
-
62
- logger.info(f"Extracted {zip_path} to {extract_to}")
63
-
64
- # Walk through the extracted files
65
- for root, dirs, files in os.walk(extract_to):
66
- # Modify dirs in-place to skip ignored directories
67
- dirs[:] = [d for d in dirs if d not in IGNORE_DIRS and not d.startswith('.')]
68
-
69
- for file in files:
70
- if file.startswith('.'):
71
- continue
72
-
73
- file_path = os.path.join(root, file)
74
-
75
- if is_text_file(file_path):
76
- try:
77
- with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
78
- content = f.read()
79
-
80
- # Create relative path for metadata
81
- rel_path = os.path.relpath(file_path, extract_to)
82
-
83
- doc = Document(
84
- page_content=content,
85
- metadata={
86
- "source": rel_path,
87
- "file_path": file_path,
88
- "file_name": file
89
- }
90
- )
91
- documents.append(doc)
92
- except Exception as e:
93
- logger.warning(f"Failed to read {file_path}: {e}")
94
-
95
- logger.info(f"Processed {len(documents)} documents from {zip_path}")
96
- return documents
97
-
98
- except zipfile.BadZipFile:
99
- logger.error(f"Invalid ZIP file: {zip_path}")
100
- raise ValueError("The provided file is not a valid ZIP archive.")
101
- except Exception as e:
102
- logger.error(f"Error processing ZIP: {e}")
103
- raise e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
rate_limit_config.py DELETED
@@ -1,63 +0,0 @@
1
- # Rate Limit Configuration
2
- # Customize these settings to control API usage and maximize chat availability
3
-
4
- # ============================================================================
5
- # PROVIDER LIMITS (Free Tier Defaults)
6
- # ============================================================================
7
-
8
- # Gemini 2.0 Flash Experimental (Latest Model)
9
- GEMINI_RPM = 15 # Requests per minute
10
- GEMINI_TPM = 1000000 # Tokens per minute (1 million)
11
- GEMINI_MIN_DELAY = 4.0 # Minimum seconds between requests (60s / 15 RPM = 4s)
12
- GEMINI_BURST_DELAY = 10.0 # Delay when approaching limit
13
-
14
- # Groq Free Tier (Increased delays to prevent rate limits)
15
- GROQ_RPM = 30 # Requests per minute
16
- GROQ_TPM = 20000 # Conservative daily token estimate
17
- GROQ_MIN_DELAY = 8.0 # Minimum 8 seconds between requests (was 1s)
18
- GROQ_BURST_DELAY = 20.0 # Delay when approaching limit (was 10s)
19
-
20
- # ============================================================================
21
- # OPTIMIZATION SETTINGS
22
- # ============================================================================
23
-
24
- # Response Caching
25
- ENABLE_CACHE = True # Cache identical queries to save API calls
26
- CACHE_TTL = 300 # Cache lifetime in seconds (5 minutes)
27
- MAX_CACHE_SIZE = 100 # Maximum number of cached responses
28
-
29
- # Adaptive Delays
30
- USE_ADAPTIVE_DELAYS = True # Dynamically adjust delays based on usage
31
- RATE_LIMIT_THRESHOLD = 0.7 # Trigger longer delays at 70% of limit (0.0-1.0)
32
-
33
- # Context Optimization
34
- MAX_AGENT_TOOL_RESULTS = 5 # Number of search results per tool call
35
- MAX_AGENT_CONTENT_LENGTH = 2000 # Characters per search result
36
- MAX_LINEAR_DOCS = 8 # Number of documents for linear RAG
37
- MAX_LINEAR_CONTENT_LENGTH = 1500 # Characters per document
38
-
39
- # ============================================================================
40
- # ADVANCED SETTINGS
41
- # ============================================================================
42
-
43
- # Fallback Behavior
44
- AUTO_FALLBACK_TO_LINEAR = True # Fall back to linear RAG on agent rate limits
45
- MAX_AGENT_RETRIES = 2 # Number of retries on rate limit errors
46
-
47
- # Statistics & Monitoring
48
- SHOW_USAGE_STATS = True # Display usage stats in sidebar
49
- LOG_RATE_LIMIT_WARNINGS = True # Log when approaching limits
50
-
51
- # Token Budget (Optional - set to 0 to disable)
52
- # Stop making requests after hitting daily token budget
53
- DAILY_TOKEN_BUDGET_GEMINI = 0 # 0 = unlimited (within API limits)
54
- DAILY_TOKEN_BUDGET_GROQ = 0 # 0 = unlimited (within API limits)
55
-
56
- # ============================================================================
57
- # TIPS FOR MAXIMIZING USAGE
58
- # ============================================================================
59
- # 1. Set lower MIN_DELAY values for faster responses (but higher risk)
60
- # 2. Enable CACHE to avoid repeat API calls
61
- # 3. Reduce MAX_AGENT_TOOL_RESULTS if hitting rate limits frequently
62
- # 4. Use linear RAG mode for simpler questions (faster, fewer API calls)
63
- # 5. Switch providers if one is exhausted (Gemini <-> Groq)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
setup.py DELETED
@@ -1,15 +0,0 @@
1
- # Work-around the fact that `pip install -e .` doesn't work with `pyproject.toml` files.
2
- from setuptools import setup
3
-
4
- setup(
5
- name="code_chatbot",
6
- version="0.1.0",
7
- packages=["code_chatbot", "api"],
8
- install_requires=[
9
- "streamlit",
10
- "langchain",
11
- "chromadb",
12
- "networkx",
13
- "tree-sitter",
14
- ],
15
- )