Asish Karthikeya Gogineni committed on
Commit
511ccc3
·
1 Parent(s): 5f1e9e9

feat: Add Groq-optimized prompts, vector DB fallback, improved ignore patterns

Browse files

- Added Groq-specific prompts with explicit step-by-step instructions
- Implemented vector DB fallback logic (Chroma -> FAISS)
- Added package-lock.json and other lock files to ignore patterns
- Fixed universal_ingestor.py to skip lock/config files during indexing
- Enhanced code generation prompt for better output quality
- Wired get_prompt_for_provider() for automatic prompt selection

code_chatbot/config.py CHANGED
@@ -80,7 +80,13 @@ class IndexingConfig:
80
 
81
  ignore_patterns: List[str] = field(default_factory=lambda: [
82
  '*.pyc', '__pycache__/*', '.git/*', 'node_modules/*',
83
- '.venv/*', 'venv/*', '*.egg-info/*', 'dist/*', 'build/*'
 
 
 
 
 
 
84
  ])
85
  """File patterns to ignore during indexing"""
86
 
 
80
 
81
  ignore_patterns: List[str] = field(default_factory=lambda: [
82
  '*.pyc', '__pycache__/*', '.git/*', 'node_modules/*',
83
+ '.venv/*', 'venv/*', '*.egg-info/*', 'dist/*', 'build/*',
84
+ # Non-code files that pollute search results
85
+ 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
86
+ '*.lock', '*.log', '*.sqlite3', '*.db',
87
+ '*.min.js', '*.min.css', '*.map',
88
+ '.env*', '*.pem', '*.key',
89
+ 'coverage/*', '.coverage', '.nyc_output/*'
90
  ])
91
  """File patterns to ignore during indexing"""
92
 
code_chatbot/indexer.py CHANGED
@@ -13,6 +13,39 @@ import logging
13
 
14
  logger = logging.getLogger(__name__)
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  # Global ChromaDB client cache to avoid "different settings" error
17
  _chroma_clients = {}
18
 
@@ -23,7 +56,13 @@ def reset_chroma_clients():
23
  logger.info("Reset ChromaDB client cache")
24
 
25
  def get_chroma_client(persist_directory: str):
26
- """Get or create a shared ChromaDB client for a given path."""
 
 
 
 
 
 
27
  global _chroma_clients
28
 
29
  # Ensure directory exists
@@ -33,28 +72,64 @@ def get_chroma_client(persist_directory: str):
33
  import chromadb
34
  from chromadb.config import Settings
35
 
36
- try:
37
- _chroma_clients[persist_directory] = chromadb.PersistentClient(
 
38
  path=persist_directory,
39
  settings=Settings(
40
  anonymized_telemetry=False,
41
  allow_reset=True
42
  )
43
  )
44
- except Exception as e:
45
- logger.error(f"Failed to create ChromaDB client: {e}")
46
- # Try to reset and create fresh
 
47
  import shutil
48
  if os.path.exists(persist_directory):
49
  shutil.rmtree(persist_directory)
50
  os.makedirs(persist_directory, exist_ok=True)
51
- _chroma_clients[persist_directory] = chromadb.PersistentClient(
52
- path=persist_directory,
53
- settings=Settings(
54
- anonymized_telemetry=False,
55
- allow_reset=True
56
- )
57
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  return _chroma_clients[persist_directory]
60
 
@@ -155,23 +230,44 @@ class Indexer:
155
 
156
  all_chunks = filter_complex_metadata(all_chunks)
157
 
158
- if vector_db_type == "chroma":
159
- # Use shared client to avoid "different settings" error
160
- chroma_client = get_chroma_client(self.persist_directory)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
- vectordb = Chroma(
163
- client=chroma_client,
164
- embedding_function=self.embedding_function,
165
- collection_name=collection_name
166
- )
167
- elif vector_db_type == "faiss":
168
- from langchain_community.vectorstores import FAISS
169
- # FAISS is in-memory by default, we'll save it to disk later
170
- vectordb = None # We build it in the loop
171
- elif vector_db_type == "qdrant":
172
- vectordb = None # Built in bulk later
173
- else:
174
- raise ValueError(f"Unsupported Vector DB: {vector_db_type}")
175
 
176
  # Batch processing - smaller batches to avoid rate limits
177
  batch_size = 20 # Reduced for free tier rate limits
@@ -183,11 +279,14 @@ class Indexer:
183
  import time
184
 
185
  # FAISS handles batching poorly if we want to save incrementally, so we build a list first for FAISS or use from_documents
186
- if vector_db_type == "faiss":
187
  from langchain_community.vectorstores import FAISS
188
  # For FAISS, it's faster to just do it all at once or in big batches
 
189
  vectordb = FAISS.from_documents(all_chunks, self.embedding_function)
190
  vectordb.save_local(folder_path=self.persist_directory, index_name=collection_name)
 
 
191
  return vectordb
192
 
193
  elif vector_db_type == "qdrant":
@@ -240,9 +339,85 @@ class Indexer:
240
  return vectordb
241
 
242
  def get_retriever(self, collection_name: str = "codebase", k: int = 10, vector_db_type: str = "chroma"):
243
- """Get a retriever for the specified collection. Default k=10 for comprehensive results."""
 
 
 
 
 
 
 
 
 
 
 
 
244
  logger.info(f"Creating retriever for collection '{collection_name}' from {self.persist_directory}")
245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  if vector_db_type == "chroma":
247
  # Use shared client to avoid "different settings" error
248
  chroma_client = get_chroma_client(self.persist_directory)
@@ -254,49 +429,105 @@ class Indexer:
254
  embedding_function=self.embedding_function,
255
  )
256
 
257
- # Log collection info
258
  try:
259
  collection = vector_store._collection
260
  count = collection.count()
261
  logger.info(f"Collection '{collection_name}' has {count} documents")
 
 
 
 
262
  except Exception as e:
263
- logger.warning(f"Could not get collection count: {e}")
 
 
 
264
 
265
  elif vector_db_type == "faiss":
266
  from langchain_community.vectorstores import FAISS
267
- try:
268
- vector_store = FAISS.load_local(
269
- folder_path=self.persist_directory,
270
- embeddings=self.embedding_function,
271
- index_name=collection_name,
272
- allow_dangerous_deserialization=True # Codebase trust assumed for local use
273
- )
274
- logger.info(f"Loaded FAISS index from {self.persist_directory}")
275
- except Exception as e:
276
- logger.error(f"Failed to load FAISS index: {e}")
277
- # Create empty store if failed? Or raise?
278
- raise e
 
 
 
 
 
 
 
 
 
 
 
 
279
  elif vector_db_type == "qdrant":
280
- from langchain_qdrant import QdrantVectorStore
281
-
282
- url = os.getenv("QDRANT_URL")
283
- api_key = os.getenv("QDRANT_API_KEY")
284
-
285
- vector_store = QdrantVectorStore(
286
- client=None, # It will create one from url/api_key
287
- collection_name=collection_name,
288
- embedding=self.embedding_function,
289
- url=url,
290
- api_key=api_key,
291
- )
292
- logger.info(f"Connected to Qdrant at {url}")
293
-
 
294
  else:
295
- raise ValueError(f"Unsupported Vector DB: {vector_db_type}")
 
 
 
 
 
 
 
 
 
 
 
 
 
296
 
297
- retriever = vector_store.as_retriever(search_kwargs={"k": k})
298
- logger.info(f"Retriever created with k={k}")
299
- return retriever
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
 
301
  # Add incremental indexing methods to the Indexer class
302
  from code_chatbot.incremental_indexing import add_incremental_indexing_methods
 
13
 
14
  logger = logging.getLogger(__name__)
15
 
16
# Vector database fallback priority order
# When primary DB fails, automatically try the next in list
VECTOR_DB_FALLBACK_ORDER = ["chroma", "faiss"]

# Track which vector DB is currently active (for automatic fallback)
_active_vector_db = {"type": "chroma", "fallback_count": 0}


def get_active_vector_db() -> str:
    """Return the vector database type that is currently in use."""
    return _active_vector_db["type"]


def set_active_vector_db(db_type: str):
    """Record *db_type* as the active vector database."""
    _active_vector_db["type"] = db_type
    logger.info(f"Active vector database set to: {db_type}")


def get_next_fallback_db(current_db: str) -> Optional[str]:
    """Return the vector DB that follows *current_db* in the fallback chain.

    Args:
        current_db: Vector database type that just failed.

    Returns:
        The next vector database type to try, or ``None`` when the chain is
        exhausted or *current_db* is not part of the chain at all.
    """
    if current_db not in VECTOR_DB_FALLBACK_ORDER:
        return None
    position = VECTOR_DB_FALLBACK_ORDER.index(current_db)
    remaining = VECTOR_DB_FALLBACK_ORDER[position + 1:]
    return remaining[0] if remaining else None
48
+
49
  # Global ChromaDB client cache to avoid "different settings" error
50
  _chroma_clients = {}
51
 
 
56
  logger.info("Reset ChromaDB client cache")
57
 
58
  def get_chroma_client(persist_directory: str):
59
+ """Get or create a shared ChromaDB client for a given path.
60
+
61
+ Includes automatic recovery for common ChromaDB errors:
62
+ - tenant default_tenant connection errors
63
+ - Database corruption
64
+ - Version mismatch issues
65
+ """
66
  global _chroma_clients
67
 
68
  # Ensure directory exists
 
72
  import chromadb
73
  from chromadb.config import Settings
74
 
75
+ def create_client():
76
+ """Helper to create a new ChromaDB client."""
77
+ return chromadb.PersistentClient(
78
  path=persist_directory,
79
  settings=Settings(
80
  anonymized_telemetry=False,
81
  allow_reset=True
82
  )
83
  )
84
+
85
+ def clear_and_recreate():
86
+ """Clear corrupted database and create fresh client."""
87
+ logger.warning(f"Clearing corrupted ChromaDB at {persist_directory} and recreating...")
88
  import shutil
89
  if os.path.exists(persist_directory):
90
  shutil.rmtree(persist_directory)
91
  os.makedirs(persist_directory, exist_ok=True)
92
+ return create_client()
93
+
94
+ def is_corruption_error(error: Exception) -> bool:
95
+ """Check if error indicates database corruption."""
96
+ error_str = str(error).lower()
97
+ corruption_indicators = [
98
+ 'tenant', # "Could not connect to tenant default_tenant"
99
+ 'default_tenant',
100
+ 'sqlite', # SQLite database issues
101
+ 'database',
102
+ 'corrupt',
103
+ 'no such table',
104
+ 'disk i/o error',
105
+ 'malformed',
106
+ 'locked',
107
+ ]
108
+ return any(indicator in error_str for indicator in corruption_indicators)
109
+
110
+ try:
111
+ _chroma_clients[persist_directory] = create_client()
112
+ # Verify the client works by attempting a simple operation
113
+ try:
114
+ _chroma_clients[persist_directory].heartbeat()
115
+ except Exception as verify_error:
116
+ if is_corruption_error(verify_error):
117
+ logger.error(f"ChromaDB verification failed: {verify_error}")
118
+ del _chroma_clients[persist_directory]
119
+ _chroma_clients[persist_directory] = clear_and_recreate()
120
+ else:
121
+ raise
122
+ except Exception as e:
123
+ logger.error(f"Failed to create ChromaDB client: {e}")
124
+ if is_corruption_error(e):
125
+ _chroma_clients[persist_directory] = clear_and_recreate()
126
+ else:
127
+ # For non-corruption errors, still try to recover
128
+ try:
129
+ _chroma_clients[persist_directory] = clear_and_recreate()
130
+ except Exception as recovery_error:
131
+ logger.error(f"Recovery also failed: {recovery_error}")
132
+ raise recovery_error
133
 
134
  return _chroma_clients[persist_directory]
135
 
 
230
 
231
  all_chunks = filter_complex_metadata(all_chunks)
232
 
233
+ # Attempt indexing with fallback support
234
+ attempted_db = vector_db_type
235
+ fallback_triggered = False
236
+
237
+ try:
238
+ if vector_db_type == "chroma":
239
+ # Use shared client to avoid "different settings" error
240
+ chroma_client = get_chroma_client(self.persist_directory)
241
+
242
+ vectordb = Chroma(
243
+ client=chroma_client,
244
+ embedding_function=self.embedding_function,
245
+ collection_name=collection_name
246
+ )
247
+ elif vector_db_type == "faiss":
248
+ from langchain_community.vectorstores import FAISS
249
+ # FAISS is in-memory by default, we'll save it to disk later
250
+ vectordb = None # We build it in the loop
251
+ elif vector_db_type == "qdrant":
252
+ vectordb = None # Built in bulk later
253
+ else:
254
+ raise ValueError(f"Unsupported Vector DB: {vector_db_type}")
255
+ except Exception as e:
256
+ error_str = str(e).lower()
257
+ is_chroma_error = any(indicator in error_str for indicator in [
258
+ 'tenant', 'default_tenant', 'sqlite', 'corrupt',
259
+ 'no such table', 'locked', 'database'
260
+ ])
261
 
262
+ if is_chroma_error and vector_db_type == "chroma":
263
+ logger.warning(f"Chroma indexing failed: {e}. Falling back to FAISS...")
264
+ fallback_triggered = True
265
+ attempted_db = "faiss"
266
+ # Clear the corrupted chroma first
267
+ reset_chroma_clients()
268
+ vectordb = None # Will use FAISS path
269
+ else:
270
+ raise
 
 
 
 
271
 
272
  # Batch processing - smaller batches to avoid rate limits
273
  batch_size = 20 # Reduced for free tier rate limits
 
279
  import time
280
 
281
  # FAISS handles batching poorly if we want to save incrementally, so we build a list first for FAISS or use from_documents
282
+ if vector_db_type == "faiss" or (fallback_triggered and attempted_db == "faiss"):
283
  from langchain_community.vectorstores import FAISS
284
  # For FAISS, it's faster to just do it all at once or in big batches
285
+ logger.info(f"Indexing with FAISS (fallback={fallback_triggered})...")
286
  vectordb = FAISS.from_documents(all_chunks, self.embedding_function)
287
  vectordb.save_local(folder_path=self.persist_directory, index_name=collection_name)
288
+ set_active_vector_db("faiss")
289
+ logger.info(f"Saved FAISS index to {self.persist_directory}/{collection_name}")
290
  return vectordb
291
 
292
  elif vector_db_type == "qdrant":
 
339
  return vectordb
340
 
341
  def get_retriever(self, collection_name: str = "codebase", k: int = 10, vector_db_type: str = "chroma"):
342
+ """Get a retriever for the specified collection with automatic fallback.
343
+
344
+ When the primary vector database fails, automatically attempts the next
345
+ database in the fallback order (chroma -> faiss).
346
+
347
+ Args:
348
+ collection_name: Name of the collection to retrieve from
349
+ k: Number of results to return (default 10)
350
+ vector_db_type: Primary vector database type to try
351
+
352
+ Returns:
353
+ Configured retriever with fallback protection
354
+ """
355
  logger.info(f"Creating retriever for collection '{collection_name}' from {self.persist_directory}")
356
 
357
+ # Track attempts for fallback
358
+ attempted_dbs = []
359
+ last_error = None
360
+ current_db = vector_db_type
361
+
362
+ while current_db and current_db not in attempted_dbs:
363
+ attempted_dbs.append(current_db)
364
+
365
+ try:
366
+ vector_store = self._create_vector_store(current_db, collection_name)
367
+
368
+ if vector_store:
369
+ # Success! Update active DB and return retriever
370
+ set_active_vector_db(current_db)
371
+ retriever = vector_store.as_retriever(search_kwargs={"k": k})
372
+ logger.info(f"Retriever created with k={k} using {current_db}")
373
+ return retriever
374
+
375
+ except Exception as e:
376
+ last_error = e
377
+ error_str = str(e).lower()
378
+
379
+ # Check if this is a recoverable error that warrants fallback
380
+ is_chroma_error = any(indicator in error_str for indicator in [
381
+ 'tenant', 'default_tenant', 'sqlite', 'corrupt',
382
+ 'no such table', 'locked', 'database'
383
+ ])
384
+
385
+ if is_chroma_error or 'chroma' in error_str:
386
+ logger.warning(f"Vector DB '{current_db}' failed: {e}")
387
+
388
+ # Try next fallback
389
+ next_db = get_next_fallback_db(current_db)
390
+ if next_db:
391
+ logger.info(f"Attempting fallback to '{next_db}'...")
392
+ current_db = next_db
393
+ continue
394
+
395
+ # Non-recoverable error
396
+ logger.error(f"Vector DB '{current_db}' failed with non-recoverable error: {e}")
397
+ break
398
+
399
+ # All fallbacks exhausted
400
+ if last_error:
401
+ raise RuntimeError(
402
+ f"All vector database options failed. Attempted: {attempted_dbs}. "
403
+ f"Last error: {last_error}"
404
+ )
405
+ else:
406
+ raise ValueError(f"No valid vector database available. Attempted: {attempted_dbs}")
407
+
408
+ def _create_vector_store(self, vector_db_type: str, collection_name: str):
409
+ """Create a vector store instance for the given database type.
410
+
411
+ Args:
412
+ vector_db_type: Type of vector database (chroma, faiss, qdrant)
413
+ collection_name: Name of the collection
414
+
415
+ Returns:
416
+ Vector store instance
417
+
418
+ Raises:
419
+ Exception: If vector store creation fails
420
+ """
421
  if vector_db_type == "chroma":
422
  # Use shared client to avoid "different settings" error
423
  chroma_client = get_chroma_client(self.persist_directory)
 
429
  embedding_function=self.embedding_function,
430
  )
431
 
432
+ # Verify the store works by getting count
433
  try:
434
  collection = vector_store._collection
435
  count = collection.count()
436
  logger.info(f"Collection '{collection_name}' has {count} documents")
437
+
438
+ if count == 0:
439
+ logger.warning(f"Chroma collection '{collection_name}' is empty!")
440
+
441
  except Exception as e:
442
+ # Re-raise to trigger fallback
443
+ raise RuntimeError(f"Chroma verification failed: {e}")
444
+
445
+ return vector_store
446
 
447
  elif vector_db_type == "faiss":
448
  from langchain_community.vectorstores import FAISS
449
+
450
+ faiss_index_path = os.path.join(self.persist_directory, f"{collection_name}.faiss")
451
+ faiss_pkl_path = os.path.join(self.persist_directory, f"{collection_name}.pkl")
452
+
453
+ # Check if FAISS index exists
454
+ if not os.path.exists(faiss_index_path) and not os.path.exists(faiss_pkl_path):
455
+ # Try default naming convention
456
+ faiss_index_path = os.path.join(self.persist_directory, "index.faiss")
457
+ faiss_pkl_path = os.path.join(self.persist_directory, "index.pkl")
458
+
459
+ if not os.path.exists(faiss_index_path):
460
+ logger.warning(f"No FAISS index found at {self.persist_directory}, will need to re-index")
461
+ # We could trigger re-indexing here or raise to try next fallback
462
+ raise FileNotFoundError(f"FAISS index not found at {self.persist_directory}")
463
+
464
+ vector_store = FAISS.load_local(
465
+ folder_path=self.persist_directory,
466
+ embeddings=self.embedding_function,
467
+ index_name=collection_name,
468
+ allow_dangerous_deserialization=True
469
+ )
470
+ logger.info(f"Loaded FAISS index from {self.persist_directory}")
471
+ return vector_store
472
+
473
  elif vector_db_type == "qdrant":
474
+ from langchain_qdrant import QdrantVectorStore
475
+
476
+ url = os.getenv("QDRANT_URL")
477
+ api_key = os.getenv("QDRANT_API_KEY")
478
+
479
+ vector_store = QdrantVectorStore(
480
+ client=None,
481
+ collection_name=collection_name,
482
+ embedding=self.embedding_function,
483
+ url=url,
484
+ api_key=api_key,
485
+ )
486
+ logger.info(f"Connected to Qdrant at {url}")
487
+ return vector_store
488
+
489
  else:
490
+ raise ValueError(f"Unsupported Vector DB: {vector_db_type}")
491
+
492
+ def get_retriever_with_reindex_fallback(
493
+ self,
494
+ documents: List[Document] = None,
495
+ collection_name: str = "codebase",
496
+ k: int = 10,
497
+ vector_db_type: str = "chroma"
498
+ ):
499
+ """Get retriever with automatic re-indexing fallback.
500
+
501
+ If the primary vector DB fails and fallback also fails to load,
502
+ this method will automatically re-index the documents using
503
+ the fallback database.
504
 
505
+ Args:
506
+ documents: Documents to re-index if needed (optional)
507
+ collection_name: Collection name
508
+ k: Number of results
509
+ vector_db_type: Primary DB type
510
+
511
+ Returns:
512
+ Configured retriever
513
+ """
514
+ try:
515
+ return self.get_retriever(collection_name, k, vector_db_type)
516
+ except (RuntimeError, FileNotFoundError) as e:
517
+ if documents:
518
+ logger.warning(f"Retriever creation failed, attempting re-index with fallback DB: {e}")
519
+
520
+ # Get fallback DB
521
+ fallback_db = get_next_fallback_db(vector_db_type) or "faiss"
522
+
523
+ # Re-index with fallback
524
+ logger.info(f"Re-indexing {len(documents)} documents with {fallback_db}...")
525
+ self.index_documents(documents, collection_name, fallback_db)
526
+
527
+ # Try getting retriever again
528
+ return self.get_retriever(collection_name, k, fallback_db)
529
+ else:
530
+ raise
531
 
532
  # Add incremental indexing methods to the Indexer class
533
  from code_chatbot.incremental_indexing import add_incremental_indexing_methods
code_chatbot/prompts.py CHANGED
@@ -338,4 +338,268 @@ ARCHITECTURE_EXPLANATION_PROMPT = """Explain the architecture and design pattern
338
  5. **Diagram** (text-based): Visual representation of component relationships
339
 
340
  Format with clear sections and reference specific files.
341
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  5. **Diagram** (text-based): Visual representation of component relationships
339
 
340
  Format with clear sections and reference specific files.
341
+ """
342
+
343
+ # =============================================================================
344
+ # GROQ-OPTIMIZED PROMPTS (For Llama and smaller models)
345
+ # =============================================================================
346
+ # These prompts are specifically designed for smaller LLMs that need:
347
+ # - More explicit, step-by-step instructions
348
+ # - Clearer output format specifications
349
+ # - More examples and constraints
350
+ # - Simpler language and shorter sections
351
+
352
+ GROQ_SYSTEM_PROMPT_AGENT = """You are a code assistant for the repository: {repo_name}.
353
+
354
+ YOUR JOB: Help developers understand their codebase by searching code and explaining it clearly.
355
+
356
+ AVAILABLE TOOLS:
357
+ 1. search_codebase(query) - Search for code. USE THIS FIRST for any question.
358
+ 2. read_file(file_path) - Read a complete file for more context.
359
+ 3. list_files(directory) - See what files exist in a folder.
360
+ 4. find_callers(function_name) - Who calls this function?
361
+ 5. find_callees(function_name) - What does this function call?
362
+
363
+ RULES (FOLLOW EXACTLY):
364
+ 1. ALWAYS search first before answering
365
+ 2. ALWAYS cite file paths in your answer
366
+ 3. ALWAYS show code snippets from the codebase
367
+ 4. NEVER make up code - only use what you find
368
+ 5. Keep answers focused and under 500 words unless asked for more
369
+
370
+ HOW TO ANSWER:
371
+
372
+ Step 1: Read the user's question carefully
373
+ Step 2: Use search_codebase with relevant keywords
374
+ Step 3: If needed, use read_file to get full file content
375
+ Step 4: Write your answer following this format:
376
+
377
+ ## Answer
378
+ [2-3 sentences directly answering the question]
379
+
380
+ ## Code Location
381
+ File: `path/to/file.py`
382
+ Lines: X-Y
383
+
384
+ ## Code
385
+ ```python
386
+ [Actual code from the codebase]
387
+ ```
388
+
389
+ ## Explanation
390
+ [Point-by-point explanation of how the code works]
391
+
392
+ EXAMPLE GOOD ANSWER:
393
+ User asks: "How does login work?"
394
+
395
+ ## Answer
396
+ Login is handled by the `authenticate()` function in `src/auth.py`. It validates the username/password and creates a session token.
397
+
398
+ ## Code Location
399
+ File: `src/auth.py`
400
+ Lines: 45-67
401
+
402
+ ## Code
403
+ ```python
404
+ def authenticate(username, password):
405
+ user = db.get_user(username)
406
+ if user and check_password(password, user.hash):
407
+ return create_token(user.id)
408
+ return None
409
+ ```
410
+
411
+ ## Explanation
412
+ 1. Gets user from database by username
413
+ 2. Checks if password matches stored hash
414
+ 3. If valid, creates and returns JWT token
415
+ 4. If invalid, returns None
416
+
417
+ REMEMBER: Short, clear, accurate answers with real code from the codebase.
418
+ """
419
+
420
+ GROQ_SYSTEM_PROMPT_LINEAR_RAG = """You are a code expert answering questions about: {repo_name}
421
+
422
+ I will give you code snippets from the codebase. Use ONLY these snippets to answer.
423
+
424
+ IMPORTANT - FOCUS ON SOURCE CODE:
425
+ - PRIORITIZE files ending in: .py, .js, .ts, .jsx, .tsx, .java, .go, .rs
426
+ - IGNORE config files like: package-lock.json, yarn.lock, *.json (unless specifically asked)
427
+ - IGNORE: node_modules, .git, __pycache__, dist, build folders
428
+ - Focus on: functions, classes, API endpoints, business logic
429
+
430
+ YOUR TASK:
431
+ 1. Read the code snippets below carefully
432
+ 2. Focus on ACTUAL SOURCE CODE files, not config/lock files
433
+ 3. Find functions, classes, and logic that answer the question
434
+ 4. Write a clear, organized answer
435
+
436
+ RULES:
437
+ - ONLY use information from the provided code snippets
438
+ - ALWAYS include file paths: `path/to/file.py`
439
+ - ALWAYS show relevant code with ```python or ```javascript blocks
440
+ - NEVER guess or make up code that isn't shown
441
+ - If you only see config files (package.json, etc.), say "The search didn't return relevant source code. Please ask about specific functions or features."
442
+ - If the snippets don't answer the question, say "The provided code doesn't contain information about [topic]"
443
+
444
+ CODE SNIPPETS FROM CODEBASE:
445
+ {context}
446
+
447
+ ---
448
+
449
+ ANSWER FORMAT:
450
+
451
+ ## Summary
452
+ [1-2 sentences answering the question directly based on SOURCE CODE, not config files]
453
+
454
+ ## Implementation Details
455
+ [Explain the ACTUAL CODE logic - functions, classes, how they work]
456
+
457
+ ## Relevant Code
458
+ ```python
459
+ # From: path/to/source_file.py (NOT config files)
460
+ [paste the actual function/class code]
461
+ ```
462
+
463
+ ## How It Works
464
+ 1. [First step of the logic]
465
+ 2. [Second step]
466
+ 3. [Third step]
467
+
468
+ Keep your answer under 400 words. Focus on source code, not configurations.
469
+ """
470
+
471
+ GROQ_QUERY_EXPANSION_PROMPT = """Turn this question into 3 search queries for a code search engine.
472
+
473
+ Question: {question}
474
+
475
+ Rules:
476
+ - Make queries short (2-5 words each)
477
+ - Include function/class names if mentioned
478
+ - Mix technical terms and simple descriptions
479
+
480
+ Output exactly 3 queries, one per line:
481
+ """
482
+
483
+ GROQ_ANSWER_SYNTHESIS_PROMPT = """Combine these code search results into one clear answer.
484
+
485
+ USER QUESTION: {question}
486
+
487
+ SEARCH RESULTS:
488
+ {retrieved_context}
489
+
490
+ INSTRUCTIONS:
491
+ 1. Read all the search results
492
+ 2. Find the most relevant code for the question
493
+ 3. Write ONE unified answer
494
+
495
+ FORMAT YOUR ANSWER EXACTLY LIKE THIS:
496
+
497
+ ## Direct Answer
498
+ [2-3 sentences answering the question]
499
+
500
+ ## Key Files
501
+ - `file1.py` - [what it does]
502
+ - `file2.py` - [what it does]
503
+
504
+ ## Main Code
505
+ ```python
506
+ [most relevant code snippet]
507
+ ```
508
+
509
+ ## How It Works
510
+ 1. [Step 1]
511
+ 2. [Step 2]
512
+ 3. [Step 3]
513
+
514
+ RULES:
515
+ - Keep answer under 300 words
516
+ - Only use code from the search results
517
+ - Be specific about file names and line numbers
518
+ """
519
+
520
+ GROQ_CODE_MODIFICATION_PROMPT = """You need to suggest code changes for: {repo_name}
521
+
522
+ USER REQUEST: {user_request}
523
+
524
+ EXISTING CODE:
525
+ {existing_code}
526
+
527
+ INSTRUCTIONS:
528
+ 1. Look at the existing code style
529
+ 2. Write new code that matches the style
530
+ 3. Explain where to put the new code
531
+
532
+ OUTPUT FORMAT:
533
+
534
+ ## What I'll Change
535
+ [1 sentence summary]
536
+
537
+ ## New Code
538
+ ```python
539
+ # Add to: path/to/file.py
540
+
541
+ [your code here - match existing style]
542
+ ```
543
+
544
+ ## Where to Add It
545
+ - File: `path/to/file.py`
546
+ - Location: After line X / In function Y / At the end
547
+
548
+ ## What It Does
549
+ 1. [First thing]
550
+ 2. [Second thing]
551
+
552
+ RULES:
553
+ - Match the existing code style exactly
554
+ - Include all necessary imports
555
+ - Handle errors properly
556
+ """
557
+
558
+ # =============================================================================
559
+ # PROMPT SELECTOR FUNCTION
560
+ # =============================================================================
561
+
562
def get_prompt_for_provider(prompt_name: str, provider: str = "gemini") -> str:
    """Get the appropriate prompt based on LLM provider.

    Args:
        prompt_name: Name of the prompt (e.g., "system_agent", "linear_rag")
        provider: LLM provider ("gemini", "groq", etc.)

    Returns:
        The appropriate prompt string for the provider
    """
    # Each entry pairs the standard prompt with its Groq-optimized variant.
    # Gemini and any unknown provider both resolve to the standard prompt,
    # matching the previous per-provider map with its "default" key.
    variants = {
        "system_agent": (SYSTEM_PROMPT_AGENT, GROQ_SYSTEM_PROMPT_AGENT),
        "linear_rag": (SYSTEM_PROMPT_LINEAR_RAG, GROQ_SYSTEM_PROMPT_LINEAR_RAG),
        "query_expansion": (QUERY_EXPANSION_PROMPT, GROQ_QUERY_EXPANSION_PROMPT),
        "answer_synthesis": (ANSWER_SYNTHESIS_PROMPT, GROQ_ANSWER_SYNTHESIS_PROMPT),
        "code_modification": (CODE_MODIFICATION_PROMPT, GROQ_CODE_MODIFICATION_PROMPT),
    }

    if prompt_name not in variants:
        raise ValueError(f"Unknown prompt name: {prompt_name}")

    standard, groq_variant = variants[prompt_name]
    return groq_variant if provider == "groq" else standard
code_chatbot/rag.py CHANGED
@@ -288,8 +288,8 @@ class ChatEngine:
288
 
289
  # Contextualize with history
290
  # Use comprehensive system prompt for high-quality answers
291
- from code_chatbot.prompts import SYSTEM_PROMPT_AGENT
292
- sys_content = SYSTEM_PROMPT_AGENT.format(repo_name=self.repo_name)
293
  system_msg = SystemMessage(content=sys_content)
294
 
295
  # Token Optimization: Only pass last 4 messages (2 turns) to keep context light.
@@ -396,15 +396,12 @@ class ChatEngine:
396
  "url": doc.metadata.get("url", f"file://{file_path}"),
397
  })
398
 
399
- # Build prompt with history
400
- qa_system_prompt = (
401
- f"You are a Code Chatbot, an expert software engineering assistant helping me quickly understand "
402
- f"a codebase called {self.repo_name}.\n"
403
- "Assume I am an advanced developer and answer my questions in the most succinct way possible.\n"
404
- "Always provide code examples where relevant.\n"
405
- "Link your answers to specific files if possible.\n\n"
406
- "Here are some snippets from the codebase:\n\n"
407
- f"{context_text}"
408
  )
409
 
410
  # Build messages with history
 
288
 
289
  # Contextualize with history
290
  # Use comprehensive system prompt for high-quality answers
291
+ from code_chatbot.prompts import get_prompt_for_provider
292
+ sys_content = get_prompt_for_provider("system_agent", self.provider).format(repo_name=self.repo_name)
293
  system_msg = SystemMessage(content=sys_content)
294
 
295
  # Token Optimization: Only pass last 4 messages (2 turns) to keep context light.
 
396
  "url": doc.metadata.get("url", f"file://{file_path}"),
397
  })
398
 
399
+ # Build prompt with history - use provider-specific prompt
400
+ from code_chatbot.prompts import get_prompt_for_provider
401
+ base_prompt = get_prompt_for_provider("linear_rag", self.provider)
402
+ qa_system_prompt = base_prompt.format(
403
+ repo_name=self.repo_name,
404
+ context=context_text
 
 
 
405
  )
406
 
407
  # Build messages with history
code_chatbot/universal_ingestor.py CHANGED
@@ -156,10 +156,16 @@ class ZIPFileManager(DataManager):
156
  if not os.path.exists(self.path):
157
  return
158
 
159
- IGNORE_DIRS = {'__pycache__', '.git', 'node_modules', 'venv', '.venv', '.env'}
160
  IGNORE_EXTENSIONS = {
161
  '.pyc', '.png', '.jpg', '.jpeg', '.gif', '.ico', '.svg', '.mp4', '.mov',
162
- '.zip', '.tar', '.gz', '.pdf', '.exe', '.bin', '.pkl', '.npy', '.pt', '.pth'
 
 
 
 
 
 
163
  }
164
 
165
  for root, dirs, files in os.walk(self.path):
@@ -169,6 +175,10 @@ class ZIPFileManager(DataManager):
169
  if file.startswith('.'):
170
  continue
171
 
 
 
 
 
172
  file_path = os.path.join(root, file)
173
  _, ext = os.path.splitext(file)
174
  if ext.lower() in IGNORE_EXTENSIONS:
@@ -204,10 +214,16 @@ class LocalDirectoryManager(DataManager):
204
 
205
  def walk(self, get_content: bool = True) -> Generator[Tuple[Any, Dict], None, None]:
206
  """Walks local directory."""
207
- IGNORE_DIRS = {'__pycache__', '.git', 'node_modules', 'venv', '.venv', '.env'}
208
  IGNORE_EXTENSIONS = {
209
  '.pyc', '.png', '.jpg', '.jpeg', '.gif', '.ico', '.svg', '.mp4', '.mov',
210
- '.zip', '.tar', '.gz', '.pdf', '.exe', '.bin', '.pkl', '.npy', '.pt', '.pth'
 
 
 
 
 
 
211
  }
212
 
213
  for root, dirs, files in os.walk(self.path):
@@ -217,6 +233,10 @@ class LocalDirectoryManager(DataManager):
217
  if file.startswith('.'):
218
  continue
219
 
 
 
 
 
220
  file_path = os.path.join(root, file)
221
  _, ext = os.path.splitext(file)
222
  if ext.lower() in IGNORE_EXTENSIONS:
 
156
  if not os.path.exists(self.path):
157
  return
158
 
159
+ IGNORE_DIRS = {'__pycache__', '.git', 'node_modules', 'venv', '.venv', '.env', 'dist', 'build'}
160
  IGNORE_EXTENSIONS = {
161
  '.pyc', '.png', '.jpg', '.jpeg', '.gif', '.ico', '.svg', '.mp4', '.mov',
162
+ '.zip', '.tar', '.gz', '.pdf', '.exe', '.bin', '.pkl', '.npy', '.pt', '.pth',
163
+ '.lock', '.log', '.sqlite3', '.db', '.min.js', '.min.css', '.map'
164
+ }
165
+ # Files to ignore by exact name (lock files, etc.)
166
+ IGNORE_FILES = {
167
+ 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml', 'poetry.lock',
168
+ 'Pipfile.lock', 'composer.lock', 'Gemfile.lock', 'Cargo.lock'
169
  }
170
 
171
  for root, dirs, files in os.walk(self.path):
 
175
  if file.startswith('.'):
176
  continue
177
 
178
+ # Skip ignored files by name
179
+ if file in IGNORE_FILES:
180
+ continue
181
+
182
  file_path = os.path.join(root, file)
183
  _, ext = os.path.splitext(file)
184
  if ext.lower() in IGNORE_EXTENSIONS:
 
214
 
215
  def walk(self, get_content: bool = True) -> Generator[Tuple[Any, Dict], None, None]:
216
  """Walks local directory."""
217
+ IGNORE_DIRS = {'__pycache__', '.git', 'node_modules', 'venv', '.venv', '.env', 'dist', 'build'}
218
  IGNORE_EXTENSIONS = {
219
  '.pyc', '.png', '.jpg', '.jpeg', '.gif', '.ico', '.svg', '.mp4', '.mov',
220
+ '.zip', '.tar', '.gz', '.pdf', '.exe', '.bin', '.pkl', '.npy', '.pt', '.pth',
221
+ '.lock', '.log', '.sqlite3', '.db', '.min.js', '.min.css', '.map'
222
+ }
223
+ # Files to ignore by exact name (lock files, etc.)
224
+ IGNORE_FILES = {
225
+ 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml', 'poetry.lock',
226
+ 'Pipfile.lock', 'composer.lock', 'Gemfile.lock', 'Cargo.lock'
227
  }
228
 
229
  for root, dirs, files in os.walk(self.path):
 
233
  if file.startswith('.'):
234
  continue
235
 
236
+ # Skip ignored files by name
237
+ if file in IGNORE_FILES:
238
+ continue
239
+
240
  file_path = os.path.join(root, file)
241
  _, ext = os.path.splitext(file)
242
  if ext.lower() in IGNORE_EXTENSIONS:
components/multi_mode.py CHANGED
@@ -359,33 +359,129 @@ def render_generate_mode(chat_engine):
359
 
360
  with st.spinner("πŸ€– Generating feature... (this may take 30-60 seconds)"):
361
  try:
362
- # Build comprehensive prompt
363
- prompt = f"""Generate a complete implementation for this feature:
364
 
365
- **Feature Request:**
 
 
 
 
 
366
  {feature_desc}
367
 
368
- **Requirements:**
369
- - Framework: {framework}
370
- - Include tests: {include_tests}
371
- - Include documentation: {include_docs}
372
- - Include examples: {include_examples}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
 
374
- **Please provide:**
375
- 1. A clear file structure showing all files to create
376
- 2. Complete, production-ready code for each file
377
- 3. Clear comments explaining the code
378
- 4. Setup/installation instructions
379
- 5. Usage examples
 
380
 
381
- Format each file like this:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
382
 
383
  ### `path/to/filename.py`
384
  ```python
385
- # Code here
 
 
 
 
 
386
  ```
387
 
388
- Make sure the code follows best practices and matches the existing codebase style."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
 
390
  # Use chat engine
391
  answer, sources = chat_engine.chat(prompt)
 
359
 
360
  with st.spinner("πŸ€– Generating feature... (this may take 30-60 seconds)"):
361
  try:
362
+ # Build comprehensive AI Engineer prompt
363
+ prompt = f"""You are a **Senior AI/Software Engineer** with 15+ years of experience building production systems at top tech companies. Your expertise spans system design, security, scalability, and clean code architecture.
364
 
365
+ ## 🎯 MISSION
366
+ Analyze the existing codebase and generate a **production-ready, enterprise-grade** implementation for the requested feature.
367
+
368
+ ---
369
+
370
+ ## πŸ“‹ FEATURE REQUEST
371
  {feature_desc}
372
 
373
+ ---
374
+
375
+ ## βš™οΈ CONFIGURATION
376
+ | Setting | Value |
377
+ |---------|-------|
378
+ | **Framework** | {framework} |
379
+ | **Include Tests** | {include_tests} |
380
+ | **Include Documentation** | {include_docs} |
381
+ | **Include Examples** | {include_examples} |
382
+
383
+ ---
384
+
385
+ ## 🧠 YOUR APPROACH (Follow This Process)
386
+
387
+ ### Phase 1: Architecture Analysis
388
+ Before writing code, analyze the existing codebase to understand:
389
+ - **Project structure** and conventions
390
+ - **Naming patterns** (snake_case, camelCase, etc.)
391
+ - **Import style** and module organization
392
+ - **Error handling** patterns
393
+ - **Logging** approach
394
+ - **Configuration** management style
395
+
396
+ ### Phase 2: Design the Solution
397
+ - Choose appropriate **design patterns** (Factory, Repository, Service Layer, etc.)
398
+ - Plan **database schema** changes if needed
399
+ - Define **API contracts** (request/response schemas)
400
+ - Consider **edge cases** and error scenarios
401
+ - Plan for **scalability** and performance
402
+
403
+ ### Phase 3: Implementation
404
+ Generate code that includes:
405
+
406
+ 1. **πŸ—οΈ Architecture Overview**
407
+ - High-level system diagram (ASCII or Mermaid)
408
+ - Component relationships and data flow
409
+
410
+ 2. **πŸ“ File Structure**
411
+ ```
412
+ feature_name/
413
+ ├── __init__.py
414
+ ├── models.py # Data models/schemas
415
+ ├── service.py # Business logic
416
+ ├── routes.py # API endpoints (if applicable)
417
+ ├── utils.py # Helper functions
418
+ └── tests/
419
+ ├── test_service.py
420
+ └── test_routes.py
421
+ ```
422
 
423
+ 3. **πŸ’» Complete Code** for each file with:
424
+ - **Type hints** on all functions
425
+ - **Docstrings** with Args, Returns, Raises
426
+ - **Input validation** and sanitization
427
+ - **Error handling** with custom exceptions
428
+ - **Logging** at appropriate levels
429
+ - **Security** considerations (auth, injection prevention, etc.)
430
 
431
+ 4. **πŸ§ͺ Test Suite** (if enabled):
432
+ - Unit tests with pytest
433
+ - Edge case coverage
434
+ - Mock external dependencies
435
+ - Minimum 80% code coverage target
436
+
437
+ 5. **πŸ“– Documentation** (if enabled):
438
+ - API documentation with examples
439
+ - Usage guide
440
+ - Configuration options
441
+
442
+ 6. **πŸš€ Integration Guide**:
443
+ - Step-by-step setup instructions
444
+ - Environment variables needed
445
+ - Dependencies to install
446
+ - How to integrate with existing code
447
+
448
+ ---
449
+
450
+ ## πŸ“ CODE FILE FORMAT
451
+ For each file, use this exact format:
452
 
453
  ### `path/to/filename.py`
454
  ```python
455
+ \"\"\"
456
+ Module docstring explaining purpose.
457
+ \"\"\"
458
+ # imports here
459
+
460
+ # code here with full implementation
461
  ```
462
 
463
+ ---
464
+
465
+ ## βœ… QUALITY CHECKLIST
466
+ Ensure your code:
467
+ - [ ] Follows existing codebase conventions
468
+ - [ ] Has no hardcoded values (use config/env vars)
469
+ - [ ] Handles all error cases gracefully
470
+ - [ ] Is thread-safe if applicable
471
+ - [ ] Has no security vulnerabilities
472
+ - [ ] Is optimized for performance
473
+ - [ ] Is maintainable and readable
474
+
475
+ ---
476
+
477
+ ## 🎨 STYLE REQUIREMENTS
478
+ - Clean, readable code over clever code
479
+ - Self-documenting function/variable names
480
+ - Comments for complex logic only
481
+ - Consistent formatting with project style
482
+ - DRY (Don't Repeat Yourself) principle
483
+
484
+ Now generate the complete, production-ready implementation:"""
485
 
486
  # Use chat engine
487
  answer, sources = chat_engine.chat(prompt)