Update vector.py

vector.py (CHANGED)
@@ -14,8 +14,31 @@ import atexit
 import gc
 from typing import List, Dict, Any, Optional, Tuple, Union
 from collections import defaultdict, OrderedDict  # <-- FIX 1: Add OrderedDict
+try:
+    import tree_sitter
+    from tree_sitter import Language, Parser
+    # Import individual language modules
+    try:
+        from tree_sitter_languages import get_language, get_parser
+        TREE_SITTER_IMPORTS_AVAILABLE = True
+    except ImportError:
+        TREE_SITTER_IMPORTS_AVAILABLE = False
+
+    TREE_SITTER_AVAILABLE = True
+    logger = logging.getLogger("NeuralSessionEngine")
+    logger.info("🌳 Tree-sitter successfully imported")
+
+    # Initialize parsers dictionary
+    TREE_SITTER_PARSERS = {}
+    TREE_SITTER_LANGUAGES = {}
+
+except ImportError as e:
+    TREE_SITTER_AVAILABLE = False
+    TREE_SITTER_IMPORTS_AVAILABLE = False
+    logging.warning(f"❌ Tree-sitter import failed: {e}")
+    logging.warning("Install: pip install tree-sitter tree-sitter-languages")

-# ===
+# === HYBRID SEARCH IMPORTS ===
 try:
     from rank_bm25 import BM25Okapi
     BM25_AVAILABLE = True
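
For context: the guarded imports added above gate every tree-sitter feature behind module-level availability flags, so the module still loads when the optional dependency is missing. A minimal sketch of the same idiom, with hypothetical helper names (chunk_with_tree_sitter / chunk_plain are stand-ins, not functions from this file):

    import logging

    try:
        from tree_sitter_languages import get_parser  # optional dependency
        TREE_SITTER_AVAILABLE = True
    except ImportError:
        TREE_SITTER_AVAILABLE = False
        logging.warning("tree-sitter unavailable; falling back to regex chunking")

    def chunk(text: str, filename: str):
        # Feature-gate: attempt AST-level chunking only when the import succeeded
        if TREE_SITTER_AVAILABLE:
            return chunk_with_tree_sitter(text, filename)  # hypothetical helper
        return chunk_plain(text, filename)                 # hypothetical fallback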

@@ -38,6 +61,7 @@ logging.basicConfig(
 )
 logger = logging.getLogger("NeuralSessionEngine")

+
 class VectorDatabase:
     def __init__(self, index_path="faiss_session_index.bin", metadata_path="session_metadata.pkl"):
         self.index_path = index_path

@@ -58,6 +82,8 @@ class VectorDatabase:
             logger.error(f"❌ Failed to load models: {e}")
             raise RuntimeError(f"Model initialization failed: {e}")

+        self.tree_sitter_parsers = {}
+        self.tree_sitter_languages = {}
         # Load or create index with file locking
         self._load_or_create_index()
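
_load_or_create_index itself is outside this diff; as background, a load-or-create routine for a FAISS index with pickled metadata typically looks like the sketch below (the flat L2 index type and the 384 dimension are assumptions, not taken from this file):

    import os
    import pickle

    import faiss

    def load_or_create_index(index_path: str, metadata_path: str, dim: int = 384):
        if os.path.exists(index_path) and os.path.exists(metadata_path):
            index = faiss.read_index(index_path)      # reload the persisted index
            with open(metadata_path, "rb") as f:
                metadata = pickle.load(f)             # parallel list of chunk dicts
        else:
            index = faiss.IndexFlatL2(dim)            # fresh brute-force L2 index
            metadata = []
        return index, metadata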

@@ -92,31 +118,6 @@ class VectorDatabase:

     # ==================== FIX 2: LAZY BM25 LOADING ====================

-    # ==================== BEADS ENGINE (Granular Context) ====================
-    def _text_to_beads(self, text: str, source: str) -> List[str]:
-        """
-        Converts text blocks into 'Beads' (atomic, self-contained facts).
-        Prevents context loss when chunks are retrieved out of order.
-        """
-        # 1. Clean and normalize
-        text = re.sub(r'\s+', ' ', text).strip()
-
-        # 2. Split into sentences (robust regex for speed)
-        # Matches punctuation followed by a space, avoiding abbreviations like "Mr." or "v1.0"
-        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
-
-        beads = []
-        for sent in sentences:
-            if len(sent) < 15: continue  # Skip noise/titles
-
-            # 3. CONTEXT INJECTION (the "Bead" magic)
-            # We prepend the filename to EVERY sentence.
-            # Old: "It returns 404." (useless) -> New: "[Source: app.py] It returns 404." (valuable)
-            bead = f"[{source}] {sent}"
-            beads.append(bead)
-
-        return beads
-
     def _get_or_build_bm25(self, user_id: str, chat_id: str) -> Optional[BM25Okapi]:
         """
         Retrieve BM25 index from cache or build it on-demand (Lazy Load).
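
The removed bead helper is easy to reproduce in isolation; a self-contained sketch of the same sentence-split-plus-source-prefix transformation (the example input is invented):

    import re
    from typing import List

    def text_to_beads(text: str, source: str) -> List[str]:
        text = re.sub(r'\s+', ' ', text).strip()
        # Same abbreviation-aware splitter the removed method used
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
        return [f"[{source}] {s}" for s in sentences if len(s) >= 15]

    # text_to_beads("Auth was refactored. It returns 404 now.", "app.py")
    # -> ['[app.py] Auth was refactored.', '[app.py] It returns 404 now.']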

@@ -216,42 +217,8 @@ class VectorDatabase:

     # ==================== ENHANCED STORAGE WITH CACHE INVALIDATION ====================

-    # ==================== BEADS ENGINE (New Feature) ====================
-
-    def _text_to_beads(self, text: str, source: str) -> List[str]:
-        """
-        Converts text blocks into 'Beads' (atomic, self-contained facts).
-        High-precision logic: prevents context loss by attaching source metadata to every sentence.
-        """
-        # 1. Clean and normalize
-        text = re.sub(r'\s+', ' ', text).strip()
-
-        # 2. Robust sentence splitting (regex)
-        # Splits on [.!?] but avoids common abbreviations (e.g., "Mr.", "v1.0", "e.g.")
-        # This is faster and lighter than NLTK for a production environment
-        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
-
-        beads = []
-        for sent in sentences:
-            if len(sent) < 15: continue  # Skip noise/tiny fragments
-
-            # 3. CONTEXT INJECTION (the "Bead" magic)
-            # Old: "It returns 404." (useless in isolation)
-            # New: "[Source: auth.py] It returns 404." (highly searchable)
-            bead = f"[{source}] {sent}"
-            beads.append(bead)
-
-        return beads
-
-    # ==================== STORAGE ENGINE (Updated) ====================
-
     def store_session_document(self, text: str, filename: str, user_id: str, chat_id: str, file_id: str = None) -> bool:
-        """
-        Store extracted file content with Hybrid Bead Strategy.
-        - Stores CHUNKS for reading/context.
-        - Stores BEADS for high-precision search.
-        - Stores WHOLE FILE for full retrieval.
-        """
+        """Store extracted file content with enhanced chunking and cache invalidation"""
         if not text or len(text) < 10 or not user_id:
             logger.warning(f"Invalid input for {filename}")
             return False
@@ -261,10 +228,17 @@ class VectorDatabase:
         chunks_data = []
         ext = os.path.splitext(filename)[1].lower()

-        # 1. Intelligent Chunking Strategy
         try:
-            if ext == '.py':
-                chunks_data = self._chunk_python_ast_enhanced(text, filename)
+            if TREE_SITTER_AVAILABLE and ext in [
+                '.py', '.js', '.jsx', '.ts', '.tsx', '.java', '.cpp', '.c', '.cc',
+                '.go', '.rs', '.php', '.rb', '.cs', '.swift', '.kt', '.scala',
+                '.lua', '.r', '.sh', '.bash', '.sql', '.html', '.css', '.xml',
+                '.json', '.yaml', '.yml', '.toml', '.vue', '.md'
+            ]:
+                chunks_data = self._chunk_with_tree_sitter(text, filename)
+                logger.debug(f"Used Tree-sitter for {filename}")
+            elif ext == '.py':
+                chunks_data = self._chunk_python_ast_enhanced(text, filename)
             elif ext in ['.js', '.html', '.css', '.java', '.cpp', '.ts', '.tsx', '.jsx', '.vue', '.xml', '.scss']:
                 chunks_data = self._chunk_smart_code(text, filename)
             else:

@@ -273,9 +247,12 @@ class VectorDatabase:
             logger.error(f"Chunking failed for {filename}: {e}")
             chunks_data = self._chunk_text_enhanced(text, chunk_size=600, overlap=100)

-        # Fallback for empty chunking
         if not chunks_data and text:
-            chunks_data = [{
+            chunks_data = [{
+                "text": text[:2000],
+                "type": "fallback",
+                "name": "full_document"
+            }]

         if not chunks_data:
             logger.error(f"No chunks generated for {filename}")
@@ -285,46 +262,26 @@ class VectorDatabase:
         final_meta = []

         for chunk in chunks_data:
-            # A. Store Standard Chunk (The "Paragraph")
             final_texts.append(chunk["text"])
             final_meta.append({
                 "text": chunk["text"],
                 "source": filename,
                 "file_id": file_id,
-                "type": "
+                "type": "file",
                 "subtype": chunk.get("type", "general"),
                 "name": chunk.get("name", "unknown"),
                 "user_id": user_id,
                 "chat_id": chat_id,
-                "timestamp": time.time()
+                "timestamp": time.time(),
+                "chunk_index": len(final_texts)
             })

-            beads = self._text_to_beads(chunk["text"], filename)
-            for bead in beads:
-                final_texts.append(bead)
-                final_meta.append({
-                    "text": bead,
-                    "source": filename,
-                    "file_id": file_id,
-                    "type": "bead",  # Marked for high-precision search
-                    "subtype": "atomic_fact",
-                    "user_id": user_id,
-                    "chat_id": chat_id,
-                    "timestamp": time.time(),
-                    "actual_content": chunk["text"]  # Link back to parent chunk
-                })
-
-        # C. Store Whole File Reference (The "Library Book")
-        # We increase the embedding limit to 8000 chars for better summary matching.
-        # CRITICAL: 'actual_content' stores the UNLIMITED full text as requested.
-        whole_file_summary = text[:8000]
-        final_texts.append(f"Complete File: {filename} | Content Start: {whole_file_summary}")
+        # Whole file embedding for comprehensive answers
+        whole_file_text = text[:4000] if len(text) > 4000 else text
+        final_texts.append(f"Complete File: {filename} | Full Content: {whole_file_text}")
         final_meta.append({
-            "text":
-            "actual_content": text,
+            "text": whole_file_text,
+            "actual_content": text,
             "source": filename,
             "file_id": file_id,
             "type": "file",

@@ -337,7 +294,7 @@ class VectorDatabase:
         })

     try:
-            # Optimized
+        # Optimized embedding
         embeddings = self.embedder.encode(
             final_texts,
             show_progress_bar=False,

@@ -353,20 +310,663 @@ class VectorDatabase:
         self.metadata.extend(final_meta)
         self._save_index()

-
+        logger.info(f"✅ Stored {len(final_texts)} chunks from {filename} for user {user_id[:8]}")
+
+        # ===== FIX 4: CACHE INVALIDATION instead of Immediate Rebuild =====
+        # When new files arrive, just invalidate the old cache.
+        # It will auto-rebuild (including the new file) on next search.
         self._invalidate_bm25_cache(user_id, chat_id)

-
+        self._verify_storage(user_id, chat_id, len(final_texts))
+
         return True

     except Exception as e:
         logger.error(f"❌ Failed to store vectors for {filename}: {e}")
-        #
+        # Clean up partial storage
         with self.memory_lock:
             if self.index.ntotal >= len(final_texts):
+                logger.warning("Rolling back partial storage...")
                 self._rollback_partial_storage(user_id, chat_id)
         return False
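
The "FIX 4" comment above describes an invalidate-on-write / rebuild-on-read cache. The pattern in isolation, as a hedged sketch (the class name and the build callback are assumptions, not code from this file):

    class LazyBM25Cache:
        def __init__(self):
            self._cache = {}                      # (user_id, chat_id) -> BM25 index

        def invalidate(self, user_id: str, chat_id: str) -> None:
            # Cheap write path: drop the stale entry instead of rebuilding now
            self._cache.pop((user_id, chat_id), None)

        def get(self, user_id: str, chat_id: str, build):
            # Read path: rebuild on demand, so newly stored files are included
            key = (user_id, chat_id)
            if key not in self._cache:
                self._cache[key] = build(user_id, chat_id)
            return self._cache[key]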
+
+    def _get_tree_sitter_parser(self, language_name: str) -> Optional[Any]:
+        """Get or create a tree-sitter parser for a specific language (Robust Loader)."""
+        if not TREE_SITTER_AVAILABLE:
+            return None
+
+        # 1. CHECK CACHE FIRST
+        if language_name in self.tree_sitter_parsers:
+            return self.tree_sitter_parsers[language_name]
+
+        # 2. DEFINE MAP EARLY (Critical for fallback logic)
+        lang_lib_map = {
+            'python': 'tree_sitter_python',
+            'javascript': 'tree_sitter_javascript',
+            'typescript': 'tree_sitter_typescript',
+            'java': 'tree_sitter_java',
+            'cpp': 'tree_sitter_cpp',
+            'c': 'tree_sitter_c',
+            'go': 'tree_sitter_go',
+            'rust': 'tree_sitter_rust',
+            'php': 'tree_sitter_php',
+            'ruby': 'tree_sitter_ruby',
+            'c_sharp': 'tree_sitter_c_sharp',
+            'swift': 'tree_sitter_swift',
+            'kotlin': 'tree_sitter_kotlin',
+            'scala': 'tree_sitter_scala',
+            'html': 'tree_sitter_html',
+            'css': 'tree_sitter_css',
+            'json': 'tree_sitter_json',
+            'yaml': 'tree_sitter_yaml',
+            'toml': 'tree_sitter_toml',
+            'xml': 'tree_sitter_xml',
+            'markdown': 'tree_sitter_markdown',
+            'bash': 'tree_sitter_bash',
+            'sql': 'tree_sitter_sql'
+        }
+
+        try:
+            logger.debug(f"🌳 Creating parser for {language_name}")
+
+            # 3. PLAN A: Try using tree_sitter_languages (The Easy Way)
+            if TREE_SITTER_IMPORTS_AVAILABLE:
+                try:
+                    parser = get_parser(language_name)
+                    if parser:
+                        self.tree_sitter_parsers[language_name] = parser
+                        # self.tree_sitter_languages[language_name] = ... (helper handles this usually)
+                        logger.debug(f"✅ Got parser for {language_name} via tree_sitter_languages")
+                        return parser
+                except Exception as e:
+                    logger.warning(f"⚠️ Plan A failed (tree_sitter_languages) for {language_name}: {e}")
+
+            # 4. PLAN B: Manual Loading (The Robust Way)
+            # This handles cases where the helper lib fails but the specific lang lib is installed
+            if language_name in lang_lib_map:
+                lib_name = lang_lib_map[language_name]
+                try:
+                    parser = Parser()
+                    language = None
+
+                    # Import the specific module
+                    module = __import__(lib_name)
+
+                    # Extract Language object (supports both Property and Function styles)
+                    if hasattr(module, 'language'):
+                        lang_obj = module.language
+                        if callable(lang_obj):
+                            language = lang_obj()
+                        else:
+                            language = lang_obj
+
+                    if language:
+                        parser.set_language(language)
+                        self.tree_sitter_parsers[language_name] = parser
+                        self.tree_sitter_languages[language_name] = language
+                        logger.debug(f"✅ Loaded {language_name} manually from {lib_name}")
+                        return parser
+
+                except ImportError:
+                    # Silence this warning usually, or log debug if needed
+                    logger.debug(f"⚠️ Manual load skipped: {lib_name} not installed.")
+                except Exception as e:
+                    logger.warning(f"❌ Manual load error for {lib_name}: {e}")
+
+            logger.warning(f"❌ Could not load parser for {language_name} (Plan A and B failed)")
+            return None
+
+        except Exception as e:
+            logger.error(f"❌ Critical parser error for {language_name}: {e}")
+            return None
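
For readers unfamiliar with the API the loader returns: a short example of parsing with a tree_sitter_languages parser and inspecting the tree (node type names vary by grammar version, so the printed output is indicative only):

    from tree_sitter_languages import get_parser

    parser = get_parser("python")
    tree = parser.parse(bytes("def add(a, b):\n    return a + b\n", "utf-8"))

    for node in tree.root_node.children:
        # e.g. prints: function_definition (0, 0) -> (1, 16)
        print(node.type, node.start_point, "->", node.end_point)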
+
+    def _chunk_with_tree_sitter(self, text: str, filename: str) -> List[Dict[str, Any]]:
+        """
+        ENHANCED Tree-sitter based code chunking with hybrid language support.
+        Now properly handles files with multiple languages (HTML/CSS/JS, Vue, etc.)
+        """
+        if not TREE_SITTER_AVAILABLE:
+            logger.warning("❌ TREE-SITTER UNAVAILABLE: Falling back to alternative methods")
+            ext = os.path.splitext(filename)[1].lower()
+            if ext == '.py':
+                return self._chunk_python_ast_enhanced(text, filename)
+            return self._chunk_smart_code(text, filename)
+
+        ext = os.path.splitext(filename)[1].lower()
+
+        # Map extensions to tree-sitter language names
+        language_map = {
+            '.py': 'python',
+            '.js': 'javascript',
+            '.jsx': 'javascript',
+            '.ts': 'typescript',
+            '.tsx': 'typescript',
+            '.java': 'java',
+            '.cpp': 'cpp',
+            '.c': 'c',
+            '.cc': 'cpp',
+            '.h': 'c',
+            '.hpp': 'cpp',
+            '.go': 'go',
+            '.rs': 'rust',
+            '.php': 'php',
+            '.rb': 'ruby',
+            '.cs': 'c_sharp',
+            '.swift': 'swift',
+            '.kt': 'kotlin',
+            '.kts': 'kotlin',
+            '.scala': 'scala',
+            '.lua': 'lua',
+            '.r': 'r',
+            '.sh': 'bash',
+            '.bash': 'bash',
+            '.zsh': 'bash',
+            '.sql': 'sql',
+            '.html': 'html',
+            '.htm': 'html',
+            '.css': 'css',
+            '.scss': 'css',
+            '.sass': 'css',
+            '.json': 'json',
+            '.yaml': 'yaml',
+            '.yml': 'yaml',
+            '.toml': 'toml',
+            '.xml': 'xml',
+            '.vue': 'vue',
+            '.md': 'markdown',
+        }
+
+        language_name = language_map.get(ext)
+        if not language_name:
+            logger.warning(f"🌐 NO PARSER FOR EXTENSION: {ext} for {filename}, falling back to smart chunking")
+            return self._chunk_smart_code(text, filename)
+
+        # Define fallback chains for robust parsing
+        fallback_sequence = [language_name]
+
+        if language_name == 'javascript':
+            fallback_sequence = ['javascript', 'tsx', 'typescript']
+        elif language_name == 'typescript':
+            fallback_sequence = ['typescript', 'tsx']
+        elif language_name == 'jsx':
+            fallback_sequence = ['javascript', 'tsx']
+        elif language_name == 'tsx':
+            fallback_sequence = ['tsx', 'typescript']
+
+        # Special handling for hybrid language files
+        if language_name in ['html', 'vue']:
+            return self._chunk_hybrid_file(text, filename, language_name)
+
+        return self._chunk_single_language(text, filename, fallback_sequence)
+
+    def _chunk_single_language(self, text: str, filename: str, language_names: Union[str, List[str]]) -> List[Dict[str, Any]]:
+        """Chunk a file with a single programming language, trying multiple parsers if needed."""
+        if isinstance(language_names, str):
+            language_names = [language_names]
+
+        chunks = []
+
+        for lang in language_names:
+            try:
+                parser = self._get_tree_sitter_parser(lang)
+                if not parser:
+                    continue
+
+                # Ensure text is bytes for tree-sitter
+                text_bytes = bytes(text, 'utf-8')
+                tree = parser.parse(text_bytes)
+                root_node = tree.root_node
+
+                # CRITICAL CHECK: If root is ERROR, this parser failed completely
+                if not root_node or root_node.type == 'ERROR':
+                    logger.warning(f"⚠️ Parser {lang} failed (Root ERROR) for {filename}. Trying next..." if len(language_names) > 1 else f"⚠️ Parser {lang} failed for {filename}")
+                    continue
+
+                # Define node types to extract based on language
+                node_types_config = self._get_node_types_config(lang)
+                target_types = node_types_config.get('extract', [])
+                skip_types = node_types_config.get('skip', [])
+                name_fields = node_types_config.get('name_fields', ['identifier', 'name'])
+
+                local_chunks = []
+
+                # Helper to extract node text with context
+                def extract_node_with_context(node, node_type, current_lang):
+                    start_line = node.start_point[0]
+                    end_line = node.end_point[0]
+
+                    # Adjust context based on language type
+                    context_config = node_types_config.get('context', {})
+                    context_before = context_config.get('before', 5)
+                    context_after = context_config.get('after', 5)
+
+                    # Extract the node text
+                    node_text = text_bytes[node.start_byte:node.end_byte].decode('utf-8', errors='ignore')
+
+                    # Get context lines
+                    lines = text.splitlines()
+                    context_start = max(0, start_line - context_before)
+                    context_end = min(len(lines), end_line + context_after + 1)
+
+                    # Build context segment
+                    if context_start < start_line or context_end > end_line + 1:
+                        segment_lines = lines[context_start:context_end]
+                        segment = '\n'.join(segment_lines)
+                    else:
+                        segment = node_text
+
+                    # Extract node name
+                    node_name = self._extract_node_name(node, text_bytes, name_fields)
+                    if not node_name:
+                        node_name = f"{node_type}_{start_line + 1}"
+
+                    return {
+                        "text": f"File: {filename} | Type: {node_type} | Name: {node_name}\n{segment}",
+                        "type": f"code_{node_type}",
+                        "name": node_name,
+                        "line_start": start_line + 1,
+                        "line_end": end_line + 1,
+                        "context_start": context_start + 1,
+                        "context_end": context_end,
+                        "language": current_lang
+                    }
+
+                # Recursively find target nodes
+                def find_target_nodes(node, depth=0):
+                    if depth > 200:  # Prevent infinite recursion
+                        return
+
+                    if node.type in skip_types:
+                        return
+
+                    if node.type in target_types:
+                        extract = True
+                        # Heuristic: If node has ERROR child, it might be granularly broken
+                        # But for now we accept it unless it's total garbage
+                        if extract:
+                            local_chunks.append(extract_node_with_context(node, node.type, lang))
+
+                    for child in node.children:
+                        find_target_nodes(child, depth + 1)
+
+                # Start traversal
+                find_target_nodes(root_node)
+
+                # Add imports/top-level declarations
+                import_chunks = self._extract_imports(root_node, text_bytes, lang, filename)
+                if import_chunks:
+                    local_chunks = import_chunks + local_chunks
+
+                # Success criteria: If we found chunks, we consider this parser successful
+                if local_chunks:
+                    chunks = local_chunks
+                    logger.info(f"✅ TREE-SITTER SUCCESS: Parsed {filename} with ({lang}) into {len(chunks)} chunks")
+                    return chunks
+
+                # If no chunks found, it might mean the parser didn't match anything useful (or syntax was weird)
+                # We continue to next parser if available
+                logger.debug(f"ℹ️ Parser {lang} yielded 0 chunks for {filename}. Trying next...")
+
+            except Exception as e:
+                logger.warning(f"⚠️ Parser {lang} exception for {filename}: {e}")
+                continue
+
+        # If we get here, all parsers failed or returned 0 chunks
+        logger.warning(f"❌ ALL Parsers failed for {filename}, falling back to smart chunking")
+        # Final fallback check
+        ext = os.path.splitext(filename)[1].lower()
+        if ext == '.py':
+            return self._chunk_python_ast_enhanced(text, filename)
+        return self._chunk_smart_code(text, filename)
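
The recursive extraction above can be exercised on its own; a sketch that collects function_definition nodes and their exact byte spans, mirroring find_target_nodes (assumes the Python grammar is installed):

    from tree_sitter_languages import get_parser

    src = b"import os\n\ndef ping():\n    return 'pong'\n"
    tree = get_parser("python").parse(src)

    found = []
    def walk(node, depth=0):
        if depth > 200:                      # same recursion guard as above
            return
        if node.type == "function_definition":
            found.append(src[node.start_byte:node.end_byte].decode("utf-8"))
        for child in node.children:
            walk(child, depth + 1)

    walk(tree.root_node)
    # found == ["def ping():\n    return 'pong'"]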
+
+    def _chunk_hybrid_file(self, text: str, filename: str, primary_lang: str) -> List[Dict[str, Any]]:
+        """
+        Chunk files that contain multiple languages (HTML with CSS/JS, Vue files, etc.)
+        """
+        chunks = []
+
+        if primary_lang == 'html':
+            # Use regex-based approach for HTML to avoid tree-sitter issues
+            return self._chunk_html_with_embedded_languages(text, filename)
+
+        elif primary_lang == 'vue':
+            # Vue files have template, script, style sections
+            return self._chunk_vue_file(text, filename)
+
+        # Default fallback
+        return self._chunk_smart_code(text, filename)
+
+    def _chunk_html_with_embedded_languages(self, text: str, filename: str) -> List[Dict[str, Any]]:
+        """Chunk HTML files with embedded CSS and JavaScript."""
+        chunks = []
+
+        # Split HTML into sections
+        lines = text.splitlines()
+
+        # Find all script and style tags
+        script_pattern = re.compile(r'<script(\s[^>]*)?>([\s\S]*?)</script>', re.IGNORECASE)
+        style_pattern = re.compile(r'<style(\s[^>]*)?>([\s\S]*?)</style>', re.IGNORECASE)
+
+        # Extract and chunk script blocks
+        for match in script_pattern.finditer(text):
+            full_match = match.group(0)
+            attrs = match.group(1) or ""
+            content = match.group(2)
+
+            # Determine language
+            lang = 'javascript'
+            if 'type="text/typescript"' in attrs or 'lang="ts"' in attrs:
+                lang = 'typescript'
+
+            # Find line numbers
+            start_pos = match.start()
+            line_num = text[:start_pos].count('\n') + 1
+
+            # Chunk the script content
+            if content.strip():
+                script_chunks = self._chunk_single_language(content, filename, lang)
+                if script_chunks:
+                    for chunk in script_chunks:
+                        chunk['text'] = f"File: {filename} | In <script> block (starting line {line_num}) | Language: {lang}\n{chunk['text']}"
+                        chunk['type'] = 'html_script_' + chunk['type']
+                        chunk['language'] = lang
+                    chunks.extend(script_chunks)
+
+        # Extract and chunk style blocks
+        for match in style_pattern.finditer(text):
+            full_match = match.group(0)
+            attrs = match.group(1) or ""
+            content = match.group(2)
+
+            # Determine language
+            lang = 'css'
+            if 'lang="scss"' in attrs:
+                lang = 'css'  # Treat SCSS as CSS for now
+
+            # Find line numbers
+            start_pos = match.start()
+            line_num = text[:start_pos].count('\n') + 1
+
+            # Chunk the style content
+            if content.strip():
+                style_chunks = self._chunk_single_language(content, filename, lang)
+                if style_chunks:
+                    for chunk in style_chunks:
+                        chunk['text'] = f"File: {filename} | In <style> block (starting line {line_num}) | Language: {lang}\n{chunk['text']}"
+                        chunk['type'] = 'html_style_' + chunk['type']
+                        chunk['language'] = lang
+                    chunks.extend(style_chunks)
+
+        # Chunk remaining HTML content
+        # Remove script and style blocks for HTML-only chunking
+        html_only = text
+        for match in script_pattern.finditer(text):
+            # Calculate line numbers separately to avoid backslash in f-string
+            start_line = text[:match.start()].count('\n') + 1
+            end_line = text[:match.end()].count('\n') + 1
+            html_only = html_only.replace(match.group(0), f"<!-- SCRIPT BLOCK REMOVED (lines {start_line}-{end_line}) -->")
+
+        for match in style_pattern.finditer(text):
+            # Calculate line numbers separately to avoid backslash in f-string
+            start_line = text[:match.start()].count('\n') + 1
+            end_line = text[:match.end()].count('\n') + 1
+            html_only = html_only.replace(match.group(0), f"<!-- STYLE BLOCK REMOVED (lines {start_line}-{end_line}) -->")
+
+        # Use smart chunking for HTML
+        html_chunks = self._chunk_smart_code(html_only, filename)
+        if html_chunks:
+            for chunk in html_chunks:
+                chunk['type'] = 'html_' + chunk['type']
+                chunk['language'] = 'html'
+            chunks.extend(html_chunks)
+
+        if not chunks:
+            return self._chunk_smart_code(text, filename)
+
+        logger.info(f"✅ HYBRID HTML PARSED: {filename} into {len(chunks)} mixed-language chunks")
+        return chunks
+
+    def _chunk_vue_file(self, text: str, filename: str) -> List[Dict[str, Any]]:
+        """Chunk Vue.js files with template, script, and style sections."""
+        chunks = []
+
+        # Extract template section
+        template_match = re.search(r'<template[^>]*>([\s\S]*?)</template>', text)
+        if template_match:
+            template_content = template_match.group(1)
+            # Find line numbers
+            start_pos = template_match.start()
+            line_num = text[:start_pos].count('\n') + 1
+
+            # Chunk template (treat as HTML)
+            template_chunks = self._chunk_smart_code(template_content, filename)
+            if template_chunks:
+                for chunk in template_chunks:
+                    chunk['text'] = f"File: {filename} | Vue Template Section (starting line {line_num})\n{chunk['text']}"
+                    chunk['type'] = 'vue_template_' + chunk['type']
+                    chunk['language'] = 'html'
+                chunks.extend(template_chunks)
+
+        # Extract script section
+        script_match = re.search(r'<script[^>]*>([\s\S]*?)</script>', text, re.DOTALL)
+        if script_match:
+            script_content = script_match.group(1)
+            attrs = script_match.group(0)[:script_match.group(0).index('>')]
+            # Find line numbers
+            start_pos = script_match.start()
+            line_num = text[:start_pos].count('\n') + 1
+
+            # Detect language
+            lang = 'javascript'
+            if 'lang="ts"' in attrs or 'lang="typescript"' in attrs:
+                lang = 'typescript'
+
+            # Chunk script
+            script_chunks = self._chunk_single_language(script_content, filename, lang)
+            if script_chunks:
+                for chunk in script_chunks:
+                    chunk['text'] = f"File: {filename} | Vue Script Section (starting line {line_num}) | Language: {lang}\n{chunk['text']}"
+                    chunk['type'] = 'vue_script_' + chunk['type']
+                    chunk['language'] = lang
+                chunks.extend(script_chunks)
+
+        # Extract style section
+        style_match = re.search(r'<style[^>]*>([\s\S]*?)</style>', text, re.DOTALL)
+        if style_match:
+            style_content = style_match.group(1)
+            attrs = style_match.group(0)[:style_match.group(0).index('>')]
+            # Find line numbers
+            start_pos = style_match.start()
+            line_num = text[:start_pos].count('\n') + 1
+
+            # Detect language
+            lang = 'css'
+            if 'lang="scss"' in attrs:
+                lang = 'css'  # Treat SCSS as CSS
+
+            # Chunk style
+            style_chunks = self._chunk_single_language(style_content, filename, lang)
+            if style_chunks:
+                for chunk in style_chunks:
+                    chunk['text'] = f"File: {filename} | Vue Style Section (starting line {line_num}) | Language: {lang}\n{chunk['text']}"
+                    chunk['type'] = 'vue_style_' + chunk['type']
+                    chunk['language'] = lang
+                chunks.extend(style_chunks)
+
+        if not chunks:
+            return self._chunk_smart_code(text, filename)
+
+        logger.info(f"✅ VUE PARSED: {filename} into {len(chunks)} chunks")
+        return chunks
+
+    def _get_node_types_config(self, language_name: str) -> Dict[str, Any]:
+        """Get configuration for what node types to extract for each language."""
+        configs = {
+            'python': {
+                'extract': ['function_definition', 'class_definition', 'async_function_definition'],
+                'skip': ['decorated_definition'],
+                'name_fields': ['identifier', 'name'],
+                'context': {'before': 2, 'after': 2}
+            },
+            'javascript': {
+                'extract': ['function_declaration', 'method_definition', 'class_declaration',
+                            'arrow_function', 'function_expression', 'variable_declaration',
+                            'export_statement'],
+                'skip': [],
+                'name_fields': ['identifier', 'name', 'property_identifier'],
+                'context': {'before': 5, 'after': 5}
+            },
+            'tsx': {
+                'extract': ['function_declaration', 'method_declaration', 'class_declaration',
+                            'arrow_function', 'interface_declaration', 'type_alias_declaration',
+                            'enum_declaration', 'export_statement', 'variable_declaration',
+                            'lexical_declaration'],
+                'skip': [],
+                'name_fields': ['identifier', 'name', 'type_identifier'],
+                'context': {'before': 2, 'after': 2}
+            },
+            'java': {
+                'extract': ['method_declaration', 'class_declaration', 'interface_declaration',
+                            'constructor_declaration'],
+                'skip': [],
+                'name_fields': ['identifier'],
+                'context': {'before': 2, 'after': 2}
+            },
+            'cpp': {
+                'extract': ['function_definition', 'class_specifier', 'struct_specifier',
+                            'namespace_definition'],
+                'skip': [],
+                'name_fields': ['identifier', 'type_identifier'],
+                'context': {'before': 2, 'after': 2}
+            },
+            'c': {
+                'extract': ['function_definition', 'struct_specifier', 'declaration'],
+                'skip': [],
+                'name_fields': ['identifier'],
+                'context': {'before': 2, 'after': 2}
+            },
+            'go': {
+                'extract': ['function_declaration', 'method_declaration', 'type_declaration'],
+                'skip': [],
+                'name_fields': ['identifier'],
+                'context': {'before': 2, 'after': 2}
+            },
+            'rust': {
+                'extract': ['function_item', 'impl_item', 'struct_item', 'trait_item',
+                            'enum_item', 'mod_item'],
+                'skip': [],
+                'name_fields': ['identifier'],
+                'context': {'before': 2, 'after': 2}
+            },
+            'html': {
+                'extract': ['element', 'script_element', 'style_element'],
+                'skip': ['text'],
+                'name_fields': ['tag_name'],
+                'context': {'before': 1, 'after': 1}
+            },
+            'css': {
+                'extract': ['rule_set', 'at_rule'],
+                'skip': [],
+                'name_fields': [],
+                'context': {'before': 1, 'after': 1}
+            },
+            'sql': {
+                'extract': ['select_statement', 'insert_statement', 'update_statement',
+                            'delete_statement', 'create_statement'],
+                'skip': [],
+                'name_fields': [],
+                'context': {'before': 1, 'after': 1}
+            }
+        }
+
+        return configs.get(language_name, {
+            'extract': ['function_definition', 'class_definition'],
+            'skip': [],
+            'name_fields': ['identifier', 'name'],
+            'context': {'before': 2, 'after': 2}
+        })
+
+    def _extract_node_name(self, node, text_bytes: bytes, name_fields: List[str]) -> str:
+        """Extract the name/identifier from a node."""
+        for field in name_fields:
+            for child in node.children:
+                if child.type == field:
+                    return text_bytes[child.start_byte:child.end_byte].decode('utf-8', errors='ignore')
+
+        # Try to find any identifier
+        for child in node.children:
+            if 'identifier' in child.type or 'name' in child.type:
+                return text_bytes[child.start_byte:child.end_byte].decode('utf-8', errors='ignore')
+
+        return ""
+
+    def _extract_imports(self, root_node, text_bytes: bytes, language_name: str, filename: str) -> List[Dict[str, Any]]:
+        """Extract import statements from the code."""
+        import_chunks = []
+
+        import_types = {
+            'python': ['import_statement', 'import_from_statement'],
+            'javascript': ['import_statement', 'import_declaration'],
+            'typescript': ['import_statement', 'import_declaration'],
+            'java': ['import_declaration'],
+            'cpp': ['preproc_include'],
+            'rust': ['use_declaration'],
+            'go': ['import_declaration'],
+            'php': ['use_declaration'],
+            'c_sharp': ['using_directive']
+        }
+
+        target_types = import_types.get(language_name, [])
+
+        def collect_imports(node):
+            if node.type in target_types:
+                import_text = text_bytes[node.start_byte:node.end_byte].decode('utf-8', errors='ignore')
+                if import_text:
+                    import_chunks.append({
+                        "text": f"File: {filename} | Import Statement:\n{import_text}",
+                        "type": "code_imports",
+                        "name": "imports",
+                        "line_start": node.start_point[0] + 1,
+                        "line_end": node.end_point[0] + 1,
+                        "language": language_name
+                    })
+
+            for child in node.children:
+                collect_imports(child)
+
+        collect_imports(root_node)
+
+        # Group imports if there are many
+        if len(import_chunks) > 5:
+            import_texts = []
+            for chunk in import_chunks:
+                # Extract just the import statement from the chunk text
+                import_lines = chunk['text'].split('\n', 1)
+                if len(import_lines) > 1:
+                    import_texts.append(import_lines[1])
+
+            return [{
+                "text": f"File: {filename} | Import Statements:\n" + "\n".join(import_texts[:10]) +
+                        (f"\n... and {len(import_texts) - 10} more" if len(import_texts) > 10 else ""),
+                "type": "code_imports",
+                "name": "imports_grouped",
+                "language": language_name
+            }]
+
+        return import_chunks
+
+    def _fallback_chunking(self, text: str, filename: str) -> List[Dict[str, Any]]:
+        """Fallback chunking method when tree-sitter fails."""
+        ext = os.path.splitext(filename)[1].lower()
+
+        if ext == '.py':
+            return self._chunk_python_ast_enhanced(text, filename)
+        elif ext in ['.js', '.jsx', '.ts', '.tsx', '.java', '.cpp', '.c', '.html', '.css', '.vue']:
+            return self._chunk_smart_code(text, filename)
+        else:
+            return self._chunk_text_enhanced(text)
+
     def delete_file(self, user_id: str, chat_id: str, file_id: str) -> bool:
         """Surgical Strike: Remove chunks belonging to a specific file ID"""
         with self.memory_lock:

@@ -542,8 +1142,7 @@ class VectorDatabase:
             results = self._semantic_first_fusion(semantic_results, bm25_results, final_k)
         elif strategy == "fusion":
            results = self._reciprocal_rank_fusion(bm25_results, semantic_results, final_k)
-
-            results = self._weighted_fusion(bm25_results, semantic_results, final_k)
+
         else:
             # Default to fusion
             results = self._reciprocal_rank_fusion(bm25_results, semantic_results, final_k)

@@ -612,51 +1211,85 @@ class VectorDatabase:

     # ==================== CORE METHODS (PRESERVED WITH FIXES) ====================

-    def
-        """Enhanced AST chunker with better context preservation"""
+    def _chunk_python_ast_enhanced(self, text: str, filename: str) -> List[Dict[str, Any]]:
         chunks = []
         try:
             tree = ast.parse(text)
             lines = text.splitlines()

-
-
-
-
+            # Helper to extract exact source including decorators
+            def get_source_segment(node):
+                # 1. Find start line (check decorators first)
+                start_lineno = node.lineno
+                if hasattr(node, 'decorator_list') and node.decorator_list:
+                    start_lineno = node.decorator_list[0].lineno
+
+                # 2. Add minimal context buffer (1 line)
+                start_idx = max(0, start_lineno - 2)
+                end_idx = getattr(node, 'end_lineno', start_lineno) + 1
+
+                return "\n".join(lines[start_idx:end_idx]), start_idx, end_idx
+
+            # Recursive visitor to flatten nested structures
+            class CodeVisitor(ast.NodeVisitor):
+                def visit_FunctionDef(self, node):
+                    self._add_chunk(node, "function")
+                    # Do NOT generic_visit chunks we've already handled to avoid duplicates
+                    # But DO visit nested functions if needed (optional)
+
+                def visit_AsyncFunctionDef(self, node):
+                    self._add_chunk(node, "async_function")
+
+                def visit_ClassDef(self, node):
+                    # 1. Create a "Summary Chunk" for the class definition (docstring + init)
+                    class_header, start, _ = get_source_segment(node)
+                    # Truncate body for the summary
+                    summary_text = f"Class Definition: {node.name}\n" + "\n".join(class_header.splitlines()[:10])
+
+                    chunks.append({
+                        "text": f"File: {filename} | Type: class_def | Name: {node.name}\n{summary_text}",
+                        "type": "code_class",
+                        "name": node.name,
+                        "line_start": start
+                    })
+
+                    # 2. Recursively visit children (methods)
+                    self.generic_visit(node)
+
+                def _add_chunk(self, node, type_label):
+                    content, start, end = get_source_segment(node)
+                    # Enforce context window limits here if needed
                     chunks.append({
-                        "text": f"File: {filename} | Type: {
-                        "type": "
+                        "text": f"File: {filename} | Type: {type_label} | Name: {node.name}\n{content}",
+                        "type": f"code_{type_label}",
                         "name": node.name,
                         "line_start": start,
                         "line_end": end
                     })
-
-            end = node.end_lineno
-            global_context.append("\n".join(lines[start:end]))
+
+            # Run the visitor
+            CodeVisitor().visit(tree)

-        #
+            # Capture Globals (Imports, Constants, Main Guard)
+            global_context = []
+            for node in tree.body:
+                if isinstance(node, (ast.Import, ast.ImportFrom, ast.Assign, ast.If)):
+                    # Only capture short logic blocks, skip giant if-blocks
+                    segment, _, _ = get_source_segment(node)
+                    if len(segment) < 500:
+                        global_context.append(segment)
+
             if global_context:
-
-            })
-
+                chunks.insert(0, {
+                    "text": f"File: {filename} | Global Context\n" + "\n".join(global_context),
+                    "type": "code_globals",
+                    "name": "globals"
+                })
+
         except Exception as e:
-            logger.warning(f"AST
-            return self._chunk_text_enhanced(text)
-
-        if not chunks:
-            return self._chunk_text_enhanced(text)
-
+            logger.warning(f"AST Parsing failed: {e}")
+            return self._chunk_text_enhanced(text)  # Fallback
+
         return chunks

     def _chunk_smart_code(self, text: str, filename: str) -> List[Dict[str, Any]]:
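
The rewritten chunker leans on the standard library's visitor and end-position attributes; a self-contained check of what it relies on (decorator_list, lineno, end_lineno), run on a toy source:

    import ast

    src = "@staticmethod\ndef greet(name):\n    return name\n"
    fn = ast.parse(src).body[0]

    print(fn.name)                      # greet
    print(fn.decorator_list[0].lineno)  # 1 -- the decorator line
    print(fn.lineno, fn.end_lineno)     # 2 3 -- the def line and last body line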

@@ -670,10 +1303,10 @@ class VectorDatabase:
             '.htm': r'(?=\n\s*<[^/])',
             '.xml': r'(?=\n\s*<[^/])',
             '.vue': r'(?=\n\s*<[^/])',
-            '.js': r'(?=\n\s*(?:function|class|
-            '.jsx': r'(?=\n\s*(?:function|class|
-            '.ts': r'(?=\n\s*(?:function|class|
-            '.tsx': r'(?=\n\s*(?:function|class|
+            '.js': r'(?=\n\s*(?:function|class|export|import|async|def))',
+            '.jsx': r'(?=\n\s*(?:function|class|export|import|async|def))',
+            '.ts': r'(?=\n\s*(?:function|class|export|import|async|interface|type|def))',
+            '.tsx': r'(?=\n\s*(?:function|class|export|import|async|interface|type|def))',
             '.css': r'(?=\n\s*[.#@a-zA-Z])',
             '.scss': r'(?=\n\s*[.#@a-zA-Z])',
             '.java': r'(?=\n\s*(?:public|private|protected|class|interface|enum|@))',

@@ -691,7 +1324,7 @@ class VectorDatabase:

     # Process with CONTEXT OVERLAP for better retrieval
         current_chunk = ""
-        TARGET_SIZE =
+        TARGET_SIZE = 1900
         OVERLAP_SIZE = 100

         for seg_idx, seg in enumerate(segments):
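
TARGET_SIZE/OVERLAP_SIZE implement a sliding window whose tail is re-emitted at the next chunk's head; the idea in isolation, with shrunken sizes (a sketch, not the method itself):

    from typing import List

    def window_chunks(text: str, target: int = 1900, overlap: int = 100) -> List[str]:
        chunks, start = [], 0
        while start < len(text):
            end = min(len(text), start + target)
            chunks.append(text[start:end])
            if end == len(text):
                break
            start = end - overlap   # carry the tail into the next chunk
        return chunks

    # window_chunks("abcdefghij", target=4, overlap=2)
    # -> ['abcd', 'cdef', 'efgh', 'ghij']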

@@ -784,6 +1417,8 @@ class VectorDatabase:

         return chunks

+
+
     # ==================== HELPER METHODS FOR HYBRID SEARCH ====================

     def _classify_query(self, query: str) -> str:

@@ -857,92 +1492,47 @@ class VectorDatabase:

         return results[:final_k]

-    def _reciprocal_rank_fusion(self, results1: List[Dict], results2: List[Dict],
-
-        """
-
-        #
-
-
-                         final_k: int, bm25_weight: float = 0.4,
-                         semantic_weight: float = 0.6) -> List[Dict]:
-        """Weighted combination of BM25 and semantic scores"""
-        # Normalize scores within each result set
-        def normalize_scores(results):
-            if not results:
-                return {}
-            max_score = max(r["score"] for r in results) if results else 1.0
-            if max_score == 0:
-                max_score = 1.0
-            return {r["id"]: r["score"] / max_score for r in results}
-
-        bm25_scores = normalize_scores(bm25_results)
-        semantic_scores = normalize_scores(semantic_results)
-
-        # Get all unique IDs
-        all_ids = set(bm25_scores.keys()) | set(semantic_scores.keys())
-
-        # Calculate weighted scores
-        weighted_scores = []
-        for doc_id in all_ids:
-            bm25_score = bm25_scores.get(doc_id, 0.0)
-            semantic_score = semantic_scores.get(doc_id, 0.0)
-            weighted = (bm25_score * bm25_weight) + (semantic_score * semantic_weight)
-            weighted_scores.append((doc_id, weighted))
-
-        # Sort by weighted score
-        weighted_scores.sort(key=lambda x: x[1], reverse=True)
-
-        # Create result mapping
-        results_map = {}
-        for r in bm25_results + semantic_results:
-            if r["id"] not in results_map:
-                results_map[r["id"]] = r
-
-        # Build final results
-        combined_results = []
-        for doc_id, weighted_score in weighted_scores:
-            if doc_id in results_map:
-                result = results_map[doc_id].copy()
-                result["score"] = weighted_score
-                result["match_type"] = "weighted_fusion"
-                combined_results.append(result)
-
-        return combined_results[:final_k]
+    def _reciprocal_rank_fusion(self, results1: List[Dict[str, Any]], results2: List[Dict[str, Any]],
+                                final_k: int, k: int = 60) -> List[Dict[str, Any]]:
+        """
+        Robust RRF Fusion for hybrid search (BM25 + Semantic).
+        Prioritizes BM25 metadata (results1) on overlaps for keyword precision.
+        Handles empty lists/duplicates gracefully; O(n log n) efficient.
+        """
+        merged_scores = defaultdict(float)
+        merged_meta: Dict[str, Dict[str, Any]] = {}
+
+        # Process semantic (results2) first
+        for rank, item in enumerate(results2):
+            doc_id = item.get("id")
+            if doc_id is None:
+                continue  # Skip invalid
+            score = 1.0 / (rank + k)
+            merged_scores[doc_id] += score
+            merged_meta[doc_id] = item.copy()  # Avoid mutating input
+
+        # Process BM25 (results1) second: overwrites meta for precision
+        for rank, item in enumerate(results1):
+            doc_id = item.get("id")
+            if doc_id is None:
+                continue
+            score = 1.0 / (rank + k)
+            merged_scores[doc_id] += score
+            merged_meta[doc_id] = item.copy()
+
+        # Sort by descending RRF score
+        sorted_ids = sorted(merged_scores, key=merged_scores.get, reverse=True)
+
+        # Package top-k
+        final_results = []
+        for doc_id in sorted_ids[:final_k]:
+            if doc_id in merged_meta:
+                res = merged_meta[doc_id].copy()
+                res["score"] = merged_scores[doc_id]
+                res["match_type"] = "hybrid_rrf"
+                final_results.append(res)
+
+        return final_results

     def _smart_rerank(self, query: str, candidates: List[Dict], final_k: int) -> List[Dict]:
         """Smart reranking using cross-encoder"""
|