Rajhuggingface4253 committed
Commit 8720a18 · verified · 1 Parent(s): 172427a

Update vector.py

Files changed (1)
  1. vector.py +812 -222
vector.py CHANGED
@@ -14,8 +14,31 @@ import atexit
14
  import gc
15
  from typing import List, Dict, Any, Optional, Tuple, Union
16
  from collections import defaultdict, OrderedDict # <-- FIX 1: Add OrderedDict
17
 
18
- # === NEW IMPORTS FOR HYBRID SEARCH ===
19
  try:
20
  from rank_bm25 import BM25Okapi
21
  BM25_AVAILABLE = True
@@ -38,6 +61,7 @@ logging.basicConfig(
38
  )
39
  logger = logging.getLogger("NeuralSessionEngine")
40
 
 
41
  class VectorDatabase:
42
  def __init__(self, index_path="faiss_session_index.bin", metadata_path="session_metadata.pkl"):
43
  self.index_path = index_path
@@ -58,6 +82,8 @@ class VectorDatabase:
58
  logger.error(f"❌ Failed to load models: {e}")
59
  raise RuntimeError(f"Model initialization failed: {e}")
60
 
 
 
61
  # Load or create index with file locking
62
  self._load_or_create_index()
63
 
@@ -92,31 +118,6 @@ class VectorDatabase:
92
 
93
  # ==================== FIX 2: LAZY BM25 LOADING ====================
94
 
95
- # ==================== BEADS ENGINE (Granular Context) ====================
96
- def _text_to_beads(self, text: str, source: str) -> List[str]:
97
- """
98
- Converts text blocks into 'Beads' (Atomic, Self-Contained Facts).
99
- Prevents context loss when chunks are retrieved out of order.
100
- """
101
- # 1. Clean and normalize
102
- text = re.sub(r'\s+', ' ', text).strip()
103
-
104
- # 2. Split into sentences (Robust Regex for speed)
105
- # Matches punctuation followed by space, avoiding abbreviations like "Mr." or "v1.0"
106
- sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
107
-
108
- beads = []
109
- for sent in sentences:
110
- if len(sent) < 15: continue # Skip noise/titles
111
-
112
- # 3. CONTEXT INJECTION (The "Bead" Magic)
113
- # We prepend the filename to EVERY sentence.
114
- # Old: "It returns 404." (Useless) -> New: "[Source: app.py] It returns 404." (Valuable)
115
- bead = f"[{source}] {sent}"
116
- beads.append(bead)
117
-
118
- return beads
119
-
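For reference, a minimal standalone sketch of the bead transformation being removed in this hunk, assuming the same regex split and 15-character noise filter shown above (illustrative only, not the committed code):

# Illustrative only: standalone version of the removed bead logic.
import re
from typing import List

def text_to_beads(text: str, source: str) -> List[str]:
    text = re.sub(r'\s+', ' ', text).strip()
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    # Prepend the source filename so every sentence stays self-contained.
    return [f"[{source}] {s}" for s in sentences if len(s) >= 15]

print(text_to_beads("The login endpoint validates tokens. It returns 404 when the user is missing.", "app.py"))
# ['[app.py] The login endpoint validates tokens.', '[app.py] It returns 404 when the user is missing.']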
120
  def _get_or_build_bm25(self, user_id: str, chat_id: str) -> Optional[BM25Okapi]:
121
  """
122
  Retrieve BM25 index from cache or build it on-demand (Lazy Load).
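FIX 1 above imports OrderedDict, which points at an LRU-style cache behind this lazy loader. A hypothetical minimal sketch of that pattern (the attribute names and size limit are assumptions, not visible in this diff):

# Hypothetical sketch: OrderedDict-based LRU cache of BM25 indexes per (user_id, chat_id).
from collections import OrderedDict
from typing import List
from rank_bm25 import BM25Okapi

class BM25Cache:
    def __init__(self, max_entries: int = 32):
        self._cache: "OrderedDict[tuple, BM25Okapi]" = OrderedDict()
        self._max_entries = max_entries

    def get_or_build(self, user_id: str, chat_id: str, corpus: List[str]) -> BM25Okapi:
        key = (user_id, chat_id)
        if key in self._cache:
            self._cache.move_to_end(key)              # mark as most recently used
            return self._cache[key]
        bm25 = BM25Okapi([doc.split() for doc in corpus])   # build on demand
        self._cache[key] = bm25
        if len(self._cache) > self._max_entries:
            self._cache.popitem(last=False)           # evict least recently used
        return bm25

    def invalidate(self, user_id: str, chat_id: str) -> None:
        self._cache.pop((user_id, chat_id), None)     # next search triggers a rebuild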
@@ -216,42 +217,8 @@ class VectorDatabase:
216
 
217
  # ==================== ENHANCED STORAGE WITH CACHE INVALIDATION ====================
218
 
219
- # ==================== BEADS ENGINE (New Feature) ====================
220
-
221
- def _text_to_beads(self, text: str, source: str) -> List[str]:
222
- """
223
- Converts text blocks into 'Beads' (Atomic, Self-Contained Facts).
224
- High-Precision Logic: Prevents context loss by attaching source metadata to every sentence.
225
- """
226
- # 1. Clean and normalize
227
- text = re.sub(r'\s+', ' ', text).strip()
228
-
229
- # 2. Robust Sentence Splitting (Regex)
230
- # Splits on [.!?] but avoids common abbreviations (e.g., "Mr.", "v1.0", "e.g.")
231
- # This is faster and lighter than NLTK for a production environment
232
- sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
233
-
234
- beads = []
235
- for sent in sentences:
236
- if len(sent) < 15: continue # Skip noise/tiny fragments
237
-
238
- # 3. CONTEXT INJECTION (The "Bead" Magic)
239
- # Old: "It returns 404." (Useless in isolation)
240
- # New: "[Source: auth.py] It returns 404." (Highly searchable)
241
- bead = f"[{source}] {sent}"
242
- beads.append(bead)
243
-
244
- return beads
245
-
246
- # ==================== STORAGE ENGINE (Updated) ====================
247
-
248
  def store_session_document(self, text: str, filename: str, user_id: str, chat_id: str, file_id: str = None) -> bool:
249
- """
250
- Store extracted file content with Hybrid Bead Strategy.
251
- - Stores CHUNKS for reading/context.
252
- - Stores BEADS for high-precision search.
253
- - Stores WHOLE FILE for full retrieval.
254
- """
255
  if not text or len(text) < 10 or not user_id:
256
  logger.warning(f"Invalid input for {filename}")
257
  return False
@@ -261,10 +228,17 @@ class VectorDatabase:
261
  chunks_data = []
262
  ext = os.path.splitext(filename)[1].lower()
263
 
264
- # 1. Intelligent Chunking Strategy
265
  try:
266
- if ext == '.py':
267
- chunks_data = self._chunk_python_ast(text, filename)
268
  elif ext in ['.js', '.html', '.css', '.java', '.cpp', '.ts', '.tsx', '.jsx', '.vue', '.xml', '.scss']:
269
  chunks_data = self._chunk_smart_code(text, filename)
270
  else:
@@ -273,9 +247,12 @@ class VectorDatabase:
273
  logger.error(f"Chunking failed for {filename}: {e}")
274
  chunks_data = self._chunk_text_enhanced(text, chunk_size=600, overlap=100)
275
 
276
- # Fallback for empty chunking
277
  if not chunks_data and text:
278
- chunks_data = [{"text": text[:2000], "type": "fallback", "name": "full_document"}]
279
 
280
  if not chunks_data:
281
  logger.error(f"No chunks generated for {filename}")
@@ -285,46 +262,26 @@ class VectorDatabase:
285
  final_meta = []
286
 
287
  for chunk in chunks_data:
288
- # A. Store Standard Chunk (The "Paragraph")
289
  final_texts.append(chunk["text"])
290
  final_meta.append({
291
  "text": chunk["text"],
292
  "source": filename,
293
  "file_id": file_id,
294
- "type": "chunk", # Standard reading block
295
  "subtype": chunk.get("type", "general"),
296
  "name": chunk.get("name", "unknown"),
297
  "user_id": user_id,
298
  "chat_id": chat_id,
299
- "timestamp": time.time()
 
300
  })
301
 
302
- # B. Generate & Store Beads (The "Atomic Facts")
303
- # We ONLY bead natural text or comments. We do NOT bead raw code functions (keeps syntax intact).
304
- if chunk.get("type") in ["text_paragraph", "text_block", "code_context", "fallback"]:
305
- beads = self._text_to_beads(chunk["text"], filename)
306
- for bead in beads:
307
- final_texts.append(bead)
308
- final_meta.append({
309
- "text": bead,
310
- "source": filename,
311
- "file_id": file_id,
312
- "type": "bead", # Marked for high-precision search
313
- "subtype": "atomic_fact",
314
- "user_id": user_id,
315
- "chat_id": chat_id,
316
- "timestamp": time.time(),
317
- "actual_content": chunk["text"] # Link back to parent chunk
318
- })
319
-
320
- # C. Store Whole File Reference (The "Library Book")
321
- # We increase the embedding limit to 8000 chars for better summary matching.
322
- # CRITICAL: 'actual_content' stores the UNLIMITED full text as requested.
323
- whole_file_summary = text[:8000]
324
- final_texts.append(f"Complete File: {filename} | Content Start: {whole_file_summary}")
325
  final_meta.append({
326
- "text": whole_file_summary,
327
- "actual_content": text, # <--- UNLIMITED FULL CONTENT STORAGE
328
  "source": filename,
329
  "file_id": file_id,
330
  "type": "file",
@@ -337,7 +294,7 @@ class VectorDatabase:
337
  })
338
 
339
  try:
340
- # Optimized Batch Embedding
341
  embeddings = self.embedder.encode(
342
  final_texts,
343
  show_progress_bar=False,
@@ -353,20 +310,663 @@ class VectorDatabase:
353
  self.metadata.extend(final_meta)
354
  self._save_index()
355
 
356
- # Invalidate Cache (Forces system to see new data immediately)
357
  self._invalidate_bm25_cache(user_id, chat_id)
358
 
359
- logger.info(f"✅ Stored {len(final_texts)} vectors (Chunks + Beads) for {filename}")
 
360
  return True
361
 
362
  except Exception as e:
363
  logger.error(f"❌ Failed to store vectors for {filename}: {e}")
364
- # Rollback logic (safety net)
365
  with self.memory_lock:
366
  if self.index.ntotal >= len(final_texts):
 
367
  self._rollback_partial_storage(user_id, chat_id)
368
  return False
369
370
  def delete_file(self, user_id: str, chat_id: str, file_id: str) -> bool:
371
  """Surgical Strike: Remove chunks belonging to a specific file ID"""
372
  with self.memory_lock:
@@ -542,8 +1142,7 @@ class VectorDatabase:
542
  results = self._semantic_first_fusion(semantic_results, bm25_results, final_k)
543
  elif strategy == "fusion":
544
  results = self._reciprocal_rank_fusion(bm25_results, semantic_results, final_k)
545
- elif strategy == "weighted":
546
- results = self._weighted_fusion(bm25_results, semantic_results, final_k)
547
  else:
548
  # Default to fusion
549
  results = self._reciprocal_rank_fusion(bm25_results, semantic_results, final_k)
@@ -612,51 +1211,85 @@ class VectorDatabase:
612
 
613
  # ==================== CORE METHODS (PRESERVED WITH FIXES) ====================
614
 
615
- def _chunk_python_ast(self, text: str, filename: str) -> List[Dict[str, Any]]:
616
- """Enhanced AST chunker with better context preservation"""
617
  chunks = []
618
  try:
619
  tree = ast.parse(text)
620
  lines = text.splitlines()
621
 
622
- global_context = []
623
-
624
- for node in tree.body:
625
- if isinstance(node, (ast.FunctionDef, ast.ClassDef, ast.AsyncFunctionDef)):
626
- start = max(0, node.lineno - 4)
627
- end = node.end_lineno + 2
628
- block_text = "\n".join(lines[start:end])
629
630
  chunks.append({
631
- "text": f"File: {filename} | Type: {type(node).__name__} | Name: {node.name} | Content: {block_text}",
632
- "type": "code_function",
633
  "name": node.name,
634
  "line_start": start,
635
  "line_end": end
636
  })
637
- elif isinstance(node, (ast.Import, ast.ImportFrom, ast.Assign, ast.Expr)):
638
- if hasattr(node, 'end_lineno'):
639
- start = node.lineno - 1
640
- end = node.end_lineno
641
- global_context.append("\n".join(lines[start:end]))
642
 
643
- # Add global context as a separate chunk
644
  if global_context:
645
- full_global = "\n".join(global_context)
646
- if len(full_global) > 50:
647
- chunks.insert(0, {
648
- "text": f"File: {filename} | Type: imports_and_globals | Content: {full_global[:2000]}",
649
- "type": "code_context",
650
- "name": "imports_and_globals"
651
- })
652
-
653
  except Exception as e:
654
- logger.warning(f"AST parsing failed for {filename}: {e}")
655
- return self._chunk_text_enhanced(text)
656
-
657
- if not chunks:
658
- return self._chunk_text_enhanced(text)
659
-
660
  return chunks
661
 
662
  def _chunk_smart_code(self, text: str, filename: str) -> List[Dict[str, Any]]:
@@ -670,10 +1303,10 @@ class VectorDatabase:
670
  '.htm': r'(?=\n\s*<[^/])',
671
  '.xml': r'(?=\n\s*<[^/])',
672
  '.vue': r'(?=\n\s*<[^/])',
673
- '.js': r'(?=\n\s*(?:function|class|const|let|var|export|import|async|def|if|for|while|switch))',
674
- '.jsx': r'(?=\n\s*(?:function|class|const|let|var|export|import|async|def|if|for|while|switch))',
675
- '.ts': r'(?=\n\s*(?:function|class|const|let|var|export|import|async|interface|type|def|if|for|while))',
676
- '.tsx': r'(?=\n\s*(?:function|class|const|let|var|export|import|async|interface|type|def|if|for|while))',
677
  '.css': r'(?=\n\s*[.#@a-zA-Z])',
678
  '.scss': r'(?=\n\s*[.#@a-zA-Z])',
679
  '.java': r'(?=\n\s*(?:public|private|protected|class|interface|enum|@))',
@@ -691,7 +1324,7 @@ class VectorDatabase:
691
 
692
  # Process with CONTEXT OVERLAP for better retrieval
693
  current_chunk = ""
694
- TARGET_SIZE = 800
695
  OVERLAP_SIZE = 100
696
 
697
  for seg_idx, seg in enumerate(segments):
@@ -784,6 +1417,8 @@ class VectorDatabase:
784
 
785
  return chunks
786
 
 
 
787
  # ==================== HELPER METHODS FOR HYBRID SEARCH ====================
788
 
789
  def _classify_query(self, query: str) -> str:
@@ -857,92 +1492,47 @@ class VectorDatabase:
857
 
858
  return results[:final_k]
859
 
860
- def _reciprocal_rank_fusion(self, results1: List[Dict], results2: List[Dict],
861
- final_k: int, k: int = 60) -> List[Dict]:
862
- """Combine results using Reciprocal Rank Fusion (RRF)"""
863
- # Create rank dictionaries
864
- rank_map1 = {r["id"]: rank + 1 for rank, r in enumerate(results1)}
865
- rank_map2 = {r["id"]: rank + 1 for rank, r in enumerate(results2)}
866
-
867
- # Get all unique IDs
868
- all_ids = set(rank_map1.keys()) | set(rank_map2.keys())
869
-
870
- # Calculate RRF scores
871
- rrf_scores = []
872
- for doc_id in all_ids:
873
- score = 0.0
874
- if doc_id in rank_map1:
875
- score += 1.0 / (rank_map1[doc_id] + k)
876
- if doc_id in rank_map2:
877
- score += 1.0 / (rank_map2[doc_id] + k)
878
- rrf_scores.append((doc_id, score))
879
-
880
- # Sort by RRF score
881
- rrf_scores.sort(key=lambda x: x[1], reverse=True)
882
-
883
- # Create result mapping for quick lookup
884
- results_map = {}
885
- for r in results1 + results2:
886
- if r["id"] not in results_map:
887
- results_map[r["id"]] = r
888
-
889
- # Build final results
890
- combined_results = []
891
- for doc_id, rrf_score in rrf_scores:
892
- if doc_id in results_map:
893
- result = results_map[doc_id].copy()
894
- result["score"] = rrf_score
895
- result["match_type"] = "rrf_fusion"
896
- combined_results.append(result)
897
-
898
- return combined_results[:final_k]
899
-
900
- def _weighted_fusion(self, bm25_results: List[Dict], semantic_results: List[Dict],
901
- final_k: int, bm25_weight: float = 0.4,
902
- semantic_weight: float = 0.6) -> List[Dict]:
903
- """Weighted combination of BM25 and semantic scores"""
904
- # Normalize scores within each result set
905
- def normalize_scores(results):
906
- if not results:
907
- return {}
908
- max_score = max(r["score"] for r in results) if results else 1.0
909
- if max_score == 0:
910
- max_score = 1.0
911
- return {r["id"]: r["score"] / max_score for r in results}
912
-
913
- bm25_scores = normalize_scores(bm25_results)
914
- semantic_scores = normalize_scores(semantic_results)
915
-
916
- # Get all unique IDs
917
- all_ids = set(bm25_scores.keys()) | set(semantic_scores.keys())
918
-
919
- # Calculate weighted scores
920
- weighted_scores = []
921
- for doc_id in all_ids:
922
- bm25_score = bm25_scores.get(doc_id, 0.0)
923
- semantic_score = semantic_scores.get(doc_id, 0.0)
924
- weighted = (bm25_score * bm25_weight) + (semantic_score * semantic_weight)
925
- weighted_scores.append((doc_id, weighted))
926
-
927
- # Sort by weighted score
928
- weighted_scores.sort(key=lambda x: x[1], reverse=True)
929
-
930
- # Create result mapping
931
- results_map = {}
932
- for r in bm25_results + semantic_results:
933
- if r["id"] not in results_map:
934
- results_map[r["id"]] = r
935
-
936
- # Build final results
937
- combined_results = []
938
- for doc_id, weighted_score in weighted_scores:
939
- if doc_id in results_map:
940
- result = results_map[doc_id].copy()
941
- result["score"] = weighted_score
942
- result["match_type"] = "weighted_fusion"
943
- combined_results.append(result)
944
-
945
- return combined_results[:final_k]
946
 
947
  def _smart_rerank(self, query: str, candidates: List[Dict], final_k: int) -> List[Dict]:
948
  """Smart reranking using cross-encoder"""
 
14
  import gc
15
  from typing import List, Dict, Any, Optional, Tuple, Union
16
  from collections import defaultdict, OrderedDict # <-- FIX 1: Add OrderedDict
17
+ try:
18
+ import tree_sitter
19
+ from tree_sitter import Language, Parser
20
+ # Import individual language modules
21
+ try:
22
+ from tree_sitter_languages import get_language, get_parser
23
+ TREE_SITTER_IMPORTS_AVAILABLE = True
24
+ except ImportError:
25
+ TREE_SITTER_IMPORTS_AVAILABLE = False
26
+
27
+ TREE_SITTER_AVAILABLE = True
28
+ logger = logging.getLogger("NeuralSessionEngine")
29
+ logger.info("🌳 Tree-sitter successfully imported")
30
+
31
+ # Initialize parsers dictionary
32
+ TREE_SITTER_PARSERS = {}
33
+ TREE_SITTER_LANGUAGES = {}
34
+
35
+ except ImportError as e:
36
+ TREE_SITTER_AVAILABLE = False
37
+ TREE_SITTER_IMPORTS_AVAILABLE = False
38
+ logging.warning(f"❌ Tree-sitter import failed: {e}")
39
+ logging.warning("Install: pip install tree-sitter tree-sitter-languages")
40
 
41
+ # === HYBRID SEARCH IMPORTS ===
42
  try:
43
  from rank_bm25 import BM25Okapi
44
  BM25_AVAILABLE = True
 
61
  )
62
  logger = logging.getLogger("NeuralSessionEngine")
63
 
64
+
65
  class VectorDatabase:
66
  def __init__(self, index_path="faiss_session_index.bin", metadata_path="session_metadata.pkl"):
67
  self.index_path = index_path
 
82
  logger.error(f"❌ Failed to load models: {e}")
83
  raise RuntimeError(f"Model initialization failed: {e}")
84
 
85
+ self.tree_sitter_parsers = {}
86
+ self.tree_sitter_languages = {}
87
  # Load or create index with file locking
88
  self._load_or_create_index()
89
 
 
118
 
119
  # ==================== FIX 2: LAZY BM25 LOADING ====================
120
 
121
  def _get_or_build_bm25(self, user_id: str, chat_id: str) -> Optional[BM25Okapi]:
122
  """
123
  Retrieve BM25 index from cache or build it on-demand (Lazy Load).
 
217
 
218
  # ==================== ENHANCED STORAGE WITH CACHE INVALIDATION ====================
219
 
220
  def store_session_document(self, text: str, filename: str, user_id: str, chat_id: str, file_id: str = None) -> bool:
221
+ """Store extracted file content with enhanced chunking and cache invalidation"""
222
  if not text or len(text) < 10 or not user_id:
223
  logger.warning(f"Invalid input for {filename}")
224
  return False
 
228
  chunks_data = []
229
  ext = os.path.splitext(filename)[1].lower()
230
 
 
231
  try:
232
+ if TREE_SITTER_AVAILABLE and ext in [
233
+ '.py', '.js', '.jsx', '.ts', '.tsx', '.java', '.cpp', '.c', '.cc',
234
+ '.go', '.rs', '.php', '.rb', '.cs', '.swift', '.kt', '.scala',
235
+ '.lua', '.r', '.sh', '.bash', '.sql', '.html', '.css', '.xml',
236
+ '.json', '.yaml', '.yml', '.toml', '.vue', '.md'
237
+ ]:
238
+ chunks_data = self._chunk_with_tree_sitter(text, filename)
239
+ logger.debug(f"Used Tree-sitter for {filename}")
240
+ elif ext == '.py':
241
+ chunks_data = self._chunk_python_ast_enhanced(text, filename)
242
  elif ext in ['.js', '.html', '.css', '.java', '.cpp', '.ts', '.tsx', '.jsx', '.vue', '.xml', '.scss']:
243
  chunks_data = self._chunk_smart_code(text, filename)
244
  else:
 
247
  logger.error(f"Chunking failed for {filename}: {e}")
248
  chunks_data = self._chunk_text_enhanced(text, chunk_size=600, overlap=100)
249
 
 
250
  if not chunks_data and text:
251
+ chunks_data = [{
252
+ "text": text[:2000],
253
+ "type": "fallback",
254
+ "name": "full_document"
255
+ }]
256
 
257
  if not chunks_data:
258
  logger.error(f"No chunks generated for {filename}")
 
262
  final_meta = []
263
 
264
  for chunk in chunks_data:
 
265
  final_texts.append(chunk["text"])
266
  final_meta.append({
267
  "text": chunk["text"],
268
  "source": filename,
269
  "file_id": file_id,
270
+ "type": "file",
271
  "subtype": chunk.get("type", "general"),
272
  "name": chunk.get("name", "unknown"),
273
  "user_id": user_id,
274
  "chat_id": chat_id,
275
+ "timestamp": time.time(),
276
+ "chunk_index": len(final_texts)
277
  })
278
 
279
+ # Whole file embedding for comprehensive answers
280
+ whole_file_text = text[:4000] if len(text) > 4000 else text
281
+ final_texts.append(f"Complete File: {filename} | Full Content: {whole_file_text}")
282
  final_meta.append({
283
+ "text": whole_file_text,
284
+ "actual_content": text,
285
  "source": filename,
286
  "file_id": file_id,
287
  "type": "file",
 
294
  })
295
 
296
  try:
297
+ # Optimized embedding
298
  embeddings = self.embedder.encode(
299
  final_texts,
300
  show_progress_bar=False,
 
310
  self.metadata.extend(final_meta)
311
  self._save_index()
312
 
313
+ logger.info(f"✅ Stored {len(final_texts)} chunks from {filename} for user {user_id[:8]}")
314
+
315
+ # ===== FIX 4: CACHE INVALIDATION instead of Immediate Rebuild =====
316
+ # When new files arrive, just invalidate the old cache.
317
+ # It will auto-rebuild (including the new file) on next search.
318
  self._invalidate_bm25_cache(user_id, chat_id)
319
 
320
+ self._verify_storage(user_id, chat_id, len(final_texts))
321
+
322
  return True
323
 
324
  except Exception as e:
325
  logger.error(f"❌ Failed to store vectors for {filename}: {e}")
326
+ # Clean up partial storage
327
  with self.memory_lock:
328
  if self.index.ntotal >= len(final_texts):
329
+ logger.warning("Rolling back partial storage...")
330
  self._rollback_partial_storage(user_id, chat_id)
331
  return False
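For orientation, a minimal standalone sketch of the embed-then-add flow this method wraps; the model name and index type here are assumptions (the real ones are configured in __init__), but the normalize, add, and metadata-extend pattern matches the code above:

# Hypothetical minimal sketch of the embed -> index.add -> metadata.extend flow.
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("all-MiniLM-L6-v2")            # assumed model
index = faiss.IndexFlatIP(embedder.get_sentence_embedding_dimension())
metadata = []

def store(texts, meta):
    vecs = embedder.encode(texts, show_progress_bar=False, normalize_embeddings=True)
    index.add(np.asarray(vecs, dtype="float32"))   # vectors and metadata stay aligned by position
    metadata.extend(meta)

store(["def login(): ...", "Complete File: app.py | Full Content: ..."],
      [{"type": "file", "source": "app.py"}, {"type": "file", "source": "app.py"}])
print(index.ntotal, len(metadata))   # 2 2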
332
 
333
+ def _get_tree_sitter_parser(self, language_name: str) -> Optional[Any]:
334
+ """Get or create a tree-sitter parser for a specific language (Robust Loader)."""
335
+ if not TREE_SITTER_AVAILABLE:
336
+ return None
337
+
338
+ # 1. CHECK CACHE FIRST
339
+ if language_name in self.tree_sitter_parsers:
340
+ return self.tree_sitter_parsers[language_name]
341
+
342
+ # 2. DEFINE MAP EARLY (Critical for fallback logic)
343
+ lang_lib_map = {
344
+ 'python': 'tree_sitter_python',
345
+ 'javascript': 'tree_sitter_javascript',
346
+ 'typescript': 'tree_sitter_typescript',
347
+ 'java': 'tree_sitter_java',
348
+ 'cpp': 'tree_sitter_cpp',
349
+ 'c': 'tree_sitter_c',
350
+ 'go': 'tree_sitter_go',
351
+ 'rust': 'tree_sitter_rust',
352
+ 'php': 'tree_sitter_php',
353
+ 'ruby': 'tree_sitter_ruby',
354
+ 'c_sharp': 'tree_sitter_c_sharp',
355
+ 'swift': 'tree_sitter_swift',
356
+ 'kotlin': 'tree_sitter_kotlin',
357
+ 'scala': 'tree_sitter_scala',
358
+ 'html': 'tree_sitter_html',
359
+ 'css': 'tree_sitter_css',
360
+ 'json': 'tree_sitter_json',
361
+ 'yaml': 'tree_sitter_yaml',
362
+ 'toml': 'tree_sitter_toml',
363
+ 'xml': 'tree_sitter_xml',
364
+ 'markdown': 'tree_sitter_markdown',
365
+ 'bash': 'tree_sitter_bash',
366
+ 'sql': 'tree_sitter_sql'
367
+ }
368
+
369
+ try:
370
+ logger.debug(f"🌳 Creating parser for {language_name}")
371
+
372
+ # 3. PLAN A: Try using tree_sitter_languages (The Easy Way)
373
+ if TREE_SITTER_IMPORTS_AVAILABLE:
374
+ try:
375
+ parser = get_parser(language_name)
376
+ if parser:
377
+ self.tree_sitter_parsers[language_name] = parser
378
+ # self.tree_sitter_languages[language_name] = ... (helper handles this usually)
379
+ logger.debug(f"✅ Got parser for {language_name} via tree_sitter_languages")
380
+ return parser
381
+ except Exception as e:
382
+ logger.warning(f"⚠️ Plan A failed (tree_sitter_languages) for {language_name}: {e}")
383
+
384
+ # 4. PLAN B: Manual Loading (The Robust Way)
385
+ # This handles cases where the helper lib fails but the specific lang lib is installed
386
+ if language_name in lang_lib_map:
387
+ lib_name = lang_lib_map[language_name]
388
+ try:
389
+ parser = Parser()
390
+ language = None
391
+
392
+ # Import the specific module
393
+ module = __import__(lib_name)
394
+
395
+ # Extract Language object (supports both Property and Function styles)
396
+ if hasattr(module, 'language'):
397
+ lang_obj = module.language
398
+ if callable(lang_obj):
399
+ language = lang_obj()
400
+ else:
401
+ language = lang_obj
402
+
403
+ if language:
404
+ parser.set_language(language)
405
+ self.tree_sitter_parsers[language_name] = parser
406
+ self.tree_sitter_languages[language_name] = language
407
+ logger.debug(f"✅ Loaded {language_name} manually from {lib_name}")
408
+ return parser
409
+
410
+ except ImportError:
411
+ # Silence this warning usually, or log debug if needed
412
+ logger.debug(f"⚠️ Manual load skipped: {lib_name} not installed.")
413
+ except Exception as e:
414
+ logger.warning(f"❌ Manual load error for {lib_name}: {e}")
415
+
416
+ logger.warning(f"❌ Could not load parser for {language_name} (Plan A and B failed)")
417
+ return None
418
+
419
+ except Exception as e:
420
+ logger.error(f"❌ Critical parser error for {language_name}: {e}")
421
+ return None
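Before the full chunker below, a minimal standalone sketch of the parse-and-walk pattern it builds on, using the Plan A helper (tree_sitter_languages); byte offsets and start_point are read the same way as in the methods that follow:

# Minimal sketch: parse Python source with tree-sitter and list function definitions.
from tree_sitter_languages import get_parser

source = b"def greet(name):\n    return 'hi ' + name\n"
parser = get_parser("python")
tree = parser.parse(source)

def walk(node):
    if node.type == "function_definition":
        snippet = source[node.start_byte:node.end_byte].decode("utf-8", errors="ignore")
        print(f"line {node.start_point[0] + 1}: {snippet.splitlines()[0]}")
    for child in node.children:
        walk(child)

walk(tree.root_node)   # line 1: def greet(name):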
422
+
423
+ def _chunk_with_tree_sitter(self, text: str, filename: str) -> List[Dict[str, Any]]:
424
+ """
425
+ ENHANCED Tree-sitter based code chunking with hybrid language support.
426
+ Now properly handles files with multiple languages (HTML/CSS/JS, Vue, etc.)
427
+ """
428
+ if not TREE_SITTER_AVAILABLE:
429
+ logger.warning("❌ TREE-SITTER UNAVAILABLE: Falling back to alternative methods")
430
+ ext = os.path.splitext(filename)[1].lower()
431
+ if ext == '.py':
432
+ return self._chunk_python_ast_enhanced(text, filename)
433
+ return self._chunk_smart_code(text, filename)
434
+
435
+ ext = os.path.splitext(filename)[1].lower()
436
+
437
+ # Map extensions to tree-sitter language names
438
+ language_map = {
439
+ '.py': 'python',
440
+ '.js': 'javascript',
441
+ '.jsx': 'javascript',
442
+ '.ts': 'typescript',
443
+ '.tsx': 'typescript',
444
+ '.java': 'java',
445
+ '.cpp': 'cpp',
446
+ '.c': 'c',
447
+ '.cc': 'cpp',
448
+ '.h': 'c',
449
+ '.hpp': 'cpp',
450
+ '.go': 'go',
451
+ '.rs': 'rust',
452
+ '.php': 'php',
453
+ '.rb': 'ruby',
454
+ '.cs': 'c_sharp',
455
+ '.swift': 'swift',
456
+ '.kt': 'kotlin',
457
+ '.kts': 'kotlin',
458
+ '.scala': 'scala',
459
+ '.lua': 'lua',
460
+ '.r': 'r',
461
+ '.sh': 'bash',
462
+ '.bash': 'bash',
463
+ '.zsh': 'bash',
464
+ '.sql': 'sql',
465
+ '.html': 'html',
466
+ '.htm': 'html',
467
+ '.css': 'css',
468
+ '.scss': 'css',
469
+ '.sass': 'css',
470
+ '.json': 'json',
471
+ '.yaml': 'yaml',
472
+ '.yml': 'yaml',
473
+ '.toml': 'toml',
474
+ '.xml': 'xml',
475
+ '.vue': 'vue',
476
+ '.md': 'markdown',
477
+ }
478
+
479
+ language_name = language_map.get(ext)
480
+ if not language_name:
481
+ logger.warning(f"🌐 NO PARSER FOR EXTENSION: {ext} for {filename}, falling back to smart chunking")
482
+ return self._chunk_smart_code(text, filename)
483
+
484
+ # Define fallback chains for robust parsing
485
+ fallback_sequence = [language_name]
486
+
487
+ if language_name == 'javascript':
488
+ fallback_sequence = ['javascript', 'tsx', 'typescript']
489
+ elif language_name == 'typescript':
490
+ fallback_sequence = ['typescript', 'tsx']
491
+ elif language_name == 'jsx':
492
+ fallback_sequence = ['javascript', 'tsx']
493
+ elif language_name == 'tsx':
494
+ fallback_sequence = ['tsx', 'typescript']
495
+
496
+ # Special handling for hybrid language files
497
+ if language_name in ['html', 'vue']:
498
+ return self._chunk_hybrid_file(text, filename, language_name)
499
+
500
+ return self._chunk_single_language(text, filename, fallback_sequence)
501
+
502
+ def _chunk_single_language(self, text: str, filename: str, language_names: Union[str, List[str]]) -> List[Dict[str, Any]]:
503
+ """Chunk a file with a single programming language, trying multiple parsers if needed."""
504
+ if isinstance(language_names, str):
505
+ language_names = [language_names]
506
+
507
+ chunks = []
508
+
509
+ for lang in language_names:
510
+ try:
511
+ parser = self._get_tree_sitter_parser(lang)
512
+ if not parser:
513
+ continue
514
+
515
+ # Ensure text is bytes for tree-sitter
516
+ text_bytes = bytes(text, 'utf-8')
517
+ tree = parser.parse(text_bytes)
518
+ root_node = tree.root_node
519
+
520
+ # CRITICAL CHECK: If root is ERROR, this parser failed completely
521
+ if not root_node or root_node.type == 'ERROR':
522
+ logger.warning(f"⚠️ Parser {lang} failed (Root ERROR) for {filename}. Trying next..." if len(language_names) > 1 else f"⚠️ Parser {lang} failed for {filename}")
523
+ continue
524
+
525
+ # Define node types to extract based on language
526
+ node_types_config = self._get_node_types_config(lang)
527
+ target_types = node_types_config.get('extract', [])
528
+ skip_types = node_types_config.get('skip', [])
529
+ name_fields = node_types_config.get('name_fields', ['identifier', 'name'])
530
+
531
+ local_chunks = []
532
+
533
+ # Helper to extract node text with context
534
+ def extract_node_with_context(node, node_type, current_lang):
535
+ start_line = node.start_point[0]
536
+ end_line = node.end_point[0]
537
+
538
+ # Adjust context based on language type
539
+ context_config = node_types_config.get('context', {})
540
+ context_before = context_config.get('before', 5)
541
+ context_after = context_config.get('after', 5)
542
+
543
+ # Extract the node text
544
+ node_text = text_bytes[node.start_byte:node.end_byte].decode('utf-8', errors='ignore')
545
+
546
+ # Get context lines
547
+ lines = text.splitlines()
548
+ context_start = max(0, start_line - context_before)
549
+ context_end = min(len(lines), end_line + context_after + 1)
550
+
551
+ # Build context segment
552
+ if context_start < start_line or context_end > end_line + 1:
553
+ segment_lines = lines[context_start:context_end]
554
+ segment = '\n'.join(segment_lines)
555
+ else:
556
+ segment = node_text
557
+
558
+ # Extract node name
559
+ node_name = self._extract_node_name(node, text_bytes, name_fields)
560
+ if not node_name:
561
+ node_name = f"{node_type}_{start_line + 1}"
562
+
563
+ return {
564
+ "text": f"File: {filename} | Type: {node_type} | Name: {node_name}\n{segment}",
565
+ "type": f"code_{node_type}",
566
+ "name": node_name,
567
+ "line_start": start_line + 1,
568
+ "line_end": end_line + 1,
569
+ "context_start": context_start + 1,
570
+ "context_end": context_end,
571
+ "language": current_lang
572
+ }
573
+
574
+ # Recursively find target nodes
575
+ def find_target_nodes(node, depth=0):
576
+ if depth > 200: # Prevent infinite recursion
577
+ return
578
+
579
+ if node.type in skip_types:
580
+ return
581
+
582
+ if node.type in target_types:
583
+ extract = True
584
+ # Heuristic: If node has ERROR child, it might be granularly broken
585
+ # But for now we accept it unless it's total garbage
586
+ if extract:
587
+ local_chunks.append(extract_node_with_context(node, node.type, lang))
588
+
589
+ for child in node.children:
590
+ find_target_nodes(child, depth + 1)
591
+
592
+ # Start traversal
593
+ find_target_nodes(root_node)
594
+
595
+ # Add imports/top-level declarations
596
+ import_chunks = self._extract_imports(root_node, text_bytes, lang, filename)
597
+ if import_chunks:
598
+ local_chunks = import_chunks + local_chunks
599
+
600
+ # Success criteria: If we found chunks, we consider this parser successful
601
+ if local_chunks:
602
+ chunks = local_chunks
603
+ logger.info(f"✅ TREE-SITTER SUCCESS: Parsed {filename} with ({lang}) into {len(chunks)} chunks")
604
+ return chunks
605
+
606
+ # If no chunks found, it might mean the parser didn't match anything useful (or syntax was weird)
607
+ # We continue to next parser if available
608
+ logger.debug(f"ℹ️ Parser {lang} yielded 0 chunks for {filename}. Trying next...")
609
+
610
+ except Exception as e:
611
+ logger.warning(f"⚠️ Parser {lang} exception for {filename}: {e}")
612
+ continue
613
+
614
+ # If we get here, all parsers failed or returned 0 chunks
615
+ logger.warning(f"❌ ALL Parsers failed for {filename}, falling back to smart chunking")
616
+ # Final fallback check
617
+ ext = os.path.splitext(filename)[1].lower()
618
+ if ext == '.py':
619
+ return self._chunk_python_ast_enhanced(text, filename)
620
+ return self._chunk_smart_code(text, filename)
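For reference, each entry emitted by extract_node_with_context above is a plain dict; a representative value (the figures are made up) looks like:

# Illustrative chunk shape; keys mirror extract_node_with_context, values are hypothetical.
{
    "text": "File: auth.py | Type: function_definition | Name: login\ndef login(user): ...",
    "type": "code_function_definition",
    "name": "login",
    "line_start": 12,
    "line_end": 28,
    "context_start": 10,
    "context_end": 33,
    "language": "python",
}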
621
+
622
+ def _chunk_hybrid_file(self, text: str, filename: str, primary_lang: str) -> List[Dict[str, Any]]:
623
+ """
624
+ Chunk files that contain multiple languages (HTML with CSS/JS, Vue files, etc.)
625
+ """
626
+ chunks = []
627
+
628
+ if primary_lang == 'html':
629
+ # Use regex-based approach for HTML to avoid tree-sitter issues
630
+ return self._chunk_html_with_embedded_languages(text, filename)
631
+
632
+ elif primary_lang == 'vue':
633
+ # Vue files have template, script, style sections
634
+ return self._chunk_vue_file(text, filename)
635
+
636
+ # Default fallback
637
+ return self._chunk_smart_code(text, filename)
638
+
639
+ def _chunk_html_with_embedded_languages(self, text: str, filename: str) -> List[Dict[str, Any]]:
640
+ """Chunk HTML files with embedded CSS and JavaScript."""
641
+ chunks = []
642
+
643
+ # Split HTML into sections
644
+ lines = text.splitlines()
645
+
646
+ # Find all script and style tags
647
+ script_pattern = re.compile(r'<script(\s[^>]*)?>([\s\S]*?)</script>', re.IGNORECASE)
648
+ style_pattern = re.compile(r'<style(\s[^>]*)?>([\s\S]*?)</style>', re.IGNORECASE)
649
+
650
+ # Extract and chunk script blocks
651
+ for match in script_pattern.finditer(text):
652
+ full_match = match.group(0)
653
+ attrs = match.group(1) or ""
654
+ content = match.group(2)
655
+
656
+ # Determine language
657
+ lang = 'javascript'
658
+ if 'type="text/typescript"' in attrs or 'lang="ts"' in attrs:
659
+ lang = 'typescript'
660
+
661
+ # Find line numbers
662
+ start_pos = match.start()
663
+ line_num = text[:start_pos].count('\n') + 1
664
+
665
+ # Chunk the script content
666
+ if content.strip():
667
+ script_chunks = self._chunk_single_language(content, filename, lang)
668
+ if script_chunks:
669
+ for chunk in script_chunks:
670
+ chunk['text'] = f"File: {filename} | In <script> block (starting line {line_num}) | Language: {lang}\n{chunk['text']}"
671
+ chunk['type'] = 'html_script_' + chunk['type']
672
+ chunk['language'] = lang
673
+ chunks.extend(script_chunks)
674
+
675
+ # Extract and chunk style blocks
676
+ for match in style_pattern.finditer(text):
677
+ full_match = match.group(0)
678
+ attrs = match.group(1) or ""
679
+ content = match.group(2)
680
+
681
+ # Determine language
682
+ lang = 'css'
683
+ if 'lang="scss"' in attrs:
684
+ lang = 'css' # Treat SCSS as CSS for now
685
+
686
+ # Find line numbers
687
+ start_pos = match.start()
688
+ line_num = text[:start_pos].count('\n') + 1
689
+
690
+ # Chunk the style content
691
+ if content.strip():
692
+ style_chunks = self._chunk_single_language(content, filename, lang)
693
+ if style_chunks:
694
+ for chunk in style_chunks:
695
+ chunk['text'] = f"File: {filename} | In <style> block (starting line {line_num}) | Language: {lang}\n{chunk['text']}"
696
+ chunk['type'] = 'html_style_' + chunk['type']
697
+ chunk['language'] = lang
698
+ chunks.extend(style_chunks)
699
+
700
+ # Chunk remaining HTML content
701
+ # Remove script and style blocks for HTML-only chunking
702
+ html_only = text
703
+ for match in script_pattern.finditer(text):
704
+ # Calculate line numbers separately to avoid backslash in f-string
705
+ start_line = text[:match.start()].count('\n') + 1
706
+ end_line = text[:match.end()].count('\n') + 1
707
+ html_only = html_only.replace(match.group(0), f"<!-- SCRIPT BLOCK REMOVED (lines {start_line}-{end_line}) -->")
708
+
709
+ for match in style_pattern.finditer(text):
710
+ # Calculate line numbers separately to avoid backslash in f-string
711
+ start_line = text[:match.start()].count('\n') + 1
712
+ end_line = text[:match.end()].count('\n') + 1
713
+ html_only = html_only.replace(match.group(0), f"<!-- STYLE BLOCK REMOVED (lines {start_line}-{end_line}) -->")
714
+
715
+ # Use smart chunking for HTML
716
+ html_chunks = self._chunk_smart_code(html_only, filename)
717
+ if html_chunks:
718
+ for chunk in html_chunks:
719
+ chunk['type'] = 'html_' + chunk['type']
720
+ chunk['language'] = 'html'
721
+ chunks.extend(html_chunks)
722
+
723
+ if not chunks:
724
+ return self._chunk_smart_code(text, filename)
725
+
726
+ logger.info(f"✅ HYBRID HTML PARSED: {filename} into {len(chunks)} mixed-language chunks")
727
+ return chunks
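A quick standalone check of the <script>/<style> extraction regexes used above on a toy document (illustrative only):

# Demonstration of the script/style extraction regexes and the line-number arithmetic.
import re

html = """<html><head>
<style>body { color: red; }</style>
</head><body>
<script type="text/javascript">console.log('hi');</script>
</body></html>"""

script_pattern = re.compile(r'<script(\s[^>]*)?>([\s\S]*?)</script>', re.IGNORECASE)
style_pattern = re.compile(r'<style(\s[^>]*)?>([\s\S]*?)</style>', re.IGNORECASE)

for m in script_pattern.finditer(html):
    print("script at line", html[:m.start()].count("\n") + 1, "->", m.group(2).strip())
for m in style_pattern.finditer(html):
    print("style at line", html[:m.start()].count("\n") + 1, "->", m.group(2).strip())
# script at line 4 -> console.log('hi');
# style at line 2 -> body { color: red; }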
728
+
729
+ def _chunk_vue_file(self, text: str, filename: str) -> List[Dict[str, Any]]:
730
+ """Chunk Vue.js files with template, script, and style sections."""
731
+ chunks = []
732
+
733
+ # Extract template section
734
+ template_match = re.search(r'<template[^>]*>([\s\S]*?)</template>', text)
735
+ if template_match:
736
+ template_content = template_match.group(1)
737
+ # Find line numbers
738
+ start_pos = template_match.start()
739
+ line_num = text[:start_pos].count('\n') + 1
740
+
741
+ # Chunk template (treat as HTML)
742
+ template_chunks = self._chunk_smart_code(template_content, filename)
743
+ if template_chunks:
744
+ for chunk in template_chunks:
745
+ chunk['text'] = f"File: {filename} | Vue Template Section (starting line {line_num})\n{chunk['text']}"
746
+ chunk['type'] = 'vue_template_' + chunk['type']
747
+ chunk['language'] = 'html'
748
+ chunks.extend(template_chunks)
749
+
750
+ # Extract script section
751
+ script_match = re.search(r'<script[^>]*>([\s\S]*?)</script>', text, re.DOTALL)
752
+ if script_match:
753
+ script_content = script_match.group(1)
754
+ attrs = script_match.group(0)[:script_match.group(0).index('>')]
755
+ # Find line numbers
756
+ start_pos = script_match.start()
757
+ line_num = text[:start_pos].count('\n') + 1
758
+
759
+ # Detect language
760
+ lang = 'javascript'
761
+ if 'lang="ts"' in attrs or 'lang="typescript"' in attrs:
762
+ lang = 'typescript'
763
+
764
+ # Chunk script
765
+ script_chunks = self._chunk_single_language(script_content, filename, lang)
766
+ if script_chunks:
767
+ for chunk in script_chunks:
768
+ chunk['text'] = f"File: {filename} | Vue Script Section (starting line {line_num}) | Language: {lang}\n{chunk['text']}"
769
+ chunk['type'] = 'vue_script_' + chunk['type']
770
+ chunk['language'] = lang
771
+ chunks.extend(script_chunks)
772
+
773
+ # Extract style section
774
+ style_match = re.search(r'<style[^>]*>([\s\S]*?)</style>', text, re.DOTALL)
775
+ if style_match:
776
+ style_content = style_match.group(1)
777
+ attrs = style_match.group(0)[:style_match.group(0).index('>')]
778
+ # Find line numbers
779
+ start_pos = style_match.start()
780
+ line_num = text[:start_pos].count('\n') + 1
781
+
782
+ # Detect language
783
+ lang = 'css'
784
+ if 'lang="scss"' in attrs:
785
+ lang = 'css' # Treat SCSS as CSS
786
+
787
+ # Chunk style
788
+ style_chunks = self._chunk_single_language(style_content, filename, lang)
789
+ if style_chunks:
790
+ for chunk in style_chunks:
791
+ chunk['text'] = f"File: {filename} | Vue Style Section (starting line {line_num}) | Language: {lang}\n{chunk['text']}"
792
+ chunk['type'] = 'vue_style_' + chunk['type']
793
+ chunk['language'] = lang
794
+ chunks.extend(style_chunks)
795
+
796
+ if not chunks:
797
+ return self._chunk_smart_code(text, filename)
798
+
799
+ logger.info(f"✅ VUE PARSED: {filename} into {len(chunks)} chunks")
800
+ return chunks
801
+
802
+ def _get_node_types_config(self, language_name: str) -> Dict[str, Any]:
803
+ """Get configuration for what node types to extract for each language."""
804
+ configs = {
805
+ 'python': {
806
+ 'extract': ['function_definition', 'class_definition', 'async_function_definition'],
807
+ 'skip': ['decorated_definition'],
808
+ 'name_fields': ['identifier', 'name'],
809
+ 'context': {'before': 2, 'after': 2}
810
+ },
811
+ 'javascript': {
812
+ 'extract': ['function_declaration', 'method_definition', 'class_declaration',
813
+ 'arrow_function', 'function_expression', 'variable_declaration',
814
+ 'export_statement'],
815
+ 'skip': [],
816
+ 'name_fields': ['identifier', 'name', 'property_identifier'],
817
+ 'context': {'before': 5, 'after': 5}
818
+ },
819
+ 'tsx': {
820
+ 'extract': ['function_declaration', 'method_declaration', 'class_declaration',
821
+ 'arrow_function', 'interface_declaration', 'type_alias_declaration',
822
+ 'enum_declaration', 'export_statement', 'variable_declaration',
823
+ 'lexical_declaration'
824
+ ],
825
+ 'skip': [],
826
+ 'name_fields': ['identifier', 'name', 'type_identifier'],
827
+ 'context': {'before': 2, 'after': 2}
828
+ },
829
+ 'java': {
830
+ 'extract': ['method_declaration', 'class_declaration', 'interface_declaration',
831
+ 'constructor_declaration'],
832
+ 'skip': [],
833
+ 'name_fields': ['identifier'],
834
+ 'context': {'before': 2, 'after': 2}
835
+ },
836
+ 'cpp': {
837
+ 'extract': ['function_definition', 'class_specifier', 'struct_specifier',
838
+ 'namespace_definition'],
839
+ 'skip': [],
840
+ 'name_fields': ['identifier', 'type_identifier'],
841
+ 'context': {'before': 2, 'after': 2}
842
+ },
843
+ 'c': {
844
+ 'extract': ['function_definition', 'struct_specifier', 'declaration'],
845
+ 'skip': [],
846
+ 'name_fields': ['identifier'],
847
+ 'context': {'before': 2, 'after': 2}
848
+ },
849
+ 'go': {
850
+ 'extract': ['function_declaration', 'method_declaration', 'type_declaration'],
851
+ 'skip': [],
852
+ 'name_fields': ['identifier'],
853
+ 'context': {'before': 2, 'after': 2}
854
+ },
855
+ 'rust': {
856
+ 'extract': ['function_item', 'impl_item', 'struct_item', 'trait_item',
857
+ 'enum_item', 'mod_item'],
858
+ 'skip': [],
859
+ 'name_fields': ['identifier'],
860
+ 'context': {'before': 2, 'after': 2}
861
+ },
862
+ 'html': {
863
+ 'extract': ['element', 'script_element', 'style_element'],
864
+ 'skip': ['text'],
865
+ 'name_fields': ['tag_name'],
866
+ 'context': {'before': 1, 'after': 1}
867
+ },
868
+ 'css': {
869
+ 'extract': ['rule_set', 'at_rule'],
870
+ 'skip': [],
871
+ 'name_fields': [],
872
+ 'context': {'before': 1, 'after': 1}
873
+ },
874
+ 'sql': {
875
+ 'extract': ['select_statement', 'insert_statement', 'update_statement',
876
+ 'delete_statement', 'create_statement'],
877
+ 'skip': [],
878
+ 'name_fields': [],
879
+ 'context': {'before': 1, 'after': 1}
880
+ }
881
+ }
882
+
883
+ return configs.get(language_name, {
884
+ 'extract': ['function_definition', 'class_definition'],
885
+ 'skip': [],
886
+ 'name_fields': ['identifier', 'name'],
887
+ 'context': {'before': 2, 'after': 2}
888
+ })
889
+
890
+ def _extract_node_name(self, node, text_bytes: bytes, name_fields: List[str]) -> str:
891
+ """Extract the name/identifier from a node."""
892
+ for field in name_fields:
893
+ for child in node.children:
894
+ if child.type == field:
895
+ return text_bytes[child.start_byte:child.end_byte].decode('utf-8', errors='ignore')
896
+
897
+ # Try to find any identifier
898
+ for child in node.children:
899
+ if 'identifier' in child.type or 'name' in child.type:
900
+ return text_bytes[child.start_byte:child.end_byte].decode('utf-8', errors='ignore')
901
+
902
+ return ""
903
+
904
+ def _extract_imports(self, root_node, text_bytes: bytes, language_name: str, filename: str) -> List[Dict[str, Any]]:
905
+ """Extract import statements from the code."""
906
+ import_chunks = []
907
+
908
+ import_types = {
909
+ 'python': ['import_statement', 'import_from_statement'],
910
+ 'javascript': ['import_statement', 'import_declaration'],
911
+ 'typescript': ['import_statement', 'import_declaration'],
912
+ 'java': ['import_declaration'],
913
+ 'cpp': ['preproc_include'],
914
+ 'rust': ['use_declaration'],
915
+ 'go': ['import_declaration'],
916
+ 'php': ['use_declaration'],
917
+ 'c_sharp': ['using_directive']
918
+ }
919
+
920
+ target_types = import_types.get(language_name, [])
921
+
922
+ def collect_imports(node):
923
+ if node.type in target_types:
924
+ import_text = text_bytes[node.start_byte:node.end_byte].decode('utf-8', errors='ignore')
925
+ if import_text:
926
+ import_chunks.append({
927
+ "text": f"File: {filename} | Import Statement:\n{import_text}",
928
+ "type": "code_imports",
929
+ "name": "imports",
930
+ "line_start": node.start_point[0] + 1,
931
+ "line_end": node.end_point[0] + 1,
932
+ "language": language_name
933
+ })
934
+
935
+ for child in node.children:
936
+ collect_imports(child)
937
+
938
+ collect_imports(root_node)
939
+
940
+ # Group imports if there are many
941
+ if len(import_chunks) > 5:
942
+ import_texts = []
943
+ for chunk in import_chunks:
944
+ # Extract just the import statement from the chunk text
945
+ import_lines = chunk['text'].split('\n', 1)
946
+ if len(import_lines) > 1:
947
+ import_texts.append(import_lines[1])
948
+
949
+ return [{
950
+ "text": f"File: {filename} | Import Statements:\n" + "\n".join(import_texts[:10]) +
951
+ (f"\n... and {len(import_texts) - 10} more" if len(import_texts) > 10 else ""),
952
+ "type": "code_imports",
953
+ "name": "imports_grouped",
954
+ "language": language_name
955
+ }]
956
+
957
+ return import_chunks
958
+
959
+ def _fallback_chunking(self, text: str, filename: str) -> List[Dict[str, Any]]:
960
+ """Fallback chunking method when tree-sitter fails."""
961
+ ext = os.path.splitext(filename)[1].lower()
962
+
963
+ if ext == '.py':
964
+ return self._chunk_python_ast_enhanced(text, filename)
965
+ elif ext in ['.js', '.jsx', '.ts', '.tsx', '.java', '.cpp', '.c', '.html', '.css', '.vue']:
966
+ return self._chunk_smart_code(text, filename)
967
+ else:
968
+ return self._chunk_text_enhanced(text)
969
+
970
  def delete_file(self, user_id: str, chat_id: str, file_id: str) -> bool:
971
  """Surgical Strike: Remove chunks belonging to a specific file ID"""
972
  with self.memory_lock:
 
1142
  results = self._semantic_first_fusion(semantic_results, bm25_results, final_k)
1143
  elif strategy == "fusion":
1144
  results = self._reciprocal_rank_fusion(bm25_results, semantic_results, final_k)
1145
+
 
1146
  else:
1147
  # Default to fusion
1148
  results = self._reciprocal_rank_fusion(bm25_results, semantic_results, final_k)
 
1211
 
1212
  # ==================== CORE METHODS (PRESERVED WITH FIXES) ====================
1213
 
1214
+ def _chunk_python_ast_enhanced(self, text: str, filename: str) -> List[Dict[str, Any]]:
 
1215
  chunks = []
1216
  try:
1217
  tree = ast.parse(text)
1218
  lines = text.splitlines()
1219
 
1220
+ # Helper to extract exact source including decorators
1221
+ def get_source_segment(node):
1222
+ # 1. Find start line (check decorators first)
1223
+ start_lineno = node.lineno
1224
+ if hasattr(node, 'decorator_list') and node.decorator_list:
1225
+ start_lineno = node.decorator_list[0].lineno
1226
+
1227
+ # 2. Add minimal context buffer (1 line)
1228
+ start_idx = max(0, start_lineno - 2)
1229
+ end_idx = getattr(node, 'end_lineno', start_lineno) + 1
1230
+
1231
+ return "\n".join(lines[start_idx:end_idx]), start_idx, end_idx
1232
+
1233
+ # Recursive visitor to flatten nested structures
1234
+ class CodeVisitor(ast.NodeVisitor):
1235
+ def visit_FunctionDef(self, node):
1236
+ self._add_chunk(node, "function")
1237
+ # Do NOT generic_visit chunks we've already handled to avoid duplicates
1238
+ # But DO visit nested functions if needed (optional)
1239
+
1240
+ def visit_AsyncFunctionDef(self, node):
1241
+ self._add_chunk(node, "async_function")
1242
+
1243
+ def visit_ClassDef(self, node):
1244
+ # 1. Create a "Summary Chunk" for the class definition (docstring + init)
1245
+ class_header, start, _ = get_source_segment(node)
1246
+ # Truncate body for the summary
1247
+ summary_text = f"Class Definition: {node.name}\n" + "\n".join(class_header.splitlines()[:10])
1248
+
1249
+ chunks.append({
1250
+ "text": f"File: {filename} | Type: class_def | Name: {node.name}\n{summary_text}",
1251
+ "type": "code_class",
1252
+ "name": node.name,
1253
+ "line_start": start
1254
+ })
1255
 
1256
+ # 2. Recursively visit children (methods)
1257
+ self.generic_visit(node)
1258
+
1259
+ def _add_chunk(self, node, type_label):
1260
+ content, start, end = get_source_segment(node)
1261
+ # Enforce context window limits here if needed
1262
  chunks.append({
1263
+ "text": f"File: {filename} | Type: {type_label} | Name: {node.name}\n{content}",
1264
+ "type": f"code_{type_label}",
1265
  "name": node.name,
1266
  "line_start": start,
1267
  "line_end": end
1268
  })
1269
+
1270
+ # Run the visitor
1271
+ CodeVisitor().visit(tree)
 
 
1272
 
1273
+ # Capture Globals (Imports, Constants, Main Guard)
1274
+ global_context = []
1275
+ for node in tree.body:
1276
+ if isinstance(node, (ast.Import, ast.ImportFrom, ast.Assign, ast.If)):
1277
+ # Only capture short logic blocks, skip giant if-blocks
1278
+ segment, _, _ = get_source_segment(node)
1279
+ if len(segment) < 500:
1280
+ global_context.append(segment)
1281
+
1282
  if global_context:
1283
+ chunks.insert(0, {
1284
+ "text": f"File: {filename} | Global Context\n" + "\n".join(global_context),
1285
+ "type": "code_globals",
1286
+ "name": "globals"
1287
+ })
1288
+
 
 
1289
  except Exception as e:
1290
+ logger.warning(f"AST Parsing failed: {e}")
1291
+ return self._chunk_text_enhanced(text) # Fallback
1292
+
 
 
 
1293
  return chunks
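A minimal standalone illustration of the ast.NodeVisitor pattern the rewrite above relies on, independent of the class internals:

# Tiny demonstration of ast.NodeVisitor collecting function/class names with line spans.
import ast

source = """import os

class Greeter:
    def hello(self, name):
        return f"hi {name}"

async def main():
    pass
"""

found = []

class Collector(ast.NodeVisitor):
    def visit_FunctionDef(self, node):
        found.append(("function", node.name, node.lineno, node.end_lineno))
        self.generic_visit(node)

    def visit_AsyncFunctionDef(self, node):
        found.append(("async_function", node.name, node.lineno, node.end_lineno))

    def visit_ClassDef(self, node):
        found.append(("class", node.name, node.lineno, node.end_lineno))
        self.generic_visit(node)

Collector().visit(ast.parse(source))
print(found)
# [('class', 'Greeter', 3, 5), ('function', 'hello', 4, 5), ('async_function', 'main', 7, 8)]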
1294
 
1295
  def _chunk_smart_code(self, text: str, filename: str) -> List[Dict[str, Any]]:
 
1303
  '.htm': r'(?=\n\s*<[^/])',
1304
  '.xml': r'(?=\n\s*<[^/])',
1305
  '.vue': r'(?=\n\s*<[^/])',
1306
+ '.js': r'(?=\n\s*(?:function|class|export|import|async|def))',
1307
+ '.jsx': r'(?=\n\s*(?:function|class|export|import|async|def))',
1308
+ '.ts': r'(?=\n\s*(?:function|class|export|import|async|interface|type|def))',
1309
+ '.tsx': r'(?=\n\s*(?:function|class|export|import|async|interface|type|def))',
1310
  '.css': r'(?=\n\s*[.#@a-zA-Z])',
1311
  '.scss': r'(?=\n\s*[.#@a-zA-Z])',
1312
  '.java': r'(?=\n\s*(?:public|private|protected|class|interface|enum|@))',
 
1324
 
1325
  # Process with CONTEXT OVERLAP for better retrieval
1326
  current_chunk = ""
1327
+ TARGET_SIZE = 1900
1328
  OVERLAP_SIZE = 100
1329
 
1330
  for seg_idx, seg in enumerate(segments):
 
1417
 
1418
  return chunks
1419
 
1420
+
1421
+
1422
  # ==================== HELPER METHODS FOR HYBRID SEARCH ====================
1423
 
1424
  def _classify_query(self, query: str) -> str:
 
1492
 
1493
  return results[:final_k]
1494
 
1495
+ def _reciprocal_rank_fusion(self, results1: List[Dict[str, Any]], results2: List[Dict[str, Any]],
1496
+ final_k: int, k: int = 60) -> List[Dict[str, Any]]:
1497
+ """
1498
+ Robust RRF Fusion for hybrid search (BM25 + Semantic).
1499
+ Prioritizes BM25 metadata (results1) on overlaps for keyword precision.
1500
+ Handles empty lists/duplicates gracefully; O(n log n) efficient.
1501
+ """
1502
+ merged_scores = defaultdict(float)
1503
+ merged_meta: Dict[str, Dict[str, Any]] = {}
1504
+
1505
+ # Process semantic (results2) first
1506
+ for rank, item in enumerate(results2):
1507
+ doc_id = item.get("id")
1508
+ if doc_id is None:
1509
+ continue # Skip invalid
1510
+ score = 1.0 / (rank + k)
1511
+ merged_scores[doc_id] += score
1512
+ merged_meta[doc_id] = item.copy() # Avoid mutating input
1513
+
1514
+ # Process BM25 (results1) second: overwrites meta for precision
1515
+ for rank, item in enumerate(results1):
1516
+ doc_id = item.get("id")
1517
+ if doc_id is None:
1518
+ continue
1519
+ score = 1.0 / (rank + k)
1520
+ merged_scores[doc_id] += score
1521
+ merged_meta[doc_id] = item.copy()
1522
+
1523
+ # Sort by descending RRF score
1524
+ sorted_ids = sorted(merged_scores, key=merged_scores.get, reverse=True)
1525
+
1526
+ # Package top-k
1527
+ final_results = []
1528
+ for doc_id in sorted_ids[:final_k]:
1529
+ if doc_id in merged_meta:
1530
+ res = merged_meta[doc_id].copy()
1531
+ res["score"] = merged_scores[doc_id]
1532
+ res["match_type"] = "hybrid_rrf"
1533
+ final_results.append(res)
1534
+
1535
+ return final_results
 
1536
 
1537
  def _smart_rerank(self, query: str, candidates: List[Dict], final_k: int) -> List[Dict]:
1538
  """Smart reranking using cross-encoder"""