Spaces:

Rajhuggingface4253
/

grammar

Running

App Files Files Community

Rajhuggingface4253 committed on Dec 26, 2025

Commit

adbab41

verified ·

1 Parent(s): 05eba13

Update vector.py

Browse files

Files changed (1) hide show

vector.py +199 -56

vector.py CHANGED Viewed

@@ -97,45 +97,123 @@ class VectorDatabase:
                         except:
                             pass
     def _chunk_python_code(self, text, filename):
-        """Smart Python code chunking using AST"""
         chunks = []
         try:
             tree = ast.parse(text)
             lines = text.splitlines()
-            # Extract global context
             global_context = []
-            for node in tree.body:
-                if isinstance(node, (ast.Import, ast.ImportFrom, ast.Assign)):
-                    start = node.lineno - 1
-                    end = node.end_lineno
-                    global_context.append("\n".join(lines[start:end]))
-            if global_context:
-                chunks.append({
-                    "text": "\n".join(global_context),
-                    "type": "code_context",
-                    "name": "Imports & Globals"
-                })
-            # Extract functions & classes
             for node in tree.body:
                 if isinstance(node, (ast.FunctionDef, ast.ClassDef, ast.AsyncFunctionDef)):
                     start = node.lineno - 1
                     end = node.end_lineno
-                    block_content = "\n".join(lines[start:end])
                     chunks.append({
-                        "text": block_content,
                         "type": "code_function",
                         "name": node.name
                     })
         except Exception as e:
             logger.warning(f"AST parsing failed for {filename}: {e}")
             return self._chunk_text_standard(text)
         return chunks
     def _chunk_text_standard(self, text, chunk_size=500, overlap=50):
@@ -163,7 +241,7 @@ class VectorDatabase:
         return chunks
     def store_session_document(self, text: str, filename: str, user_id: str, chat_id: str):
-        """Store extracted file content with user session isolation"""
         if not text or len(text) < 10 or not user_id:
             logger.warning(f"Invalid input for {filename}")
             return False
@@ -177,13 +255,13 @@ class VectorDatabase:
         try:
             if ext == '.py':
                 chunks_data = self._chunk_python_code(text, filename)
-            elif ext in ['.js', '.html', '.css', '.java', '.cpp', '.ts', '.tsx', '.jsx']:
-                chunks_data = self._chunk_text_standard(text, chunk_size=800, overlap=100)
             else:
                 chunks_data = self._chunk_text_standard(text, chunk_size=500, overlap=50)
         except Exception as e:
             logger.error(f"Chunking failed for {filename}: {e}")
-            # Fallback to simple chunking
             chunks_data = self._chunk_text_standard(text, chunk_size=500, overlap=50)
         # Ensure we have chunks
@@ -202,6 +280,7 @@ class VectorDatabase:
         final_texts = []
         final_meta = []
         for chunk in chunks_data:
             final_texts.append(chunk["text"])
             final_meta.append({
@@ -215,6 +294,21 @@ class VectorDatabase:
                 "timestamp": time.time(),
                 "chunk_index": len(final_texts)
             })
         # Embed and add to index
         try:
@@ -232,7 +326,7 @@ class VectorDatabase:
             logger.info(f"✅ Stored {len(final_texts)} chunks from {filename} for user {user_id[:8]}")
-            # Verify storage
             self._verify_storage(user_id, len(final_texts))
             return True
@@ -307,17 +401,16 @@ class VectorDatabase:
     def retrieve_session_context(self, query: str, user_id: str, chat_id: str, filter_type: str = None, top_k=100, final_k=5, min_score=0.25):
         """
-        Retrieve context only from the user's SPECIFIC session.
-        Includes a 'min_score' threshold to filter out irrelevant noise.
         """
         if self.index.ntotal == 0 or not user_id:
             logger.warning(f"Empty index or missing user_id. Index size: {self.index.ntotal}")
             return []
-        # Debug: show current state
         with self.memory_lock:
             total_vectors = self.index.ntotal
-            total_metadata = len(self.metadata)
             user_vectors = sum(1 for m in self.metadata if m.get("user_id") == user_id)
         logger.info(f"🔍 Searching for user {user_id[:8]} (User vectors: {user_vectors}/{total_vectors})")
@@ -326,39 +419,45 @@ class VectorDatabase:
         query_vec = self.embedder.encode([query])
         faiss.normalize_L2(query_vec)
-        # Search (we search more than needed to account for filtering)
         search_k = min(top_k * 3, self.index.ntotal) if self.index.ntotal > 0 else 1
         with self.memory_lock:
             D, I = self.index.search(np.array(query_vec).astype('float32'), search_k)
-        # Process results
         candidates = []
         valid_count = 0
         for i, idx in enumerate(I[0]):
-            if idx == -1 or idx >= len(self.metadata):
-                continue
             item = self.metadata[idx]
-            # 1. STRICT ISOLATION FILTERING
-            if item.get("user_id") != user_id:
-                continue
-            if item.get("chat_id") != chat_id:
-                continue
-            if filter_type and item.get("type") != filter_type:
-                continue
-            # 2. SCORE CORRECTION (CRITICAL FIX)
-            # Since we use IndexFlatIP with normalized vectors, D[0][i] IS the cosine similarity.
-            # Do NOT subtract from 1.0.
             score = D[0][i]
-            # 3. THE GATEKEEPER (Prevents Hallucinations)
-            # If the similarity is too low (noise), we discard it immediately.
-            if score < min_score:
-                continue
             candidates.append({
                 "id": int(idx),
@@ -367,32 +466,76 @@ class VectorDatabase:
                 "score": score
             })
             valid_count += 1
         logger.info(f"📊 Found {valid_count} candidates above threshold {min_score}")
-        if not candidates:
-            return []
         # Rerank with FlashRank
         try:
             rerank_request = RerankRequest(query=query, passages=candidates)
             results = self.ranker.rerank(rerank_request)
-            # Filter Reranked Results (Double Safety)
-            # Sometimes vectors are okay but semantic meaning is still weak.
-            # We keep only the top K that also pass the score check.
             final_results = [r for r in results[:final_k] if r['score'] > min_score]
-            logger.info(f"🎯 Reranked to {len(final_results)} results (scores: {[round(r['score'], 3) for r in final_results]})")
             return final_results
         except Exception as e:
             logger.error(f"Reranking failed: {e}")
-            # Fallback: return top candidates by vector similarity
-            candidates.sort(key=lambda x: x["score"], reverse=True)
             return candidates[:final_k]
     def get_user_stats(self, user_id: str):
         """Get statistics for a user's session"""
         with self.memory_lock:

                         except:
                             pass
+    def _chunk_smart_code(self, text, filename):
+        """
+        Structure-aware chunker for JS, HTML, CSS, etc.
+        Splits by logical boundaries (tags, functions) instead of random characters.
+        """
+        ext = os.path.splitext(filename)[1].lower()
+        chunks = []
+        # Define split patterns for different languages
+        patterns = {
+            # HTML/XML: Split before opening tags, effectively keeping tags grouped
+            '.html': r'(?=\n\s*<[^/])',
+            '.htm': r'(?=\n\s*<[^/])',
+            '.xml': r'(?=\n\s*<[^/])',
+            '.vue': r'(?=\n\s*<[^/])',
+            # JS/TS: Split before major keywords
+            '.js': r'(?=\n\s*(?:function|class|const|let|var|export|import|async))',
+            '.jsx': r'(?=\n\s*(?:function|class|const|let|var|export|import|async))',
+            '.ts': r'(?=\n\s*(?:function|class|const|let|var|export|import|async|interface|type))',
+            '.tsx': r'(?=\n\s*(?:function|class|const|let|var|export|import|async|interface|type))',
+            # CSS: Split before selectors
+            '.css': r'(?=\n\s*[.#@a-zA-Z])',
+            '.scss': r'(?=\n\s*[.#@a-zA-Z])',
+        }
+        pattern = patterns.get(ext)
+        # Fallback to standard if no pattern matches or regex fails
+        if not pattern:
+            return self._chunk_text_standard(text)
+        try:
+            # 1. Split by pattern
+            segments = re.split(pattern, text)
+            # 2. Re-group segments into chunks of appropriate size (e.g., 1000 chars)
+            current_chunk = ""
+            TARGET_SIZE = 1000
+            for seg in segments:
+                if not seg.strip(): continue
+                # If adding this segment exceeds target, save current and start new
+                if len(current_chunk) + len(seg) > TARGET_SIZE and len(current_chunk) > 100:
+                    chunks.append({
+                        "text": current_chunk.strip(),
+                        "type": "code_block",
+                        "name": f"block_{len(chunks)}"
+                    })
+                    current_chunk = seg
+                else:
+                    current_chunk += seg
+            # Add final chunk
+            if current_chunk:
+                chunks.append({
+                    "text": current_chunk.strip(),
+                    "type": "code_block",
+                    "name": f"block_{len(chunks)}"
+                })
+            return chunks
+        except Exception as e:
+            logger.warning(f"Smart chunking failed for {filename}: {e}. Falling back.")
+            return self._chunk_text_standard(text)
     def _chunk_python_code(self, text, filename):
+        """Improved AST chunker that captures EVERYTHING (not just functions)"""
         chunks = []
         try:
             tree = ast.parse(text)
             lines = text.splitlines()
+            # 1. Global Context (Imports & Assignments)
             global_context = []
+            # 2. Iterate nodes to find blocks
             for node in tree.body:
                 if isinstance(node, (ast.FunctionDef, ast.ClassDef, ast.AsyncFunctionDef)):
+                    # Extract the block
                     start = node.lineno - 1
                     end = node.end_lineno
+                    block_text = "\n".join(lines[start:end])
                     chunks.append({
+                        "text": block_text,
                         "type": "code_function",
                         "name": node.name
                     })
+                elif isinstance(node, (ast.Import, ast.ImportFrom, ast.Assign, ast.Expr)):
+                    # Group top-level scripts/imports together
+                    # We approximate by grabbing the line
+                    if hasattr(node, 'end_lineno'):
+                        start = node.lineno - 1
+                        end = node.end_lineno
+                        global_context.append("\n".join(lines[start:end]))
+            # Add the collected global context as the first chunk
+            if global_context:
+                # Group globals into chunks of 1000 chars
+                full_global = "\n".join(global_context)
+                if len(full_global) > 100:
+                    chunks.insert(0, {
+                        "text": full_global[:1500], # Cap context size
+                        "type": "code_context",
+                        "name": "imports_and_globals"
+                    })
         except Exception as e:
             logger.warning(f"AST parsing failed for {filename}: {e}")
             return self._chunk_text_standard(text)
+        # Fallback: if AST yielded nothing (e.g. empty file), use standard
+        if not chunks:
+            return self._chunk_text_standard(text)
         return chunks
     def _chunk_text_standard(self, text, chunk_size=500, overlap=50):
         return chunks
     def store_session_document(self, text: str, filename: str, user_id: str, chat_id: str):
+        """Store extracted file content with 'Whole File' capability & Verification"""
         if not text or len(text) < 10 or not user_id:
             logger.warning(f"Invalid input for {filename}")
             return False
         try:
             if ext == '.py':
                 chunks_data = self._chunk_python_code(text, filename)
+            elif ext in ['.js', '.html', '.css', '.java', '.cpp', '.ts', '.tsx', '.jsx', '.vue', '.xml']:
+                # Use Smart Regex Chunking
+                chunks_data = self._chunk_smart_code(text, filename)
             else:
                 chunks_data = self._chunk_text_standard(text, chunk_size=500, overlap=50)
         except Exception as e:
             logger.error(f"Chunking failed for {filename}: {e}")
             chunks_data = self._chunk_text_standard(text, chunk_size=500, overlap=50)
         # Ensure we have chunks
         final_texts = []
         final_meta = []
+        # 1. Process Standard Chunks
         for chunk in chunks_data:
             final_texts.append(chunk["text"])
             final_meta.append({
                 "timestamp": time.time(),
                 "chunk_index": len(final_texts)
             })
+        # 2. Add "Whole File" Entry (CRITICAL FOR FULL FILE RETRIEVAL)
+        # We embed a marker text, but store the ACTUAL content in metadata.
+        marker_text = f"Entire full content of file {filename} code"
+        final_texts.append(marker_text)
+        final_meta.append({
+            "text": marker_text,        # Marker for search
+            "actual_content": text,     # <<< THE FULL CONTENT
+            "source": filename,
+            "type": "whole_file",       # Special type
+            "user_id": user_id,
+            "chat_id": chat_id,
+            "timestamp": time.time(),
+            "chunk_index": -1
+        })
         # Embed and add to index
         try:
             logger.info(f"✅ Stored {len(final_texts)} chunks from {filename} for user {user_id[:8]}")
+            # Verify storage (Self-Check)
             self._verify_storage(user_id, len(final_texts))
             return True
     def retrieve_session_context(self, query: str, user_id: str, chat_id: str, filter_type: str = None, top_k=100, final_k=5, min_score=0.25):
         """
+        Retrieve context with Filename Ranking Logic.
+        If user asks for a specific file, returns the WHOLE content.
         """
         if self.index.ntotal == 0 or not user_id:
             logger.warning(f"Empty index or missing user_id. Index size: {self.index.ntotal}")
             return []
+        # Debug info
         with self.memory_lock:
             total_vectors = self.index.ntotal
             user_vectors = sum(1 for m in self.metadata if m.get("user_id") == user_id)
         logger.info(f"🔍 Searching for user {user_id[:8]} (User vectors: {user_vectors}/{total_vectors})")
         query_vec = self.embedder.encode([query])
         faiss.normalize_L2(query_vec)
+        # Search
         search_k = min(top_k * 3, self.index.ntotal) if self.index.ntotal > 0 else 1
         with self.memory_lock:
             D, I = self.index.search(np.array(query_vec).astype('float32'), search_k)
         candidates = []
         valid_count = 0
+        query_lower = query.lower()
         for i, idx in enumerate(I[0]):
+            if idx == -1 or idx >= len(self.metadata): continue
             item = self.metadata[idx]
+            # 1. STRICT ISOLATION
+            if item.get("user_id") != user_id: continue
+            if item.get("chat_id") != chat_id: continue
+            if filter_type and item.get("type") != filter_type: continue
             score = D[0][i]
+            # 2. WHOLE FILE RANKING (The Missing Piece)
+            # If this is a "whole_file" marker AND the filename is in the query...
+            filename = item.get("source", "").lower()
+            is_whole_file = item.get("type") == "whole_file"
+            if is_whole_file:
+                # If user specifically asked for this file (e.g. "read index.html")
+                if filename in query_lower:
+                    score = 2.0 # Force to top (override similarity)
+                # Replace the "marker text" with the ACTUAL full content
+                # This ensures the LLM gets the real code
+                if item.get("actual_content"):
+                    item = item.copy() # Don't mutate original metadata
+                    item["text"] = item["actual_content"]
+            # 3. GATEKEEPER (Noise Filter)
+            if score < min_score: continue
             candidates.append({
                 "id": int(idx),
                 "score": score
             })
             valid_count += 1
         logger.info(f"📊 Found {valid_count} candidates above threshold {min_score}")
+        if not candidates: return []
+        # Sort manually first (to handle our forced 2.0 scores)
+        candidates.sort(key=lambda x: x["score"], reverse=True)
+        # Optimization: If we found a forced match (Whole File), return immediately
+        # We don't need to rerank if we know exactly what the user wanted.
+        if candidates[0]["score"] >= 2.0:
+             logger.info(f"🎯 Returning Whole File: {candidates[0]['meta'].get('source')}")
+             return candidates[:1]
         # Rerank with FlashRank
         try:
             rerank_request = RerankRequest(query=query, passages=candidates)
             results = self.ranker.rerank(rerank_request)
+            # Filter low quality rerank results
             final_results = [r for r in results[:final_k] if r['score'] > min_score]
+            logger.info(f"🎯 Reranked to {len(final_results)} results")
             return final_results
         except Exception as e:
             logger.error(f"Reranking failed: {e}")
             return candidates[:final_k]
+    def delete_session(self, user_id: str, chat_id: str):
+        """Surgical Strike: Permanently remove ONLY one specific session"""
+        with self.memory_lock:
+            # 1. Filter: Keep everything that is NOT this specific chat
+            new_metadata = []
+            removed_count = 0
+            for meta in self.metadata:
+                # Check strict ownership and ID match
+                if meta.get("user_id") == user_id and meta.get("chat_id") == chat_id:
+                    removed_count += 1
+                else:
+                    new_metadata.append(meta)
+            if removed_count == 0:
+                return False # Nothing to delete
+            logger.info(f"🧹 Surgically removing {removed_count} vectors for session {chat_id}...")
+            # 2. Rebuild Index (Required for FAISS IndexFlatIP)
+            if not new_metadata:
+                self.index = faiss.IndexFlatIP(384) # Reset empty
+            else:
+                # Re-embed surviving text to rebuild index
+                # (Optimization: In a huge DB, use IndexIDMap, but for now this is safe)
+                surviving_texts = [m["text"] for m in new_metadata]
+                try:
+                    embeddings = self.embedder.encode(surviving_texts)
+                    faiss.normalize_L2(embeddings)
+                    new_index = faiss.IndexFlatIP(384)
+                    new_index.add(np.array(embeddings).astype('float32'))
+                    self.index = new_index
+                except Exception as e:
+                    logger.error(f"Rebuild failed: {e}")
+                    return False
+            self.metadata = new_metadata
+            self._save_index()
+            return True
     def get_user_stats(self, user_id: str):
         """Get statistics for a user's session"""
         with self.memory_lock: