Spaces:
Sleeping
Sleeping
| import os | |
| import json | |
| import faiss | |
| import numpy as np | |
| from pathlib import Path | |
| from typing import List, Dict, Any | |
| from sentence_transformers import SentenceTransformer | |
# Embedding model shared by the functions below. It is instantiated once,
# when this module is imported, from the 'all-MiniLM-L6-v2' checkpoint.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Directory that holds one FAISS index file and one JSON texts file per
# repository. The path is relative, so it resolves against the current
# working directory; it is created at import time if it does not exist yet.
INDEXES_DIR = Path('faiss_indexes')
INDEXES_DIR.mkdir(exist_ok=True)
| def _sanitize_repo_url(repo_url: str) -> str: | |
| """Sanitize repository URL to create a valid filename.""" | |
| return repo_url.replace('https://', '').replace('http://', '').replace('/', '_').replace('.', '_').replace(':', '_')[:63] | |
def store_docs(repo_url: str, docs_list: List[Dict[str, Any]]) -> None:
    """
    Embed the given documentation and persist it as a fresh FAISS index.

    Two files are written under INDEXES_DIR, both named after the sanitized
    repository URL: ``<slug>.index`` (the FAISS index) and
    ``<slug>_texts.json`` (the raw texts, per-document metadata and the
    repository URL). Nothing is written when ``docs_list`` is empty.

    Args:
        repo_url: The repository URL (used to derive the file names)
        docs_list: List of dicts containing 'filename' and 'documentation'

    Raises:
        KeyError: If an entry lacks 'documentation' or 'filename'.
    """
    slug = _sanitize_repo_url(repo_url)
    print(f"[STORE_DOCS] Starting storage for repo: {repo_url}")
    print(f"[STORE_DOCS] Repo slug: {slug}")

    # Read both keys of each entry in one pass, then split into the text list
    # and the parallel metadata list (position i in one matches i in the other).
    pairs = [(entry['documentation'], entry['filename']) for entry in docs_list]
    texts = [text for text, _ in pairs]
    metadata = [{'filename': name, 'repo_url': repo_url} for _, name in pairs]
    print(f"[STORE_DOCS] Extracted {len(texts)} documents")

    if not texts:
        print("[STORE_DOCS] No documents to store, returning early")
        return

    # One embedding row per document.
    print(f"[STORE_DOCS] Generating embeddings for {len(texts)} documents...")
    vectors = model.encode(texts, convert_to_numpy=True)
    print(f"[STORE_DOCS] Embeddings shape: {vectors.shape}")

    # Flat L2 index sized to the embedding width; vectors are added as float32.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors.astype(np.float32))
    print(f"[STORE_DOCS] FAISS index created with {index.ntotal} vectors")

    index_file = INDEXES_DIR / f'{slug}.index'
    faiss.write_index(index, str(index_file))
    print(f"[STORE_DOCS] FAISS index saved to: {index_file}")

    # The JSON companion keeps the texts and metadata in index order.
    texts_file = INDEXES_DIR / f'{slug}_texts.json'
    payload = {
        'texts': texts,
        'metadata': metadata,
        'repo_url': repo_url,
    }
    with open(texts_file, 'w') as handle:
        json.dump(payload, handle)
    print(f"[STORE_DOCS] Metadata saved to: {texts_file}")
    print("[STORE_DOCS] Storage complete!")
def update_docs(repo_url: str, updated_docs_list: List[Dict[str, Any]]) -> None:
    """
    Replace the stored documentation of specific files and rebuild the index.

    The stored texts JSON is read, entries whose filename appears in
    ``updated_docs_list`` are dropped, the new entries are appended, and a
    new FAISS index is built from the embeddings of the combined texts.
    Both files are then overwritten. When either stored file is missing the
    call is delegated to ``store_docs``.

    Args:
        repo_url: The repository URL
        updated_docs_list: List of dicts with 'filename' and 'documentation' to update

    Raises:
        Exception: Any error raised while loading, embedding or saving is
            printed and then re-raised unchanged.
    """
    slug = _sanitize_repo_url(repo_url)
    print(f"[UPDATE_DOCS] Starting update for repo: {repo_url}")
    print(f"[UPDATE_DOCS] Updating {len(updated_docs_list)} files")

    index_file = INDEXES_DIR / f'{slug}.index'
    texts_file = INDEXES_DIR / f'{slug}_texts.json'

    # Without both stored files there is nothing to merge into.
    if not (index_file.exists() and texts_file.exists()):
        print("[UPDATE_DOCS] No existing index found, creating new one")
        store_docs(repo_url, updated_docs_list)
        return

    try:
        with open(texts_file, 'r') as handle:
            stored = json.load(handle)
        old_texts = stored['texts']
        old_metadata = stored['metadata']
        print(f"[UPDATE_DOCS] Loaded {len(old_texts)} existing documents")

        # Keep only the stored entries whose file is not being replaced.
        replaced = {entry['filename'] for entry in updated_docs_list}
        survivors = [
            (text, meta)
            for text, meta in zip(old_texts, old_metadata)
            if meta.get('filename') not in replaced
        ]
        merged_texts = [text for text, _ in survivors]
        merged_metadata = [meta for _, meta in survivors]
        print(f"[UPDATE_DOCS] After filtering: {len(merged_texts)} documents remain")

        # Updated entries go after the surviving ones, in the order given.
        merged_texts.extend(entry['documentation'] for entry in updated_docs_list)
        merged_metadata.extend(
            {'filename': entry['filename'], 'repo_url': repo_url}
            for entry in updated_docs_list
        )
        print(f"[UPDATE_DOCS] After adding updates: {len(merged_texts)} documents total")

        if not merged_texts:
            print("[UPDATE_DOCS] No documents remaining, returning early")
            return

        # Every text is re-embedded; the previous index file is not read.
        print("[UPDATE_DOCS] Rebuilding FAISS index...")
        vectors = model.encode(merged_texts, convert_to_numpy=True)
        index = faiss.IndexFlatL2(vectors.shape[1])
        index.add(vectors.astype(np.float32))
        print(f"[UPDATE_DOCS] FAISS index rebuilt with {index.ntotal} vectors")

        faiss.write_index(index, str(index_file))
        print(f"[UPDATE_DOCS] Updated FAISS index saved to: {index_file}")

        payload = {
            'texts': merged_texts,
            'metadata': merged_metadata,
            'repo_url': repo_url,
        }
        with open(texts_file, 'w') as handle:
            json.dump(payload, handle)
        print(f"[UPDATE_DOCS] Updated metadata saved to: {texts_file}")
        print("[UPDATE_DOCS] Update complete!")
    except Exception as e:
        print(f"Error updating index for {repo_url}: {e}")
        raise
def search_docs(repo_url: str, question: str, num_results: int = 3) -> List[Dict[str, Any]]:
    """
    Searches stored documentation using semantic similarity.

    The FAISS index and the texts JSON stored for ``repo_url`` are loaded,
    the question is embedded with the module-level model, and the nearest
    stored documents are returned in the order FAISS reports them.

    Args:
        repo_url: The repository URL
        question: Natural language question to search for
        num_results: Number of top results to return (default: 3). Values
            of zero or less yield an empty list.

    Returns:
        List of dicts containing 'document', 'filename', and 'distance'
        (the distance value reported by the FAISS index). An empty list is
        returned when no index exists, when loading fails, or when there
        is nothing to search.
    """
    repo_slug = _sanitize_repo_url(repo_url)
    print(f"[SEARCH_DOCS] Searching for: '{question}' in repo: {repo_url}")

    # Check if index exists
    index_path = INDEXES_DIR / f'{repo_slug}.index'
    texts_path = INDEXES_DIR / f'{repo_slug}_texts.json'
    if not index_path.exists() or not texts_path.exists():
        print("[SEARCH_DOCS] ERROR: No index found for repo. Please generate documentation first.")
        print(f"[SEARCH_DOCS] Expected paths: {index_path} and {texts_path}")
        return []

    try:
        # Load FAISS index
        print(f"[SEARCH_DOCS] Loading FAISS index from: {index_path}")
        index = faiss.read_index(str(index_path))
        print(f"[SEARCH_DOCS] FAISS index loaded with {index.ntotal} vectors")

        # Load text chunks
        with open(texts_path, 'r') as f:
            data = json.load(f)
        texts = data['texts']
        metadata = data['metadata']
        print(f"[SEARCH_DOCS] Loaded {len(texts)} text documents and metadata")
    except Exception as e:
        # Best effort: a missing or corrupt store is reported as "no results".
        print(f"[SEARCH_DOCS] Error loading index for {repo_url}: {e}")
        return []

    # Never ask FAISS for more neighbours than exist, and never for k <= 0:
    # index.search rejects a non-positive k, so return early instead.
    top_k = min(num_results, len(texts), index.ntotal)
    if top_k <= 0:
        print(f"[SEARCH_DOCS] Nothing to search (num_results={num_results}, documents={len(texts)})")
        return []

    # Embed the question
    print("[SEARCH_DOCS] Encoding question embedding...")
    question_embedding = model.encode([question], convert_to_numpy=True)

    # Search the index
    print(f"[SEARCH_DOCS] Searching for top {top_k} results...")
    distances, indices = index.search(question_embedding.astype(np.float32), top_k)

    # The index and the JSON are written separately and can drift apart, so
    # only ids that have both a text and a metadata entry are usable.
    usable = min(len(texts), len(metadata))

    # Format results
    formatted_results = []
    for raw_idx, distance in zip(indices[0], distances[0]):
        idx = int(raw_idx)
        # -1 marks an empty result slot; ids >= usable have no stored text.
        if idx < 0 or idx >= usable:
            continue
        formatted_results.append({
            'document': texts[idx],
            'filename': metadata[idx].get('filename', 'unknown'),
            'distance': float(distance)
        })

    print(f"[SEARCH_DOCS] Found {len(formatted_results)} results")
    return formatted_results