Spaces:

Manisankarrr
/

RepoAnalyzer

Sleeping

App Files Files Community

RepoAnalyzer / backend /vector_store.py

Manisankarrr

project completed

f2f397e 3 months ago

Raw

History Blame Contribute Delete

8.61 kB

	import os
	import json
	import faiss
	import numpy as np
	from pathlib import Path
	from typing import List, Dict, Any
	from sentence_transformers import SentenceTransformer

	# Sentence-transformers encoder shared by every function in this module.
	# It is constructed once, at import time.
	_EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
	model = SentenceTransformer(_EMBEDDING_MODEL_NAME)

	# Folder (relative to the working directory) that holds one FAISS index
	# file and one JSON sidecar per repository; created on import if missing.
	INDEXES_DIR = Path('faiss_indexes')
	INDEXES_DIR.mkdir(exist_ok=True)


	def _sanitize_repo_url(repo_url: str) -> str:
	"""Sanitize repository URL to create a valid filename."""
	return repo_url.replace('https://', '').replace('http://', '').replace('/', '_').replace('.', '_').replace(':', '_')[:63]


	def store_docs(repo_url: str, docs_list: List[Dict[str, Any]]) -> None:
	    """
	    Embed generated documentation and persist it as a FAISS index on disk.

	    Two files are written under INDEXES_DIR, both named after the sanitized
	    repository URL: '<slug>.index' (the FAISS index) and '<slug>_texts.json'
	    (the raw texts, per-document metadata and the repository URL). Nothing
	    is written when docs_list contains no entries.

	    Args:
	        repo_url: The repository URL (used for index filename)
	        docs_list: List of dicts containing 'filename' and 'documentation'
	    """
	    repo_slug = _sanitize_repo_url(repo_url)
	    print(f"[STORE_DOCS] Starting storage for repo: {repo_url}")
	    print(f"[STORE_DOCS] Repo slug: {repo_slug}")

	    # Read both fields of each entry in one pass, then split them into the
	    # two parallel lists that get embedded / saved.
	    pairs = [(entry['documentation'], entry['filename']) for entry in docs_list]
	    doc_texts = [text for text, _ in pairs]
	    doc_metadata = [{'filename': name, 'repo_url': repo_url} for _, name in pairs]

	    document_count = len(doc_texts)
	    print(f"[STORE_DOCS] Extracted {document_count} documents")

	    if document_count == 0:
	        print("[STORE_DOCS] No documents to store, returning early")
	        return

	    # One embedding row per document
	    print(f"[STORE_DOCS] Generating embeddings for {document_count} documents...")
	    vectors = model.encode(doc_texts, convert_to_numpy=True)
	    print(f"[STORE_DOCS] Embeddings shape: {vectors.shape}")

	    # Flat L2 index sized to the embedding width; rows are added as float32
	    faiss_index = faiss.IndexFlatL2(vectors.shape[1])
	    faiss_index.add(vectors.astype(np.float32))
	    print(f"[STORE_DOCS] FAISS index created with {faiss_index.ntotal} vectors")

	    # Persist the index itself
	    index_path = INDEXES_DIR / f'{repo_slug}.index'
	    faiss.write_index(faiss_index, str(index_path))
	    print(f"[STORE_DOCS] FAISS index saved to: {index_path}")

	    # Persist texts and metadata; list position i corresponds to vector i
	    texts_path = INDEXES_DIR / f'{repo_slug}_texts.json'
	    payload = {
	        'texts': doc_texts,
	        'metadata': doc_metadata,
	        'repo_url': repo_url,
	    }
	    with open(texts_path, 'w') as sidecar:
	        json.dump(payload, sidecar)
	    print(f"[STORE_DOCS] Metadata saved to: {texts_path}")
	    print("[STORE_DOCS] Storage complete!")


	def update_docs(repo_url: str, updated_docs_list: List[Dict[str, Any]]) -> None:
	    """
	    Replace the stored documentation of specific files and rebuild the index.

	    Existing entries whose metadata 'filename' matches one of the updated
	    files are dropped, the updated entries are appended, and the complete
	    set of texts is re-embedded into a fresh FAISS index that overwrites
	    the files on disk. When the repository has no index yet, this simply
	    delegates to store_docs.

	    Args:
	        repo_url: The repository URL
	        updated_docs_list: List of dicts with 'filename' and 'documentation' to update

	    Raises:
	        Exception: Any error hit while loading, rebuilding or saving is
	            printed and then re-raised unchanged.
	    """
	    repo_slug = _sanitize_repo_url(repo_url)
	    print(f"[UPDATE_DOCS] Starting update for repo: {repo_url}")
	    print(f"[UPDATE_DOCS] Updating {len(updated_docs_list)} files")

	    index_path = INDEXES_DIR / f'{repo_slug}.index'
	    texts_path = INDEXES_DIR / f'{repo_slug}_texts.json'

	    # Both files must be present to update in place; otherwise start fresh
	    if not (index_path.exists() and texts_path.exists()):
	        print("[UPDATE_DOCS] No existing index found, creating new one")
	        store_docs(repo_url, updated_docs_list)
	        return

	    try:
	        with open(texts_path, 'r') as sidecar:
	            stored = json.load(sidecar)

	        existing_texts = stored['texts']
	        existing_metadata = stored['metadata']
	        print(f"[UPDATE_DOCS] Loaded {len(existing_texts)} existing documents")

	        # Files whose old entries must be discarded
	        replaced_filenames = {doc['filename'] for doc in updated_docs_list}

	        # Keep only the entries that belong to untouched files
	        survivors = [
	            (text, meta)
	            for text, meta in zip(existing_texts, existing_metadata)
	            if meta.get('filename') not in replaced_filenames
	        ]
	        merged_texts = [text for text, _ in survivors]
	        merged_metadata = [meta for _, meta in survivors]

	        print(f"[UPDATE_DOCS] After filtering: {len(merged_texts)} documents remain")

	        # Append the fresh entries after the surviving ones
	        for doc in updated_docs_list:
	            merged_texts.append(doc['documentation'])
	            merged_metadata.append({'filename': doc['filename'], 'repo_url': repo_url})

	        print(f"[UPDATE_DOCS] After adding updates: {len(merged_texts)} documents total")

	        if len(merged_texts) == 0:
	            print("[UPDATE_DOCS] No documents remaining, returning early")
	            return

	        # Re-embed everything and build a brand-new flat L2 index
	        print("[UPDATE_DOCS] Rebuilding FAISS index...")
	        vectors = model.encode(merged_texts, convert_to_numpy=True)
	        rebuilt_index = faiss.IndexFlatL2(vectors.shape[1])
	        rebuilt_index.add(vectors.astype(np.float32))
	        print(f"[UPDATE_DOCS] FAISS index rebuilt with {rebuilt_index.ntotal} vectors")

	        # Overwrite the index file
	        faiss.write_index(rebuilt_index, str(index_path))
	        print(f"[UPDATE_DOCS] Updated FAISS index saved to: {index_path}")

	        # Overwrite the texts/metadata sidecar
	        payload = {
	            'texts': merged_texts,
	            'metadata': merged_metadata,
	            'repo_url': repo_url,
	        }
	        with open(texts_path, 'w') as sidecar:
	            json.dump(payload, sidecar)
	        print(f"[UPDATE_DOCS] Updated metadata saved to: {texts_path}")
	        print("[UPDATE_DOCS] Update complete!")

	    except Exception as e:
	        print(f"Error updating index for {repo_url}: {e}")
	        raise


	def search_docs(repo_url: str, question: str, num_results: int = 3) -> List[Dict[str, Any]]:
	    """
	    Searches stored documentation using semantic similarity.

	    Loads the repository's FAISS index and its texts/metadata JSON file from
	    INDEXES_DIR, embeds the question with the module-level model and returns
	    the nearest stored documents.

	    An empty list is returned (instead of raising) when no index exists for
	    the repository, when the stored files cannot be loaded, when the store
	    holds no documents, or when num_results is zero or negative.

	    Args:
	        repo_url: The repository URL
	        question: Natural language question to search for
	        num_results: Number of top results to return (default: 3)

	    Returns:
	        List of dicts containing 'document', 'filename', and 'distance',
	        in the order returned by the index search.
	    """
	    repo_slug = _sanitize_repo_url(repo_url)
	    print(f"[SEARCH_DOCS] Searching for: '{question}' in repo: {repo_url}")

	    # Check if index exists
	    index_path = INDEXES_DIR / f'{repo_slug}.index'
	    texts_path = INDEXES_DIR / f'{repo_slug}_texts.json'

	    if not index_path.exists() or not texts_path.exists():
	        print("[SEARCH_DOCS] ERROR: No index found for repo. Please generate documentation first.")
	        print(f"[SEARCH_DOCS] Expected paths: {index_path} and {texts_path}")
	        return []

	    try:
	        # Load FAISS index
	        print(f"[SEARCH_DOCS] Loading FAISS index from: {index_path}")
	        index = faiss.read_index(str(index_path))
	        print(f"[SEARCH_DOCS] FAISS index loaded with {index.ntotal} vectors")

	        # Load text chunks
	        with open(texts_path, 'r') as f:
	            data = json.load(f)

	        texts = data['texts']
	        metadata = data['metadata']
	        print(f"[SEARCH_DOCS] Loaded {len(texts)} text documents and metadata")
	    except Exception as e:
	        print(f"[SEARCH_DOCS] Error loading index for {repo_url}: {e}")
	        return []

	    # Never ask for more neighbours than are actually stored. The FAISS
	    # Python wrapper rejects k <= 0, so an empty store or a non-positive
	    # num_results is answered here rather than crashing inside index.search.
	    top_k = min(num_results, len(texts), index.ntotal)
	    if top_k <= 0:
	        print("[SEARCH_DOCS] Nothing to search (empty store or num_results <= 0)")
	        return []

	    # Embed the question
	    print("[SEARCH_DOCS] Encoding question embedding...")
	    question_embedding = model.encode([question], convert_to_numpy=True)

	    # Search the index
	    print(f"[SEARCH_DOCS] Searching for top {top_k} results...")
	    distances, indices = index.search(question_embedding.astype(np.float32), top_k)

	    # Row ids are only usable if both parallel lists have that position; the
	    # index file and the JSON file are written separately and may disagree.
	    usable_rows = min(len(texts), len(metadata))

	    # Format results
	    formatted_results = []
	    for idx, distance in zip(indices[0], distances[0]):
	        row = int(idx)
	        if row < 0:  # -1 marks "no valid result"
	            continue
	        if row >= usable_rows:
	            print(f"[SEARCH_DOCS] Skipping result {row}: no matching stored text/metadata")
	            continue
	        formatted_results.append({
	            'document': texts[row],
	            'filename': metadata[row].get('filename', 'unknown'),
	            'distance': float(distance)
	        })

	    print(f"[SEARCH_DOCS] Found {len(formatted_results)} results")
	    return formatted_results