v3_ai_assistant / simple_vector_search.py
Julian Vanecek
Add document search functionality
9967c6c
"""
Minimal vector search implementation for HuggingFace deployment
"""
import json
import numpy as np
from pathlib import Path
from typing import List, Dict, Tuple
class SimpleVectorSearch:
    """Simple in-memory vector search over pre-computed embeddings.

    Every ``*.json`` file found directly inside ``data_dir`` is read once at
    construction time. Each file is iterated as a sequence of items, and each
    item is read through the keys ``content``, ``embedding`` and (optionally)
    ``metadata``.

    Attributes:
        data_dir: Directory that is scanned for ``*.json`` files.
        documents: List of ``{'content': ..., 'metadata': ...}`` dicts;
            entry ``i`` belongs to row ``i`` of ``embeddings``.
        embeddings: ``numpy`` array built from all loaded embeddings, or
            ``None`` when nothing was loaded.
    """

    def __init__(self, data_dir: str = "py/backend/data/embeddings"):
        """Create the index and eagerly load every embedding file.

        Args:
            data_dir: Directory containing the ``*.json`` embedding files.
        """
        self.data_dir = Path(data_dir)
        self.documents = []
        self.embeddings = None
        self._load_embeddings()

    def _load_embeddings(self):
        """Load all embedding files into ``documents`` and ``embeddings``.

        Loading is best-effort: when a file cannot be read or parsed, the
        error is printed and loading continues with the next file. Items that
        were read from that file before the failure are kept.
        """
        all_docs = []
        all_embeddings = []

        # Sort so the document order (and therefore the order of tied search
        # results) does not depend on the filesystem's listing order.
        for json_file in sorted(self.data_dir.glob("*.json")):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                for item in data:
                    # Read every required field BEFORE appending anything, so
                    # a malformed item can never leave a document without its
                    # embedding and shift all later rows out of alignment.
                    content = item['content']
                    embedding = item['embedding']
                    metadata = item.get('metadata', {})

                    all_docs.append({
                        'content': content,
                        'metadata': metadata
                    })
                    all_embeddings.append(embedding)
            except Exception as e:
                # Deliberately broad: one bad file must not prevent the
                # remaining files from being indexed.
                print(f"Error loading {json_file}: {e}")

        self.documents = all_docs
        self.embeddings = np.array(all_embeddings) if all_embeddings else None

    def search(self, query_embedding: List[float], k: int = 3) -> List[Dict]:
        """Search for the documents most similar to a query embedding.

        Similarity is the cosine similarity between the query vector and
        each stored embedding.

        Args:
            query_embedding: Query vector; its length must match the width
                of the stored embeddings.
            k: Maximum number of results to return. Values of zero or less
                yield an empty list.

        Returns:
            Up to ``k`` dicts with the keys ``content``, ``metadata`` and
            ``score`` (cosine similarity as a ``float``), ordered from most
            to least similar. Empty when no embeddings are loaded.
        """
        if self.embeddings is None or len(self.embeddings) == 0:
            return []

        # Without this guard, k == 0 would slice ``[-0:]`` (the whole array)
        # and return every document instead of none.
        if k <= 0:
            return []

        # Convert query to numpy array
        query_vec = np.array(query_embedding)

        # Compute cosine similarity; the small epsilon avoids division by
        # zero for all-zero vectors.
        query_norm = query_vec / (np.linalg.norm(query_vec) + 1e-10)
        embeddings_norm = self.embeddings / (
            np.linalg.norm(self.embeddings, axis=1, keepdims=True) + 1e-10
        )
        similarities = np.dot(embeddings_norm, query_norm)

        # argsort is ascending: take the last k entries, then reverse so the
        # best match comes first.
        top_indices = np.argsort(similarities)[-k:][::-1]

        # Return documents with scores
        return [
            {
                'content': self.documents[idx]['content'],
                'metadata': self.documents[idx]['metadata'],
                'score': float(similarities[idx])
            }
            for idx in top_indices
        ]