# Source: clausewatch-api/app/services/vector_store.py (Nilyzz, commit 306e475, "Add files")
import google.generativeai as genai
import numpy as np
import os
class InMemoryVectorStore:
    """In-process vector index keyed by contract filename.

    Each filename maps to a list of entries of the form
    ``{"text": str, "vector": np.ndarray, "metadata": {"page": ...}}``.
    Embeddings are requested from ``genai.embed_content`` and similarity
    search is a brute-force cosine comparison over one contract's entries.
    """

    def __init__(self):
        # filename -> list of indexed chunk entries (see class docstring).
        self.store = {}
        self.model_name = "models/text-embedding-004"

    @staticmethod
    def _empty_result() -> dict:
        """Return a fresh "no hits" payload in the shape callers expect."""
        return {"documents": [[]], "metadatas": [[]], "distances": [[]]}

    def get_embedding(self, text, task_type="retrieval_document"):
        """Embed *text* with the configured model.

        Args:
            text: Content to embed.
            task_type: Task type forwarded to ``genai.embed_content``.
                Defaults to ``"retrieval_document"`` (used when indexing);
                ``search_similar`` passes ``"retrieval_query"``.

        Returns:
            The ``'embedding'`` entry of the API result, or ``[]`` if the
            call failed for any reason (the error is printed).
        """
        try:
            result = genai.embed_content(
                model=self.model_name,
                content=text,
                task_type=task_type,
            )
            return result['embedding']
        except Exception as e:
            # Best-effort boundary around the remote call: report the failure
            # and signal it with an empty embedding instead of raising.
            print(f"Error getting embedding: {e}")
            return []

    def add_contract(self, filename: str, chunks: list) -> None:
        """Embed and index *chunks* under *filename*, replacing prior entries.

        Args:
            filename: Key under which the chunks are stored.
            chunks: Iterable of dicts providing ``"text"`` and ``"page"``.

        Chunks whose embedding could not be obtained are skipped.
        """
        print(f"Indexing {filename} using Google Embeddings...")
        self.store[filename] = []
        for chunk in chunks:
            text = chunk["text"]
            vector = self.get_embedding(text)
            # An empty embedding means the API call failed; skip the chunk.
            if vector is None or len(vector) == 0:
                continue
            self.store[filename].append({
                "text": text,
                "vector": np.array(vector),
                "metadata": {"page": chunk["page"]},
            })
        print(f"Indexed {len(self.store[filename])} chunks for {filename}")

    def search_similar(self, query: str, filename: str, n_results: int = 3):
        """Return the chunks of *filename* most similar to *query*.

        Args:
            query: Search text; embedded with task type ``"retrieval_query"``.
            filename: Contract whose indexed chunks are searched.
            n_results: Maximum number of hits to return.

        Returns:
            A dict with ``"documents"``, ``"metadatas"`` and ``"distances"``,
            each a list containing one list of hits ordered from most to
            least similar. Distance is ``1 - cosine_similarity``. All inner
            lists are empty when the contract is unknown or has no indexed
            chunks, when ``n_results`` is not positive, or when the query
            embedding could not be obtained.
        """
        entries = self.store.get(filename)
        # Nothing to rank: avoid a pointless embedding request.
        if not entries or n_results <= 0:
            return self._empty_result()

        query_emb = self.get_embedding(query, task_type="retrieval_query")
        if query_emb is None or len(query_emb) == 0:
            return self._empty_result()

        query_vec = np.asarray(query_emb, dtype=float)
        doc_matrix = np.stack([item["vector"] for item in entries]).astype(float)

        dots = doc_matrix @ query_vec
        denom = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_vec)
        # Zero-norm vectors have no direction; score them 0.0 instead of
        # letting 0/0 produce NaN, which would make the ordering undefined.
        scores = np.divide(
            dots, denom, out=np.zeros_like(dots, dtype=float), where=denom > 0
        )

        # Stable descending order: ties keep their indexing order.
        order = np.argsort(-scores, kind="stable")[:n_results]
        return {
            "documents": [[entries[i]["text"] for i in order]],
            "metadatas": [[entries[i]["metadata"] for i in order]],
            "distances": [[float(1 - scores[i]) for i in order]],
        }
# Global instance (singleton): created once at module import time.
vector_db = InMemoryVectorStore()