Spaces:
Build error
Build error
File size: 3,100 Bytes
785785b cb3b8cb 785785b cb3b8cb 785785b cb3b8cb 785785b cb3b8cb 785785b cb3b8cb 785785b cb3b8cb 785785b cb3b8cb 785785b cb3b8cb 785785b cb3b8cb 785785b cb3b8cb 785785b cb3b8cb 785785b cb3b8cb 785785b cb3b8cb 785785b cb3b8cb 785785b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
#!/usr/bin/env python3
import yaml
import logging
from typing import List, Dict
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Module-wide logging: INFO level; the logger is named after this module so
# records can be filtered per-module by the root logging configuration.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class QueryEngine:
    """Semantic search over a FAISS index built with HuggingFace embeddings.

    Configuration is read from a YAML file; keys used by this class include
    ``embedding_model``, ``faiss_path``, ``cluster_id``, ``chunk_start`` and
    ``chunk_end`` (all read with defaults or as optional).
    """

    def __init__(self, config_path='config.yaml'):
        """Load the YAML config, build the embedding model and open FAISS.

        Args:
            config_path: Path to the YAML configuration file.

        Raises:
            FileNotFoundError: if ``config_path`` does not exist.
        """
        logger.info("Inicializando QueryEngine...")
        with open(config_path) as f:
            self.config = yaml.safe_load(f)
        model_name = self.config.get('embedding_model', 'all-MiniLM-L6-v2')
        logger.info(f"Modelo: {model_name}")
        self.embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={'device': 'cpu'}
        )
        faiss_path = self.config.get('faiss_path', '/app/faiss_index')
        logger.info(f"Carregando FAISS de: {faiss_path}")
        # NOTE(review): allow_dangerous_deserialization unpickles the stored
        # index — acceptable only because the index is produced by this
        # deployment, never loaded from untrusted input.
        self.vectorstore = FAISS.load_local(
            faiss_path,
            self.embeddings,
            allow_dangerous_deserialization=True
        )
        logger.info("✅ QueryEngine pronto!")

    def search_by_embedding(self, query: str, top_k: int = 10, return_embeddings: bool = False) -> Dict:
        """Run a similarity search for ``query`` against the FAISS index.

        Args:
            query: Free-text query, embedded with the configured model.
            top_k: Maximum number of results to return.
            return_embeddings: Accepted for API compatibility but currently
                ignored — embeddings are never included in the payload.
                TODO(review): implement or remove.

        Returns:
            Dict with ``cluster_id``, ``query``, ``total_results`` and
            ``results`` (each result: id, ementa, score, metadata).
        """
        results = self.vectorstore.similarity_search_with_score(query, k=top_k)
        formatted = [
            {
                'id': doc.metadata.get('id'),
                'ementa': doc.page_content,
                # FAISS returns a distance-like score; cast to plain float so
                # the payload is JSON-serializable.
                'score': float(score),
                'metadata': doc.metadata,
            }
            for doc, score in results
        ]
        return {
            'cluster_id': self.config.get('cluster_id'),
            'query': query,
            'total_results': len(formatted),
            'results': formatted
        }

    def search_by_keywords(self, keywords: List[str], operator: str = 'AND', top_k: int = 20) -> Dict:
        """Keyword search implemented as a single embedding search.

        The keywords are joined into one query string. ``operator`` is
        accepted for API compatibility but NOT enforced — no boolean AND/OR
        filtering is applied. TODO(review): implement the semantics or drop
        the parameter.
        """
        query = ' '.join(keywords)
        return self.search_by_embedding(query, top_k)

    def search_by_ids(self, ids: List[str], return_embeddings: bool = False) -> Dict:
        """Fetch documents whose metadata ``id`` is in ``ids``.

        FAISS has no direct ID lookup, so this scans up to 10000 documents
        from an empty-query similarity search — a simplified implementation
        that may miss documents on indexes larger than that.

        Args:
            ids: Document IDs to look up.
            return_embeddings: Accepted for API compatibility but ignored.
        """
        # Hoist membership testing into a set: O(1) per document instead of
        # O(len(ids)) across a scan of up to 10000 documents.
        wanted = set(ids)
        all_docs = self.vectorstore.similarity_search("", k=10000)
        results = []
        for doc in all_docs:
            if doc.metadata.get('id') in wanted:
                results.append({
                    'id': doc.metadata.get('id'),
                    'ementa': doc.page_content,
                    'metadata': doc.metadata
                })
                # Early exit once enough matches were collected (original
                # semantics kept: compared against len(ids), not the set).
                if len(results) >= len(ids):
                    break
        return {
            'cluster_id': self.config.get('cluster_id'),
            'total_results': len(results),
            'results': results
        }

    def get_cluster_info(self) -> Dict:
        """Return static metadata describing this cluster's index."""
        return {
            'cluster_id': self.config.get('cluster_id'),
            'chunk_range': [self.config.get('chunk_start'), self.config.get('chunk_end')],
            'embedding_model': self.config.get('embedding_model'),
            # NOTE(review): hard-coded for all-MiniLM-L6-v2 (384 dims); wrong
            # if the config selects another model — TODO derive from the
            # embedding model instead.
            'embedding_dim': 384,
            'vector_store': 'FAISS',
            'backend': 'LangChain + CPU',
            'status': 'ready'
        }
|