File size: 3,100 Bytes
785785b
 
 
cb3b8cb
785785b
cb3b8cb
 
 
 
785785b
 
 
cb3b8cb
 
785785b
 
 
 
cb3b8cb
 
785785b
cb3b8cb
 
 
 
785785b
cb3b8cb
 
785785b
cb3b8cb
 
 
 
 
785785b
cb3b8cb
785785b
cb3b8cb
 
785785b
cb3b8cb
 
 
 
 
 
 
785785b
 
 
cb3b8cb
 
 
 
785785b
 
cb3b8cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
785785b
 
cb3b8cb
 
 
785785b
 
 
 
cb3b8cb
 
 
 
 
 
785785b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python3
import yaml
import logging
from typing import List, Dict

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class QueryEngine:
    """Semantic search over a FAISS index built with HuggingFace embeddings.

    Configuration comes from a YAML file; keys read here include
    ``embedding_model``, ``faiss_path``, ``cluster_id``, ``chunk_start``
    and ``chunk_end``.
    """

    def __init__(self, config_path: str = 'config.yaml') -> None:
        """Load the YAML config, the embedding model, and the FAISS index.

        Args:
            config_path: Path to the YAML configuration file.
        """
        logger.info("Inicializando QueryEngine...")

        # Pin the encoding so the config parses identically on every platform.
        with open(config_path, encoding='utf-8') as f:
            self.config = yaml.safe_load(f)

        model_name = self.config.get('embedding_model', 'all-MiniLM-L6-v2')
        logger.info(f"Modelo: {model_name}")

        self.embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={'device': 'cpu'}
        )

        faiss_path = self.config.get('faiss_path', '/app/faiss_index')
        logger.info(f"Carregando FAISS de: {faiss_path}")

        # NOTE(review): allow_dangerous_deserialization unpickles the index
        # file — only ever load indexes produced by this project, never
        # untrusted paths.
        self.vectorstore = FAISS.load_local(
            faiss_path,
            self.embeddings,
            allow_dangerous_deserialization=True
        )

        logger.info("✅ QueryEngine pronto!")

    def search_by_embedding(self, query: str, top_k: int = 10, return_embeddings: bool = False) -> Dict:
        """Similarity-search the index for *query*.

        Args:
            query: Free-text query to embed and match.
            top_k: Maximum number of hits to return.
            return_embeddings: When True, attach each hit's content embedding
                under an ``'embedding'`` key (computed on the fly — FAISS
                search results do not carry their stored vectors).

        Returns:
            Dict with ``cluster_id``, ``query``, ``total_results`` and
            ``results`` (each hit: ``id``, ``ementa``, ``score``, ``metadata``).
        """
        results = self.vectorstore.similarity_search_with_score(query, k=top_k)

        formatted = [
            {
                'id': doc.metadata.get('id'),
                'ementa': doc.page_content,
                # FAISS scores may be numpy scalars; normalize for JSON use.
                'score': float(score),
                'metadata': doc.metadata,
            }
            for doc, score in results
        ]

        # BUGFIX: return_embeddings used to be silently ignored.
        if return_embeddings and formatted:
            vectors = self.embeddings.embed_documents(
                [item['ementa'] for item in formatted]
            )
            for item, vector in zip(formatted, vectors):
                item['embedding'] = vector

        return {
            'cluster_id': self.config.get('cluster_id'),
            'query': query,
            'total_results': len(formatted),
            'results': formatted
        }

    def search_by_keywords(self, keywords: List[str], operator: str = 'AND', top_k: int = 20) -> Dict:
        """Search by a list of keywords combined into one query string.

        Args:
            keywords: Keywords joined (space-separated) into a single query.
            operator: Accepted for API compatibility only — embedding search
                has no boolean semantics, so 'AND' and 'OR' behave identically.
            top_k: Maximum number of hits to return.

        Returns:
            Same payload as :meth:`search_by_embedding`.
        """
        query = ' '.join(keywords)
        return self.search_by_embedding(query, top_k=top_k)

    def search_by_ids(self, ids: List[str], return_embeddings: bool = False) -> Dict:
        """Fetch documents whose metadata ``id`` is in *ids*.

        FAISS has no direct lookup by ID, so this scans a large similarity
        sample; documents beyond the first 10,000 may be missed.

        Args:
            ids: Metadata IDs to look up.
            return_embeddings: When True, attach each hit's content embedding
                under an ``'embedding'`` key.

        Returns:
            Dict with ``cluster_id``, ``total_results`` and ``results``.
        """
        # O(1) membership per document instead of O(len(ids)) list scans;
        # also makes the early-exit threshold duplicate-safe.
        wanted = set(ids)
        all_docs = self.vectorstore.similarity_search("", k=10000)

        results = []
        for doc in all_docs:
            if doc.metadata.get('id') in wanted:
                results.append({
                    'id': doc.metadata.get('id'),
                    'ementa': doc.page_content,
                    'metadata': doc.metadata
                })
                if len(results) >= len(wanted):
                    break

        # BUGFIX: return_embeddings used to be silently ignored.
        if return_embeddings and results:
            vectors = self.embeddings.embed_documents(
                [item['ementa'] for item in results]
            )
            for item, vector in zip(results, vectors):
                item['embedding'] = vector

        return {
            'cluster_id': self.config.get('cluster_id'),
            'total_results': len(results),
            'results': results
        }

    def get_cluster_info(self) -> Dict:
        """Return static metadata describing this cluster's index.

        Returns:
            Dict of cluster ID, chunk range, model name and backend info.
        """
        return {
            'cluster_id': self.config.get('cluster_id'),
            'chunk_range': [self.config.get('chunk_start'), self.config.get('chunk_end')],
            'embedding_model': self.config.get('embedding_model'),
            # 384 matches the default all-MiniLM-L6-v2 model; if the config
            # overrides embedding_model this value may be stale — TODO confirm.
            'embedding_dim': 384,
            'vector_store': 'FAISS',
            'backend': 'LangChain + CPU',
            'status': 'ready'
        }