caarleexx commited on
Commit
cb3b8cb
·
verified ·
1 Parent(s): 2c57779

Upload 9 files

Browse files
Files changed (9) hide show
  1. .gitignore +1 -15
  2. Dockerfile +0 -1
  3. app.py +14 -103
  4. entrypoint.sh +6 -20
  5. filter_fields.py +19 -34
  6. query_engine.py +61 -154
  7. rag_builder.py +68 -79
  8. requirements.txt +6 -5
  9. setup.py +43 -135
.gitignore CHANGED
@@ -1,29 +1,15 @@
1
- # Python
2
  __pycache__/
3
  *.py[cod]
4
- *$py.class
5
- *.so
6
  .Python
7
  env/
8
  venv/
9
  *.egg-info/
10
- .pytest_cache/
11
-
12
- # ChromaDB
13
- chromadb/
14
  *.sqlite3
15
-
16
- # Temporários
17
  /tmp/
18
  *.tar.gz
19
  *.jsonl
20
  repo_git_temp/
21
-
22
- # IDE
23
  .vscode/
24
  .idea/
25
- *.swp
26
- *.swo
27
-
28
- # Logs
29
  *.log
 
 
1
  __pycache__/
2
  *.py[cod]
 
 
3
  .Python
4
  env/
5
  venv/
6
  *.egg-info/
7
+ faiss_index/
 
 
 
8
  *.sqlite3
 
 
9
  /tmp/
10
  *.tar.gz
11
  *.jsonl
12
  repo_git_temp/
 
 
13
  .vscode/
14
  .idea/
 
 
 
 
15
  *.log
Dockerfile CHANGED
@@ -30,4 +30,3 @@ EXPOSE 7860
30
 
31
  # Comando de inicialização
32
  CMD ["./entrypoint.sh"]
33
-
 
30
 
31
  # Comando de inicialização
32
  CMD ["./entrypoint.sh"]
 
app.py CHANGED
@@ -1,11 +1,5 @@
1
  #!/usr/bin/env python3
2
- """
3
- Para.AI RAG Cluster - FastAPI Application
4
- Inicia IMEDIATAMENTE (antes do setup terminar) para evitar timeout HF
5
- """
6
-
7
  from fastapi import FastAPI, HTTPException
8
- from fastapi.responses import JSONResponse
9
  from pydantic import BaseModel
10
  from typing import List, Optional
11
  import logging
@@ -16,73 +10,35 @@ from pathlib import Path
16
  logging.basicConfig(level=logging.INFO)
17
  logger = logging.getLogger(__name__)
18
 
19
- # ============================================================================
20
- # VERIFICAÇÃO DE STATUS DO SETUP
21
- # ============================================================================
22
-
23
  STATUS_FILE = Path('/tmp/setup_status.json')
24
- READY_FLAG = Path('/tmp/chromadb_ready')
25
 
26
  def get_setup_status():
27
- """Lê status do setup em background"""
28
  if not STATUS_FILE.exists():
29
- return {
30
- 'status': 'initializing',
31
- 'message': 'Setup ainda não iniciado',
32
- 'progress': 0
33
- }
34
-
35
  try:
36
  with open(STATUS_FILE) as f:
37
  return json.load(f)
38
  except:
39
- return {
40
- 'status': 'unknown',
41
- 'message': 'Erro ao ler status',
42
- 'progress': 0
43
- }
44
 
45
  def is_ready():
46
- """Verifica se ChromaDB está pronto"""
47
  return READY_FLAG.exists()
48
 
49
- # ============================================================================
50
- # LAZY LOADING DO QUERY ENGINE
51
- # ============================================================================
52
-
53
  query_engine = None
54
 
55
  def get_query_engine():
56
- """Carrega QueryEngine apenas quando ChromaDB estiver pronto"""
57
  global query_engine
58
-
59
  if query_engine is None:
60
  if not is_ready():
61
- raise HTTPException(
62
- status_code=503,
63
- detail="RAG ainda em construção. Tente novamente em alguns minutos."
64
- )
65
-
66
  logger.info("Carregando QueryEngine...")
67
  from query_engine import QueryEngine
68
  query_engine = QueryEngine()
69
  logger.info("✅ QueryEngine carregado!")
70
-
71
  return query_engine
72
 
73
- # ============================================================================
74
- # FASTAPI APP
75
- # ============================================================================
76
-
77
- app = FastAPI(
78
- title="Para.AI RAG Cluster",
79
- description="Micro-cluster RAG para jurisprudências do TJPR",
80
- version="1.0.0"
81
- )
82
-
83
- # ============================================================================
84
- # MODELS (Pydantic)
85
- # ============================================================================
86
 
87
  class EmbeddingSearchRequest(BaseModel):
88
  query: str
@@ -98,121 +54,76 @@ class IDSearchRequest(BaseModel):
98
  ids: List[str]
99
  return_embeddings: bool = False
100
 
101
- # ============================================================================
102
- # ENDPOINTS
103
- # ============================================================================
104
-
105
  @app.get("/")
106
  async def root():
107
- """Health check - SEMPRE responde (mesmo durante setup)"""
108
  setup_status = get_setup_status()
109
  ready = is_ready()
110
 
111
- response = {
112
- "status": "online",
113
- "rag_ready": ready,
114
- "setup": setup_status
115
- }
116
 
117
  if ready and query_engine:
118
- response["cluster_id"] = query_engine.config['cluster_id']
119
- response["chunk_range"] = [
120
- query_engine.config['chunk_start'],
121
- query_engine.config['chunk_end']
122
- ]
123
- response["endpoints"] = [
124
- "/search/embedding",
125
- "/search/keywords",
126
- "/search/by_id",
127
- "/cluster/info",
128
- "/setup/status"
129
- ]
130
 
131
  return response
132
 
133
  @app.get("/setup/status")
134
  async def setup_status():
135
- """Retorna status detalhado do setup"""
136
  return get_setup_status()
137
 
138
  @app.get("/health")
139
  async def health():
140
- """Health check simples para HF Spaces"""
141
  return {"status": "ok", "timestamp": time.time()}
142
 
143
  @app.post("/search/embedding")
144
  async def search_embedding(request: EmbeddingSearchRequest):
145
- """Busca por similaridade semântica (embeddings)"""
146
- engine = get_query_engine() # Lança 503 se não estiver pronto
147
-
148
  try:
149
  start = time.time()
150
- results = engine.search_by_embedding(
151
- query=request.query,
152
- top_k=request.top_k,
153
- return_embeddings=request.return_embeddings
154
- )
155
  results['query_time_ms'] = round((time.time() - start) * 1000, 2)
156
  return results
157
  except Exception as e:
158
- logger.error(f"Erro em search_embedding: {e}")
159
  raise HTTPException(status_code=500, detail=str(e))
160
 
161
  @app.post("/search/keywords")
162
  async def search_keywords(request: KeywordSearchRequest):
163
- """Busca por termos-chave (full-text search)"""
164
  engine = get_query_engine()
165
-
166
  try:
167
  start = time.time()
168
- results = engine.search_by_keywords(
169
- keywords=request.keywords,
170
- operator=request.operator,
171
- top_k=request.top_k
172
- )
173
  results['query_time_ms'] = round((time.time() - start) * 1000, 2)
174
  return results
175
  except Exception as e:
176
- logger.error(f"Erro em search_keywords: {e}")
177
  raise HTTPException(status_code=500, detail=str(e))
178
 
179
  @app.post("/search/by_id")
180
  async def search_by_id(request: IDSearchRequest):
181
- """Busca direta por ID(s)"""
182
  engine = get_query_engine()
183
-
184
  try:
185
  start = time.time()
186
- results = engine.search_by_ids(
187
- ids=request.ids,
188
- return_embeddings=request.return_embeddings
189
- )
190
  results['query_time_ms'] = round((time.time() - start) * 1000, 2)
191
  return results
192
  except Exception as e:
193
- logger.error(f"Erro em search_by_id: {e}")
194
  raise HTTPException(status_code=500, detail=str(e))
195
 
196
  @app.get("/cluster/info")
197
  async def cluster_info():
198
- """Informações detalhadas do cluster"""
199
  engine = get_query_engine()
200
-
201
  try:
202
  info = engine.get_cluster_info()
203
  info['uptime_seconds'] = round(time.time() - app.state.start_time, 2)
204
  return info
205
  except Exception as e:
206
- logger.error(f"Erro em cluster_info: {e}")
207
  raise HTTPException(status_code=500, detail=str(e))
208
 
209
  @app.on_event("startup")
210
  async def startup_event():
211
- """Evento de startup - RÁPIDO (não aguarda setup)"""
212
  app.state.start_time = time.time()
213
  logger.info("="*80)
214
- logger.info("🚀 Para.AI RAG Cluster FastAPI ONLINE")
215
- logger.info("Setup em background: verificar /setup/status")
216
  logger.info("="*80)
217
 
218
  if __name__ == "__main__":
 
1
  #!/usr/bin/env python3
 
 
 
 
 
2
  from fastapi import FastAPI, HTTPException
 
3
  from pydantic import BaseModel
4
  from typing import List, Optional
5
  import logging
 
10
  logging.basicConfig(level=logging.INFO)
11
  logger = logging.getLogger(__name__)
12
 
 
 
 
 
13
  STATUS_FILE = Path('/tmp/setup_status.json')
14
+ READY_FLAG = Path('/tmp/faiss_ready')
15
 
16
  def get_setup_status():
 
17
  if not STATUS_FILE.exists():
18
+ return {'status': 'initializing', 'message': 'Setup não iniciado', 'progress': 0}
 
 
 
 
 
19
  try:
20
  with open(STATUS_FILE) as f:
21
  return json.load(f)
22
  except:
23
+ return {'status': 'unknown', 'message': 'Erro ao ler status', 'progress': 0}
 
 
 
 
24
 
25
  def is_ready():
 
26
  return READY_FLAG.exists()
27
 
 
 
 
 
28
  query_engine = None
29
 
30
  def get_query_engine():
 
31
  global query_engine
 
32
  if query_engine is None:
33
  if not is_ready():
34
+ raise HTTPException(status_code=503, detail="RAG em construção. Tente em alguns minutos.")
 
 
 
 
35
  logger.info("Carregando QueryEngine...")
36
  from query_engine import QueryEngine
37
  query_engine = QueryEngine()
38
  logger.info("✅ QueryEngine carregado!")
 
39
  return query_engine
40
 
41
+ app = FastAPI(title="Para.AI RAG Cluster (LangChain)", version="1.0.0")
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  class EmbeddingSearchRequest(BaseModel):
44
  query: str
 
54
  ids: List[str]
55
  return_embeddings: bool = False
56
 
 
 
 
 
57
  @app.get("/")
58
  async def root():
 
59
  setup_status = get_setup_status()
60
  ready = is_ready()
61
 
62
+ response = {"status": "online", "rag_ready": ready, "setup": setup_status, "backend": "LangChain + FAISS (CPU)"}
 
 
 
 
63
 
64
  if ready and query_engine:
65
+ response["cluster_id"] = query_engine.config.get('cluster_id')
66
+ response["chunk_range"] = [query_engine.config.get('chunk_start'), query_engine.config.get('chunk_end')]
 
 
 
 
 
 
 
 
 
 
67
 
68
  return response
69
 
70
  @app.get("/setup/status")
71
  async def setup_status():
 
72
  return get_setup_status()
73
 
74
  @app.get("/health")
75
  async def health():
 
76
  return {"status": "ok", "timestamp": time.time()}
77
 
78
  @app.post("/search/embedding")
79
  async def search_embedding(request: EmbeddingSearchRequest):
80
+ engine = get_query_engine()
 
 
81
  try:
82
  start = time.time()
83
+ results = engine.search_by_embedding(request.query, request.top_k, request.return_embeddings)
 
 
 
 
84
  results['query_time_ms'] = round((time.time() - start) * 1000, 2)
85
  return results
86
  except Exception as e:
87
+ logger.error(f"Erro: {e}")
88
  raise HTTPException(status_code=500, detail=str(e))
89
 
90
  @app.post("/search/keywords")
91
  async def search_keywords(request: KeywordSearchRequest):
 
92
  engine = get_query_engine()
 
93
  try:
94
  start = time.time()
95
+ results = engine.search_by_keywords(request.keywords, request.operator, request.top_k)
 
 
 
 
96
  results['query_time_ms'] = round((time.time() - start) * 1000, 2)
97
  return results
98
  except Exception as e:
 
99
  raise HTTPException(status_code=500, detail=str(e))
100
 
101
  @app.post("/search/by_id")
102
  async def search_by_id(request: IDSearchRequest):
 
103
  engine = get_query_engine()
 
104
  try:
105
  start = time.time()
106
+ results = engine.search_by_ids(request.ids, request.return_embeddings)
 
 
 
107
  results['query_time_ms'] = round((time.time() - start) * 1000, 2)
108
  return results
109
  except Exception as e:
 
110
  raise HTTPException(status_code=500, detail=str(e))
111
 
112
  @app.get("/cluster/info")
113
  async def cluster_info():
 
114
  engine = get_query_engine()
 
115
  try:
116
  info = engine.get_cluster_info()
117
  info['uptime_seconds'] = round(time.time() - app.state.start_time, 2)
118
  return info
119
  except Exception as e:
 
120
  raise HTTPException(status_code=500, detail=str(e))
121
 
122
  @app.on_event("startup")
123
  async def startup_event():
 
124
  app.state.start_time = time.time()
125
  logger.info("="*80)
126
+ logger.info("🚀 Para.AI RAG (LangChain + FAISS) ONLINE")
 
127
  logger.info("="*80)
128
 
129
  if __name__ == "__main__":
entrypoint.sh CHANGED
@@ -2,41 +2,27 @@
2
  set -e
3
 
4
  echo "=================================="
5
- echo "🚀 Para.AI RAG Cluster Startup"
6
  echo "=================================="
7
 
8
- # Ir para diretório da aplicação
9
  cd /home/user/app
10
 
11
- # ESTRATÉGIA: Iniciar setup em background PRIMEIRO, depois FastAPI
12
- # Isso evita timeout de inicialização do HF Spaces
13
-
14
  echo ""
15
  echo "1️⃣ Iniciando setup em background..."
16
- echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
17
-
18
- # Iniciar setup.py em background com output unbuffered (-u)
19
- # Redirecionar output para arquivo + tela
20
  python3 -u setup.py > /tmp/setup_output.log 2>&1 &
21
  SETUP_PID=$!
22
 
23
- echo "✅ Setup iniciado em background (PID: $SETUP_PID)"
24
- echo "📋 Logs em: /tmp/setup_output.log"
25
- echo "📊 Status em: /tmp/setup_status.json"
26
  echo ""
27
 
28
- # Esperar 2 segundos para setup criar arquivo de status
29
  sleep 2
30
 
31
  echo "2️⃣ Iniciando FastAPI..."
32
- echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
33
- echo "🎯 FastAPI estará online IMEDIATAMENTE"
34
- echo "🔧 RAG estará disponível quando setup terminar (~10-15 min)"
35
- echo "📡 Acompanhe em: /setup/status"
36
  echo ""
37
  echo "=================================="
38
- echo "🚀 Iniciando API REST..."
39
- echo "=================================="
40
 
41
- # Iniciar FastAPI (bloqueia aqui)
42
  exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1
 
2
  set -e
3
 
4
  echo "=================================="
5
+ echo "🚀 Para.AI RAG (LangChain) Startup"
6
  echo "=================================="
7
 
 
8
  cd /home/user/app
9
 
 
 
 
10
  echo ""
11
  echo "1️⃣ Iniciando setup em background..."
 
 
 
 
12
  python3 -u setup.py > /tmp/setup_output.log 2>&1 &
13
  SETUP_PID=$!
14
 
15
+ echo "✅ Setup PID: $SETUP_PID"
16
+ echo "📋 Logs: /tmp/setup_output.log"
 
17
  echo ""
18
 
 
19
  sleep 2
20
 
21
  echo "2️⃣ Iniciando FastAPI..."
22
+ echo "🎯 API online IMEDIATAMENTE"
23
+ echo "🔧 RAG disponível quando setup terminar (~10-15min)"
24
+ echo "📡 Acompanhe: /setup/status"
 
25
  echo ""
26
  echo "=================================="
 
 
27
 
 
28
  exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1
filter_fields.py CHANGED
@@ -1,44 +1,29 @@
1
  #!/usr/bin/env python3
2
- """
3
- Filtrar campos de JSONL mantendo apenas os especificados
4
- """
5
  import json
6
- import yaml
7
- from pathlib import Path
8
  import argparse
9
- from tqdm import tqdm
10
-
11
- def filter_jsonl(input_path: str, output_path: str, keep_fields: list = None):
12
- """Filtra campos de arquivo JSONL"""
13
-
14
- # Carregar campos da config se não especificados
15
- if keep_fields is None:
16
- with open('config.yaml') as f:
17
- config = yaml.safe_load(f)
18
- keep_fields = config['campos_filter']
19
-
20
- print(f"📥 Input: {input_path}")
21
- print(f"📤 Output: {output_path}")
22
- print(f"🔧 Mantendo campos: {keep_fields}")
23
-
24
- # Contar linhas
25
- with open(input_path) as f:
26
- total = sum(1 for _ in f)
27
-
28
- # Filtrar
29
- with open(input_path) as fin, open(output_path, 'w') as fout:
30
- for line in tqdm(fin, total=total, desc="Filtrando"):
31
- record = json.loads(line)
32
- filtered = {k: record[k] for k in keep_fields if k in record}
33
- fout.write(json.dumps(filtered, ensure_ascii=False) + '\n')
34
 
35
- print(f"✅ {total} registros filtrados!")
 
 
 
 
 
 
 
36
 
37
- if __name__ == "__main__":
38
  parser = argparse.ArgumentParser()
39
  parser.add_argument('--input', required=True)
40
  parser.add_argument('--output', required=True)
41
- parser.add_argument('--keep', nargs='+', default=None)
42
  args = parser.parse_args()
43
 
44
- filter_jsonl(args.input, args.output, args.keep)
 
 
 
 
 
 
 
 
1
  #!/usr/bin/env python3
 
 
 
2
  import json
 
 
3
  import argparse
4
+ import yaml
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
+ def filter_jsonl(input_file, output_file, fields_to_keep):
7
+ with open(input_file, 'r', encoding='utf-8') as fin:
8
+ with open(output_file, 'w', encoding='utf-8') as fout:
9
+ for line in fin:
10
+ if line.strip():
11
+ record = json.loads(line)
12
+ filtered = {k: record.get(k) for k in fields_to_keep if k in record}
13
+ fout.write(json.dumps(filtered, ensure_ascii=False) + '\n')
14
 
15
+ def main():
16
  parser = argparse.ArgumentParser()
17
  parser.add_argument('--input', required=True)
18
  parser.add_argument('--output', required=True)
19
+ parser.add_argument('--config', default='config.yaml')
20
  args = parser.parse_args()
21
 
22
+ with open(args.config) as f:
23
+ config = yaml.safe_load(f)
24
+
25
+ filter_jsonl(args.input, args.output, config['campos_filter'])
26
+ print(f"✅ Filtrado: {args.output}")
27
+
28
+ if __name__ == '__main__':
29
+ main()
query_engine.py CHANGED
@@ -1,184 +1,91 @@
1
  #!/usr/bin/env python3
2
- """
3
- Engine de busca para ChromaDB
4
- """
5
  import yaml
6
- import chromadb
7
- from sentence_transformers import SentenceTransformer
8
- from typing import List, Dict, Optional
9
  import logging
 
10
 
 
 
 
 
11
  logger = logging.getLogger(__name__)
12
 
13
  class QueryEngine:
14
- """Engine de busca com ChromaDB"""
 
15
 
16
- def __init__(self, config_path: str = 'config.yaml'):
17
- # Carregar config
18
  with open(config_path) as f:
19
  self.config = yaml.safe_load(f)
20
 
21
- # Carregar modelo de embedding
22
- logger.info(f"Carregando modelo {self.config['embedding_model']}...")
23
- self.model = SentenceTransformer(self.config['embedding_model'])
24
-
25
- # Conectar ao ChromaDB
26
- logger.info(f"Conectando ao ChromaDB...")
27
- self.client = chromadb.PersistentClient(path=self.config['chromadb_path'])
28
- self.collection = self.client.get_collection(self.config['collection_name'])
29
-
30
- logger.info(f"✅ QueryEngine pronto ({self.collection.count():,} registros)")
31
-
32
- def search_by_embedding(
33
- self,
34
- query: str,
35
- top_k: int = 10,
36
- return_embeddings: bool = False
37
- ) -> Dict:
38
- """Busca por similaridade semântica"""
39
-
40
- # Gerar embedding da query
41
- query_embedding = self.model.encode(query).tolist()
42
-
43
- # Buscar no ChromaDB
44
- results = self.collection.query(
45
- query_embeddings=[query_embedding],
46
- n_results=top_k,
47
- include=['documents', 'metadatas', 'distances', 'embeddings'] if return_embeddings
48
- else ['documents', 'metadatas', 'distances']
49
- )
50
 
51
- # Formatar resposta
52
- formatted_results = []
53
- for i in range(len(results['ids'][0])):
54
- result = {
55
- 'id': results['ids'][0][i],
56
- 'ementa': results['documents'][0][i],
57
- 'distance': results['distances'][0][i],
58
- 'score': 1.0 - results['distances'][0][i] # Converter distância para score
59
- }
60
 
61
- if return_embeddings and 'embeddings' in results:
62
- result['embedding'] = results['embeddings'][0][i]
63
 
64
- formatted_results.append(result)
 
 
 
 
65
 
66
- return {
67
- 'cluster_id': self.config['cluster_id'],
68
- 'chunk_range': [self.config['chunk_start'], self.config['chunk_end']],
69
- 'results': formatted_results,
70
- 'total_found': len(formatted_results)
71
- }
72
 
73
- def search_by_keywords(
74
- self,
75
- keywords: List[str],
76
- operator: str = 'AND',
77
- top_k: int = 20
78
- ) -> Dict:
79
- """Busca por termos-chave (full-text search)"""
80
-
81
- # Construir query string
82
- if operator.upper() == 'AND':
83
- query_str = ' '.join(keywords)
84
- else: # OR
85
- query_str = '|'.join(keywords)
86
-
87
- # Buscar usando where_document (full-text search do ChromaDB)
88
- results = self.collection.query(
89
- query_texts=[query_str],
90
- n_results=top_k,
91
- include=['documents', 'metadatas']
92
- )
93
 
94
- # Formatar resposta
95
- formatted_results = []
96
- for i in range(len(results['ids'][0])):
97
- # Verificar quais keywords foram matchadas
98
- doc = results['documents'][0][i].lower()
99
- matched = [kw for kw in keywords if kw.lower() in doc]
100
-
101
- formatted_results.append({
102
- 'id': results['ids'][0][i],
103
- 'ementa': results['documents'][0][i],
104
- 'matched_keywords': matched
105
  })
106
 
107
  return {
108
- 'cluster_id': self.config['cluster_id'],
109
- 'results': formatted_results,
110
- 'total_found': len(formatted_results)
 
111
  }
112
 
113
- def search_by_ids(
114
- self,
115
- ids: List[str],
116
- return_embeddings: bool = False
117
- ) -> Dict:
118
- """Busca direta por ID(s)"""
119
-
120
- # Buscar por IDs
121
- try:
122
- results = self.collection.get(
123
- ids=ids,
124
- include=['documents', 'metadatas', 'embeddings'] if return_embeddings
125
- else ['documents', 'metadatas']
126
- )
127
- except Exception as e:
128
- logger.error(f"Erro ao buscar IDs: {e}")
129
- return {
130
- 'cluster_id': self.config['cluster_id'],
131
- 'results': [],
132
- 'not_found': ids,
133
- 'total_found': 0
134
- }
135
-
136
- # Formatar resposta
137
- formatted_results = []
138
- found_ids = set(results['ids'])
139
-
140
- for i in range(len(results['ids'])):
141
- result = {
142
- 'id': results['ids'][i],
143
- 'ementa': results['documents'][i]
144
- }
145
-
146
- if return_embeddings and 'embeddings' in results:
147
- result['embedding'] = results['embeddings'][i]
148
-
149
- formatted_results.append(result)
150
-
151
- # IDs não encontrados
152
- not_found = [id for id in ids if id not in found_ids]
153
 
154
  return {
155
- 'cluster_id': self.config['cluster_id'],
156
- 'results': formatted_results,
157
- 'not_found': not_found,
158
- 'total_found': len(formatted_results)
159
  }
160
 
161
  def get_cluster_info(self) -> Dict:
162
- """Retorna informações do cluster"""
163
- import os
164
-
165
- # Calcular tamanho do ChromaDB
166
- db_path = self.config['chromadb_path']
167
- total_size = 0
168
- for dirpath, dirnames, filenames in os.walk(db_path):
169
- for f in filenames:
170
- fp = os.path.join(dirpath, f)
171
- total_size += os.path.getsize(fp)
172
-
173
- db_size_mb = total_size / (1024 * 1024)
174
-
175
  return {
176
- 'cluster_id': self.config['cluster_id'],
177
- 'chunk_range': [self.config['chunk_start'], self.config['chunk_end']],
178
- 'total_records': self.collection.count(),
179
- 'embedding_model': self.config['embedding_model'],
180
- 'embedding_dim': self.config['embedding_dim'],
181
- 'campos_disponiveis': self.config['campos_filter'],
182
- 'db_size_mb': round(db_size_mb, 2),
183
  'status': 'ready'
184
  }
 
1
  #!/usr/bin/env python3
 
 
 
2
  import yaml
 
 
 
3
  import logging
4
+ from typing import List, Dict
5
 
6
+ from langchain_community.embeddings import HuggingFaceEmbeddings
7
+ from langchain_community.vectorstores import FAISS
8
+
9
+ logging.basicConfig(level=logging.INFO)
10
  logger = logging.getLogger(__name__)
11
 
12
  class QueryEngine:
13
+ def __init__(self, config_path='config.yaml'):
14
+ logger.info("Inicializando QueryEngine...")
15
 
 
 
16
  with open(config_path) as f:
17
  self.config = yaml.safe_load(f)
18
 
19
+ model_name = self.config.get('embedding_model', 'all-MiniLM-L6-v2')
20
+ logger.info(f"Modelo: {model_name}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
+ self.embeddings = HuggingFaceEmbeddings(
23
+ model_name=model_name,
24
+ model_kwargs={'device': 'cpu'}
25
+ )
 
 
 
 
 
26
 
27
+ faiss_path = self.config.get('faiss_path', '/app/faiss_index')
28
+ logger.info(f"Carregando FAISS de: {faiss_path}")
29
 
30
+ self.vectorstore = FAISS.load_local(
31
+ faiss_path,
32
+ self.embeddings,
33
+ allow_dangerous_deserialization=True
34
+ )
35
 
36
+ logger.info("✅ QueryEngine pronto!")
 
 
 
 
 
37
 
38
+ def search_by_embedding(self, query: str, top_k: int = 10, return_embeddings: bool = False) -> Dict:
39
+ results = self.vectorstore.similarity_search_with_score(query, k=top_k)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
+ formatted = []
42
+ for doc, score in results:
43
+ formatted.append({
44
+ 'id': doc.metadata.get('id'),
45
+ 'ementa': doc.page_content,
46
+ 'score': float(score),
47
+ 'metadata': doc.metadata
 
 
 
 
48
  })
49
 
50
  return {
51
+ 'cluster_id': self.config.get('cluster_id'),
52
+ 'query': query,
53
+ 'total_results': len(formatted),
54
+ 'results': formatted
55
  }
56
 
57
+ def search_by_keywords(self, keywords: List[str], operator: str = 'AND', top_k: int = 20) -> Dict:
58
+ query = ' '.join(keywords)
59
+ return self.search_by_embedding(query, top_k)
60
+
61
+ def search_by_ids(self, ids: List[str], return_embeddings: bool = False) -> Dict:
62
+ # FAISS não tem busca direta por ID - implementação simplificada
63
+ all_docs = self.vectorstore.similarity_search("", k=10000)
64
+
65
+ results = []
66
+ for doc in all_docs:
67
+ if doc.metadata.get('id') in ids:
68
+ results.append({
69
+ 'id': doc.metadata.get('id'),
70
+ 'ementa': doc.page_content,
71
+ 'metadata': doc.metadata
72
+ })
73
+ if len(results) >= len(ids):
74
+ break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  return {
77
+ 'cluster_id': self.config.get('cluster_id'),
78
+ 'total_results': len(results),
79
+ 'results': results
 
80
  }
81
 
82
  def get_cluster_info(self) -> Dict:
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  return {
84
+ 'cluster_id': self.config.get('cluster_id'),
85
+ 'chunk_range': [self.config.get('chunk_start'), self.config.get('chunk_end')],
86
+ 'embedding_model': self.config.get('embedding_model'),
87
+ 'embedding_dim': 384,
88
+ 'vector_store': 'FAISS',
89
+ 'backend': 'LangChain + CPU',
 
90
  'status': 'ready'
91
  }
rag_builder.py CHANGED
@@ -1,105 +1,94 @@
1
  #!/usr/bin/env python3
2
  """
3
- Constrói ChromaDB com embeddings a partir de JSONL filtrado
 
4
  """
 
 
 
5
  import json
6
- import yaml
7
- from pathlib import Path
8
  import argparse
9
- import chromadb
10
- from sentence_transformers import SentenceTransformer
11
- from tqdm import tqdm
12
  import logging
13
 
 
 
 
 
14
  logging.basicConfig(level=logging.INFO)
15
  logger = logging.getLogger(__name__)
16
 
17
- def build_chromadb(input_jsonl: str, config_path: str = 'config.yaml'):
18
- """Constrói ChromaDB a partir de JSONL"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
- # Carregar config
21
- with open(config_path) as f:
22
- config = yaml.safe_load(f)
23
 
 
24
  logger.info("="*80)
25
- logger.info("🔧 CONSTRUINDO CHROMADB")
26
  logger.info("="*80)
27
- logger.info(f"Cluster ID: {config['cluster_id']}")
28
- logger.info(f"Chunks: {config['chunk_start']} - {config['chunk_end']}")
29
- logger.info(f"Embedding Model: {config['embedding_model']}")
30
-
31
- # Carregar modelo de embedding
32
- logger.info("\n📥 Carregando modelo de embedding...")
33
- model = SentenceTransformer(config['embedding_model'])
34
- logger.info(f"✅ Modelo carregado (dim={config['embedding_dim']})")
35
-
36
- # Inicializar ChromaDB
37
- logger.info(f"\n💾 Inicializando ChromaDB em {config['chromadb_path']}...")
38
- client = chromadb.PersistentClient(path=config['chromadb_path'])
39
-
40
- # Criar/obter collection
41
- try:
42
- collection = client.get_collection(config['collection_name'])
43
- logger.info(f"⚠️ Collection '{config['collection_name']}' já existe! Apagando...")
44
- client.delete_collection(config['collection_name'])
45
- except:
46
- pass
47
-
48
- collection = client.create_collection(
49
- name=config['collection_name'],
50
- metadata={
51
- "cluster_id": config['cluster_id'],
52
- "chunk_start": config['chunk_start'],
53
- "chunk_end": config['chunk_end']
54
- }
55
- )
56
- logger.info(f"✅ Collection criada")
57
 
58
- # Carregar registros
59
- logger.info(f"\n📖 Carregando registros de {input_jsonl}...")
60
- records = []
61
- with open(input_jsonl) as f:
62
- for line in f:
63
- records.append(json.loads(line))
64
-
65
- total = len(records)
66
- logger.info(f"✅ {total:,} registros carregados")
67
-
68
- # Processar em batches
69
- batch_size = config['embedding_batch_size']
70
- logger.info(f"\n🚀 Gerando embeddings em batches de {batch_size}...")
71
 
72
- for i in tqdm(range(0, total, batch_size), desc="Embedding"):
73
- batch = records[i:i+batch_size]
 
74
 
75
- # IDs
76
- ids = [str(r['id']) for r in batch]
 
77
 
78
- # Documentos (usar ementa para embedding)
79
- documents = [r.get('ementa', '') for r in batch]
 
 
 
 
80
 
81
- # Metadatas
82
- metadatas = [{'id': r['id']} for r in batch]
83
 
84
- # Gerar embeddings
85
- embeddings = model.encode(documents, show_progress_bar=False).tolist()
 
86
 
87
- # Adicionar ao ChromaDB
88
- collection.add(
89
- ids=ids,
90
- embeddings=embeddings,
91
- documents=documents,
92
- metadatas=metadatas
93
- )
94
 
95
- logger.info(f"\n✅ ChromaDB construído com sucesso!")
96
- logger.info(f"📊 Total de registros: {collection.count():,}")
97
- logger.info("="*80)
98
 
99
- if __name__ == "__main__":
100
  parser = argparse.ArgumentParser()
101
- parser.add_argument('--input', required=True, help='JSONL filtrado')
102
- parser.add_argument('--config', default='config.yaml')
 
 
103
  args = parser.parse_args()
104
 
105
- build_chromadb(args.input, args.config)
 
 
 
 
1
  #!/usr/bin/env python3
2
  """
3
+ RAG Builder usando LangChain + HuggingFaceEmbeddings (CPU)
4
+ Constrói FAISS vector store a partir de JSONL filtrado
5
  """
6
+
7
+ import os
8
+ import sys
9
  import json
 
 
10
  import argparse
11
+ from pathlib import Path
12
+ from typing import List, Dict
 
13
  import logging
14
 
15
+ from langchain.docstore.document import Document
16
+ from langchain_community.embeddings import HuggingFaceEmbeddings
17
+ from langchain_community.vectorstores import FAISS
18
+
19
  logging.basicConfig(level=logging.INFO)
20
  logger = logging.getLogger(__name__)
21
 
22
+ def load_jsonl(filepath: str) -> List[Dict]:
23
+ records = []
24
+ with open(filepath, 'r', encoding='utf-8') as f:
25
+ for line in f:
26
+ if line.strip():
27
+ records.append(json.loads(line))
28
+ return records
29
+
30
+ def create_documents(records: List[Dict]) -> List[Document]:
31
+ documents = []
32
+ for record in records:
33
+ doc_id = record.get('id', 'unknown')
34
+ ementa = record.get('ementa', '')
35
+
36
+ if not ementa:
37
+ continue
38
+
39
+ doc = Document(
40
+ page_content=ementa,
41
+ metadata={'id': doc_id, 'source': 'tjpr'}
42
+ )
43
+ documents.append(doc)
44
 
45
+ return documents
 
 
46
 
47
+ def build_vectorstore(input_file, output_dir='/app/faiss_index', model_name='all-MiniLM-L6-v2', batch_size=64):
48
  logger.info("="*80)
49
+ logger.info("🚀 RAG Builder - LangChain + FAISS (CPU)")
50
  logger.info("="*80)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
+ logger.info(f"\n📂 Carregando {input_file}...")
53
+ records = load_jsonl(input_file)
54
+ logger.info(f"✅ {len(records):,} registros")
 
 
 
 
 
 
 
 
 
 
55
 
56
+ logger.info("\n📄 Criando Documents...")
57
+ documents = create_documents(records)
58
+ logger.info(f"✅ {len(documents):,} documentos")
59
 
60
+ if not documents:
61
+ logger.error("❌ Nenhum documento válido!")
62
+ sys.exit(1)
63
 
64
+ logger.info(f"\n🤖 Criando embeddings com {model_name} (CPU)...")
65
+ embeddings = HuggingFaceEmbeddings(
66
+ model_name=model_name,
67
+ model_kwargs={'device': 'cpu'},
68
+ encode_kwargs={'batch_size': batch_size, 'show_progress_bar': True}
69
+ )
70
 
71
+ logger.info("\n🔍 Construindo FAISS index...")
72
+ vectorstore = FAISS.from_documents(documents, embeddings)
73
 
74
+ logger.info(f"\n💾 Salvando em {output_dir}...")
75
+ os.makedirs(output_dir, exist_ok=True)
76
+ vectorstore.save_local(output_dir)
77
 
78
+ logger.info("\n✅ FAISS INDEX CRIADO!")
79
+ logger.info(f"📊 {len(documents):,} documentos indexados")
 
 
 
 
 
80
 
81
+ return vectorstore
 
 
82
 
83
+ def main():
84
  parser = argparse.ArgumentParser()
85
+ parser.add_argument('--input', required=True)
86
+ parser.add_argument('--output', default='/app/faiss_index')
87
+ parser.add_argument('--model', default='all-MiniLM-L6-v2')
88
+ parser.add_argument('--batch-size', type=int, default=64)
89
  args = parser.parse_args()
90
 
91
+ build_vectorstore(args.input, args.output, args.model, args.batch_size)
92
+
93
+ if __name__ == '__main__':
94
+ main()
requirements.txt CHANGED
@@ -3,12 +3,13 @@ fastapi==0.109.0
3
  uvicorn[standard]==0.27.0
4
  pydantic==2.5.0
5
 
6
- # RAG / Embeddings - VERSÕES COMPATÍVEIS TESTADAS
7
- torch==2.2.0
8
- safetensors==0.4.2
9
- transformers==4.37.2
10
  sentence-transformers==2.5.1
11
- chromadb==0.4.22
 
 
12
 
13
  # Utilities
14
  PyYAML==6.0.1
 
3
  uvicorn[standard]==0.27.0
4
  pydantic==2.5.0
5
 
6
+ # LangChain + Embeddings (CPU-only)
7
+ langchain==0.1.11
8
+ langchain-community==0.0.24
 
9
  sentence-transformers==2.5.1
10
+
11
+ # Vector Store
12
+ faiss-cpu==1.8.0
13
 
14
  # Utilities
15
  PyYAML==6.0.1
setup.py CHANGED
@@ -1,8 +1,4 @@
1
  #!/usr/bin/env python3
2
- """
3
- Setup em background - Clona dados, constrói ChromaDB
4
- Executa enquanto FastAPI já está respondendo (evita timeout HF)
5
- """
6
  import os
7
  import sys
8
  import yaml
@@ -12,64 +8,34 @@ import logging
12
  from pathlib import Path
13
  from datetime import datetime
14
 
15
- # Setup logging com flush imediato
16
- logging.basicConfig(
17
- level=logging.INFO,
18
- format='%(asctime)s - %(levelname)s - %(message)s',
19
- handlers=[
20
- logging.StreamHandler(sys.stdout),
21
- logging.FileHandler('/tmp/setup.log')
22
- ]
23
- )
24
  logger = logging.getLogger(__name__)
25
 
26
- # Forçar flush imediato
27
- for handler in logger.handlers:
28
- handler.flush = lambda: None
29
-
30
  STATUS_FILE = Path('/tmp/setup_status.json')
31
- READY_FLAG = Path('/tmp/chromadb_ready')
32
-
33
- def update_status(status: str, message: str, progress: int = 0):
34
- """Atualiza arquivo de status para app.py ler"""
35
- data = {
36
- 'status': status,
37
- 'message': message,
38
- 'progress': progress,
39
- 'timestamp': datetime.now().isoformat()
40
- }
41
  with open(STATUS_FILE, 'w') as f:
42
  json.dump(data, f)
43
  logger.info(f"[{progress}%] {status}: {message}")
44
  sys.stdout.flush()
45
 
46
- def run_command(cmd: str, description: str):
47
- """Executa comando shell com logging"""
48
- logger.info(f"Executando: {description}")
49
- logger.info(f"Comando: {cmd}")
50
-
51
- result = subprocess.run(
52
- cmd,
53
- shell=True,
54
- capture_output=True,
55
- text=True
56
- )
57
-
58
  if result.returncode != 0:
59
  logger.error(f"ERRO: {result.stderr}")
60
- raise Exception(f"{description} falhou: {result.stderr}")
61
-
62
- logger.info(f"✅ {description} completo")
63
  return result.stdout
64
 
65
  def main():
66
- """Setup completo em background"""
67
  try:
68
  logger.info("="*80)
69
- logger.info("🚀 PARA.AI RAG CLUSTER - SETUP EM BACKGROUND")
70
  logger.info("="*80)
71
 
72
- # Carregar configuração
73
  update_status('loading', 'Carregando configuração', 0)
74
  with open('config.yaml') as f:
75
  config = yaml.safe_load(f)
@@ -79,120 +45,62 @@ def main():
79
  chunk_end = config['chunk_end']
80
  github_repo = config['github_repo']
81
 
82
- logger.info(f"Cluster: {cluster_id}")
83
- logger.info(f"Chunks: {chunk_start} - {chunk_end}")
84
- logger.info("")
85
-
86
- # Verificar se ChromaDB já existe
87
  if READY_FLAG.exists():
88
- logger.info("✅ ChromaDB já pronto! Pulando setup...")
89
- update_status('ready', 'ChromaDB já existe', 100)
90
  return
91
 
92
- # ETAPA 1: Git Sparse Checkout
93
- update_status('cloning', 'Clonando chunks do GitHub (sparse checkout)', 10)
94
-
95
  os.makedirs('/tmp/repo', exist_ok=True)
96
  os.chdir('/tmp/repo')
97
 
98
- # Clone inicial
99
- run_command(
100
- f"git clone --filter=blob:none --sparse {github_repo} .",
101
- "Git clone inicial"
102
- )
103
-
104
- run_command(
105
- "git sparse-checkout init --cone",
106
- "Sparse checkout init"
107
- )
108
-
109
- # Gerar pattern de chunks
110
- logger.info(f"Gerando pattern para chunks {chunk_start}-{chunk_end}...")
111
- pattern_parts = []
112
- for i in range(chunk_start, chunk_end + 1):
113
- pattern_parts.append(f"chunks_dados/chunk_dados_{i:04d}.tar.gz")
114
-
115
- # Set sparse checkout (em batches para evitar arg list too long)
116
- batch_size = 50
117
- for i in range(0, len(pattern_parts), batch_size):
118
- batch = pattern_parts[i:i+batch_size]
119
- pattern = ' '.join(batch)
120
- run_command(
121
- f"git sparse-checkout add {pattern}",
122
- f"Sparse checkout batch {i//batch_size + 1}"
123
- )
124
-
125
- # Contar chunks clonados
126
- result = run_command(
127
- "find chunks_dados -name '*.tar.gz' 2>/dev/null | wc -l",
128
- "Contar chunks"
129
- )
130
- chunks_count = int(result.strip())
131
  logger.info(f"✅ {chunks_count} chunks clonados")
132
 
133
- # ETAPA 2: Descompactar
134
  update_status('extracting', f'Descompactando {chunks_count} chunks', 30)
135
-
136
  os.makedirs('/tmp/extracted', exist_ok=True)
 
137
 
138
- run_command(
139
- "find chunks_dados -name '*.tar.gz' -exec tar -xzf {} -C /tmp/extracted \; 2>/dev/null || true",
140
- "Descompactar chunks"
141
- )
142
-
143
- # ETAPA 3: Concatenar JSONL
144
- update_status('concatenating', 'Concatenando jurisprudencias.jsonl', 50)
145
-
146
- run_command(
147
- "find /tmp/extracted -name 'jurisprudencias.jsonl' -exec cat {} \; > /tmp/all_records.jsonl 2>/dev/null || true",
148
- "Concatenar JSONL"
149
- )
150
 
151
- # Contar registros
152
- result = run_command(
153
- "wc -l < /tmp/all_records.jsonl 2>/dev/null || echo '0'",
154
- "Contar registros"
155
- )
156
- total_records = int(result.strip())
157
- logger.info(f"✅ {total_records:,} registros concatenados")
158
 
159
- # ETAPA 4: Filtrar campos
160
  update_status('filtering', 'Filtrando campos (id + ementa)', 60)
161
-
162
  os.chdir('/home/user/app')
163
- run_command(
164
- "python3 filter_fields.py --input /tmp/all_records.jsonl --output /tmp/filtered.jsonl",
165
- "Filtrar campos"
166
- )
167
-
168
- # ETAPA 5: Build ChromaDB
169
- update_status('building', 'Construindo ChromaDB com embeddings (pode demorar)', 70)
170
-
171
- run_command(
172
- "python3 rag_builder.py --input /tmp/filtered.jsonl",
173
- "Build ChromaDB"
174
- )
175
 
176
- # ETAPA 6: Limpar temporários
177
- update_status('cleaning', 'Limpando arquivos temporários', 95)
 
178
 
179
- run_command(
180
- "rm -rf /tmp/repo /tmp/extracted /tmp/all_records.jsonl /tmp/filtered.jsonl",
181
- "Limpar temporários"
182
- )
183
 
184
- # ETAPA 7: Marcar como pronto
185
- update_status('ready', f'ChromaDB pronto com {total_records:,} registros!', 100)
186
  READY_FLAG.touch()
187
 
188
  logger.info("="*80)
189
- logger.info("✅ SETUP COMPLETO - RAG PRONTO PARA USO!")
190
  logger.info("="*80)
191
 
192
  except Exception as e:
193
- logger.error("="*80)
194
- logger.error(f"❌ ERRO NO SETUP: {e}")
195
- logger.error("="*80)
196
  update_status('error', str(e), 0)
197
  sys.exit(1)
198
 
 
1
  #!/usr/bin/env python3
 
 
 
 
2
  import os
3
  import sys
4
  import yaml
 
8
  from pathlib import Path
9
  from datetime import datetime
10
 
11
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 
 
 
 
 
 
 
12
  logger = logging.getLogger(__name__)
13
 
 
 
 
 
14
  STATUS_FILE = Path('/tmp/setup_status.json')
15
+ READY_FLAG = Path('/tmp/faiss_ready')
16
+
17
+ def update_status(status, message, progress=0):
18
+ data = {'status': status, 'message': message, 'progress': progress, 'timestamp': datetime.now().isoformat()}
 
 
 
 
 
 
19
  with open(STATUS_FILE, 'w') as f:
20
  json.dump(data, f)
21
  logger.info(f"[{progress}%] {status}: {message}")
22
  sys.stdout.flush()
23
 
24
+ def run_cmd(cmd, desc):
25
+ logger.info(f"Executando: {desc}")
26
+ result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
 
 
 
 
 
 
 
 
 
27
  if result.returncode != 0:
28
  logger.error(f"ERRO: {result.stderr}")
29
+ raise Exception(f"{desc} falhou")
30
+ logger.info(f"✅ {desc}")
 
31
  return result.stdout
32
 
33
  def main():
 
34
  try:
35
  logger.info("="*80)
36
+ logger.info("🚀 PARA.AI RAG (LangChain) - SETUP EM BACKGROUND")
37
  logger.info("="*80)
38
 
 
39
  update_status('loading', 'Carregando configuração', 0)
40
  with open('config.yaml') as f:
41
  config = yaml.safe_load(f)
 
45
  chunk_end = config['chunk_end']
46
  github_repo = config['github_repo']
47
 
 
 
 
 
 
48
  if READY_FLAG.exists():
49
+ logger.info("✅ FAISS já pronto!")
50
+ update_status('ready', 'FAISS já existe', 100)
51
  return
52
 
53
+ # CLONE
54
+ update_status('cloning', 'Clonando chunks (sparse checkout)', 10)
 
55
  os.makedirs('/tmp/repo', exist_ok=True)
56
  os.chdir('/tmp/repo')
57
 
58
+ run_cmd(f"git clone --filter=blob:none --sparse {github_repo} .", "Git clone")
59
+ run_cmd("git sparse-checkout init --cone", "Sparse checkout init")
60
+
61
+ patterns = [f"chunks_dados/chunk_dados_{i:04d}.tar.gz" for i in range(chunk_start, chunk_end + 1)]
62
+ for i in range(0, len(patterns), 50):
63
+ batch = ' '.join(patterns[i:i+50])
64
+ run_cmd(f"git sparse-checkout add {batch}", f"Batch {i//50 + 1}")
65
+
66
+ chunks_count = int(run_cmd("find chunks_dados -name '*.tar.gz' 2>/dev/null | wc -l", "Contar chunks").strip())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  logger.info(f"✅ {chunks_count} chunks clonados")
68
 
69
+ # EXTRACT
70
  update_status('extracting', f'Descompactando {chunks_count} chunks', 30)
 
71
  os.makedirs('/tmp/extracted', exist_ok=True)
72
+ run_cmd("find chunks_dados -name '*.tar.gz' -exec tar -xzf {} -C /tmp/extracted \; 2>/dev/null || true", "Descompactar")
73
 
74
+ # CONCAT
75
+ update_status('concatenating', 'Concatenando JSONL', 50)
76
+ run_cmd("find /tmp/extracted -name 'jurisprudencias.jsonl' -exec cat {} \; > /tmp/all_records.jsonl 2>/dev/null || true", "Concatenar")
 
 
 
 
 
 
 
 
 
77
 
78
+ total_records = int(run_cmd("wc -l < /tmp/all_records.jsonl 2>/dev/null || echo '0'", "Contar registros").strip())
79
+ logger.info(f"✅ {total_records:,} registros")
 
 
 
 
 
80
 
81
+ # FILTER
82
  update_status('filtering', 'Filtrando campos (id + ementa)', 60)
 
83
  os.chdir('/home/user/app')
84
+ run_cmd("python3 filter_fields.py --input /tmp/all_records.jsonl --output /tmp/filtered.jsonl", "Filtrar")
 
 
 
 
 
 
 
 
 
 
 
85
 
86
+ # BUILD FAISS
87
+ update_status('building', 'Construindo FAISS index (pode demorar)', 70)
88
+ run_cmd("python3 rag_builder.py --input /tmp/filtered.jsonl", "Build FAISS")
89
 
90
+ # CLEANUP
91
+ update_status('cleaning', 'Limpando temporários', 95)
92
+ run_cmd("rm -rf /tmp/repo /tmp/extracted /tmp/all_records.jsonl /tmp/filtered.jsonl", "Limpar")
 
93
 
94
+ # DONE
95
+ update_status('ready', f'FAISS pronto com {total_records:,} registros!', 100)
96
  READY_FLAG.touch()
97
 
98
  logger.info("="*80)
99
+ logger.info("✅ SETUP COMPLETO!")
100
  logger.info("="*80)
101
 
102
  except Exception as e:
103
+ logger.error(f"❌ ERRO: {e}")
 
 
104
  update_status('error', str(e), 0)
105
  sys.exit(1)
106