caarleexx commited on
Commit
e1a830c
·
verified ·
1 Parent(s): f0cb3e9

Upload 8 files

Browse files
Files changed (8) hide show
  1. Dockerfile +1 -1
  2. README.md +8 -5
  3. app.py +12 -8
  4. entrypoint.sh +4 -4
  5. query_engine.py +50 -10
  6. rag_builder.py +75 -20
  7. requirements.txt +16 -17
  8. setup.py +156 -284
Dockerfile CHANGED
@@ -1,6 +1,6 @@
1
  FROM python:3.11-slim
2
 
3
- RUN apt-get update && apt-get install -y git curl && rm -rf /var/lib/apt/lists/*
4
 
5
  RUN useradd -m -u 1000 user
6
  USER user
 
1
  FROM python:3.11-slim
2
 
3
+ RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
4
 
5
  RUN useradd -m -u 1000 user
6
  USER user
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Para.AI RAG Cluster DEBUG
3
  emoji: ⚖️
4
  colorFrom: blue
5
  colorTo: purple
@@ -7,12 +7,15 @@ sdk: docker
7
  pinned: false
8
  ---
9
 
10
- # ⚖️ Para.AI RAG (DEBUG VERSION)
11
 
12
- Versão com logs de depuração intensivos.
13
 
14
- ## Endpoints adicionais
15
 
16
- - `GET /setup/logs` - Ver logs completos do setup
 
 
 
17
 
18
  ⚖️ **InJustiça não para o Paraná!** 🐝
 
1
  ---
2
+ title: Para.AI RAG Cluster
3
  emoji: ⚖️
4
  colorFrom: blue
5
  colorTo: purple
 
7
  pinned: false
8
  ---
9
 
10
+ # ⚖️ Para.AI RAG v2.0 - Otimizado
11
 
12
+ Versão otimizada com download direto de chunks (sem git clone).
13
 
14
+ ## Melhorias
15
 
16
+ - wget/curl direto (não clona repo inteiro)
17
+ - ✅ Processa chunks um por vez (economiza espaço)
18
+ - ✅ Filtro de campos corrigido (id minúsculo)
19
+ - ✅ Cleanup automático de temporários
20
 
21
  ⚖️ **InJustiça não para o Paraná!** 🐝
app.py CHANGED
@@ -1,7 +1,7 @@
1
  #!/usr/bin/env python3
2
  from fastapi import FastAPI, HTTPException
3
  from pydantic import BaseModel
4
- from typing import List, Optional
5
  import logging
6
  import time
7
  import json
@@ -38,12 +38,11 @@ def get_query_engine():
38
  logger.info("✅ QueryEngine carregado!")
39
  return query_engine
40
 
41
- app = FastAPI(title="Para.AI RAG Cluster", version="1.0.0")
42
 
43
  class EmbeddingSearchRequest(BaseModel):
44
  query: str
45
  top_k: int = 10
46
- return_embeddings: bool = False
47
 
48
  class KeywordSearchRequest(BaseModel):
49
  keywords: List[str]
@@ -52,13 +51,18 @@ class KeywordSearchRequest(BaseModel):
52
 
53
  class IDSearchRequest(BaseModel):
54
  ids: List[str]
55
- return_embeddings: bool = False
56
 
57
  @app.get("/")
58
  async def root():
59
  setup_status = get_setup_status()
60
  ready = is_ready()
61
- response = {"status": "online", "rag_ready": ready, "setup": setup_status, "backend": "LangChain + FAISS (CPU)"}
 
 
 
 
 
 
62
  if ready and query_engine:
63
  response["cluster_id"] = query_engine.config.get('cluster_id')
64
  response["chunk_range"] = [query_engine.config.get('chunk_start'), query_engine.config.get('chunk_end')]
@@ -86,7 +90,7 @@ async def search_embedding(request: EmbeddingSearchRequest):
86
  engine = get_query_engine()
87
  try:
88
  start = time.time()
89
- results = engine.search_by_embedding(request.query, request.top_k, request.return_embeddings)
90
  results['query_time_ms'] = round((time.time() - start) * 1000, 2)
91
  return results
92
  except Exception as e:
@@ -109,7 +113,7 @@ async def search_by_id(request: IDSearchRequest):
109
  engine = get_query_engine()
110
  try:
111
  start = time.time()
112
- results = engine.search_by_ids(request.ids, request.return_embeddings)
113
  results['query_time_ms'] = round((time.time() - start) * 1000, 2)
114
  return results
115
  except Exception as e:
@@ -129,7 +133,7 @@ async def cluster_info():
129
  async def startup_event():
130
  app.state.start_time = time.time()
131
  logger.info("="*80)
132
- logger.info("🚀 Para.AI RAG ONLINE")
133
  logger.info("="*80)
134
 
135
  if __name__ == "__main__":
 
1
  #!/usr/bin/env python3
2
  from fastapi import FastAPI, HTTPException
3
  from pydantic import BaseModel
4
+ from typing import List
5
  import logging
6
  import time
7
  import json
 
38
  logger.info("✅ QueryEngine carregado!")
39
  return query_engine
40
 
41
+ app = FastAPI(title="Para.AI RAG Cluster", version="2.0.0")
42
 
43
  class EmbeddingSearchRequest(BaseModel):
44
  query: str
45
  top_k: int = 10
 
46
 
47
  class KeywordSearchRequest(BaseModel):
48
  keywords: List[str]
 
51
 
52
  class IDSearchRequest(BaseModel):
53
  ids: List[str]
 
54
 
55
  @app.get("/")
56
  async def root():
57
  setup_status = get_setup_status()
58
  ready = is_ready()
59
+ response = {
60
+ "status": "online",
61
+ "rag_ready": ready,
62
+ "setup": setup_status,
63
+ "backend": "LangChain + FAISS (CPU)",
64
+ "version": "2.0.0 - Otimizado"
65
+ }
66
  if ready and query_engine:
67
  response["cluster_id"] = query_engine.config.get('cluster_id')
68
  response["chunk_range"] = [query_engine.config.get('chunk_start'), query_engine.config.get('chunk_end')]
 
90
  engine = get_query_engine()
91
  try:
92
  start = time.time()
93
+ results = engine.search_by_embedding(request.query, request.top_k)
94
  results['query_time_ms'] = round((time.time() - start) * 1000, 2)
95
  return results
96
  except Exception as e:
 
113
  engine = get_query_engine()
114
  try:
115
  start = time.time()
116
+ results = engine.search_by_ids(request.ids)
117
  results['query_time_ms'] = round((time.time() - start) * 1000, 2)
118
  return results
119
  except Exception as e:
 
133
  async def startup_event():
134
  app.state.start_time = time.time()
135
  logger.info("="*80)
136
+ logger.info("🚀 Para.AI RAG v2.0 ONLINE")
137
  logger.info("="*80)
138
 
139
  if __name__ == "__main__":
entrypoint.sh CHANGED
@@ -1,7 +1,7 @@
1
  #!/bin/bash
2
  set -e
3
  echo "=========================================="
4
- echo "🚀 Para.AI RAG Startup"
5
  echo "=========================================="
6
  cd /home/user/app
7
  echo "1️⃣ Iniciando setup em background..."
@@ -9,8 +9,8 @@ python3 -u setup.py > /tmp/setup_output.log 2>&1 &
9
  echo "✅ Setup PID: $!"
10
  sleep 2
11
  echo "2️⃣ Iniciando FastAPI..."
12
- echo "🎯 API online"
13
- echo "📊 Status: /setup/status"
14
- echo "📋 Logs: /setup/logs"
15
  echo "=========================================="
16
  exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1
 
1
  #!/bin/bash
2
  set -e
3
  echo "=========================================="
4
+ echo "🚀 Para.AI RAG v2.0 Startup"
5
  echo "=========================================="
6
  cd /home/user/app
7
  echo "1️⃣ Iniciando setup em background..."
 
9
  echo "✅ Setup PID: $!"
10
  sleep 2
11
  echo "2️⃣ Iniciando FastAPI..."
12
+ echo "🎯 API online em http://0.0.0.0:7860"
13
+ echo "📊 Status: GET /setup/status"
14
+ echo "📋 Logs: GET /setup/logs"
15
  echo "=========================================="
16
  exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1
query_engine.py CHANGED
@@ -12,24 +12,64 @@ class QueryEngine:
12
  logger.info("Inicializando QueryEngine...")
13
  with open(config_path) as f:
14
  self.config = yaml.safe_load(f)
15
- model_name = self.config.get('embedding_model', 'sentence-transformers/all-MiniLM-L6-v2')
16
- self.embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={'device': 'cpu'})
 
 
 
17
  faiss_path = self.config.get('faiss_path', '/app/faiss_index')
18
- self.vectorstore = FAISS.load_local(faiss_path, self.embeddings, allow_dangerous_deserialization=True)
 
 
 
 
19
  logger.info("✅ QueryEngine pronto!")
20
 
21
- def search_by_embedding(self, query: str, top_k: int = 10, return_embeddings: bool = False) -> Dict:
22
  results = self.vectorstore.similarity_search_with_score(query, k=top_k)
23
- formatted = [{'id': doc.metadata.get('id'), 'ementa': doc.page_content, 'score': float(score), 'metadata': doc.metadata} for doc, score in results]
24
- return {'cluster_id': self.config.get('cluster_id'), 'query': query, 'total_results': len(formatted), 'results': formatted}
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  def search_by_keywords(self, keywords: List[str], operator: str = 'AND', top_k: int = 20) -> Dict:
27
  return self.search_by_embedding(' '.join(keywords), top_k)
28
 
29
- def search_by_ids(self, ids: List[str], return_embeddings: bool = False) -> Dict:
30
  all_docs = self.vectorstore.similarity_search("", k=10000)
31
- results = [{'id': doc.metadata.get('id'), 'ementa': doc.page_content, 'metadata': doc.metadata} for doc in all_docs if doc.metadata.get('id') in ids][:len(ids)]
32
- return {'cluster_id': self.config.get('cluster_id'), 'total_results': len(results), 'results': results}
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  def get_cluster_info(self) -> Dict:
35
- return {'cluster_id': self.config.get('cluster_id'), 'chunk_range': [self.config.get('chunk_start'), self.config.get('chunk_end')], 'embedding_model': self.config.get('embedding_model'), 'embedding_dim': 384, 'vector_store': 'FAISS', 'backend': 'LangChain + CPU', 'status': 'ready'}
 
 
 
 
 
 
 
 
 
12
  logger.info("Inicializando QueryEngine...")
13
  with open(config_path) as f:
14
  self.config = yaml.safe_load(f)
15
+ model_name = self.config.get('embedding_model')
16
+ self.embeddings = HuggingFaceEmbeddings(
17
+ model_name=model_name,
18
+ model_kwargs={'device': 'cpu'}
19
+ )
20
  faiss_path = self.config.get('faiss_path', '/app/faiss_index')
21
+ self.vectorstore = FAISS.load_local(
22
+ faiss_path,
23
+ self.embeddings,
24
+ allow_dangerous_deserialization=True
25
+ )
26
  logger.info("✅ QueryEngine pronto!")
27
 
28
+ def search_by_embedding(self, query: str, top_k: int = 10) -> Dict:
29
  results = self.vectorstore.similarity_search_with_score(query, k=top_k)
30
+ formatted = [
31
+ {
32
+ 'id': doc.metadata.get('id'),
33
+ 'ementa': doc.page_content,
34
+ 'score': float(score),
35
+ 'metadata': doc.metadata
36
+ }
37
+ for doc, score in results
38
+ ]
39
+ return {
40
+ 'cluster_id': self.config.get('cluster_id'),
41
+ 'query': query,
42
+ 'total_results': len(formatted),
43
+ 'results': formatted
44
+ }
45
 
46
  def search_by_keywords(self, keywords: List[str], operator: str = 'AND', top_k: int = 20) -> Dict:
47
  return self.search_by_embedding(' '.join(keywords), top_k)
48
 
49
+ def search_by_ids(self, ids: List[str]) -> Dict:
50
  all_docs = self.vectorstore.similarity_search("", k=10000)
51
+ results = [
52
+ {
53
+ 'id': doc.metadata.get('id'),
54
+ 'ementa': doc.page_content,
55
+ 'metadata': doc.metadata
56
+ }
57
+ for doc in all_docs
58
+ if doc.metadata.get('id') in ids
59
+ ][:len(ids)]
60
+ return {
61
+ 'cluster_id': self.config.get('cluster_id'),
62
+ 'total_results': len(results),
63
+ 'results': results
64
+ }
65
 
66
  def get_cluster_info(self) -> Dict:
67
+ return {
68
+ 'cluster_id': self.config.get('cluster_id'),
69
+ 'chunk_range': [self.config.get('chunk_start'), self.config.get('chunk_end')],
70
+ 'embedding_model': self.config.get('embedding_model'),
71
+ 'embedding_dim': 384,
72
+ 'vector_store': 'FAISS',
73
+ 'backend': 'LangChain + CPU',
74
+ 'status': 'ready'
75
+ }
rag_builder.py CHANGED
@@ -1,12 +1,16 @@
1
  #!/usr/bin/env python3
2
- import os, sys, json, argparse, logging, traceback, time
3
  from pathlib import Path
4
  from typing import List, Dict
5
  from langchain.docstore.document import Document
6
  from langchain_community.embeddings import HuggingFaceEmbeddings
7
  from langchain_community.vectorstores import FAISS
8
 
9
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 
 
 
10
  logger = logging.getLogger(__name__)
11
 
12
  def load_jsonl(filepath: str) -> List[Dict]:
@@ -15,48 +19,98 @@ def load_jsonl(filepath: str) -> List[Dict]:
15
  with open(filepath, 'r', encoding='utf-8') as f:
16
  for i, line in enumerate(f, 1):
17
  if line.strip():
18
- records.append(json.loads(line))
19
- if i % 50000 == 0:
 
 
 
20
  logger.info(f" {i:,} linhas...")
21
- logger.info(f"✅ {len(records):,} registros")
22
  return records
23
 
24
  def create_documents(records: List[Dict]) -> List[Document]:
25
  documents = []
 
 
26
  for i, record in enumerate(records, 1):
27
  ementa = record.get('ementa', '')
28
- if ementa:
29
- documents.append(Document(page_content=ementa, metadata={'id': str(record.get('id', f'u{i}')), 'source': 'tjpr'}))
30
- if i % 50000 == 0:
 
 
 
 
 
 
 
 
 
 
 
31
  logger.info(f" {i:,}/{len(records):,}...")
32
- logger.info(f"✅ {len(documents):,} documentos")
 
33
  return documents
34
 
35
- def build_vectorstore(input_file, output_dir='/app/faiss_index', model_name='sentence-transformers/all-MiniLM-L6-v2', batch_size=16):
 
 
 
 
 
36
  try:
37
  logger.info("="*80)
38
- logger.info("🚀 RAG Builder")
39
  logger.info("="*80)
40
- logger.info("\nPASSO 1/5: Carregando JSONL")
 
41
  records = load_jsonl(input_file)
42
- logger.info("\nPASSO 2/5: Criando Documents")
 
 
 
 
43
  documents = create_documents(records)
44
- logger.info(f"\nPASSO 3/5: Inicializando Embeddings ({model_name})")
45
- embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={'device': 'cpu'}, encode_kwargs={'batch_size': batch_size, 'show_progress_bar': True, 'normalize_embeddings': True})
46
- logger.info(" Embeddings OK")
47
- logger.info(f"\nPASSO 4/5: Construindo FAISS ({len(documents):,} docs)")
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  start = time.time()
 
49
  vectorstore = FAISS.from_documents(documents, embeddings)
50
- logger.info(f"✅ FAISS em {time.time()-start:.1f}s")
51
- logger.info(f"\nPASSO 5/5: Salvando em {output_dir}")
 
 
 
52
  os.makedirs(output_dir, exist_ok=True)
53
  vectorstore.save_local(output_dir)
 
 
54
  logger.info("✅ BUILD COMPLETO!")
 
 
55
  return vectorstore
 
56
  except Exception as e:
57
  logger.error(f"\n❌ ERRO: {type(e).__name__}: {e}")
 
58
  logger.error(traceback.format_exc())
59
- raise
60
 
61
  def main():
62
  parser = argparse.ArgumentParser()
@@ -65,6 +119,7 @@ def main():
65
  parser.add_argument('--model', default='sentence-transformers/all-MiniLM-L6-v2')
66
  parser.add_argument('--batch-size', type=int, default=16)
67
  args = parser.parse_args()
 
68
  build_vectorstore(args.input, args.output, args.model, args.batch_size)
69
 
70
  if __name__ == '__main__':
 
1
  #!/usr/bin/env python3
2
+ import os, sys, json, argparse, logging, time
3
  from pathlib import Path
4
  from typing import List, Dict
5
  from langchain.docstore.document import Document
6
  from langchain_community.embeddings import HuggingFaceEmbeddings
7
  from langchain_community.vectorstores import FAISS
8
 
9
+ logging.basicConfig(
10
+ level=logging.INFO,
11
+ format='%(asctime)s - %(levelname)s - %(message)s',
12
+ stream=sys.stdout
13
+ )
14
  logger = logging.getLogger(__name__)
15
 
16
  def load_jsonl(filepath: str) -> List[Dict]:
 
19
  with open(filepath, 'r', encoding='utf-8') as f:
20
  for i, line in enumerate(f, 1):
21
  if line.strip():
22
+ try:
23
+ records.append(json.loads(line))
24
+ except json.JSONDecodeError as e:
25
+ logger.warning(f"Linha {i} inválida: {e}")
26
+ if i % 10000 == 0:
27
  logger.info(f" {i:,} linhas...")
28
+ logger.info(f"✅ {len(records):,} registros carregados")
29
  return records
30
 
31
  def create_documents(records: List[Dict]) -> List[Document]:
32
  documents = []
33
+ logger.info("📝 Criando documentos...")
34
+
35
  for i, record in enumerate(records, 1):
36
  ementa = record.get('ementa', '')
37
+ doc_id = record.get('id', f'unknown_{i}')
38
+
39
+ if ementa and ementa.strip():
40
+ documents.append(
41
+ Document(
42
+ page_content=ementa,
43
+ metadata={
44
+ 'id': str(doc_id),
45
+ 'source': 'tjpr'
46
+ }
47
+ )
48
+ )
49
+
50
+ if i % 10000 == 0:
51
  logger.info(f" {i:,}/{len(records):,}...")
52
+
53
+ logger.info(f"✅ {len(documents):,} documentos criados")
54
  return documents
55
 
56
+ def build_vectorstore(
57
+ input_file,
58
+ output_dir='/app/faiss_index',
59
+ model_name='sentence-transformers/all-MiniLM-L6-v2',
60
+ batch_size=16
61
+ ):
62
  try:
63
  logger.info("="*80)
64
+ logger.info("🚀 RAG Builder v2.0")
65
  logger.info("="*80)
66
+
67
+ logger.info("\nPASSO 1/4: Carregando JSONL")
68
  records = load_jsonl(input_file)
69
+
70
+ if not records:
71
+ raise Exception("Nenhum registro carregado!")
72
+
73
+ logger.info("\nPASSO 2/4: Criando Documents")
74
  documents = create_documents(records)
75
+
76
+ if not documents:
77
+ raise Exception("Nenhum documento criado!")
78
+
79
+ logger.info(f"\nPASSO 3/4: Inicializando Embeddings ({model_name})")
80
+ embeddings = HuggingFaceEmbeddings(
81
+ model_name=model_name,
82
+ model_kwargs={'device': 'cpu'},
83
+ encode_kwargs={
84
+ 'batch_size': batch_size,
85
+ 'show_progress_bar': True,
86
+ 'normalize_embeddings': True
87
+ }
88
+ )
89
+ logger.info("✅ Embeddings inicializados")
90
+
91
+ logger.info(f"\nPASSO 4/4: Construindo FAISS ({len(documents):,} docs)")
92
  start = time.time()
93
+
94
  vectorstore = FAISS.from_documents(documents, embeddings)
95
+
96
+ elapsed = time.time() - start
97
+ logger.info(f"✅ FAISS construído em {elapsed:.1f}s")
98
+
99
+ logger.info(f"\nSalvando em {output_dir}")
100
  os.makedirs(output_dir, exist_ok=True)
101
  vectorstore.save_local(output_dir)
102
+
103
+ logger.info("="*80)
104
  logger.info("✅ BUILD COMPLETO!")
105
+ logger.info("="*80)
106
+
107
  return vectorstore
108
+
109
  except Exception as e:
110
  logger.error(f"\n❌ ERRO: {type(e).__name__}: {e}")
111
+ import traceback
112
  logger.error(traceback.format_exc())
113
+ sys.exit(1)
114
 
115
  def main():
116
  parser = argparse.ArgumentParser()
 
119
  parser.add_argument('--model', default='sentence-transformers/all-MiniLM-L6-v2')
120
  parser.add_argument('--batch-size', type=int, default=16)
121
  args = parser.parse_args()
122
+
123
  build_vectorstore(args.input, args.output, args.model, args.batch_size)
124
 
125
  if __name__ == '__main__':
requirements.txt CHANGED
@@ -1,28 +1,27 @@
1
  # FastAPI
2
- fastapi
3
- uvicorn[standard]
4
- pydantic
5
 
6
  # LangChain + Embeddings
7
- langchain
8
- langchain-community
9
 
10
  # Sentence Transformers
11
- sentence-transformers
12
- transformers
13
- torch
14
- tokenizers
15
- safetensors
16
 
17
  # FAISS
18
- faiss-cpu
19
 
20
  # HuggingFace Hub
21
- huggingface-hub
22
 
23
  # Utilities
24
- PyYAML
25
- GitPython
26
- pandas
27
- numpy
28
- tqdm
 
1
  # FastAPI
2
+ fastapi==0.109.0
3
+ uvicorn[standard]==0.27.0
4
+ pydantic==2.5.0
5
 
6
  # LangChain + Embeddings
7
+ langchain==0.1.11
8
+ langchain-community==0.0.24
9
 
10
  # Sentence Transformers
11
+ sentence-transformers==2.5.1
12
+ transformers==4.37.2
13
+ torch==2.2.0
14
+ tokenizers==0.15.2
15
+ safetensors==0.4.2
16
 
17
  # FAISS
18
+ faiss-cpu==1.8.0
19
 
20
  # HuggingFace Hub
21
+ huggingface-hub==0.20.3
22
 
23
  # Utilities
24
+ PyYAML==6.0.1
25
+ pandas==2.1.4
26
+ numpy==1.26.3
27
+ tqdm==4.66.1
 
setup.py CHANGED
@@ -1,11 +1,10 @@
1
  #!/usr/bin/env python3
2
- import os, sys, yaml, json, subprocess, logging, traceback, time
3
  from pathlib import Path
4
  from datetime import datetime
5
 
6
- # Configurar logging VERBOSE
7
  logging.basicConfig(
8
- level=logging.DEBUG,
9
  format='%(asctime)s [%(levelname)s] %(message)s',
10
  handlers=[
11
  logging.StreamHandler(sys.stdout),
@@ -26,364 +25,240 @@ def update_status(status, message, progress=0):
26
  }
27
  with open(STATUS_FILE, 'w') as f:
28
  json.dump(data, f)
29
- logger.info(f"STATUS UPDATE [{progress}%]: {status} - {message}")
30
  sys.stdout.flush()
31
 
32
- def run_cmd(cmd, desc, capture=True, check=True):
33
  logger.info("="*80)
34
- logger.info(f"🔧 EXECUTANDO: {desc}")
35
  logger.info(f"📝 Comando: {cmd}")
36
- logger.info(f"📂 PWD: {os.getcwd()}")
37
- logger.info(f"👤 USER: {os.getenv('USER', 'unknown')}")
38
- logger.info(f"🏠 HOME: {os.getenv('HOME', 'unknown')}")
39
  logger.info("-"*80)
40
 
41
  try:
42
  start = time.time()
43
-
44
- if capture:
45
- result = subprocess.run(
46
- cmd,
47
- shell=True,
48
- capture_output=True,
49
- text=True,
50
- timeout=600
51
- )
52
- else:
53
- result = subprocess.run(
54
- cmd,
55
- shell=True,
56
- timeout=600
57
- )
58
- result.stdout = ""
59
- result.stderr = ""
60
-
61
  elapsed = time.time() - start
62
 
63
- logger.info(f"⏱️ Tempo: {elapsed:.2f}s")
64
- logger.info(f"🔢 Exit code: {result.returncode}")
65
 
66
  if result.stdout:
67
- logger.info(f"📤 STDOUT ({len(result.stdout)} chars):")
68
- for line in result.stdout.split('\n')[:50]: # Primeiras 50 linhas
69
- logger.info(f" | {line}")
70
- if len(result.stdout.split('\n')) > 50:
71
- logger.info(f" | ... ({len(result.stdout.split(chr(10))) - 50} linhas omitidas)")
72
-
73
  if result.stderr:
74
- logger.warning(f"⚠️ STDERR ({len(result.stderr)} chars):")
75
- for line in result.stderr.split('\n')[:50]:
76
- logger.warning(f" | {line}")
77
- if len(result.stderr.split('\n')) > 50:
78
- logger.warning(f" | ... ({len(result.stderr.split(chr(10))) - 50} linhas omitidas)")
79
-
80
- if check and result.returncode != 0:
81
- logger.error("="*80)
82
- logger.error(f"❌ COMANDO FALHOU: {desc}")
83
- logger.error(f"Exit code: {result.returncode}")
84
- logger.error("="*80)
85
- raise Exception(f"{desc} falhou com exit code {result.returncode}")
86
 
87
  logger.info(f"✅ {desc} - OK")
88
- logger.info("="*80)
89
-
90
- return result.stdout if capture else ""
91
 
92
  except subprocess.TimeoutExpired:
93
- logger.error(f"⏰ TIMEOUT após 600s: {desc}")
94
- raise Exception(f"{desc} - timeout")
 
 
 
 
 
95
  except Exception as e:
96
  logger.error(f"💥 EXCEÇÃO: {type(e).__name__}: {e}")
97
- logger.error(traceback.format_exc())
98
  raise
99
 
100
- def check_environment():
101
- logger.info("\n" + "="*80)
102
- logger.info("🔍 VERIFICANDO AMBIENTE")
103
- logger.info("="*80)
104
 
105
- checks = [
106
- ("pwd", "Diretório atual"),
107
- ("whoami", "Usuário"),
108
- ("id", "UID/GID"),
109
- ("git --version", "Git version"),
110
- ("python3 --version", "Python version"),
111
- ("pip list | grep -E '(langchain|torch|transformers)'", "Pacotes principais"),
112
- ("df -h /tmp", "Espaço em /tmp"),
113
- ("free -h", "Memória disponível"),
114
- ("ls -la /home/user/app", "Arquivos app"),
115
- ("cat /etc/resolv.conf", "DNS config"),
116
- ("ping -c 2 github.com || echo 'ping falhou'", "Conectividade GitHub"),
117
- ]
118
 
119
- for cmd, desc in checks:
120
- try:
121
- logger.info(f"\n🔎 {desc}:")
122
- output = run_cmd(cmd, desc, capture=True, check=False)
123
- except Exception as e:
124
- logger.warning(f" ⚠️ Falhou: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
  def main():
127
  try:
128
  logger.info("\n" + "="*80)
129
- logger.info("🚀 PARA.AI RAG SETUP - VERSÃO DEBUG")
130
  logger.info("="*80)
131
  logger.info(f"⏰ Início: {datetime.now()}")
132
- logger.info(f"🐍 Python: {sys.version}")
133
- logger.info(f"📂 CWD: {os.getcwd()}")
134
- logger.info("="*80)
135
 
136
- # VERIFICAÇÃO DE AMBIENTE
137
- check_environment()
 
 
 
138
 
139
  # CARREGAR CONFIG
140
- logger.info("\n" + "="*80)
141
- logger.info("📝 PASSO 0: Carregando configuração")
142
- logger.info("="*80)
143
  update_status('loading', 'Carregando configuração', 0)
144
 
145
- config_path = 'config.yaml'
146
- logger.info(f"Config file: {config_path}")
147
- logger.info(f"Existe? {os.path.exists(config_path)}")
148
-
149
- if os.path.exists(config_path):
150
- with open(config_path) as f:
151
- config_content = f.read()
152
- logger.info(f"Config content ({len(config_content)} chars):")
153
- logger.info(config_content)
154
-
155
- with open(config_path) as f:
156
  config = yaml.safe_load(f)
157
 
158
- logger.info(f"✅ Config carregado:")
159
- for key, value in config.items():
160
- logger.info(f" {key}: {value}")
161
-
162
  cluster_id = config['cluster_id']
163
  chunk_start = config['chunk_start']
164
  chunk_end = config['chunk_end']
165
  github_repo = config['github_repo']
 
166
 
167
- logger.info(f"\n📊 Configuração:")
168
  logger.info(f" Cluster: {cluster_id}")
169
  logger.info(f" Chunks: {chunk_start} → {chunk_end} ({chunk_end - chunk_start + 1} chunks)")
170
- logger.info(f" Repo: {github_repo}")
171
-
172
- # VERIFICAR SE JÁ PRONTO
173
- if READY_FLAG.exists():
174
- logger.info(f"\n✅ FAISS já existe em {READY_FLAG}")
175
- update_status('ready', 'FAISS já existe', 100)
176
- return
177
-
178
- # PASSO 1: GIT CLONE
179
- logger.info("\n" + "="*80)
180
- logger.info("📥 PASSO 1: Git Clone (Sparse Checkout)")
181
- logger.info("="*80)
182
- update_status('cloning', 'Iniciando clone do repositório', 10)
183
 
184
- repo_dir = '/tmp/repo'
185
- logger.info(f"Diretório destino: {repo_dir}")
 
 
 
 
186
 
187
- # Criar diretório
188
- logger.info(f"Criando {repo_dir}...")
189
- os.makedirs(repo_dir, exist_ok=True)
190
- logger.info(f"✅ Diretório criado")
191
 
192
- # Listar /tmp antes
193
- logger.info("\nConteúdo de /tmp ANTES:")
194
- run_cmd("ls -la /tmp", "List /tmp", check=False)
195
 
196
- # Mudar para diretório
197
- logger.info(f"\nMudando para {repo_dir}...")
198
- os.chdir(repo_dir)
199
- logger.info(f"✅ PWD agora: {os.getcwd()}")
200
 
201
- # Testar conectividade GitHub
202
- logger.info("\n🌐 Testando conectividade com GitHub...")
203
- run_cmd("curl -I https://github.com 2>&1 | head -5", "GitHub connectivity", check=False)
204
-
205
- # Git clone
206
- logger.info(f"\n📦 Clonando {github_repo}...")
207
- logger.info("Comando: git clone --filter=blob:none --sparse ...")
208
 
209
- try:
210
- run_cmd(
211
- f"git clone --filter=blob:none --no-checkout --sparse {github_repo} . 2>&1",
212
- "Git clone sparse"
213
- )
214
- except Exception as e:
215
- logger.error("\n❌ GIT CLONE FALHOU!")
216
- logger.error("Tentando diagnóstico adicional...")
217
-
218
- # Diagnóstico
219
- run_cmd("ls -la", "List current dir", check=False)
220
- run_cmd("git --version", "Git version", check=False)
221
- run_cmd(f"git ls-remote {github_repo} 2>&1 | head -10", "Git ls-remote", check=False)
222
- run_cmd("cat ~/.gitconfig 2>/dev/null || echo 'No gitconfig'", "Git config", check=False)
223
-
224
- raise Exception(f"Git clone falhou: {e}")
225
-
226
- # Verificar clone
227
- logger.info("\n✅ Clone concluído, verificando...")
228
- run_cmd("ls -la", "List repo dir")
229
- run_cmd("git status 2>&1 | head -20", "Git status", check=False)
230
-
231
- # Sparse checkout init
232
- logger.info("\n🔧 Configurando sparse checkout...")
233
- run_cmd("git sparse-checkout init --cone", "Sparse checkout init")
234
-
235
- # Adicionar patterns em batches
236
- logger.info(f"\n📋 Adicionando patterns para chunks {chunk_start}-{chunk_end}...")
237
- patterns = [
238
- f"chunks_dados/chunk_dados_{i:04d}.tar.gz"
239
- for i in range(chunk_start, chunk_end + 1)
240
- ]
241
-
242
- logger.info(f"Total de patterns: {len(patterns)}")
243
- logger.info(f"Primeiros 5: {patterns[:5]}")
244
- logger.info(f"Últimos 5: {patterns[-5:]}")
245
-
246
- batch_size = 50
247
- for i in range(0, len(patterns), batch_size):
248
- batch = patterns[i:i+batch_size]
249
- batch_num = i // batch_size + 1
250
- total_batches = (len(patterns) + batch_size - 1) // batch_size
251
-
252
- logger.info(f"\nBatch {batch_num}/{total_batches} ({len(batch)} patterns)...")
253
-
254
- patterns_str = ' '.join(batch)
255
- run_cmd(
256
- f"git sparse-checkout add {patterns_str}",
257
- f"Add patterns batch {batch_num}"
258
- )
259
 
260
- # Checkout
261
- logger.info("\n📥 Fazendo checkout dos arquivos...")
262
- run_cmd("git checkout main 2>&1", "Git checkout main")
 
 
263
 
264
- # Verificar chunks baixados
265
- logger.info("\n📊 Verificando chunks baixados...")
266
- chunks_found = run_cmd(
267
- "find chunks_dados -name '*.tar.gz' 2>/dev/null | wc -l",
268
- "Count chunks"
269
- ).strip()
270
 
271
- logger.info(f"✅ Chunks encontrados: {chunks_found}")
 
 
272
 
273
- if chunks_found == '0':
274
- logger.error("❌ NENHUM CHUNK ENCONTRADO!")
275
- run_cmd("find . -type f 2>/dev/null | head -20", "List all files", check=False)
276
- raise Exception("Nenhum chunk foi baixado")
 
 
277
 
278
- # PASSO 2: EXTRACT
279
- logger.info("\n" + "="*80)
280
- logger.info("📦 PASSO 2: Extraindo chunks")
281
- logger.info("="*80)
282
- update_status('extracting', f'Extraindo {chunks_found} chunks', 30)
283
 
284
- extract_dir = '/tmp/extracted'
285
- logger.info(f"Diretório destino: {extract_dir}")
286
- os.makedirs(extract_dir, exist_ok=True)
287
 
288
- run_cmd(
289
- "find chunks_dados -name '*.tar.gz' -exec tar -xzf {} -C /tmp/extracted \; 2>&1 | head -50",
290
- "Extract all chunks",
291
- check=False
292
- )
293
 
294
- # Verificar extração
295
- jsonl_count = run_cmd(
296
- "find /tmp/extracted -name 'jurisprudencias.jsonl' 2>/dev/null | wc -l",
297
- "Count JSONL files"
298
- ).strip()
299
- logger.info(f"✅ Arquivos JSONL extraídos: {jsonl_count}")
300
 
301
- # PASSO 3: CONCAT
302
- logger.info("\n" + "="*80)
303
- logger.info("📄 PASSO 3: Concatenando JSONL")
304
- logger.info("="*80)
305
- update_status('concatenating', 'Concatenando registros', 50)
 
306
 
307
- run_cmd(
308
- "find /tmp/extracted -name 'jurisprudencias.jsonl' -exec cat {} \; > /tmp/all_records.jsonl 2>&1",
309
- "Concatenate JSONL"
310
- )
311
 
312
- total_lines = run_cmd(
313
- "wc -l < /tmp/all_records.jsonl 2>/dev/null || echo '0'",
314
- "Count lines"
315
- ).strip()
316
 
317
- logger.info(f"✅ Total de registros: {total_lines}")
 
 
318
 
319
- if total_lines == '0':
320
- raise Exception("JSONL concatenado está vazio!")
321
 
322
- # PASSO 4: FILTER
323
  logger.info("\n" + "="*80)
324
- logger.info("🔍 PASSO 4: Filtrando campos")
325
  logger.info("="*80)
326
- update_status('filtering', 'Filtrando campos (id + ementa)', 60)
327
 
328
  os.chdir('/home/user/app')
329
- logger.info(f"PWD: {os.getcwd()}")
330
-
331
- run_cmd(
332
- "python3 filter_fields.py --input /tmp/all_records.jsonl --output /tmp/filtered.jsonl 2>&1",
333
- "Filter fields"
334
- )
335
-
336
- filtered_lines = run_cmd(
337
- "wc -l < /tmp/filtered.jsonl 2>/dev/null || echo '0'",
338
- "Count filtered"
339
- ).strip()
340
- logger.info(f"✅ Registros filtrados: {filtered_lines}")
341
 
342
- # PASSO 5: BUILD FAISS
343
- logger.info("\n" + "="*80)
344
- logger.info("🤖 PASSO 5: Construindo FAISS index")
345
- logger.info("="*80)
346
- update_status('building', f'Construindo FAISS com {filtered_lines} documentos', 70)
347
-
348
- logger.info("⚠️ Este passo pode demorar ~10-15 minutos...")
349
-
350
- run_cmd(
351
- "python3 rag_builder.py --input /tmp/filtered.jsonl 2>&1",
352
- "Build FAISS",
353
- capture=False
354
- )
355
 
356
  logger.info("✅ FAISS construído!")
357
 
358
- # Verificar FAISS
359
- faiss_files = run_cmd(
360
- "ls -lh /app/faiss_index 2>&1",
361
- "List FAISS files",
362
- check=False
363
- )
364
-
365
- # PASSO 6: CLEANUP
366
- logger.info("\n" + "="*80)
367
- logger.info("🧹 PASSO 6: Limpando temporários")
368
- logger.info("="*80)
369
  update_status('cleaning', 'Limpando arquivos temporários', 95)
370
 
371
- run_cmd(
372
- "rm -rf /tmp/repo /tmp/extracted /tmp/all_records.jsonl /tmp/filtered.jsonl",
373
- "Cleanup temp files"
374
- )
375
 
376
  # CONCLUÍDO
377
  logger.info("\n" + "="*80)
378
  logger.info("✅ SETUP COMPLETO!")
379
  logger.info("="*80)
380
 
381
- update_status('ready', f'FAISS pronto com {total_lines} registros!', 100)
382
  READY_FLAG.touch()
383
 
384
  logger.info(f"⏰ Fim: {datetime.now()}")
385
- logger.info(f"📁 Logs salvos em: /tmp/setup_debug.log")
386
- logger.info("="*80)
387
 
388
  except Exception as e:
389
  logger.error("\n" + "="*80)
@@ -391,13 +266,10 @@ def main():
391
  logger.error("="*80)
392
  logger.error(f"Tipo: {type(e).__name__}")
393
  logger.error(f"Mensagem: {str(e)}")
394
- logger.error("\nTraceback completo:")
395
  logger.error(traceback.format_exc())
396
- logger.error("="*80)
397
- logger.error(f"📁 Logs completos em: /tmp/setup_debug.log")
398
- logger.error("="*80)
399
 
400
- update_status('error', f'Build FAISS falhou: {str(e)}', 0)
401
  sys.exit(1)
402
 
403
  if __name__ == "__main__":
 
1
  #!/usr/bin/env python3
2
+ import os, sys, yaml, json, subprocess, logging, traceback, time, tarfile, io
3
  from pathlib import Path
4
  from datetime import datetime
5
 
 
6
  logging.basicConfig(
7
+ level=logging.INFO,
8
  format='%(asctime)s [%(levelname)s] %(message)s',
9
  handlers=[
10
  logging.StreamHandler(sys.stdout),
 
25
  }
26
  with open(STATUS_FILE, 'w') as f:
27
  json.dump(data, f)
28
+ logger.info(f"STATUS [{progress}%]: {status} - {message}")
29
  sys.stdout.flush()
30
 
31
def run_cmd(cmd, desc, check=True, timeout=300):
    """Run *cmd* through the shell, logging a banner, timing and output.

    Args:
        cmd: shell command line to execute (passed with shell=True).
        desc: human-readable description used in the log banner.
        check: when True, a non-zero exit raises CalledProcessError.
        timeout: seconds before the command is killed (TimeoutExpired).

    Returns:
        The captured stdout of the command as text.

    Raises:
        subprocess.TimeoutExpired: command exceeded *timeout*.
        subprocess.CalledProcessError: non-zero exit and check=True.
        Exception: anything else is logged and re-raised.
    """
    logger.info("="*80)
    logger.info(f"🔧 {desc}")
    logger.info(f"📝 Comando: {cmd}")
    logger.info("-"*80)

    started_at = time.time()
    try:
        proc = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
            text=True,
            timeout=timeout,
            check=check,
        )
    except subprocess.TimeoutExpired:
        logger.error(f"⏰ TIMEOUT após {timeout}s: {desc}")
        raise
    except subprocess.CalledProcessError as e:
        logger.error(f"❌ FALHOU: {desc}")
        logger.error(f"Exit code: {e.returncode}")
        logger.error(f"STDERR: {e.stderr[:500]}")
        raise
    except Exception as e:
        logger.error(f"💥 EXCEÇÃO: {type(e).__name__}: {e}")
        raise

    duration = time.time() - started_at
    logger.info(f"⏱️ {duration:.2f}s | Exit: {proc.returncode}")

    # Echo truncated command output so the setup log stays readable.
    if proc.stdout:
        logger.info(f"STDOUT: {proc.stdout[:500]}")
    if proc.stderr:
        logger.warning(f"STDERR: {proc.stderr[:500]}")

    logger.info(f"✅ {desc} - OK")
    return proc.stdout
70
 
71
def filter_jsonl_record(record, fields_to_keep):
    """Project *record* onto the keys listed in *fields_to_keep*.

    Keys absent from *record* are silently skipped; present keys keep
    their original values (including None).
    """
    filtered = {}
    for field in fields_to_keep:
        if field in record:
            filtered[field] = record.get(field)
    return filtered
 
74
 
75
def process_tar_gz(tar_path, output_jsonl, fields_to_keep):
    """Extract a chunk archive, filter its JSONL records, append to *output_jsonl*.

    Finds the first regular member whose name ends with
    'jurisprudencias.jsonl', streams it line by line, keeps only the keys
    in *fields_to_keep*, and appends one JSON object per line to
    *output_jsonl*. Malformed JSON lines are skipped.

    Args:
        tar_path: Path to the .tar.gz chunk file.
        output_jsonl: Path of the accumulated output JSONL (opened in append mode).
        fields_to_keep: iterable of record keys to retain.

    Returns:
        Number of records written, or 0 when no matching member exists.

    Raises:
        Exception: any error from tarfile/IO is logged and re-raised.
    """
    logger.info(f"📦 Processando: {tar_path.name}")

    try:
        with tarfile.open(tar_path, 'r:gz') as tar:
            members = tar.getmembers()
            logger.info(f" Arquivos no TAR.GZ: {len(members)}")

            for member in members:
                if member.name.endswith('jurisprudencias.jsonl') and member.isfile():
                    logger.info(f" ✅ Encontrado: {member.name}")

                    file_obj = tar.extractfile(member)
                    if file_obj is None:
                        # extractfile() can return None for special members;
                        # keep looking for a readable one.
                        continue

                    processed = 0
                    # Stream line by line instead of decoding the whole
                    # member into memory — chunks can be large and the
                    # point of this pipeline is to save space.
                    with open(output_jsonl, 'a', encoding='utf-8') as out, \
                         io.TextIOWrapper(file_obj, encoding='utf-8') as text_stream:
                        for line in text_stream:
                            if not line.strip():
                                continue
                            try:
                                record = json.loads(line)
                            except json.JSONDecodeError:
                                # Skip corrupt lines; partial chunks happen.
                                continue
                            filtered = filter_jsonl_record(record, fields_to_keep)
                            out.write(json.dumps(filtered, ensure_ascii=False) + '\n')
                            processed += 1

                    logger.info(f" ✅ Processados: {processed} registros")
                    return processed

            logger.warning(f" ⚠️ Nenhum jurisprudencias.jsonl encontrado")
            return 0

    except Exception as e:
        logger.error(f" ❌ Erro ao processar {tar_path.name}: {e}")
        raise
117
 
118
def main():
    """Full setup pipeline: download chunks, filter fields, build the FAISS index.

    Reads config.yaml for the chunk range and field filter, downloads each
    chunk archive directly from raw.githubusercontent.com (no git clone),
    filters and accumulates records into one JSONL, runs rag_builder.py,
    then cleans up. Progress is reported via update_status(); READY_FLAG
    marks completion. Exits with code 1 on any fatal error.
    """
    try:
        logger.info("\n" + "="*80)
        logger.info("🚀 PARA.AI RAG SETUP - VERSÃO OTIMIZADA")
        logger.info("="*80)
        logger.info(f"⏰ Início: {datetime.now()}")

        # Skip everything if a previous run already completed.
        if READY_FLAG.exists():
            logger.info("✅ FAISS já existe")
            update_status('ready', 'FAISS já pronto', 100)
            return

        # STEP 0: load configuration
        logger.info("\n📝 PASSO 0: Carregando configuração")
        update_status('loading', 'Carregando configuração', 0)

        with open('config.yaml') as f:
            config = yaml.safe_load(f)

        cluster_id = config['cluster_id']
        chunk_start = config['chunk_start']
        chunk_end = config['chunk_end']
        github_repo = config['github_repo']
        campos_filter = config['campos_filter']

        logger.info(f" Cluster: {cluster_id}")
        logger.info(f" Chunks: {chunk_start} → {chunk_end} ({chunk_end - chunk_start + 1} chunks)")
        logger.info(f" Campos: {campos_filter}")

        # Build the download base URL.
        # Correct form: https://raw.githubusercontent.com/USER/REPO/BRANCH/PATH
        base_url = github_repo.replace('https://github.com/', 'https://raw.githubusercontent.com/')
        if base_url.endswith('.git'):
            base_url = base_url[:-4]
        base_url = f"{base_url}/main/chunks_dados"

        logger.info(f" Base URL: {base_url}")

        # Working directory for downloads and the accumulated JSONL.
        work_dir = Path('/tmp/work')
        work_dir.mkdir(exist_ok=True)

        output_jsonl = work_dir / 'all_filtered.jsonl'
        if output_jsonl.exists():
            # Stale output from a previous partial run — start fresh,
            # since process_tar_gz appends.
            output_jsonl.unlink()

        # STEP 1: download and process chunks (one at a time, to save disk)
        logger.info("\n" + "="*80)
        logger.info("📥 PASSO 1: Download e Processamento de Chunks")
        logger.info("="*80)
        update_status('downloading', 'Baixando e processando chunks', 10)

        total_records = 0

        for chunk_num in range(chunk_start, chunk_end + 1):
            # File name pattern: chunk_dados_000001.tar.gz
            chunk_name = f"chunk_dados_{chunk_num:06d}.tar.gz"
            chunk_url = f"{base_url}/{chunk_name}"
            chunk_path = work_dir / chunk_name

            logger.info(f"\n📦 Chunk {chunk_num}/{chunk_end}")
            logger.info(f" URL: {chunk_url}")

            # Download with curl; map chunk position onto the 10–50% window.
            progress = 10 + ((chunk_num - chunk_start) * 40 // (chunk_end - chunk_start + 1))
            update_status('downloading', f'Baixando chunk {chunk_num}/{chunk_end}', progress)

            try:
                run_cmd(
                    f"curl -L -f -o {chunk_path} {chunk_url}",
                    f"Download {chunk_name}",
                    timeout=300
                )

                if not chunk_path.exists() or chunk_path.stat().st_size == 0:
                    logger.warning(f" ⚠️ Arquivo vazio ou não baixado: {chunk_name}")
                    continue

                logger.info(f" ✅ Baixado: {chunk_path.stat().st_size / 1024 / 1024:.2f} MB")

                # Process the TAR.GZ (extract + filter + append).
                records = process_tar_gz(chunk_path, output_jsonl, campos_filter)
                total_records += records

                # Delete the archive immediately to save disk space.
                chunk_path.unlink()
                logger.info(f" 🗑️ Arquivo TAR.GZ apagado (economizando espaço)")

            except Exception as e:
                logger.error(f" Erro no chunk {chunk_num}: {e}")
                # Continue with the next chunk (best-effort per chunk).
                if chunk_path.exists():
                    chunk_path.unlink()
                continue

        logger.info(f"\n✅ Total de registros processados: {total_records}")

        if total_records == 0:
            raise Exception("Nenhum registro foi processado!")

        # Verify the accumulated output file.
        if not output_jsonl.exists():
            raise Exception("Arquivo all_filtered.jsonl não foi criado!")

        final_lines = int(run_cmd(f"wc -l < {output_jsonl}", "Count lines").strip())
        logger.info(f"✅ Linhas no JSONL final: {final_lines}")

        # STEP 2: build the FAISS index
        logger.info("\n" + "="*80)
        logger.info("🤖 PASSO 2: Construindo FAISS index")
        logger.info("="*80)
        update_status('building', f'Construindo FAISS com {final_lines} documentos', 70)

        os.chdir('/home/user/app')

        try:
            run_cmd(
                f"python3 rag_builder.py --input {output_jsonl} 2>&1",
                "Build FAISS",
                timeout=900  # 15 minutes
            )
        except Exception as e:
            logger.error(f"❌ Build FAISS falhou: {e}")
            # Re-raise with context; rag_builder stderr was already logged.
            raise Exception(f"Build FAISS falhou: {e}")

        logger.info("✅ FAISS construído!")

        # STEP 3: cleanup temporaries
        logger.info("\n🧹 PASSO 3: Limpando temporários")
        update_status('cleaning', 'Limpando arquivos temporários', 95)

        run_cmd(f"rm -rf {work_dir}", "Cleanup", check=False)

        # DONE
        logger.info("\n" + "="*80)
        logger.info("✅ SETUP COMPLETO!")
        logger.info("="*80)

        update_status('ready', f'FAISS pronto com {total_records} registros!', 100)
        READY_FLAG.touch()

        logger.info(f"⏰ Fim: {datetime.now()}")

    except Exception as e:
        logger.error("\n" + "="*80)
        # NOTE(review): one log line (new-file line 265, likely a fatal-error
        # banner) is hidden by the diff hunk boundary here — confirm against
        # the full file before treating this block as complete.
        logger.error("="*80)
        logger.error(f"Tipo: {type(e).__name__}")
        logger.error(f"Mensagem: {str(e)}")
        logger.error("\nTraceback:")
        logger.error(traceback.format_exc())

        update_status('error', f'Setup falhou: {str(e)}', 0)
        sys.exit(1)
274
 
275
  if __name__ == "__main__":