Upload 8 files
Browse files- Dockerfile +1 -1
- README.md +8 -5
- app.py +12 -8
- entrypoint.sh +4 -4
- query_engine.py +50 -10
- rag_builder.py +75 -20
- requirements.txt +16 -17
- setup.py +156 -284
Dockerfile
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
FROM python:3.11-slim
|
| 2 |
|
| 3 |
-
RUN apt-get update && apt-get install -y
|
| 4 |
|
| 5 |
RUN useradd -m -u 1000 user
|
| 6 |
USER user
|
|
|
|
| 1 |
FROM python:3.11-slim
|
| 2 |
|
| 3 |
+
RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
|
| 4 |
|
| 5 |
RUN useradd -m -u 1000 user
|
| 6 |
USER user
|
README.md
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
---
|
| 2 |
-
title: Para.AI RAG Cluster
|
| 3 |
emoji: ⚖️
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: purple
|
|
@@ -7,12 +7,15 @@ sdk: docker
|
|
| 7 |
pinned: false
|
| 8 |
---
|
| 9 |
|
| 10 |
-
# ⚖️ Para.AI RAG
|
| 11 |
|
| 12 |
-
Versão com
|
| 13 |
|
| 14 |
-
##
|
| 15 |
|
| 16 |
-
-
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
⚖️ **InJustiça não para o Paraná!** 🐝
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Para.AI RAG Cluster
|
| 3 |
emoji: ⚖️
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: purple
|
|
|
|
| 7 |
pinned: false
|
| 8 |
---
|
| 9 |
|
| 10 |
+
# ⚖️ Para.AI RAG v2.0 - Otimizado
|
| 11 |
|
| 12 |
+
Versão otimizada com download direto de chunks (sem git clone).
|
| 13 |
|
| 14 |
+
## Melhorias
|
| 15 |
|
| 16 |
+
- ✅ wget/curl direto (não clona repo inteiro)
|
| 17 |
+
- ✅ Processa chunks um por vez (economiza espaço)
|
| 18 |
+
- ✅ Filtro de campos corrigido (id minúsculo)
|
| 19 |
+
- ✅ Cleanup automático de temporários
|
| 20 |
|
| 21 |
⚖️ **InJustiça não para o Paraná!** 🐝
|
app.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
from fastapi import FastAPI, HTTPException
|
| 3 |
from pydantic import BaseModel
|
| 4 |
-
from typing import List
|
| 5 |
import logging
|
| 6 |
import time
|
| 7 |
import json
|
|
@@ -38,12 +38,11 @@ def get_query_engine():
|
|
| 38 |
logger.info("✅ QueryEngine carregado!")
|
| 39 |
return query_engine
|
| 40 |
|
| 41 |
-
app = FastAPI(title="Para.AI RAG Cluster", version="
|
| 42 |
|
| 43 |
class EmbeddingSearchRequest(BaseModel):
|
| 44 |
query: str
|
| 45 |
top_k: int = 10
|
| 46 |
-
return_embeddings: bool = False
|
| 47 |
|
| 48 |
class KeywordSearchRequest(BaseModel):
|
| 49 |
keywords: List[str]
|
|
@@ -52,13 +51,18 @@ class KeywordSearchRequest(BaseModel):
|
|
| 52 |
|
| 53 |
class IDSearchRequest(BaseModel):
|
| 54 |
ids: List[str]
|
| 55 |
-
return_embeddings: bool = False
|
| 56 |
|
| 57 |
@app.get("/")
|
| 58 |
async def root():
|
| 59 |
setup_status = get_setup_status()
|
| 60 |
ready = is_ready()
|
| 61 |
-
response = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
if ready and query_engine:
|
| 63 |
response["cluster_id"] = query_engine.config.get('cluster_id')
|
| 64 |
response["chunk_range"] = [query_engine.config.get('chunk_start'), query_engine.config.get('chunk_end')]
|
|
@@ -86,7 +90,7 @@ async def search_embedding(request: EmbeddingSearchRequest):
|
|
| 86 |
engine = get_query_engine()
|
| 87 |
try:
|
| 88 |
start = time.time()
|
| 89 |
-
results = engine.search_by_embedding(request.query, request.top_k
|
| 90 |
results['query_time_ms'] = round((time.time() - start) * 1000, 2)
|
| 91 |
return results
|
| 92 |
except Exception as e:
|
|
@@ -109,7 +113,7 @@ async def search_by_id(request: IDSearchRequest):
|
|
| 109 |
engine = get_query_engine()
|
| 110 |
try:
|
| 111 |
start = time.time()
|
| 112 |
-
results = engine.search_by_ids(request.ids
|
| 113 |
results['query_time_ms'] = round((time.time() - start) * 1000, 2)
|
| 114 |
return results
|
| 115 |
except Exception as e:
|
|
@@ -129,7 +133,7 @@ async def cluster_info():
|
|
| 129 |
async def startup_event():
|
| 130 |
app.state.start_time = time.time()
|
| 131 |
logger.info("="*80)
|
| 132 |
-
logger.info("🚀 Para.AI RAG ONLINE")
|
| 133 |
logger.info("="*80)
|
| 134 |
|
| 135 |
if __name__ == "__main__":
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
from fastapi import FastAPI, HTTPException
|
| 3 |
from pydantic import BaseModel
|
| 4 |
+
from typing import List
|
| 5 |
import logging
|
| 6 |
import time
|
| 7 |
import json
|
|
|
|
| 38 |
logger.info("✅ QueryEngine carregado!")
|
| 39 |
return query_engine
|
| 40 |
|
| 41 |
+
app = FastAPI(title="Para.AI RAG Cluster", version="2.0.0")
|
| 42 |
|
| 43 |
class EmbeddingSearchRequest(BaseModel):
|
| 44 |
query: str
|
| 45 |
top_k: int = 10
|
|
|
|
| 46 |
|
| 47 |
class KeywordSearchRequest(BaseModel):
|
| 48 |
keywords: List[str]
|
|
|
|
| 51 |
|
| 52 |
class IDSearchRequest(BaseModel):
|
| 53 |
ids: List[str]
|
|
|
|
| 54 |
|
| 55 |
@app.get("/")
|
| 56 |
async def root():
|
| 57 |
setup_status = get_setup_status()
|
| 58 |
ready = is_ready()
|
| 59 |
+
response = {
|
| 60 |
+
"status": "online",
|
| 61 |
+
"rag_ready": ready,
|
| 62 |
+
"setup": setup_status,
|
| 63 |
+
"backend": "LangChain + FAISS (CPU)",
|
| 64 |
+
"version": "2.0.0 - Otimizado"
|
| 65 |
+
}
|
| 66 |
if ready and query_engine:
|
| 67 |
response["cluster_id"] = query_engine.config.get('cluster_id')
|
| 68 |
response["chunk_range"] = [query_engine.config.get('chunk_start'), query_engine.config.get('chunk_end')]
|
|
|
|
| 90 |
engine = get_query_engine()
|
| 91 |
try:
|
| 92 |
start = time.time()
|
| 93 |
+
results = engine.search_by_embedding(request.query, request.top_k)
|
| 94 |
results['query_time_ms'] = round((time.time() - start) * 1000, 2)
|
| 95 |
return results
|
| 96 |
except Exception as e:
|
|
|
|
| 113 |
engine = get_query_engine()
|
| 114 |
try:
|
| 115 |
start = time.time()
|
| 116 |
+
results = engine.search_by_ids(request.ids)
|
| 117 |
results['query_time_ms'] = round((time.time() - start) * 1000, 2)
|
| 118 |
return results
|
| 119 |
except Exception as e:
|
|
|
|
| 133 |
async def startup_event():
|
| 134 |
app.state.start_time = time.time()
|
| 135 |
logger.info("="*80)
|
| 136 |
+
logger.info("🚀 Para.AI RAG v2.0 ONLINE")
|
| 137 |
logger.info("="*80)
|
| 138 |
|
| 139 |
if __name__ == "__main__":
|
entrypoint.sh
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
#!/bin/bash
|
| 2 |
set -e
|
| 3 |
echo "=========================================="
|
| 4 |
-
echo "🚀 Para.AI RAG Startup"
|
| 5 |
echo "=========================================="
|
| 6 |
cd /home/user/app
|
| 7 |
echo "1️⃣ Iniciando setup em background..."
|
|
@@ -9,8 +9,8 @@ python3 -u setup.py > /tmp/setup_output.log 2>&1 &
|
|
| 9 |
echo "✅ Setup PID: $!"
|
| 10 |
sleep 2
|
| 11 |
echo "2️⃣ Iniciando FastAPI..."
|
| 12 |
-
echo "🎯 API online"
|
| 13 |
-
echo "📊 Status: /setup/status"
|
| 14 |
-
echo "📋 Logs: /setup/logs"
|
| 15 |
echo "=========================================="
|
| 16 |
exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1
|
|
|
|
| 1 |
#!/bin/bash
|
| 2 |
set -e
|
| 3 |
echo "=========================================="
|
| 4 |
+
echo "🚀 Para.AI RAG v2.0 Startup"
|
| 5 |
echo "=========================================="
|
| 6 |
cd /home/user/app
|
| 7 |
echo "1️⃣ Iniciando setup em background..."
|
|
|
|
| 9 |
echo "✅ Setup PID: $!"
|
| 10 |
sleep 2
|
| 11 |
echo "2️⃣ Iniciando FastAPI..."
|
| 12 |
+
echo "🎯 API online em http://0.0.0.0:7860"
|
| 13 |
+
echo "📊 Status: GET /setup/status"
|
| 14 |
+
echo "📋 Logs: GET /setup/logs"
|
| 15 |
echo "=========================================="
|
| 16 |
exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1
|
query_engine.py
CHANGED
|
@@ -12,24 +12,64 @@ class QueryEngine:
|
|
| 12 |
logger.info("Inicializando QueryEngine...")
|
| 13 |
with open(config_path) as f:
|
| 14 |
self.config = yaml.safe_load(f)
|
| 15 |
-
model_name = self.config.get('embedding_model'
|
| 16 |
-
self.embeddings = HuggingFaceEmbeddings(
|
|
|
|
|
|
|
|
|
|
| 17 |
faiss_path = self.config.get('faiss_path', '/app/faiss_index')
|
| 18 |
-
self.vectorstore = FAISS.load_local(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
logger.info("✅ QueryEngine pronto!")
|
| 20 |
|
| 21 |
-
def search_by_embedding(self, query: str, top_k: int = 10
|
| 22 |
results = self.vectorstore.similarity_search_with_score(query, k=top_k)
|
| 23 |
-
formatted = [
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
def search_by_keywords(self, keywords: List[str], operator: str = 'AND', top_k: int = 20) -> Dict:
|
| 27 |
return self.search_by_embedding(' '.join(keywords), top_k)
|
| 28 |
|
| 29 |
-
def search_by_ids(self, ids: List[str]
|
| 30 |
all_docs = self.vectorstore.similarity_search("", k=10000)
|
| 31 |
-
results = [
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
def get_cluster_info(self) -> Dict:
|
| 35 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
logger.info("Inicializando QueryEngine...")
|
| 13 |
with open(config_path) as f:
|
| 14 |
self.config = yaml.safe_load(f)
|
| 15 |
+
model_name = self.config.get('embedding_model')
|
| 16 |
+
self.embeddings = HuggingFaceEmbeddings(
|
| 17 |
+
model_name=model_name,
|
| 18 |
+
model_kwargs={'device': 'cpu'}
|
| 19 |
+
)
|
| 20 |
faiss_path = self.config.get('faiss_path', '/app/faiss_index')
|
| 21 |
+
self.vectorstore = FAISS.load_local(
|
| 22 |
+
faiss_path,
|
| 23 |
+
self.embeddings,
|
| 24 |
+
allow_dangerous_deserialization=True
|
| 25 |
+
)
|
| 26 |
logger.info("✅ QueryEngine pronto!")
|
| 27 |
|
| 28 |
+
def search_by_embedding(self, query: str, top_k: int = 10) -> Dict:
|
| 29 |
results = self.vectorstore.similarity_search_with_score(query, k=top_k)
|
| 30 |
+
formatted = [
|
| 31 |
+
{
|
| 32 |
+
'id': doc.metadata.get('id'),
|
| 33 |
+
'ementa': doc.page_content,
|
| 34 |
+
'score': float(score),
|
| 35 |
+
'metadata': doc.metadata
|
| 36 |
+
}
|
| 37 |
+
for doc, score in results
|
| 38 |
+
]
|
| 39 |
+
return {
|
| 40 |
+
'cluster_id': self.config.get('cluster_id'),
|
| 41 |
+
'query': query,
|
| 42 |
+
'total_results': len(formatted),
|
| 43 |
+
'results': formatted
|
| 44 |
+
}
|
| 45 |
|
| 46 |
def search_by_keywords(self, keywords: List[str], operator: str = 'AND', top_k: int = 20) -> Dict:
|
| 47 |
return self.search_by_embedding(' '.join(keywords), top_k)
|
| 48 |
|
| 49 |
+
def search_by_ids(self, ids: List[str]) -> Dict:
|
| 50 |
all_docs = self.vectorstore.similarity_search("", k=10000)
|
| 51 |
+
results = [
|
| 52 |
+
{
|
| 53 |
+
'id': doc.metadata.get('id'),
|
| 54 |
+
'ementa': doc.page_content,
|
| 55 |
+
'metadata': doc.metadata
|
| 56 |
+
}
|
| 57 |
+
for doc in all_docs
|
| 58 |
+
if doc.metadata.get('id') in ids
|
| 59 |
+
][:len(ids)]
|
| 60 |
+
return {
|
| 61 |
+
'cluster_id': self.config.get('cluster_id'),
|
| 62 |
+
'total_results': len(results),
|
| 63 |
+
'results': results
|
| 64 |
+
}
|
| 65 |
|
| 66 |
def get_cluster_info(self) -> Dict:
|
| 67 |
+
return {
|
| 68 |
+
'cluster_id': self.config.get('cluster_id'),
|
| 69 |
+
'chunk_range': [self.config.get('chunk_start'), self.config.get('chunk_end')],
|
| 70 |
+
'embedding_model': self.config.get('embedding_model'),
|
| 71 |
+
'embedding_dim': 384,
|
| 72 |
+
'vector_store': 'FAISS',
|
| 73 |
+
'backend': 'LangChain + CPU',
|
| 74 |
+
'status': 'ready'
|
| 75 |
+
}
|
rag_builder.py
CHANGED
|
@@ -1,12 +1,16 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
-
import os, sys, json, argparse, logging,
|
| 3 |
from pathlib import Path
|
| 4 |
from typing import List, Dict
|
| 5 |
from langchain.docstore.document import Document
|
| 6 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 7 |
from langchain_community.vectorstores import FAISS
|
| 8 |
|
| 9 |
-
logging.basicConfig(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
logger = logging.getLogger(__name__)
|
| 11 |
|
| 12 |
def load_jsonl(filepath: str) -> List[Dict]:
|
|
@@ -15,48 +19,98 @@ def load_jsonl(filepath: str) -> List[Dict]:
|
|
| 15 |
with open(filepath, 'r', encoding='utf-8') as f:
|
| 16 |
for i, line in enumerate(f, 1):
|
| 17 |
if line.strip():
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
| 20 |
logger.info(f" {i:,} linhas...")
|
| 21 |
-
logger.info(f"✅ {len(records):,} registros")
|
| 22 |
return records
|
| 23 |
|
| 24 |
def create_documents(records: List[Dict]) -> List[Document]:
|
| 25 |
documents = []
|
|
|
|
|
|
|
| 26 |
for i, record in enumerate(records, 1):
|
| 27 |
ementa = record.get('ementa', '')
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
logger.info(f" {i:,}/{len(records):,}...")
|
| 32 |
-
|
|
|
|
| 33 |
return documents
|
| 34 |
|
| 35 |
-
def build_vectorstore(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
try:
|
| 37 |
logger.info("="*80)
|
| 38 |
-
logger.info("🚀 RAG Builder")
|
| 39 |
logger.info("="*80)
|
| 40 |
-
|
|
|
|
| 41 |
records = load_jsonl(input_file)
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
documents = create_documents(records)
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
start = time.time()
|
|
|
|
| 49 |
vectorstore = FAISS.from_documents(documents, embeddings)
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
| 52 |
os.makedirs(output_dir, exist_ok=True)
|
| 53 |
vectorstore.save_local(output_dir)
|
|
|
|
|
|
|
| 54 |
logger.info("✅ BUILD COMPLETO!")
|
|
|
|
|
|
|
| 55 |
return vectorstore
|
|
|
|
| 56 |
except Exception as e:
|
| 57 |
logger.error(f"\n❌ ERRO: {type(e).__name__}: {e}")
|
|
|
|
| 58 |
logger.error(traceback.format_exc())
|
| 59 |
-
|
| 60 |
|
| 61 |
def main():
|
| 62 |
parser = argparse.ArgumentParser()
|
|
@@ -65,6 +119,7 @@ def main():
|
|
| 65 |
parser.add_argument('--model', default='sentence-transformers/all-MiniLM-L6-v2')
|
| 66 |
parser.add_argument('--batch-size', type=int, default=16)
|
| 67 |
args = parser.parse_args()
|
|
|
|
| 68 |
build_vectorstore(args.input, args.output, args.model, args.batch_size)
|
| 69 |
|
| 70 |
if __name__ == '__main__':
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
+
import os, sys, json, argparse, logging, time
|
| 3 |
from pathlib import Path
|
| 4 |
from typing import List, Dict
|
| 5 |
from langchain.docstore.document import Document
|
| 6 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 7 |
from langchain_community.vectorstores import FAISS
|
| 8 |
|
| 9 |
+
logging.basicConfig(
|
| 10 |
+
level=logging.INFO,
|
| 11 |
+
format='%(asctime)s - %(levelname)s - %(message)s',
|
| 12 |
+
stream=sys.stdout
|
| 13 |
+
)
|
| 14 |
logger = logging.getLogger(__name__)
|
| 15 |
|
| 16 |
def load_jsonl(filepath: str) -> List[Dict]:
|
|
|
|
| 19 |
with open(filepath, 'r', encoding='utf-8') as f:
|
| 20 |
for i, line in enumerate(f, 1):
|
| 21 |
if line.strip():
|
| 22 |
+
try:
|
| 23 |
+
records.append(json.loads(line))
|
| 24 |
+
except json.JSONDecodeError as e:
|
| 25 |
+
logger.warning(f"Linha {i} inválida: {e}")
|
| 26 |
+
if i % 10000 == 0:
|
| 27 |
logger.info(f" {i:,} linhas...")
|
| 28 |
+
logger.info(f"✅ {len(records):,} registros carregados")
|
| 29 |
return records
|
| 30 |
|
| 31 |
def create_documents(records: List[Dict]) -> List[Document]:
|
| 32 |
documents = []
|
| 33 |
+
logger.info("📝 Criando documentos...")
|
| 34 |
+
|
| 35 |
for i, record in enumerate(records, 1):
|
| 36 |
ementa = record.get('ementa', '')
|
| 37 |
+
doc_id = record.get('id', f'unknown_{i}')
|
| 38 |
+
|
| 39 |
+
if ementa and ementa.strip():
|
| 40 |
+
documents.append(
|
| 41 |
+
Document(
|
| 42 |
+
page_content=ementa,
|
| 43 |
+
metadata={
|
| 44 |
+
'id': str(doc_id),
|
| 45 |
+
'source': 'tjpr'
|
| 46 |
+
}
|
| 47 |
+
)
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
if i % 10000 == 0:
|
| 51 |
logger.info(f" {i:,}/{len(records):,}...")
|
| 52 |
+
|
| 53 |
+
logger.info(f"✅ {len(documents):,} documentos criados")
|
| 54 |
return documents
|
| 55 |
|
| 56 |
+
def build_vectorstore(
|
| 57 |
+
input_file,
|
| 58 |
+
output_dir='/app/faiss_index',
|
| 59 |
+
model_name='sentence-transformers/all-MiniLM-L6-v2',
|
| 60 |
+
batch_size=16
|
| 61 |
+
):
|
| 62 |
try:
|
| 63 |
logger.info("="*80)
|
| 64 |
+
logger.info("🚀 RAG Builder v2.0")
|
| 65 |
logger.info("="*80)
|
| 66 |
+
|
| 67 |
+
logger.info("\nPASSO 1/4: Carregando JSONL")
|
| 68 |
records = load_jsonl(input_file)
|
| 69 |
+
|
| 70 |
+
if not records:
|
| 71 |
+
raise Exception("Nenhum registro carregado!")
|
| 72 |
+
|
| 73 |
+
logger.info("\nPASSO 2/4: Criando Documents")
|
| 74 |
documents = create_documents(records)
|
| 75 |
+
|
| 76 |
+
if not documents:
|
| 77 |
+
raise Exception("Nenhum documento criado!")
|
| 78 |
+
|
| 79 |
+
logger.info(f"\nPASSO 3/4: Inicializando Embeddings ({model_name})")
|
| 80 |
+
embeddings = HuggingFaceEmbeddings(
|
| 81 |
+
model_name=model_name,
|
| 82 |
+
model_kwargs={'device': 'cpu'},
|
| 83 |
+
encode_kwargs={
|
| 84 |
+
'batch_size': batch_size,
|
| 85 |
+
'show_progress_bar': True,
|
| 86 |
+
'normalize_embeddings': True
|
| 87 |
+
}
|
| 88 |
+
)
|
| 89 |
+
logger.info("✅ Embeddings inicializados")
|
| 90 |
+
|
| 91 |
+
logger.info(f"\nPASSO 4/4: Construindo FAISS ({len(documents):,} docs)")
|
| 92 |
start = time.time()
|
| 93 |
+
|
| 94 |
vectorstore = FAISS.from_documents(documents, embeddings)
|
| 95 |
+
|
| 96 |
+
elapsed = time.time() - start
|
| 97 |
+
logger.info(f"✅ FAISS construído em {elapsed:.1f}s")
|
| 98 |
+
|
| 99 |
+
logger.info(f"\nSalvando em {output_dir}")
|
| 100 |
os.makedirs(output_dir, exist_ok=True)
|
| 101 |
vectorstore.save_local(output_dir)
|
| 102 |
+
|
| 103 |
+
logger.info("="*80)
|
| 104 |
logger.info("✅ BUILD COMPLETO!")
|
| 105 |
+
logger.info("="*80)
|
| 106 |
+
|
| 107 |
return vectorstore
|
| 108 |
+
|
| 109 |
except Exception as e:
|
| 110 |
logger.error(f"\n❌ ERRO: {type(e).__name__}: {e}")
|
| 111 |
+
import traceback
|
| 112 |
logger.error(traceback.format_exc())
|
| 113 |
+
sys.exit(1)
|
| 114 |
|
| 115 |
def main():
|
| 116 |
parser = argparse.ArgumentParser()
|
|
|
|
| 119 |
parser.add_argument('--model', default='sentence-transformers/all-MiniLM-L6-v2')
|
| 120 |
parser.add_argument('--batch-size', type=int, default=16)
|
| 121 |
args = parser.parse_args()
|
| 122 |
+
|
| 123 |
build_vectorstore(args.input, args.output, args.model, args.batch_size)
|
| 124 |
|
| 125 |
if __name__ == '__main__':
|
requirements.txt
CHANGED
|
@@ -1,28 +1,27 @@
|
|
| 1 |
# FastAPI
|
| 2 |
-
fastapi
|
| 3 |
-
uvicorn[standard]
|
| 4 |
-
pydantic
|
| 5 |
|
| 6 |
# LangChain + Embeddings
|
| 7 |
-
langchain
|
| 8 |
-
langchain-community
|
| 9 |
|
| 10 |
# Sentence Transformers
|
| 11 |
-
sentence-transformers
|
| 12 |
-
transformers
|
| 13 |
-
torch
|
| 14 |
-
tokenizers
|
| 15 |
-
safetensors
|
| 16 |
|
| 17 |
# FAISS
|
| 18 |
-
faiss-cpu
|
| 19 |
|
| 20 |
# HuggingFace Hub
|
| 21 |
-
huggingface-hub
|
| 22 |
|
| 23 |
# Utilities
|
| 24 |
-
PyYAML
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
tqdm
|
|
|
|
| 1 |
# FastAPI
|
| 2 |
+
fastapi==0.109.0
|
| 3 |
+
uvicorn[standard]==0.27.0
|
| 4 |
+
pydantic==2.5.0
|
| 5 |
|
| 6 |
# LangChain + Embeddings
|
| 7 |
+
langchain==0.1.11
|
| 8 |
+
langchain-community==0.0.24
|
| 9 |
|
| 10 |
# Sentence Transformers
|
| 11 |
+
sentence-transformers==2.5.1
|
| 12 |
+
transformers==4.37.2
|
| 13 |
+
torch==2.2.0
|
| 14 |
+
tokenizers==0.15.2
|
| 15 |
+
safetensors==0.4.2
|
| 16 |
|
| 17 |
# FAISS
|
| 18 |
+
faiss-cpu==1.8.0
|
| 19 |
|
| 20 |
# HuggingFace Hub
|
| 21 |
+
huggingface-hub==0.20.3
|
| 22 |
|
| 23 |
# Utilities
|
| 24 |
+
PyYAML==6.0.1
|
| 25 |
+
pandas==2.1.4
|
| 26 |
+
numpy==1.26.3
|
| 27 |
+
tqdm==4.66.1
|
|
|
setup.py
CHANGED
|
@@ -1,11 +1,10 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
-
import os, sys, yaml, json, subprocess, logging, traceback, time
|
| 3 |
from pathlib import Path
|
| 4 |
from datetime import datetime
|
| 5 |
|
| 6 |
-
# Configurar logging VERBOSE
|
| 7 |
logging.basicConfig(
|
| 8 |
-
level=logging.
|
| 9 |
format='%(asctime)s [%(levelname)s] %(message)s',
|
| 10 |
handlers=[
|
| 11 |
logging.StreamHandler(sys.stdout),
|
|
@@ -26,364 +25,240 @@ def update_status(status, message, progress=0):
|
|
| 26 |
}
|
| 27 |
with open(STATUS_FILE, 'w') as f:
|
| 28 |
json.dump(data, f)
|
| 29 |
-
logger.info(f"STATUS
|
| 30 |
sys.stdout.flush()
|
| 31 |
|
| 32 |
-
def run_cmd(cmd, desc,
|
| 33 |
logger.info("="*80)
|
| 34 |
-
logger.info(f"🔧
|
| 35 |
logger.info(f"📝 Comando: {cmd}")
|
| 36 |
-
logger.info(f"📂 PWD: {os.getcwd()}")
|
| 37 |
-
logger.info(f"👤 USER: {os.getenv('USER', 'unknown')}")
|
| 38 |
-
logger.info(f"🏠 HOME: {os.getenv('HOME', 'unknown')}")
|
| 39 |
logger.info("-"*80)
|
| 40 |
|
| 41 |
try:
|
| 42 |
start = time.time()
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
)
|
| 52 |
-
else:
|
| 53 |
-
result = subprocess.run(
|
| 54 |
-
cmd,
|
| 55 |
-
shell=True,
|
| 56 |
-
timeout=600
|
| 57 |
-
)
|
| 58 |
-
result.stdout = ""
|
| 59 |
-
result.stderr = ""
|
| 60 |
-
|
| 61 |
elapsed = time.time() - start
|
| 62 |
|
| 63 |
-
logger.info(f"⏱️
|
| 64 |
-
logger.info(f"🔢 Exit code: {result.returncode}")
|
| 65 |
|
| 66 |
if result.stdout:
|
| 67 |
-
logger.info(f"
|
| 68 |
-
for line in result.stdout.split('\n')[:50]: # Primeiras 50 linhas
|
| 69 |
-
logger.info(f" | {line}")
|
| 70 |
-
if len(result.stdout.split('\n')) > 50:
|
| 71 |
-
logger.info(f" | ... ({len(result.stdout.split(chr(10))) - 50} linhas omitidas)")
|
| 72 |
-
|
| 73 |
if result.stderr:
|
| 74 |
-
logger.warning(f"
|
| 75 |
-
for line in result.stderr.split('\n')[:50]:
|
| 76 |
-
logger.warning(f" | {line}")
|
| 77 |
-
if len(result.stderr.split('\n')) > 50:
|
| 78 |
-
logger.warning(f" | ... ({len(result.stderr.split(chr(10))) - 50} linhas omitidas)")
|
| 79 |
-
|
| 80 |
-
if check and result.returncode != 0:
|
| 81 |
-
logger.error("="*80)
|
| 82 |
-
logger.error(f"❌ COMANDO FALHOU: {desc}")
|
| 83 |
-
logger.error(f"Exit code: {result.returncode}")
|
| 84 |
-
logger.error("="*80)
|
| 85 |
-
raise Exception(f"{desc} falhou com exit code {result.returncode}")
|
| 86 |
|
| 87 |
logger.info(f"✅ {desc} - OK")
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
return result.stdout if capture else ""
|
| 91 |
|
| 92 |
except subprocess.TimeoutExpired:
|
| 93 |
-
logger.error(f"⏰ TIMEOUT após
|
| 94 |
-
raise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
except Exception as e:
|
| 96 |
logger.error(f"💥 EXCEÇÃO: {type(e).__name__}: {e}")
|
| 97 |
-
logger.error(traceback.format_exc())
|
| 98 |
raise
|
| 99 |
|
| 100 |
-
def
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
logger.info("="*80)
|
| 104 |
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
("id", "UID/GID"),
|
| 109 |
-
("git --version", "Git version"),
|
| 110 |
-
("python3 --version", "Python version"),
|
| 111 |
-
("pip list | grep -E '(langchain|torch|transformers)'", "Pacotes principais"),
|
| 112 |
-
("df -h /tmp", "Espaço em /tmp"),
|
| 113 |
-
("free -h", "Memória disponível"),
|
| 114 |
-
("ls -la /home/user/app", "Arquivos app"),
|
| 115 |
-
("cat /etc/resolv.conf", "DNS config"),
|
| 116 |
-
("ping -c 2 github.com || echo 'ping falhou'", "Conectividade GitHub"),
|
| 117 |
-
]
|
| 118 |
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
def main():
|
| 127 |
try:
|
| 128 |
logger.info("\n" + "="*80)
|
| 129 |
-
logger.info("🚀 PARA.AI RAG SETUP - VERSÃO
|
| 130 |
logger.info("="*80)
|
| 131 |
logger.info(f"⏰ Início: {datetime.now()}")
|
| 132 |
-
logger.info(f"🐍 Python: {sys.version}")
|
| 133 |
-
logger.info(f"📂 CWD: {os.getcwd()}")
|
| 134 |
-
logger.info("="*80)
|
| 135 |
|
| 136 |
-
#
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
# CARREGAR CONFIG
|
| 140 |
-
logger.info("\n
|
| 141 |
-
logger.info("📝 PASSO 0: Carregando configuração")
|
| 142 |
-
logger.info("="*80)
|
| 143 |
update_status('loading', 'Carregando configuração', 0)
|
| 144 |
|
| 145 |
-
|
| 146 |
-
logger.info(f"Config file: {config_path}")
|
| 147 |
-
logger.info(f"Existe? {os.path.exists(config_path)}")
|
| 148 |
-
|
| 149 |
-
if os.path.exists(config_path):
|
| 150 |
-
with open(config_path) as f:
|
| 151 |
-
config_content = f.read()
|
| 152 |
-
logger.info(f"Config content ({len(config_content)} chars):")
|
| 153 |
-
logger.info(config_content)
|
| 154 |
-
|
| 155 |
-
with open(config_path) as f:
|
| 156 |
config = yaml.safe_load(f)
|
| 157 |
|
| 158 |
-
logger.info(f"✅ Config carregado:")
|
| 159 |
-
for key, value in config.items():
|
| 160 |
-
logger.info(f" {key}: {value}")
|
| 161 |
-
|
| 162 |
cluster_id = config['cluster_id']
|
| 163 |
chunk_start = config['chunk_start']
|
| 164 |
chunk_end = config['chunk_end']
|
| 165 |
github_repo = config['github_repo']
|
|
|
|
| 166 |
|
| 167 |
-
logger.info(f"\n📊 Configuração:")
|
| 168 |
logger.info(f" Cluster: {cluster_id}")
|
| 169 |
logger.info(f" Chunks: {chunk_start} → {chunk_end} ({chunk_end - chunk_start + 1} chunks)")
|
| 170 |
-
logger.info(f"
|
| 171 |
-
|
| 172 |
-
# VERIFICAR SE JÁ PRONTO
|
| 173 |
-
if READY_FLAG.exists():
|
| 174 |
-
logger.info(f"\n✅ FAISS já existe em {READY_FLAG}")
|
| 175 |
-
update_status('ready', 'FAISS já existe', 100)
|
| 176 |
-
return
|
| 177 |
-
|
| 178 |
-
# PASSO 1: GIT CLONE
|
| 179 |
-
logger.info("\n" + "="*80)
|
| 180 |
-
logger.info("📥 PASSO 1: Git Clone (Sparse Checkout)")
|
| 181 |
-
logger.info("="*80)
|
| 182 |
-
update_status('cloning', 'Iniciando clone do repositório', 10)
|
| 183 |
|
| 184 |
-
|
| 185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
-
|
| 188 |
-
logger.info(f"Criando {repo_dir}...")
|
| 189 |
-
os.makedirs(repo_dir, exist_ok=True)
|
| 190 |
-
logger.info(f"✅ Diretório criado")
|
| 191 |
|
| 192 |
-
#
|
| 193 |
-
|
| 194 |
-
|
| 195 |
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
logger.info(f"✅ PWD agora: {os.getcwd()}")
|
| 200 |
|
| 201 |
-
#
|
| 202 |
-
logger.info("\n
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
logger.info(f"\n📦 Clonando {github_repo}...")
|
| 207 |
-
logger.info("Comando: git clone --filter=blob:none --sparse ...")
|
| 208 |
|
| 209 |
-
|
| 210 |
-
run_cmd(
|
| 211 |
-
f"git clone --filter=blob:none --no-checkout --sparse {github_repo} . 2>&1",
|
| 212 |
-
"Git clone sparse"
|
| 213 |
-
)
|
| 214 |
-
except Exception as e:
|
| 215 |
-
logger.error("\n❌ GIT CLONE FALHOU!")
|
| 216 |
-
logger.error("Tentando diagnóstico adicional...")
|
| 217 |
-
|
| 218 |
-
# Diagnóstico
|
| 219 |
-
run_cmd("ls -la", "List current dir", check=False)
|
| 220 |
-
run_cmd("git --version", "Git version", check=False)
|
| 221 |
-
run_cmd(f"git ls-remote {github_repo} 2>&1 | head -10", "Git ls-remote", check=False)
|
| 222 |
-
run_cmd("cat ~/.gitconfig 2>/dev/null || echo 'No gitconfig'", "Git config", check=False)
|
| 223 |
-
|
| 224 |
-
raise Exception(f"Git clone falhou: {e}")
|
| 225 |
-
|
| 226 |
-
# Verificar clone
|
| 227 |
-
logger.info("\n✅ Clone concluído, verificando...")
|
| 228 |
-
run_cmd("ls -la", "List repo dir")
|
| 229 |
-
run_cmd("git status 2>&1 | head -20", "Git status", check=False)
|
| 230 |
-
|
| 231 |
-
# Sparse checkout init
|
| 232 |
-
logger.info("\n🔧 Configurando sparse checkout...")
|
| 233 |
-
run_cmd("git sparse-checkout init --cone", "Sparse checkout init")
|
| 234 |
-
|
| 235 |
-
# Adicionar patterns em batches
|
| 236 |
-
logger.info(f"\n📋 Adicionando patterns para chunks {chunk_start}-{chunk_end}...")
|
| 237 |
-
patterns = [
|
| 238 |
-
f"chunks_dados/chunk_dados_{i:04d}.tar.gz"
|
| 239 |
-
for i in range(chunk_start, chunk_end + 1)
|
| 240 |
-
]
|
| 241 |
-
|
| 242 |
-
logger.info(f"Total de patterns: {len(patterns)}")
|
| 243 |
-
logger.info(f"Primeiros 5: {patterns[:5]}")
|
| 244 |
-
logger.info(f"Últimos 5: {patterns[-5:]}")
|
| 245 |
-
|
| 246 |
-
batch_size = 50
|
| 247 |
-
for i in range(0, len(patterns), batch_size):
|
| 248 |
-
batch = patterns[i:i+batch_size]
|
| 249 |
-
batch_num = i // batch_size + 1
|
| 250 |
-
total_batches = (len(patterns) + batch_size - 1) // batch_size
|
| 251 |
-
|
| 252 |
-
logger.info(f"\nBatch {batch_num}/{total_batches} ({len(batch)} patterns)...")
|
| 253 |
-
|
| 254 |
-
patterns_str = ' '.join(batch)
|
| 255 |
-
run_cmd(
|
| 256 |
-
f"git sparse-checkout add {patterns_str}",
|
| 257 |
-
f"Add patterns batch {batch_num}"
|
| 258 |
-
)
|
| 259 |
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
|
|
|
|
|
|
| 263 |
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
chunks_found = run_cmd(
|
| 267 |
-
"find chunks_dados -name '*.tar.gz' 2>/dev/null | wc -l",
|
| 268 |
-
"Count chunks"
|
| 269 |
-
).strip()
|
| 270 |
|
| 271 |
-
|
|
|
|
|
|
|
| 272 |
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
|
|
|
|
|
|
| 277 |
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
logger.info("="*80)
|
| 282 |
-
update_status('extracting', f'Extraindo {chunks_found} chunks', 30)
|
| 283 |
|
| 284 |
-
|
| 285 |
-
logger.info(f"Diretório destino: {extract_dir}")
|
| 286 |
-
os.makedirs(extract_dir, exist_ok=True)
|
| 287 |
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
check=False
|
| 292 |
-
)
|
| 293 |
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
"Count JSONL files"
|
| 298 |
-
).strip()
|
| 299 |
-
logger.info(f"✅ Arquivos JSONL extraídos: {jsonl_count}")
|
| 300 |
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
|
|
|
| 306 |
|
| 307 |
-
|
| 308 |
-
"find /tmp/extracted -name 'jurisprudencias.jsonl' -exec cat {} \; > /tmp/all_records.jsonl 2>&1",
|
| 309 |
-
"Concatenate JSONL"
|
| 310 |
-
)
|
| 311 |
|
| 312 |
-
|
| 313 |
-
"
|
| 314 |
-
"Count lines"
|
| 315 |
-
).strip()
|
| 316 |
|
| 317 |
-
|
|
|
|
|
|
|
| 318 |
|
| 319 |
-
|
| 320 |
-
|
| 321 |
|
| 322 |
-
# PASSO
|
| 323 |
logger.info("\n" + "="*80)
|
| 324 |
-
logger.info("
|
| 325 |
logger.info("="*80)
|
| 326 |
-
update_status('
|
| 327 |
|
| 328 |
os.chdir('/home/user/app')
|
| 329 |
-
logger.info(f"PWD: {os.getcwd()}")
|
| 330 |
-
|
| 331 |
-
run_cmd(
|
| 332 |
-
"python3 filter_fields.py --input /tmp/all_records.jsonl --output /tmp/filtered.jsonl 2>&1",
|
| 333 |
-
"Filter fields"
|
| 334 |
-
)
|
| 335 |
-
|
| 336 |
-
filtered_lines = run_cmd(
|
| 337 |
-
"wc -l < /tmp/filtered.jsonl 2>/dev/null || echo '0'",
|
| 338 |
-
"Count filtered"
|
| 339 |
-
).strip()
|
| 340 |
-
logger.info(f"✅ Registros filtrados: {filtered_lines}")
|
| 341 |
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
"
|
| 352 |
-
"Build FAISS",
|
| 353 |
-
capture=False
|
| 354 |
-
)
|
| 355 |
|
| 356 |
logger.info("✅ FAISS construído!")
|
| 357 |
|
| 358 |
-
#
|
| 359 |
-
|
| 360 |
-
"ls -lh /app/faiss_index 2>&1",
|
| 361 |
-
"List FAISS files",
|
| 362 |
-
check=False
|
| 363 |
-
)
|
| 364 |
-
|
| 365 |
-
# PASSO 6: CLEANUP
|
| 366 |
-
logger.info("\n" + "="*80)
|
| 367 |
-
logger.info("🧹 PASSO 6: Limpando temporários")
|
| 368 |
-
logger.info("="*80)
|
| 369 |
update_status('cleaning', 'Limpando arquivos temporários', 95)
|
| 370 |
|
| 371 |
-
run_cmd(
|
| 372 |
-
"rm -rf /tmp/repo /tmp/extracted /tmp/all_records.jsonl /tmp/filtered.jsonl",
|
| 373 |
-
"Cleanup temp files"
|
| 374 |
-
)
|
| 375 |
|
| 376 |
# CONCLUÍDO
|
| 377 |
logger.info("\n" + "="*80)
|
| 378 |
logger.info("✅ SETUP COMPLETO!")
|
| 379 |
logger.info("="*80)
|
| 380 |
|
| 381 |
-
update_status('ready', f'FAISS pronto com {
|
| 382 |
READY_FLAG.touch()
|
| 383 |
|
| 384 |
logger.info(f"⏰ Fim: {datetime.now()}")
|
| 385 |
-
logger.info(f"📁 Logs salvos em: /tmp/setup_debug.log")
|
| 386 |
-
logger.info("="*80)
|
| 387 |
|
| 388 |
except Exception as e:
|
| 389 |
logger.error("\n" + "="*80)
|
|
@@ -391,13 +266,10 @@ def main():
|
|
| 391 |
logger.error("="*80)
|
| 392 |
logger.error(f"Tipo: {type(e).__name__}")
|
| 393 |
logger.error(f"Mensagem: {str(e)}")
|
| 394 |
-
logger.error("\nTraceback
|
| 395 |
logger.error(traceback.format_exc())
|
| 396 |
-
logger.error("="*80)
|
| 397 |
-
logger.error(f"📁 Logs completos em: /tmp/setup_debug.log")
|
| 398 |
-
logger.error("="*80)
|
| 399 |
|
| 400 |
-
update_status('error', f'
|
| 401 |
sys.exit(1)
|
| 402 |
|
| 403 |
if __name__ == "__main__":
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
+
import os, sys, yaml, json, subprocess, logging, traceback, time, tarfile, io
|
| 3 |
from pathlib import Path
|
| 4 |
from datetime import datetime
|
| 5 |
|
|
|
|
| 6 |
logging.basicConfig(
|
| 7 |
+
level=logging.INFO,
|
| 8 |
format='%(asctime)s [%(levelname)s] %(message)s',
|
| 9 |
handlers=[
|
| 10 |
logging.StreamHandler(sys.stdout),
|
|
|
|
| 25 |
}
|
| 26 |
with open(STATUS_FILE, 'w') as f:
|
| 27 |
json.dump(data, f)
|
| 28 |
+
logger.info(f"STATUS [{progress}%]: {status} - {message}")
|
| 29 |
sys.stdout.flush()
|
| 30 |
|
| 31 |
+
def run_cmd(cmd, desc, check=True, timeout=300):
    """Run a shell command, logging its duration, exit code and output.

    Args:
        cmd: Shell command line to execute.
        desc: Human-readable description used in the log banners.
        check: When True, raise CalledProcessError on a non-zero exit.
        timeout: Seconds before the command is killed (TimeoutExpired).

    Returns:
        The command's captured stdout (str).

    Raises:
        subprocess.TimeoutExpired, subprocess.CalledProcessError, or any
        other exception from subprocess.run — always logged, then re-raised.
    """
    logger.info("=" * 80)
    logger.info(f"🔧 {desc}")
    logger.info(f"📝 Comando: {cmd}")
    logger.info("-" * 80)

    try:
        started_at = time.time()
        # NOTE(review): shell=True with an interpolated command string —
        # commands are built internally here, but keep untrusted input
        # out of `cmd`.
        proc = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
            text=True,
            timeout=timeout,
            check=check,
        )
        duration = time.time() - started_at

        logger.info(f"⏱️ {duration:.2f}s | Exit: {proc.returncode}")

        # Log at most the first 500 chars of each stream to keep logs readable.
        if proc.stdout:
            logger.info(f"STDOUT: {proc.stdout[:500]}")
        if proc.stderr:
            logger.warning(f"STDERR: {proc.stderr[:500]}")

        logger.info(f"✅ {desc} - OK")
        return proc.stdout

    except subprocess.TimeoutExpired:
        logger.error(f"⏰ TIMEOUT após {timeout}s: {desc}")
        raise
    except subprocess.CalledProcessError as err:
        logger.error(f"❌ FALHOU: {desc}")
        logger.error(f"Exit code: {err.returncode}")
        logger.error(f"STDERR: {err.stderr[:500]}")
        raise
    except Exception as err:
        logger.error(f"💥 EXCEÇÃO: {type(err).__name__}: {err}")
        raise
|
| 70 |
|
| 71 |
+
def filter_jsonl_record(record, fields_to_keep):
    """Return a copy of *record* containing only the keys in *fields_to_keep*.

    Keys listed in *fields_to_keep* but absent from *record* are skipped;
    insertion order follows *fields_to_keep*.
    """
    filtered = {}
    for field in fields_to_keep:
        if field in record:
            filtered[field] = record[field]
    return filtered
|
|
|
|
| 74 |
|
| 75 |
+
def process_tar_gz(tar_path, output_jsonl, fields_to_keep):
    """Extract a TAR.GZ chunk, filter its JSONL records and append them.

    Finds the first regular member whose name ends in 'jurisprudencias.jsonl',
    streams it line by line (instead of loading the whole member into memory,
    which defeated the script's space-saving goal), keeps only the keys in
    *fields_to_keep* for each record, and appends the filtered records to
    *output_jsonl*.

    Args:
        tar_path: Path to the .tar.gz archive.
        output_jsonl: Path of the consolidated JSONL file (opened in append mode).
        fields_to_keep: Iterable of record keys to retain.

    Returns:
        Number of records written (0 if no matching JSONL member was found).

    Raises:
        Re-raises any extraction/IO error after logging it.
    """
    logger.info(f"📦 Processando: {tar_path.name}")

    try:
        with tarfile.open(tar_path, 'r:gz') as tar:
            members = tar.getmembers()
            logger.info(f"  Arquivos no TAR.GZ: {len(members)}")

            for member in members:
                if member.name.endswith('jurisprudencias.jsonl') and member.isfile():
                    logger.info(f"  ✅ Encontrado: {member.name}")

                    file_obj = tar.extractfile(member)
                    if file_obj is None:
                        # Defensive: extractfile() returns None for
                        # non-regular members (links, devices).
                        continue

                    processed = 0
                    with open(output_jsonl, 'a', encoding='utf-8') as out:
                        # Stream line by line — chunks can be large, and the
                        # previous read().decode() held the whole file in RAM.
                        for raw_line in io.TextIOWrapper(file_obj, encoding='utf-8'):
                            line = raw_line.strip()
                            if not line:
                                continue
                            try:
                                record = json.loads(line)
                            except json.JSONDecodeError:
                                # Skip malformed lines, as before.
                                continue
                            filtered = filter_jsonl_record(record, fields_to_keep)
                            out.write(json.dumps(filtered, ensure_ascii=False) + '\n')
                            processed += 1

                    logger.info(f"  ✅ Processados: {processed} registros")
                    return processed

        logger.warning(f"  ⚠️ Nenhum jurisprudencias.jsonl encontrado")
        return 0

    except Exception as e:
        logger.error(f"  ❌ Erro ao processar {tar_path.name}: {e}")
        raise
|
| 117 |
|
| 118 |
def main():
    """Orchestrate the full RAG setup pipeline.

    Steps: load config.yaml; download each chunk_dados_*.tar.gz directly from
    the GitHub raw URL (no repo clone); filter and concatenate their JSONL
    records; build the FAISS index via rag_builder.py; clean temporaries and
    touch READY_FLAG. On any fatal error, writes an 'error' status and exits
    the process with status 1.
    """
    try:
        logger.info("\n" + "=" * 80)
        logger.info("🚀 PARA.AI RAG SETUP - VERSÃO OTIMIZADA")
        logger.info("=" * 80)
        logger.info(f"⏰ Início: {datetime.now()}")

        # Fast path: a previous run already built the index.
        if READY_FLAG.exists():
            logger.info("✅ FAISS já existe")
            update_status('ready', 'FAISS já pronto', 100)
            return

        # STEP 0: configuration
        logger.info("\n📝 PASSO 0: Carregando configuração")
        update_status('loading', 'Carregando configuração', 0)

        with open('config.yaml') as f:
            config = yaml.safe_load(f)

        cluster_id = config['cluster_id']
        chunk_start = config['chunk_start']
        chunk_end = config['chunk_end']
        github_repo = config['github_repo']
        campos_filter = config['campos_filter']

        logger.info(f"  Cluster: {cluster_id}")
        logger.info(f"  Chunks: {chunk_start} → {chunk_end} ({chunk_end - chunk_start + 1} chunks)")
        logger.info(f"  Campos: {campos_filter}")

        # Build the raw.githubusercontent.com base URL so chunks can be
        # fetched directly (avoids cloning the whole repository).
        base_url = github_repo.replace('https://github.com/', 'https://raw.githubusercontent.com/')
        if base_url.endswith('.git'):
            base_url = base_url[:-4]
        base_url = f"{base_url}/main/chunks_dados"
        logger.info(f"  Base URL: {base_url}")

        # Working directory; start from a clean consolidated output file.
        work_dir = Path('/tmp/work')
        work_dir.mkdir(exist_ok=True)
        output_jsonl = work_dir / 'all_filtered.jsonl'
        if output_jsonl.exists():
            output_jsonl.unlink()

        # STEP 1: download and process chunks one at a time (saves disk space)
        logger.info("\n" + "=" * 80)
        logger.info("📥 PASSO 1: Download e Processamento de Chunks")
        logger.info("=" * 80)
        update_status('downloading', 'Baixando e processando chunks', 10)

        total_records = 0
        # assumes chunk_end >= chunk_start (guaranteed by config) — TODO confirm
        total_chunks = chunk_end - chunk_start + 1

        for chunk_num in range(chunk_start, chunk_end + 1):
            # File naming convention: chunk_dados_000001.tar.gz
            chunk_name = f"chunk_dados_{chunk_num:06d}.tar.gz"
            chunk_url = f"{base_url}/{chunk_name}"
            chunk_path = work_dir / chunk_name

            logger.info(f"\n📦 Chunk {chunk_num}/{chunk_end}")
            logger.info(f"  URL: {chunk_url}")

            # Map the chunk index onto the 10–50% progress band.
            progress = 10 + ((chunk_num - chunk_start) * 40 // total_chunks)
            update_status('downloading', f'Baixando chunk {chunk_num}/{chunk_end}', progress)

            try:
                # -f makes curl exit non-zero on HTTP errors (e.g. 404),
                # so missing chunks are caught instead of saving an error page.
                run_cmd(
                    f"curl -L -f -o {chunk_path} {chunk_url}",
                    f"Download {chunk_name}",
                    timeout=300
                )

                if not chunk_path.exists() or chunk_path.stat().st_size == 0:
                    logger.warning(f"  ⚠️ Arquivo vazio ou não baixado: {chunk_name}")
                    continue

                logger.info(f"  ✅ Baixado: {chunk_path.stat().st_size / 1024 / 1024:.2f} MB")

                # Extract + filter + append into the consolidated JSONL.
                total_records += process_tar_gz(chunk_path, output_jsonl, campos_filter)

                # Drop the archive immediately to keep disk usage low.
                chunk_path.unlink()
                logger.info(f"  🗑️ Arquivo TAR.GZ apagado (economizando espaço)")

            except Exception as e:
                # A single failed chunk must not abort the whole build.
                logger.error(f"  ❌ Erro no chunk {chunk_num}: {e}")
                if chunk_path.exists():
                    chunk_path.unlink()
                continue

        logger.info(f"\n✅ Total de registros processados: {total_records}")

        if total_records == 0:
            raise RuntimeError("Nenhum registro foi processado!")
        if not output_jsonl.exists():
            raise RuntimeError("Arquivo all_filtered.jsonl não foi criado!")

        final_lines = int(run_cmd(f"wc -l < {output_jsonl}", "Count lines").strip())
        logger.info(f"✅ Linhas no JSONL final: {final_lines}")

        # STEP 2: build the FAISS index
        logger.info("\n" + "=" * 80)
        logger.info("🤖 PASSO 2: Construindo FAISS index")
        logger.info("=" * 80)
        update_status('building', f'Construindo FAISS com {final_lines} documentos', 70)

        os.chdir('/home/user/app')

        try:
            run_cmd(
                f"python3 rag_builder.py --input {output_jsonl} 2>&1",
                "Build FAISS",
                timeout=900  # 15 minutes
            )
        except Exception as e:
            logger.error(f"❌ Build FAISS falhou: {e}")
            # Chain the original error so the traceback keeps the root cause.
            raise RuntimeError(f"Build FAISS falhou: {e}") from e

        logger.info("✅ FAISS construído!")

        # STEP 3: cleanup
        logger.info("\n🧹 PASSO 3: Limpando temporários")
        update_status('cleaning', 'Limpando arquivos temporários', 95)
        run_cmd(f"rm -rf {work_dir}", "Cleanup", check=False)

        # DONE
        logger.info("\n" + "=" * 80)
        logger.info("✅ SETUP COMPLETO!")
        logger.info("=" * 80)

        update_status('ready', f'FAISS pronto com {total_records} registros!', 100)
        READY_FLAG.touch()

        logger.info(f"⏰ Fim: {datetime.now()}")

    except Exception as e:
        logger.error("\n" + "=" * 80)
        logger.error("❌ SETUP FALHOU!")
        logger.error("=" * 80)
        logger.error(f"Tipo: {type(e).__name__}")
        logger.error(f"Mensagem: {str(e)}")
        logger.error("\nTraceback:")
        logger.error(traceback.format_exc())

        update_status('error', f'Setup falhou: {str(e)}', 0)
        sys.exit(1)
|
| 274 |
|
| 275 |
if __name__ == "__main__":
|