Spaces:
Build error
Build error
Upload 9 files
Browse files- .gitignore +1 -15
- Dockerfile +0 -1
- app.py +14 -103
- entrypoint.sh +6 -20
- filter_fields.py +19 -34
- query_engine.py +61 -154
- rag_builder.py +68 -79
- requirements.txt +6 -5
- setup.py +43 -135
.gitignore
CHANGED
|
@@ -1,29 +1,15 @@
|
|
| 1 |
-
# Python
|
| 2 |
__pycache__/
|
| 3 |
*.py[cod]
|
| 4 |
-
*$py.class
|
| 5 |
-
*.so
|
| 6 |
.Python
|
| 7 |
env/
|
| 8 |
venv/
|
| 9 |
*.egg-info/
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
# ChromaDB
|
| 13 |
-
chromadb/
|
| 14 |
*.sqlite3
|
| 15 |
-
|
| 16 |
-
# Temporários
|
| 17 |
/tmp/
|
| 18 |
*.tar.gz
|
| 19 |
*.jsonl
|
| 20 |
repo_git_temp/
|
| 21 |
-
|
| 22 |
-
# IDE
|
| 23 |
.vscode/
|
| 24 |
.idea/
|
| 25 |
-
*.swp
|
| 26 |
-
*.swo
|
| 27 |
-
|
| 28 |
-
# Logs
|
| 29 |
*.log
|
|
|
|
|
|
|
| 1 |
__pycache__/
|
| 2 |
*.py[cod]
|
|
|
|
|
|
|
| 3 |
.Python
|
| 4 |
env/
|
| 5 |
venv/
|
| 6 |
*.egg-info/
|
| 7 |
+
faiss_index/
|
|
|
|
|
|
|
|
|
|
| 8 |
*.sqlite3
|
|
|
|
|
|
|
| 9 |
/tmp/
|
| 10 |
*.tar.gz
|
| 11 |
*.jsonl
|
| 12 |
repo_git_temp/
|
|
|
|
|
|
|
| 13 |
.vscode/
|
| 14 |
.idea/
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
*.log
|
Dockerfile
CHANGED
|
@@ -30,4 +30,3 @@ EXPOSE 7860
|
|
| 30 |
|
| 31 |
# Comando de inicialização
|
| 32 |
CMD ["./entrypoint.sh"]
|
| 33 |
-
|
|
|
|
| 30 |
|
| 31 |
# Comando de inicialização
|
| 32 |
CMD ["./entrypoint.sh"]
|
|
|
app.py
CHANGED
|
@@ -1,11 +1,5 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Para.AI RAG Cluster - FastAPI Application
|
| 4 |
-
Inicia IMEDIATAMENTE (antes do setup terminar) para evitar timeout HF
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
from fastapi import FastAPI, HTTPException
|
| 8 |
-
from fastapi.responses import JSONResponse
|
| 9 |
from pydantic import BaseModel
|
| 10 |
from typing import List, Optional
|
| 11 |
import logging
|
|
@@ -16,73 +10,35 @@ from pathlib import Path
|
|
| 16 |
logging.basicConfig(level=logging.INFO)
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
|
| 19 |
-
# ============================================================================
|
| 20 |
-
# VERIFICAÇÃO DE STATUS DO SETUP
|
| 21 |
-
# ============================================================================
|
| 22 |
-
|
| 23 |
STATUS_FILE = Path('/tmp/setup_status.json')
|
| 24 |
-
READY_FLAG = Path('/tmp/
|
| 25 |
|
| 26 |
def get_setup_status():
|
| 27 |
-
"""Lê status do setup em background"""
|
| 28 |
if not STATUS_FILE.exists():
|
| 29 |
-
return {
|
| 30 |
-
'status': 'initializing',
|
| 31 |
-
'message': 'Setup ainda não iniciado',
|
| 32 |
-
'progress': 0
|
| 33 |
-
}
|
| 34 |
-
|
| 35 |
try:
|
| 36 |
with open(STATUS_FILE) as f:
|
| 37 |
return json.load(f)
|
| 38 |
except:
|
| 39 |
-
return {
|
| 40 |
-
'status': 'unknown',
|
| 41 |
-
'message': 'Erro ao ler status',
|
| 42 |
-
'progress': 0
|
| 43 |
-
}
|
| 44 |
|
| 45 |
def is_ready():
|
| 46 |
-
"""Verifica se ChromaDB está pronto"""
|
| 47 |
return READY_FLAG.exists()
|
| 48 |
|
| 49 |
-
# ============================================================================
|
| 50 |
-
# LAZY LOADING DO QUERY ENGINE
|
| 51 |
-
# ============================================================================
|
| 52 |
-
|
| 53 |
query_engine = None
|
| 54 |
|
| 55 |
def get_query_engine():
|
| 56 |
-
"""Carrega QueryEngine apenas quando ChromaDB estiver pronto"""
|
| 57 |
global query_engine
|
| 58 |
-
|
| 59 |
if query_engine is None:
|
| 60 |
if not is_ready():
|
| 61 |
-
raise HTTPException(
|
| 62 |
-
status_code=503,
|
| 63 |
-
detail="RAG ainda em construção. Tente novamente em alguns minutos."
|
| 64 |
-
)
|
| 65 |
-
|
| 66 |
logger.info("Carregando QueryEngine...")
|
| 67 |
from query_engine import QueryEngine
|
| 68 |
query_engine = QueryEngine()
|
| 69 |
logger.info("✅ QueryEngine carregado!")
|
| 70 |
-
|
| 71 |
return query_engine
|
| 72 |
|
| 73 |
-
|
| 74 |
-
# FASTAPI APP
|
| 75 |
-
# ============================================================================
|
| 76 |
-
|
| 77 |
-
app = FastAPI(
|
| 78 |
-
title="Para.AI RAG Cluster",
|
| 79 |
-
description="Micro-cluster RAG para jurisprudências do TJPR",
|
| 80 |
-
version="1.0.0"
|
| 81 |
-
)
|
| 82 |
-
|
| 83 |
-
# ============================================================================
|
| 84 |
-
# MODELS (Pydantic)
|
| 85 |
-
# ============================================================================
|
| 86 |
|
| 87 |
class EmbeddingSearchRequest(BaseModel):
|
| 88 |
query: str
|
|
@@ -98,121 +54,76 @@ class IDSearchRequest(BaseModel):
|
|
| 98 |
ids: List[str]
|
| 99 |
return_embeddings: bool = False
|
| 100 |
|
| 101 |
-
# ============================================================================
|
| 102 |
-
# ENDPOINTS
|
| 103 |
-
# ============================================================================
|
| 104 |
-
|
| 105 |
@app.get("/")
|
| 106 |
async def root():
|
| 107 |
-
"""Health check - SEMPRE responde (mesmo durante setup)"""
|
| 108 |
setup_status = get_setup_status()
|
| 109 |
ready = is_ready()
|
| 110 |
|
| 111 |
-
response = {
|
| 112 |
-
"status": "online",
|
| 113 |
-
"rag_ready": ready,
|
| 114 |
-
"setup": setup_status
|
| 115 |
-
}
|
| 116 |
|
| 117 |
if ready and query_engine:
|
| 118 |
-
response["cluster_id"] = query_engine.config
|
| 119 |
-
response["chunk_range"] = [
|
| 120 |
-
query_engine.config['chunk_start'],
|
| 121 |
-
query_engine.config['chunk_end']
|
| 122 |
-
]
|
| 123 |
-
response["endpoints"] = [
|
| 124 |
-
"/search/embedding",
|
| 125 |
-
"/search/keywords",
|
| 126 |
-
"/search/by_id",
|
| 127 |
-
"/cluster/info",
|
| 128 |
-
"/setup/status"
|
| 129 |
-
]
|
| 130 |
|
| 131 |
return response
|
| 132 |
|
| 133 |
@app.get("/setup/status")
|
| 134 |
async def setup_status():
|
| 135 |
-
"""Retorna status detalhado do setup"""
|
| 136 |
return get_setup_status()
|
| 137 |
|
| 138 |
@app.get("/health")
|
| 139 |
async def health():
|
| 140 |
-
"""Health check simples para HF Spaces"""
|
| 141 |
return {"status": "ok", "timestamp": time.time()}
|
| 142 |
|
| 143 |
@app.post("/search/embedding")
|
| 144 |
async def search_embedding(request: EmbeddingSearchRequest):
|
| 145 |
-
|
| 146 |
-
engine = get_query_engine() # Lança 503 se não estiver pronto
|
| 147 |
-
|
| 148 |
try:
|
| 149 |
start = time.time()
|
| 150 |
-
results = engine.search_by_embedding(
|
| 151 |
-
query=request.query,
|
| 152 |
-
top_k=request.top_k,
|
| 153 |
-
return_embeddings=request.return_embeddings
|
| 154 |
-
)
|
| 155 |
results['query_time_ms'] = round((time.time() - start) * 1000, 2)
|
| 156 |
return results
|
| 157 |
except Exception as e:
|
| 158 |
-
logger.error(f"Erro
|
| 159 |
raise HTTPException(status_code=500, detail=str(e))
|
| 160 |
|
| 161 |
@app.post("/search/keywords")
|
| 162 |
async def search_keywords(request: KeywordSearchRequest):
|
| 163 |
-
"""Busca por termos-chave (full-text search)"""
|
| 164 |
engine = get_query_engine()
|
| 165 |
-
|
| 166 |
try:
|
| 167 |
start = time.time()
|
| 168 |
-
results = engine.search_by_keywords(
|
| 169 |
-
keywords=request.keywords,
|
| 170 |
-
operator=request.operator,
|
| 171 |
-
top_k=request.top_k
|
| 172 |
-
)
|
| 173 |
results['query_time_ms'] = round((time.time() - start) * 1000, 2)
|
| 174 |
return results
|
| 175 |
except Exception as e:
|
| 176 |
-
logger.error(f"Erro em search_keywords: {e}")
|
| 177 |
raise HTTPException(status_code=500, detail=str(e))
|
| 178 |
|
| 179 |
@app.post("/search/by_id")
|
| 180 |
async def search_by_id(request: IDSearchRequest):
|
| 181 |
-
"""Busca direta por ID(s)"""
|
| 182 |
engine = get_query_engine()
|
| 183 |
-
|
| 184 |
try:
|
| 185 |
start = time.time()
|
| 186 |
-
results = engine.search_by_ids(
|
| 187 |
-
ids=request.ids,
|
| 188 |
-
return_embeddings=request.return_embeddings
|
| 189 |
-
)
|
| 190 |
results['query_time_ms'] = round((time.time() - start) * 1000, 2)
|
| 191 |
return results
|
| 192 |
except Exception as e:
|
| 193 |
-
logger.error(f"Erro em search_by_id: {e}")
|
| 194 |
raise HTTPException(status_code=500, detail=str(e))
|
| 195 |
|
| 196 |
@app.get("/cluster/info")
|
| 197 |
async def cluster_info():
|
| 198 |
-
"""Informações detalhadas do cluster"""
|
| 199 |
engine = get_query_engine()
|
| 200 |
-
|
| 201 |
try:
|
| 202 |
info = engine.get_cluster_info()
|
| 203 |
info['uptime_seconds'] = round(time.time() - app.state.start_time, 2)
|
| 204 |
return info
|
| 205 |
except Exception as e:
|
| 206 |
-
logger.error(f"Erro em cluster_info: {e}")
|
| 207 |
raise HTTPException(status_code=500, detail=str(e))
|
| 208 |
|
| 209 |
@app.on_event("startup")
|
| 210 |
async def startup_event():
|
| 211 |
-
"""Evento de startup - RÁPIDO (não aguarda setup)"""
|
| 212 |
app.state.start_time = time.time()
|
| 213 |
logger.info("="*80)
|
| 214 |
-
logger.info("🚀 Para.AI RAG
|
| 215 |
-
logger.info("Setup em background: verificar /setup/status")
|
| 216 |
logger.info("="*80)
|
| 217 |
|
| 218 |
if __name__ == "__main__":
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from fastapi import FastAPI, HTTPException
|
|
|
|
| 3 |
from pydantic import BaseModel
|
| 4 |
from typing import List, Optional
|
| 5 |
import logging
|
|
|
|
| 10 |
logging.basicConfig(level=logging.INFO)
|
| 11 |
logger = logging.getLogger(__name__)
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
STATUS_FILE = Path('/tmp/setup_status.json')
|
| 14 |
+
READY_FLAG = Path('/tmp/faiss_ready')
|
| 15 |
|
| 16 |
def get_setup_status():
|
|
|
|
| 17 |
if not STATUS_FILE.exists():
|
| 18 |
+
return {'status': 'initializing', 'message': 'Setup não iniciado', 'progress': 0}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
try:
|
| 20 |
with open(STATUS_FILE) as f:
|
| 21 |
return json.load(f)
|
| 22 |
except:
|
| 23 |
+
return {'status': 'unknown', 'message': 'Erro ao ler status', 'progress': 0}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
def is_ready():
|
|
|
|
| 26 |
return READY_FLAG.exists()
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
query_engine = None
|
| 29 |
|
| 30 |
def get_query_engine():
|
|
|
|
| 31 |
global query_engine
|
|
|
|
| 32 |
if query_engine is None:
|
| 33 |
if not is_ready():
|
| 34 |
+
raise HTTPException(status_code=503, detail="RAG em construção. Tente em alguns minutos.")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
logger.info("Carregando QueryEngine...")
|
| 36 |
from query_engine import QueryEngine
|
| 37 |
query_engine = QueryEngine()
|
| 38 |
logger.info("✅ QueryEngine carregado!")
|
|
|
|
| 39 |
return query_engine
|
| 40 |
|
| 41 |
+
app = FastAPI(title="Para.AI RAG Cluster (LangChain)", version="1.0.0")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
class EmbeddingSearchRequest(BaseModel):
|
| 44 |
query: str
|
|
|
|
| 54 |
ids: List[str]
|
| 55 |
return_embeddings: bool = False
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
@app.get("/")
|
| 58 |
async def root():
|
|
|
|
| 59 |
setup_status = get_setup_status()
|
| 60 |
ready = is_ready()
|
| 61 |
|
| 62 |
+
response = {"status": "online", "rag_ready": ready, "setup": setup_status, "backend": "LangChain + FAISS (CPU)"}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
if ready and query_engine:
|
| 65 |
+
response["cluster_id"] = query_engine.config.get('cluster_id')
|
| 66 |
+
response["chunk_range"] = [query_engine.config.get('chunk_start'), query_engine.config.get('chunk_end')]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
return response
|
| 69 |
|
| 70 |
@app.get("/setup/status")
|
| 71 |
async def setup_status():
|
|
|
|
| 72 |
return get_setup_status()
|
| 73 |
|
| 74 |
@app.get("/health")
|
| 75 |
async def health():
|
|
|
|
| 76 |
return {"status": "ok", "timestamp": time.time()}
|
| 77 |
|
| 78 |
@app.post("/search/embedding")
|
| 79 |
async def search_embedding(request: EmbeddingSearchRequest):
|
| 80 |
+
engine = get_query_engine()
|
|
|
|
|
|
|
| 81 |
try:
|
| 82 |
start = time.time()
|
| 83 |
+
results = engine.search_by_embedding(request.query, request.top_k, request.return_embeddings)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
results['query_time_ms'] = round((time.time() - start) * 1000, 2)
|
| 85 |
return results
|
| 86 |
except Exception as e:
|
| 87 |
+
logger.error(f"Erro: {e}")
|
| 88 |
raise HTTPException(status_code=500, detail=str(e))
|
| 89 |
|
| 90 |
@app.post("/search/keywords")
|
| 91 |
async def search_keywords(request: KeywordSearchRequest):
|
|
|
|
| 92 |
engine = get_query_engine()
|
|
|
|
| 93 |
try:
|
| 94 |
start = time.time()
|
| 95 |
+
results = engine.search_by_keywords(request.keywords, request.operator, request.top_k)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
results['query_time_ms'] = round((time.time() - start) * 1000, 2)
|
| 97 |
return results
|
| 98 |
except Exception as e:
|
|
|
|
| 99 |
raise HTTPException(status_code=500, detail=str(e))
|
| 100 |
|
| 101 |
@app.post("/search/by_id")
|
| 102 |
async def search_by_id(request: IDSearchRequest):
|
|
|
|
| 103 |
engine = get_query_engine()
|
|
|
|
| 104 |
try:
|
| 105 |
start = time.time()
|
| 106 |
+
results = engine.search_by_ids(request.ids, request.return_embeddings)
|
|
|
|
|
|
|
|
|
|
| 107 |
results['query_time_ms'] = round((time.time() - start) * 1000, 2)
|
| 108 |
return results
|
| 109 |
except Exception as e:
|
|
|
|
| 110 |
raise HTTPException(status_code=500, detail=str(e))
|
| 111 |
|
| 112 |
@app.get("/cluster/info")
|
| 113 |
async def cluster_info():
|
|
|
|
| 114 |
engine = get_query_engine()
|
|
|
|
| 115 |
try:
|
| 116 |
info = engine.get_cluster_info()
|
| 117 |
info['uptime_seconds'] = round(time.time() - app.state.start_time, 2)
|
| 118 |
return info
|
| 119 |
except Exception as e:
|
|
|
|
| 120 |
raise HTTPException(status_code=500, detail=str(e))
|
| 121 |
|
| 122 |
@app.on_event("startup")
|
| 123 |
async def startup_event():
|
|
|
|
| 124 |
app.state.start_time = time.time()
|
| 125 |
logger.info("="*80)
|
| 126 |
+
logger.info("🚀 Para.AI RAG (LangChain + FAISS) ONLINE")
|
|
|
|
| 127 |
logger.info("="*80)
|
| 128 |
|
| 129 |
if __name__ == "__main__":
|
entrypoint.sh
CHANGED
|
@@ -2,41 +2,27 @@
|
|
| 2 |
set -e
|
| 3 |
|
| 4 |
echo "=================================="
|
| 5 |
-
echo "🚀 Para.AI RAG
|
| 6 |
echo "=================================="
|
| 7 |
|
| 8 |
-
# Ir para diretório da aplicação
|
| 9 |
cd /home/user/app
|
| 10 |
|
| 11 |
-
# ESTRATÉGIA: Iniciar setup em background PRIMEIRO, depois FastAPI
|
| 12 |
-
# Isso evita timeout de inicialização do HF Spaces
|
| 13 |
-
|
| 14 |
echo ""
|
| 15 |
echo "1️⃣ Iniciando setup em background..."
|
| 16 |
-
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
|
| 17 |
-
|
| 18 |
-
# Iniciar setup.py em background com output unbuffered (-u)
|
| 19 |
-
# Redirecionar output para arquivo + tela
|
| 20 |
python3 -u setup.py > /tmp/setup_output.log 2>&1 &
|
| 21 |
SETUP_PID=$!
|
| 22 |
|
| 23 |
-
echo "✅ Setup
|
| 24 |
-
echo "📋 Logs
|
| 25 |
-
echo "📊 Status em: /tmp/setup_status.json"
|
| 26 |
echo ""
|
| 27 |
|
| 28 |
-
# Esperar 2 segundos para setup criar arquivo de status
|
| 29 |
sleep 2
|
| 30 |
|
| 31 |
echo "2️⃣ Iniciando FastAPI..."
|
| 32 |
-
echo "
|
| 33 |
-
echo "
|
| 34 |
-
echo "
|
| 35 |
-
echo "📡 Acompanhe em: /setup/status"
|
| 36 |
echo ""
|
| 37 |
echo "=================================="
|
| 38 |
-
echo "🚀 Iniciando API REST..."
|
| 39 |
-
echo "=================================="
|
| 40 |
|
| 41 |
-
# Iniciar FastAPI (bloqueia aqui)
|
| 42 |
exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1
|
|
|
|
| 2 |
set -e
|
| 3 |
|
| 4 |
echo "=================================="
|
| 5 |
+
echo "🚀 Para.AI RAG (LangChain) Startup"
|
| 6 |
echo "=================================="
|
| 7 |
|
|
|
|
| 8 |
cd /home/user/app
|
| 9 |
|
|
|
|
|
|
|
|
|
|
| 10 |
echo ""
|
| 11 |
echo "1️⃣ Iniciando setup em background..."
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
python3 -u setup.py > /tmp/setup_output.log 2>&1 &
|
| 13 |
SETUP_PID=$!
|
| 14 |
|
| 15 |
+
echo "✅ Setup PID: $SETUP_PID"
|
| 16 |
+
echo "📋 Logs: /tmp/setup_output.log"
|
|
|
|
| 17 |
echo ""
|
| 18 |
|
|
|
|
| 19 |
sleep 2
|
| 20 |
|
| 21 |
echo "2️⃣ Iniciando FastAPI..."
|
| 22 |
+
echo "🎯 API online IMEDIATAMENTE"
|
| 23 |
+
echo "🔧 RAG disponível quando setup terminar (~10-15min)"
|
| 24 |
+
echo "📡 Acompanhe: /setup/status"
|
|
|
|
| 25 |
echo ""
|
| 26 |
echo "=================================="
|
|
|
|
|
|
|
| 27 |
|
|
|
|
| 28 |
exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1
|
filter_fields.py
CHANGED
|
@@ -1,44 +1,29 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Filtrar campos de JSONL mantendo apenas os especificados
|
| 4 |
-
"""
|
| 5 |
import json
|
| 6 |
-
import yaml
|
| 7 |
-
from pathlib import Path
|
| 8 |
import argparse
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
def filter_jsonl(input_path: str, output_path: str, keep_fields: list = None):
|
| 12 |
-
"""Filtra campos de arquivo JSONL"""
|
| 13 |
-
|
| 14 |
-
# Carregar campos da config se não especificados
|
| 15 |
-
if keep_fields is None:
|
| 16 |
-
with open('config.yaml') as f:
|
| 17 |
-
config = yaml.safe_load(f)
|
| 18 |
-
keep_fields = config['campos_filter']
|
| 19 |
-
|
| 20 |
-
print(f"📥 Input: {input_path}")
|
| 21 |
-
print(f"📤 Output: {output_path}")
|
| 22 |
-
print(f"🔧 Mantendo campos: {keep_fields}")
|
| 23 |
-
|
| 24 |
-
# Contar linhas
|
| 25 |
-
with open(input_path) as f:
|
| 26 |
-
total = sum(1 for _ in f)
|
| 27 |
-
|
| 28 |
-
# Filtrar
|
| 29 |
-
with open(input_path) as fin, open(output_path, 'w') as fout:
|
| 30 |
-
for line in tqdm(fin, total=total, desc="Filtrando"):
|
| 31 |
-
record = json.loads(line)
|
| 32 |
-
filtered = {k: record[k] for k in keep_fields if k in record}
|
| 33 |
-
fout.write(json.dumps(filtered, ensure_ascii=False) + '\n')
|
| 34 |
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
-
|
| 38 |
parser = argparse.ArgumentParser()
|
| 39 |
parser.add_argument('--input', required=True)
|
| 40 |
parser.add_argument('--output', required=True)
|
| 41 |
-
parser.add_argument('--
|
| 42 |
args = parser.parse_args()
|
| 43 |
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
|
|
| 2 |
import json
|
|
|
|
|
|
|
| 3 |
import argparse
|
| 4 |
+
import yaml
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
+
def filter_jsonl(input_file, output_file, fields_to_keep):
|
| 7 |
+
with open(input_file, 'r', encoding='utf-8') as fin:
|
| 8 |
+
with open(output_file, 'w', encoding='utf-8') as fout:
|
| 9 |
+
for line in fin:
|
| 10 |
+
if line.strip():
|
| 11 |
+
record = json.loads(line)
|
| 12 |
+
filtered = {k: record.get(k) for k in fields_to_keep if k in record}
|
| 13 |
+
fout.write(json.dumps(filtered, ensure_ascii=False) + '\n')
|
| 14 |
|
| 15 |
+
def main():
|
| 16 |
parser = argparse.ArgumentParser()
|
| 17 |
parser.add_argument('--input', required=True)
|
| 18 |
parser.add_argument('--output', required=True)
|
| 19 |
+
parser.add_argument('--config', default='config.yaml')
|
| 20 |
args = parser.parse_args()
|
| 21 |
|
| 22 |
+
with open(args.config) as f:
|
| 23 |
+
config = yaml.safe_load(f)
|
| 24 |
+
|
| 25 |
+
filter_jsonl(args.input, args.output, config['campos_filter'])
|
| 26 |
+
print(f"✅ Filtrado: {args.output}")
|
| 27 |
+
|
| 28 |
+
if __name__ == '__main__':
|
| 29 |
+
main()
|
query_engine.py
CHANGED
|
@@ -1,184 +1,91 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Engine de busca para ChromaDB
|
| 4 |
-
"""
|
| 5 |
import yaml
|
| 6 |
-
import chromadb
|
| 7 |
-
from sentence_transformers import SentenceTransformer
|
| 8 |
-
from typing import List, Dict, Optional
|
| 9 |
import logging
|
|
|
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
logger = logging.getLogger(__name__)
|
| 12 |
|
| 13 |
class QueryEngine:
|
| 14 |
-
|
|
|
|
| 15 |
|
| 16 |
-
def __init__(self, config_path: str = 'config.yaml'):
|
| 17 |
-
# Carregar config
|
| 18 |
with open(config_path) as f:
|
| 19 |
self.config = yaml.safe_load(f)
|
| 20 |
|
| 21 |
-
|
| 22 |
-
logger.info(f"
|
| 23 |
-
self.model = SentenceTransformer(self.config['embedding_model'])
|
| 24 |
-
|
| 25 |
-
# Conectar ao ChromaDB
|
| 26 |
-
logger.info(f"Conectando ao ChromaDB...")
|
| 27 |
-
self.client = chromadb.PersistentClient(path=self.config['chromadb_path'])
|
| 28 |
-
self.collection = self.client.get_collection(self.config['collection_name'])
|
| 29 |
-
|
| 30 |
-
logger.info(f"✅ QueryEngine pronto ({self.collection.count():,} registros)")
|
| 31 |
-
|
| 32 |
-
def search_by_embedding(
|
| 33 |
-
self,
|
| 34 |
-
query: str,
|
| 35 |
-
top_k: int = 10,
|
| 36 |
-
return_embeddings: bool = False
|
| 37 |
-
) -> Dict:
|
| 38 |
-
"""Busca por similaridade semântica"""
|
| 39 |
-
|
| 40 |
-
# Gerar embedding da query
|
| 41 |
-
query_embedding = self.model.encode(query).tolist()
|
| 42 |
-
|
| 43 |
-
# Buscar no ChromaDB
|
| 44 |
-
results = self.collection.query(
|
| 45 |
-
query_embeddings=[query_embedding],
|
| 46 |
-
n_results=top_k,
|
| 47 |
-
include=['documents', 'metadatas', 'distances', 'embeddings'] if return_embeddings
|
| 48 |
-
else ['documents', 'metadatas', 'distances']
|
| 49 |
-
)
|
| 50 |
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
'id': results['ids'][0][i],
|
| 56 |
-
'ementa': results['documents'][0][i],
|
| 57 |
-
'distance': results['distances'][0][i],
|
| 58 |
-
'score': 1.0 - results['distances'][0][i] # Converter distância para score
|
| 59 |
-
}
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
-
|
| 67 |
-
'cluster_id': self.config['cluster_id'],
|
| 68 |
-
'chunk_range': [self.config['chunk_start'], self.config['chunk_end']],
|
| 69 |
-
'results': formatted_results,
|
| 70 |
-
'total_found': len(formatted_results)
|
| 71 |
-
}
|
| 72 |
|
| 73 |
-
def
|
| 74 |
-
self,
|
| 75 |
-
keywords: List[str],
|
| 76 |
-
operator: str = 'AND',
|
| 77 |
-
top_k: int = 20
|
| 78 |
-
) -> Dict:
|
| 79 |
-
"""Busca por termos-chave (full-text search)"""
|
| 80 |
-
|
| 81 |
-
# Construir query string
|
| 82 |
-
if operator.upper() == 'AND':
|
| 83 |
-
query_str = ' '.join(keywords)
|
| 84 |
-
else: # OR
|
| 85 |
-
query_str = '|'.join(keywords)
|
| 86 |
-
|
| 87 |
-
# Buscar usando where_document (full-text search do ChromaDB)
|
| 88 |
-
results = self.collection.query(
|
| 89 |
-
query_texts=[query_str],
|
| 90 |
-
n_results=top_k,
|
| 91 |
-
include=['documents', 'metadatas']
|
| 92 |
-
)
|
| 93 |
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
formatted_results.append({
|
| 102 |
-
'id': results['ids'][0][i],
|
| 103 |
-
'ementa': results['documents'][0][i],
|
| 104 |
-
'matched_keywords': matched
|
| 105 |
})
|
| 106 |
|
| 107 |
return {
|
| 108 |
-
'cluster_id': self.config
|
| 109 |
-
'
|
| 110 |
-
'
|
|
|
|
| 111 |
}
|
| 112 |
|
| 113 |
-
def
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
) -> Dict:
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
'results': [],
|
| 132 |
-
'not_found': ids,
|
| 133 |
-
'total_found': 0
|
| 134 |
-
}
|
| 135 |
-
|
| 136 |
-
# Formatar resposta
|
| 137 |
-
formatted_results = []
|
| 138 |
-
found_ids = set(results['ids'])
|
| 139 |
-
|
| 140 |
-
for i in range(len(results['ids'])):
|
| 141 |
-
result = {
|
| 142 |
-
'id': results['ids'][i],
|
| 143 |
-
'ementa': results['documents'][i]
|
| 144 |
-
}
|
| 145 |
-
|
| 146 |
-
if return_embeddings and 'embeddings' in results:
|
| 147 |
-
result['embedding'] = results['embeddings'][i]
|
| 148 |
-
|
| 149 |
-
formatted_results.append(result)
|
| 150 |
-
|
| 151 |
-
# IDs não encontrados
|
| 152 |
-
not_found = [id for id in ids if id not in found_ids]
|
| 153 |
|
| 154 |
return {
|
| 155 |
-
'cluster_id': self.config
|
| 156 |
-
'
|
| 157 |
-
'
|
| 158 |
-
'total_found': len(formatted_results)
|
| 159 |
}
|
| 160 |
|
| 161 |
def get_cluster_info(self) -> Dict:
|
| 162 |
-
"""Retorna informações do cluster"""
|
| 163 |
-
import os
|
| 164 |
-
|
| 165 |
-
# Calcular tamanho do ChromaDB
|
| 166 |
-
db_path = self.config['chromadb_path']
|
| 167 |
-
total_size = 0
|
| 168 |
-
for dirpath, dirnames, filenames in os.walk(db_path):
|
| 169 |
-
for f in filenames:
|
| 170 |
-
fp = os.path.join(dirpath, f)
|
| 171 |
-
total_size += os.path.getsize(fp)
|
| 172 |
-
|
| 173 |
-
db_size_mb = total_size / (1024 * 1024)
|
| 174 |
-
|
| 175 |
return {
|
| 176 |
-
'cluster_id': self.config
|
| 177 |
-
'chunk_range': [self.config
|
| 178 |
-
'
|
| 179 |
-
'
|
| 180 |
-
'
|
| 181 |
-
'
|
| 182 |
-
'db_size_mb': round(db_size_mb, 2),
|
| 183 |
'status': 'ready'
|
| 184 |
}
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
|
|
| 2 |
import yaml
|
|
|
|
|
|
|
|
|
|
| 3 |
import logging
|
| 4 |
+
from typing import List, Dict
|
| 5 |
|
| 6 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 7 |
+
from langchain_community.vectorstores import FAISS
|
| 8 |
+
|
| 9 |
+
logging.basicConfig(level=logging.INFO)
|
| 10 |
logger = logging.getLogger(__name__)
|
| 11 |
|
| 12 |
class QueryEngine:
|
| 13 |
+
def __init__(self, config_path='config.yaml'):
|
| 14 |
+
logger.info("Inicializando QueryEngine...")
|
| 15 |
|
|
|
|
|
|
|
| 16 |
with open(config_path) as f:
|
| 17 |
self.config = yaml.safe_load(f)
|
| 18 |
|
| 19 |
+
model_name = self.config.get('embedding_model', 'all-MiniLM-L6-v2')
|
| 20 |
+
logger.info(f"Modelo: {model_name}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
+
self.embeddings = HuggingFaceEmbeddings(
|
| 23 |
+
model_name=model_name,
|
| 24 |
+
model_kwargs={'device': 'cpu'}
|
| 25 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
+
faiss_path = self.config.get('faiss_path', '/app/faiss_index')
|
| 28 |
+
logger.info(f"Carregando FAISS de: {faiss_path}")
|
| 29 |
|
| 30 |
+
self.vectorstore = FAISS.load_local(
|
| 31 |
+
faiss_path,
|
| 32 |
+
self.embeddings,
|
| 33 |
+
allow_dangerous_deserialization=True
|
| 34 |
+
)
|
| 35 |
|
| 36 |
+
logger.info("✅ QueryEngine pronto!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
+
def search_by_embedding(self, query: str, top_k: int = 10, return_embeddings: bool = False) -> Dict:
|
| 39 |
+
results = self.vectorstore.similarity_search_with_score(query, k=top_k)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
+
formatted = []
|
| 42 |
+
for doc, score in results:
|
| 43 |
+
formatted.append({
|
| 44 |
+
'id': doc.metadata.get('id'),
|
| 45 |
+
'ementa': doc.page_content,
|
| 46 |
+
'score': float(score),
|
| 47 |
+
'metadata': doc.metadata
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
})
|
| 49 |
|
| 50 |
return {
|
| 51 |
+
'cluster_id': self.config.get('cluster_id'),
|
| 52 |
+
'query': query,
|
| 53 |
+
'total_results': len(formatted),
|
| 54 |
+
'results': formatted
|
| 55 |
}
|
| 56 |
|
| 57 |
+
def search_by_keywords(self, keywords: List[str], operator: str = 'AND', top_k: int = 20) -> Dict:
|
| 58 |
+
query = ' '.join(keywords)
|
| 59 |
+
return self.search_by_embedding(query, top_k)
|
| 60 |
+
|
| 61 |
+
def search_by_ids(self, ids: List[str], return_embeddings: bool = False) -> Dict:
|
| 62 |
+
# FAISS não tem busca direta por ID - implementação simplificada
|
| 63 |
+
all_docs = self.vectorstore.similarity_search("", k=10000)
|
| 64 |
+
|
| 65 |
+
results = []
|
| 66 |
+
for doc in all_docs:
|
| 67 |
+
if doc.metadata.get('id') in ids:
|
| 68 |
+
results.append({
|
| 69 |
+
'id': doc.metadata.get('id'),
|
| 70 |
+
'ementa': doc.page_content,
|
| 71 |
+
'metadata': doc.metadata
|
| 72 |
+
})
|
| 73 |
+
if len(results) >= len(ids):
|
| 74 |
+
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
return {
|
| 77 |
+
'cluster_id': self.config.get('cluster_id'),
|
| 78 |
+
'total_results': len(results),
|
| 79 |
+
'results': results
|
|
|
|
| 80 |
}
|
| 81 |
|
| 82 |
def get_cluster_info(self) -> Dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
return {
|
| 84 |
+
'cluster_id': self.config.get('cluster_id'),
|
| 85 |
+
'chunk_range': [self.config.get('chunk_start'), self.config.get('chunk_end')],
|
| 86 |
+
'embedding_model': self.config.get('embedding_model'),
|
| 87 |
+
'embedding_dim': 384,
|
| 88 |
+
'vector_store': 'FAISS',
|
| 89 |
+
'backend': 'LangChain + CPU',
|
|
|
|
| 90 |
'status': 'ready'
|
| 91 |
}
|
rag_builder.py
CHANGED
|
@@ -1,105 +1,94 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
-
|
|
|
|
| 4 |
"""
|
|
|
|
|
|
|
|
|
|
| 5 |
import json
|
| 6 |
-
import yaml
|
| 7 |
-
from pathlib import Path
|
| 8 |
import argparse
|
| 9 |
-
import
|
| 10 |
-
from
|
| 11 |
-
from tqdm import tqdm
|
| 12 |
import logging
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
logging.basicConfig(level=logging.INFO)
|
| 15 |
logger = logging.getLogger(__name__)
|
| 16 |
|
| 17 |
-
def
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
-
|
| 21 |
-
with open(config_path) as f:
|
| 22 |
-
config = yaml.safe_load(f)
|
| 23 |
|
|
|
|
| 24 |
logger.info("="*80)
|
| 25 |
-
logger.info("
|
| 26 |
logger.info("="*80)
|
| 27 |
-
logger.info(f"Cluster ID: {config['cluster_id']}")
|
| 28 |
-
logger.info(f"Chunks: {config['chunk_start']} - {config['chunk_end']}")
|
| 29 |
-
logger.info(f"Embedding Model: {config['embedding_model']}")
|
| 30 |
-
|
| 31 |
-
# Carregar modelo de embedding
|
| 32 |
-
logger.info("\n📥 Carregando modelo de embedding...")
|
| 33 |
-
model = SentenceTransformer(config['embedding_model'])
|
| 34 |
-
logger.info(f"✅ Modelo carregado (dim={config['embedding_dim']})")
|
| 35 |
-
|
| 36 |
-
# Inicializar ChromaDB
|
| 37 |
-
logger.info(f"\n💾 Inicializando ChromaDB em {config['chromadb_path']}...")
|
| 38 |
-
client = chromadb.PersistentClient(path=config['chromadb_path'])
|
| 39 |
-
|
| 40 |
-
# Criar/obter collection
|
| 41 |
-
try:
|
| 42 |
-
collection = client.get_collection(config['collection_name'])
|
| 43 |
-
logger.info(f"⚠️ Collection '{config['collection_name']}' já existe! Apagando...")
|
| 44 |
-
client.delete_collection(config['collection_name'])
|
| 45 |
-
except:
|
| 46 |
-
pass
|
| 47 |
-
|
| 48 |
-
collection = client.create_collection(
|
| 49 |
-
name=config['collection_name'],
|
| 50 |
-
metadata={
|
| 51 |
-
"cluster_id": config['cluster_id'],
|
| 52 |
-
"chunk_start": config['chunk_start'],
|
| 53 |
-
"chunk_end": config['chunk_end']
|
| 54 |
-
}
|
| 55 |
-
)
|
| 56 |
-
logger.info(f"✅ Collection criada")
|
| 57 |
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
records
|
| 61 |
-
with open(input_jsonl) as f:
|
| 62 |
-
for line in f:
|
| 63 |
-
records.append(json.loads(line))
|
| 64 |
-
|
| 65 |
-
total = len(records)
|
| 66 |
-
logger.info(f"✅ {total:,} registros carregados")
|
| 67 |
-
|
| 68 |
-
# Processar em batches
|
| 69 |
-
batch_size = config['embedding_batch_size']
|
| 70 |
-
logger.info(f"\n🚀 Gerando embeddings em batches de {batch_size}...")
|
| 71 |
|
| 72 |
-
|
| 73 |
-
|
|
|
|
| 74 |
|
| 75 |
-
|
| 76 |
-
|
|
|
|
| 77 |
|
| 78 |
-
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
-
|
| 82 |
-
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
|
|
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
ids=ids,
|
| 90 |
-
embeddings=embeddings,
|
| 91 |
-
documents=documents,
|
| 92 |
-
metadatas=metadatas
|
| 93 |
-
)
|
| 94 |
|
| 95 |
-
|
| 96 |
-
logger.info(f"📊 Total de registros: {collection.count():,}")
|
| 97 |
-
logger.info("="*80)
|
| 98 |
|
| 99 |
-
|
| 100 |
parser = argparse.ArgumentParser()
|
| 101 |
-
parser.add_argument('--input', required=True
|
| 102 |
-
parser.add_argument('--
|
|
|
|
|
|
|
| 103 |
args = parser.parse_args()
|
| 104 |
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
+
RAG Builder usando LangChain + HuggingFaceEmbeddings (CPU)
|
| 4 |
+
Constrói FAISS vector store a partir de JSONL filtrado
|
| 5 |
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
import json
|
|
|
|
|
|
|
| 10 |
import argparse
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import List, Dict
|
|
|
|
| 13 |
import logging
|
| 14 |
|
| 15 |
+
from langchain.docstore.document import Document
|
| 16 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 17 |
+
from langchain_community.vectorstores import FAISS
|
| 18 |
+
|
| 19 |
logging.basicConfig(level=logging.INFO)
|
| 20 |
logger = logging.getLogger(__name__)
|
| 21 |
|
| 22 |
+
def load_jsonl(filepath: str) -> List[Dict]:
|
| 23 |
+
records = []
|
| 24 |
+
with open(filepath, 'r', encoding='utf-8') as f:
|
| 25 |
+
for line in f:
|
| 26 |
+
if line.strip():
|
| 27 |
+
records.append(json.loads(line))
|
| 28 |
+
return records
|
| 29 |
+
|
| 30 |
+
def create_documents(records: List[Dict]) -> List[Document]:
|
| 31 |
+
documents = []
|
| 32 |
+
for record in records:
|
| 33 |
+
doc_id = record.get('id', 'unknown')
|
| 34 |
+
ementa = record.get('ementa', '')
|
| 35 |
+
|
| 36 |
+
if not ementa:
|
| 37 |
+
continue
|
| 38 |
+
|
| 39 |
+
doc = Document(
|
| 40 |
+
page_content=ementa,
|
| 41 |
+
metadata={'id': doc_id, 'source': 'tjpr'}
|
| 42 |
+
)
|
| 43 |
+
documents.append(doc)
|
| 44 |
|
| 45 |
+
return documents
|
|
|
|
|
|
|
| 46 |
|
| 47 |
+
def build_vectorstore(input_file, output_dir='/app/faiss_index', model_name='all-MiniLM-L6-v2', batch_size=64):
|
| 48 |
logger.info("="*80)
|
| 49 |
+
logger.info("🚀 RAG Builder - LangChain + FAISS (CPU)")
|
| 50 |
logger.info("="*80)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
+
logger.info(f"\n📂 Carregando {input_file}...")
|
| 53 |
+
records = load_jsonl(input_file)
|
| 54 |
+
logger.info(f"✅ {len(records):,} registros")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
+
logger.info("\n📄 Criando Documents...")
|
| 57 |
+
documents = create_documents(records)
|
| 58 |
+
logger.info(f"✅ {len(documents):,} documentos")
|
| 59 |
|
| 60 |
+
if not documents:
|
| 61 |
+
logger.error("❌ Nenhum documento válido!")
|
| 62 |
+
sys.exit(1)
|
| 63 |
|
| 64 |
+
logger.info(f"\n🤖 Criando embeddings com {model_name} (CPU)...")
|
| 65 |
+
embeddings = HuggingFaceEmbeddings(
|
| 66 |
+
model_name=model_name,
|
| 67 |
+
model_kwargs={'device': 'cpu'},
|
| 68 |
+
encode_kwargs={'batch_size': batch_size, 'show_progress_bar': True}
|
| 69 |
+
)
|
| 70 |
|
| 71 |
+
logger.info("\n🔍 Construindo FAISS index...")
|
| 72 |
+
vectorstore = FAISS.from_documents(documents, embeddings)
|
| 73 |
|
| 74 |
+
logger.info(f"\n💾 Salvando em {output_dir}...")
|
| 75 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 76 |
+
vectorstore.save_local(output_dir)
|
| 77 |
|
| 78 |
+
logger.info("\n✅ FAISS INDEX CRIADO!")
|
| 79 |
+
logger.info(f"📊 {len(documents):,} documentos indexados")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
+
return vectorstore
|
|
|
|
|
|
|
| 82 |
|
| 83 |
+
def main():
|
| 84 |
parser = argparse.ArgumentParser()
|
| 85 |
+
parser.add_argument('--input', required=True)
|
| 86 |
+
parser.add_argument('--output', default='/app/faiss_index')
|
| 87 |
+
parser.add_argument('--model', default='all-MiniLM-L6-v2')
|
| 88 |
+
parser.add_argument('--batch-size', type=int, default=64)
|
| 89 |
args = parser.parse_args()
|
| 90 |
|
| 91 |
+
build_vectorstore(args.input, args.output, args.model, args.batch_size)
|
| 92 |
+
|
| 93 |
+
if __name__ == '__main__':
|
| 94 |
+
main()
|
requirements.txt
CHANGED
|
@@ -3,12 +3,13 @@ fastapi==0.109.0
|
|
| 3 |
uvicorn[standard]==0.27.0
|
| 4 |
pydantic==2.5.0
|
| 5 |
|
| 6 |
-
#
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
transformers==4.37.2
|
| 10 |
sentence-transformers==2.5.1
|
| 11 |
-
|
|
|
|
|
|
|
| 12 |
|
| 13 |
# Utilities
|
| 14 |
PyYAML==6.0.1
|
|
|
|
| 3 |
uvicorn[standard]==0.27.0
|
| 4 |
pydantic==2.5.0
|
| 5 |
|
| 6 |
+
# LangChain + Embeddings (CPU-only)
|
| 7 |
+
langchain==0.1.11
|
| 8 |
+
langchain-community==0.0.24
|
|
|
|
| 9 |
sentence-transformers==2.5.1
|
| 10 |
+
|
| 11 |
+
# Vector Store
|
| 12 |
+
faiss-cpu==1.8.0
|
| 13 |
|
| 14 |
# Utilities
|
| 15 |
PyYAML==6.0.1
|
setup.py
CHANGED
|
@@ -1,8 +1,4 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Setup em background - Clona dados, constrói ChromaDB
|
| 4 |
-
Executa enquanto FastAPI já está respondendo (evita timeout HF)
|
| 5 |
-
"""
|
| 6 |
import os
|
| 7 |
import sys
|
| 8 |
import yaml
|
|
@@ -12,64 +8,34 @@ import logging
|
|
| 12 |
from pathlib import Path
|
| 13 |
from datetime import datetime
|
| 14 |
|
| 15 |
-
|
| 16 |
-
logging.basicConfig(
|
| 17 |
-
level=logging.INFO,
|
| 18 |
-
format='%(asctime)s - %(levelname)s - %(message)s',
|
| 19 |
-
handlers=[
|
| 20 |
-
logging.StreamHandler(sys.stdout),
|
| 21 |
-
logging.FileHandler('/tmp/setup.log')
|
| 22 |
-
]
|
| 23 |
-
)
|
| 24 |
logger = logging.getLogger(__name__)
|
| 25 |
|
| 26 |
-
# Forçar flush imediato
|
| 27 |
-
for handler in logger.handlers:
|
| 28 |
-
handler.flush = lambda: None
|
| 29 |
-
|
| 30 |
STATUS_FILE = Path('/tmp/setup_status.json')
|
| 31 |
-
READY_FLAG = Path('/tmp/
|
| 32 |
-
|
| 33 |
-
def update_status(status
|
| 34 |
-
|
| 35 |
-
data = {
|
| 36 |
-
'status': status,
|
| 37 |
-
'message': message,
|
| 38 |
-
'progress': progress,
|
| 39 |
-
'timestamp': datetime.now().isoformat()
|
| 40 |
-
}
|
| 41 |
with open(STATUS_FILE, 'w') as f:
|
| 42 |
json.dump(data, f)
|
| 43 |
logger.info(f"[{progress}%] {status}: {message}")
|
| 44 |
sys.stdout.flush()
|
| 45 |
|
| 46 |
-
def
|
| 47 |
-
"
|
| 48 |
-
|
| 49 |
-
logger.info(f"Comando: {cmd}")
|
| 50 |
-
|
| 51 |
-
result = subprocess.run(
|
| 52 |
-
cmd,
|
| 53 |
-
shell=True,
|
| 54 |
-
capture_output=True,
|
| 55 |
-
text=True
|
| 56 |
-
)
|
| 57 |
-
|
| 58 |
if result.returncode != 0:
|
| 59 |
logger.error(f"ERRO: {result.stderr}")
|
| 60 |
-
raise Exception(f"{
|
| 61 |
-
|
| 62 |
-
logger.info(f"✅ {description} completo")
|
| 63 |
return result.stdout
|
| 64 |
|
| 65 |
def main():
|
| 66 |
-
"""Setup completo em background"""
|
| 67 |
try:
|
| 68 |
logger.info("="*80)
|
| 69 |
-
logger.info("🚀 PARA.AI RAG
|
| 70 |
logger.info("="*80)
|
| 71 |
|
| 72 |
-
# Carregar configuração
|
| 73 |
update_status('loading', 'Carregando configuração', 0)
|
| 74 |
with open('config.yaml') as f:
|
| 75 |
config = yaml.safe_load(f)
|
|
@@ -79,120 +45,62 @@ def main():
|
|
| 79 |
chunk_end = config['chunk_end']
|
| 80 |
github_repo = config['github_repo']
|
| 81 |
|
| 82 |
-
logger.info(f"Cluster: {cluster_id}")
|
| 83 |
-
logger.info(f"Chunks: {chunk_start} - {chunk_end}")
|
| 84 |
-
logger.info("")
|
| 85 |
-
|
| 86 |
-
# Verificar se ChromaDB já existe
|
| 87 |
if READY_FLAG.exists():
|
| 88 |
-
logger.info("✅
|
| 89 |
-
update_status('ready', '
|
| 90 |
return
|
| 91 |
|
| 92 |
-
#
|
| 93 |
-
update_status('cloning', 'Clonando chunks
|
| 94 |
-
|
| 95 |
os.makedirs('/tmp/repo', exist_ok=True)
|
| 96 |
os.chdir('/tmp/repo')
|
| 97 |
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
)
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
)
|
| 108 |
-
|
| 109 |
-
# Gerar pattern de chunks
|
| 110 |
-
logger.info(f"Gerando pattern para chunks {chunk_start}-{chunk_end}...")
|
| 111 |
-
pattern_parts = []
|
| 112 |
-
for i in range(chunk_start, chunk_end + 1):
|
| 113 |
-
pattern_parts.append(f"chunks_dados/chunk_dados_{i:04d}.tar.gz")
|
| 114 |
-
|
| 115 |
-
# Set sparse checkout (em batches para evitar arg list too long)
|
| 116 |
-
batch_size = 50
|
| 117 |
-
for i in range(0, len(pattern_parts), batch_size):
|
| 118 |
-
batch = pattern_parts[i:i+batch_size]
|
| 119 |
-
pattern = ' '.join(batch)
|
| 120 |
-
run_command(
|
| 121 |
-
f"git sparse-checkout add {pattern}",
|
| 122 |
-
f"Sparse checkout batch {i//batch_size + 1}"
|
| 123 |
-
)
|
| 124 |
-
|
| 125 |
-
# Contar chunks clonados
|
| 126 |
-
result = run_command(
|
| 127 |
-
"find chunks_dados -name '*.tar.gz' 2>/dev/null | wc -l",
|
| 128 |
-
"Contar chunks"
|
| 129 |
-
)
|
| 130 |
-
chunks_count = int(result.strip())
|
| 131 |
logger.info(f"✅ {chunks_count} chunks clonados")
|
| 132 |
|
| 133 |
-
#
|
| 134 |
update_status('extracting', f'Descompactando {chunks_count} chunks', 30)
|
| 135 |
-
|
| 136 |
os.makedirs('/tmp/extracted', exist_ok=True)
|
|
|
|
| 137 |
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
)
|
| 142 |
-
|
| 143 |
-
# ETAPA 3: Concatenar JSONL
|
| 144 |
-
update_status('concatenating', 'Concatenando jurisprudencias.jsonl', 50)
|
| 145 |
-
|
| 146 |
-
run_command(
|
| 147 |
-
"find /tmp/extracted -name 'jurisprudencias.jsonl' -exec cat {} \; > /tmp/all_records.jsonl 2>/dev/null || true",
|
| 148 |
-
"Concatenar JSONL"
|
| 149 |
-
)
|
| 150 |
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
"wc -l < /tmp/all_records.jsonl 2>/dev/null || echo '0'",
|
| 154 |
-
"Contar registros"
|
| 155 |
-
)
|
| 156 |
-
total_records = int(result.strip())
|
| 157 |
-
logger.info(f"✅ {total_records:,} registros concatenados")
|
| 158 |
|
| 159 |
-
#
|
| 160 |
update_status('filtering', 'Filtrando campos (id + ementa)', 60)
|
| 161 |
-
|
| 162 |
os.chdir('/home/user/app')
|
| 163 |
-
|
| 164 |
-
"python3 filter_fields.py --input /tmp/all_records.jsonl --output /tmp/filtered.jsonl",
|
| 165 |
-
"Filtrar campos"
|
| 166 |
-
)
|
| 167 |
-
|
| 168 |
-
# ETAPA 5: Build ChromaDB
|
| 169 |
-
update_status('building', 'Construindo ChromaDB com embeddings (pode demorar)', 70)
|
| 170 |
-
|
| 171 |
-
run_command(
|
| 172 |
-
"python3 rag_builder.py --input /tmp/filtered.jsonl",
|
| 173 |
-
"Build ChromaDB"
|
| 174 |
-
)
|
| 175 |
|
| 176 |
-
#
|
| 177 |
-
update_status('
|
|
|
|
| 178 |
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
)
|
| 183 |
|
| 184 |
-
#
|
| 185 |
-
update_status('ready', f'
|
| 186 |
READY_FLAG.touch()
|
| 187 |
|
| 188 |
logger.info("="*80)
|
| 189 |
-
logger.info("✅ SETUP COMPLETO
|
| 190 |
logger.info("="*80)
|
| 191 |
|
| 192 |
except Exception as e:
|
| 193 |
-
logger.error("
|
| 194 |
-
logger.error(f"❌ ERRO NO SETUP: {e}")
|
| 195 |
-
logger.error("="*80)
|
| 196 |
update_status('error', str(e), 0)
|
| 197 |
sys.exit(1)
|
| 198 |
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import os
|
| 3 |
import sys
|
| 4 |
import yaml
|
|
|
|
| 8 |
from pathlib import Path
|
| 9 |
from datetime import datetime
|
| 10 |
|
| 11 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
logger = logging.getLogger(__name__)
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
STATUS_FILE = Path('/tmp/setup_status.json')
|
| 15 |
+
READY_FLAG = Path('/tmp/faiss_ready')
|
| 16 |
+
|
| 17 |
+
def update_status(status, message, progress=0):
|
| 18 |
+
data = {'status': status, 'message': message, 'progress': progress, 'timestamp': datetime.now().isoformat()}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
with open(STATUS_FILE, 'w') as f:
|
| 20 |
json.dump(data, f)
|
| 21 |
logger.info(f"[{progress}%] {status}: {message}")
|
| 22 |
sys.stdout.flush()
|
| 23 |
|
| 24 |
+
def run_cmd(cmd, desc):
|
| 25 |
+
logger.info(f"Executando: {desc}")
|
| 26 |
+
result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
if result.returncode != 0:
|
| 28 |
logger.error(f"ERRO: {result.stderr}")
|
| 29 |
+
raise Exception(f"{desc} falhou")
|
| 30 |
+
logger.info(f"✅ {desc}")
|
|
|
|
| 31 |
return result.stdout
|
| 32 |
|
| 33 |
def main():
|
|
|
|
| 34 |
try:
|
| 35 |
logger.info("="*80)
|
| 36 |
+
logger.info("🚀 PARA.AI RAG (LangChain) - SETUP EM BACKGROUND")
|
| 37 |
logger.info("="*80)
|
| 38 |
|
|
|
|
| 39 |
update_status('loading', 'Carregando configuração', 0)
|
| 40 |
with open('config.yaml') as f:
|
| 41 |
config = yaml.safe_load(f)
|
|
|
|
| 45 |
chunk_end = config['chunk_end']
|
| 46 |
github_repo = config['github_repo']
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
if READY_FLAG.exists():
|
| 49 |
+
logger.info("✅ FAISS já pronto!")
|
| 50 |
+
update_status('ready', 'FAISS já existe', 100)
|
| 51 |
return
|
| 52 |
|
| 53 |
+
# CLONE
|
| 54 |
+
update_status('cloning', 'Clonando chunks (sparse checkout)', 10)
|
|
|
|
| 55 |
os.makedirs('/tmp/repo', exist_ok=True)
|
| 56 |
os.chdir('/tmp/repo')
|
| 57 |
|
| 58 |
+
run_cmd(f"git clone --filter=blob:none --sparse {github_repo} .", "Git clone")
|
| 59 |
+
run_cmd("git sparse-checkout init --cone", "Sparse checkout init")
|
| 60 |
+
|
| 61 |
+
patterns = [f"chunks_dados/chunk_dados_{i:04d}.tar.gz" for i in range(chunk_start, chunk_end + 1)]
|
| 62 |
+
for i in range(0, len(patterns), 50):
|
| 63 |
+
batch = ' '.join(patterns[i:i+50])
|
| 64 |
+
run_cmd(f"git sparse-checkout add {batch}", f"Batch {i//50 + 1}")
|
| 65 |
+
|
| 66 |
+
chunks_count = int(run_cmd("find chunks_dados -name '*.tar.gz' 2>/dev/null | wc -l", "Contar chunks").strip())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
logger.info(f"✅ {chunks_count} chunks clonados")
|
| 68 |
|
| 69 |
+
# EXTRACT
|
| 70 |
update_status('extracting', f'Descompactando {chunks_count} chunks', 30)
|
|
|
|
| 71 |
os.makedirs('/tmp/extracted', exist_ok=True)
|
| 72 |
+
run_cmd("find chunks_dados -name '*.tar.gz' -exec tar -xzf {} -C /tmp/extracted \; 2>/dev/null || true", "Descompactar")
|
| 73 |
|
| 74 |
+
# CONCAT
|
| 75 |
+
update_status('concatenating', 'Concatenando JSONL', 50)
|
| 76 |
+
run_cmd("find /tmp/extracted -name 'jurisprudencias.jsonl' -exec cat {} \; > /tmp/all_records.jsonl 2>/dev/null || true", "Concatenar")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
+
total_records = int(run_cmd("wc -l < /tmp/all_records.jsonl 2>/dev/null || echo '0'", "Contar registros").strip())
|
| 79 |
+
logger.info(f"✅ {total_records:,} registros")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
+
# FILTER
|
| 82 |
update_status('filtering', 'Filtrando campos (id + ementa)', 60)
|
|
|
|
| 83 |
os.chdir('/home/user/app')
|
| 84 |
+
run_cmd("python3 filter_fields.py --input /tmp/all_records.jsonl --output /tmp/filtered.jsonl", "Filtrar")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
+
# BUILD FAISS
|
| 87 |
+
update_status('building', 'Construindo FAISS index (pode demorar)', 70)
|
| 88 |
+
run_cmd("python3 rag_builder.py --input /tmp/filtered.jsonl", "Build FAISS")
|
| 89 |
|
| 90 |
+
# CLEANUP
|
| 91 |
+
update_status('cleaning', 'Limpando temporários', 95)
|
| 92 |
+
run_cmd("rm -rf /tmp/repo /tmp/extracted /tmp/all_records.jsonl /tmp/filtered.jsonl", "Limpar")
|
|
|
|
| 93 |
|
| 94 |
+
# DONE
|
| 95 |
+
update_status('ready', f'FAISS pronto com {total_records:,} registros!', 100)
|
| 96 |
READY_FLAG.touch()
|
| 97 |
|
| 98 |
logger.info("="*80)
|
| 99 |
+
logger.info("✅ SETUP COMPLETO!")
|
| 100 |
logger.info("="*80)
|
| 101 |
|
| 102 |
except Exception as e:
|
| 103 |
+
logger.error(f"❌ ERRO: {e}")
|
|
|
|
|
|
|
| 104 |
update_status('error', str(e), 0)
|
| 105 |
sys.exit(1)
|
| 106 |
|