Upload 8 files

8 files changed:
- Dockerfile +1 -10
- README.md +5 -14
- app.py +12 -6
- entrypoint.sh +7 -13
- filter_fields.py +1 -5
- query_engine.py +9 -64
- rag_builder.py +15 -44
- requirements.txt +18 -18
Dockerfile
CHANGED
@@ -1,32 +1,23 @@
 FROM python:3.11-slim
 
-
-RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && apt-get install -y git curl && rm -rf /var/lib/apt/lists/*
 
-# Criar usuário não-root
 RUN useradd -m -u 1000 user
 USER user
 
-# Configurar environment
 ENV HOME=/home/user \
     PATH=/home/user/.local/bin:$PATH \
     PYTHONUNBUFFERED=1
 
 WORKDIR $HOME/app
 
-# Copiar requirements e instalar dependências
 COPY --chown=user requirements.txt .
 RUN pip install --no-cache-dir --upgrade pip && \
     pip install --no-cache-dir -r requirements.txt
 
-# Copiar código da aplicação
 COPY --chown=user . .
-
-# Tornar entrypoint executável
 RUN chmod +x entrypoint.sh
 
-# Expor porta
 EXPOSE 7860
 
-# Comando de inicialização
 CMD ["./entrypoint.sh"]
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Para.AI RAG Cluster
+title: Para.AI RAG Cluster DEBUG
 emoji: ⚖️
 colorFrom: blue
 colorTo: purple
@@ -7,21 +7,12 @@ sdk: docker
 pinned: false
 ---
 
-# ⚖️ Para.AI RAG
+# ⚖️ Para.AI RAG (DEBUG VERSION)
 
-
+Versão com logs de depuração intensivos.
 
-##
+## Endpoints adicionais
 
-
-2. `git init && git add . && git commit -m "Initial"`
-3. `git push origin main`
-
-## 📡 Endpoints
-
-- `GET /` - Status
-- `GET /setup/status` - Progresso do setup
-- `POST /search/embedding` - Busca semântica
-- `GET /cluster/info` - Info do cluster
+- `GET /setup/logs` - Ver logs completos do setup
 
 ⚖️ **InJustiça não para o Paraná!** 🐝
app.py
CHANGED
@@ -31,14 +31,14 @@ def get_query_engine():
     global query_engine
     if query_engine is None:
         if not is_ready():
-            raise HTTPException(status_code=503, detail="RAG em construção.
+            raise HTTPException(status_code=503, detail="RAG em construção. Aguarde setup terminar.")
         logger.info("Carregando QueryEngine...")
         from query_engine import QueryEngine
         query_engine = QueryEngine()
         logger.info("✅ QueryEngine carregado!")
     return query_engine
 
-app = FastAPI(title="Para.AI RAG Cluster
+app = FastAPI(title="Para.AI RAG Cluster", version="1.0.0")
 
 class EmbeddingSearchRequest(BaseModel):
     query: str
@@ -58,19 +58,25 @@ class IDSearchRequest(BaseModel):
 async def root():
     setup_status = get_setup_status()
     ready = is_ready()
-
     response = {"status": "online", "rag_ready": ready, "setup": setup_status, "backend": "LangChain + FAISS (CPU)"}
-
     if ready and query_engine:
         response["cluster_id"] = query_engine.config.get('cluster_id')
         response["chunk_range"] = [query_engine.config.get('chunk_start'), query_engine.config.get('chunk_end')]
-
     return response
 
 @app.get("/setup/status")
 async def setup_status():
     return get_setup_status()
 
+@app.get("/setup/logs")
+async def setup_logs():
+    try:
+        with open('/tmp/setup_debug.log', 'r') as f:
+            logs = f.read()
+        return {"logs": logs, "size": len(logs)}
+    except:
+        return {"logs": "Log file not available", "size": 0}
+
 @app.get("/health")
 async def health():
     return {"status": "ok", "timestamp": time.time()}
@@ -123,7 +129,7 @@ async def cluster_info():
 async def startup_event():
     app.state.start_time = time.time()
     logger.info("="*80)
-    logger.info("🚀 Para.AI RAG
+    logger.info("🚀 Para.AI RAG ONLINE")
     logger.info("="*80)
 
 if __name__ == "__main__":
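For quick manual checks, the new debug endpoints can be exercised with a short script. This is a minimal sketch, assuming the Space is reachable at a placeholder URL and that `requests` is installed locally (it is not part of requirements.txt):

```python
import requests  # assumed to be installed; not pinned in requirements.txt

BASE = "https://your-space.hf.space"  # placeholder Space URL

# Setup progress as reported by /setup/status
print(requests.get(f"{BASE}/setup/status", timeout=30).json())

# Full debug log added in this change; the route reads /tmp/setup_debug.log server-side
payload = requests.get(f"{BASE}/setup/logs", timeout=30).json()
print(f"{payload['size']} bytes of setup logs")
print(payload["logs"][-2000:])  # print only the tail
```

Polling `/setup/logs` while setup.py runs in the background is the debugging loop this change is meant to enable.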
entrypoint.sh
CHANGED
@@ -1,22 +1,16 @@
 #!/bin/bash
 set -e
-
-echo "=================================="
+echo "=========================================="
 echo "🚀 Para.AI RAG Startup"
-echo "
-
+echo "=========================================="
 cd /home/user/app
-
 echo "1️⃣ Iniciando setup em background..."
 python3 -u setup.py > /tmp/setup_output.log 2>&1 &
-echo "✅ Setup
-echo ""
-
+echo "✅ Setup PID: $!"
 sleep 2
-
 echo "2️⃣ Iniciando FastAPI..."
-echo "🎯 API online
-echo "
-echo "
-
+echo "🎯 API online"
+echo "📊 Status: /setup/status"
+echo "📋 Logs: /setup/logs"
+echo "=========================================="
 exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1
filter_fields.py
CHANGED
@@ -1,7 +1,5 @@
 #!/usr/bin/env python3
-import json
-import argparse
-import yaml
+import json, argparse, yaml
 
 def filter_jsonl(input_file, output_file, fields_to_keep):
     with open(input_file, 'r', encoding='utf-8') as fin:
@@ -18,10 +16,8 @@ def main():
     parser.add_argument('--output', required=True)
     parser.add_argument('--config', default='config.yaml')
     args = parser.parse_args()
-
     with open(args.config) as f:
         config = yaml.safe_load(f)
-
     filter_jsonl(args.input, args.output, config['campos_filter'])
     print(f"✅ Filtrado: {args.output}")
 
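For reference, the same filtering step can be driven directly from Python rather than through the CLI. A minimal sketch, with hypothetical paths and a hypothetical field list (the real list comes from `campos_filter` in config.yaml):

```python
# Direct call into filter_fields.py; paths and field list are hypothetical.
# main() normally takes the field list from 'campos_filter' in config.yaml.
from filter_fields import filter_jsonl

filter_jsonl(
    input_file="data/acordaos_raw.jsonl",
    output_file="data/acordaos_filtrado.jsonl",
    fields_to_keep=["id", "ementa"],
)
```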
query_engine.py
CHANGED
@@ -1,8 +1,6 @@
 #!/usr/bin/env python3
-import yaml
-import logging
+import yaml, logging
 from typing import List, Dict
-
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 
@@ -12,79 +10,26 @@ logger = logging.getLogger(__name__)
 class QueryEngine:
     def __init__(self, config_path='config.yaml'):
         logger.info("Inicializando QueryEngine...")
-
         with open(config_path) as f:
             self.config = yaml.safe_load(f)
-
         model_name = self.config.get('embedding_model', 'sentence-transformers/all-MiniLM-L6-v2')
-
-
-        self.embeddings = HuggingFaceEmbeddings(
-            model_name=model_name,
-            model_kwargs={'device': 'cpu'}
-        )
-
+        self.embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={'device': 'cpu'})
         faiss_path = self.config.get('faiss_path', '/app/faiss_index')
-
-
-        self.vectorstore = FAISS.load_local(
-            faiss_path,
-            self.embeddings,
-            allow_dangerous_deserialization=True
-        )
-
+        self.vectorstore = FAISS.load_local(faiss_path, self.embeddings, allow_dangerous_deserialization=True)
         logger.info("✅ QueryEngine pronto!")
 
     def search_by_embedding(self, query: str, top_k: int = 10, return_embeddings: bool = False) -> Dict:
         results = self.vectorstore.similarity_search_with_score(query, k=top_k)
-
-        formatted = []
-        for doc, score in results:
-            formatted.append({
-                'id': doc.metadata.get('id'),
-                'ementa': doc.page_content,
-                'score': float(score),
-                'metadata': doc.metadata
-            })
-
-        return {
-            'cluster_id': self.config.get('cluster_id'),
-            'query': query,
-            'total_results': len(formatted),
-            'results': formatted
-        }
+        formatted = [{'id': doc.metadata.get('id'), 'ementa': doc.page_content, 'score': float(score), 'metadata': doc.metadata} for doc, score in results]
+        return {'cluster_id': self.config.get('cluster_id'), 'query': query, 'total_results': len(formatted), 'results': formatted}
 
     def search_by_keywords(self, keywords: List[str], operator: str = 'AND', top_k: int = 20) -> Dict:
-
-        return self.search_by_embedding(query, top_k)
+        return self.search_by_embedding(' '.join(keywords), top_k)
 
     def search_by_ids(self, ids: List[str], return_embeddings: bool = False) -> Dict:
         all_docs = self.vectorstore.similarity_search("", k=10000)
-
-        results = []
-        for doc in all_docs:
-            if doc.metadata.get('id') in ids:
-                results.append({
-                    'id': doc.metadata.get('id'),
-                    'ementa': doc.page_content,
-                    'metadata': doc.metadata
-                })
-                if len(results) >= len(ids):
-                    break
-
-        return {
-            'cluster_id': self.config.get('cluster_id'),
-            'total_results': len(results),
-            'results': results
-        }
+        results = [{'id': doc.metadata.get('id'), 'ementa': doc.page_content, 'metadata': doc.metadata} for doc in all_docs if doc.metadata.get('id') in ids][:len(ids)]
+        return {'cluster_id': self.config.get('cluster_id'), 'total_results': len(results), 'results': results}
 
     def get_cluster_info(self) -> Dict:
-        return {
-            'cluster_id': self.config.get('cluster_id'),
-            'chunk_range': [self.config.get('chunk_start'), self.config.get('chunk_end')],
-            'embedding_model': self.config.get('embedding_model'),
-            'embedding_dim': 384,
-            'vector_store': 'FAISS',
-            'backend': 'LangChain + CPU',
-            'status': 'ready'
-        }
+        return {'cluster_id': self.config.get('cluster_id'), 'chunk_range': [self.config.get('chunk_start'), self.config.get('chunk_end')], 'embedding_model': self.config.get('embedding_model'), 'embedding_dim': 384, 'vector_store': 'FAISS', 'backend': 'LangChain + CPU', 'status': 'ready'}
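To sanity-check the slimmed-down engine outside the API, a minimal local sketch could look like the following; it assumes config.yaml and the FAISS index already exist at the configured paths, and the query string is made up:

```python
# Assumes config.yaml and the FAISS index built by rag_builder.py are in place.
from query_engine import QueryEngine

engine = QueryEngine(config_path="config.yaml")
out = engine.search_by_embedding("responsabilidade civil do município", top_k=3)
for hit in out["results"]:
    print(hit["id"], round(hit["score"], 3), hit["ementa"][:80])
print(engine.get_cluster_info())
```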
rag_builder.py
CHANGED
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-import os, sys, json, argparse, logging, traceback
+import os, sys, json, argparse, logging, traceback, time
 from pathlib import Path
 from typing import List, Dict
 from langchain.docstore.document import Document
@@ -11,31 +11,22 @@ logger = logging.getLogger(__name__)
 
 def load_jsonl(filepath: str) -> List[Dict]:
     records = []
-
-
-
-
-
-
-
-
-
-            logger.info(f" {i:,} linhas...")
-    logger.info(f"✅ {len(records):,} registros")
-    return records
-    except Exception as e:
-        logger.error(f"❌ Erro: {e}")
-        raise
+    logger.info(f"📂 Carregando: {filepath}")
+    with open(filepath, 'r', encoding='utf-8') as f:
+        for i, line in enumerate(f, 1):
+            if line.strip():
+                records.append(json.loads(line))
+            if i % 50000 == 0:
+                logger.info(f" {i:,} linhas...")
+    logger.info(f"✅ {len(records):,} registros")
+    return records
 
 def create_documents(records: List[Dict]) -> List[Document]:
     documents = []
     for i, record in enumerate(records, 1):
         ementa = record.get('ementa', '')
         if ementa:
-            documents.append(Document(
-                page_content=ementa,
-                metadata={'id': str(record.get('id', f'u{i}')), 'source': 'tjpr'}
-            ))
+            documents.append(Document(page_content=ementa, metadata={'id': str(record.get('id', f'u{i}')), 'source': 'tjpr'}))
         if i % 50000 == 0:
             logger.info(f" {i:,}/{len(records):,}...")
     logger.info(f"✅ {len(documents):,} documentos")
@@ -43,48 +34,28 @@ def create_documents(records: List[Dict]) -> List[Document]:
 
 def build_vectorstore(input_file, output_dir='/app/faiss_index', model_name='sentence-transformers/all-MiniLM-L6-v2', batch_size=16):
     try:
-        import time
         logger.info("="*80)
-        logger.info("🚀 RAG Builder
+        logger.info("🚀 RAG Builder")
         logger.info("="*80)
-
         logger.info("\nPASSO 1/5: Carregando JSONL")
         records = load_jsonl(input_file)
-        if not records:
-            raise ValueError("Nenhum registro!")
-
        logger.info("\nPASSO 2/5: Criando Documents")
         documents = create_documents(records)
-        if not documents:
-            raise ValueError("Nenhum documento!")
-
         logger.info(f"\nPASSO 3/5: Inicializando Embeddings ({model_name})")
-        embeddings = HuggingFaceEmbeddings(
-            model_name=model_name,
-            model_kwargs={'device': 'cpu'},
-            encode_kwargs={'batch_size': batch_size, 'show_progress_bar': True, 'normalize_embeddings': True}
-        )
+        embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={'device': 'cpu'}, encode_kwargs={'batch_size': batch_size, 'show_progress_bar': True, 'normalize_embeddings': True})
         logger.info("✅ Embeddings OK")
-
         logger.info(f"\nPASSO 4/5: Construindo FAISS ({len(documents):,} docs)")
         start = time.time()
         vectorstore = FAISS.from_documents(documents, embeddings)
-        logger.info(f"✅ FAISS em {time.time()-start:.1f}s
-
+        logger.info(f"✅ FAISS em {time.time()-start:.1f}s")
         logger.info(f"\nPASSO 5/5: Salvando em {output_dir}")
         os.makedirs(output_dir, exist_ok=True)
         vectorstore.save_local(output_dir)
-        logger.info("✅ Salvo!")
-
-        logger.info("\n" + "="*80)
         logger.info("✅ BUILD COMPLETO!")
-        logger.info("="*80)
         return vectorstore
     except Exception as e:
-        logger.error("\n
-        logger.error(f"❌ ERRO: {type(e).__name__}: {e}")
+        logger.error(f"\n❌ ERRO: {type(e).__name__}: {e}")
         logger.error(traceback.format_exc())
-        logger.error("="*80)
         raise
 
 def main():
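For context, a minimal sketch of how the build step is invoked; the input path is hypothetical and the remaining arguments simply restate the defaults from the signature above:

```python
# Hypothetical input path; the other arguments restate the defaults of build_vectorstore.
from rag_builder import build_vectorstore

vs = build_vectorstore(
    input_file="data/acordaos_filtrado.jsonl",
    output_dir="/app/faiss_index",
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    batch_size=16,
)
# Assumes the LangChain FAISS wrapper exposes the raw index via .index
print(vs.index.ntotal, "vectors indexed")
```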
requirements.txt
CHANGED
@@ -1,28 +1,28 @@
 # FastAPI
-fastapi
-uvicorn[standard]
-pydantic
+fastapi==0.109.0
+uvicorn[standard]==0.27.0
+pydantic==2.5.0
 
 # LangChain + Embeddings
-langchain
-langchain-community
+langchain==0.1.11
+langchain-community==0.0.24
 
 # Sentence Transformers
-sentence-transformers
-transformers
-torch
-tokenizers
-safetensors
+sentence-transformers==2.5.1
+transformers==4.37.2
+torch==2.2.0
+tokenizers==0.15.2
+safetensors==0.4.2
 
 # FAISS
-faiss-cpu
+faiss-cpu==1.8.0
 
 # HuggingFace Hub
-huggingface-hub
+huggingface-hub==0.20.3
 
 # Utilities
-PyYAML
-GitPython
-pandas
-numpy
-tqdm
+PyYAML==6.0.1
+GitPython==3.1.41
+pandas==2.1.4
+numpy==1.26.3
+tqdm==4.66.1