Spaces:
Build error
Build error
Upload 6 files
Browse files- QUICKSTART.txt +18 -3
- README.md +55 -240
- app.py +106 -24
- entrypoint.sh +25 -65
- monitor_setup.sh +68 -0
- setup.py +200 -0
QUICKSTART.txt
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
╔══════════════════════════════════════════════════════════════════════════════╗
|
| 2 |
║ PARA.AI RAG CLUSTER - QUICKSTART ║
|
|
|
|
| 3 |
╚══════════════════════════════════════════════════════════════════════════════╝
|
| 4 |
|
| 5 |
🎯 DEPLOY EM 5 MINUTOS:
|
|
@@ -24,14 +25,28 @@
|
|
| 24 |
$ git commit -m "Initial deployment"
|
| 25 |
$ git push origin main
|
| 26 |
|
| 27 |
-
4.
|
| 28 |
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
$ curl https://SEU-USUARIO-para-ai-rag-0301.hf.space/cluster/info
|
| 32 |
|
| 33 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
📖 LEIA: INSTRUCTIONS.md para guia completo
|
| 36 |
|
| 37 |
-
✅ PRONTO! Seu RAG está online!
|
|
|
|
| 1 |
╔══════════════════════════════════════════════════════════════════════════════╗
|
| 2 |
║ PARA.AI RAG CLUSTER - QUICKSTART ║
|
| 3 |
+
║ (COM ANTI-TIMEOUT HF SPACES) ║
|
| 4 |
╚══════════════════════════════════════════════════════════════════════════════╝
|
| 5 |
|
| 6 |
🎯 DEPLOY EM 5 MINUTOS:
|
|
|
|
| 25 |
$ git commit -m "Initial deployment"
|
| 26 |
$ git push origin main
|
| 27 |
|
| 28 |
+
4. Monitorar progresso:
|
| 29 |
|
| 30 |
+
# Space fica online em ~3s (FastAPI responde)
|
| 31 |
+
$ curl https://SEU-USUARIO-para-ai-rag-0301.hf.space/health
|
| 32 |
+
|
| 33 |
+
# Ver progresso do setup (~15min)
|
| 34 |
+
$ curl https://SEU-USUARIO-para-ai-rag-0301.hf.space/setup/status
|
| 35 |
+
|
| 36 |
+
5. Quando setup completo (progress: 100):
|
| 37 |
|
| 38 |
$ curl https://SEU-USUARIO-para-ai-rag-0301.hf.space/cluster/info
|
| 39 |
|
| 40 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 41 |
|
| 42 |
+
🔥 ARQUITETURA ANTI-TIMEOUT:
|
| 43 |
+
|
| 44 |
+
entrypoint.sh
|
| 45 |
+
├─ python3 -u setup.py & ← Background (15min)
|
| 46 |
+
└─ uvicorn app:app ← Foreground (3s) ✅ HF não fecha!
|
| 47 |
+
|
| 48 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 49 |
+
|
| 50 |
📖 LEIA: INSTRUCTIONS.md para guia completo
|
| 51 |
|
| 52 |
+
✅ PRONTO! Seu RAG está online (setup em background)!
|
README.md
CHANGED
|
@@ -6,269 +6,84 @@ colorTo: purple
|
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
license: agpl-3.0
|
| 9 |
-
tags:
|
| 10 |
-
- legal-ai
|
| 11 |
-
- rag
|
| 12 |
-
- jurisprudence
|
| 13 |
-
- brazilian-law
|
| 14 |
-
- chromadb
|
| 15 |
---
|
| 16 |
|
| 17 |
-
# ⚖️ Para.AI RAG Cluster
|
| 18 |
|
| 19 |
-
|
| 20 |
|
| 21 |
-
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
## 🚀 Status
|
| 32 |
-
|
| 33 |
-

|
| 34 |
-

|
| 35 |
-

|
| 36 |
-
|
| 37 |
-
**🔗 API Base URL:** `https://huggingface.co/spaces/seu-usuario/para-ai-rag-0301`
|
| 38 |
-
|
| 39 |
-
---
|
| 40 |
-
|
| 41 |
-
## 📡 Endpoints Disponíveis
|
| 42 |
-
|
| 43 |
-
### 1. Busca por Similaridade Semântica
|
| 44 |
|
| 45 |
-
|
| 46 |
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
```json
|
| 51 |
-
{
|
| 52 |
-
"query": "despejo por falta de pagamento de aluguel",
|
| 53 |
-
"top_k": 10,
|
| 54 |
-
"return_embeddings": false
|
| 55 |
-
}
|
| 56 |
-
```
|
| 57 |
|
| 58 |
-
|
| 59 |
-
```json
|
| 60 |
{
|
| 61 |
-
"
|
| 62 |
-
"
|
| 63 |
-
"
|
| 64 |
-
|
| 65 |
-
"id": "1234567",
|
| 66 |
-
"ementa": "AÇÃO DE DESPEJO. FALTA DE PAGAMENTO. PROCEDÊNCIA...",
|
| 67 |
-
"distance": 0.23,
|
| 68 |
-
"score": 0.77
|
| 69 |
-
}
|
| 70 |
-
],
|
| 71 |
-
"total_found": 10,
|
| 72 |
-
"query_time_ms": 45
|
| 73 |
}
|
| 74 |
```
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
### 2. Busca por Termos-Chave
|
| 79 |
-
|
| 80 |
-
**Endpoint:** `POST /search/keywords`
|
| 81 |
-
|
| 82 |
-
Busca por palavras-chave específicas (full-text search).
|
| 83 |
-
|
| 84 |
-
**Request:**
|
| 85 |
-
```json
|
| 86 |
-
{
|
| 87 |
-
"keywords": ["despejo", "falta de pagamento"],
|
| 88 |
-
"operator": "AND",
|
| 89 |
-
"top_k": 20
|
| 90 |
-
}
|
| 91 |
-
```
|
| 92 |
-
|
| 93 |
-
**Response:**
|
| 94 |
-
```json
|
| 95 |
-
{
|
| 96 |
-
"cluster_id": "RAG-0301",
|
| 97 |
-
"results": [
|
| 98 |
-
{
|
| 99 |
-
"id": "1234567",
|
| 100 |
-
"ementa": "AÇÃO DE DESPEJO. FALTA DE PAGAMENTO...",
|
| 101 |
-
"matched_keywords": ["despejo", "falta de pagamento"]
|
| 102 |
-
}
|
| 103 |
-
],
|
| 104 |
-
"total_found": 20,
|
| 105 |
-
"query_time_ms": 32
|
| 106 |
-
}
|
| 107 |
-
```
|
| 108 |
-
|
| 109 |
-
---
|
| 110 |
-
|
| 111 |
-
### 3. Busca por ID
|
| 112 |
-
|
| 113 |
-
**Endpoint:** `POST /search/by_id`
|
| 114 |
-
|
| 115 |
-
Busca direta por IDs de acórdãos.
|
| 116 |
-
|
| 117 |
-
**Request:**
|
| 118 |
-
```json
|
| 119 |
-
{
|
| 120 |
-
"ids": ["1234567", "7654321"],
|
| 121 |
-
"return_embeddings": true
|
| 122 |
-
}
|
| 123 |
-
```
|
| 124 |
-
|
| 125 |
-
**Response:**
|
| 126 |
-
```json
|
| 127 |
-
{
|
| 128 |
-
"cluster_id": "RAG-0301",
|
| 129 |
-
"results": [
|
| 130 |
-
{
|
| 131 |
-
"id": "1234567",
|
| 132 |
-
"ementa": "...",
|
| 133 |
-
"embedding": [0.12, -0.34, ...]
|
| 134 |
-
}
|
| 135 |
-
],
|
| 136 |
-
"not_found": ["7654321"],
|
| 137 |
-
"total_found": 1,
|
| 138 |
-
"query_time_ms": 15
|
| 139 |
-
}
|
| 140 |
-
```
|
| 141 |
-
|
| 142 |
-
---
|
| 143 |
-
|
| 144 |
-
### 4. Informações do Cluster
|
| 145 |
-
|
| 146 |
-
**Endpoint:** `GET /cluster/info`
|
| 147 |
-
|
| 148 |
-
Retorna informações sobre o cluster.
|
| 149 |
-
|
| 150 |
-
**Response:**
|
| 151 |
-
```json
|
| 152 |
-
{
|
| 153 |
-
"cluster_id": "RAG-0301",
|
| 154 |
-
"chunk_range": [301, 600],
|
| 155 |
-
"total_records": 295432,
|
| 156 |
-
"embedding_model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
|
| 157 |
-
"embedding_dim": 384,
|
| 158 |
-
"campos_disponiveis": ["id", "ementa"],
|
| 159 |
-
"db_size_mb": 1456,
|
| 160 |
-
"status": "ready",
|
| 161 |
-
"uptime_seconds": 3600
|
| 162 |
-
}
|
| 163 |
-
```
|
| 164 |
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
-
## 🔧
|
| 168 |
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
-
|
| 173 |
-
|
|
|
|
|
|
|
| 174 |
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
results = response.json()
|
| 184 |
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
print(f"Score: {result['score']:.2f}")
|
| 188 |
-
print(f"Ementa: {result['ementa'][:200]}...")
|
| 189 |
-
print("-" * 80)
|
| 190 |
-
```
|
| 191 |
|
| 192 |
-
|
| 193 |
|
| 194 |
-
|
|
|
|
|
|
|
| 195 |
|
|
|
|
|
|
|
| 196 |
```
|
| 197 |
-
GitHub (chunks 301-600)
|
| 198 |
-
│
|
| 199 |
-
▼
|
| 200 |
-
Git Sparse Checkout (~600MB)
|
| 201 |
-
│
|
| 202 |
-
▼
|
| 203 |
-
Descompactar .tar.gz
|
| 204 |
-
│
|
| 205 |
-
▼
|
| 206 |
-
Filtrar campos (id + ementa)
|
| 207 |
-
│
|
| 208 |
-
▼
|
| 209 |
-
Gerar Embeddings (MiniLM)
|
| 210 |
-
│
|
| 211 |
-
▼
|
| 212 |
-
ChromaDB (1.5GB)
|
| 213 |
-
│
|
| 214 |
-
▼
|
| 215 |
-
FastAPI (7860)
|
| 216 |
-
```
|
| 217 |
-
|
| 218 |
-
**Recursos utilizados:**
|
| 219 |
-
- 💾 RAM: ~2GB / 16GB disponíveis
|
| 220 |
-
- 💿 Disco: ~2.6GB / 50GB disponíveis
|
| 221 |
-
- ⚡ CPU: 1 vCPU / 2 disponíveis
|
| 222 |
-
|
| 223 |
-
---
|
| 224 |
-
|
| 225 |
-
## 📊 Dataset Fonte
|
| 226 |
-
|
| 227 |
-
Os dados vêm do **Para.AI Dataset**, um conjunto de ~4.5 milhões de acórdãos do TJPR, disponível publicamente no GitHub.
|
| 228 |
-
|
| 229 |
-
**Repositório:** [github.com/caarleexx/para-ai-data](https://github.com/caarleexx/para-ai-data)
|
| 230 |
-
|
| 231 |
-
---
|
| 232 |
-
|
| 233 |
-
## 🤝 Projeto Para.AI
|
| 234 |
|
| 235 |
-
|
| 236 |
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
- [Para.AI RAG 0001](https://huggingface.co/spaces/seu-usuario/para-ai-rag-0001) - Chunks 1-300
|
| 240 |
-
- [Para.AI RAG 0301](https://huggingface.co/spaces/seu-usuario/para-ai-rag-0301) - Chunks 301-600 (este)
|
| 241 |
-
- [Para.AI RAG 0601](https://huggingface.co/spaces/seu-usuario/para-ai-rag-0601) - Chunks 601-900
|
| 242 |
-
- ... (15 clusters no total)
|
| 243 |
-
|
| 244 |
-
### Gateway Agregador
|
| 245 |
-
|
| 246 |
-
Para buscar em **todos os clusters** simultaneamente, use o gateway:
|
| 247 |
-
- [Para.AI Gateway](https://huggingface.co/spaces/seu-usuario/para-ai-gateway)
|
| 248 |
-
|
| 249 |
-
---
|
| 250 |
-
|
| 251 |
-
## 📝 Licença
|
| 252 |
-
|
| 253 |
-
**AGPL-3.0** - Este projeto é open-source e gratuito para uso pessoal, acadêmico e não-comercial.
|
| 254 |
-
|
| 255 |
-
Para uso comercial, consulte a licença completa.
|
| 256 |
-
|
| 257 |
-
---
|
| 258 |
-
|
| 259 |
-
## 🐝 Legado
|
| 260 |
-
|
| 261 |
-
> *"Este projeto nasceu de uma indignação e de um sonho. É a transformação da frustração em uma ferramenta de poder para todos."*
|
| 262 |
-
|
| 263 |
-
**Para.AI** não é apenas código. É um movimento para dar voz aos silenciados e clareza aos deixados no escuro.
|
| 264 |
-
|
| 265 |
-
---
|
| 266 |
-
|
| 267 |
-
## 📧 Contato
|
| 268 |
-
|
| 269 |
-
- **Projeto:** [github.com/caarleexx/para-ai](https://github.com/caarleexx/para-ai)
|
| 270 |
-
- **Issues:** [github.com/caarleexx/para-ai/issues](https://github.com/caarleexx/para-ai/issues)
|
| 271 |
-
|
| 272 |
-
---
|
| 273 |
|
| 274 |
-
|
|
|
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
license: agpl-3.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# ⚖️ Para.AI RAG Cluster
|
| 12 |
|
| 13 |
+
Micro-cluster RAG para jurisprudências do TJPR usando Hugging Face Spaces (free tier).
|
| 14 |
|
| 15 |
+
## 🚀 Como Funciona
|
| 16 |
|
| 17 |
+
**Arquitetura anti-timeout:**
|
| 18 |
+
1. FastAPI inicia **imediatamente** (<3s)
|
| 19 |
+
2. Setup roda em **background** (~15min)
|
| 20 |
+
3. HF Spaces **não fecha** por timeout
|
| 21 |
+
4. Você acompanha progresso via `/setup/status`
|
| 22 |
|
| 23 |
+
## 📡 Endpoints
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
+
### Durante Setup (primeiros 15min)
|
| 26 |
|
| 27 |
+
```bash
|
| 28 |
+
# Ver progresso
|
| 29 |
+
curl https://SEU-USUARIO-para-ai-rag-0301.hf.space/setup/status
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
+
# Resposta:
|
|
|
|
| 32 |
{
|
| 33 |
+
"status": "building",
|
| 34 |
+
"message": "Construindo ChromaDB com embeddings",
|
| 35 |
+
"progress": 70,
|
| 36 |
+
"timestamp": "2026-02-10T20:15:00"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
}
|
| 38 |
```
|
| 39 |
|
| 40 |
+
### Após Setup Completo
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
+
- `POST /search/embedding` - Busca semântica
|
| 43 |
+
- `POST /search/keywords` - Busca por termos
|
| 44 |
+
- `POST /search/by_id` - Busca por ID
|
| 45 |
+
- `GET /cluster/info` - Info do cluster
|
| 46 |
|
| 47 |
+
## 🔧 Deploy
|
| 48 |
|
| 49 |
+
1. **Editar `config.yaml`:**
|
| 50 |
+
```yaml
|
| 51 |
+
cluster_id: "RAG-0301"
|
| 52 |
+
chunk_start: 301
|
| 53 |
+
chunk_end: 600
|
| 54 |
+
github_repo: "https://github.com/SEU-USUARIO/para-ai-data.git"
|
| 55 |
+
```
|
| 56 |
|
| 57 |
+
2. **Criar Space:**
|
| 58 |
+
```bash
|
| 59 |
+
huggingface-cli repo create para-ai-rag-0301 --type space --space_sdk docker
|
| 60 |
+
```
|
| 61 |
|
| 62 |
+
3. **Upload:**
|
| 63 |
+
```bash
|
| 64 |
+
git init
|
| 65 |
+
git remote add origin https://huggingface.co/spaces/SEU-USUARIO/para-ai-rag-0301
|
| 66 |
+
git add .
|
| 67 |
+
git commit -m "Deploy"
|
| 68 |
+
git push origin main
|
| 69 |
+
```
|
|
|
|
| 70 |
|
| 71 |
+
4. **Monitorar:**
|
| 72 |
+
Space fica online em ~3s, RAG pronto em ~15min
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
+
## 📊 Monitoramento
|
| 75 |
|
| 76 |
+
```bash
|
| 77 |
+
# Status atual
|
| 78 |
+
curl https://SEU-USUARIO-para-ai-rag-0301.hf.space/
|
| 79 |
|
| 80 |
+
# Logs do setup
|
| 81 |
+
# (via interface HF Spaces)
|
| 82 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
+
## 📚 Documentação
|
| 85 |
|
| 86 |
+
- `QUICKSTART.txt` - Deploy em 5 minutos
|
| 87 |
+
- `INSTRUCTIONS.md` - Guia completo
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
+
⚖️ **InJustiça não para o Paraná!** 🐝
|
app.py
CHANGED
|
@@ -1,27 +1,82 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
Para.AI RAG Cluster - FastAPI Application
|
| 4 |
-
|
| 5 |
"""
|
| 6 |
|
| 7 |
from fastapi import FastAPI, HTTPException
|
|
|
|
| 8 |
from pydantic import BaseModel
|
| 9 |
from typing import List, Optional
|
| 10 |
import logging
|
| 11 |
import time
|
| 12 |
-
|
|
|
|
| 13 |
|
| 14 |
logging.basicConfig(level=logging.INFO)
|
| 15 |
logger = logging.getLogger(__name__)
|
| 16 |
|
| 17 |
-
#
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
-
# FastAPI App
|
| 22 |
app = FastAPI(
|
| 23 |
title="Para.AI RAG Cluster",
|
| 24 |
-
description=
|
| 25 |
version="1.0.0"
|
| 26 |
)
|
| 27 |
|
|
@@ -36,7 +91,7 @@ class EmbeddingSearchRequest(BaseModel):
|
|
| 36 |
|
| 37 |
class KeywordSearchRequest(BaseModel):
|
| 38 |
keywords: List[str]
|
| 39 |
-
operator: str = "AND"
|
| 40 |
top_k: int = 20
|
| 41 |
|
| 42 |
class IDSearchRequest(BaseModel):
|
|
@@ -49,28 +104,50 @@ class IDSearchRequest(BaseModel):
|
|
| 49 |
|
| 50 |
@app.get("/")
|
| 51 |
async def root():
|
| 52 |
-
"""Health check
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
| 54 |
"status": "online",
|
| 55 |
-
"
|
| 56 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
query_engine.config['chunk_start'],
|
| 58 |
query_engine.config['chunk_end']
|
| 59 |
-
]
|
| 60 |
-
"endpoints"
|
| 61 |
"/search/embedding",
|
| 62 |
"/search/keywords",
|
| 63 |
"/search/by_id",
|
| 64 |
-
"/cluster/info"
|
|
|
|
| 65 |
]
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
@app.post("/search/embedding")
|
| 69 |
async def search_embedding(request: EmbeddingSearchRequest):
|
| 70 |
"""Busca por similaridade semântica (embeddings)"""
|
|
|
|
|
|
|
| 71 |
try:
|
| 72 |
start = time.time()
|
| 73 |
-
results =
|
| 74 |
query=request.query,
|
| 75 |
top_k=request.top_k,
|
| 76 |
return_embeddings=request.return_embeddings
|
|
@@ -84,9 +161,11 @@ async def search_embedding(request: EmbeddingSearchRequest):
|
|
| 84 |
@app.post("/search/keywords")
|
| 85 |
async def search_keywords(request: KeywordSearchRequest):
|
| 86 |
"""Busca por termos-chave (full-text search)"""
|
|
|
|
|
|
|
| 87 |
try:
|
| 88 |
start = time.time()
|
| 89 |
-
results =
|
| 90 |
keywords=request.keywords,
|
| 91 |
operator=request.operator,
|
| 92 |
top_k=request.top_k
|
|
@@ -100,9 +179,11 @@ async def search_keywords(request: KeywordSearchRequest):
|
|
| 100 |
@app.post("/search/by_id")
|
| 101 |
async def search_by_id(request: IDSearchRequest):
|
| 102 |
"""Busca direta por ID(s)"""
|
|
|
|
|
|
|
| 103 |
try:
|
| 104 |
start = time.time()
|
| 105 |
-
results =
|
| 106 |
ids=request.ids,
|
| 107 |
return_embeddings=request.return_embeddings
|
| 108 |
)
|
|
@@ -115,8 +196,10 @@ async def search_by_id(request: IDSearchRequest):
|
|
| 115 |
@app.get("/cluster/info")
|
| 116 |
async def cluster_info():
|
| 117 |
"""Informações detalhadas do cluster"""
|
|
|
|
|
|
|
| 118 |
try:
|
| 119 |
-
info =
|
| 120 |
info['uptime_seconds'] = round(time.time() - app.state.start_time, 2)
|
| 121 |
return info
|
| 122 |
except Exception as e:
|
|
@@ -125,12 +208,11 @@ async def cluster_info():
|
|
| 125 |
|
| 126 |
@app.on_event("startup")
|
| 127 |
async def startup_event():
|
| 128 |
-
"""Evento de startup"""
|
| 129 |
app.state.start_time = time.time()
|
| 130 |
logger.info("="*80)
|
| 131 |
-
logger.info(
|
| 132 |
-
logger.info(
|
| 133 |
-
logger.info(f"📊 Registros: {query_engine.collection.count():,}")
|
| 134 |
logger.info("="*80)
|
| 135 |
|
| 136 |
if __name__ == "__main__":
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
Para.AI RAG Cluster - FastAPI Application
|
| 4 |
+
Inicia IMEDIATAMENTE (antes do setup terminar) para evitar timeout HF
|
| 5 |
"""
|
| 6 |
|
| 7 |
from fastapi import FastAPI, HTTPException
|
| 8 |
+
from fastapi.responses import JSONResponse
|
| 9 |
from pydantic import BaseModel
|
| 10 |
from typing import List, Optional
|
| 11 |
import logging
|
| 12 |
import time
|
| 13 |
+
import json
|
| 14 |
+
from pathlib import Path
|
| 15 |
|
| 16 |
logging.basicConfig(level=logging.INFO)
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
|
| 19 |
+
# ============================================================================
|
| 20 |
+
# VERIFICAÇÃO DE STATUS DO SETUP
|
| 21 |
+
# ============================================================================
|
| 22 |
+
|
| 23 |
+
# Files shared with setup.py: progress status + "ChromaDB ready" flag.
STATUS_FILE = Path('/tmp/setup_status.json')
READY_FLAG = Path('/tmp/chromadb_ready')

def get_setup_status():
    """Read the background-setup status written by setup.py.

    Returns:
        dict with at least 'status', 'message' and 'progress' keys.
        Sentinel values are returned when the file is missing (setup not
        started yet) or unreadable (e.g. setup.py is mid-write), so callers
        never see an exception.
    """
    if not STATUS_FILE.exists():
        return {
            'status': 'initializing',
            'message': 'Setup ainda não iniciado',
            'progress': 0
        }

    try:
        with open(STATUS_FILE) as f:
            return json.load(f)
    # Narrowed from the original bare `except:` — only I/O and parse errors
    # (a partially written file) are expected; anything else should surface.
    except (OSError, json.JSONDecodeError):
        return {
            'status': 'unknown',
            'message': 'Erro ao ler status',
            'progress': 0
        }

def is_ready():
    """Return True once setup.py has created the ChromaDB ready flag."""
    return READY_FLAG.exists()
|
| 48 |
+
|
| 49 |
+
# ============================================================================
|
| 50 |
+
# LAZY LOADING DO QUERY ENGINE
|
| 51 |
+
# ============================================================================
|
| 52 |
+
|
| 53 |
+
# Lazily-created process-wide engine; None until ChromaDB is ready.
query_engine = None

def get_query_engine():
    """Return the shared QueryEngine, constructing it lazily on first use.

    Raises:
        HTTPException: 503 while the background setup has not yet flagged
            ChromaDB as ready, so search endpoints fail fast instead of
            importing a half-built database.
    """
    global query_engine

    if query_engine is not None:
        return query_engine

    if not is_ready():
        raise HTTPException(
            status_code=503,
            detail="RAG ainda em construção. Tente novamente em alguns minutos."
        )

    logger.info("Carregando QueryEngine...")
    # Imported here (not at module top) so app.py can start before the
    # database exists.
    from query_engine import QueryEngine
    query_engine = QueryEngine()
    logger.info("✅ QueryEngine carregado!")
    return query_engine
|
| 72 |
+
|
| 73 |
+
# ============================================================================
|
| 74 |
+
# FASTAPI APP
|
| 75 |
+
# ============================================================================
|
| 76 |
|
|
|
|
| 77 |
app = FastAPI(
|
| 78 |
title="Para.AI RAG Cluster",
|
| 79 |
+
description="Micro-cluster RAG para jurisprudências do TJPR",
|
| 80 |
version="1.0.0"
|
| 81 |
)
|
| 82 |
|
|
|
|
| 91 |
|
| 92 |
class KeywordSearchRequest(BaseModel):
|
| 93 |
keywords: List[str]
|
| 94 |
+
operator: str = "AND"
|
| 95 |
top_k: int = 20
|
| 96 |
|
| 97 |
class IDSearchRequest(BaseModel):
|
|
|
|
| 104 |
|
| 105 |
@app.get("/")
async def root():
    """Health check — always responds, even while setup is still running."""
    ready = is_ready()

    response = {
        "status": "online",
        "rag_ready": ready,
        "setup": get_setup_status(),
    }

    # Cluster details are only available once the engine has been loaded.
    if ready and query_engine:
        cfg = query_engine.config
        response.update({
            "cluster_id": cfg['cluster_id'],
            "chunk_range": [cfg['chunk_start'], cfg['chunk_end']],
            "endpoints": [
                "/search/embedding",
                "/search/keywords",
                "/search/by_id",
                "/cluster/info",
                "/setup/status",
            ],
        })

    return response
|
| 132 |
+
|
| 133 |
+
@app.get("/setup/status")
async def setup_status():
    """Expose the raw contents of the background-setup status file."""
    current = get_setup_status()
    return current

@app.get("/health")
async def health():
    """Minimal liveness probe for HF Spaces."""
    now = time.time()
    return {"status": "ok", "timestamp": now}
|
| 142 |
|
| 143 |
@app.post("/search/embedding")
|
| 144 |
async def search_embedding(request: EmbeddingSearchRequest):
|
| 145 |
"""Busca por similaridade semântica (embeddings)"""
|
| 146 |
+
engine = get_query_engine() # Lança 503 se não estiver pronto
|
| 147 |
+
|
| 148 |
try:
|
| 149 |
start = time.time()
|
| 150 |
+
results = engine.search_by_embedding(
|
| 151 |
query=request.query,
|
| 152 |
top_k=request.top_k,
|
| 153 |
return_embeddings=request.return_embeddings
|
|
|
|
| 161 |
@app.post("/search/keywords")
|
| 162 |
async def search_keywords(request: KeywordSearchRequest):
|
| 163 |
"""Busca por termos-chave (full-text search)"""
|
| 164 |
+
engine = get_query_engine()
|
| 165 |
+
|
| 166 |
try:
|
| 167 |
start = time.time()
|
| 168 |
+
results = engine.search_by_keywords(
|
| 169 |
keywords=request.keywords,
|
| 170 |
operator=request.operator,
|
| 171 |
top_k=request.top_k
|
|
|
|
| 179 |
@app.post("/search/by_id")
|
| 180 |
async def search_by_id(request: IDSearchRequest):
|
| 181 |
"""Busca direta por ID(s)"""
|
| 182 |
+
engine = get_query_engine()
|
| 183 |
+
|
| 184 |
try:
|
| 185 |
start = time.time()
|
| 186 |
+
results = engine.search_by_ids(
|
| 187 |
ids=request.ids,
|
| 188 |
return_embeddings=request.return_embeddings
|
| 189 |
)
|
|
|
|
| 196 |
@app.get("/cluster/info")
|
| 197 |
async def cluster_info():
|
| 198 |
"""Informações detalhadas do cluster"""
|
| 199 |
+
engine = get_query_engine()
|
| 200 |
+
|
| 201 |
try:
|
| 202 |
+
info = engine.get_cluster_info()
|
| 203 |
info['uptime_seconds'] = round(time.time() - app.state.start_time, 2)
|
| 204 |
return info
|
| 205 |
except Exception as e:
|
|
|
|
| 208 |
|
| 209 |
@app.on_event("startup")
|
| 210 |
async def startup_event():
|
| 211 |
+
"""Evento de startup - RÁPIDO (não aguarda setup)"""
|
| 212 |
app.state.start_time = time.time()
|
| 213 |
logger.info("="*80)
|
| 214 |
+
logger.info("🚀 Para.AI RAG Cluster FastAPI ONLINE")
|
| 215 |
+
logger.info("Setup em background: verificar /setup/status")
|
|
|
|
| 216 |
logger.info("="*80)
|
| 217 |
|
| 218 |
if __name__ == "__main__":
|
entrypoint.sh
CHANGED
|
@@ -5,78 +5,38 @@ echo "=================================="
|
|
| 5 |
echo "🚀 Para.AI RAG Cluster Startup"
|
| 6 |
echo "=================================="
|
| 7 |
|
| 8 |
-
#
|
| 9 |
-
|
| 10 |
-
CHUNK_END=$(python3 -c "import yaml; print(yaml.safe_load(open('config.yaml'))['chunk_end'])")
|
| 11 |
-
CLUSTER_ID=$(python3 -c "import yaml; print(yaml.safe_load(open('config.yaml'))['cluster_id'])")
|
| 12 |
-
GITHUB_REPO=$(python3 -c "import yaml; print(yaml.safe_load(open('config.yaml'))['github_repo'])")
|
| 13 |
-
|
| 14 |
-
echo "📊 Cluster ID: $CLUSTER_ID"
|
| 15 |
-
echo "📦 Chunks: $CHUNK_START - $CHUNK_END"
|
| 16 |
-
echo ""
|
| 17 |
-
|
| 18 |
-
# Verificar se ChromaDB já existe (persistência entre restarts se HF Space tiver)
|
| 19 |
-
if [ -d "/app/chromadb" ] && [ "$(ls -A /app/chromadb)" ]; then
|
| 20 |
-
echo "✅ ChromaDB já existe! Pulando build..."
|
| 21 |
-
else
|
| 22 |
-
echo "🔧 Construindo RAG pela primeira vez..."
|
| 23 |
-
|
| 24 |
-
# 1. Git sparse checkout
|
| 25 |
-
echo ""
|
| 26 |
-
echo "1️⃣ Clonando chunks do GitHub (sparse checkout)..."
|
| 27 |
-
mkdir -p /tmp/repo
|
| 28 |
-
cd /tmp/repo
|
| 29 |
-
|
| 30 |
-
git clone --filter=blob:none --sparse "$GITHUB_REPO" .
|
| 31 |
-
git sparse-checkout init --cone
|
| 32 |
-
|
| 33 |
-
# Gerar pattern para chunks
|
| 34 |
-
PATTERN=""
|
| 35 |
-
for i in $(seq -f "%04g" $CHUNK_START $CHUNK_END); do
|
| 36 |
-
PATTERN="$PATTERN chunks_dados/chunk_dados_$i.tar.gz"
|
| 37 |
-
done
|
| 38 |
-
|
| 39 |
-
git sparse-checkout set $PATTERN
|
| 40 |
-
|
| 41 |
-
echo "✅ $(find chunks_dados -name '*.tar.gz' | wc -l) chunks clonados"
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
echo "2️⃣ Descompactando chunks..."
|
| 46 |
-
mkdir -p /tmp/extracted
|
| 47 |
-
find chunks_dados -name "*.tar.gz" -exec tar -xzf {} -C /tmp/extracted \;
|
| 48 |
-
echo "✅ Chunks descompactados"
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
find /tmp/extracted -name "jurisprudencias.jsonl" -exec cat {} \; > /tmp/all_records.jsonl
|
| 54 |
-
TOTAL_RECORDS=$(wc -l < /tmp/all_records.jsonl)
|
| 55 |
-
echo "✅ $TOTAL_RECORDS registros concatenados"
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
echo "✅ Campos filtrados"
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
echo "✅ ChromaDB pronto!"
|
| 68 |
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
echo "🧹 Limpando arquivos temporários..."
|
| 72 |
-
rm -rf /tmp/repo /tmp/extracted /tmp/all_records.jsonl /tmp/filtered.jsonl
|
| 73 |
-
echo "✅ Limpeza concluída"
|
| 74 |
-
fi
|
| 75 |
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
echo ""
|
| 78 |
echo "=================================="
|
| 79 |
-
echo "
|
| 80 |
echo "=================================="
|
| 81 |
-
|
|
|
|
| 82 |
exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1
|
|
|
|
| 5 |
echo "🚀 Para.AI RAG Cluster Startup"
|
| 6 |
echo "=================================="
|
| 7 |
|
| 8 |
+
# Ir para diretório da aplicação
|
| 9 |
+
cd /home/user/app
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
+
# ESTRATÉGIA: Iniciar setup em background PRIMEIRO, depois FastAPI
|
| 12 |
+
# Isso evita timeout de inicialização do HF Spaces
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
+
echo ""
|
| 15 |
+
echo "1️⃣ Iniciando setup em background..."
|
| 16 |
+
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
+
# Iniciar setup.py em background com output unbuffered (-u)
|
| 19 |
+
# Redirecionar output para arquivo + tela
|
| 20 |
+
python3 -u setup.py > /tmp/setup_output.log 2>&1 &
|
| 21 |
+
SETUP_PID=$!
|
|
|
|
| 22 |
|
| 23 |
+
echo "✅ Setup iniciado em background (PID: $SETUP_PID)"
|
| 24 |
+
echo "📋 Logs em: /tmp/setup_output.log"
|
| 25 |
+
echo "📊 Status em: /tmp/setup_status.json"
|
| 26 |
+
echo ""
|
|
|
|
| 27 |
|
| 28 |
+
# Esperar 2 segundos para setup criar arquivo de status
|
| 29 |
+
sleep 2
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
+
echo "2️⃣ Iniciando FastAPI..."
|
| 32 |
+
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
|
| 33 |
+
echo "🎯 FastAPI estará online IMEDIATAMENTE"
|
| 34 |
+
echo "🔧 RAG estará disponível quando setup terminar (~10-15 min)"
|
| 35 |
+
echo "📡 Acompanhe em: /setup/status"
|
| 36 |
echo ""
|
| 37 |
echo "=================================="
|
| 38 |
+
echo "🚀 Iniciando API REST..."
|
| 39 |
echo "=================================="
|
| 40 |
+
|
| 41 |
+
# Iniciar FastAPI (bloqueia aqui)
|
| 42 |
exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1
|
monitor_setup.sh
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# monitor_setup.sh - Poll a Space's /setup/status endpoint and render progress.
#
# Usage:    ./monitor_setup.sh <SPACE_URL>
# Requires: curl, jq

SPACE_URL="$1"

if [ -z "$SPACE_URL" ]; then
    echo "Uso: $0 <SPACE_URL>"
    echo "Exemplo: $0 https://seu-usuario-para-ai-rag-0301.hf.space"
    exit 1
fi

echo "Monitorando setup de: $SPACE_URL"
echo ""

while true; do
    # Redraw from a clean screen each cycle
    clear

    echo "╔══════════════════════════════════════════════════════════════════════╗"
    echo "║              PARA.AI RAG - MONITOR DE SETUP                          ║"
    echo "╚══════════════════════════════════════════════════════════════════════╝"
    echo ""

    # Fetch current status
    RESPONSE=$(curl -s "$SPACE_URL/setup/status")

    # Quote expansions (original was unquoted: word-splitting/glob risk) and
    # supply jq fallbacks so a failed request does not yield the string "null".
    STATUS=$(echo "$RESPONSE" | jq -r '.status // "unknown"')
    MESSAGE=$(echo "$RESPONSE" | jq -r '.message // ""')
    PROGRESS=$(echo "$RESPONSE" | jq -r '.progress // 0')
    TIMESTAMP=$(echo "$RESPONSE" | jq -r '.timestamp // ""')

    # Guard against non-numeric progress before shell arithmetic (would abort).
    case "$PROGRESS" in
        ''|*[!0-9]*) PROGRESS=0 ;;
    esac

    echo "Status: $STATUS"
    echo "Progresso: $PROGRESS%"
    echo "Mensagem: $MESSAGE"
    echo "Timestamp: $TIMESTAMP"
    echo ""

    # Progress bar
    BAR_WIDTH=50
    FILLED=$((PROGRESS * BAR_WIDTH / 100))
    EMPTY=$((BAR_WIDTH - FILLED))

    printf "["
    printf "%${FILLED}s" | tr ' ' '█'
    printf "%${EMPTY}s" | tr ' ' '░'
    printf "] %d%%\n" "$PROGRESS"
    echo ""

    # Stop when the setup reports completion
    if [ "$STATUS" = "ready" ]; then
        echo "✅ SETUP COMPLETO!"
        echo ""
        echo "Testando cluster info..."
        curl -s "$SPACE_URL/cluster/info" | jq
        break
    fi

    if [ "$STATUS" = "error" ]; then
        echo "❌ ERRO NO SETUP!"
        break
    fi

    echo "Atualizando em 10 segundos..."
    sleep 10
done
|
setup.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Setup em background - Clona dados, constrói ChromaDB
|
| 4 |
+
Executa enquanto FastAPI já está respondendo (evita timeout HF)
|
| 5 |
+
"""
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
import yaml
|
| 9 |
+
import json
|
| 10 |
+
import subprocess
|
| 11 |
+
import logging
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
|
| 15 |
+
# Logging goes both to stdout (captured live by HF Spaces) and to
# /tmp/setup.log so progress can be inspected after the fact.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('/tmp/setup.log')
    ]
)
logger = logging.getLogger(__name__)

# NOTE(fix): the original code looped over `logger.handlers` and replaced each
# handler's `flush` with `lambda: None`, intending to "force immediate flush".
# That was doubly wrong: `logger.handlers` is empty here (basicConfig attaches
# handlers to the *root* logger, not to this named logger), and overriding
# flush with a no-op would *disable* flushing rather than force it.  The
# explicit `sys.stdout.flush()` calls elsewhere already give timely output,
# so the loop is removed.

# Files used to communicate setup progress to app.py (which polls them).
STATUS_FILE = Path('/tmp/setup_status.json')  # JSON progress report
READY_FLAG = Path('/tmp/chromadb_ready')      # touched when setup completes
| 32 |
+
|
| 33 |
+
def update_status(status: str, message: str, progress: int = 0) -> None:
    """Persist the current setup state so app.py can report it.

    Writes a small JSON document (status, message, progress, timestamp)
    to STATUS_FILE and mirrors the same information to the log.
    """
    payload = {
        'status': status,
        'message': message,
        'progress': progress,
        'timestamp': datetime.now().isoformat(),
    }
    STATUS_FILE.write_text(json.dumps(payload))
    logger.info(f"[{progress}%] {status}: {message}")
    sys.stdout.flush()
|
| 45 |
+
|
| 46 |
+
def run_command(cmd: str, description: str):
    """Run a shell command, logging it; raise on non-zero exit.

    Returns the command's captured stdout on success.
    """
    logger.info(f"Executando: {description}")
    logger.info(f"Comando: {cmd}")

    proc = subprocess.run(cmd, shell=True, capture_output=True, text=True)

    if proc.returncode == 0:
        logger.info(f"✅ {description} completo")
        return proc.stdout

    # Failure path: surface stderr both in the log and in the exception,
    # which main() catches to publish an 'error' status.
    logger.error(f"ERRO: {proc.stderr}")
    raise Exception(f"{description} falhou: {proc.stderr}")
|
| 64 |
+
|
| 65 |
+
def main():
    """Run the full background setup pipeline.

    Steps: sparse-clone this cluster's data chunks from GitHub, extract
    them, concatenate the JSONL records, filter fields, build the ChromaDB
    index, clean up temporaries, and flag readiness.  Progress is reported
    through update_status() so the already-running FastAPI app can expose
    it (this is what avoids the HF Spaces startup timeout).

    Exits the process with status 1 on any failure after publishing an
    'error' status for app.py to report.
    """
    try:
        logger.info("="*80)
        logger.info("🚀 PARA.AI RAG CLUSTER - SETUP EM BACKGROUND")
        logger.info("="*80)

        # Load cluster configuration (which chunk range this Space owns).
        update_status('loading', 'Carregando configuração', 0)
        with open('config.yaml') as f:
            config = yaml.safe_load(f)

        cluster_id = config['cluster_id']
        chunk_start = config['chunk_start']
        chunk_end = config['chunk_end']
        github_repo = config['github_repo']

        logger.info(f"Cluster: {cluster_id}")
        logger.info(f"Chunks: {chunk_start} - {chunk_end}")
        logger.info("")

        # Skip everything if a previous run already built ChromaDB.
        if READY_FLAG.exists():
            logger.info("✅ ChromaDB já pronto! Pulando setup...")
            update_status('ready', 'ChromaDB já existe', 100)
            return

        # STEP 1: git sparse checkout of only this cluster's chunk archives.
        update_status('cloning', 'Clonando chunks do GitHub (sparse checkout)', 10)

        os.makedirs('/tmp/repo', exist_ok=True)
        os.chdir('/tmp/repo')

        # --filter=blob:none defers blob download until sparse paths are set.
        run_command(
            f"git clone --filter=blob:none --sparse {github_repo} .",
            "Git clone inicial"
        )

        run_command(
            "git sparse-checkout init --cone",
            "Sparse checkout init"
        )

        # Build the list of chunk archive paths assigned to this cluster.
        logger.info(f"Gerando pattern para chunks {chunk_start}-{chunk_end}...")
        pattern_parts = [
            f"chunks_dados/chunk_dados_{i:04d}.tar.gz"
            for i in range(chunk_start, chunk_end + 1)
        ]

        # Add patterns in batches to avoid "argument list too long".
        batch_size = 50
        for i in range(0, len(pattern_parts), batch_size):
            batch = pattern_parts[i:i+batch_size]
            pattern = ' '.join(batch)
            run_command(
                f"git sparse-checkout add {pattern}",
                f"Sparse checkout batch {i//batch_size + 1}"
            )

        # Count how many chunk archives were actually fetched.
        result = run_command(
            "find chunks_dados -name '*.tar.gz' 2>/dev/null | wc -l",
            "Contar chunks"
        )
        chunks_count = int(result.strip())
        logger.info(f"✅ {chunks_count} chunks clonados")

        # STEP 2: extract all archives.
        update_status('extracting', f'Descompactando {chunks_count} chunks', 30)

        os.makedirs('/tmp/extracted', exist_ok=True)

        # Raw string: `\;` terminates find's -exec and must reach the shell
        # verbatim ("\;" in a normal string is an invalid escape sequence,
        # warned about on Python 3.12+ and slated to become an error).
        run_command(
            r"find chunks_dados -name '*.tar.gz' -exec tar -xzf {} -C /tmp/extracted \; 2>/dev/null || true",
            "Descompactar chunks"
        )

        # STEP 3: concatenate every jurisprudencias.jsonl into one file.
        update_status('concatenating', 'Concatenando jurisprudencias.jsonl', 50)

        run_command(
            r"find /tmp/extracted -name 'jurisprudencias.jsonl' -exec cat {} \; > /tmp/all_records.jsonl 2>/dev/null || true",
            "Concatenar JSONL"
        )

        # Count concatenated records (0 if the file is missing/empty).
        result = run_command(
            "wc -l < /tmp/all_records.jsonl 2>/dev/null || echo '0'",
            "Contar registros"
        )
        total_records = int(result.strip())
        logger.info(f"✅ {total_records:,} registros concatenados")

        # STEP 4: keep only the fields the RAG needs (id + ementa).
        update_status('filtering', 'Filtrando campos (id + ementa)', 60)

        # filter_fields.py / rag_builder.py live in the app directory.
        os.chdir('/home/user/app')
        run_command(
            "python3 filter_fields.py --input /tmp/all_records.jsonl --output /tmp/filtered.jsonl",
            "Filtrar campos"
        )

        # STEP 5: build the ChromaDB index with embeddings (the slow part).
        update_status('building', 'Construindo ChromaDB com embeddings (pode demorar)', 70)

        run_command(
            "python3 rag_builder.py --input /tmp/filtered.jsonl",
            "Build ChromaDB"
        )

        # STEP 6: remove temporary working files.
        update_status('cleaning', 'Limpando arquivos temporários', 95)

        run_command(
            "rm -rf /tmp/repo /tmp/extracted /tmp/all_records.jsonl /tmp/filtered.jsonl",
            "Limpar temporários"
        )

        # STEP 7: publish readiness for app.py and for restart short-circuit.
        update_status('ready', f'ChromaDB pronto com {total_records:,} registros!', 100)
        READY_FLAG.touch()

        logger.info("="*80)
        logger.info("✅ SETUP COMPLETO - RAG PRONTO PARA USO!")
        logger.info("="*80)

    except Exception as e:
        logger.error("="*80)
        logger.error(f"❌ ERRO NO SETUP: {e}")
        logger.error("="*80)
        update_status('error', str(e), 0)
        sys.exit(1)
|
| 198 |
+
|
| 199 |
+
# Entry point: run the full background setup pipeline when executed as a script.
if __name__ == "__main__":
    main()
|