Spaces:
Build error
Build error
File size: 4,110 Bytes
d9446cd cb3b8cb d9446cd cb3b8cb d9446cd cb3b8cb d9446cd cb3b8cb d9446cd cb3b8cb d9446cd cb3b8cb d9446cd cb3b8cb d9446cd cb3b8cb d9446cd cb3b8cb d9446cd cb3b8cb d9446cd cb3b8cb d9446cd cb3b8cb d9446cd cb3b8cb d9446cd cb3b8cb d9446cd cb3b8cb d9446cd cb3b8cb d9446cd cb3b8cb d9446cd cb3b8cb d9446cd cb3b8cb d9446cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
#!/usr/bin/env python3
import os
import sys
import yaml
import json
import subprocess
import logging
from pathlib import Path
from datetime import datetime
# Root logging setup: INFO and above, each record rendered as
# "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# JSON progress snapshot that update_status rewrites on every call.
STATUS_FILE = Path('/tmp/setup_status.json')
# Marker file: main() skips the setup when it exists and creates it on success.
READY_FLAG = Path('/tmp/faiss_ready')
def update_status(status, message, progress=0):
    """Persist the current setup state to STATUS_FILE and log it.

    Args:
        status: Short state label stored under the ``status`` key.
        message: Human-readable detail stored under the ``message`` key.
        progress: Percentage value stored under ``progress`` (defaults to 0).

    The snapshot also carries an ISO-formatted ``timestamp`` taken from the
    local clock. The file is overwritten on every call, and stdout is flushed
    afterwards.
    """
    snapshot = {
        'status': status,
        'message': message,
        'progress': progress,
        'timestamp': datetime.now().isoformat(),
    }
    with STATUS_FILE.open('w') as status_fh:
        json.dump(snapshot, status_fh)

    logger.info(f"[{progress}%] {status}: {message}")
    sys.stdout.flush()
def run_cmd(cmd, desc):
    """Run a shell command line and return its captured standard output.

    Args:
        cmd: Command string handed to the shell (pipes and redirections in
            the string are interpreted by the shell).
        desc: Short label used in the log lines and in the failure message.

    Returns:
        The command's stdout decoded as text.

    Raises:
        Exception: When the command exits with a non-zero status; stderr is
            logged before raising.
    """
    logger.info(f"Executando: {desc}")
    completed = subprocess.run(cmd, shell=True, capture_output=True, text=True)

    # Success path first; anything else falls through to the error report.
    if completed.returncode == 0:
        logger.info(f"✅ {desc}")
        return completed.stdout

    logger.error(f"ERRO: {completed.stderr}")
    raise Exception(f"{desc} falhou")
def main():
    """Run the full background setup pipeline, reporting progress as it goes.

    Steps, in order:
      1. Load ``config.yaml`` and read the chunk range and repository URL.
      2. Return early if READY_FLAG already exists.
      3. Sparse-clone the repository into ``/tmp/repo`` and check out the
         configured ``chunks_dados/chunk_dados_NNNN.tar.gz`` archives.
      4. Extract the archives into ``/tmp/extracted``.
      5. Concatenate every ``jurisprudencias.jsonl`` into
         ``/tmp/all_records.jsonl``.
      6. Run ``filter_fields.py`` and then ``rag_builder.py`` from
         ``/home/user/app``.
      7. Remove the temporary files and create READY_FLAG.

    Each stage is published through ``update_status``. Any exception is
    logged, recorded as an ``error`` status, and the process exits with
    status code 1.
    """
    # Function-scope import: only needed to clear stale scratch directories.
    import shutil

    try:
        logger.info("="*80)
        logger.info("🚀 PARA.AI RAG (LangChain) - SETUP EM BACKGROUND")
        logger.info("="*80)

        update_status('loading', 'Carregando configuração', 0)
        with open('config.yaml') as f:
            config = yaml.safe_load(f)
        # cluster_id is not used below; the lookup is kept so that a config
        # missing this key still fails here, as before.
        cluster_id = config['cluster_id']
        chunk_start = config['chunk_start']
        chunk_end = config['chunk_end']
        github_repo = config['github_repo']

        if READY_FLAG.exists():
            logger.info("✅ FAISS já pronto!")
            update_status('ready', 'FAISS já existe', 100)
            return

        # A previous run that failed midway leaves these scratch directories
        # behind. `git clone ... .` refuses to clone into a non-empty
        # directory, and leftover extracted files would be concatenated into
        # the new record set, so start from a clean slate.
        for stale_dir in ('/tmp/repo', '/tmp/extracted'):
            shutil.rmtree(stale_dir, ignore_errors=True)

        # CLONE
        update_status('cloning', 'Clonando chunks (sparse checkout)', 10)
        os.makedirs('/tmp/repo', exist_ok=True)
        os.chdir('/tmp/repo')
        run_cmd(f"git clone --filter=blob:none --sparse {github_repo} .", "Git clone")
        # NOTE(review): cone mode is documented as directory-oriented, while
        # the patterns added below name individual files — confirm the
        # deployed git version accepts them.
        run_cmd("git sparse-checkout init --cone", "Sparse checkout init")

        # chunk_end is inclusive.
        patterns = [
            f"chunks_dados/chunk_dados_{i:04d}.tar.gz"
            for i in range(chunk_start, chunk_end + 1)
        ]
        # Add paths in batches of 50 to keep each command line short.
        for i in range(0, len(patterns), 50):
            batch = ' '.join(patterns[i:i+50])
            run_cmd(f"git sparse-checkout add {batch}", f"Batch {i//50 + 1}")

        chunks_count = int(
            run_cmd(
                "find chunks_dados -name '*.tar.gz' 2>/dev/null | wc -l",
                "Contar chunks",
            ).strip()
        )
        logger.info(f"✅ {chunks_count} chunks clonados")

        # EXTRACT
        update_status('extracting', f'Descompactando {chunks_count} chunks', 30)
        os.makedirs('/tmp/extracted', exist_ok=True)
        # "\\;" yields the literal `\;` that find's -exec needs; a bare "\;"
        # is an invalid Python escape sequence. `|| true` keeps extraction
        # best-effort, as in the original.
        run_cmd(
            "find chunks_dados -name '*.tar.gz' -exec tar -xzf {} -C /tmp/extracted \\; 2>/dev/null || true",
            "Descompactar",
        )

        # CONCAT
        update_status('concatenating', 'Concatenando JSONL', 50)
        run_cmd(
            "find /tmp/extracted -name 'jurisprudencias.jsonl' -exec cat {} \\; > /tmp/all_records.jsonl 2>/dev/null || true",
            "Concatenar",
        )
        total_records = int(
            run_cmd(
                "wc -l < /tmp/all_records.jsonl 2>/dev/null || echo '0'",
                "Contar registros",
            ).strip()
        )
        logger.info(f"✅ {total_records:,} registros")

        # FILTER
        update_status('filtering', 'Filtrando campos (id + ementa)', 60)
        # The helper scripts are invoked by relative name from the app directory.
        os.chdir('/home/user/app')
        run_cmd(
            "python3 filter_fields.py --input /tmp/all_records.jsonl --output /tmp/filtered.jsonl",
            "Filtrar",
        )

        # BUILD FAISS
        update_status('building', 'Construindo FAISS index (pode demorar)', 70)
        run_cmd("python3 rag_builder.py --input /tmp/filtered.jsonl", "Build FAISS")

        # CLEANUP
        update_status('cleaning', 'Limpando temporários', 95)
        run_cmd(
            "rm -rf /tmp/repo /tmp/extracted /tmp/all_records.jsonl /tmp/filtered.jsonl",
            "Limpar",
        )

        # DONE
        update_status('ready', f'FAISS pronto com {total_records:,} registros!', 100)
        READY_FLAG.touch()
        logger.info("="*80)
        logger.info("✅ SETUP COMPLETO!")
        logger.info("="*80)
    except Exception as e:
        # Top-level boundary: record the failure for whoever reads the status
        # file, then exit non-zero.
        logger.error(f"❌ ERRO: {e}")
        update_status('error', str(e), 0)
        sys.exit(1)
# Script entry point: run the setup only when executed directly, not on import.
if __name__ == "__main__":
    main()
|