#!/usr/bin/env python3
"""Background setup script for the PARA.AI RAG FAISS index.

Pipeline (see ``main``): load ``config.yaml``, sparse-clone the configured
git repository, extract the selected ``chunk_dados_*.tar.gz`` archives,
concatenate their ``jurisprudencias.jsonl`` files, filter the fields, build
the FAISS index, and clean up the temporary files.

Progress is written to ``STATUS_FILE`` as JSON after every step, and
``READY_FLAG`` is created once the whole pipeline has succeeded.
"""
import json
import logging
import os
import shutil
import subprocess
import sys
from datetime import datetime
from pathlib import Path

import yaml

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# JSON document describing the current step (rewritten on every update).
STATUS_FILE = Path('/tmp/setup_status.json')
# Marker file whose existence means the index has already been built.
READY_FLAG = Path('/tmp/faiss_ready')


def update_status(status: str, message: str, progress: int = 0) -> None:
    """Write the current setup state to ``STATUS_FILE`` and log it.

    Args:
        status: Short machine-readable state name (e.g. ``'cloning'``).
        message: Human-readable description of the step.
        progress: Completion percentage reported alongside the status.
    """
    data = {
        'status': status,
        'message': message,
        'progress': progress,
        'timestamp': datetime.now().isoformat(),
    }
    with open(STATUS_FILE, 'w') as f:
        json.dump(data, f)
    logger.info(f"[{progress}%] {status}: {message}")
    sys.stdout.flush()


def run_cmd(cmd: str, desc: str) -> str:
    """Run ``cmd`` through the shell and return its captured stdout.

    Args:
        cmd: Shell command line. It is executed with ``shell=True`` because
            the callers rely on redirections and ``||`` chaining.
        desc: Short label used in log lines and in the error message.

    Returns:
        The command's standard output as text.

    Raises:
        RuntimeError: If the command exits with a non-zero status; stderr is
            logged before raising.
    """
    logger.info(f"Executando: {desc}")
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        logger.error(f"ERRO: {result.stderr}")
        raise RuntimeError(f"{desc} falhou")
    logger.info(f"✅ {desc}")
    return result.stdout


def main() -> None:
    """Run the full setup pipeline, reporting progress via ``update_status``.

    Returns early when ``READY_FLAG`` already exists. On any failure the
    error is logged, an ``'error'`` status is written, and the process exits
    with status 1.
    """
    try:
        logger.info("="*80)
        logger.info("🚀 PARA.AI RAG - SETUP EM BACKGROUND")
        logger.info("="*80)

        update_status('loading', 'Carregando configuração', 0)
        with open('config.yaml') as f:
            config = yaml.safe_load(f)

        if READY_FLAG.exists():
            logger.info("✅ FAISS já pronto!")
            update_status('ready', 'FAISS já existe', 100)
            return

        update_status('cloning', 'Clonando chunks', 10)
        # Cleanup only happens at the end of a successful run, so a previous
        # failed attempt can leave /tmp/repo behind; `git clone ... .` refuses
        # to clone into a non-empty directory. Start from a clean slate.
        shutil.rmtree('/tmp/repo', ignore_errors=True)
        os.makedirs('/tmp/repo', exist_ok=True)
        os.chdir('/tmp/repo')
        # NOTE(review): config['github_repo'] is interpolated into a shell
        # string unquoted; this assumes config.yaml is trusted.
        run_cmd(f"git clone --filter=blob:none --sparse {config['github_repo']} .", "Git clone")
        run_cmd("git sparse-checkout init --cone", "Sparse init")

        patterns = [
            f"chunks_dados/chunk_dados_{i:04d}.tar.gz"
            for i in range(config['chunk_start'], config['chunk_end']+1)
        ]
        # Add the paths in batches of 50 to keep each command line short.
        # NOTE(review): cone mode is documented as directory-based while these
        # are file paths — confirm this works on the deployed git version.
        for i in range(0, len(patterns), 50):
            run_cmd(f"git sparse-checkout add {' '.join(patterns[i:i+50])}", f"Batch {i//50+1}")

        update_status('extracting', 'Descompactando', 30)
        # Drop leftovers from an earlier failed run so stale records are not
        # mixed into this build.
        shutil.rmtree('/tmp/extracted', ignore_errors=True)
        os.makedirs('/tmp/extracted', exist_ok=True)
        # Raw strings: `\;` must reach the shell verbatim and is not a valid
        # Python escape sequence. Extraction is deliberately best-effort
        # (`|| true`).
        run_cmd(r"find chunks_dados -name '*.tar.gz' -exec tar -xzf {} -C /tmp/extracted \; 2>/dev/null || true", "Extract")

        update_status('concatenating', 'Concatenando JSONL', 50)
        run_cmd(r"find /tmp/extracted -name 'jurisprudencias.jsonl' -exec cat {} \; > /tmp/all_records.jsonl 2>/dev/null || true", "Concat")

        total = int(run_cmd("wc -l < /tmp/all_records.jsonl 2>/dev/null || echo '0'", "Count").strip())
        logger.info(f"✅ {total:,} registros")

        update_status('filtering', 'Filtrando campos', 60)
        # The helper scripts are invoked by relative name from this directory.
        os.chdir('/home/user/app')
        run_cmd("python3 filter_fields.py --input /tmp/all_records.jsonl --output /tmp/filtered.jsonl", "Filter")

        update_status('building', 'Construindo FAISS', 70)
        run_cmd("python3 rag_builder.py --input /tmp/filtered.jsonl", "Build FAISS")

        update_status('cleaning', 'Limpando', 95)
        run_cmd("rm -rf /tmp/repo /tmp/extracted /tmp/all_records.jsonl /tmp/filtered.jsonl", "Clean")

        update_status('ready', f'FAISS pronto com {total:,} registros!', 100)
        READY_FLAG.touch()

        logger.info("="*80)
        logger.info("✅ SETUP COMPLETO!")
        logger.info("="*80)
    except Exception as e:
        # Top-level boundary: keep the traceback in the log, publish the
        # failure, and exit non-zero.
        # NOTE(review): the status text says "Build FAISS falhou" whichever
        # step actually failed.
        logger.exception("❌ ERRO: %s", e)
        update_status('error', f'Build FAISS falhou: {str(e)}', 0)
        sys.exit(1)


if __name__ == "__main__":
    main()