Spaces:
Build error
Build error
#!/usr/bin/env python3
"""Background setup script: clone data chunks, build a FAISS index, report progress.

Progress is written as JSON to STATUS_FILE so another process can poll it;
READY_FLAG marks a completed build so reruns can skip the whole pipeline.
"""
import os
import sys
import yaml
import json
import subprocess
import logging
from pathlib import Path
from datetime import datetime

# Timestamped log lines so the long-running setup can be followed live.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# JSON status file polled by other processes to track setup progress.
STATUS_FILE = Path('/tmp/setup_status.json')
# Flag file whose existence means the FAISS index is already built.
READY_FLAG = Path('/tmp/faiss_ready')
def update_status(status, message, progress=0):
    """Persist the current setup state to STATUS_FILE and log it.

    Args:
        status: short machine-readable phase name (e.g. 'cloning', 'ready').
        message: human-readable description of the phase.
        progress: completion percentage, 0-100.
    """
    snapshot = {
        'status': status,
        'message': message,
        'progress': progress,
        'timestamp': datetime.now().isoformat(),
    }
    with STATUS_FILE.open('w') as handle:
        json.dump(snapshot, handle)
    logger.info(f"[{progress}%] {status}: {message}")
    # Force the line out immediately so external log tailing stays current.
    sys.stdout.flush()
def run_cmd(cmd, desc):
    """Run *cmd* through the shell and return its stdout.

    Args:
        cmd: shell command line to execute.
        desc: short description used in log messages and error text.

    Returns:
        The command's captured stdout as a string.

    Raises:
        RuntimeError: if the command exits non-zero; the message includes
            the command's stderr so the failure cause reaches the status
            file (the previous bare ``Exception(f"{desc} falhou")``
            discarded it).
    """
    logger.info(f"Executando: {desc}")
    # NOTE(review): shell=True with values interpolated from config.yaml —
    # safe only as long as the config is trusted; never pass untrusted input.
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        logger.error(f"ERRO: {result.stderr}")
        raise RuntimeError(f"{desc} falhou: {result.stderr.strip()}")
    logger.info(f"✅ {desc}")
    return result.stdout
def main():
    """Run the full background setup pipeline.

    Steps: load config.yaml, sparse-clone the chunk archives from GitHub,
    extract and concatenate the JSONL records, filter fields, build the
    FAISS index, then remove temporaries.  Each phase is reported via
    update_status(); any failure is logged, written to the status file,
    and the process exits with code 1.
    """
    try:
        logger.info("=" * 80)
        logger.info("🚀 PARA.AI RAG (LangChain) - SETUP EM BACKGROUND")
        logger.info("=" * 80)

        update_status('loading', 'Carregando configuração', 0)
        with open('config.yaml') as f:
            config = yaml.safe_load(f)
        cluster_id = config['cluster_id']  # read to validate presence; not used below
        chunk_start = config['chunk_start']
        chunk_end = config['chunk_end']
        github_repo = config['github_repo']

        # Fast path: a previous run already produced the index.
        if READY_FLAG.exists():
            logger.info("✅ FAISS já pronto!")
            update_status('ready', 'FAISS já existe', 100)
            return

        # CLONE: blobless sparse clone so only the requested archives download.
        update_status('cloning', 'Clonando chunks (sparse checkout)', 10)
        os.makedirs('/tmp/repo', exist_ok=True)
        os.chdir('/tmp/repo')
        run_cmd(f"git clone --filter=blob:none --sparse {github_repo} .", "Git clone")
        run_cmd("git sparse-checkout init --cone", "Sparse checkout init")
        patterns = [f"chunks_dados/chunk_dados_{i:04d}.tar.gz" for i in range(chunk_start, chunk_end + 1)]
        # Add patterns in batches of 50 to keep each command line short.
        for i in range(0, len(patterns), 50):
            batch = ' '.join(patterns[i:i + 50])
            run_cmd(f"git sparse-checkout add {batch}", f"Batch {i//50 + 1}")
        chunks_count = int(run_cmd("find chunks_dados -name '*.tar.gz' 2>/dev/null | wc -l", "Contar chunks").strip())
        logger.info(f"✅ {chunks_count} chunks clonados")

        # EXTRACT (raw strings: "\;" belongs to `find -exec`, and in a plain
        # literal it is an invalid Python escape sequence — a warning today,
        # a SyntaxError in future CPython releases).
        update_status('extracting', f'Descompactando {chunks_count} chunks', 30)
        os.makedirs('/tmp/extracted', exist_ok=True)
        run_cmd(r"find chunks_dados -name '*.tar.gz' -exec tar -xzf {} -C /tmp/extracted \; 2>/dev/null || true", "Descompactar")

        # CONCAT: merge every extracted JSONL into one file.
        update_status('concatenating', 'Concatenando JSONL', 50)
        run_cmd(r"find /tmp/extracted -name 'jurisprudencias.jsonl' -exec cat {} \; > /tmp/all_records.jsonl 2>/dev/null || true", "Concatenar")
        total_records = int(run_cmd("wc -l < /tmp/all_records.jsonl 2>/dev/null || echo '0'", "Contar registros").strip())
        logger.info(f"✅ {total_records:,} registros")

        # FILTER: keep only the fields the index needs (id + ementa).
        update_status('filtering', 'Filtrando campos (id + ementa)', 60)
        os.chdir('/home/user/app')
        run_cmd("python3 filter_fields.py --input /tmp/all_records.jsonl --output /tmp/filtered.jsonl", "Filtrar")

        # BUILD FAISS
        update_status('building', 'Construindo FAISS index (pode demorar)', 70)
        run_cmd("python3 rag_builder.py --input /tmp/filtered.jsonl", "Build FAISS")

        # CLEANUP: drop all intermediates to free /tmp space.
        update_status('cleaning', 'Limpando temporários', 95)
        run_cmd("rm -rf /tmp/repo /tmp/extracted /tmp/all_records.jsonl /tmp/filtered.jsonl", "Limpar")

        # DONE: touch the flag last so a crash never leaves a false "ready".
        update_status('ready', f'FAISS pronto com {total_records:,} registros!', 100)
        READY_FLAG.touch()
        logger.info("=" * 80)
        logger.info("✅ SETUP COMPLETO!")
        logger.info("=" * 80)
    except Exception as e:
        logger.error(f"❌ ERRO: {e}")
        update_status('error', str(e), 0)
        sys.exit(1)
# Script entry point: run the setup pipeline when executed directly.
if __name__ == "__main__":
    main()