Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| beeROOT Setup v3.6 - Com fragmentação de chunks | |
| Processa chunks do GitHub e constrói índice FAISS | |
| """ | |
| import os, sys, yaml, json, subprocess, logging, traceback, time, tarfile, re | |
| from pathlib import Path | |
| from datetime import datetime | |
# Log both to stdout and to a debug file so the console and a shell
# tailing /tmp can follow setup progress.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('/tmp/setup_debug.log', mode='w')
    ]
)
logger = logging.getLogger(__name__)
# JSON status file rewritten by update_status() on every state change;
# NOTE(review): presumably polled by a UI process — confirm against callers.
STATUS_FILE = Path('/tmp/setup_status.json')
# Flag file touched at the end of a successful run; its presence makes
# main() skip the whole setup on subsequent executions.
READY_FLAG = Path('/tmp/faiss_ready')
def update_status(status, message, progress=0):
    """Persist the current setup state to STATUS_FILE and log it.

    Args:
        status: Short state keyword (e.g. 'downloading', 'ready', 'error').
        message: Human-readable detail for the state.
        progress: Completion percentage (0-100).
    """
    snapshot = {
        'status': status,
        'message': message,
        'progress': progress,
        'timestamp': datetime.now().isoformat(),
    }
    # Rewrite the whole file each time; readers always see one JSON object.
    with open(STATUS_FILE, 'w') as handle:
        json.dump(snapshot, handle)
    logger.info(f"STATUS [{progress}%]: {status} - {message}")
def run_cmd(cmd, desc, check=True, timeout=300):
    """Run a shell command with timing and truncated output logging.

    Args:
        cmd: Command line, executed with shell=True.
        desc: Human-readable description used in the log banner.
        check: If True, a non-zero exit raises CalledProcessError.
        timeout: Seconds before the command is killed.

    Returns:
        The command's stdout (str). With check=False a non-zero exit
        still returns stdout.

    Raises:
        subprocess.CalledProcessError: non-zero exit and check=True.
        subprocess.TimeoutExpired: the command exceeded `timeout`.
    """
    logger.info("="*80)
    logger.info(f"🔧 {desc}")
    logger.info(f"📝 {cmd}")
    logger.info("-"*80)
    try:
        start = time.time()
        result = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
            text=True,
            timeout=timeout,
            check=check
        )
        elapsed = time.time() - start
        logger.info(f"⏱️ {elapsed:.2f}s | Exit: {result.returncode}")
        # Truncate captured streams so huge outputs don't flood the log.
        if result.stdout and result.stdout.strip():
            logger.info(f"STDOUT: {result.stdout[:500]}")
        if result.stderr and result.stderr.strip():
            logger.warning(f"STDERR: {result.stderr[:500]}")
        if result.returncode == 0:
            logger.info(f"✅ {desc} - OK")
        return result.stdout
    except subprocess.TimeoutExpired:
        # Fix: previously a timeout propagated without any log entry.
        logger.error(f"⏰ TIMEOUT after {timeout}s: {desc}")
        raise
    except subprocess.CalledProcessError:
        logger.error(f"❌ FALHOU: {desc}")
        raise
def clean_html_text(text):
    """Strip HTML tags and entities from *text* and collapse whitespace.

    Args:
        text: Input value; anything falsy or non-str yields "".

    Returns:
        The cleaned, stripped plain-text string (possibly empty).
    """
    if not text or not isinstance(text, str):
        return ""
    # Drop tags first, then entity references.
    text = re.sub(r'<[^>]+>', '', text)
    # Fix: also match numeric character references such as &#39; and
    # &#x27;, which the old pattern (&[a-zA-Z]+; only) left in the output.
    text = re.sub(r'&#?[0-9a-zA-Z]+;', ' ', text)
    # Collapse runs of whitespace (including newlines) into single spaces.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def is_valid_value(value):
    """Return True unless *value* is None or a blank/empty string.

    All non-str, non-None values (numbers, lists, dicts, booleans, ...)
    are considered valid.
    """
    if value is None:
        return False
    if isinstance(value, str):
        return bool(value.strip())
    # Fix: removed the original's redundant isinstance(value, (int, float))
    # branch — every non-str, non-None value fell through to True anyway.
    return True
def filter_and_validate_record(record, fields_to_keep):
    """Project *record* onto *fields_to_keep*, validating every field.

    Field lookup tries the given name first, then lower/capitalized/upper
    variants. Field names are normalized to lowercase in the result ('Id',
    'id' and 'ID' all become 'id'), and the 'ementa' field has its HTML
    stripped.

    Returns:
        (filtered_dict, True) when every field is present and valid;
        (None, False) as soon as any field is missing, invalid, or its
        ementa text is empty after HTML cleanup.
    """
    filtered = {}
    for field in fields_to_keep:
        # Resolve the value, trying common casing variants when the exact
        # name is absent from the record.
        if field in record:
            value = record[field]
        else:
            value = None
            for candidate in (field.lower(), field.capitalize(), field.upper()):
                if candidate in record:
                    value = record[candidate]
                    break
        # Missing or invalid → reject the whole record.
        if value is None or not is_valid_value(value):
            return None, False
        # Normalized output key.
        key = 'id' if field in ('Id', 'id', 'ID') else field.lower()
        if field.lower() == 'ementa' and isinstance(value, str):
            # Strip HTML from the ementa; an empty result invalidates
            # the record.
            cleaned = clean_html_text(value)
            if not cleaned.strip():
                return None, False
            filtered[key] = cleaned
        else:
            filtered[key] = value
    return filtered, True
def process_tar_gz(tar_path, output_jsonl, fields_to_keep, max_records_per_file=None):
    """Extract *jurisprudencias.jsonl* members from a tar.gz, filter each
    record and append the valid ones to *output_jsonl* (optionally split
    into numbered fragment files).

    Args:
        tar_path: Path of the tar.gz archive.
        output_jsonl: Path of the base output JSONL file.
        fields_to_keep: List of field names to keep per record.
        max_records_per_file: Max records per output file (None = no limit).

    Returns:
        Number of valid records written from this archive.
    """
    logger.info(f"📦 {tar_path.name}")
    stats = {'total': 0, 'validos': 0}
    # Fix: resume fragment numbering from parts written by previous calls;
    # the original reset the counter per call, so a later chunk reopened
    # e.g. *_part002 with mode 'w' and overwrote an earlier fragment.
    part_pattern = f"{output_jsonl.stem}_part*{output_jsonl.suffix}"
    file_counter = 1 + len(list(output_jsonl.parent.glob(part_pattern)))
    current_file_records = 0
    current_output = output_jsonl
    out_file = None
    try:
        with tarfile.open(tar_path, 'r:gz') as tar:
            for member in tar.getmembers():
                if not (member.isfile() and member.name.endswith('jurisprudencias.jsonl')):
                    continue
                logger.info(f" ✅ {member.name}")
                content = tar.extractfile(member).read().decode('utf-8')
                lines = content.strip().split('\n')
                # Fix: accumulate instead of overwriting, so the total is
                # correct when the archive holds several matching members.
                stats['total'] += len(lines)
                # Append: earlier chunks may already have written here.
                out_file = open(current_output, 'a', encoding='utf-8')
                for line in lines:
                    if not line.strip():
                        continue
                    try:
                        record = json.loads(line)
                        filtered, is_valid = filter_and_validate_record(
                            record,
                            fields_to_keep
                        )
                        if is_valid:
                            # Rotate to a new fragment when the cap is hit.
                            if max_records_per_file and current_file_records >= max_records_per_file:
                                out_file.close()
                                file_counter += 1
                                base = output_jsonl.stem
                                ext = output_jsonl.suffix
                                current_output = output_jsonl.parent / f"{base}_part{file_counter:03d}{ext}"
                                logger.info(f" 📝 Novo fragmento: {current_output.name}")
                                out_file = open(current_output, 'w', encoding='utf-8')
                                current_file_records = 0
                            out_file.write(json.dumps(filtered, ensure_ascii=False) + '\n')
                            stats['validos'] += 1
                            current_file_records += 1
                    except Exception as e:
                        # Best effort: a malformed line is skipped, not fatal.
                        logger.debug(f"Erro processando linha: {e}")
                out_file.close()
                out_file = None
        logger.info(f" ✅ {stats['validos']}/{stats['total']} válidos")
        return stats['validos']
    except Exception as e:
        logger.error(f" ❌ {e}")
        raise
    finally:
        # Fix: the original leaked the output handle when an exception
        # escaped the member loop.
        if out_file is not None and not out_file.closed:
            out_file.close()
def main():
    """Orchestrate the setup: download chunks, filter records, build FAISS.

    Progress is reported through update_status(); any unhandled error is
    logged with a traceback, written to the status file, and the process
    exits with status 1.
    """
    try:
        logger.info("\n" + "="*80)
        logger.info("🐝 beeROOT SETUP v3.6")
        logger.info("="*80)
        # Skip everything if a previous run already completed (flag is
        # touched at the end of a successful run).
        if READY_FLAG.exists():
            logger.info("✅ Setup já foi executado anteriormente")
            update_status('ready', 'Ready', 100)
            return
        # Load configuration; relative path, so the current working
        # directory must contain config.yaml.
        with open('config.yaml') as f:
            config = yaml.safe_load(f)
        chunk_start = config['chunk_start']
        chunk_end = config['chunk_end']
        github_repo = config['github_repo']
        campos_filter = config['campos_filter']
        # Optional: split the filtered output by record count.
        max_records_per_file = config.get('max_records_per_file', None)
        if max_records_per_file:
            logger.info(f"📊 Fragmentação ativada: {max_records_per_file} registros/arquivo")
        # Derive the raw-content base URL from the GitHub repo URL.
        base_url = github_repo.replace('https://github.com/', 'https://raw.githubusercontent.com/')
        if base_url.endswith('.git'):
            base_url = base_url[:-4]
        base_url = f"{base_url}/main/chunks_dados"
        # Working directory for downloads and the filtered JSONL output.
        work_dir = Path('/tmp/work')
        work_dir.mkdir(exist_ok=True)
        output_jsonl = work_dir / 'all_filtered.jsonl'
        # Start from a clean base output file.
        if output_jsonl.exists():
            output_jsonl.unlink()
        logger.info("\n📥 DOWNLOAD E PROCESSAMENTO")
        logger.info(f" Chunks: {chunk_start} a {chunk_end}")
        update_status('downloading', f'Processando chunks {chunk_start}-{chunk_end}', 10)
        total_validos = 0
        for chunk_num in range(chunk_start, chunk_end + 1):
            chunk_name = f"chunk_dados_{chunk_num:06d}.tar.gz"
            chunk_url = f"{base_url}/{chunk_name}"
            chunk_path = work_dir / chunk_name
            # Map chunk progress onto the 10%-60% band of the status bar.
            progress = 10 + int(((chunk_num - chunk_start + 1) / (chunk_end - chunk_start + 1)) * 50)
            update_status('processing', f'Chunk {chunk_num}/{chunk_end}', progress)
            try:
                # Download the archive (-f: fail on HTTP errors, -L: follow
                # redirects).
                run_cmd(
                    f"curl -L -f -o {chunk_path} {chunk_url}",
                    f"Download chunk {chunk_num}",
                    timeout=300
                )
                # Extract, filter and append valid records; delete the
                # archive afterwards to save space.
                if chunk_path.exists():
                    validos = process_tar_gz(
                        chunk_path,
                        output_jsonl,
                        campos_filter,
                        max_records_per_file
                    )
                    total_validos += validos
                    chunk_path.unlink()
            except Exception as e:
                # A failing chunk is logged and skipped; remaining chunks
                # are still processed.
                logger.error(f" ❌ Erro no chunk {chunk_num}: {e}")
                if chunk_path.exists():
                    chunk_path.unlink()
        logger.info(f"\n✅ Total de registros válidos: {total_validos:,}")
        if total_validos == 0:
            raise Exception("Nenhum registro válido encontrado!")
        # Build the FAISS index from the filtered JSONL file(s).
        logger.info("\n🤖 CONSTRUINDO ÍNDICE FAISS")
        update_status('building', 'Building FAISS index', 70)
        # rag_builder.py is run from the app directory; work_dir paths
        # below are absolute, so the chdir does not affect the glob.
        os.chdir('/home/user/app')
        # Pick up the base file and any *_part* fragments.
        jsonl_files = list(work_dir.glob('all_filtered*.jsonl'))
        logger.info(f" 📁 {len(jsonl_files)} arquivo(s) JSONL para processar")
        for jsonl_file in jsonl_files:
            logger.info(f" 📝 Processando: {jsonl_file.name}")
            result = subprocess.run(
                f"python3 rag_builder.py --input {jsonl_file}",
                shell=True,
                capture_output=True,
                text=True,
                timeout=900
            )
            # Re-emit the builder's output line by line through our logger.
            if result.stdout:
                for line in result.stdout.split('\n'):
                    if line.strip():
                        logger.info(f" {line}")
            if result.stderr:
                for line in result.stderr.split('\n'):
                    if line.strip():
                        logger.warning(f" {line}")
            if result.returncode != 0:
                raise Exception(f"Build falhou: exit {result.returncode}")
        logger.info("✅ FAISS build completo!")
        # Best-effort cleanup of the working directory (check=False).
        run_cmd(f"rm -rf {work_dir}", "Cleanup", check=False)
        # Mark setup as complete so future runs are skipped.
        update_status('ready', f'{total_validos:,} documentos indexados', 100)
        READY_FLAG.touch()
        logger.info("\n" + "="*80)
        logger.info("🎉 SETUP CONCLUÍDO COM SUCESSO!")
        logger.info("="*80)
    except Exception as e:
        # Top-level boundary: log the full traceback, surface the error via
        # the status file, and exit non-zero.
        logger.error(f"\n💥 ERRO FATAL: {e}")
        logger.error(traceback.format_exc())
        update_status('error', str(e), 0)
        sys.exit(1)
# Run the setup only when executed as a script, not on import.
if __name__ == "__main__":
    main()