#!/usr/bin/env python3
"""Para.AI RAG setup (v3.5).

Downloads jurisprudence data chunks from a GitHub repository, filters and
cleans the records into a single JSONL file, then invokes ``rag_builder.py``
to build a FAISS index. Progress is reported through ``STATUS_FILE`` and a
``READY_FLAG`` sentinel so other processes can monitor the setup.
"""

import json
import logging
import os
import re
import subprocess
import sys
import tarfile
import time
import traceback
from datetime import datetime
from pathlib import Path

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('/tmp/setup_debug.log', mode='w'),
    ],
)
logger = logging.getLogger(__name__)

# Inter-process communication: watchers poll STATUS_FILE for JSON progress
# updates; READY_FLAG existing means the index was already built successfully.
STATUS_FILE = Path('/tmp/setup_status.json')
READY_FLAG = Path('/tmp/faiss_ready')


def update_status(status, message, progress=0):
    """Write the current setup status to STATUS_FILE as JSON and log it.

    Args:
        status: Short machine-readable state (e.g. 'downloading', 'ready', 'error').
        message: Human-readable description.
        progress: Integer percentage 0-100.
    """
    data = {
        'status': status,
        'message': message,
        'progress': progress,
        'timestamp': datetime.now().isoformat(),
    }
    with open(STATUS_FILE, 'w') as f:
        json.dump(data, f)
    logger.info(f"STATUS [{progress}%]: {status} - {message}")


def run_cmd(cmd, desc, check=True, timeout=300):
    """Run a shell command with verbose logging and return its stdout.

    Args:
        cmd: Command line executed through the shell. SECURITY NOTE: callers
            must never pass untrusted strings here (shell injection).
        desc: Human-readable description used in log output.
        check: When True, a nonzero exit raises CalledProcessError.
        timeout: Seconds before the command is killed (raises TimeoutExpired).

    Returns:
        The command's captured stdout (also returned for nonzero exits when
        ``check`` is False — previously this path silently returned None).

    Raises:
        subprocess.CalledProcessError: nonzero exit with ``check=True``.
        subprocess.TimeoutExpired: the command exceeded ``timeout``.
    """
    logger.info("=" * 80)
    logger.info(f"šŸ”§ {desc}")
    logger.info(f"šŸ“ {cmd}")
    logger.info("-" * 80)
    try:
        start = time.time()
        result = subprocess.run(
            cmd, shell=True, capture_output=True, text=True,
            timeout=timeout, check=check,
        )
        elapsed = time.time() - start
        logger.info(f"ā±ļø {elapsed:.2f}s | Exit: {result.returncode}")
        # Truncate captured output so huge downloads don't flood the log.
        if result.stdout and result.stdout.strip():
            logger.info(f"STDOUT: {result.stdout[:500]}")
        if result.stderr and result.stderr.strip():
            logger.warning(f"STDERR: {result.stderr[:500]}")
        if result.returncode == 0:
            logger.info(f"āœ… {desc} - OK")
        return result.stdout
    except subprocess.TimeoutExpired:
        # BUGFIX: timeouts were previously unlogged before propagating.
        logger.error(f"āŒ TIMEOUT ({timeout}s): {desc}")
        raise
    except subprocess.CalledProcessError:
        logger.error(f"āŒ FALHOU: {desc}")
        raise


def clean_html_text(text):
    """Strip HTML tags/entities from *text* and collapse whitespace.

    Non-string or empty input yields an empty string. Named entities
    (``&nbsp;`` etc.) are replaced with a single space, not decoded.
    """
    if not text or not isinstance(text, str):
        return ""
    text = re.sub(r'<[^>]+>', '', text)       # drop tags
    text = re.sub(r'&[a-zA-Z]+;', ' ', text)  # named entities -> space
    text = re.sub(r'\s+', ' ', text)          # collapse runs of whitespace
    return text.strip()


def is_valid_value(value):
    """Return True when *value* is usable: not None and, for strings, non-blank."""
    if value is None:
        return False
    if isinstance(value, str):
        return bool(value.strip())
    if isinstance(value, (int, float)):
        return True
    return True


def filter_and_validate_record(record, fields_to_keep):
    """Project *record* onto *fields_to_keep*, validating every field.

    Field lookup is case-tolerant (exact name, then lower/Capitalized/UPPER
    variants). Any field that is missing or invalid rejects the whole record.
    The 'ementa' field is HTML-cleaned; id-like names normalize to 'id',
    everything else to lowercase.

    Returns:
        (filtered_dict, True) on success, (None, False) on rejection.
    """
    filtered = {}
    for field in fields_to_keep:
        value = None
        if field in record:
            value = record[field]
        else:
            # Tolerate casing drift between chunks (e.g. 'Id' vs 'id' vs 'ID').
            for variant in [field.lower(), field.capitalize(), field.upper()]:
                if variant in record:
                    value = record[variant]
                    break
        if value is None:
            return None, False
        if not is_valid_value(value):
            return None, False
        field_name = 'id' if field in ['Id', 'id', 'ID'] else field.lower()
        if field.lower() == 'ementa' and isinstance(value, str):
            cleaned_value = clean_html_text(value)
            # A record whose ementa is pure markup carries no text — reject it.
            if not cleaned_value or not cleaned_value.strip():
                return None, False
            filtered[field_name] = cleaned_value
        else:
            filtered[field_name] = value
    return filtered, True


def process_tar_gz(tar_path, output_jsonl, fields_to_keep):
    """Extract 'jurisprudencias.jsonl' from *tar_path*, filter it, append to *output_jsonl*.

    Only the first matching member is processed (the function returns after it).

    Returns:
        Number of valid records appended (0 when no matching member exists).

    Raises:
        Re-raises any extraction/IO error after logging it.
    """
    logger.info(f"šŸ“¦ {tar_path.name}")
    stats = {'total': 0, 'validos': 0}
    try:
        with tarfile.open(tar_path, 'r:gz') as tar:
            for member in tar.getmembers():
                if member.name.endswith('jurisprudencias.jsonl') and member.isfile():
                    logger.info(f" āœ… {member.name}")
                    file_obj = tar.extractfile(member)
                    content = file_obj.read().decode('utf-8')
                    lines = content.strip().split('\n')
                    stats['total'] = len(lines)
                    with open(output_jsonl, 'a', encoding='utf-8') as out:
                        for line in lines:
                            if not line.strip():
                                continue
                            try:
                                record = json.loads(line)
                                filtered, is_valid = filter_and_validate_record(
                                    record, fields_to_keep)
                                if is_valid:
                                    out.write(json.dumps(filtered, ensure_ascii=False) + '\n')
                                    stats['validos'] += 1
                            except json.JSONDecodeError:
                                # BUGFIX: was a bare `except: pass`, which also
                                # hid programming errors. Skip only malformed lines.
                                pass
                    logger.info(f" āœ… {stats['validos']}/{stats['total']}")
                    return stats['validos']
        return 0
    except Exception as e:
        logger.error(f" āŒ {e}")
        raise


def main():
    """Run the full setup pipeline; writes status and exits 1 on any failure."""
    # Imported here so the module stays importable without PyYAML installed.
    import yaml
    try:
        logger.info("\n" + "=" * 80)
        logger.info("šŸš€ PARA.AI RAG SETUP v3.5")
        logger.info("=" * 80)

        # Idempotency: a previous successful run leaves READY_FLAG behind.
        if READY_FLAG.exists():
            update_status('ready', 'Ready', 100)
            return

        with open('config.yaml') as f:
            config = yaml.safe_load(f)
        chunk_start = config['chunk_start']
        chunk_end = config['chunk_end']
        github_repo = config['github_repo']
        campos_filter = config['campos_filter']

        # Convert the repo URL into its raw-content equivalent.
        base_url = github_repo.replace(
            'https://github.com/', 'https://raw.githubusercontent.com/')
        if base_url.endswith('.git'):
            base_url = base_url[:-4]
        base_url = f"{base_url}/main/chunks_dados"

        work_dir = Path('/tmp/work')
        work_dir.mkdir(exist_ok=True)
        output_jsonl = work_dir / 'all_filtered.jsonl'
        if output_jsonl.exists():
            output_jsonl.unlink()

        logger.info("\nšŸ“„ Download")
        update_status('downloading', 'Downloading', 10)
        total_validos = 0
        for chunk_num in range(chunk_start, chunk_end + 1):
            chunk_name = f"chunk_dados_{chunk_num:06d}.tar.gz"
            chunk_url = f"{base_url}/{chunk_name}"
            chunk_path = work_dir / chunk_name
            try:
                run_cmd(f"curl -L -f -o {chunk_path} {chunk_url}",
                        f"Chunk {chunk_num}", timeout=300)
                if chunk_path.exists():
                    validos = process_tar_gz(chunk_path, output_jsonl, campos_filter)
                    total_validos += validos
                    chunk_path.unlink()
            except Exception as e:
                # Best-effort per chunk: a failed download/parse skips that
                # chunk but the run continues with the remaining ones.
                logger.error(f" āŒ {e}")
                if chunk_path.exists():
                    chunk_path.unlink()

        logger.info(f"\nāœ… Total: {total_validos}")
        if total_validos == 0:
            raise RuntimeError("Nenhum registro!")

        logger.info("\nšŸ¤– Build FAISS")
        update_status('building', 'Building', 70)
        os.chdir('/home/user/app')
        result = subprocess.run(
            f"python3 rag_builder.py --input {output_jsonl}",
            shell=True, capture_output=True, text=True, timeout=900,
        )
        # Relay the builder's output line by line into our own log.
        if result.stdout:
            for line in result.stdout.split('\n'):
                if line.strip():
                    logger.info(line)
        if result.stderr:
            for line in result.stderr.split('\n'):
                if line.strip():
                    logger.warning(line)
        if result.returncode != 0:
            raise RuntimeError(f"Build falhou: exit {result.returncode}")

        logger.info("āœ… OK!")
        run_cmd(f"rm -rf {work_dir}", "Cleanup", check=False)
        update_status('ready', f'{total_validos} docs', 100)
        READY_FLAG.touch()
    except Exception as e:
        logger.error(f"\nšŸ’„ {e}")
        logger.error(traceback.format_exc())
        update_status('error', str(e), 0)
        sys.exit(1)


if __name__ == "__main__":
    main()