#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Setup Script - beeRoot v3.6 (FIXED)
Downloads and processes data chunks from GitHub, with validation.
"""

import os
import subprocess
import logging
import json
import tarfile
import traceback
from pathlib import Path

import yaml

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler('/tmp/setup.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


def load_config(config_path='config.yaml'):
    """Load the YAML configuration file."""
    with open(config_path) as f:
        return yaml.safe_load(f)


def is_valid_tar_gz(file_path):
    """Check whether a file is a valid, non-empty tar.gz archive."""
    try:
        # Minimum-size check: a real tar.gz is never this small
        # (a tiny file is usually an HTML error page saved by curl).
        size = os.path.getsize(file_path)
        if size < 100:
            logger.warning(f"File too small: {size} bytes (probably HTML error)")
            return False

        # Try to open it as a tar.gz and make sure it has members.
        with tarfile.open(file_path, 'r:gz') as tar:
            members = tar.getmembers()
            if len(members) == 0:
                return False

        return True
    except Exception as e:
        logger.warning(f"Not a valid tar.gz: {e}")
        return False


def download_chunks(config):
    """Download chunks from GitHub, validating each archive."""
    logger.info("=" * 80)
    logger.info("🐝 BEEROOT SETUP v3.6 (FIXED)")
    logger.info("=" * 80)

    repo_url = config['github_repo']
    branch = config.get('github_branch', 'main')
    chunks_path = config.get('chunks_path', 'chunks_dados')
    chunk_start = config.get('chunk_start', 1)
    chunk_end = config.get('chunk_end', 50)

    logger.info(f"\n📦 Source: {repo_url}")
    logger.info(f"📊 Range: chunks {chunk_start}-{chunk_end}")

    # Workspace
    work_dir = Path('/tmp/work')
    work_dir.mkdir(parents=True, exist_ok=True)

    # Build the correct base URL for GitHub raw content.
    # Example: https://github.com/user/repo.git → https://raw.githubusercontent.com/user/repo/main/<chunks_path>
    base_url = repo_url[:-4] if repo_url.endswith('.git') else repo_url
    base_url = base_url.replace('github.com', 'raw.githubusercontent.com')
    base_url = f"{base_url}/{branch}/{chunks_path}"

    logger.info(f"📍 Base URL: {base_url}")

    downloaded = []
    failed = []

    for chunk_id in range(chunk_start, chunk_end + 1):
        chunk_name = f"chunk_dados_{chunk_id:06d}.tar.gz"
        chunk_url = f"{base_url}/{chunk_name}"
        output_path = work_dir / chunk_name

        logger.info(f"\n  📥 Downloading {chunk_name}...")
        logger.info(f"     URL: {chunk_url}")

        # Download with curl
        result = subprocess.run(
            [
                'curl', '-L',
                '-f',  # -f = fail on HTTP errors
                '-o', str(output_path),
                chunk_url
            ],
            capture_output=True,
            text=True
        )

        if result.returncode == 0 and output_path.exists():
            size = output_path.stat().st_size
            logger.info(f"     ✅ Downloaded ({size:,} bytes)")

            # Validate that it really is a tar.gz
            if is_valid_tar_gz(output_path):
                logger.info("     ✅ Valid tar.gz")
                downloaded.append(output_path)
            else:
                logger.warning("     ❌ Invalid tar.gz (probably 404)")
                failed.append(chunk_name)
                output_path.unlink()  # Remove the invalid file
        else:
            logger.warning("     ❌ Failed to download")
            logger.warning(f"     Error: {result.stderr[:200]}")
            failed.append(chunk_name)

    logger.info("\n📊 Summary:")
    logger.info(f"   ✅ Downloaded: {len(downloaded)} chunks")
    logger.info(f"   ❌ Failed: {len(failed)} chunks")

    if failed:
        logger.warning(f"\n⚠️ Failed chunks: {', '.join(failed[:10])}")

    return downloaded


def extract_and_merge(chunk_files, config):
    """Extract the JSONL files from the chunks and merge them."""
    logger.info("\n📦 Extracting and merging...")

    work_dir = Path('/tmp/work')
    all_records = []
    campos = config.get('campos_filter', ['Id', 'ementa'])

    for chunk_file in chunk_files:
        try:
            logger.info(f"\n  📄 Processing: {chunk_file.name}")

            with tarfile.open(chunk_file, 'r:gz') as tar:
                # List the archive members
                members = tar.getmembers()
                logger.info(f"     Members: {[m.name for m in members]}")

                # Look for a JSONL file inside the archive
                jsonl_member = None
                for member in members:
                    if member.name.endswith('.jsonl'):
                        jsonl_member = member
                        break

                if not jsonl_member:
                    logger.warning("     ⚠️ No .jsonl file found!")
                    continue

                # Extract and process it
                f = tar.extractfile(jsonl_member)
                if f is None:
                    logger.warning("     ⚠️ Could not extract .jsonl member!")
                    continue
                content = f.read().decode('utf-8')
                lines = content.strip().split('\n')

                logger.info(f"     Lines: {len(lines)}")

                for line_num, line in enumerate(lines, 1):
                    if not line.strip():
                        continue
                    try:
                        record = json.loads(line)
                        # Keep only the configured fields
                        filtered = {k: v for k, v in record.items() if k in campos}
                        if 'Id' in filtered:  # Required field
                            all_records.append(filtered)
                        else:
                            logger.warning(f"     ⚠️ Line {line_num} missing 'Id'")
                    except json.JSONDecodeError as e:
                        logger.warning(f"     ⚠️ Line {line_num} invalid JSON: {e}")

        except Exception as e:
            logger.error(f"  ❌ Error extracting {chunk_file}: {e}")
            logger.error(traceback.format_exc())

    logger.info(f"\n  ✅ Total records extracted: {len(all_records):,}")

    # Save all_filtered.jsonl
    output_file = work_dir / 'all_filtered.jsonl'
    with open(output_file, 'w', encoding='utf-8') as f:
        for record in all_records:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')

    output_size = output_file.stat().st_size
    logger.info(f"  ✅ Saved: {output_file} ({output_size:,} bytes)")

    return output_file


def create_empty_faiss(output_dir):
    """Create an empty FAISS index as a fallback."""
    logger.warning("\n⚠️ Creating empty FAISS index (no data available)")

    try:
        from langchain_community.embeddings import HuggingFaceEmbeddings
        from langchain_community.vectorstores import FAISS
        from langchain.schema import Document

        # Create the embeddings model
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'}
        )

        # Create a placeholder document
        dummy_doc = Document(
            page_content="Sistema inicializando. Aguarde processamento de dados.",
            metadata={"id": "dummy", "status": "initializing"}
        )

        # Build a FAISS index with the single placeholder document
        vectorstore = FAISS.from_documents([dummy_doc], embeddings)

        # Persist it
        vectorstore.save_local(output_dir)

        logger.info(f"  ✅ Empty FAISS created at {output_dir}")
        return True

    except Exception as e:
        logger.error(f"  ❌ Failed to create empty FAISS: {e}")
        return False


def build_faiss(jsonl_file, config):
    """Build the FAISS index."""
    logger.info("\n🔨 Building FAISS index...")

    # Check whether there is any data to index
    file_size = jsonl_file.stat().st_size
    if file_size < 10:  # Empty (or nearly empty) file
        logger.warning(f"  ⚠️ JSONL file too small ({file_size} bytes)")
        logger.warning("  ⚠️ Creating empty FAISS as fallback")
        output_dir = config.get('faiss_path', '/home/user/app/faiss_index')
        create_empty_faiss(output_dir)
        return

    # Attempt the normal build
    result = subprocess.run(
        ['python3', 'rag_builder.py', '--input', str(jsonl_file), '--doc-batch-size', '5000'],
        capture_output=True,
        text=True,
        timeout=3600  # 1 hour
    )

    if result.returncode == 0:
        logger.info("  ✅ FAISS build complete!")
    else:
        logger.error("  ❌ FAISS build failed:")
        logger.error(result.stderr[:500])

        # Fallback: create an empty FAISS index
        logger.warning("  ⚠️ Attempting fallback: empty FAISS")
        output_dir = config.get('faiss_path', '/home/user/app/faiss_index')
        if create_empty_faiss(output_dir):
            logger.info("  ✅ Fallback successful")
        else:
            raise RuntimeError("FAISS build failed and fallback failed")


def main():
    try:
        config = load_config()

        # 1. Download chunks
        chunks = download_chunks(config)

        if not chunks:
            logger.error("\n❌ No chunks downloaded successfully!")
            logger.warning("\n⚠️ Creating empty FAISS to allow app to start")

            output_dir = config.get('faiss_path', '/home/user/app/faiss_index')
            if create_empty_faiss(output_dir):
                logger.info("\n✅ App will start with empty index")
                logger.info("   Configure correct GitHub repo and rebuild")
                return
            else:
                raise RuntimeError("No data and failed to create empty FAISS")

        # 2. Extract + merge
        jsonl_file = extract_and_merge(chunks, config)

        # 3. Build FAISS
        build_faiss(jsonl_file, config)

        logger.info("\n" + "=" * 80)
        logger.info("✅ SETUP COMPLETE!")
        logger.info("=" * 80)

    except Exception as e:
        logger.error(f"\n❌ Setup failed: {e}")
        logger.error(traceback.format_exc())

        # Last resort: create an empty FAISS index
        logger.warning("\n⚠️ Last resort: creating empty FAISS")
        try:
            config = load_config()
            output_dir = config.get('faiss_path', '/home/user/app/faiss_index')
            create_empty_faiss(output_dir)
            logger.info("\n⚠️ App will start but with no data")
            logger.info("   Fix config.yaml and rebuild Space")
        except Exception:
            logger.error("\n❌ Complete failure. Check logs.")
            raise


if __name__ == '__main__':
    main()
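
# Example config.yaml consumed by load_config() above. This is a minimal
# sketch reconstructed from the keys this script actually reads; the repo
# URL and chunk range below are illustrative placeholders, not real values.
#
#   github_repo: "https://github.com/<user>/<repo>.git"   # required
#   github_branch: "main"                                  # default: main
#   chunks_path: "chunks_dados"                            # folder holding chunk_dados_NNNNNN.tar.gz
#   chunk_start: 1                                         # first chunk id (inclusive)
#   chunk_end: 50                                          # last chunk id (inclusive)
#   campos_filter: ["Id", "ementa"]                        # fields kept from each JSONL record
#   faiss_path: "/home/user/app/faiss_index"               # where the FAISS index is saved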