Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| # -*- coding: utf-8 -*- | |
| """ | |
| Setup Script - beeRoot v3.6 (FIXED) | |
| Download e processa chunks de dados do GitHub com validação | |
| """ | |
| import os | |
| import subprocess | |
| import logging | |
| import yaml | |
| import json | |
| import tarfile | |
| from pathlib import Path | |
# Log to both a persistent file and stdout so Space build logs show
# progress in real time while /tmp/setup.log survives for later inspection.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler('/tmp/setup.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
def load_config(config_path='config.yaml'):
    """Load and parse the YAML configuration file.

    Args:
        config_path: Path to the YAML config file (defaults to
            'config.yaml' in the current working directory).

    Returns:
        The parsed configuration (typically a dict).

    Raises:
        FileNotFoundError: If the config file does not exist.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Explicit encoding so the config parses identically on every platform
    # (the platform default encoding is not guaranteed to be UTF-8).
    with open(config_path, encoding='utf-8') as f:
        return yaml.safe_load(f)
def is_valid_tar_gz(file_path):
    """Check whether *file_path* is a readable, non-empty .tar.gz archive.

    A failed GitHub raw download can still leave a small HTML/text body on
    disk, so a size floor is applied before attempting to parse the archive.

    Args:
        file_path: Path to the candidate archive file.

    Returns:
        True if the file opens as a gzip-compressed tar with at least one
        member, False otherwise.
    """
    try:
        # Minimum-size check: a real data chunk is far larger than an
        # HTML error page.
        size = os.path.getsize(file_path)
        if size < 100:
            logger.warning(f"File too small: {size} bytes (probably HTML error)")
            return False
        # Actually parse the archive; a corrupt gzip/tar stream raises here.
        with tarfile.open(file_path, 'r:gz') as tar:
            if not tar.getmembers():
                return False
        return True
    except (tarfile.TarError, OSError, EOFError) as e:
        # FIX: narrowed from bare `Exception` — corrupt archives and
        # filesystem errors are expected here, but programming errors
        # (NameError, TypeError, ...) should still surface.
        logger.warning(f"Not a valid tar.gz: {e}")
        return False
def download_chunks(config):
    """Download data chunk archives from GitHub, validating each one.

    Builds raw.githubusercontent.com URLs from the configured repository and
    branch, fetches each chunk with curl, and keeps only files that parse as
    real tar.gz archives (a 404 can come back as a small HTML body).

    Args:
        config: Parsed config dict; requires 'github_repo', and optionally
            'github_branch', 'chunks_path', 'chunk_start', 'chunk_end'.

    Returns:
        List of Path objects for the chunks that downloaded and validated.
    """
    logger.info("="*80)
    logger.info("🐝 BEEROOT SETUP v3.6 (FIXED)")
    logger.info("="*80)
    repo_url = config['github_repo']
    branch = config.get('github_branch', 'main')
    chunks_path = config.get('chunks_path', 'chunks_dados')
    chunk_start = config.get('chunk_start', 1)
    chunk_end = config.get('chunk_end', 50)
    logger.info(f"\n📦 Fonte: {repo_url}")
    logger.info(f"📊 Range: chunks {chunk_start}-{chunk_end}")
    # Workspace for downloaded archives.
    work_dir = Path('/tmp/work')
    work_dir.mkdir(exist_ok=True)
    # Build the raw-content base URL:
    # https://github.com/user/repo.git -> https://raw.githubusercontent.com/user/repo/main/...
    # FIX: strip '.git' only as a suffix — str.replace('.git', '') would also
    # mangle repository/path names that merely contain ".git".
    base_url = repo_url[:-4] if repo_url.endswith('.git') else repo_url
    base_url = base_url.replace('github.com', 'raw.githubusercontent.com')
    base_url = f"{base_url}/{branch}/{chunks_path}"
    logger.info(f"📍 Base URL: {base_url}")
    downloaded = []
    failed = []
    for chunk_id in range(chunk_start, chunk_end + 1):
        chunk_name = f"chunk_dados_{chunk_id:06d}.tar.gz"
        chunk_url = f"{base_url}/{chunk_name}"
        output_path = work_dir / chunk_name
        logger.info(f"\n 📥 Downloading {chunk_name}...")
        logger.info(f" URL: {chunk_url}")
        # curl: -L follows redirects, -f turns HTTP errors (404, ...) into a
        # nonzero exit code instead of saving the error page.
        result = subprocess.run(
            ['curl', '-L', '-f', '-o', str(output_path), chunk_url],
            capture_output=True,
            text=True
        )
        if result.returncode == 0 and output_path.exists():
            size = output_path.stat().st_size
            logger.info(f" ✅ Downloaded ({size:,} bytes)")
            # Double-check the payload really is a tar.gz; some error pages
            # arrive with a 200 status and slip past curl -f.
            if is_valid_tar_gz(output_path):
                logger.info(f" ✅ Valid tar.gz")
                downloaded.append(output_path)
            else:
                logger.warning(f" ❌ Invalid tar.gz (probably 404)")
                failed.append(chunk_name)
                output_path.unlink()  # drop the bogus file
        else:
            logger.warning(f" ❌ Failed to download")
            logger.warning(f" Error: {result.stderr[:200]}")
            failed.append(chunk_name)
    logger.info(f"\n📊 Summary:")
    logger.info(f" ✅ Downloaded: {len(downloaded)} chunks")
    logger.info(f" ❌ Failed: {len(failed)} chunks")
    if failed:
        logger.warning(f"\n⚠️ Failed chunks: {', '.join(failed[:10])}")
    return downloaded
def extract_and_merge(chunk_files, config):
    """Extract the JSONL member of each chunk archive and merge the records.

    Each tar.gz is expected to contain a .jsonl file. Every JSON line is
    parsed, reduced to the configured fields, and kept only when it carries
    the mandatory 'Id' field. The merged result is written to
    /tmp/work/all_filtered.jsonl.

    Args:
        chunk_files: Iterable of Paths to validated tar.gz archives.
        config: Parsed config dict; 'campos_filter' lists the fields to keep
            (defaults to ['Id', 'ementa']).

    Returns:
        Path to the merged JSONL output file.
    """
    logger.info("\n📦 Extracting and merging...")
    work_dir = Path('/tmp/work')
    all_records = []
    campos = config.get('campos_filter', ['Id', 'ementa'])
    for chunk_file in chunk_files:
        try:
            logger.info(f"\n 📄 Processing: {chunk_file.name}")
            with tarfile.open(chunk_file, 'r:gz') as tar:
                members = tar.getmembers()
                logger.info(f" Members: {[m.name for m in members]}")
                # Locate the first .jsonl member.
                jsonl_member = next(
                    (m for m in members if m.name.endswith('.jsonl')), None)
                if not jsonl_member:
                    logger.warning(f" ⚠️ No .jsonl file found!")
                    continue
                # FIX: extractfile() returns None for non-regular members;
                # guard against that instead of crashing on a malformed
                # archive, and close the file object deterministically.
                fobj = tar.extractfile(jsonl_member)
                if fobj is None:
                    logger.warning(f" ⚠️ No .jsonl file found!")
                    continue
                with fobj:
                    content = fobj.read().decode('utf-8')
                lines = content.strip().split('\n')
                logger.info(f" Lines: {len(lines)}")
                for line_num, line in enumerate(lines, 1):
                    if not line.strip():
                        continue
                    try:
                        record = json.loads(line)
                        # Keep only the configured fields.
                        filtered = {k: v for k, v in record.items() if k in campos}
                        if 'Id' in filtered:  # mandatory field
                            all_records.append(filtered)
                        else:
                            logger.warning(f" ⚠️ Line {line_num} missing 'Id'")
                    except json.JSONDecodeError as e:
                        logger.warning(f" ⚠️ Line {line_num} invalid JSON: {e}")
        except Exception as e:
            # A single corrupt chunk must not abort the whole merge.
            logger.error(f" ❌ Error extracting {chunk_file}: {e}")
            import traceback
            logger.error(traceback.format_exc())
    logger.info(f"\n ✅ Total records extracted: {len(all_records):,}")
    # Persist the merged output as one JSON object per line.
    output_file = work_dir / 'all_filtered.jsonl'
    with open(output_file, 'w', encoding='utf-8') as f:
        for record in all_records:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
    output_size = output_file.stat().st_size
    logger.info(f" ✅ Saved: {output_file} ({output_size:,} bytes)")
    return output_file
def create_empty_faiss(output_dir):
    """Build a placeholder FAISS index so the app can boot without data.

    Embeds a single dummy document and saves the resulting index to
    *output_dir*.

    Args:
        output_dir: Directory where the FAISS index is saved.

    Returns:
        True on success, False on any failure (failure is logged, not raised).
    """
    logger.warning("\n⚠️ Creating empty FAISS index (no data available)")
    try:
        # Imported lazily: these heavyweight deps are only needed on the
        # fallback path.
        from langchain_community.embeddings import HuggingFaceEmbeddings
        from langchain_community.vectorstores import FAISS
        from langchain.schema import Document

        # CPU-only embedder; a placeholder index has no performance needs.
        embedder = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
        )
        # A single placeholder document stands in for the real data.
        placeholder = Document(
            page_content="Sistema inicializando. Aguarde processamento de dados.",
            metadata={"id": "dummy", "status": "initializing"},
        )
        store = FAISS.from_documents([placeholder], embedder)
        store.save_local(output_dir)
        logger.info(f" ✅ Empty FAISS created at {output_dir}")
        return True
    except Exception as e:
        logger.error(f" ❌ Failed to create empty FAISS: {e}")
        return False
def build_faiss(jsonl_file, config):
    """Build the FAISS index from the merged JSONL, with an empty fallback.

    Delegates the real work to rag_builder.py in a subprocess; if the input
    is (near-)empty, or the build fails or times out, an empty FAISS index
    is created instead so the app can still start.

    Args:
        jsonl_file: Path to the merged all_filtered.jsonl file.
        config: Parsed config dict; 'faiss_path' overrides the output dir.

    Raises:
        RuntimeError: If both the build and the empty-index fallback fail.
    """
    logger.info("\n🔨 Building FAISS index...")
    # A near-empty file means no usable records: skip straight to fallback.
    file_size = jsonl_file.stat().st_size
    if file_size < 10:
        logger.warning(f" ⚠️ JSONL file too small ({file_size} bytes)")
        logger.warning(" ⚠️ Creating empty FAISS as fallback")
        output_dir = config.get('faiss_path', '/home/user/app/faiss_index')
        create_empty_faiss(output_dir)
        return
    # Normal build.
    # FIX: pass an argument list with shell=False instead of an interpolated
    # shell string — a path containing spaces or shell metacharacters would
    # previously break (or inject into) the command line.
    try:
        result = subprocess.run(
            ['python3', 'rag_builder.py',
             '--input', str(jsonl_file),
             '--doc-batch-size', '5000'],
            capture_output=True,
            text=True,
            timeout=3600  # 1 hour
        )
        returncode, stderr = result.returncode, result.stderr
    except subprocess.TimeoutExpired as e:
        # FIX: a timeout previously escaped as an unhandled exception;
        # treat it like any other failed build and fall back.
        returncode, stderr = -1, str(e)
    if returncode == 0:
        logger.info(" ✅ FAISS build complete!")
    else:
        logger.error(f" ❌ FAISS build failed:")
        logger.error(stderr[:500])
        # Fallback: empty index keeps the app bootable.
        logger.warning(" ⚠️ Attempting fallback: empty FAISS")
        output_dir = config.get('faiss_path', '/home/user/app/faiss_index')
        if create_empty_faiss(output_dir):
            logger.info(" ✅ Fallback successful")
        else:
            raise RuntimeError("FAISS build failed and fallback failed")
def main():
    """Run the full setup pipeline: download -> extract/merge -> build FAISS.

    Every failure path falls back to creating an empty FAISS index so the
    app can still start; only a total failure (no data AND the fallback
    failed) propagates an exception.
    """
    try:
        config = load_config()
        # 1. Download chunk archives.
        chunks = download_chunks(config)
        if not chunks:
            logger.error("\n❌ No chunks downloaded successfully!")
            logger.warning("\n⚠️ Creating empty FAISS to allow app to start")
            output_dir = config.get('faiss_path', '/home/user/app/faiss_index')
            if create_empty_faiss(output_dir):
                logger.info("\n✅ App will start with empty index")
                logger.info(" Configure correct GitHub repo and rebuild")
                return
            else:
                raise RuntimeError("No data and failed to create empty FAISS")
        # 2. Extract and merge into a single JSONL file.
        jsonl_file = extract_and_merge(chunks, config)
        # 3. Build the FAISS index (has its own internal fallback).
        build_faiss(jsonl_file, config)
        logger.info("\n" + "="*80)
        logger.info("✅ SETUP COMPLETO!")
        logger.info("="*80)
    except Exception as e:
        logger.error(f"\n❌ Setup failed: {e}")
        import traceback
        logger.error(traceback.format_exc())
        # Last resort: an empty FAISS index so the Space at least boots.
        logger.warning("\n⚠️ Last resort: creating empty FAISS")
        try:
            config = load_config()
            output_dir = config.get('faiss_path', '/home/user/app/faiss_index')
            create_empty_faiss(output_dir)
            logger.info("\n⚠️ App will start but with no data")
            logger.info(" Fix config.yaml and rebuild Space")
        except Exception:
            # FIX: was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt; narrowed to Exception. `raise` re-raises
            # the last-resort failure (with the original as context).
            logger.error("\n❌ Complete failure. Check logs.")
            raise


if __name__ == '__main__':
    main()