#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Setup Script - beeRoot v3.6 (FIXED)
Downloads and processes data chunks from GitHub, with validation
"""
import os
import subprocess
import logging
import yaml
import json
import tarfile
from pathlib import Path
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler('/tmp/setup.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
def load_config(config_path='config.yaml'):
    """Load the YAML configuration"""
    with open(config_path) as f:
        return yaml.safe_load(f)
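
# A minimal config.yaml sketch for reference. The key names below are taken
# from the config[...] / config.get(...) lookups in this script; the values
# are illustrative placeholders, not the real project settings:
#
#   github_repo: https://github.com/user/repo.git
#   github_branch: main
#   chunks_path: chunks_dados
#   chunk_start: 1
#   chunk_end: 50
#   campos_filter: [Id, ementa]
#   faiss_path: /home/user/app/faiss_index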
def is_valid_tar_gz(file_path):
    """Check whether a file is a valid tar.gz archive"""
    try:
        # Check the minimum size
        size = os.path.getsize(file_path)
        if size < 100:  # Too small to be a real tar.gz
            logger.warning(f"File too small: {size} bytes (probably HTML error)")
            return False
        # Try to open it as tar.gz
        with tarfile.open(file_path, 'r:gz') as tar:
            members = tar.getmembers()
            if len(members) == 0:
                return False
        return True
    except Exception as e:
        logger.warning(f"Not a valid tar.gz: {e}")
        return False
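
# A cheaper pre-check sketch, if ever needed: every gzip stream starts with
# the magic bytes 0x1f 0x8b, so a bogus download (e.g. an HTML error page)
# can be rejected without opening the whole archive:
#
#   with open(file_path, 'rb') as fh:
#       if fh.read(2) != b'\x1f\x8b':
#           return False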
def download_chunks(config):
    """Download chunks from GitHub, with validation"""
    logger.info("=" * 80)
    logger.info("🐝 BEEROOT SETUP v3.6 (FIXED)")
    logger.info("=" * 80)
    repo_url = config['github_repo']
    branch = config.get('github_branch', 'main')
    chunks_path = config.get('chunks_path', 'chunks_dados')
    chunk_start = config.get('chunk_start', 1)
    chunk_end = config.get('chunk_end', 50)
    logger.info(f"\n📦 Source: {repo_url}")
    logger.info(f"📊 Range: chunks {chunk_start}-{chunk_end}")
    # Workspace
    work_dir = Path('/tmp/work')
    work_dir.mkdir(exist_ok=True)
    # Build the correct base URL for GitHub raw content
    # Example: https://github.com/user/repo.git → https://raw.githubusercontent.com/user/repo/main/
    base_url = repo_url.removesuffix('.git')  # only strip a trailing '.git' (replace() would remove it anywhere)
    base_url = base_url.replace('github.com', 'raw.githubusercontent.com')
    base_url = f"{base_url}/{branch}/{chunks_path}"
    logger.info(f"📍 Base URL: {base_url}")
    downloaded = []
    failed = []
    for chunk_id in range(chunk_start, chunk_end + 1):
        chunk_name = f"chunk_dados_{chunk_id:06d}.tar.gz"
        chunk_url = f"{base_url}/{chunk_name}"
        output_path = work_dir / chunk_name
        logger.info(f"\n 📥 Downloading {chunk_name}...")
        logger.info(f" URL: {chunk_url}")
        # Download with curl
        result = subprocess.run(
            [
                'curl', '-L', '-f',  # -f = fail on HTTP errors
                '-o', str(output_path),
                chunk_url
            ],
            capture_output=True,
            text=True
        )
        if result.returncode == 0 and output_path.exists():
            size = output_path.stat().st_size
            logger.info(f" ✅ Downloaded ({size:,} bytes)")
            # Validate that it is a real tar.gz
            if is_valid_tar_gz(output_path):
                logger.info(" ✅ Valid tar.gz")
                downloaded.append(output_path)
            else:
                logger.warning(" ❌ Invalid tar.gz (probably 404)")
                failed.append(chunk_name)
                output_path.unlink()  # Remove the invalid file
        else:
            logger.warning(" ❌ Failed to download")
            logger.warning(f" Error: {result.stderr[:200]}")
            failed.append(chunk_name)
    logger.info("\n📊 Summary:")
    logger.info(f" ✅ Downloaded: {len(downloaded)} chunks")
    logger.info(f" ❌ Failed: {len(failed)} chunks")
    if failed:
        logger.warning(f"\n⚠️ Failed chunks: {', '.join(failed[:10])}")
    return downloaded
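
# Sketch of a pure-Python alternative to the curl call above, for environments
# where curl is unavailable. It is not wired into download_chunks; the name
# and behavior are illustrative only.
def _download_with_urllib(url, output_path, timeout=60):
    """Download url to output_path with the standard library; True on success."""
    import urllib.error
    import urllib.request
    try:
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            Path(output_path).write_bytes(resp.read())
        return True
    except (urllib.error.URLError, OSError) as e:
        logger.warning(f"urllib download failed: {e}")
        return False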
def extract_and_merge(chunk_files, config):
    """Extract JSONL from chunks and merge"""
    logger.info("\n📦 Extracting and merging...")
    work_dir = Path('/tmp/work')
    all_records = []
    campos = config.get('campos_filter', ['Id', 'ementa'])
    for chunk_file in chunk_files:
        try:
            logger.info(f"\n 📄 Processing: {chunk_file.name}")
            with tarfile.open(chunk_file, 'r:gz') as tar:
                # List the archive members
                members = tar.getmembers()
                logger.info(f" Members: {[m.name for m in members]}")
                # Look for a JSONL file
                jsonl_member = None
                for member in members:
                    if member.name.endswith('.jsonl'):
                        jsonl_member = member
                        break
                if not jsonl_member:
                    logger.warning(" ⚠️ No .jsonl file found!")
                    continue
                # Extract and process
                f = tar.extractfile(jsonl_member)
                if f is None:  # directories and links yield None
                    logger.warning(" ⚠️ Member is not a regular file!")
                    continue
                content = f.read().decode('utf-8')
                lines = content.strip().split('\n')
                logger.info(f" Lines: {len(lines)}")
                for line_num, line in enumerate(lines, 1):
                    if not line.strip():
                        continue
                    try:
                        record = json.loads(line)
                        # Keep only the configured fields
                        filtered = {k: v for k, v in record.items() if k in campos}
                        if 'Id' in filtered:  # Validate the required field
                            all_records.append(filtered)
                        else:
                            logger.warning(f" ⚠️ Line {line_num} missing 'Id'")
                    except json.JSONDecodeError as e:
                        logger.warning(f" ⚠️ Line {line_num} invalid JSON: {e}")
        except Exception as e:
            logger.error(f" ❌ Error extracting {chunk_file}: {e}")
            import traceback
            logger.error(traceback.format_exc())
    logger.info(f"\n ✅ Total records extracted: {len(all_records):,}")
    # Save all_filtered.jsonl
    output_file = work_dir / 'all_filtered.jsonl'
    with open(output_file, 'w', encoding='utf-8') as f:
        for record in all_records:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
    output_size = output_file.stat().st_size
    logger.info(f" ✅ Saved: {output_file} ({output_size:,} bytes)")
    return output_file
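
# Worked example of the field filter above (values are made up): an input line
#   {"Id": "123", "ementa": "...", "outro_campo": "..."}
# with the default campos_filter ['Id', 'ementa'] is written out as
#   {"Id": "123", "ementa": "..."}
# while a line without an "Id" key is dropped and logged as a warning.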
def create_empty_faiss(output_dir):
    """Create an empty FAISS index as a fallback"""
    logger.warning("\n⚠️ Creating empty FAISS index (no data available)")
    try:
        from langchain_community.embeddings import HuggingFaceEmbeddings
        from langchain_community.vectorstores import FAISS
        from langchain.schema import Document
        # Create the embeddings model
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'}
        )
        # Create a dummy document
        dummy_doc = Document(
            page_content="System initializing. Please wait while data is processed.",
            metadata={"id": "dummy", "status": "initializing"}
        )
        # Build FAISS with the single doc
        vectorstore = FAISS.from_documents([dummy_doc], embeddings)
        # Save it
        vectorstore.save_local(output_dir)
        logger.info(f" ✅ Empty FAISS created at {output_dir}")
        return True
    except Exception as e:
        logger.error(f" ❌ Failed to create empty FAISS: {e}")
        return False
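
# Loading the saved index back elsewhere (a sketch; recent langchain_community
# releases require opting in to pickle deserialization on load):
#
#   vectorstore = FAISS.load_local(
#       output_dir, embeddings, allow_dangerous_deserialization=True
#   )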
def build_faiss(jsonl_file, config):
    """Build the FAISS index"""
    logger.info("\n🔨 Building FAISS index...")
    # Check that there is data to index
    file_size = jsonl_file.stat().st_size
    if file_size < 10:  # Empty or nearly empty file
        logger.warning(f" ⚠️ JSONL file too small ({file_size} bytes)")
        logger.warning(" ⚠️ Creating empty FAISS as fallback")
        output_dir = config.get('faiss_path', '/home/user/app/faiss_index')
        create_empty_faiss(output_dir)
        return
    # Try the normal build (argument list avoids shell quoting issues)
    result = subprocess.run(
        ['python3', 'rag_builder.py',
         '--input', str(jsonl_file),
         '--doc-batch-size', '5000'],
        capture_output=True,
        text=True,
        timeout=3600  # 1 hour
    )
    if result.returncode == 0:
        logger.info(" ✅ FAISS build complete!")
    else:
        logger.error(" ❌ FAISS build failed:")
        logger.error(result.stderr[:500])
        # Fallback: create an empty FAISS index
        logger.warning(" ⚠️ Attempting fallback: empty FAISS")
        output_dir = config.get('faiss_path', '/home/user/app/faiss_index')
        if create_empty_faiss(output_dir):
            logger.info(" ✅ Fallback successful")
        else:
            raise RuntimeError("FAISS build failed and fallback failed")
def main():
    try:
        config = load_config()
        # 1. Download
        chunks = download_chunks(config)
        if not chunks:
            logger.error("\n❌ No chunks downloaded successfully!")
            logger.warning("\n⚠️ Creating empty FAISS to allow app to start")
            output_dir = config.get('faiss_path', '/home/user/app/faiss_index')
            if create_empty_faiss(output_dir):
                logger.info("\n✅ App will start with empty index")
                logger.info(" Configure the correct GitHub repo and rebuild")
                return
            else:
                raise RuntimeError("No data and failed to create empty FAISS")
        # 2. Extract + merge
        jsonl_file = extract_and_merge(chunks, config)
        # 3. Build FAISS
        build_faiss(jsonl_file, config)
        logger.info("\n" + "=" * 80)
        logger.info("✅ SETUP COMPLETE!")
        logger.info("=" * 80)
    except Exception as e:
        logger.error(f"\n❌ Setup failed: {e}")
        import traceback
        logger.error(traceback.format_exc())
        # Last resort: empty FAISS
        logger.warning("\n⚠️ Last resort: creating empty FAISS")
        try:
            config = load_config()
            output_dir = config.get('faiss_path', '/home/user/app/faiss_index')
            create_empty_faiss(output_dir)
            logger.info("\n⚠️ App will start but with no data")
            logger.info(" Fix config.yaml and rebuild the Space")
        except Exception:  # a bare except would also swallow SystemExit/KeyboardInterrupt
            logger.error("\n❌ Complete failure. Check logs.")
            raise

if __name__ == '__main__':
    main()