File size: 3,476 Bytes
9514a77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python3
import os, sys, yaml, json, subprocess, logging
from pathlib import Path
from datetime import datetime

# Root logging setup: timestamped INFO-level messages for the whole script.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# JSON file where update_status() writes progress; presumably polled by
# another process (e.g. a web UI) — TODO confirm against the consumer.
STATUS_FILE = Path('/tmp/setup_status.json')
# Sentinel file: its existence means a previous run finished building FAISS.
READY_FLAG = Path('/tmp/faiss_ready')

def update_status(status, message, progress=0):
    """Persist the current setup phase to STATUS_FILE and log it.

    Writes a JSON snapshot ({status, message, progress, timestamp}) so an
    external reader can follow the build, then mirrors it to the logger.
    """
    snapshot = {
        'status': status,
        'message': message,
        'progress': progress,
        'timestamp': datetime.now().isoformat(),
    }
    with STATUS_FILE.open('w') as fh:
        json.dump(snapshot, fh)
    logger.info(f"[{progress}%] {status}: {message}")
    # Force any buffered stdout (e.g. from child output) out immediately.
    sys.stdout.flush()

def run_cmd(cmd, desc):
    """Run *cmd* through the shell and return its captured stdout.

    Parameters
    ----------
    cmd : str
        Shell command line. ``shell=True`` is required here: callers rely
        on globbing, ``||``, output redirection and ``find -exec``.
    desc : str
        Human-readable step name used in log and error messages.

    Raises
    ------
    RuntimeError
        On a non-zero exit status. The message now includes stderr so the
        'error' status written by the caller shows the real cause instead
        of only the step name. RuntimeError subclasses Exception, so the
        existing ``except Exception`` handler in main() still catches it.
    """
    logger.info(f"Executando: {desc}")
    # NOTE(review): shell=True with values interpolated from config.yaml is
    # a command-injection risk if the config is untrusted — confirm its
    # provenance before exposing this script to external input.
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        logger.error(f"ERRO: {result.stderr}")
        # Bug fix: previously stderr was dropped from the exception, so the
        # persisted error status carried no diagnostic detail.
        raise RuntimeError(f"{desc} falhou: {result.stderr.strip()}")
    logger.info(f"✅ {desc}")
    return result.stdout

def main():
    """Clone jurisprudence chunk tarballs, build the FAISS index, report progress.

    Each phase is published through update_status() (0-100%). On success
    READY_FLAG is touched so reruns short-circuit to 'ready'; on any failure
    the 'error' status is written and the process exits with code 1.
    """
    try:
        logger.info("="*80)
        logger.info("🚀 PARA.AI RAG - SETUP EM BACKGROUND")
        logger.info("="*80)

        update_status('loading', 'Carregando configuração', 0)
        with open('config.yaml') as f:
            config = yaml.safe_load(f)

        # A previous successful run already built the index — nothing to do.
        if READY_FLAG.exists():
            logger.info("✅ FAISS já pronto!")
            update_status('ready', 'FAISS já existe', 100)
            return

        update_status('cloning', 'Clonando chunks', 10)
        os.makedirs('/tmp/repo', exist_ok=True)
        os.chdir('/tmp/repo')

        # Blobless sparse clone: fetch repo metadata only, then pull just the
        # chunk tarballs selected below.
        run_cmd(f"git clone --filter=blob:none --sparse {config['github_repo']} .", "Git clone")
        run_cmd("git sparse-checkout init --cone", "Sparse init")

        patterns = [f"chunks_dados/chunk_dados_{i:04d}.tar.gz" for i in range(config['chunk_start'], config['chunk_end']+1)]
        # Add paths in batches of 50 to keep each command line short.
        for i in range(0, len(patterns), 50):
            run_cmd(f"git sparse-checkout add {' '.join(patterns[i:i+50])}", f"Batch {i//50+1}")

        update_status('extracting', 'Descompactando', 30)
        os.makedirs('/tmp/extracted', exist_ok=True)
        # Raw string: '\;' must reach the shell literally for find -exec
        # (previously an invalid escape — SyntaxWarning on Python 3.12+).
        run_cmd(r"find chunks_dados -name '*.tar.gz' -exec tar -xzf {} -C /tmp/extracted \; 2>/dev/null || true", "Extract")

        update_status('concatenating', 'Concatenando JSONL', 50)
        run_cmd(r"find /tmp/extracted -name 'jurisprudencias.jsonl' -exec cat {} \; > /tmp/all_records.jsonl 2>/dev/null || true", "Concat")

        total = int(run_cmd("wc -l < /tmp/all_records.jsonl 2>/dev/null || echo '0'", "Count").strip())
        logger.info(f"✅ {total:,} registros")

        update_status('filtering', 'Filtrando campos', 60)
        # Helper scripts live in the app directory, so switch back before
        # invoking them.
        os.chdir('/home/user/app')
        run_cmd("python3 filter_fields.py --input /tmp/all_records.jsonl --output /tmp/filtered.jsonl", "Filter")

        update_status('building', 'Construindo FAISS', 70)
        run_cmd("python3 rag_builder.py --input /tmp/filtered.jsonl", "Build FAISS")

        update_status('cleaning', 'Limpando', 95)
        run_cmd("rm -rf /tmp/repo /tmp/extracted /tmp/all_records.jsonl /tmp/filtered.jsonl", "Clean")

        update_status('ready', f'FAISS pronto com {total:,} registros!', 100)
        # Touch the flag only after everything above succeeded.
        READY_FLAG.touch()

        logger.info("="*80)
        logger.info("✅ SETUP COMPLETO!")
        logger.info("="*80)

    except Exception as e:
        logger.error(f"❌ ERRO: {e}")
        update_status('error', f'Build FAISS falhou: {str(e)}', 0)
        sys.exit(1)

# Script entry point: run the background setup only when executed directly.
if __name__ == "__main__":
    main()