File size: 4,110 Bytes
d9446cd
 
 
 
 
 
 
 
 
 
cb3b8cb
d9446cd
 
 
cb3b8cb
 
 
 
d9446cd
 
 
 
 
cb3b8cb
 
 
d9446cd
 
cb3b8cb
 
d9446cd
 
 
 
 
cb3b8cb
d9446cd
 
 
 
 
 
 
 
 
 
 
 
cb3b8cb
 
d9446cd
 
cb3b8cb
 
d9446cd
 
 
cb3b8cb
 
 
 
 
 
 
 
 
d9446cd
 
cb3b8cb
d9446cd
 
cb3b8cb
d9446cd
cb3b8cb
 
 
d9446cd
cb3b8cb
 
d9446cd
cb3b8cb
d9446cd
 
cb3b8cb
d9446cd
cb3b8cb
 
 
d9446cd
cb3b8cb
 
 
d9446cd
cb3b8cb
 
d9446cd
 
 
cb3b8cb
d9446cd
 
 
cb3b8cb
d9446cd
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python3
import os
import sys
import yaml
import json
import subprocess
import logging
from pathlib import Path
from datetime import datetime

# Root logger: timestamped INFO output for the background process log.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# STATUS_FILE is rewritten by update_status() after every phase; presumably an
# external process polls it for progress — verify against the caller.
STATUS_FILE = Path('/tmp/setup_status.json')
# READY_FLAG is touched only after a fully successful run (see main()); its
# existence short-circuits subsequent runs.
READY_FLAG = Path('/tmp/faiss_ready')

def update_status(status, message, progress=0):
    """Persist the current setup phase to STATUS_FILE and log it.

    Args:
        status: short machine-readable phase name (e.g. 'cloning', 'ready', 'error').
        message: human-readable description of the phase.
        progress: integer percentage (0-100) for progress reporting.
    """
    data = {
        'status': status,
        'message': message,
        'progress': progress,
        'timestamp': datetime.now().isoformat(),
    }
    # Write atomically (temp file + rename) so a concurrent reader polling
    # STATUS_FILE never observes a partially written JSON document.
    tmp_path = STATUS_FILE.with_suffix('.tmp')
    with open(tmp_path, 'w') as f:
        json.dump(data, f)
    os.replace(tmp_path, STATUS_FILE)
    # Lazy %-formatting: the string is only built if INFO is enabled.
    logger.info("[%s%%] %s: %s", progress, status, message)
    sys.stdout.flush()

def run_cmd(cmd, desc):
    """Run a shell command, logging the outcome.

    Args:
        cmd: command line executed through the shell (callers rely on pipes
            and redirection, so shell=True is required here).
        desc: short human-readable description used in log messages.

    Returns:
        The command's captured stdout as a string.

    Raises:
        RuntimeError: if the command exits non-zero. stderr is included in the
            message so the real failure reason reaches the status file (main()
            stores str(e)), instead of only the log.
    """
    logger.info("Executando: %s", desc)
    # NOTE(security): shell=True with interpolated strings is only safe while
    # cmd is built from trusted config values, never from user input.
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        logger.error("ERRO: %s", result.stderr)
        raise RuntimeError(f"{desc} falhou: {result.stderr.strip()}")
    logger.info("✅ %s", desc)
    return result.stdout

def main():
    """Orchestrate the background setup: clone data chunks via git sparse
    checkout, extract and concatenate them, filter fields, build the FAISS
    index, and clean up temporaries.

    Progress is reported through update_status() at each phase. On any failure
    the status file is set to 'error' and the process exits with code 1.
    """
    try:
        logger.info("=" * 80)
        logger.info("🚀 PARA.AI RAG (LangChain) - SETUP EM BACKGROUND")
        logger.info("=" * 80)

        update_status('loading', 'Carregando configuração', 0)
        with open('config.yaml') as f:
            config = yaml.safe_load(f)

        # cluster_id is read but unused below — presumably kept so a missing
        # key fails fast here; confirm before removing.
        cluster_id = config['cluster_id']
        chunk_start = config['chunk_start']
        chunk_end = config['chunk_end']
        github_repo = config['github_repo']

        # Idempotency guard: a previous successful run already built the index.
        if READY_FLAG.exists():
            logger.info("✅ FAISS já pronto!")
            update_status('ready', 'FAISS já existe', 100)
            return

        # CLONE — blobless sparse clone so only the requested chunk archives
        # are actually downloaded.
        update_status('cloning', 'Clonando chunks (sparse checkout)', 10)
        os.makedirs('/tmp/repo', exist_ok=True)
        os.chdir('/tmp/repo')

        run_cmd(f"git clone --filter=blob:none --sparse {github_repo} .", "Git clone")
        run_cmd("git sparse-checkout init --cone", "Sparse checkout init")

        # Add sparse-checkout patterns in batches of 50 to keep each command
        # line a manageable length.
        patterns = [f"chunks_dados/chunk_dados_{i:04d}.tar.gz" for i in range(chunk_start, chunk_end + 1)]
        for i in range(0, len(patterns), 50):
            batch = ' '.join(patterns[i:i+50])
            run_cmd(f"git sparse-checkout add {batch}", f"Batch {i//50 + 1}")

        chunks_count = int(run_cmd("find chunks_dados -name '*.tar.gz' 2>/dev/null | wc -l", "Contar chunks").strip())
        logger.info(f"✅ {chunks_count} chunks clonados")

        # EXTRACT — raw strings keep find's literal '\;' terminator without
        # Python's invalid-escape-sequence SyntaxWarning (same runtime bytes).
        update_status('extracting', f'Descompactando {chunks_count} chunks', 30)
        os.makedirs('/tmp/extracted', exist_ok=True)
        run_cmd(r"find chunks_dados -name '*.tar.gz' -exec tar -xzf {} -C /tmp/extracted \; 2>/dev/null || true", "Descompactar")

        # CONCAT — merge every extracted JSONL into a single file.
        update_status('concatenating', 'Concatenando JSONL', 50)
        run_cmd(r"find /tmp/extracted -name 'jurisprudencias.jsonl' -exec cat {} \; > /tmp/all_records.jsonl 2>/dev/null || true", "Concatenar")

        total_records = int(run_cmd("wc -l < /tmp/all_records.jsonl 2>/dev/null || echo '0'", "Contar registros").strip())
        logger.info(f"✅ {total_records:,} registros")

        # FILTER — keep only the fields the index needs (id + ementa).
        update_status('filtering', 'Filtrando campos (id + ementa)', 60)
        os.chdir('/home/user/app')
        run_cmd("python3 filter_fields.py --input /tmp/all_records.jsonl --output /tmp/filtered.jsonl", "Filtrar")

        # BUILD FAISS
        update_status('building', 'Construindo FAISS index (pode demorar)', 70)
        run_cmd("python3 rag_builder.py --input /tmp/filtered.jsonl", "Build FAISS")

        # CLEANUP
        update_status('cleaning', 'Limpando temporários', 95)
        run_cmd("rm -rf /tmp/repo /tmp/extracted /tmp/all_records.jsonl /tmp/filtered.jsonl", "Limpar")

        # DONE — touch the flag only after everything succeeded, so a crash
        # never leaves a stale ready marker behind.
        update_status('ready', f'FAISS pronto com {total_records:,} registros!', 100)
        READY_FLAG.touch()

        logger.info("=" * 80)
        logger.info("✅ SETUP COMPLETO!")
        logger.info("=" * 80)

    except Exception as e:
        # logger.exception records the full traceback, not just the message.
        logger.exception(f"❌ ERRO: {e}")
        update_status('error', str(e), 0)
        sys.exit(1)

if __name__ == "__main__":
    main()