File size: 3,878 Bytes
9514a77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python3
import os, sys, json, argparse, logging, traceback
from pathlib import Path
from typing import List, Dict
from langchain.docstore.document import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def load_jsonl(filepath: str) -> List[Dict]:
    """Load a JSON-Lines file into a list of dicts.

    Blank lines are skipped. Progress is logged every 50,000 lines so
    very large corpora show signs of life.

    Args:
        filepath: Path to the .jsonl file (one JSON object per line).

    Returns:
        List of parsed records, in file order.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If a line is not valid JSON; the offending
            line number is logged before re-raising.
    """
    records: List[Dict] = []
    try:
        logger.info(f"📂 Abrindo: {filepath}")
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"Arquivo não encontrado: {filepath}")
        with open(filepath, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f, 1):
                if line.strip():
                    try:
                        records.append(json.loads(line))
                    except json.JSONDecodeError:
                        # Pinpoint the bad line; in a multi-GB corpus the
                        # bare decoder error is useless without this.
                        logger.error(f"❌ JSON inválido na linha {i}")
                        raise
                if i % 50000 == 0:
                    logger.info(f"  {i:,} linhas...")
        logger.info(f"✅ {len(records):,} registros")
        return records
    except Exception as e:
        logger.error(f"❌ Erro: {e}")
        raise

def create_documents(records: List[Dict]) -> List[Document]:
    """Convert raw TJPR records into LangChain Documents.

    Records whose 'ementa' field is missing or empty are skipped.
    Logs progress every 50,000 records and a final document count.
    """
    total = len(records)
    docs: List[Document] = []
    for idx, rec in enumerate(records, 1):
        texto = rec.get('ementa', '')
        if texto:
            meta = {'id': str(rec.get('id', f'u{idx}')), 'source': 'tjpr'}
            docs.append(Document(page_content=texto, metadata=meta))
        if idx % 50000 == 0:
            logger.info(f"  {idx:,}/{total:,}...")
    logger.info(f"✅ {len(docs):,} documentos")
    return docs

def build_vectorstore(input_file, output_dir='/app/faiss_index', model_name='sentence-transformers/all-MiniLM-L6-v2', batch_size=16):
    """Build and persist a FAISS vector store from a JSONL corpus.

    Pipeline: load JSONL -> wrap records as Documents -> embed on CPU
    with a HuggingFace sentence-transformers model -> build FAISS ->
    save the index to disk.

    Args:
        input_file: Path to the JSONL file with 'ementa' records.
        output_dir: Directory where the FAISS index is saved (created
            if missing).
        model_name: HuggingFace sentence-transformers model id.
        batch_size: Embedding batch size (small default for CPU).

    Returns:
        The built FAISS vectorstore.

    Raises:
        ValueError: If the input yields no records or no documents.
        Exception: Any other failure is logged with traceback and
            re-raised.
    """
    # Hoisted out of the try so a (hypothetical) import failure isn't
    # reported as a pipeline error.
    import time
    try:
        logger.info("="*80)
        logger.info("🚀 RAG Builder - LangChain + FAISS")
        logger.info("="*80)

        logger.info("\nPASSO 1/5: Carregando JSONL")
        records = load_jsonl(input_file)
        if not records:
            raise ValueError("Nenhum registro!")

        logger.info("\nPASSO 2/5: Criando Documents")
        documents = create_documents(records)
        if not documents:
            raise ValueError("Nenhum documento!")

        logger.info(f"\nPASSO 3/5: Inicializando Embeddings ({model_name})")
        embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'batch_size': batch_size, 'show_progress_bar': True, 'normalize_embeddings': True}
        )
        logger.info("✅ Embeddings OK")

        logger.info(f"\nPASSO 4/5: Construindo FAISS ({len(documents):,} docs)")
        start = time.time()
        vectorstore = FAISS.from_documents(documents, embeddings)
        # Measure once: the original called time.time() twice, so the
        # logged seconds and docs/s used inconsistent elapsed values.
        elapsed = time.time() - start
        rate = len(documents) / elapsed if elapsed > 0 else float('inf')
        logger.info(f"✅ FAISS em {elapsed:.1f}s ({rate:.0f} docs/s)")

        logger.info(f"\nPASSO 5/5: Salvando em {output_dir}")
        os.makedirs(output_dir, exist_ok=True)
        vectorstore.save_local(output_dir)
        logger.info("✅ Salvo!")

        logger.info("\n" + "="*80)
        logger.info("✅ BUILD COMPLETO!")
        logger.info("="*80)
        return vectorstore
    except Exception as e:
        logger.error("\n" + "="*80)
        logger.error(f"❌ ERRO: {type(e).__name__}: {e}")
        logger.error(traceback.format_exc())
        logger.error("="*80)
        raise

def main():
    """CLI entry point: parse flags and run the vectorstore build."""
    cli = argparse.ArgumentParser()
    cli.add_argument('--input', required=True)
    cli.add_argument('--output', default='/app/faiss_index')
    cli.add_argument('--model', default='sentence-transformers/all-MiniLM-L6-v2')
    cli.add_argument('--batch-size', type=int, default=16)
    opts = cli.parse_args()
    build_vectorstore(opts.input, opts.output, opts.model, opts.batch_size)


if __name__ == '__main__':
    main()