#!/usr/bin/env python3
"""Build a FAISS vector store from a JSONL file of records.

Every non-blank input line is parsed as JSON. For each parsed object, the
``ementa`` field becomes the document text and ``id`` is stored in the
document metadata. The resulting index is saved to an existing directory.
"""
import argparse
import json
import logging
import os
import sys
import time
import traceback
from pathlib import Path
from typing import Any, List

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logger = logging.getLogger(__name__)


def load_jsonl(filepath: str) -> List[Any]:
    """Read a JSON Lines file and return the successfully parsed values.

    Blank lines are ignored. Lines that are not valid JSON are skipped
    (best-effort loading), but the number of skipped lines is reported so
    data problems do not go unnoticed.

    Args:
        filepath: Path of the ``.jsonl`` file to read (UTF-8).

    Returns:
        The parsed JSON values, in file order.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
    """
    logger.info(f"📂 {filepath}")
    if not Path(filepath).exists():
        raise FileNotFoundError(f"Não encontrado: {filepath}")

    records: List[Any] = []
    skipped = 0
    with open(filepath, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, 1):
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Only malformed JSON is tolerated; anything else (including
                # KeyboardInterrupt) must propagate.
                skipped += 1
                logger.debug(f"Linha {line_no} inválida: {exc}")

    if skipped:
        logger.warning(f"⚠️ {skipped:,} linhas inválidas ignoradas")
    logger.info(f"✅ {len(records):,} registros")
    return records


def create_documents(records):
    """Convert parsed records into LangChain ``Document`` objects.

    Records without a non-blank ``ementa`` are dropped. Records that are not
    JSON objects (dicts) are dropped as well, since they carry no fields.
    When a record has no ``id``, ``doc_<position>`` is used, where position
    is the 1-based index of the record in ``records``.

    Args:
        records: Iterable of parsed JSON values, typically dicts.

    Returns:
        A non-empty list of ``Document`` instances.

    Raises:
        ValueError: If no record yields a document.
    """
    logger.info("📝 Criando Documents...")
    # Imported lazily so the module can be loaded without langchain installed.
    from langchain_core.documents import Document

    documents = []
    malformed = 0
    for i, record in enumerate(records, 1):
        if not isinstance(record, dict):
            # A JSON line holding a list/string/number has no ``.get``.
            malformed += 1
            continue
        ementa = record.get('ementa', '')
        doc_id = record.get('id', f'doc_{i}')
        if not ementa or not str(ementa).strip():
            continue
        documents.append(Document(
            page_content=str(ementa),
            metadata={'id': str(doc_id), 'source': 'tjpr'},
        ))

    if malformed:
        logger.warning(f"⚠️ {malformed:,} registros ignorados (não são objetos JSON)")
    logger.info(f"✅ {len(documents):,} Documents")
    if not documents:
        raise ValueError("Nenhum documento!")
    return documents


def build_vectorstore(input_file, output_dir, model_name, batch_size):
    """Run the full pipeline: load JSONL, embed on CPU, build and save FAISS.

    Any failure is logged together with its traceback and terminates the
    process with exit status 1.

    Args:
        input_file: Path of the JSONL input file.
        output_dir: Existing directory that will receive the saved index.
        model_name: Model name passed to ``HuggingFaceEmbeddings``.
        batch_size: Batch size passed to the encoder via ``encode_kwargs``.
    """
    try:
        logger.info("="*80)
        logger.info("🚀 RAG Builder v3.5")
        logger.info("="*80)

        # Fail fast: validate the destination before the expensive embedding
        # step so a wrong path does not waste the whole computation.
        if not os.path.isdir(output_dir):
            logger.error(f" ❌ Diretório não existe: {output_dir}")
            raise FileNotFoundError(f"Diretório não encontrado: {output_dir}")

        logger.info("\n1️⃣ Carregando...")
        records = load_jsonl(input_file)

        logger.info("\n2️⃣ Documents...")
        documents = create_documents(records)

        logger.info("\n3️⃣ Embeddings...")
        from langchain_community.embeddings import HuggingFaceEmbeddings
        embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={'device': 'cpu'},
            encode_kwargs={
                'batch_size': batch_size,
                'normalize_embeddings': True,
            },
        )
        logger.info(" ✅ OK")

        logger.info(f"\n4️⃣ FAISS ({len(documents):,} docs)...")
        from langchain_community.vectorstores import FAISS
        start = time.time()
        vectorstore = FAISS.from_documents(documents, embeddings)
        elapsed = time.time() - start
        logger.info(f" ✅ {elapsed:.1f}s")

        logger.info(f"\n5️⃣ Salvando em {output_dir}...")
        vectorstore.save_local(output_dir)
        logger.info(" ✅ Salvo!")

        logger.info("\n✅ COMPLETO!")
    except Exception as e:
        # Top-level boundary of the script: report and exit non-zero.
        logger.error(f"\n❌ {type(e).__name__}: {e}")
        logger.error(traceback.format_exc())
        sys.exit(1)


def main():
    """Parse command-line arguments and build the vector store."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True)
    parser.add_argument('--output', default='/home/user/app/faiss_index')
    parser.add_argument('--model', default='sentence-transformers/all-MiniLM-L6-v2')
    parser.add_argument('--batch-size', type=int, default=16)
    args = parser.parse_args()
    build_vectorstore(args.input, args.output, args.model, args.batch_size)


if __name__ == '__main__':
    main()