# NOTE(review): the lines "Spaces: / Sleeping / Sleeping" were Hugging Face
# Spaces UI status residue captured along with the source, not program code.
#!/usr/bin/env python3
"""Build a FAISS vector store from a JSONL corpus of TJPR 'ementa' texts
using LangChain Documents and HuggingFace sentence embeddings."""
import os, sys, json, argparse, logging, traceback
from pathlib import Path
from typing import List, Dict
from langchain.docstore.document import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Module-wide logger: all progress and error reporting goes through it.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def load_jsonl(filepath: str) -> List[Dict]:
    """Load a JSON Lines file into a list of dict records.

    Blank lines are skipped; progress is logged every 50,000 lines.

    Args:
        filepath: Path to the .jsonl input file.

    Returns:
        One dict per non-blank line, in file order.

    Raises:
        FileNotFoundError: If *filepath* does not exist.
        json.JSONDecodeError: If a line is not valid JSON (the offending
            line number is logged before re-raising).
    """
    records: List[Dict] = []
    try:
        logger.info(f"📂 Abrindo: {filepath}")
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"Arquivo não encontrado: {filepath}")
        with open(filepath, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f, 1):
                if line.strip():
                    try:
                        records.append(json.loads(line))
                    except json.JSONDecodeError:
                        # Name the exact line so a corrupt record in a huge
                        # corpus can be located without bisecting the file.
                        logger.error(f"❌ Erro: JSON inválido na linha {i}")
                        raise
                if i % 50000 == 0:
                    logger.info(f" {i:,} linhas...")
        logger.info(f"✅ {len(records):,} registros")
        return records
    except Exception as e:
        logger.error(f"❌ Erro: {e}")
        raise
def create_documents(records: List[Dict]) -> List[Document]:
    """Convert raw JSONL records into LangChain Documents.

    Only records whose 'ementa' field is non-empty produce a Document.
    Metadata carries a string id (falling back to 'u<index>' when the
    record has no 'id') and a fixed source tag. Progress is logged every
    50,000 records.
    """
    total = len(records)
    docs: List[Document] = []
    for idx, rec in enumerate(records, 1):
        text = rec.get('ementa', '')
        if text:
            meta = {'id': str(rec.get('id', f'u{idx}')), 'source': 'tjpr'}
            docs.append(Document(page_content=text, metadata=meta))
        if idx % 50000 == 0:
            logger.info(f" {idx:,}/{total:,}...")
    logger.info(f"✅ {len(docs):,} documentos")
    return docs
def build_vectorstore(input_file, output_dir='/app/faiss_index', model_name='sentence-transformers/all-MiniLM-L6-v2', batch_size=16):
    """Build and persist a FAISS vector store from a JSONL corpus.

    Pipeline: load JSONL -> create Documents -> init HuggingFace
    embeddings (CPU) -> build FAISS index -> save to *output_dir*.

    Args:
        input_file: Path to the input .jsonl file (records with 'ementa').
        output_dir: Directory where the FAISS index is persisted.
        model_name: sentence-transformers model identifier.
        batch_size: Embedding batch size.

    Returns:
        The built FAISS vectorstore.

    Raises:
        ValueError: If the input yields no records or no documents.
    """
    import time  # only needed for the timing log below
    try:
        logger.info("="*80)
        logger.info("🚀 RAG Builder - LangChain + FAISS")
        logger.info("="*80)
        logger.info("\nPASSO 1/5: Carregando JSONL")
        records = load_jsonl(input_file)
        if not records:
            raise ValueError("Nenhum registro!")
        logger.info("\nPASSO 2/5: Criando Documents")
        documents = create_documents(records)
        if not documents:
            raise ValueError("Nenhum documento!")
        logger.info(f"\nPASSO 3/5: Inicializando Embeddings ({model_name})")
        embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'batch_size': batch_size, 'show_progress_bar': True, 'normalize_embeddings': True}
        )
        logger.info("✅ Embeddings OK")
        logger.info(f"\nPASSO 4/5: Construindo FAISS ({len(documents):,} docs)")
        start = time.time()
        vectorstore = FAISS.from_documents(documents, embeddings)
        # Sample the clock once so the reported duration and the docs/s
        # throughput are computed from the same elapsed value (the original
        # called time.time() twice, yielding inconsistent numbers), and
        # guard against a zero-division on a near-instant build.
        elapsed = time.time() - start
        rate = len(documents) / elapsed if elapsed > 0 else float('inf')
        logger.info(f"✅ FAISS em {elapsed:.1f}s ({rate:.0f} docs/s)")
        logger.info(f"\nPASSO 5/5: Salvando em {output_dir}")
        os.makedirs(output_dir, exist_ok=True)
        vectorstore.save_local(output_dir)
        logger.info("✅ Salvo!")
        logger.info("\n" + "="*80)
        logger.info("✅ BUILD COMPLETO!")
        logger.info("="*80)
        return vectorstore
    except Exception as e:
        logger.error("\n" + "="*80)
        logger.error(f"❌ ERRO: {type(e).__name__}: {e}")
        logger.error(traceback.format_exc())
        logger.error("="*80)
        raise
def main():
    """CLI entry point: parse arguments and build the vector store."""
    parser = argparse.ArgumentParser(
        description='Build a FAISS index from a JSONL corpus (LangChain + HuggingFace embeddings).'
    )
    parser.add_argument('--input', required=True, help='Path to the input .jsonl file')
    parser.add_argument('--output', default='/app/faiss_index', help='Directory to save the FAISS index')
    parser.add_argument('--model', default='sentence-transformers/all-MiniLM-L6-v2',
                        help='sentence-transformers model name')
    parser.add_argument('--batch-size', type=int, default=16, help='Embedding batch size')
    args = parser.parse_args()
    build_vectorstore(args.input, args.output, args.model, args.batch_size)
# Script entry point: run the full build when executed directly.
if __name__ == '__main__':
    main()