|
|
|
|
|
import os, sys, json, argparse, logging, time |
|
|
from pathlib import Path |
|
|
|
|
|
# Module-wide logging: timestamped INFO-level messages for pipeline progress.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')


logger = logging.getLogger(__name__)
|
|
|
|
|
def load_jsonl(filepath: str) -> list:
    """Load a JSONL file, returning one parsed record per non-blank line.

    Lines that fail to parse as JSON are skipped, but — unlike the previous
    bare ``except: pass`` — each failure is now counted and reported so
    silent data loss becomes visible in the log.

    Args:
        filepath: Path to the UTF-8 encoded ``.jsonl`` file.

    Returns:
        List of successfully parsed records (typically dicts).

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
    """
    log = logging.getLogger(__name__)
    log.info(f"📂 {filepath}")

    if not Path(filepath).exists():
        raise FileNotFoundError(f"Não encontrado: {filepath}")

    records = []
    skipped = 0
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank/whitespace-only lines are legal separators in JSONL dumps.
            if line.strip():
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError:
                    # Count instead of silently swallowing, so corrupt lines
                    # do not disappear without a trace.
                    skipped += 1

    if skipped:
        log.warning(f"⚠️ {skipped:,} linhas inválidas ignoradas")
    log.info(f"✅ {len(records):,} registros")
    return records
|
|
|
|
|
def create_documents(records):
    """Convert raw JSONL records into LangChain ``Document`` objects.

    Each record contributes its ``ementa`` field as the page content;
    records with a missing or blank ``ementa`` are dropped. The record's
    ``id`` (or a positional fallback ``doc_<n>``) and a fixed
    ``source: 'tjpr'`` tag go into the metadata.

    Raises:
        ValueError: When no record yields a usable document.
    """
    logger.info("📝 Criando Documents...")

    from langchain_core.documents import Document

    documents = []
    for position, record in enumerate(records, 1):
        texto = record.get('ementa', '')

        # Skip records with no usable text — empty content would pollute
        # the vectorstore with meaningless embeddings.
        if not texto or not str(texto).strip():
            continue

        identifier = record.get('id', f'doc_{position}')
        documents.append(
            Document(
                page_content=str(texto),
                metadata={'id': str(identifier), 'source': 'tjpr'},
            )
        )

    logger.info(f"✅ {len(documents):,} Documents")

    if not documents:
        raise ValueError("Nenhum documento!")

    return documents
|
|
|
|
|
def build_vectorstore(input_file, output_dir, model_name, batch_size):
    """End-to-end pipeline: JSONL -> Documents -> embeddings -> FAISS index.

    Args:
        input_file: Path to the source ``.jsonl`` file.
        output_dir: Directory where the FAISS index is written. It is now
            created if missing (previously the function raised
            ``FileNotFoundError``, which made every first run fail).
        model_name: HuggingFace sentence-transformers model identifier.
        batch_size: Embedding batch size (CPU inference; keep modest).

    On any failure the full traceback is logged and the process exits with
    status 1 — this function is a script-level boundary, not a library API.
    """
    try:
        logger.info("="*80)
        logger.info("🚀 RAG Builder v3.5")
        logger.info("="*80)

        logger.info("\n1️⃣ Carregando...")
        records = load_jsonl(input_file)

        logger.info("\n2️⃣ Documents...")
        documents = create_documents(records)

        logger.info("\n3️⃣ Embeddings...")
        # Imported lazily: heavy dependency, only needed once the data is valid.
        from langchain_community.embeddings import HuggingFaceEmbeddings

        embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={'device': 'cpu'},
            encode_kwargs={
                'batch_size': batch_size,
                'normalize_embeddings': True
            }
        )
        logger.info(" ✅ OK")

        logger.info(f"\n4️⃣ FAISS ({len(documents):,} docs)...")
        from langchain_community.vectorstores import FAISS

        start = time.time()
        vectorstore = FAISS.from_documents(documents, embeddings)
        elapsed = time.time() - start

        logger.info(f" ✅ {elapsed:.1f}s")

        logger.info(f"\n5️⃣ Salvando em {output_dir}...")

        # Fix: create the target directory instead of refusing to run when
        # it does not exist yet.
        os.makedirs(output_dir, exist_ok=True)

        vectorstore.save_local(output_dir)
        logger.info(f" ✅ Salvo!")

        logger.info("\n✅ COMPLETO!")

    except Exception as e:
        # Top-level boundary: log the error and full traceback, then exit
        # non-zero so calling shells/CI notice the failure.
        logger.error(f"\n❌ {type(e).__name__}: {e}")
        import traceback
        logger.error(traceback.format_exc())
        sys.exit(1)
|
|
|
|
|
def main():
    """Parse CLI arguments and run the vectorstore build pipeline.

    Adds an argparse description and per-flag help text so that
    ``--help`` is actually informative; flags, defaults, and required
    arguments are unchanged.
    """
    parser = argparse.ArgumentParser(
        description='Build a FAISS vectorstore from a JSONL file of ementas.'
    )
    parser.add_argument('--input', required=True,
                        help='Path to the input .jsonl file')
    parser.add_argument('--output', default='/home/user/app/faiss_index',
                        help='Directory where the FAISS index is saved')
    parser.add_argument('--model', default='sentence-transformers/all-MiniLM-L6-v2',
                        help='HuggingFace sentence-transformers model name')
    parser.add_argument('--batch-size', type=int, default=16,
                        help='Embedding batch size (CPU)')
    args = parser.parse_args()

    build_vectorstore(args.input, args.output, args.model, args.batch_size)
|
|
|
|
|
# Run the CLI only when executed directly as a script, not on import.
if __name__ == '__main__':


    main()
|
|
|