# rahPara / rag_builder.py
# caarleexx's picture
# Upload 7 files
# bcc6e2c verified
#!/usr/bin/env python3
import os, sys, json, argparse, logging, time
from pathlib import Path
# Configure the root logger once at import time: INFO level, with each message
# prefixed by its timestamp.
logging.basicConfig(
    format='%(asctime)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger used by the functions below.
logger = logging.getLogger(__name__)
def load_jsonl(filepath: str) -> list:
    """Load every parseable record from a JSON Lines file.

    Blank lines are ignored. Lines that are not valid JSON are skipped so that
    one corrupt line does not abort the whole load, but they are counted and
    reported in a warning instead of being discarded silently.

    Args:
        filepath: Path to the UTF-8 encoded JSONL file.

    Returns:
        The decoded JSON values, one per valid line, in file order.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
    """
    logger.info(f"📂 {filepath}")
    if not Path(filepath).exists():
        raise FileNotFoundError(f"Não encontrado: {filepath}")

    records = []
    skipped = 0
    first_bad_line = None
    with open(filepath, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, 1):
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                # Only malformed JSON is tolerated; anything else (e.g.
                # KeyboardInterrupt, MemoryError) must propagate.
                skipped += 1
                if first_bad_line is None:
                    first_bad_line = line_no

    if skipped:
        logger.warning(
            f"⚠️ {skipped:,} linhas inválidas ignoradas "
            f"(primeira: linha {first_bad_line})"
        )
    logger.info(f"✅ {len(records):,} registros")
    return records
def create_documents(records):
    """Convert loaded records into LangChain ``Document`` objects.

    Each record's ``'ementa'`` value becomes the document text. The metadata
    carries the record's ``'id'`` (as a string) and the fixed source tag
    ``'tjpr'``. Records without usable ``'ementa'`` text are skipped, as are
    records that are not JSON objects (a JSONL line may legally hold a list,
    number or string, which has no fields to read).

    Args:
        records: Iterable of decoded JSON values, normally dicts.

    Returns:
        A non-empty list of ``Document`` instances.

    Raises:
        ValueError: If no record yields a document.
    """
    logger.info("📝 Criando Documents...")
    # Imported inside the function, so importing this module does not itself
    # require langchain to be installed.
    from langchain_core.documents import Document

    documents = []
    malformed = 0
    for i, record in enumerate(records, 1):
        if not isinstance(record, dict):
            malformed += 1
            continue

        ementa = record.get('ementa', '')
        if not ementa or not str(ementa).strip():
            continue

        # Fall back to a positional id when the key is missing OR explicitly
        # null; otherwise a null id would be stored as the string 'None'.
        doc_id = record.get('id')
        if doc_id is None:
            doc_id = f'doc_{i}'

        documents.append(Document(
            page_content=str(ementa),
            metadata={'id': str(doc_id), 'source': 'tjpr'},
        ))

    if malformed:
        logger.warning(
            f"⚠️ {malformed:,} registros ignorados (não são objetos JSON)"
        )
    logger.info(f"✅ {len(documents):,} Documents")
    if not documents:
        raise ValueError("Nenhum documento!")
    return documents
def build_vectorstore(input_file, output_dir, model_name, batch_size):
    """Build a FAISS index from a JSONL file and save it under ``output_dir``.

    Pipeline: validate the output directory, load the records, turn them into
    Documents, create CPU HuggingFace embeddings with normalized vectors,
    build the FAISS index, and save it locally.

    The output directory must already exist. This is checked *before* any
    loading or embedding work, so a wrong path fails immediately instead of
    after the whole index has been computed.

    Args:
        input_file: Path to the JSONL file with the source records.
        output_dir: Existing directory where the index is written.
        model_name: Embedding model name passed to ``HuggingFaceEmbeddings``.
        batch_size: Encoding batch size passed to the embedding model.

    Raises:
        SystemExit: With status 1 if any step fails; the error and its
            traceback are logged first.
    """
    try:
        logger.info("="*80)
        logger.info("🚀 RAG Builder v3.5")
        logger.info("="*80)

        # Fail fast: building the index is the expensive part, so do not
        # start it when the result could not be saved anyway.
        if not os.path.isdir(output_dir):
            logger.error(f" ❌ Diretório não existe: {output_dir}")
            raise FileNotFoundError(f"Diretório não encontrado: {output_dir}")

        logger.info("\n1️⃣ Carregando...")
        records = load_jsonl(input_file)

        logger.info("\n2️⃣ Documents...")
        documents = create_documents(records)

        logger.info("\n3️⃣ Embeddings...")
        from langchain_community.embeddings import HuggingFaceEmbeddings
        embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={'device': 'cpu'},
            encode_kwargs={
                'batch_size': batch_size,
                'normalize_embeddings': True
            }
        )
        logger.info(" ✅ OK")

        logger.info(f"\n4️⃣ FAISS ({len(documents):,} docs)...")
        from langchain_community.vectorstores import FAISS
        start = time.time()
        vectorstore = FAISS.from_documents(documents, embeddings)
        elapsed = time.time() - start
        logger.info(f" ✅ {elapsed:.1f}s")

        logger.info(f"\n5️⃣ Salvando em {output_dir}...")
        vectorstore.save_local(output_dir)
        logger.info(" ✅ Salvo!")
        logger.info("\n✅ COMPLETO!")
    except Exception as e:
        # Top-level boundary of the script: report the failure with its
        # traceback and exit with a non-zero status.
        logger.error(f"\n❌ {type(e).__name__}: {e}")
        import traceback
        logger.error(traceback.format_exc())
        sys.exit(1)
def main():
    """Parse the command-line options and run the index build.

    Options: ``--input`` (required), ``--output``, ``--model`` and
    ``--batch-size`` (integer); the last three have defaults.
    """
    option_specs = (
        ('--input', {'required': True}),
        ('--output', {'default': '/home/user/app/faiss_index'}),
        ('--model', {'default': 'sentence-transformers/all-MiniLM-L6-v2'}),
        ('--batch-size', {'type': int, 'default': 16}),
    )
    cli = argparse.ArgumentParser()
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    opts = cli.parse_args()

    build_vectorstore(
        input_file=opts.input,
        output_dir=opts.output,
        model_name=opts.model,
        batch_size=opts.batch_size,
    )
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()