fintech-coop-api / scripts /load_data.py
TomacGonz's picture
Upload 67 files
a2fe39f verified
import os
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
# Project root: the directory two levels above this script file.
BASE_DIR = os.path.dirname(
    os.path.dirname(os.path.abspath(__file__))
)
# Folder scanned for PDFs: a "knowledge_base" directory one level above BASE_DIR.
FILES_PATH = os.path.join(BASE_DIR, os.pardir, "knowledge_base")
# Destination directory where the FAISS index is saved.
FAISS_INDEX = os.path.join(BASE_DIR, "api", "faiss_index")
def run_ingestion():
    """Build and save the FAISS index from the PDFs under ``FILES_PATH``.

    Loads every ``*.pdf`` found recursively under ``FILES_PATH``, splits the
    loaded documents into overlapping chunks (``chunk_size=1200``,
    ``chunk_overlap=250``), embeds them with the
    ``paraphrase-multilingual-MiniLM-L12-v2`` model and saves the resulting
    FAISS index to ``FAISS_INDEX``.

    Returns:
        str: A status message (in Spanish). Either the success line with the
        number of chunks stored, or a notice that there was nothing to index
        (no PDFs found, or no text could be extracted from them).
    """
    print(f"Iniciando indexación desde -> {FILES_PATH}")

    loader = DirectoryLoader(
        FILES_PATH,
        glob="**/*.pdf",
        loader_cls=PyPDFLoader,
        recursive=True,
    )
    documents = loader.load()
    if not documents:
        return "No se encontraron documentos."

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200,
        chunk_overlap=250,
    )
    texts = text_splitter.split_documents(documents)
    if not texts:
        # PDFs without extractable text (e.g. scanned pages) produce pages
        # but no chunks; FAISS.from_documents fails on an empty list, so
        # report the situation instead of crashing.
        return "No se encontró texto indexable en los documentos."

    # Instantiate the embedding model only once there is something to embed,
    # so the early-return paths above do not pay for loading it.
    embeddings = HuggingFaceEmbeddings(
        model_name="paraphrase-multilingual-MiniLM-L12-v2"
    )

    vector_db = FAISS.from_documents(texts, embeddings)
    vector_db.save_local(FAISS_INDEX)
    return f" Éxito: {len(texts)} fragmentos guardados en {FAISS_INDEX}"
if __name__ == "__main__":
    # Run the ingestion when executed as a script and show its status message.
    status_message = run_ingestion()
    print(status_message)