Spaces:
Running
Running
| import os | |
| from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
# Directory two levels above this file (the parent of the folder that
# contains this script).
BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))

# Folder that holds the source documents; it sits one level above BASE_DIR.
FILES_PATH = os.path.join(BASE_DIR, "..", "knowledge_base")

# Destination folder for the persisted FAISS index.
FAISS_INDEX = os.path.join(os.path.join(BASE_DIR, "api"), "faiss_index")
def run_ingestion() -> str:
    """Build and persist the FAISS index from the PDFs under ``FILES_PATH``.

    Loads every ``*.pdf`` found recursively under ``FILES_PATH``, splits the
    loaded documents into chunks of up to 1200 characters with a 250-character
    overlap, embeds the chunks with the
    ``paraphrase-multilingual-MiniLM-L12-v2`` model and saves the resulting
    index to ``FAISS_INDEX``.

    Returns:
        A status message (in Spanish): the success line with the number of
        chunks written, or the reason why no index was produced.
    """
    print(f"Iniciando indexación desde -> {FILES_PATH}")

    loader = DirectoryLoader(
        FILES_PATH,
        glob="**/*.pdf",
        loader_cls=PyPDFLoader,
        recursive=True,
    )
    documents = loader.load()
    if not documents:
        return "No se encontraron documentos."

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200,
        chunk_overlap=250,
    )
    texts = text_splitter.split_documents(documents)
    if not texts:
        # Documents were loaded but produced no chunks (presumably PDFs
        # without extractable text). FAISS.from_documents raises on an empty
        # list, so report the situation instead of crashing.
        return "No se pudo extraer texto de los documentos."

    # Instantiate the embedding model only once there is something to embed,
    # so the early-return paths above do not pay for loading it.
    embeddings = HuggingFaceEmbeddings(
        model_name="paraphrase-multilingual-MiniLM-L12-v2"
    )
    vector_db = FAISS.from_documents(texts, embeddings)
    vector_db.save_local(FAISS_INDEX)
    return f" Éxito: {len(texts)} fragmentos guardados en {FAISS_INDEX}"
if __name__ == "__main__":
    # When run as a script, perform the ingestion and echo its status message.
    status_message = run_ingestion()
    print(status_message)