| | import os
|
| | import logging
|
| | from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| | from langchain_community.document_loaders import PyPDFLoader
|
| | from langchain_community.vectorstores import FAISS
|
| | from langchain_openai import OpenAIEmbeddings
|
| | from dotenv import load_dotenv
|
| |
|
# --- Module-level setup ---

# Pull environment variables (e.g. OPENAI_API_KEY) from a local .env file.
load_dotenv()

# Module logger, configured once at import time.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Shared embedding client reused by every ingestion call.
embeddings = OpenAIEmbeddings(model='text-embedding-3-small')
|
| |
|
| |
|
def Ingest_Data(pdf_path: str, vector_db_path: str = "vectorstore/db_faiss") -> dict:
    """Ingest a PDF into a FAISS vector store on disk.

    Loads the PDF, splits its text into overlapping chunks, embeds the
    chunks with the module-level ``embeddings`` client, and persists the
    resulting FAISS index.

    Args:
        pdf_path: Path to the PDF file to ingest.
        vector_db_path: Directory where the FAISS index is saved.

    Returns:
        A status dict for the frontend. On success: ``status``,
        ``chunks_processed``, ``db_path`` and ``message`` keys. On
        failure: ``status`` (``"failed"``) and ``error``. An empty PDF
        yields ``status`` ``"error"`` with a ``message``.
    """
    try:
        logger.info("Starting ingestion for: %s", pdf_path)

        if not os.path.exists(pdf_path):
            # Raised here and converted to a status dict by the handler below.
            raise FileNotFoundError(f"The file {pdf_path} was not found.")

        # Load raw pages only. The previous load_and_split() pre-chunked
        # with a *default* splitter and the documents were then split a
        # second time, so the chunk_size/chunk_overlap below never applied
        # to the raw page text as intended.
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()

        if not pages:
            return {"status": "error", "message": "PDF contains no text."}

        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
        docs = splitter.split_documents(pages)
        logger.info("Processing %d chunks...", len(docs))

        # Embed all chunks and persist the index for later retrieval.
        db = FAISS.from_documents(docs, embeddings)
        db.save_local(vector_db_path)

        logger.info("Saved vectorstore to %s", vector_db_path)

        return {
            "status": "success",
            "chunks_processed": len(docs),
            "db_path": vector_db_path,
            "message": "File successfully ingested and indexed.",
        }

    except Exception as e:
        # Boundary handler: report the failure back to the frontend rather
        # than raising. logger.exception also records the traceback.
        logger.exception("Ingestion failed: %s", e)
        return {
            "status": "failed",
            "error": str(e),
        }
|
| |
|
| |
|
| |
|
| | |