import os
import logging
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
load_dotenv()  # load OPENAI_API_KEY etc. from a local .env into the environment
# 1. Setup Logging (Better than print for Servers)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# NOTE(review): constructed at import time, so importing this module requires a
# valid OPENAI_API_KEY in the environment — confirm that fail-fast is intended.
embeddings = OpenAIEmbeddings(model='text-embedding-3-small')
# 2. Add arguments for flexible paths
# 2. Add arguments for flexible paths
def Ingest_Data(pdf_path: str, vector_db_path: str = "vectorstore/db_faiss") -> dict:
    """
    Ingest a PDF, split it into chunks, embed them, and save a FAISS index.

    Args:
        pdf_path: Path to the PDF file to ingest.
        vector_db_path: Directory where the FAISS index is written.

    Returns:
        A JSON-friendly dict to send back to the Frontend:
          - success:   {"status": "success", "chunks_processed", "db_path", "message"}
          - empty PDF: {"status": "error", "message"}
          - failure:   {"status": "failed", "error"}
        NOTE(review): the two failure shapes use different status values and
        keys ("error"/"message" vs "failed"/"error"); kept as-is so existing
        frontend handling is not broken — worth unifying with the caller.
    """
    try:
        # Lazy %-args: the message is only formatted if the level is enabled.
        logger.info("Starting ingestion for: %s", pdf_path)
        # Validation: fail early with a clear message instead of a loader error.
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"The file {pdf_path} was not found.")
        # Load: loader.load() (one Document per page) replaces the deprecated
        # load_and_split(), which redundantly pre-split with a default splitter
        # before the explicit chunking below.
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        if not pages:
            return {"status": "error", "message": "PDF contains no text."}
        # Split into overlapping chunks for retrieval.
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
        docs = splitter.split_documents(pages)
        # Guard the image-only/blank-page case: pages exist but carry no text,
        # so the splitter yields zero chunks.
        if not docs:
            return {"status": "error", "message": "PDF contains no text."}
        logger.info("Processing %d chunks...", len(docs))
        # Embed & Save
        # Note: This is CPU/Network intensive. In FastAPI,
        # ensure you run this in a BackgroundTask or ThreadPool.
        db = FAISS.from_documents(docs, embeddings)
        db.save_local(vector_db_path)
        logger.info("Saved vectorstore to %s", vector_db_path)
        # 3. Return JSON-friendly data
        return {
            "status": "success",
            "chunks_processed": len(docs),
            "db_path": vector_db_path,
            "message": "File successfully ingested and indexed.",
        }
    except Exception as e:
        # logger.exception records the full traceback; logger.error(str(e))
        # silently dropped it, making server-side debugging much harder.
        logger.exception("Ingestion failed: %s", e)
        return {
            "status": "failed",
            "error": str(e),
        }
# Example (local testing): Ingest_Data("MLBOOK.pdf")