File size: 2,206 Bytes
db1e5a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
import logging
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv

# Load environment variables from a local .env file into os.environ
# (presumably OPENAI_API_KEY for the embeddings client below — confirm).
load_dotenv()

# 1. Setup Logging (Better than print for Servers)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Module-level embeddings client, shared by every ingestion call.
# Constructed at import time, so the API key must be available before
# this module is imported.
embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

# 2. Add arguments for flexible paths
# 2. Add arguments for flexible paths
def Ingest_Data(pdf_path: str, vector_db_path: str = "vectorstore/db_faiss") -> dict:
    """Ingest a PDF into an on-disk FAISS vector store.

    Loads the PDF at ``pdf_path``, splits it into overlapping text chunks,
    embeds them with the module-level OpenAI embeddings client, and saves
    the resulting FAISS index to ``vector_db_path``.

    Args:
        pdf_path: Path to the PDF file to ingest.
        vector_db_path: Directory where the FAISS index is written.

    Returns:
        A JSON-friendly dict for the frontend. On success::

            {"status": "success", "chunks_processed": int,
             "db_path": str, "message": str}

        On any failure::

            {"status": "error", "message": str}
    """
    try:
        # Lazy %-args: the string is only built if the record is emitted.
        logger.info("Starting ingestion for: %s", pdf_path)

        # Validation: return a structured error directly instead of raising
        # FileNotFoundError just for the broad handler below to catch it.
        if not os.path.exists(pdf_path):
            return {"status": "error", "message": f"The file {pdf_path} was not found."}

        # Load — one Document per page.
        loader = PyPDFLoader(pdf_path)
        pages = loader.load_and_split()

        if not pages:
            return {"status": "error", "message": "PDF contains no text."}

        # Split pages into overlapping chunks sized for embedding.
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
        docs = splitter.split_documents(pages)
        logger.info("Processing %d chunks...", len(docs))

        # Embed & Save
        # Note: This is CPU/Network intensive. In FastAPI,
        # ensure you run this in a BackgroundTask or ThreadPool.
        db = FAISS.from_documents(docs, embeddings)
        db.save_local(vector_db_path)

        logger.info("Saved vectorstore to %s", vector_db_path)

        # 3. Return JSON-friendly data
        return {
            "status": "success",
            "chunks_processed": len(docs),
            "db_path": vector_db_path,
            "message": "File successfully ingested and indexed.",
        }

    except Exception as e:
        # Boundary handler: logger.exception records the full traceback
        # (logger.error with an f-string did not). The payload now uses the
        # same {"status": "error", "message": ...} shape as the other
        # failure paths — previously this branch alone returned
        # {"status": "failed", "error": ...}, forcing the frontend to
        # handle two different failure contracts.
        logger.exception("Ingestion failed")
        return {"status": "error", "message": str(e)}
    

    
# Example usage: Ingest_Data("MLBOOK.pdf")