| | import os
|
| | import logging
|
| | from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| | from langchain_community.document_loaders import PyPDFLoader
|
| | from langchain_community.vectorstores import FAISS
|
| | from langchain_openai import OpenAIEmbeddings
|
| | from dotenv import load_dotenv
|
| |
|
# --- Module-level setup ---

# Pull environment variables (e.g. OPENAI_API_KEY) from a local .env file.
load_dotenv()

# Module logger, configured once at import time.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Shared embedding client reused by every ingestion call.
embeddings = OpenAIEmbeddings(model='text-embedding-3-small')
|
| |
|
| |
|
def Ingest_Data(pdf_path: str, vector_db_path: str = "vectorstore/db_faiss") -> dict:
    """Ingest a PDF into a FAISS vector store on disk.

    Loads the PDF, splits its text into overlapping chunks, embeds the
    chunks with the module-level ``embeddings`` client, and persists the
    resulting FAISS index.

    Args:
        pdf_path: Path to the PDF file to ingest.
        vector_db_path: Directory where the FAISS index is saved.

    Returns:
        A status dict for the frontend. On success: ``status``,
        ``chunks_processed``, ``db_path`` and ``message`` keys. On
        failure: ``status`` (``"failed"``) and ``error``. An empty PDF
        yields ``status`` ``"error"`` with a ``message``.
    """
    try:
        logger.info("Starting ingestion for: %s", pdf_path)

        if not os.path.exists(pdf_path):
            # Raised here and converted to a status dict by the handler below.
            raise FileNotFoundError(f"The file {pdf_path} was not found.")

        # Load raw pages only. The previous load_and_split() pre-chunked
        # with a *default* splitter and the documents were then split a
        # second time, so the chunk_size/chunk_overlap below never applied
        # to the raw page text as intended.
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()

        if not pages:
            return {"status": "error", "message": "PDF contains no text."}

        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)
        docs = splitter.split_documents(pages)
        logger.info("Processing %d chunks...", len(docs))

        # Embed all chunks and persist the index for later retrieval.
        db = FAISS.from_documents(docs, embeddings)
        db.save_local(vector_db_path)

        logger.info("Saved vectorstore to %s", vector_db_path)

        return {
            "status": "success",
            "chunks_processed": len(docs),
            "db_path": vector_db_path,
            "message": "File successfully ingested and indexed.",
        }

    except Exception as e:
        # Boundary handler: report the failure back to the frontend rather
        # than raising. logger.exception also records the traceback.
        logger.exception("Ingestion failed: %s", e)
        return {
            "status": "failed",
            "error": str(e),
        }
|
| |
|
| |
|
| |
|
| | |