Spaces:

moazx
/

Agentic-Medical-RAG-Chatbot

Sleeping

App Files Files Community

Agentic-Medical-RAG-Chatbot / src /data_loaders.py

moazx

Project setup

c5e1945 4 months ago

raw

history blame contribute delete

7.14 kB

	# Import required libraries
	import pandas as pd
	from pathlib import Path
	from typing import List
	from langchain.schema import Document
	import logging

	# For PDF processing - now using LangChain's PyPDFLoader
	from langchain_community.document_loaders import PyPDFLoader

	# Module-level logger named after this module.
	logger = logging.getLogger(__name__)

	# --- FAQ and company-info loaders ---

	# Resolve the directory that holds the company files (FAQ.csv, info.md).
	# ImportError covers both a missing config module and a config module that
	# does not define COMPANY_INFO_DIR; in either case a warning is logged and a
	# relative placeholder directory is used instead.
	try:
	    from config import COMPANY_INFO_DIR
	except ImportError:
	    logger.warning("COMPANY_INFO_DIR not found in config.py. Using a default placeholder.")
	    COMPANY_INFO_DIR = Path("./company_info")  # Placeholder path, adjust as needed

	def load_faq_documents(faq_path: Path = Path(COMPANY_INFO_DIR) / "FAQ.csv") -> List[Document]:
	    """
	    Load FAQ entries from a CSV file, producing one Document per row.

	    Each row becomes a Document whose content is
	    "Question: <question>\\nAnswer: <answer>". Cells are read as literal
	    text, so an empty cell yields an empty string and values such as
	    "N/A" or "None" are kept verbatim instead of turning into "nan".

	    Args:
	        faq_path: Path (or path string) to the FAQ CSV file. The file must
	            contain 'Question' and 'Answer' columns.

	    Returns:
	        List of Document objects, in row order. The row's DataFrame index
	        label is stored as the string metadata field "doc_id".

	    Raises:
	        FileNotFoundError: If faq_path does not exist.
	        ValueError: If a required column is missing.
	        Exception: Any other error raised while reading the CSV is logged
	            and re-raised unchanged.
	    """
	    # Accept plain strings as well as Path objects; the error handler below
	    # relies on the .name attribute.
	    faq_path = Path(faq_path)

	    try:
	        # Validate file exists
	        if not faq_path.exists():
	            raise FileNotFoundError(f"FAQ file not found at {faq_path}")

	        # Read every cell as text and disable pandas' default NA markers so
	        # FAQ wording is never replaced by NaN. fillna("") is a guard in
	        # case any missing values still remain after parsing.
	        df = pd.read_csv(faq_path, dtype=str, keep_default_na=False).fillna("")

	        # Validate required columns
	        required_cols = ['Question', 'Answer']
	        if not all(col in df.columns for col in required_cols):
	            raise ValueError(f"CSV must contain columns: {required_cols}")

	        documents = [
	            Document(
	                page_content=f"Question: {question}\nAnswer: {answer}",
	                metadata={
	                    "source": "company_faq",
	                    "type": "faq",
	                    "doc_id": f"{idx}",
	                    "filename": faq_path.name,
	                },
	            )
	            for idx, question, answer in zip(df.index, df['Question'], df['Answer'])
	        ]

	        logger.info(f"Loaded {len(documents)} FAQ documents from {faq_path.name}")
	        return documents

	    except Exception as e:
	        logger.error(f"Error loading FAQ documents from {faq_path.name}: {str(e)}")
	        raise


	def load_company_info(info_path: Path = Path(COMPANY_INFO_DIR) / "info.md") -> Document:
	    """
	    Read the company information markdown file into a single Document.

	    Args:
	        info_path: Location of the markdown file holding the company info.

	    Returns:
	        A Document whose content is the full UTF-8 text of the file, tagged
	        with source "company_info" and doc_id "company_info_main".

	    Raises:
	        FileNotFoundError: If info_path does not exist.
	        Exception: Any error hit while reading is logged and re-raised.
	    """
	    try:
	        # Fail early with a descriptive message when the file is missing.
	        if not info_path.exists():
	            raise FileNotFoundError(f"Info file not found at {info_path}")

	        markdown_text = info_path.read_text(encoding='utf-8')

	        info_metadata = {
	            "source": "company_info",
	            "type": "general_info",
	            "filename": info_path.name,
	            "doc_id": "company_info_main",
	        }
	        company_doc = Document(page_content=markdown_text, metadata=info_metadata)

	        logger.info(f"Loaded company info document from {info_path.name}")
	        return company_doc

	    except Exception as exc:
	        logger.error(f"Error loading company info from {info_path.name}: {str(exc)}")
	        raise

	# --- Loaders for uploaded PDF and TXT files ---

	def load_pdf_document(file_path: Path) -> List[Document]:
	    """
	    Read a PDF with LangChain's PyPDFLoader and tag each loaded Document.

	    Args:
	        file_path: Location of the PDF file.

	    Returns:
	        The Documents produced by the loader (one per page), each with its
	        metadata updated: "source" is set to "uploaded_file", "type" to
	        "pdf", "filename" to the file's name, and "doc_id" to
	        "<file stem>_page_<1-based page number>".

	    Raises:
	        FileNotFoundError: If file_path does not exist.
	        Exception: Any loader error is logged and re-raised.
	    """
	    try:
	        if not file_path.exists():
	            raise FileNotFoundError(f"PDF file not found at {file_path}")

	        # PyPDFLoader expects a string path rather than a Path object.
	        pages = PyPDFLoader(str(file_path)).load()

	        for page in pages:
	            # Overwrite the loader's own "source" entry and add our tags.
	            page.metadata.update({
	                "source": "uploaded_file",
	                "type": "pdf",
	                "filename": file_path.name,
	            })
	            # Use the loader's "page" value (0 when absent), shifted by one
	            # so doc_ids are 1-indexed.
	            page_index = page.metadata.get("page", 0)
	            page.metadata["doc_id"] = f"{file_path.stem}_page_{page_index + 1}"

	        documents = list(pages)

	        logger.info(f"Loaded {len(documents)} pages from PDF using PyPDFLoader: {file_path.name}")
	        return documents
	    except Exception as exc:
	        logger.error(f"Error loading PDF file {file_path.name} with PyPDFLoader: {str(exc)}")
	        raise

	def load_txt_document(file_path: Path) -> Document:
	    """
	    Read a plain-text file into a single Document.

	    Args:
	        file_path: Location of the TXT file.

	    Returns:
	        A Document holding the file's UTF-8 text, tagged with source
	        "uploaded_file", type "txt", the file name, and the file stem as
	        its doc_id.

	    Raises:
	        FileNotFoundError: If file_path does not exist.
	        Exception: Any read or decode error is logged and re-raised.
	    """
	    try:
	        if not file_path.exists():
	            raise FileNotFoundError(f"TXT file not found at {file_path}")

	        text = file_path.read_text(encoding='utf-8')

	        txt_metadata = {
	            "source": "uploaded_file",
	            "type": "txt",
	            "filename": file_path.name,
	            "doc_id": file_path.stem,
	        }
	        txt_doc = Document(page_content=text, metadata=txt_metadata)

	        logger.info(f"Loaded TXT file: {file_path.name}")
	        return txt_doc
	    except Exception as exc:
	        logger.error(f"Error loading TXT file {file_path.name}: {str(exc)}")
	        raise


	def process_uploaded_file(file_path: Path) -> List[Document]:
	    """
	    Route an uploaded file to the loader matching its extension.

	    This is a best-effort entry point: failures are logged, not raised.

	    Args:
	        file_path: Location of the uploaded file.

	    Returns:
	        The Documents extracted from the file: the PDF loader's list for
	        ".pdf", a one-element list for ".txt". An empty list is returned
	        when the file is missing, the extension is unsupported, or a
	        loader raises.
	    """
	    try:
	        if not file_path.exists():
	            raise FileNotFoundError(f"File not found at {file_path}")

	        # Compare case-insensitively so ".PDF" and ".pdf" are treated alike.
	        suffix = file_path.suffix.lower()

	        if suffix == '.pdf':
	            return load_pdf_document(file_path)
	        if suffix == '.txt':
	            # The TXT loader yields a single Document; wrap it so every
	            # branch returns a list.
	            return [load_txt_document(file_path)]

	        # Unsupported types are reported but do not halt execution.
	        logger.warning(f"Unsupported file type for {file_path.name}: {suffix}")
	        return []

	    except FileNotFoundError as missing:
	        logger.error(f"Processing failed: {missing}")
	    except Exception as exc:
	        logger.error(f"An unexpected error occurred while processing {file_path.name}: {str(exc)}")

	    # Reached only after a logged failure.
	    return []