Spaces:

moazx
/

Agentic-Medical-RAG-Chatbot

Sleeping

File size: 7,137 Bytes

c5e1945

# Import required libraries
import pandas as pd
from pathlib import Path
from typing import List
from langchain.schema import Document
import logging

# For PDF processing - now using LangChain's PyPDFLoader
from langchain_community.document_loaders import PyPDFLoader

# Module-level logger named after this module, so handlers and levels are
# controlled by the application's logging configuration.
logger = logging.getLogger(__name__)

# --- Existing functions (as provided by user) ---

# Resolve COMPANY_INFO_DIR at import time.
# `from config import ...` raises ImportError both when the `config` module
# cannot be imported and when it exists but does not define COMPANY_INFO_DIR;
# either case falls back to the placeholder below and logs a warning.
# NOTE: load_faq_documents and load_company_info use this value in their
# default arguments, which Python evaluates once when the functions are defined.
try:
    from config import COMPANY_INFO_DIR
except ImportError:
    logger.warning("COMPANY_INFO_DIR not found in config.py. Using a default placeholder.")
    COMPANY_INFO_DIR = Path("./company_info") # Relative placeholder path; adjust as needed

def load_faq_documents(faq_path: Path = Path(COMPANY_INFO_DIR) / "FAQ.csv") -> List[Document]:
    """
    Load and process FAQ documents from CSV file.

    Every CSV row becomes one Document whose text has a "Question: ..." line
    followed by an "Answer: ..." line. Cells that pandas parses as missing
    (empty cells, "NA", "N/A", ...) are rendered as an empty string instead
    of the literal text "nan", so no junk tokens end up in the indexed text.

    Args:
        faq_path: Path to the FAQ CSV file. A plain string path is accepted
            as well and is converted to a Path.

    Returns:
        List of Document objects, one per CSV row, in file order.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        ValueError: If the 'Question' or 'Answer' column is missing.
        Exception: Any error raised while reading or parsing the CSV is
            logged and re-raised unchanged.
    """
    # Normalise up front: the existence check and the log messages below
    # (including the one in the except handler) rely on Path attributes.
    faq_path = Path(faq_path)

    try:
        # Validate file exists
        if not faq_path.exists():
            raise FileNotFoundError(f"FAQ file not found at {faq_path}")

        df = pd.read_csv(faq_path)

        # Validate required columns
        required_cols = ['Question', 'Answer']
        if not all(col in df.columns for col in required_cols):
            raise ValueError(f"CSV must contain columns: {required_cols}")

        documents = []
        for idx, row in df.iterrows():
            # Missing cells arrive as NaN; formatting them directly would
            # put the string "nan" into the document text.
            question = "" if pd.isna(row['Question']) else row['Question']
            answer = "" if pd.isna(row['Answer']) else row['Answer']
            content = f"Question: {question}\nAnswer: {answer}"

            doc = Document(
                page_content=content,
                metadata={
                    "source": "company_faq",
                    "type": "faq",
                    "doc_id": f"{idx}",  # DataFrame index label of the row
                    "filename": faq_path.name
                }
            )
            documents.append(doc)

        logger.info(f"Loaded {len(documents)} FAQ documents from {faq_path.name}")
        return documents

    except Exception as e:
        # Log for diagnostics, then let the caller decide how to recover.
        logger.error(f"Error loading FAQ documents from {faq_path.name}: {str(e)}")
        raise


def load_company_info(info_path: Path = Path(COMPANY_INFO_DIR) / "info.md") -> Document:
    """
    Read the company information markdown file into a single Document.

    Args:
        info_path: Location of the markdown file holding the company info.

    Returns:
        One Document whose page_content is the full UTF-8 text of the file.

    Raises:
        FileNotFoundError: If no file exists at info_path.
        Exception: Any other read/decoding error is logged and re-raised.
    """
    try:
        # Fail early with a descriptive message when the file is missing
        if not info_path.exists():
            raise FileNotFoundError(f"Info file not found at {info_path}")

        markdown_text = info_path.read_text(encoding='utf-8')

        info_metadata = {
            "source": "company_info",
            "type": "general_info",
            "filename": info_path.name,
            "doc_id": "company_info_main",
        }
        company_doc = Document(page_content=markdown_text, metadata=info_metadata)

        logger.info(f"Loaded company info document from {info_path.name}")
        return company_doc

    except Exception as err:
        # Record the failure, then propagate it to the caller untouched
        logger.error(f"Error loading company info from {info_path.name}: {str(err)}")
        raise

# --- New functions for PDF and TXT loading ---

def load_pdf_document(file_path: Path) -> List[Document]:
    """
    Extract a PDF into per-page Documents via LangChain's PyPDFLoader.

    The metadata of every page is tagged with a fixed source/type, the file
    name, and a doc_id of the form "<file stem>_page_<1-based page number>".

    Args:
        file_path: Path to the PDF file.

    Returns:
        A list of Document objects, one for each page of the PDF.

    Raises:
        FileNotFoundError: If no file exists at file_path.
        Exception: Any loader error is logged and re-raised.
    """
    try:
        if not file_path.exists():
            raise FileNotFoundError(f"PDF file not found at {file_path}")

        # PyPDFLoader is given a string path rather than a Path object
        pages = PyPDFLoader(str(file_path)).load()

        for page_doc in pages:
            # Use the loader-provided 'page' value (treated as 0-based),
            # falling back to 0 when the key is absent
            page_index = page_doc.metadata.get("page", 0)
            page_doc.metadata.update({
                "source": "uploaded_file",  # replaces any loader-provided 'source'
                "type": "pdf",
                "filename": file_path.name,
                "doc_id": f"{file_path.stem}_page_{page_index + 1}",
            })

        documents = list(pages)

        logger.info(f"Loaded {len(documents)} pages from PDF using PyPDFLoader: {file_path.name}")
        return documents
    except Exception as err:
        logger.error(f"Error loading PDF file {file_path.name} with PyPDFLoader: {str(err)}")
        raise

def load_txt_document(file_path: Path) -> Document:
    """
    Read a plain-text file into a single Document.

    Args:
        file_path: Path to the TXT file.

    Returns:
        A Document holding the file's full UTF-8 text; its doc_id is the
        file name without the extension.

    Raises:
        FileNotFoundError: If no file exists at file_path.
        Exception: Any other read/decoding error is logged and re-raised.
    """
    try:
        if not file_path.exists():
            raise FileNotFoundError(f"TXT file not found at {file_path}")

        text = file_path.read_text(encoding='utf-8')

        txt_metadata = {
            "source": "uploaded_file",
            "type": "txt",
            "filename": file_path.name,
            "doc_id": file_path.stem,
        }
        text_doc = Document(page_content=text, metadata=txt_metadata)

        logger.info(f"Loaded TXT file: {file_path.name}")
        return text_doc
    except Exception as err:
        logger.error(f"Error loading TXT file {file_path.name}: {str(err)}")
        raise


def process_uploaded_file(file_path: Path) -> List[Document]:
    """
    Route an uploaded file to the loader that matches its extension.

    Supported extensions (case-insensitive): '.pdf' and '.txt'.
    This function is best-effort: failures are logged, not raised.

    Args:
        file_path: Path to the uploaded file.

    Returns:
        A list of Document objects containing the extracted text.
        The list is empty when the file is missing, the file type is
        unsupported, or the loader fails.
    """
    try:
        if not file_path.exists():
            raise FileNotFoundError(f"File not found at {file_path}")

        suffix = file_path.suffix.lower()

        if suffix == '.pdf':
            return load_pdf_document(file_path)

        if suffix == '.txt':
            # The TXT loader yields one Document; wrap it so callers always get a list
            return [load_txt_document(file_path)]

        # Anything else is reported and skipped rather than treated as an error
        logger.warning(f"Unsupported file type for {file_path.name}: {suffix}")
        return []

    except FileNotFoundError as missing:
        logger.error(f"Processing failed: {missing}")
    except Exception as err:
        logger.error(f"An unexpected error occurred while processing {file_path.name}: {str(err)}")

    # Reached only after a logged failure
    return []