# moazx's picture
# Project setup
# c5e1945
# Import required libraries
import pandas as pd
from pathlib import Path
from typing import List
from langchain.schema import Document
import logging
# For PDF processing - now using LangChain's PyPDFLoader
from langchain_community.document_loaders import PyPDFLoader
# Configure logging
logger = logging.getLogger(__name__)
# --- Existing functions (as provided by user) ---
# COMPANY_INFO_DIR is the base directory used to build the default paths of
# load_faq_documents() and load_company_info() below.
# Prefer the value exported by config.py; if that import fails, log a warning
# and fall back to a relative placeholder directory so this module can still
# be imported. In a real application, ensure config.py is accessible or pass
# explicit paths to the loaders.
try:
    from config import COMPANY_INFO_DIR
except ImportError:
    logger.warning("COMPANY_INFO_DIR not found in config.py. Using a default placeholder.")
    COMPANY_INFO_DIR = Path("./company_info")  # Placeholder path, adjust as needed
def load_faq_documents(faq_path: Path = Path(COMPANY_INFO_DIR) / "FAQ.csv") -> List[Document]:
    """
    Load and process FAQ documents from CSV file.

    Every CSV row becomes one Document whose text has a "Question: ..." line
    followed by an "Answer: ..." line. Empty cells are rendered as empty
    strings rather than the literal text "nan".

    Args:
        faq_path: Path to the FAQ CSV file. A plain string is accepted too
            and is converted to a Path.

    Returns:
        List of Document objects, one per CSV row, in file order.

    Raises:
        FileNotFoundError: If faq_path does not exist.
        ValueError: If the CSV lacks a 'Question' or 'Answer' column.
        Exception: Any other read/parse error is logged and re-raised.
    """
    # Coerce before entering the try block so that .exists() and .name (also
    # used by the error handler) work when a caller passes a string path.
    faq_path = Path(faq_path)
    try:
        # Validate file exists
        if not faq_path.exists():
            raise FileNotFoundError(f"FAQ file not found at {faq_path}")

        df = pd.read_csv(faq_path)

        # Validate required columns
        required_cols = ['Question', 'Answer']
        if not all(col in df.columns for col in required_cols):
            raise ValueError(f"CSV must contain columns: {required_cols}")

        documents = []
        for idx, row in df.iterrows():
            # pandas represents empty cells as NaN; formatting NaN directly
            # would put the text "nan" into the document, so map it to "".
            question, answer = (
                "" if pd.isna(row[col]) else row[col] for col in required_cols
            )
            content = f"Question: {question}\nAnswer: {answer}"
            doc = Document(
                page_content=content,
                metadata={
                    "source": "company_faq",
                    "type": "faq",
                    "doc_id": f"{idx}",
                    "filename": faq_path.name,
                },
            )
            documents.append(doc)

        logger.info(f"Loaded {len(documents)} FAQ documents from {faq_path.name}")
        return documents
    except Exception as e:
        # Log with the file name for context, then let the caller decide.
        logger.error(f"Error loading FAQ documents from {faq_path.name}: {str(e)}")
        raise
def load_company_info(info_path: Path = Path(COMPANY_INFO_DIR) / "info.md") -> Document:
    """
    Read the company information markdown file into a single Document.

    Args:
        info_path: Path to the company info markdown file

    Returns:
        Document object whose page_content is the full UTF-8 text of the
        file, tagged with company-info metadata.

    Raises:
        FileNotFoundError: If info_path does not exist.
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        # Fail early with a descriptive message when the file is missing.
        if info_path.exists():
            markdown_text = info_path.read_text(encoding='utf-8')
        else:
            raise FileNotFoundError(f"Info file not found at {info_path}")

        info_metadata = {
            "source": "company_info",
            "type": "general_info",
            "filename": info_path.name,
            "doc_id": "company_info_main",
        }
        info_document = Document(page_content=markdown_text, metadata=info_metadata)

        logger.info(f"Loaded company info document from {info_path.name}")
        return info_document
    except Exception as e:
        logger.error(f"Error loading company info from {info_path.name}: {str(e)}")
        raise
# --- New functions for PDF, TXT, and Image loading ---
def load_pdf_document(file_path: Path) -> List[Document]:
    """
    Load a PDF through LangChain's PyPDFLoader and tag every loaded Document.

    Args:
        file_path: Path to the PDF file.

    Returns:
        A new list holding the Document objects produced by the loader
        (PyPDFLoader yields one per page), each with "source", "type",
        "filename" and "doc_id" metadata set.

    Raises:
        FileNotFoundError: If file_path does not exist.
        Exception: Any loader error is logged and re-raised unchanged.
    """
    try:
        if not file_path.exists():
            raise FileNotFoundError(f"PDF file not found at {file_path}")

        # PyPDFLoader expects a string path
        pages = PyPDFLoader(str(file_path)).load()

        for page in pages:
            meta = page.metadata
            # Overwrites the loader's own "source" (the file path) with a
            # fixed tag so uploaded files share consistent metadata.
            meta.update(source="uploaded_file", type="pdf", filename=file_path.name)
            # The loader's "page" value is treated as 0-based (0 when absent);
            # doc_id uses a 1-based page number.
            one_based_page = meta.get("page", 0) + 1
            meta["doc_id"] = f"{file_path.stem}_page_{one_based_page}"

        loaded_pages = list(pages)
        logger.info(f"Loaded {len(loaded_pages)} pages from PDF using PyPDFLoader: {file_path.name}")
        return loaded_pages
    except Exception as e:
        logger.error(f"Error loading PDF file {file_path.name} with PyPDFLoader: {str(e)}")
        raise
def load_txt_document(file_path: Path) -> Document:
    """
    Read a UTF-8 text file into a single Document.

    Args:
        file_path: Path to the TXT file.

    Returns:
        A Document whose page_content is the whole file and whose doc_id is
        the file name without its extension.

    Raises:
        FileNotFoundError: If file_path does not exist.
        Exception: Any read/decode error is logged and re-raised unchanged.
    """
    try:
        if file_path.exists():
            raw_text = file_path.read_text(encoding='utf-8')
        else:
            raise FileNotFoundError(f"TXT file not found at {file_path}")

        txt_metadata = {
            "source": "uploaded_file",
            "type": "txt",
            "filename": file_path.name,
            "doc_id": file_path.stem,
        }
        txt_document = Document(page_content=raw_text, metadata=txt_metadata)

        logger.info(f"Loaded TXT file: {file_path.name}")
        return txt_document
    except Exception as e:
        logger.error(f"Error loading TXT file {file_path.name}: {str(e)}")
        raise
def process_uploaded_file(file_path: Path) -> List[Document]:
    """
    Determines the file extension and calls the appropriate function to process it.

    This is a best-effort entry point: loader failures are logged and
    reported as an empty result instead of propagating to the caller.

    Args:
        file_path: Path to the uploaded file. A plain string is accepted too
            and is converted to a Path.

    Returns:
        A list of Document objects containing the extracted text
        (whatever load_pdf_document returns for '.pdf', a single-element
        list for '.txt').
        Returns an empty list if the file is missing, the file type is
        unsupported, or an error occurs while loading.
    """
    # Coerce before entering the try block: the handlers below read
    # file_path.name, which would itself raise for a string argument and
    # defeat the "empty list on error" contract.
    file_path = Path(file_path)
    try:
        if not file_path.exists():
            raise FileNotFoundError(f"File not found at {file_path}")

        # Compare case-insensitively so ".PDF" and ".pdf" are treated alike.
        extension = file_path.suffix.lower()
        if extension == '.pdf':
            return load_pdf_document(file_path)
        if extension == '.txt':
            return [load_txt_document(file_path)]  # Wrap in list for consistency

        logger.warning(f"Unsupported file type for {file_path.name}: {extension}")
        return []  # Unsupported types are skipped rather than treated as errors
    except FileNotFoundError as fnfe:
        logger.error(f"Processing failed: {fnfe}")
    except Exception as e:
        # The exception is swallowed here, so record the traceback; otherwise
        # the root cause is lost.
        logger.exception(f"An unexpected error occurred while processing {file_path.name}: {str(e)}")
    return []