Spaces:
Sleeping
Sleeping
File size: 7,137 Bytes
c5e1945 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
# Import required libraries
import pandas as pd
from pathlib import Path
from typing import List
from langchain.schema import Document
import logging
# For PDF processing - now using LangChain's PyPDFLoader
from langchain_community.document_loaders import PyPDFLoader
# Configure logging: module-level logger named after this module.
logger = logging.getLogger(__name__)
# --- Existing functions (as provided by user) ---
# Define a placeholder for COMPANY_INFO_DIR if it's not defined in config.py
# In a real application, ensure config.py is accessible or pass this path.
# NOTE: ImportError is raised both when config.py cannot be imported at all and
# when it exists but does not define COMPANY_INFO_DIR; both cases fall back to
# the placeholder below. COMPANY_INFO_DIR is consumed by the default arguments
# of load_faq_documents and load_company_info, which Python evaluates once,
# when those functions are defined.
try:
    from config import COMPANY_INFO_DIR
except ImportError:
    logger.warning("COMPANY_INFO_DIR not found in config.py. Using a default placeholder.")
    COMPANY_INFO_DIR = Path("./company_info") # Placeholder path, adjust as needed
def load_faq_documents(faq_path: Path = Path(COMPANY_INFO_DIR) / "FAQ.csv") -> List[Document]:
    """
    Load and process FAQ documents from CSV file.

    Every CSV row becomes one Document whose text is a "Question: ..." line
    followed by an "Answer: ..." line. Cells that pandas parsed as missing
    (NaN) are rendered as empty text rather than the literal string "nan".

    Args:
        faq_path: Path to the FAQ CSV file. A plain string path is accepted
            as well and is converted to a Path.
    Returns:
        List of Document objects, one per CSV row (empty if the CSV has no
        data rows).
    Raises:
        FileNotFoundError: If faq_path does not exist.
        ValueError: If the CSV lacks a 'Question' or 'Answer' column.
        Exception: Any other error raised while reading the CSV is logged
            and re-raised unchanged.
    """
    # Coerce up front so both the body and the error handler can rely on
    # Path attributes (.exists(), .name) even when a str was passed in.
    faq_path = Path(faq_path)
    try:
        # Validate file exists
        if not faq_path.exists():
            raise FileNotFoundError(f"FAQ file not found at {faq_path}")
        df = pd.read_csv(faq_path)
        # Validate required columns
        required_cols = ['Question', 'Answer']
        if any(col not in df.columns for col in required_cols):
            raise ValueError(f"CSV must contain columns: {required_cols}")
        documents = []
        for idx, row in df.iterrows():
            # read_csv turns blank cells into NaN; formatting NaN directly
            # would put the text "nan" into the document content.
            question, answer = (
                "" if pd.isna(value) else value
                for value in (row['Question'], row['Answer'])
            )
            documents.append(
                Document(
                    page_content=f"Question: {question}\nAnswer: {answer}",
                    metadata={
                        "source": "company_faq",
                        "type": "faq",
                        "doc_id": f"{idx}",
                        "filename": faq_path.name,
                    },
                )
            )
        logger.info(f"Loaded {len(documents)} FAQ documents from {faq_path.name}")
        return documents
    except Exception as e:
        # Log for operators, then let the caller decide how to recover.
        logger.error(f"Error loading FAQ documents from {faq_path.name}: {str(e)}")
        raise
def load_company_info(info_path: Path = Path(COMPANY_INFO_DIR) / "info.md") -> Document:
    """
    Read the company information markdown file into a single Document.

    Args:
        info_path: Location of the markdown file holding the company info
    Returns:
        One Document whose content is the full text of the file
    Raises:
        FileNotFoundError: If info_path does not exist.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        # Fail early with a descriptive message when the file is absent
        if not info_path.exists():
            raise FileNotFoundError(f"Info file not found at {info_path}")
        markdown_text = info_path.read_text(encoding='utf-8')
        info_metadata = {
            "source": "company_info",
            "type": "general_info",
            "filename": info_path.name,
            "doc_id": "company_info_main",
        }
        company_doc = Document(page_content=markdown_text, metadata=info_metadata)
        logger.info(f"Loaded company info document from {info_path.name}")
        return company_doc
    except Exception as err:
        logger.error(f"Error loading company info from {info_path.name}: {err}")
        raise
# --- New functions for PDF and TXT loading (no image loader is defined in this module) ---
def load_pdf_document(file_path: Path) -> List[Document]:
    """
    Read a PDF through LangChain's PyPDFLoader and tag the resulting documents.

    The loader's documents are returned with extra metadata: "source",
    "type", "filename" and a "doc_id" built from the file stem and a
    1-indexed page number.

    Args:
        file_path: Location of the PDF file.
    Returns:
        The list of Document objects produced by the loader.
    Raises:
        FileNotFoundError: If file_path does not exist.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        if not file_path.exists():
            raise FileNotFoundError(f"PDF file not found at {file_path}")
        # The path is handed to PyPDFLoader as a string
        pdf_pages = PyPDFLoader(str(file_path)).load()
        for page_doc in pdf_pages:
            # Overwrite/add the tags shared by every uploaded file
            page_doc.metadata.update(
                {
                    "source": "uploaded_file",
                    "type": "pdf",
                    "filename": file_path.name,
                }
            )
            # Fall back to page 0 when the loader supplied no page number,
            # then shift to a 1-indexed value for the identifier
            zero_based_page = page_doc.metadata.get("page", 0)
            page_doc.metadata["doc_id"] = f"{file_path.stem}_page_{zero_based_page + 1}"
        documents = list(pdf_pages)
        logger.info(f"Loaded {len(documents)} pages from PDF using PyPDFLoader: {file_path.name}")
        return documents
    except Exception as err:
        logger.error(f"Error loading PDF file {file_path.name} with PyPDFLoader: {err}")
        raise
def load_txt_document(file_path: Path) -> Document:
    """
    Read a UTF-8 text file into a single Document.

    Args:
        file_path: Location of the TXT file.
    Returns:
        One Document holding the file's full text, tagged as an uploaded
        "txt" file with the file stem as its doc_id.
    Raises:
        FileNotFoundError: If file_path does not exist.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        if not file_path.exists():
            raise FileNotFoundError(f"TXT file not found at {file_path}")
        text = file_path.read_text(encoding='utf-8')
        txt_metadata = {
            "source": "uploaded_file",
            "type": "txt",
            "filename": file_path.name,
            "doc_id": file_path.stem,
        }
        txt_doc = Document(page_content=text, metadata=txt_metadata)
        logger.info(f"Loaded TXT file: {file_path.name}")
        return txt_doc
    except Exception as err:
        logger.error(f"Error loading TXT file {file_path.name}: {err}")
        raise
def process_uploaded_file(file_path: Path) -> List[Document]:
    """
    Dispatch an uploaded file to the loader matching its extension.

    Args:
        file_path: Location of the uploaded file.
    Returns:
        The Document objects extracted from the file. An empty list is
        returned when the extension is neither .pdf nor .txt, when the file
        is missing, or when a loader raises; such problems are logged
        rather than propagated.
    """
    try:
        if not file_path.exists():
            raise FileNotFoundError(f"File not found at {file_path}")
        # Compare case-insensitively so ".PDF" and ".pdf" behave alike
        suffix = file_path.suffix.lower()
        if suffix == '.pdf':
            return load_pdf_document(file_path)
        if suffix == '.txt':
            # The TXT loader yields a single Document; wrap it for a uniform return type
            return [load_txt_document(file_path)]
        logger.warning(f"Unsupported file type for {file_path.name}: {suffix}")
        # Unsupported types are skipped, not treated as errors
        return []
    except FileNotFoundError as missing:
        logger.error(f"Processing failed: {missing}")
    except Exception as err:
        logger.error(f"An unexpected error occurred while processing {file_path.name}: {err}")
    # Reached only after a logged failure
    return []
|