Spaces:
Sleeping
Sleeping
| # Import required libraries | |
| import pandas as pd | |
| from pathlib import Path | |
| from typing import List | |
| from langchain.schema import Document | |
| import logging | |
| # For PDF processing - now using LangChain's PyPDFLoader | |
| from langchain_community.document_loaders import PyPDFLoader | |
# Module-level logger; no handlers or levels are configured here.
logger = logging.getLogger(__name__)

# Resolve the directory holding the bundled company files. config.py is the
# preferred source; if it cannot be imported, or does not expose
# COMPANY_INFO_DIR, fall back to a relative placeholder path. Adjust the
# placeholder to match the real deployment layout.
try:
    from config import COMPANY_INFO_DIR
except ImportError:
    COMPANY_INFO_DIR = Path("./company_info")
    logger.warning("COMPANY_INFO_DIR not found in config.py. Using a default placeholder.")

# --- Loaders for the bundled company files ---
def _faq_cell_text(value) -> str:
    """Return a CSV cell as text, mapping missing values (NaN/None) to ''."""
    return "" if pd.isna(value) else str(value)


def load_faq_documents(faq_path: Path = Path(COMPANY_INFO_DIR) / "FAQ.csv") -> List[Document]:
    """
    Load and process FAQ documents from CSV file.

    Each CSV row becomes one Document whose text is
    "Question: <q>\\nAnswer: <a>". Cells that pandas parses as missing
    (for example empty cells) are rendered as an empty string instead of
    the literal text "nan".

    Args:
        faq_path: Path to the FAQ CSV file. Defaults to FAQ.csv inside
            COMPANY_INFO_DIR.

    Returns:
        List of Document objects, one per CSV row, in file order.

    Raises:
        FileNotFoundError: If faq_path does not exist.
        ValueError: If the CSV lacks a 'Question' or 'Answer' column.
        Exception: Any other error raised while reading the CSV is logged
            and re-raised unchanged.
    """
    try:
        # Validate file exists
        if not faq_path.exists():
            raise FileNotFoundError(f"FAQ file not found at {faq_path}")

        df = pd.read_csv(faq_path)

        # Validate required columns
        required_cols = ['Question', 'Answer']
        if not all(col in df.columns for col in required_cols):
            raise ValueError(f"CSV must contain columns: {required_cols}")

        documents = []
        for idx, row in df.iterrows():
            # Both columns are guaranteed to exist here, so a dict-style
            # default would never trigger; missing *values* arrive as NaN
            # and must be blanked explicitly.
            question = _faq_cell_text(row['Question'])
            answer = _faq_cell_text(row['Answer'])
            doc = Document(
                page_content=f"Question: {question}\nAnswer: {answer}",
                metadata={
                    "source": "company_faq",
                    "type": "faq",
                    "doc_id": f"{idx}",  # DataFrame index label of the row
                    "filename": faq_path.name
                }
            )
            documents.append(doc)

        logger.info(f"Loaded {len(documents)} FAQ documents from {faq_path.name}")
        return documents
    except Exception as e:
        # Log for operators, then let the caller decide how to recover.
        logger.error(f"Error loading FAQ documents from {faq_path.name}: {str(e)}")
        raise
def load_company_info(info_path: Path = Path(COMPANY_INFO_DIR) / "info.md") -> Document:
    """
    Read the company information markdown file into a single Document.

    Args:
        info_path: Location of the markdown file. Defaults to info.md
            inside COMPANY_INFO_DIR.

    Returns:
        A Document whose page_content is the full file text (decoded as
        UTF-8) and whose metadata identifies it as general company info.

    Raises:
        FileNotFoundError: If info_path does not exist.
        Exception: Any other read/decoding error is logged and re-raised.
    """
    try:
        # Fail early with a descriptive message when the file is missing.
        if not info_path.exists():
            raise FileNotFoundError(f"Info file not found at {info_path}")

        markdown_text = info_path.read_text(encoding='utf-8')

        info_metadata = {
            "source": "company_info",
            "type": "general_info",
            "filename": info_path.name,
            "doc_id": "company_info_main"
        }
        company_doc = Document(page_content=markdown_text, metadata=info_metadata)

        logger.info(f"Loaded company info document from {info_path.name}")
        return company_doc
    except Exception as exc:
        # Record the failure, then propagate it to the caller untouched.
        logger.error(f"Error loading company info from {info_path.name}: {str(exc)}")
        raise
| # --- New functions for PDF, TXT, and Image loading --- | |
def load_pdf_document(file_path: Path) -> List[Document]:
    """
    Extract a PDF into Documents via LangChain's PyPDFLoader.

    Every Document returned by the loader is tagged with uniform metadata
    (source, type, filename and a doc_id built from the file stem and page).

    Args:
        file_path: Path to the PDF file.

    Returns:
        The Documents produced by PyPDFLoader, in loader order, with the
        metadata described above applied.

    Raises:
        FileNotFoundError: If file_path does not exist.
        Exception: Any loader error is logged and re-raised.
    """
    try:
        if not file_path.exists():
            raise FileNotFoundError(f"PDF file not found at {file_path}")

        # The loader is handed the path as a plain string.
        pages = list(PyPDFLoader(str(file_path)).load())

        for page_doc in pages:
            # Use the loader-provided 'page' value when present, else 0;
            # doc_id is built from that value plus one.
            page_index = page_doc.metadata.get("page", 0)
            # Overwrites any 'source' the loader set with a fixed label.
            page_doc.metadata.update({
                "source": "uploaded_file",
                "type": "pdf",
                "filename": file_path.name,
                "doc_id": f"{file_path.stem}_page_{page_index + 1}",
            })

        logger.info(f"Loaded {len(pages)} pages from PDF using PyPDFLoader: {file_path.name}")
        return pages
    except Exception as exc:
        logger.error(f"Error loading PDF file {file_path.name} with PyPDFLoader: {str(exc)}")
        raise
def load_txt_document(file_path: Path) -> Document:
    """
    Read a plain-text file into a single Document.

    Args:
        file_path: Path to the TXT file.

    Returns:
        A Document holding the whole file (decoded as UTF-8), with metadata
        marking it as an uploaded txt file and doc_id set to the file stem.

    Raises:
        FileNotFoundError: If file_path does not exist.
        Exception: Any other read/decoding error is logged and re-raised.
    """
    try:
        if not file_path.exists():
            raise FileNotFoundError(f"TXT file not found at {file_path}")

        text = file_path.read_text(encoding='utf-8')

        txt_metadata = {
            "source": "uploaded_file",
            "type": "txt",
            "filename": file_path.name,
            "doc_id": file_path.stem
        }
        text_doc = Document(page_content=text, metadata=txt_metadata)

        logger.info(f"Loaded TXT file: {file_path.name}")
        return text_doc
    except Exception as exc:
        # Log and hand the original exception back to the caller.
        logger.error(f"Error loading TXT file {file_path.name}: {str(exc)}")
        raise
def process_uploaded_file(file_path: Path) -> List[Document]:
    """
    Route an uploaded file to the loader that matches its extension.

    Supported extensions (case-insensitive): '.pdf' and '.txt'.

    Args:
        file_path: Path to the uploaded file.

    Returns:
        The Documents extracted from the file. An empty list is returned
        when the extension is unsupported, the file is missing, or the
        loader fails; failures are logged here rather than raised.
    """
    try:
        if not file_path.exists():
            raise FileNotFoundError(f"File not found at {file_path}")

        suffix = file_path.suffix.lower()

        if suffix == '.pdf':
            return load_pdf_document(file_path)

        if suffix == '.txt':
            # The TXT loader yields one Document; wrap it so every branch
            # returns a list.
            return [load_txt_document(file_path)]

        # Unsupported types are reported but deliberately do not raise.
        logger.warning(f"Unsupported file type for {file_path.name}: {suffix}")
        return []
    except FileNotFoundError as missing:
        logger.error(f"Processing failed: {missing}")
    except Exception as exc:
        logger.error(f"An unexpected error occurred while processing {file_path.name}: {str(exc)}")

    # Reached only after a handled failure above.
    return []