import os import pandas as pd import io from typing import List, Dict, Any from langchain.schema import Document class ExcelLoader: def __init__(self): """Initialize the Excel document loader""" pass def load_file(self, file_path: str) -> List[Document]: """ Load an Excel file and convert it to a list of documents Args: file_path: Path to the Excel file Returns: List of Document objects """ if not os.path.exists(file_path): raise FileNotFoundError(f"File not found: {file_path}") try: print(f"Loading Excel file: {file_path}") # Extract metadata file_name = os.path.basename(file_path) # Get Excel file info excel_file = pd.ExcelFile(file_path) sheet_names = excel_file.sheet_names # Common metadata metadata = { "source": file_path, "title": file_name, "file_type": "excel", "sheet_count": len(sheet_names), "sheet_names": ", ".join(sheet_names) } documents = [] # Process each sheet for sheet_name in sheet_names: print(f"Processing sheet: {sheet_name}") df = pd.read_excel(file_path, sheet_name=sheet_name) # Skip empty sheets if df.empty: continue # Create sheet metadata sheet_metadata = metadata.copy() sheet_metadata.update({ "sheet_name": sheet_name, "row_count": df.shape[0], "column_count": df.shape[1] }) # Convert to string representation buffer = io.StringIO() df.to_csv(buffer) sheet_text = buffer.getvalue() # Create document for this sheet documents.append(Document( page_content=sheet_text, metadata=sheet_metadata )) # Also add a more readable table format try: table_str = df.to_string() table_metadata = sheet_metadata.copy() table_metadata.update({"format": "table"}) documents.append(Document( page_content=table_str, metadata=table_metadata )) except Exception as e: print(f"Error converting sheet to table format: {str(e)}") print(f"Extracted {len(documents)} documents from Excel file") return documents except Exception as e: print(f"Error loading Excel file {file_path}: {str(e)}") return []