| import os |
| import pandas as pd |
| import io |
| from typing import List, Dict, Any |
| from langchain.schema import Document |
|
|
| class ExcelLoader: |
| def __init__(self): |
| """Initialize the Excel document loader""" |
| pass |
| |
| def load_file(self, file_path: str) -> List[Document]: |
| """ |
| Load an Excel file and convert it to a list of documents |
| |
| Args: |
| file_path: Path to the Excel file |
| |
| Returns: |
| List of Document objects |
| """ |
| if not os.path.exists(file_path): |
| raise FileNotFoundError(f"File not found: {file_path}") |
| |
| try: |
| print(f"Loading Excel file: {file_path}") |
| |
| |
| file_name = os.path.basename(file_path) |
| |
| |
| excel_file = pd.ExcelFile(file_path) |
| sheet_names = excel_file.sheet_names |
| |
| |
| metadata = { |
| "source": file_path, |
| "title": file_name, |
| "file_type": "excel", |
| "sheet_count": len(sheet_names), |
| "sheet_names": ", ".join(sheet_names) |
| } |
| |
| documents = [] |
| |
| |
| for sheet_name in sheet_names: |
| print(f"Processing sheet: {sheet_name}") |
| |
| df = pd.read_excel(file_path, sheet_name=sheet_name) |
| |
| |
| if df.empty: |
| continue |
| |
| |
| sheet_metadata = metadata.copy() |
| sheet_metadata.update({ |
| "sheet_name": sheet_name, |
| "row_count": df.shape[0], |
| "column_count": df.shape[1] |
| }) |
| |
| |
| buffer = io.StringIO() |
| df.to_csv(buffer) |
| sheet_text = buffer.getvalue() |
| |
| |
| documents.append(Document( |
| page_content=sheet_text, |
| metadata=sheet_metadata |
| )) |
| |
| |
| try: |
| table_str = df.to_string() |
| table_metadata = sheet_metadata.copy() |
| table_metadata.update({"format": "table"}) |
| |
| documents.append(Document( |
| page_content=table_str, |
| metadata=table_metadata |
| )) |
| except Exception as e: |
| print(f"Error converting sheet to table format: {str(e)}") |
| |
| print(f"Extracted {len(documents)} documents from Excel file") |
| return documents |
| |
| except Exception as e: |
| print(f"Error loading Excel file {file_path}: {str(e)}") |
| return [] |