ASTROIQ / app /document_loaders /excel_loader.py
Ndg07's picture
Manual update from local script
ddffdb8
import os
import pandas as pd
import io
from typing import List, Dict, Any
from langchain.schema import Document
class ExcelLoader:
def __init__(self):
"""Initialize the Excel document loader"""
pass
def load_file(self, file_path: str) -> List[Document]:
"""
Load an Excel file and convert it to a list of documents
Args:
file_path: Path to the Excel file
Returns:
List of Document objects
"""
if not os.path.exists(file_path):
raise FileNotFoundError(f"File not found: {file_path}")
try:
print(f"Loading Excel file: {file_path}")
# Extract metadata
file_name = os.path.basename(file_path)
# Get Excel file info
excel_file = pd.ExcelFile(file_path)
sheet_names = excel_file.sheet_names
# Common metadata
metadata = {
"source": file_path,
"title": file_name,
"file_type": "excel",
"sheet_count": len(sheet_names),
"sheet_names": ", ".join(sheet_names)
}
documents = []
# Process each sheet
for sheet_name in sheet_names:
print(f"Processing sheet: {sheet_name}")
df = pd.read_excel(file_path, sheet_name=sheet_name)
# Skip empty sheets
if df.empty:
continue
# Create sheet metadata
sheet_metadata = metadata.copy()
sheet_metadata.update({
"sheet_name": sheet_name,
"row_count": df.shape[0],
"column_count": df.shape[1]
})
# Convert to string representation
buffer = io.StringIO()
df.to_csv(buffer)
sheet_text = buffer.getvalue()
# Create document for this sheet
documents.append(Document(
page_content=sheet_text,
metadata=sheet_metadata
))
# Also add a more readable table format
try:
table_str = df.to_string()
table_metadata = sheet_metadata.copy()
table_metadata.update({"format": "table"})
documents.append(Document(
page_content=table_str,
metadata=table_metadata
))
except Exception as e:
print(f"Error converting sheet to table format: {str(e)}")
print(f"Extracted {len(documents)} documents from Excel file")
return documents
except Exception as e:
print(f"Error loading Excel file {file_path}: {str(e)}")
return []