# Source: agentbee/src/tools/file_parser.py
# Last commit e7b4937 by mangubee: "fix: correct author name formatting in multiple files"
"""
File Parser Tool - Multi-format file reading
Author: @mangubee
Date: 2026-01-02
Provides file parsing for:
- PDF files (.pdf) using PyPDF2
- Excel files (.xlsx, .xls) using openpyxl
- Word documents (.docx) using python-docx
- Text files (.txt, .csv) using built-in open()
All parsers include retry logic and error handling.
"""
import logging
from pathlib import Path
from typing import Dict, List, Optional

from tenacity import (
    retry,
    retry_if_exception,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)
# ============================================================================
# CONFIG
# ============================================================================
# Retry policy consumed by the @retry decorator on each parser below.
MAX_RETRIES: int = 3  # total attempts per parse call
RETRY_MIN_WAIT: int = 1  # seconds
RETRY_MAX_WAIT: int = 5  # seconds

# Recognised file suffixes (lower-case, with leading dot) mapped to the
# human-readable label used in log messages. Insertion order is preserved
# because it is echoed in the "Unsupported file type" error message.
SUPPORTED_EXTENSIONS: Dict[str, str] = {
    ".pdf": "PDF",
    ".xlsx": "Excel",
    ".xls": "Excel",
    ".docx": "Word",
    ".txt": "Text",
    ".csv": "CSV",
}

# ============================================================================
# Logging Setup
# ============================================================================
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# ============================================================================
# PDF Parser
# ============================================================================
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    # FileNotFoundError is a subclass of OSError; exclude it explicitly so a
    # missing file fails immediately instead of being retried with back-off.
    retry=retry_if_exception(
        lambda exc: isinstance(exc, OSError)
        and not isinstance(exc, FileNotFoundError)
    ),
    reraise=True,
)
def parse_pdf(file_path: str) -> Dict:
    """
    Parse a PDF file and extract its text content.

    Pages whose extracted text is empty or whitespace-only are skipped; every
    other page is prefixed with a "--- Page N ---" marker and the pages are
    joined with blank lines.

    Args:
        file_path: Path to PDF file

    Returns:
        Dict with structure: {
            "content": str,      # Extracted text of all non-empty pages
            "pages": int,        # Total number of pages (including empty ones)
            "file_type": "PDF",
            "file_path": str     # The path exactly as passed in
        }

    Raises:
        FileNotFoundError: If the file doesn't exist (not retried)
        ValueError: For any other non-OS failure, e.g. a corrupted file or
            PyPDF2 not being importable
        OSError: For other file reading errors (retried up to MAX_RETRIES
            attempts, then re-raised)
    """
    try:
        # Imported lazily so the module loads even when PyPDF2 is absent.
        from PyPDF2 import PdfReader

        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF file not found: {file_path}")

        logger.info(f"Parsing PDF: {file_path}")
        reader = PdfReader(str(path))
        num_pages = len(reader.pages)

        # Extract text from all pages
        content = []
        for page_num, page in enumerate(reader.pages, 1):
            # Defensive: treat a None result the same as "no text on page"
            # rather than letting .strip() fail and abort the whole file.
            text = page.extract_text() or ""
            if text.strip():
                content.append(f"--- Page {page_num} ---\n{text}")

        full_content = "\n\n".join(content)
        logger.info(f"PDF parsed successfully: {num_pages} pages, {len(full_content)} chars")
        return {
            "content": full_content,
            "pages": num_pages,
            "file_type": "PDF",
            "file_path": file_path,
        }
    except FileNotFoundError as e:
        logger.error(f"PDF file not found: {e}")
        raise
    except (IOError, OSError) as e:
        logger.warning(f"PDF IO error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"PDF parsing error: {e}")
        # Chain the cause so the original traceback is preserved.
        raise ValueError(f"Failed to parse PDF: {str(e)}") from e
# ============================================================================
# Excel Parser
# ============================================================================
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    # FileNotFoundError is a subclass of OSError; exclude it explicitly so a
    # missing file fails immediately instead of being retried with back-off.
    retry=retry_if_exception(
        lambda exc: isinstance(exc, OSError)
        and not isinstance(exc, FileNotFoundError)
    ),
    reraise=True,
)
def parse_excel(file_path: str) -> Dict:
    """
    Parse an Excel workbook and extract the data from all sheets.

    Each non-empty sheet is rendered as a "=== Sheet: NAME ===" header
    followed by one tab-separated line per non-empty row; empty cells become
    empty strings. The workbook is loaded with data_only=True, so formula
    cells yield their cached values rather than the formula text.

    NOTE(review): openpyxl reads the Office Open XML formats only, so a legacy
    binary .xls file is expected to fail to load and surface as ValueError —
    confirm whether .xls should stay routed here.

    Args:
        file_path: Path to Excel file (.xlsx or .xls)

    Returns:
        Dict with structure: {
            "content": str,        # Formatted table data
            "sheets": List[str],   # All sheet names (including empty sheets)
            "file_type": "Excel",
            "file_path": str       # The path exactly as passed in
        }

    Raises:
        FileNotFoundError: If the file doesn't exist (not retried)
        ValueError: For any other non-OS failure, e.g. a corrupted or
            unsupported file, or openpyxl not being importable
        OSError: For other file reading errors (retried up to MAX_RETRIES
            attempts, then re-raised)
    """
    try:
        # Imported lazily so the module loads even when openpyxl is absent.
        from openpyxl import load_workbook

        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Excel file not found: {file_path}")

        logger.info(f"Parsing Excel: {file_path}")
        workbook = load_workbook(str(path), data_only=True)
        sheet_names = workbook.sheetnames

        # Extract data from all sheets
        content_parts = []
        for sheet_name in sheet_names:
            sheet = workbook[sheet_name]
            # Keep only rows that have at least one non-None cell.
            rows = [
                "\t".join(str(cell) if cell is not None else "" for cell in row)
                for row in sheet.iter_rows(values_only=True)
                if any(cell is not None for cell in row)
            ]
            if rows:
                sheet_content = f"=== Sheet: {sheet_name} ===\n" + "\n".join(rows)
                content_parts.append(sheet_content)

        full_content = "\n\n".join(content_parts)
        logger.info(f"Excel parsed successfully: {len(sheet_names)} sheets")
        return {
            "content": full_content,
            "sheets": sheet_names,
            "file_type": "Excel",
            "file_path": file_path,
        }
    except FileNotFoundError as e:
        logger.error(f"Excel file not found: {e}")
        raise
    except (IOError, OSError) as e:
        logger.warning(f"Excel IO error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Excel parsing error: {e}")
        # Chain the cause so the original traceback is preserved.
        raise ValueError(f"Failed to parse Excel: {str(e)}") from e
# ============================================================================
# Word Document Parser
# ============================================================================
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    # FileNotFoundError is a subclass of OSError; exclude it explicitly so a
    # missing file fails immediately instead of being retried with back-off.
    retry=retry_if_exception(
        lambda exc: isinstance(exc, OSError)
        and not isinstance(exc, FileNotFoundError)
    ),
    reraise=True,
)
def parse_word(file_path: str) -> Dict:
    """
    Parse a Word document and extract its paragraph text.

    Only the entries of the document's `paragraphs` collection are read;
    paragraphs whose text is empty or whitespace-only are dropped and the
    rest are joined with blank lines.

    Args:
        file_path: Path to Word file (.docx)

    Returns:
        Dict with structure: {
            "content": str,      # Extracted text
            "paragraphs": int,   # Number of non-empty paragraphs kept
            "file_type": "Word",
            "file_path": str     # The path exactly as passed in
        }

    Raises:
        FileNotFoundError: If the file doesn't exist (not retried)
        ValueError: For any other non-OS failure, e.g. a corrupted file or
            python-docx not being importable
        OSError: For other file reading errors (retried up to MAX_RETRIES
            attempts, then re-raised)
    """
    try:
        # Imported lazily so the module loads even when python-docx is absent.
        from docx import Document

        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Word file not found: {file_path}")

        logger.info(f"Parsing Word document: {file_path}")
        doc = Document(str(path))

        # Extract text from all paragraphs, skipping blank ones
        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
        full_content = "\n\n".join(paragraphs)

        logger.info(f"Word parsed successfully: {len(paragraphs)} paragraphs")
        return {
            "content": full_content,
            "paragraphs": len(paragraphs),
            "file_type": "Word",
            "file_path": file_path,
        }
    except FileNotFoundError as e:
        logger.error(f"Word file not found: {e}")
        raise
    except (IOError, OSError) as e:
        logger.warning(f"Word IO error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Word parsing error: {e}")
        # Chain the cause so the original traceback is preserved.
        raise ValueError(f"Failed to parse Word document: {str(e)}") from e
# ============================================================================
# Text/CSV Parser
# ============================================================================
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    # FileNotFoundError is a subclass of OSError; exclude it explicitly so a
    # missing file fails immediately instead of being retried with back-off.
    retry=retry_if_exception(
        lambda exc: isinstance(exc, OSError)
        and not isinstance(exc, FileNotFoundError)
    ),
    reraise=True,
)
def parse_text(file_path: str) -> Dict:
    """
    Parse a plain text or CSV file.

    The file is read as UTF-8 in one go and returned verbatim; CSV content
    is not split into fields.

    Args:
        file_path: Path to text file (.txt or .csv)

    Returns:
        Dict with structure: {
            "content": str,      # Full decoded file content
            "lines": int,        # Number of lines (0 for an empty file)
            "file_type": "Text" or "CSV",
            "file_path": str     # The path exactly as passed in
        }

    Raises:
        FileNotFoundError: If the file doesn't exist (not retried)
        ValueError: If the file cannot be decoded as UTF-8
        OSError: For other file reading errors (retried up to MAX_RETRIES
            attempts, then re-raised)
    """
    try:
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Text file not found: {file_path}")

        logger.info(f"Parsing text file: {file_path}")
        # utf-8-sig decodes plain UTF-8 identically but drops a leading BOM,
        # so the BOM does not end up as the first character of the content.
        with open(path, 'r', encoding='utf-8-sig') as f:
            content = f.read()

        # Count newline-terminated lines, plus one for a final unterminated
        # line. A trailing newline must not be counted as an extra line and
        # an empty file has zero lines.
        lines = content.count('\n')
        if content and not content.endswith('\n'):
            lines += 1

        # Compare case-insensitively: parse_file lowercases the suffix before
        # dispatching, so "DATA.CSV" must still be reported as CSV.
        file_type = "CSV" if path.suffix.lower() == '.csv' else "Text"

        logger.info(f"{file_type} file parsed successfully: {lines} lines")
        return {
            "content": content,
            "lines": lines,
            "file_type": file_type,
            "file_path": file_path,
        }
    except FileNotFoundError as e:
        logger.error(f"Text file not found: {e}")
        raise
    except (IOError, OSError) as e:
        logger.warning(f"Text file IO error (will retry): {e}")
        raise
    except UnicodeDecodeError as e:
        # UnicodeDecodeError derives from ValueError, not OSError, so it is
        # never retried; re-raise with a clearer message and the cause chained.
        logger.error(f"Text file encoding error: {e}")
        raise ValueError(f"Failed to decode text file (try UTF-8): {str(e)}") from e
# ============================================================================
# Unified File Parser
# ============================================================================
def parse_file(file_path: str) -> Dict:
    """
    Route a file to the parser that matches its (case-insensitive) suffix.

    Args:
        file_path: Path to file

    Returns:
        Dict with parsed content and metadata, as produced by the selected
        parser

    Raises:
        ValueError: If the suffix is not listed in SUPPORTED_EXTENSIONS
        FileNotFoundError: If file doesn't exist
        Exception: Whatever the selected parser raises for parsing errors
    """
    extension = Path(file_path).suffix.lower()

    # Guard clause: reject unknown suffixes before doing any work.
    if extension not in SUPPORTED_EXTENSIONS:
        supported = ', '.join(SUPPORTED_EXTENSIONS.keys())
        raise ValueError(
            f"Unsupported file type: {extension}. "
            f"Supported: {supported}"
        )

    label = SUPPORTED_EXTENSIONS[extension]
    logger.info(f"Dispatching parser for {label} file: {file_path}")

    # Suffix -> parser lookup table, built per call so the parser names are
    # resolved at call time.
    handlers = {
        '.pdf': parse_pdf,
        '.xlsx': parse_excel,
        '.xls': parse_excel,
        '.docx': parse_word,
        '.txt': parse_text,
        '.csv': parse_text,
    }
    handler = handlers.get(extension)
    if handler is None:
        # Only reachable if SUPPORTED_EXTENSIONS lists a suffix that has no
        # entry in the table above.
        raise ValueError(f"No parser for extension: {extension}")
    return handler(file_path)