agentbee

Running

File size: 11,087 Bytes

"""
File Parser Tool - Multi-format file reading
Author: @mangubee
Date: 2026-01-02

Provides file parsing for:
- PDF files (.pdf) using PyPDF2
- Excel files (.xlsx, .xls) using openpyxl
- Word documents (.docx) using python-docx
- Text files (.txt, .csv) using built-in open()

All parsers include retry logic and error handling.
"""

import logging
from pathlib import Path
from typing import Dict, List, Optional

from tenacity import (
    retry,
    retry_if_exception_type,
    retry_if_not_exception_type,
    stop_after_attempt,
    wait_exponential,
)

# ============================================================================
# CONFIG
# ============================================================================
# Retry policy shared by the @retry decorators on the parser functions below.
MAX_RETRIES = 3  # total attempts per call (fed to stop_after_attempt)
RETRY_MIN_WAIT = 1  # seconds
RETRY_MAX_WAIT = 5  # seconds

# Maps a lowercase file extension (including the leading dot) to a
# human-readable type label. parse_file() uses it both to reject unsupported
# extensions and to build its dispatch log message.
# NOTE(review): '.xls' is routed to the openpyxl-based Excel parser; openpyxl
# is documented for the .xlsx family — confirm legacy .xls files actually load.
SUPPORTED_EXTENSIONS = {
    '.pdf': 'PDF',
    '.xlsx': 'Excel',
    '.xls': 'Excel',
    '.docx': 'Word',
    '.txt': 'Text',
    '.csv': 'CSV',
}

# ============================================================================
# Logging Setup
# ============================================================================
# Module-level logger named after this module; no handlers are configured here.
logger = logging.getLogger(__name__)


# ============================================================================
# PDF Parser
# ============================================================================

@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    # FileNotFoundError is a subclass of OSError, so it must be excluded
    # explicitly; otherwise a missing file is retried with backoff even though
    # retrying cannot help.
    retry=(
        retry_if_exception_type((IOError, OSError))
        & retry_if_not_exception_type(FileNotFoundError)
    ),
    reraise=True,
)
def parse_pdf(file_path: str) -> Dict:
    """
    Parse PDF file and extract text content.

    Pages whose extracted text is empty or whitespace-only are skipped in the
    output, but still counted in "pages". Each kept page is prefixed with a
    "--- Page N ---" header (1-based) and pages are separated by a blank line.

    Args:
        file_path: Path to PDF file

    Returns:
        Dict with structure: {
            "content": str,  # Extracted text
            "pages": int,    # Number of pages
            "file_type": "PDF",
            "file_path": str
        }

    Raises:
        FileNotFoundError: If file doesn't exist (raised immediately, no retry)
        ValueError: If file is corrupted or invalid, or for any other
            non-IO failure (this includes a failed PyPDF2 import, because the
            import happens inside the guarded block)
        IOError: For file reading errors (triggers retry)
    """
    try:
        from PyPDF2 import PdfReader

        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF file not found: {file_path}")

        logger.info(f"Parsing PDF: {file_path}")

        reader = PdfReader(str(path))
        num_pages = len(reader.pages)

        # Extract text from all pages
        content = []
        for page_num, page in enumerate(reader.pages, 1):
            # Treat a missing (None) extraction result as empty text instead
            # of failing on .strip().
            text = page.extract_text() or ""
            if text.strip():
                content.append(f"--- Page {page_num} ---\n{text}")

        full_content = "\n\n".join(content)

        logger.info(f"PDF parsed successfully: {num_pages} pages, {len(full_content)} chars")

        return {
            "content": full_content,
            "pages": num_pages,
            "file_type": "PDF",
            "file_path": file_path,
        }

    except FileNotFoundError as e:
        logger.error(f"PDF file not found: {e}")
        raise
    except (IOError, OSError) as e:
        logger.warning(f"PDF IO error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"PDF parsing error: {e}")
        # Chain the original exception so the root cause stays in the traceback.
        raise ValueError(f"Failed to parse PDF: {str(e)}") from e


# ============================================================================
# Excel Parser
# ============================================================================

@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    # FileNotFoundError is a subclass of OSError, so it must be excluded
    # explicitly; otherwise a missing file is retried with backoff even though
    # retrying cannot help.
    retry=(
        retry_if_exception_type((IOError, OSError))
        & retry_if_not_exception_type(FileNotFoundError)
    ),
    reraise=True,
)
def parse_excel(file_path: str) -> Dict:
    """
    Parse Excel file and extract data from all sheets.

    Each non-empty sheet is rendered as a "=== Sheet: <name> ===" header
    followed by one line per row, with cells joined by tabs and empty cells
    rendered as empty strings. Rows where every cell is None are dropped, and
    sheets with no remaining rows are omitted from "content" (they are still
    listed in "sheets").

    Args:
        file_path: Path to Excel file (.xlsx or .xls)
            NOTE(review): the workbook is loaded with openpyxl; confirm that
            legacy .xls input is actually readable by it.

    Returns:
        Dict with structure: {
            "content": str,      # Formatted table data
            "sheets": List[str], # Sheet names
            "file_type": "Excel",
            "file_path": str
        }

    Raises:
        FileNotFoundError: If file doesn't exist (raised immediately, no retry)
        ValueError: If file is corrupted or invalid, or for any other
            non-IO failure (this includes a failed openpyxl import, because
            the import happens inside the guarded block)
        IOError: For file reading errors (triggers retry)
    """
    try:
        from openpyxl import load_workbook

        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Excel file not found: {file_path}")

        logger.info(f"Parsing Excel: {file_path}")

        # data_only=True asks openpyxl for stored cell values rather than
        # formula text (see the openpyxl load_workbook documentation).
        workbook = load_workbook(str(path), data_only=True)
        sheet_names = workbook.sheetnames

        # Extract data from all sheets
        content_parts = []
        for sheet_name in sheet_names:
            sheet = workbook[sheet_name]

            # Get all values
            rows = []
            for row in sheet.iter_rows(values_only=True):
                # Filter out completely empty rows
                if any(cell is not None for cell in row):
                    row_str = "\t".join(str(cell) if cell is not None else "" for cell in row)
                    rows.append(row_str)

            if rows:
                sheet_content = f"=== Sheet: {sheet_name} ===\n" + "\n".join(rows)
                content_parts.append(sheet_content)

        full_content = "\n\n".join(content_parts)

        logger.info(f"Excel parsed successfully: {len(sheet_names)} sheets")

        return {
            "content": full_content,
            "sheets": sheet_names,
            "file_type": "Excel",
            "file_path": file_path,
        }

    except FileNotFoundError as e:
        logger.error(f"Excel file not found: {e}")
        raise
    except (IOError, OSError) as e:
        logger.warning(f"Excel IO error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Excel parsing error: {e}")
        # Chain the original exception so the root cause stays in the traceback.
        raise ValueError(f"Failed to parse Excel: {str(e)}") from e


# ============================================================================
# Word Document Parser
# ============================================================================

@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    # FileNotFoundError is a subclass of OSError, so it must be excluded
    # explicitly; otherwise a missing file is retried with backoff even though
    # retrying cannot help.
    retry=(
        retry_if_exception_type((IOError, OSError))
        & retry_if_not_exception_type(FileNotFoundError)
    ),
    reraise=True,
)
def parse_word(file_path: str) -> Dict:
    """
    Parse Word document and extract text content.

    Only the entries of ``doc.paragraphs`` are read; paragraphs whose text is
    empty or whitespace-only are skipped, and the rest are joined with a blank
    line between them.

    Args:
        file_path: Path to Word file (.docx)

    Returns:
        Dict with structure: {
            "content": str,        # Extracted text
            "paragraphs": int,     # Number of non-empty paragraphs kept
            "file_type": "Word",
            "file_path": str
        }

    Raises:
        FileNotFoundError: If file doesn't exist (raised immediately, no retry)
        ValueError: If file is corrupted or invalid, or for any other
            non-IO failure (this includes a failed python-docx import, because
            the import happens inside the guarded block)
        IOError: For file reading errors (triggers retry)
    """
    try:
        from docx import Document

        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Word file not found: {file_path}")

        logger.info(f"Parsing Word document: {file_path}")

        doc = Document(str(path))

        # Extract text from all paragraphs
        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
        full_content = "\n\n".join(paragraphs)

        logger.info(f"Word parsed successfully: {len(paragraphs)} paragraphs")

        return {
            "content": full_content,
            "paragraphs": len(paragraphs),
            "file_type": "Word",
            "file_path": file_path,
        }

    except FileNotFoundError as e:
        logger.error(f"Word file not found: {e}")
        raise
    except (IOError, OSError) as e:
        logger.warning(f"Word IO error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Word parsing error: {e}")
        # Chain the original exception so the root cause stays in the traceback.
        raise ValueError(f"Failed to parse Word document: {str(e)}") from e


# ============================================================================
# Text/CSV Parser
# ============================================================================

@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    # FileNotFoundError is a subclass of OSError, so it must be excluded
    # explicitly; otherwise a missing file is retried with backoff even though
    # retrying cannot help.
    retry=(
        retry_if_exception_type((IOError, OSError))
        & retry_if_not_exception_type(FileNotFoundError)
    ),
    reraise=True,
)
def parse_text(file_path: str) -> Dict:
    """
    Parse plain text or CSV file.

    The file is read in full as UTF-8 and returned unmodified; CSV content is
    not split into fields.

    Args:
        file_path: Path to text file (.txt or .csv)

    Returns:
        Dict with structure: {
            "content": str,
            "lines": int,   # 0 for an empty file; a trailing newline does
                            # not count as an extra line
            "file_type": "Text" or "CSV",
            "file_path": str
        }

    Raises:
        FileNotFoundError: If file doesn't exist (raised immediately, no retry)
        ValueError: If the file content is not valid UTF-8
        IOError: For file reading errors (triggers retry)
    """
    try:
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Text file not found: {file_path}")

        logger.info(f"Parsing text file: {file_path}")

        with open(path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Count newline-terminated lines, plus one more for a final line that
        # has no trailing newline. A bare count('\n') + 1 would report 1 line
        # for an empty file and over-count files that end with a newline.
        lines = content.count('\n')
        if content and not content.endswith('\n'):
            lines += 1

        # Compare case-insensitively so an upper-case extension such as
        # "DATA.CSV" is still labelled as CSV.
        file_type = "CSV" if path.suffix.lower() == '.csv' else "Text"

        logger.info(f"{file_type} file parsed successfully: {lines} lines")

        return {
            "content": content,
            "lines": lines,
            "file_type": file_type,
            "file_path": file_path,
        }

    except FileNotFoundError as e:
        logger.error(f"Text file not found: {e}")
        raise
    except (IOError, OSError) as e:
        logger.warning(f"Text file IO error (will retry): {e}")
        raise
    except UnicodeDecodeError as e:
        # UnicodeDecodeError derives from ValueError, not OSError, so it is
        # never retried; re-raise with a friendlier message and keep the cause.
        logger.error(f"Text file encoding error: {e}")
        raise ValueError(f"Failed to decode text file (try UTF-8): {str(e)}") from e


# ============================================================================
# Unified File Parser
# ============================================================================

def parse_file(file_path: str) -> Dict:
    """
    Route a file to the parser that matches its (case-insensitive) extension.

    Args:
        file_path: Path to file

    Returns:
        Dict with parsed content and metadata, exactly as produced by the
        selected parser

    Raises:
        ValueError: If file type is not supported
        FileNotFoundError: If file doesn't exist
        Exception: For parsing errors
    """
    extension = Path(file_path).suffix.lower()

    # Guard clause: reject anything that is not a known extension.
    if extension not in SUPPORTED_EXTENSIONS:
        supported = ', '.join(SUPPORTED_EXTENSIONS.keys())
        raise ValueError(
            f"Unsupported file type: {extension}. "
            f"Supported: {supported}"
        )

    type_label = SUPPORTED_EXTENSIONS[extension]
    logger.info(f"Dispatching parser for {type_label} file: {file_path}")

    # Extension -> parser lookup table
    parsers = {
        '.pdf': parse_pdf,
        '.xlsx': parse_excel,
        '.xls': parse_excel,
        '.docx': parse_word,
        '.txt': parse_text,
        '.csv': parse_text,
    }
    handler = parsers.get(extension)
    if handler is None:
        # Only reachable if SUPPORTED_EXTENSIONS lists an extension that has
        # no entry in the table above.
        raise ValueError(f"No parser for extension: {extension}")

    return handler(file_path)