"""
File Parser Tool - Multi-format file reading
Author: @mangubee
Date: 2026-01-02

Provides file parsing for:
- PDF files (.pdf) using PyPDF2
- Excel files (.xlsx, .xls) using openpyxl
- Word documents (.docx) using python-docx
- Text files (.txt, .csv) using built-in open()

All parsers include retry logic and error handling. Transient I/O errors
(OSError other than FileNotFoundError) are retried with exponential
back-off; a missing file fails immediately.
"""

import logging
from pathlib import Path
from typing import Dict, List, Optional

from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception,
    retry_if_exception_type,
)

# ============================================================================
# CONFIG
# ============================================================================

MAX_RETRIES = 3
RETRY_MIN_WAIT = 1  # seconds
RETRY_MAX_WAIT = 5  # seconds

SUPPORTED_EXTENSIONS = {
    '.pdf': 'PDF',
    '.xlsx': 'Excel',
    '.xls': 'Excel',
    '.docx': 'Word',
    '.txt': 'Text',
    '.csv': 'CSV',
}

# ============================================================================
# Logging Setup
# ============================================================================

logger = logging.getLogger(__name__)

# ============================================================================
# Retry Policy
# ============================================================================


def _is_transient_io_error(exc: BaseException) -> bool:
    """Return True for OSErrors worth retrying.

    FileNotFoundError is a subclass of OSError, so a plain
    ``retry_if_exception_type(OSError)`` would also retry (and sleep on)
    missing files. Those are excluded here so they fail fast.
    """
    return isinstance(exc, OSError) and not isinstance(exc, FileNotFoundError)


# Shared decorator factory: tenacity builds a fresh Retrying controller for
# each function it wraps, so reusing this object across parsers is safe.
_io_retry = retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    retry=retry_if_exception(_is_transient_io_error),
    reraise=True,
)

# ============================================================================
# PDF Parser
# ============================================================================


@_io_retry
def parse_pdf(file_path: str) -> Dict:
    """
    Parse PDF file and extract text content.

    Args:
        file_path: Path to PDF file

    Returns:
        Dict with structure: {
            "content": str,      # Extracted text, one "--- Page N ---" section per non-empty page
            "pages": int,        # Number of pages (including pages without text)
            "file_type": "PDF",
            "file_path": str
        }

    Raises:
        FileNotFoundError: If file doesn't exist (not retried)
        ValueError: If file is corrupted or invalid
        IOError: For file reading errors (triggers retry)
    """
    try:
        from PyPDF2 import PdfReader

        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF file not found: {file_path}")

        logger.info(f"Parsing PDF: {file_path}")

        reader = PdfReader(str(path))
        num_pages = len(reader.pages)

        # Extract text from all pages
        content = []
        for page_num, page in enumerate(reader.pages, 1):
            # Guard against a None result (e.g. a page without a text layer)
            # so an empty page is skipped instead of failing the whole file.
            text = page.extract_text() or ""
            if text.strip():
                content.append(f"--- Page {page_num} ---\n{text}")

        full_content = "\n\n".join(content)

        logger.info(f"PDF parsed successfully: {num_pages} pages, {len(full_content)} chars")

        return {
            "content": full_content,
            "pages": num_pages,
            "file_type": "PDF",
            "file_path": file_path,
        }

    except FileNotFoundError as e:
        logger.error(f"PDF file not found: {e}")
        raise
    except OSError as e:
        logger.warning(f"PDF IO error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"PDF parsing error: {e}")
        # Chain the cause so the library traceback is preserved.
        raise ValueError(f"Failed to parse PDF: {str(e)}") from e


# ============================================================================
# Excel Parser
# ============================================================================


@_io_retry
def parse_excel(file_path: str) -> Dict:
    """
    Parse Excel file and extract data from all sheets.

    Args:
        file_path: Path to Excel file (.xlsx or .xls)

    Returns:
        Dict with structure: {
            "content": str,      # Tab-separated rows, one "=== Sheet: NAME ===" section per non-empty sheet
            "sheets": List[str], # Sheet names (all sheets, including empty ones)
            "file_type": "Excel",
            "file_path": str
        }

    Raises:
        FileNotFoundError: If file doesn't exist (not retried)
        ValueError: If file is corrupted or invalid
        IOError: For file reading errors (triggers retry)
    """
    # NOTE(review): openpyxl targets the OOXML formats; legacy binary .xls
    # files are presumably rejected by load_workbook and surface below as
    # ValueError -- confirm, and consider a dedicated .xls reader.
    try:
        from openpyxl import load_workbook

        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Excel file not found: {file_path}")

        logger.info(f"Parsing Excel: {file_path}")

        # data_only=True asks openpyxl for stored cell values rather than formulas.
        workbook = load_workbook(str(path), data_only=True)
        sheet_names = workbook.sheetnames

        # Extract data from all sheets
        content_parts = []
        for sheet_name in sheet_names:
            sheet = workbook[sheet_name]

            # Keep only rows that contain at least one value; empty cells
            # inside a kept row become empty strings.
            rows = [
                "\t".join(str(cell) if cell is not None else "" for cell in row)
                for row in sheet.iter_rows(values_only=True)
                if any(cell is not None for cell in row)
            ]

            if rows:
                sheet_content = f"=== Sheet: {sheet_name} ===\n" + "\n".join(rows)
                content_parts.append(sheet_content)

        full_content = "\n\n".join(content_parts)

        logger.info(f"Excel parsed successfully: {len(sheet_names)} sheets")

        return {
            "content": full_content,
            "sheets": sheet_names,
            "file_type": "Excel",
            "file_path": file_path,
        }

    except FileNotFoundError as e:
        logger.error(f"Excel file not found: {e}")
        raise
    except OSError as e:
        logger.warning(f"Excel IO error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Excel parsing error: {e}")
        raise ValueError(f"Failed to parse Excel: {str(e)}") from e


# ============================================================================
# Word Document Parser
# ============================================================================


@_io_retry
def parse_word(file_path: str) -> Dict:
    """
    Parse Word document and extract text content.

    Only body paragraphs exposed by ``doc.paragraphs`` are read; blank
    paragraphs are dropped.

    Args:
        file_path: Path to Word file (.docx)

    Returns:
        Dict with structure: {
            "content": str,      # Extracted text, paragraphs separated by blank lines
            "paragraphs": int,   # Number of non-blank paragraphs
            "file_type": "Word",
            "file_path": str
        }

    Raises:
        FileNotFoundError: If file doesn't exist (not retried)
        ValueError: If file is corrupted or invalid
        IOError: For file reading errors (triggers retry)
    """
    try:
        from docx import Document

        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Word file not found: {file_path}")

        logger.info(f"Parsing Word document: {file_path}")

        doc = Document(str(path))

        # Extract text from all paragraphs
        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
        full_content = "\n\n".join(paragraphs)

        logger.info(f"Word parsed successfully: {len(paragraphs)} paragraphs")

        return {
            "content": full_content,
            "paragraphs": len(paragraphs),
            "file_type": "Word",
            "file_path": file_path,
        }

    except FileNotFoundError as e:
        logger.error(f"Word file not found: {e}")
        raise
    except OSError as e:
        logger.warning(f"Word IO error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Word parsing error: {e}")
        raise ValueError(f"Failed to parse Word document: {str(e)}") from e


# ============================================================================
# Text/CSV Parser
# ============================================================================


@_io_retry
def parse_text(file_path: str) -> Dict:
    """
    Parse plain text or CSV file.

    The file is read as UTF-8 in one piece; CSV content is returned raw,
    not split into fields.

    Args:
        file_path: Path to text file (.txt or .csv)

    Returns:
        Dict with structure: {
            "content": str,
            "lines": int,        # 0 for an empty file; a trailing newline does not add a line
            "file_type": "Text" or "CSV",
            "file_path": str
        }

    Raises:
        FileNotFoundError: If file doesn't exist (not retried)
        ValueError: If the file is not valid UTF-8
        IOError: For file reading errors (triggers retry)
    """
    try:
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Text file not found: {file_path}")

        logger.info(f"Parsing text file: {file_path}")

        with open(path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Count newline-terminated lines, plus a final unterminated line if
        # there is one. (count('\n') + 1 reported 1 line for an empty file
        # and over-counted files ending in a newline.)
        lines = content.count('\n')
        if content and not content.endswith('\n'):
            lines += 1

        # Compare case-insensitively, consistent with parse_file's dispatch.
        file_type = "CSV" if path.suffix.lower() == '.csv' else "Text"

        logger.info(f"{file_type} file parsed successfully: {lines} lines")

        return {
            "content": content,
            "lines": lines,
            "file_type": file_type,
            "file_path": file_path,
        }

    except FileNotFoundError as e:
        logger.error(f"Text file not found: {e}")
        raise
    except OSError as e:
        logger.warning(f"Text file IO error (will retry): {e}")
        raise
    except UnicodeDecodeError as e:
        # UnicodeDecodeError is a ValueError subclass, not an OSError, so it
        # is never retried.
        logger.error(f"Text file encoding error: {e}")
        raise ValueError(f"Failed to decode text file (try UTF-8): {str(e)}") from e


# ============================================================================
# Unified File Parser
# ============================================================================


def parse_file(file_path: str) -> Dict:
    """
    Parse file based on extension, automatically selecting the right parser.

    The extension is matched case-insensitively against SUPPORTED_EXTENSIONS.

    Args:
        file_path: Path to file

    Returns:
        Dict with parsed content and metadata (shape depends on the parser)

    Raises:
        ValueError: If file type is not supported
        FileNotFoundError: If file doesn't exist
        Exception: For parsing errors
    """
    path = Path(file_path)
    extension = path.suffix.lower()

    if extension not in SUPPORTED_EXTENSIONS:
        raise ValueError(
            f"Unsupported file type: {extension}. "
            f"Supported: {', '.join(SUPPORTED_EXTENSIONS)}"
        )

    logger.info(f"Dispatching parser for {SUPPORTED_EXTENSIONS[extension]} file: {file_path}")

    # Built per call so the parser names are resolved at call time.
    parsers = {
        '.pdf': parse_pdf,
        '.xlsx': parse_excel,
        '.xls': parse_excel,
        '.docx': parse_word,
        '.txt': parse_text,
        '.csv': parse_text,
    }

    parser = parsers.get(extension)
    if parser is None:
        # Only reachable if SUPPORTED_EXTENSIONS gains an entry without a parser.
        raise ValueError(f"No parser for extension: {extension}")
    return parser(file_path)