| | """ |
| | File Parser Tool - Multi-format file reading |
| | Author: @mangobee |
| | Date: 2026-01-02 |
| | |
| | Provides file parsing for: |
| | - PDF files (.pdf) using PyPDF2 |
| | - Excel files (.xlsx, .xls) using openpyxl |
| | - Word documents (.docx) using python-docx |
| | - Text files (.txt, .csv) using built-in open() |
| | |
| | All parsers include retry logic and error handling. |
| | """ |
| |
|
import logging
from pathlib import Path
from typing import Dict, List, Optional

from tenacity import (
    retry,
    retry_if_exception,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)
| |
|
| | |
| | |
| | |
# Retry policy shared by the @retry decorators on the individual parsers.
MAX_RETRIES = 3      # handed to stop_after_attempt()
RETRY_MIN_WAIT = 1   # handed to wait_exponential(min=...)
RETRY_MAX_WAIT = 5   # handed to wait_exponential(max=...)

# Lower-case file suffix -> human-readable label. parse_file() uses the keys to
# validate an extension and the values in its dispatch log message.
SUPPORTED_EXTENSIONS = {
    '.pdf': 'PDF',
    '.xlsx': 'Excel',
    '.xls': 'Excel',
    '.docx': 'Word',
    '.txt': 'Text',
    '.csv': 'CSV',
}

# Module-level logger used by every parser in this file.
logger = logging.getLogger(__name__)
| |
|
| |
|
| | |
| | |
| | |
| |
|
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    # FileNotFoundError is a subclass of OSError; exclude it so a missing file
    # fails immediately instead of being retried with backoff.
    retry=retry_if_exception(
        lambda exc: isinstance(exc, OSError) and not isinstance(exc, FileNotFoundError)
    ),
    reraise=True,
)
def parse_pdf(file_path: str) -> Dict:
    """
    Parse PDF file and extract text content.

    Pages whose extracted text is empty or whitespace-only are left out of
    "content"; every remaining page is prefixed with a "--- Page N ---" header.
    "pages" always reports the total page count, including skipped pages.

    Args:
        file_path: Path to PDF file

    Returns:
        Dict with structure: {
            "content": str,      # Extracted text
            "pages": int,        # Number of pages
            "file_type": "PDF",
            "file_path": str
        }

    Raises:
        FileNotFoundError: If file doesn't exist (raised immediately, not retried)
        ValueError: If parsing fails for any non-I/O reason, e.g. a corrupted
            file or a failed PyPDF2 import (the import happens inside the try)
        IOError: For file reading errors (retried up to MAX_RETRIES attempts)
    """
    try:
        from PyPDF2 import PdfReader

        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF file not found: {file_path}")

        logger.info(f"Parsing PDF: {file_path}")

        reader = PdfReader(str(path))
        num_pages = len(reader.pages)

        content = []
        for page_num, page in enumerate(reader.pages, 1):
            # Treat a falsy result (None or "") as "no text" so a single
            # text-less page cannot abort the whole document.
            text = page.extract_text() or ""
            if text.strip():
                content.append(f"--- Page {page_num} ---\n{text}")

        full_content = "\n\n".join(content)

        logger.info(f"PDF parsed successfully: {num_pages} pages, {len(full_content)} chars")

        return {
            "content": full_content,
            "pages": num_pages,
            "file_type": "PDF",
            "file_path": file_path,
        }

    except FileNotFoundError as e:
        logger.error(f"PDF file not found: {e}")
        raise
    except OSError as e:
        # IOError is an alias of OSError; re-raise unchanged so @retry sees it.
        logger.warning(f"PDF IO error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"PDF parsing error: {e}")
        # Chain the cause so the original traceback is preserved.
        raise ValueError(f"Failed to parse PDF: {str(e)}") from e
| |
|
| |
|
| | |
| | |
| | |
| |
|
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    # FileNotFoundError is a subclass of OSError; exclude it so a missing file
    # fails immediately instead of being retried with backoff.
    retry=retry_if_exception(
        lambda exc: isinstance(exc, OSError) and not isinstance(exc, FileNotFoundError)
    ),
    reraise=True,
)
def parse_excel(file_path: str) -> Dict:
    """
    Parse Excel file and extract data from all sheets.

    Each non-empty row becomes one tab-separated line (None cells rendered as
    empty strings). Sheets with no non-empty rows are omitted from "content"
    but are still listed in "sheets".

    NOTE(review): the workbook is loaded with openpyxl, which presumably only
    reads the modern .xlsx family; legacy .xls input is expected to surface
    as ValueError -- confirm against the openpyxl documentation.

    Args:
        file_path: Path to Excel file (.xlsx or .xls)

    Returns:
        Dict with structure: {
            "content": str,      # Formatted table data
            "sheets": List[str], # Sheet names
            "file_type": "Excel",
            "file_path": str
        }

    Raises:
        FileNotFoundError: If file doesn't exist (raised immediately, not retried)
        ValueError: If parsing fails for any non-I/O reason, e.g. a corrupted
            file or a failed openpyxl import (the import happens inside the try)
        IOError: For file reading errors (retried up to MAX_RETRIES attempts)
    """
    try:
        from openpyxl import load_workbook

        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Excel file not found: {file_path}")

        logger.info(f"Parsing Excel: {file_path}")

        # data_only=True is passed through to openpyxl unchanged.
        workbook = load_workbook(str(path), data_only=True)
        sheet_names = workbook.sheetnames

        content_parts = []
        for sheet_name in sheet_names:
            sheet = workbook[sheet_name]

            # Keep only rows that contain at least one non-None cell.
            rows = [
                "\t".join(str(cell) if cell is not None else "" for cell in row)
                for row in sheet.iter_rows(values_only=True)
                if any(cell is not None for cell in row)
            ]

            if rows:
                sheet_content = f"=== Sheet: {sheet_name} ===\n" + "\n".join(rows)
                content_parts.append(sheet_content)

        full_content = "\n\n".join(content_parts)

        logger.info(f"Excel parsed successfully: {len(sheet_names)} sheets")

        return {
            "content": full_content,
            "sheets": sheet_names,
            "file_type": "Excel",
            "file_path": file_path,
        }

    except FileNotFoundError as e:
        logger.error(f"Excel file not found: {e}")
        raise
    except OSError as e:
        # IOError is an alias of OSError; re-raise unchanged so @retry sees it.
        logger.warning(f"Excel IO error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Excel parsing error: {e}")
        # Chain the cause so the original traceback is preserved.
        raise ValueError(f"Failed to parse Excel: {str(e)}") from e
| |
|
| |
|
| | |
| | |
| | |
| |
|
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    # FileNotFoundError is a subclass of OSError; exclude it so a missing file
    # fails immediately instead of being retried with backoff.
    retry=retry_if_exception(
        lambda exc: isinstance(exc, OSError) and not isinstance(exc, FileNotFoundError)
    ),
    reraise=True,
)
def parse_word(file_path: str) -> Dict:
    """
    Parse Word document and extract text content.

    Only entries of ``doc.paragraphs`` are read; paragraphs whose text is
    empty or whitespace-only are dropped, and the rest are joined with blank
    lines. "paragraphs" counts the paragraphs that were kept.

    Args:
        file_path: Path to Word file (.docx)

    Returns:
        Dict with structure: {
            "content": str,      # Extracted text
            "paragraphs": int,   # Number of paragraphs
            "file_type": "Word",
            "file_path": str
        }

    Raises:
        FileNotFoundError: If file doesn't exist (raised immediately, not retried)
        ValueError: If parsing fails for any non-I/O reason, e.g. a corrupted
            file or a failed python-docx import (the import happens inside the try)
        IOError: For file reading errors (retried up to MAX_RETRIES attempts)
    """
    try:
        from docx import Document

        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Word file not found: {file_path}")

        logger.info(f"Parsing Word document: {file_path}")

        doc = Document(str(path))

        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
        full_content = "\n\n".join(paragraphs)

        logger.info(f"Word parsed successfully: {len(paragraphs)} paragraphs")

        return {
            "content": full_content,
            "paragraphs": len(paragraphs),
            "file_type": "Word",
            "file_path": file_path,
        }

    except FileNotFoundError as e:
        logger.error(f"Word file not found: {e}")
        raise
    except OSError as e:
        # IOError is an alias of OSError; re-raise unchanged so @retry sees it.
        logger.warning(f"Word IO error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Word parsing error: {e}")
        # Chain the cause so the original traceback is preserved.
        raise ValueError(f"Failed to parse Word document: {str(e)}") from e
| |
|
| |
|
| | |
| | |
| | |
| |
|
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    # FileNotFoundError is a subclass of OSError; exclude it so a missing file
    # fails immediately instead of being retried with backoff.
    retry=retry_if_exception(
        lambda exc: isinstance(exc, OSError) and not isinstance(exc, FileNotFoundError)
    ),
    reraise=True,
)
def parse_text(file_path: str) -> Dict:
    """
    Parse plain text or CSV file.

    The file is read in full as UTF-8 (a leading byte-order mark, if present,
    is stripped). The content is returned verbatim; CSV files are not split
    into fields.

    Args:
        file_path: Path to text file (.txt or .csv)

    Returns:
        Dict with structure: {
            "content": str,
            "lines": int,        # 0 for an empty file; a trailing newline
                                 # does not count as an extra line
            "file_type": "Text" or "CSV",
            "file_path": str
        }

    Raises:
        FileNotFoundError: If file doesn't exist (raised immediately, not retried)
        ValueError: If the file content is not valid UTF-8
        IOError: For file reading errors (retried up to MAX_RETRIES attempts)
    """
    try:
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Text file not found: {file_path}")

        logger.info(f"Parsing text file: {file_path}")

        # utf-8-sig decodes ordinary UTF-8 and additionally drops a leading
        # BOM, which would otherwise end up as U+FEFF at the start of content.
        with open(path, 'r', encoding='utf-8-sig') as f:
            content = f.read()

        # Count newline-terminated lines, plus one for a final line that has
        # no trailing newline. An empty file therefore has 0 lines.
        lines = content.count('\n')
        if content and not content.endswith('\n'):
            lines += 1

        # Compare case-insensitively, matching the lowercased suffix that
        # parse_file() dispatches on (e.g. "DATA.CSV").
        file_type = "CSV" if path.suffix.lower() == '.csv' else "Text"

        logger.info(f"{file_type} file parsed successfully: {lines} lines")

        return {
            "content": content,
            "lines": lines,
            "file_type": file_type,
            "file_path": file_path,
        }

    except FileNotFoundError as e:
        logger.error(f"Text file not found: {e}")
        raise
    except UnicodeDecodeError as e:
        # UnicodeDecodeError derives from ValueError, not OSError, so it is
        # never retried; chain the cause so the original traceback survives.
        logger.error(f"Text file encoding error: {e}")
        raise ValueError(f"Failed to decode text file as UTF-8: {str(e)}") from e
    except OSError as e:
        # IOError is an alias of OSError; re-raise unchanged so @retry sees it.
        logger.warning(f"Text file IO error (will retry): {e}")
        raise
| |
|
| |
|
| | |
| | |
| | |
| |
|
def parse_file(file_path: str) -> Dict:
    """
    Route a file to the parser that matches its extension.

    The extension is compared case-insensitively against SUPPORTED_EXTENSIONS.

    Args:
        file_path: Path to the file to parse

    Returns:
        Dict produced by the selected parser (parsed content plus metadata)

    Raises:
        ValueError: If the extension is not supported or has no parser
        FileNotFoundError: Propagated from the selected parser when the file
            doesn't exist
        Exception: Any other error raised by the selected parser
    """
    extension = Path(file_path).suffix.lower()

    # Guard clause: reject unknown extensions before any parser is touched.
    if extension not in SUPPORTED_EXTENSIONS:
        raise ValueError(
            f"Unsupported file type: {extension}. "
            f"Supported: {', '.join(SUPPORTED_EXTENSIONS.keys())}"
        )

    logger.info(f"Dispatching parser for {SUPPORTED_EXTENSIONS[extension]} file: {file_path}")

    # Extension -> parser lookup table.
    parsers = {
        '.pdf': parse_pdf,
        '.xlsx': parse_excel,
        '.xls': parse_excel,
        '.docx': parse_word,
        '.txt': parse_text,
        '.csv': parse_text,
    }

    handler = parsers.get(extension)
    if handler is None:
        # Defensive: only reachable if SUPPORTED_EXTENSIONS lists an extension
        # that has no entry in the table above.
        raise ValueError(f"No parser for extension: {extension}")

    return handler(file_path)
| |
|