|
|
""" |
|
|
File Parser Tool - Multi-format file reading |
|
|
Author: @mangubee |
|
|
Date: 2026-01-02 |
|
|
|
|
|
Provides file parsing for: |
|
|
- PDF files (.pdf) using PyPDF2 |
|
|
- Excel files (.xlsx; .xls is routed here too, but openpyxl cannot read the legacy format) using openpyxl |
|
|
- Word documents (.docx) using python-docx |
|
|
- Text files (.txt, .csv) using built-in open() |
|
|
|
|
|
All parsers include retry logic and error handling. |
|
|
""" |
|
|
|
|
|
import logging
from pathlib import Path
from typing import Dict, List, Optional

from tenacity import (
    retry,
    retry_if_exception_type,
    retry_if_not_exception_type,
    stop_after_attempt,
    wait_exponential,
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Retry policy handed to the tenacity decorators on the parser functions:
# the attempt limit, and the lower/upper bounds given to wait_exponential.
MAX_RETRIES = 3
RETRY_MIN_WAIT = 1
RETRY_MAX_WAIT = 5

# Lower-case file suffix -> human-readable file-type label.
# Insertion order is significant: it is the order in which the suffixes are
# listed in the "Unsupported file type" error message.
SUPPORTED_EXTENSIONS = {
    ".pdf": "PDF",
    ".xlsx": "Excel",
    ".xls": "Excel",
    ".docx": "Word",
    ".txt": "Text",
    ".csv": "CSV",
}

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    # FileNotFoundError is a subclass of OSError (IOError is an alias of
    # OSError), so it has to be excluded explicitly; otherwise a missing file
    # would be retried with backoff instead of failing immediately.
    retry=(
        retry_if_exception_type(OSError)
        & retry_if_not_exception_type(FileNotFoundError)
    ),
    reraise=True,
)
def parse_pdf(file_path: str) -> Dict:
    """
    Parse PDF file and extract text content.

    Pages whose extracted text is empty or whitespace-only are left out of
    ``content`` but are still counted in ``pages``.

    Args:
        file_path: Path to PDF file

    Returns:
        Dict with structure: {
            "content": str,     # Extracted text, one "--- Page N ---" section per page
            "pages": int,       # Number of pages in the document
            "file_type": "PDF",
            "file_path": str    # The path exactly as passed in
        }

    Raises:
        FileNotFoundError: If file doesn't exist (not retried)
        ValueError: If file is corrupted or invalid, or PyPDF2 fails for any
            other non-IO reason (the original exception is chained)
        IOError: For file reading errors (triggers retry)
    """
    try:
        # Imported lazily, inside the try: an import failure is reported as
        # ValueError by the generic handler below.
        from PyPDF2 import PdfReader

        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF file not found: {file_path}")

        logger.info(f"Parsing PDF: {file_path}")

        reader = PdfReader(str(path))
        num_pages = len(reader.pages)

        content = []
        for page_num, page in enumerate(reader.pages, 1):
            # Defensive: treat a None result as empty text rather than
            # failing the whole document on .strip().
            text = page.extract_text() or ""
            if text.strip():
                content.append(f"--- Page {page_num} ---\n{text}")

        full_content = "\n\n".join(content)

        logger.info(f"PDF parsed successfully: {num_pages} pages, {len(full_content)} chars")

        return {
            "content": full_content,
            "pages": num_pages,
            "file_type": "PDF",
            "file_path": file_path,
        }

    except FileNotFoundError as e:
        # Must precede the OSError handler: FileNotFoundError is an OSError.
        logger.error(f"PDF file not found: {e}")
        raise
    except OSError as e:
        logger.warning(f"PDF IO error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"PDF parsing error: {e}")
        # Chain the cause so the original traceback is preserved.
        raise ValueError(f"Failed to parse PDF: {str(e)}") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    # FileNotFoundError is a subclass of OSError (IOError is an alias of
    # OSError), so it has to be excluded explicitly; otherwise a missing file
    # would be retried with backoff instead of failing immediately.
    retry=(
        retry_if_exception_type(OSError)
        & retry_if_not_exception_type(FileNotFoundError)
    ),
    reraise=True,
)
def parse_excel(file_path: str) -> Dict:
    """
    Parse Excel file and extract data from all sheets.

    Each non-empty row is rendered as tab-separated cell values (empty cells
    become empty strings); rows where every cell is None are skipped, and
    sheets without any remaining rows are left out of ``content``.

    Args:
        file_path: Path to Excel file (.xlsx or .xls)

    Returns:
        Dict with structure: {
            "content": str,       # "=== Sheet: NAME ===" sections of tab-separated rows
            "sheets": List[str],  # All sheet names, including empty sheets
            "file_type": "Excel",
            "file_path": str      # The path exactly as passed in
        }

    Raises:
        FileNotFoundError: If file doesn't exist (not retried)
        ValueError: If file is corrupted or invalid (the original exception
            is chained). NOTE(review): openpyxl does not read the legacy
            .xls format, so .xls inputs are expected to end up here.
        IOError: For file reading errors (triggers retry)
    """
    try:
        # Imported lazily, inside the try: an import failure is reported as
        # ValueError by the generic handler below.
        from openpyxl import load_workbook

        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Excel file not found: {file_path}")

        logger.info(f"Parsing Excel: {file_path}")

        # data_only=True: ask openpyxl for stored cell values instead of formulas.
        workbook = load_workbook(str(path), data_only=True)
        sheet_names = workbook.sheetnames

        content_parts = []
        for sheet_name in sheet_names:
            sheet = workbook[sheet_name]

            rows = [
                "\t".join(str(cell) if cell is not None else "" for cell in row)
                for row in sheet.iter_rows(values_only=True)
                if any(cell is not None for cell in row)
            ]

            if rows:
                sheet_content = f"=== Sheet: {sheet_name} ===\n" + "\n".join(rows)
                content_parts.append(sheet_content)

        full_content = "\n\n".join(content_parts)

        logger.info(f"Excel parsed successfully: {len(sheet_names)} sheets")

        return {
            "content": full_content,
            "sheets": sheet_names,
            "file_type": "Excel",
            "file_path": file_path,
        }

    except FileNotFoundError as e:
        # Must precede the OSError handler: FileNotFoundError is an OSError.
        logger.error(f"Excel file not found: {e}")
        raise
    except OSError as e:
        logger.warning(f"Excel IO error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Excel parsing error: {e}")
        # Chain the cause so the original traceback is preserved.
        raise ValueError(f"Failed to parse Excel: {str(e)}") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    # FileNotFoundError is a subclass of OSError (IOError is an alias of
    # OSError), so it has to be excluded explicitly; otherwise a missing file
    # would be retried with backoff instead of failing immediately.
    retry=(
        retry_if_exception_type(OSError)
        & retry_if_not_exception_type(FileNotFoundError)
    ),
    reraise=True,
)
def parse_word(file_path: str) -> Dict:
    """
    Parse Word document and extract text content.

    Only the document's paragraphs are read; paragraphs whose text is empty
    or whitespace-only are dropped and not counted.

    Args:
        file_path: Path to Word file (.docx)

    Returns:
        Dict with structure: {
            "content": str,      # Non-empty paragraphs joined by blank lines
            "paragraphs": int,   # Number of non-empty paragraphs
            "file_type": "Word",
            "file_path": str     # The path exactly as passed in
        }

    Raises:
        FileNotFoundError: If file doesn't exist (not retried)
        ValueError: If file is corrupted or invalid (the original exception
            is chained)
        IOError: For file reading errors (triggers retry)
    """
    try:
        # Imported lazily, inside the try: an import failure is reported as
        # ValueError by the generic handler below.
        from docx import Document

        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Word file not found: {file_path}")

        logger.info(f"Parsing Word document: {file_path}")

        doc = Document(str(path))

        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
        full_content = "\n\n".join(paragraphs)

        logger.info(f"Word parsed successfully: {len(paragraphs)} paragraphs")

        return {
            "content": full_content,
            "paragraphs": len(paragraphs),
            "file_type": "Word",
            "file_path": file_path,
        }

    except FileNotFoundError as e:
        # Must precede the OSError handler: FileNotFoundError is an OSError.
        logger.error(f"Word file not found: {e}")
        raise
    except OSError as e:
        logger.warning(f"Word IO error (will retry): {e}")
        raise
    except Exception as e:
        logger.error(f"Word parsing error: {e}")
        # Chain the cause so the original traceback is preserved.
        raise ValueError(f"Failed to parse Word document: {str(e)}") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=1, min=RETRY_MIN_WAIT, max=RETRY_MAX_WAIT),
    # FileNotFoundError is a subclass of OSError (IOError is an alias of
    # OSError), so it has to be excluded explicitly; otherwise a missing file
    # would be retried with backoff instead of failing immediately.
    retry=(
        retry_if_exception_type(OSError)
        & retry_if_not_exception_type(FileNotFoundError)
    ),
    reraise=True,
)
def parse_text(file_path: str) -> Dict:
    """
    Parse plain text or CSV file.

    The file is read in full as UTF-8 and returned unmodified; CSV content is
    not split into fields.

    Args:
        file_path: Path to text file (.txt or .csv)

    Returns:
        Dict with structure: {
            "content": str,               # Entire file content
            "lines": int,                 # Number of lines (0 for an empty file)
            "file_type": "Text" or "CSV", # "CSV" when the suffix is .csv (any case)
            "file_path": str              # The path exactly as passed in
        }

    Raises:
        FileNotFoundError: If file doesn't exist (not retried)
        ValueError: If the file is not valid UTF-8 (the original
            UnicodeDecodeError is chained)
        IOError: For file reading errors (triggers retry)
    """
    try:
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Text file not found: {file_path}")

        logger.info(f"Parsing text file: {file_path}")

        with open(path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Count newline-terminated lines, plus one for a final line that has
        # no trailing newline. A bare `count('\n') + 1` reports 1 for an empty
        # file and over-counts files that end with a newline.
        lines = content.count('\n')
        if content and not content.endswith('\n'):
            lines += 1

        # Compare case-insensitively so upper-case suffixes such as ".CSV",
        # which parse_file routes here after lower-casing, are labelled CSV.
        file_type = "CSV" if path.suffix.lower() == '.csv' else "Text"

        logger.info(f"{file_type} file parsed successfully: {lines} lines")

        return {
            "content": content,
            "lines": lines,
            "file_type": file_type,
            "file_path": file_path,
        }

    except FileNotFoundError as e:
        # Must precede the OSError handler: FileNotFoundError is an OSError.
        logger.error(f"Text file not found: {e}")
        raise
    except UnicodeDecodeError as e:
        # UnicodeDecodeError is a ValueError, not an OSError, so it is never retried.
        logger.error(f"Text file encoding error: {e}")
        raise ValueError(f"Failed to decode text file (try UTF-8): {str(e)}") from e
    except OSError as e:
        logger.warning(f"Text file IO error (will retry): {e}")
        raise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_file(file_path: str) -> Dict:
    """
    Route a file to the parser that matches its (lower-cased) extension.

    Args:
        file_path: Path to file

    Returns:
        Dict produced by the selected parser (content plus metadata)

    Raises:
        ValueError: If the extension is not in SUPPORTED_EXTENSIONS, or no
            parser is registered for it
        FileNotFoundError: If file doesn't exist (raised by the parser)
        Exception: For parsing errors raised by the parser
    """
    extension = Path(file_path).suffix.lower()

    # Guard clause: reject unknown extensions before any parser is touched.
    if extension not in SUPPORTED_EXTENSIONS:
        supported = ', '.join(SUPPORTED_EXTENSIONS.keys())
        raise ValueError(f"Unsupported file type: {extension}. Supported: {supported}")

    kind = SUPPORTED_EXTENSIONS[extension]
    logger.info(f"Dispatching parser for {kind} file: {file_path}")

    # Extension -> parser lookup table.
    handlers = {
        '.pdf': parse_pdf,
        '.xlsx': parse_excel,
        '.xls': parse_excel,
        '.docx': parse_word,
        '.txt': parse_text,
        '.csv': parse_text,
    }

    handler = handlers.get(extension)
    if handler is None:
        # Only reachable if SUPPORTED_EXTENSIONS lists a suffix that has no
        # entry in the table above.
        raise ValueError(f"No parser for extension: {extension}")

    return handler(file_path)
|
|
|