import asyncio
from pathlib import Path
from typing import Dict

import aiofiles
from loguru import logger
from pydantic import BaseModel, Field
from pypdf import PdfReader


class ParsedDocument(BaseModel):
    """Structured result of parsing a single requirement file."""

    # Name of the source file the content was read from.
    filename: str

    # Full extracted text of the document.
    content: str

    # String-to-string annotations about the document; each instance gets
    # its own fresh empty dict when none is supplied.
    metadata: Dict[str, str] = Field(default_factory=dict)

    # Length of the content; supplied by whoever builds the model, the
    # model itself does not compute or cross-check it.
    char_count: int


class RequirementParser:
    """
    Async parser for requirement documents (PDF/TXT/MD).

    The class is a namespace of static methods. ``parse`` is the public
    entry point: it checks that the path exists and has a supported
    extension, then dispatches to the PDF or plain-text reader and returns
    a ``ParsedDocument``.
    """

    # Lower-case suffixes (including the leading dot) accepted by ``parse``.
    SUPPORTED_EXTENSIONS = {'.pdf', '.txt', '.md'}

    @staticmethod
    def _validate_path(file_path: str) -> Path:
        """
        Convert *file_path* to a ``Path`` and check it is parseable.

        Args:
            file_path: Location of the document on disk.

        Returns:
            The path as a ``Path`` object.

        Raises:
            FileNotFoundError: Nothing exists at *file_path*.
            ValueError: The suffix (compared case-insensitively) is not in
                ``SUPPORTED_EXTENSIONS``.
        """
        path = Path(file_path)
        if not path.exists():
            logger.error(f"File not found: {path}")
            raise FileNotFoundError(f"File not found: {path}")
        if path.suffix.lower() not in RequirementParser.SUPPORTED_EXTENSIONS:
            logger.error(f"Unsupported file type: {path.suffix}")
            raise ValueError(
                f"Unsupported file type. Allowed: {RequirementParser.SUPPORTED_EXTENSIONS}"
            )
        return path

    @staticmethod
    async def parse(file_path: str) -> ParsedDocument:
        """
        Asynchronously parse a file and return a structured document.

        Args:
            file_path: Location of a ``.pdf``, ``.txt`` or ``.md`` file.

        Returns:
            A ``ParsedDocument`` holding the file name, extracted text,
            metadata and character count.

        Raises:
            FileNotFoundError: The path does not exist.
            ValueError: The file extension is not supported.
            RuntimeError: Reading or extracting the content failed; the
                underlying error is attached as ``__cause__``.
        """
        # Validation errors propagate unchanged; only failures during the
        # actual read/extract step are wrapped in RuntimeError below.
        path = RequirementParser._validate_path(file_path)
        logger.info(f"Parsing file: {path.name}")

        try:
            if path.suffix.lower() == '.pdf':
                return await RequirementParser._parse_pdf(path)
            return await RequirementParser._parse_text(path)
        except Exception as e:
            logger.exception(f"Failed to parse {path.name}")
            raise RuntimeError(f"Parsing failed: {str(e)}") from e

    @staticmethod
    async def _parse_text(path: Path) -> ParsedDocument:
        """
        Read a text or markdown file as UTF-8 and wrap it in a document.

        Args:
            path: File to read.

        Returns:
            A ``ParsedDocument`` whose metadata is
            ``{"type": "text/markdown"}`` and whose ``char_count`` is the
            length of the decoded text.
        """
        async with aiofiles.open(path, mode='r', encoding='utf-8') as f:
            content = await f.read()

        return ParsedDocument(
            filename=path.name,
            content=content,
            metadata={"type": "text/markdown"},
            char_count=len(content),
        )

    @staticmethod
    def _read_pdf(path: Path) -> ParsedDocument:
        """
        Synchronously extract the text of every page of a PDF.

        This call blocks while the file is read and parsed, so async code
        should reach it through ``_parse_pdf`` rather than call it directly.

        Args:
            path: PDF file to read.

        Returns:
            A ``ParsedDocument`` whose content is the page texts joined by
            newlines and whose metadata holds the page count as a string
            under ``"pages"``.
        """
        reader = PdfReader(str(path))
        # A page with no extractable text contributes an empty string so the
        # number of joined segments still equals the page count.
        page_texts = [page.extract_text() or "" for page in reader.pages]
        full_text = "\n".join(page_texts)

        return ParsedDocument(
            filename=path.name,
            content=full_text,
            metadata={"pages": str(len(page_texts))},
            char_count=len(full_text),
        )

    @staticmethod
    async def _parse_pdf(path: Path) -> ParsedDocument:
        """
        Extract a PDF's text without blocking the event loop.

        Args:
            path: PDF file to read.

        Returns:
            The ``ParsedDocument`` produced by ``_read_pdf``.

        Raises:
            ValueError: The PDF could not be opened or its text could not be
                extracted; the underlying error is attached as ``__cause__``.
        """
        # PdfReader and extract_text are synchronous; run them in the loop's
        # default executor so other coroutines keep running meanwhile.
        loop = asyncio.get_running_loop()
        try:
            return await loop.run_in_executor(
                None, RequirementParser._read_pdf, path
            )
        except Exception as e:
            raise ValueError(f"Corrupt or unreadable PDF: {str(e)}") from e

