# Zeta / src/core/requirement_parser.py
# Owadokun Tosin Tobi
# Update requirement_parser.py
# e59096a unverified
import asyncio
from pathlib import Path
from typing import Dict

import aiofiles
from loguru import logger
from pydantic import BaseModel, Field
from pypdf import PdfReader
# Structured container for a parsed document's text and metadata
class ParsedDocument(BaseModel):
    """Structured result of parsing a single requirement document."""

    # Name of the source file; the parser passes the path's final component.
    filename: str
    # Text extracted from the document.
    content: str
    # String-to-string metadata; a fresh empty dict is created when omitted.
    metadata: Dict[str, str] = Field(default_factory=dict)
    # Number of characters; the parser sets this to len(content).
    char_count: int
class RequirementParser:
    """
    Async parser for requirement documents (PDF/TXT/MD).

    Every method is static, so the class acts as a namespace:
    ``doc = await RequirementParser.parse("spec.pdf")``.
    The input path is validated (existence, regular file, supported suffix)
    before any content is read.
    """

    # File suffixes accepted by parse(); compared against the lower-cased suffix.
    SUPPORTED_EXTENSIONS = {'.pdf', '.txt', '.md'}

    @staticmethod
    def _validate_path(file_path: str) -> Path:
        """
        Return ``file_path`` as a ``Path`` after checking it can be parsed.

        Raises:
            FileNotFoundError: nothing exists at ``file_path``.
            ValueError: the path is not a regular file (e.g. a directory),
                or its suffix is not in ``SUPPORTED_EXTENSIONS``.
        """
        path = Path(file_path)
        if not path.exists():
            logger.error(f"File not found: {path}")
            raise FileNotFoundError(f"File not found: {path}")
        # exists() is also true for directories; a directory named "x.pdf"
        # must be rejected here instead of failing later while reading.
        if not path.is_file():
            logger.error(f"Not a regular file: {path}")
            raise ValueError(f"Not a regular file: {path}")
        if path.suffix.lower() not in RequirementParser.SUPPORTED_EXTENSIONS:
            logger.error(f"Unsupported file type: {path.suffix}")
            raise ValueError(f"Unsupported file type. Allowed: {RequirementParser.SUPPORTED_EXTENSIONS}")
        return path

    @staticmethod
    async def parse(file_path: str) -> "ParsedDocument":
        """
        Asynchronously parse a file and return a structured document object.

        ``.pdf`` files go through the PDF reader; every other supported
        suffix is read as UTF-8 text.

        Raises:
            FileNotFoundError, ValueError: the path failed validation.
            RuntimeError: reading or parsing failed; the original exception
                is attached as ``__cause__``.
        """
        path = RequirementParser._validate_path(file_path)
        logger.info(f"Parsing file: {path.name}")
        try:
            if path.suffix.lower() == '.pdf':
                return await RequirementParser._parse_pdf(path)
            return await RequirementParser._parse_text(path)
        except Exception as e:
            logger.exception(f"Failed to parse {path.name}")
            # Chain explicitly so the root cause stays visible to callers.
            raise RuntimeError(f"Parsing failed: {e}") from e

    @staticmethod
    async def _parse_text(path: Path) -> "ParsedDocument":
        """Read ``path`` as UTF-8 text and wrap the content in a document."""
        async with aiofiles.open(path, mode='r', encoding='utf-8') as f:
            content = await f.read()
        return ParsedDocument(
            filename=path.name,
            content=content,
            metadata={"type": "text/markdown"},
            char_count=len(content)
        )

    @staticmethod
    def _read_pdf(path: Path) -> tuple:
        """Synchronously extract ``(full_text, page_count)`` from a PDF."""
        reader = PdfReader(str(path))
        # extract_text() may yield nothing for a page; substitute "" so the
        # join below always receives strings.
        pages = [page.extract_text() or "" for page in reader.pages]
        return "\n".join(pages), len(pages)

    @staticmethod
    async def _parse_pdf(path: Path) -> "ParsedDocument":
        """
        Extract the text of every page of a PDF, joined with newlines.

        The PDF library is synchronous, so the extraction runs in the event
        loop's default executor instead of blocking the loop.

        Raises:
            ValueError: the PDF could not be read; the original exception is
                attached as ``__cause__``.
        """
        try:
            loop = asyncio.get_running_loop()
            full_text, page_count = await loop.run_in_executor(
                None, RequirementParser._read_pdf, path
            )
        except Exception as e:
            raise ValueError(f"Corrupt or unreadable PDF: {e}") from e
        return ParsedDocument(
            filename=path.name,
            content=full_text,
            metadata={"pages": str(page_count)},
            char_count=len(full_text)
        )