Spaces:

EATosin
/

Zeta

Sleeping

Zeta / src /core /requirement_parser.py

Owadokun Tosin Tobi

Update requirement_parser.py

e59096a unverified 4 months ago

2.73 kB

	import asyncio
	from pathlib import Path
	from typing import Dict

	import aiofiles
	from loguru import logger
	from pydantic import BaseModel, Field
	from pypdf import PdfReader

	# Structured result of parsing one document; this is the value returned by
	# RequirementParser.parse().
	class ParsedDocument(BaseModel):
	    # File name of the source document. The parsers in this file pass
	    # ``path.name``, i.e. the final path component without directories.
	    filename: str
	    # Extracted text of the whole document.
	    content: str
	    # String-to-string details about the parse. Defaults to a fresh empty
	    # dict per instance (default_factory). The parsers in this file store
	    # either a "type" or a "pages" entry here.
	    metadata: Dict[str, str] = Field(default_factory=dict)
	    # Character count of the document. The parsers in this file pass
	    # ``len(content)``; the model itself does not enforce that relationship.
	    char_count: int

	class RequirementParser:
	    """
	    Async parser for requirement documents (PDF/TXT/MD).

	    The class is a pure namespace: every method is static and no instance
	    state is kept. ``parse`` is the public entry point; it validates the
	    path, dispatches on the file extension, and returns a ``ParsedDocument``.
	    """

	    # Lower-case file suffixes accepted by ``_validate_path``.
	    SUPPORTED_EXTENSIONS = {'.pdf', '.txt', '.md'}

	    @staticmethod
	    def _validate_path(file_path: str) -> Path:
	        """
	        Check that ``file_path`` exists and has a supported extension.

	        Args:
	            file_path: Path to the document, as a string.

	        Returns:
	            The same location as a ``Path`` object.

	        Raises:
	            FileNotFoundError: If nothing exists at ``file_path``.
	            ValueError: If the suffix (compared case-insensitively) is not
	                in ``SUPPORTED_EXTENSIONS``.
	        """
	        path = Path(file_path)
	        if not path.exists():
	            logger.error(f"File not found: {path}")
	            raise FileNotFoundError(f"File not found: {path}")
	        if path.suffix.lower() not in RequirementParser.SUPPORTED_EXTENSIONS:
	            logger.error(f"Unsupported file type: {path.suffix}")
	            raise ValueError(f"Unsupported file type. Allowed: {RequirementParser.SUPPORTED_EXTENSIONS}")
	        return path

	    @staticmethod
	    async def parse(file_path: str) -> ParsedDocument:
	        """
	        Asynchronously parse a file and return a structured document.

	        Args:
	            file_path: Path to a ``.pdf``, ``.txt`` or ``.md`` file.

	        Returns:
	            A ``ParsedDocument`` with the file name, extracted text,
	            metadata and character count.

	        Raises:
	            FileNotFoundError: If the file does not exist (raised by
	                validation, not wrapped).
	            ValueError: If the extension is unsupported (raised by
	                validation, not wrapped).
	            RuntimeError: If reading or extracting the content fails; the
	                original exception is attached as ``__cause__``.
	        """
	        # Validation runs outside the try block on purpose, so its
	        # FileNotFoundError / ValueError reach the caller unwrapped.
	        path = RequirementParser._validate_path(file_path)
	        logger.info(f"Parsing file: {path.name}")

	        try:
	            if path.suffix.lower() == '.pdf':
	                return await RequirementParser._parse_pdf(path)
	            return await RequirementParser._parse_text(path)
	        except Exception as e:
	            logger.exception(f"Failed to parse {path.name}")
	            # Chain explicitly so the root cause stays on the traceback.
	            raise RuntimeError(f"Parsing failed: {e}") from e

	    @staticmethod
	    async def _parse_text(path: Path) -> ParsedDocument:
	        """
	        Read a text or Markdown file as UTF-8 and wrap it in a document.

	        Args:
	            path: Validated path to a ``.txt`` or ``.md`` file.

	        Returns:
	            A ``ParsedDocument`` holding the full file content.
	        """
	        async with aiofiles.open(path, mode='r', encoding='utf-8') as f:
	            content = await f.read()

	        return ParsedDocument(
	            filename=path.name,
	            content=content,
	            # NOTE(review): the same "text/markdown" label is used for .txt
	            # and .md alike; kept as-is because consumers may rely on it.
	            metadata={"type": "text/markdown"},
	            char_count=len(content),
	        )

	    @staticmethod
	    def _extract_pdf_text(path: Path) -> tuple:
	        """
	        Blocking helper: read a PDF and return ``(full_text, page_count)``.

	        Pages for which no text is extracted contribute an empty string, so
	        the page separators ("\\n") are still emitted for them.
	        """
	        reader = PdfReader(str(path))
	        page_texts = [page.extract_text() or "" for page in reader.pages]
	        return "\n".join(page_texts), len(page_texts)

	    @staticmethod
	    async def _parse_pdf(path: Path) -> ParsedDocument:
	        """
	        Extract the text of every page of a PDF.

	        The extraction itself is synchronous, so it is run in the event
	        loop's default executor to keep the loop responsive while it works.

	        Args:
	            path: Validated path to a ``.pdf`` file.

	        Returns:
	            A ``ParsedDocument`` whose content is the page texts joined by
	            newlines and whose metadata holds the page count under "pages".

	        Raises:
	            ValueError: If the PDF cannot be read or its text cannot be
	                extracted; the original exception is attached as
	                ``__cause__``.
	        """
	        try:
	            loop = asyncio.get_running_loop()
	            full_text, page_count = await loop.run_in_executor(
	                None, RequirementParser._extract_pdf_text, path
	            )

	            return ParsedDocument(
	                filename=path.name,
	                content=full_text,
	                metadata={"pages": str(page_count)},
	                char_count=len(full_text),
	            )
	        except Exception as e:
	            raise ValueError(f"Corrupt or unreadable PDF: {e}") from e