Spaces:

EATosin
/

Zeta

Sleeping

App Files Files Community

Owadokun Tosin Tobi commited on Jan 14

Commit

5bcb4ce

unverified ·

1 Parent(s): 682dcd9

Create requirement_parser.py

Browse files

Files changed (1) hide show

src/core/requirement_parser.py +83 -0

src/core/requirement_parser.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import os
+import aiofiles
+from pathlib import Path
+from typing import Optional, Dict
+from pypdf import PdfReader
+from loguru import logger
+from pydantic import BaseModel, Field
+# SOTA Data Structure for Documents
+class ParsedDocument(BaseModel):
+    filename: str
+    content: str
+    metadata: Dict[str, str] = Field(default_factory=dict)
+    char_count: int
+class RequirementParser:
+    """
+    Async Parser for Requirement Documents (PDF/TXT/MD).
+    Implements secure file handling and metadata extraction.
+    """
+    SUPPORTED_EXTENSIONS = {'.pdf', '.txt', '.md'}
+    @staticmethod
+    def _validate_path(file_path: str) -> Path:
+        path = Path(file_path)
+        if not path.exists():
+            logger.error(f"File not found: {path}")
+            raise FileNotFoundError(f"File not found: {path}")
+        if path.suffix.lower() not in RequirementParser.SUPPORTED_EXTENSIONS:
+            logger.error(f"Unsupported file type: {path.suffix}")
+            raise ValueError(f"Unsupported file type. Allowed: {RequirementParser.SUPPORTED_EXTENSIONS}")
+        return path
+    @staticmethod
+    async def parse(file_path: str) -> ParsedDocument:
+        """
+        Asynchronously parses a file and returns a structured Document object.
+        """
+        path = RequirementParser._validate_path(file_path)
+        logger.info(f"Parsing file: {path.name}")
+        try:
+            if path.suffix.lower() == '.pdf':
+                return await RequirementParser._parse_pdf(path)
+            else:
+                return await RequirementParser._parse_text(path)
+        except Exception as e:
+            logger.exception(f"Failed to parse {path.name}")
+            raise RuntimeError(f"Parsing failed: {str(e)}")
+    @staticmethod
+    async def _parse_text(path: Path) -> ParsedDocument:
+        async with aiofiles.open(path, mode='r', encoding='utf-8') as f:
+            content = await f.read()
+        return ParsedDocument(
+            filename=path.name,
+            content=content,
+            metadata={"type": "text/markdown"},
+            char_count=len(content)
+        )
+    @staticmethod
+    async def _parse_pdf(path: Path) -> ParsedDocument:
+        # Note: PyPDF is CPU bound, run in executor if file is huge.
+        # For <10MB, direct execution is acceptable for MVP.
+        try:
+            reader = PdfReader(str(path))
+            text = []
+            for page in reader.pages:
+                text.append(page.extract_text() or "")
+            full_text = "\n".join(text)
+            return ParsedDocument(
+                filename=path.name,
+                content=full_text,
+                metadata={"pages": str(len(reader.pages))},
+                char_count=len(full_text)
+            )
+        except Exception as e:
+            raise ValueError(f"Corrupt or unreadable PDF: {str(e)}")