Owadokun Tosin Tobi committed on
Commit
5bcb4ce
·
unverified ·
1 Parent(s): 682dcd9

Create requirement_parser.py

Browse files
Files changed (1) hide show
  1. src/core/requirement_parser.py +83 -0
src/core/requirement_parser.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import aiofiles
3
+ from pathlib import Path
4
+ from typing import Optional, Dict
5
+ from pypdf import PdfReader
6
+ from loguru import logger
7
+ from pydantic import BaseModel, Field
8
+
9
+ # Structured data model for a parsed document
10
class ParsedDocument(BaseModel):
    # Structured result of parsing a single requirement document.
    # Documented with comments (not a docstring) so the model's generated
    # schema description is left untouched.

    # Name of the source file (RequirementParser passes ``path.name``).
    filename: str

    # Full extracted text of the document.
    content: str

    # Free-form string-to-string details about the document; each instance
    # gets its own fresh empty dict when none is supplied.
    metadata: Dict[str, str] = Field(default_factory=dict)

    # Number of characters in ``content`` as reported by the producer
    # (RequirementParser passes ``len(content)``).
    char_count: int
15
+
16
class RequirementParser:
    """
    Async parser for requirement documents (PDF/TXT/MD).

    All methods are static; the entry point is
    ``await RequirementParser.parse(file_path)``.

    Validation covers existence, "is a regular file" and the file
    extension only. It does NOT confine reads to a particular directory,
    so callers handling untrusted paths must restrict them beforehand.
    """

    SUPPORTED_EXTENSIONS = {'.pdf', '.txt', '.md'}

    @staticmethod
    def _validate_path(file_path: str) -> Path:
        """
        Check that ``file_path`` names an existing regular file of a
        supported type and return it as a ``Path``.

        Raises:
            FileNotFoundError: the path does not exist.
            ValueError: the path is not a regular file (e.g. a directory),
                or its extension is not in ``SUPPORTED_EXTENSIONS``
                (compared case-insensitively).
        """
        path = Path(file_path)
        if not path.exists():
            logger.error(f"File not found: {path}")
            raise FileNotFoundError(f"File not found: {path}")
        # exists() is also true for directories; reject them here instead of
        # failing later with an opaque I/O error inside the parser.
        if not path.is_file():
            logger.error(f"Not a regular file: {path}")
            raise ValueError(f"Not a regular file: {path}")
        if path.suffix.lower() not in RequirementParser.SUPPORTED_EXTENSIONS:
            logger.error(f"Unsupported file type: {path.suffix}")
            # Sort so the message is stable across runs (set order is not).
            allowed = ", ".join(sorted(RequirementParser.SUPPORTED_EXTENSIONS))
            raise ValueError(f"Unsupported file type. Allowed: {allowed}")
        return path

    @staticmethod
    async def parse(file_path: str) -> ParsedDocument:
        """
        Asynchronously parse a file and return a structured document.

        ``.pdf`` files go through the PDF extractor; every other supported
        extension is read as UTF-8 text.

        Raises:
            FileNotFoundError, ValueError: from path validation (raised
                before parsing starts, not wrapped).
            RuntimeError: any failure while reading/extracting; the
                original exception is attached as ``__cause__``.
        """
        path = RequirementParser._validate_path(file_path)
        logger.info(f"Parsing file: {path.name}")

        try:
            if path.suffix.lower() == '.pdf':
                return await RequirementParser._parse_pdf(path)
            return await RequirementParser._parse_text(path)
        except Exception as e:
            logger.exception(f"Failed to parse {path.name}")
            raise RuntimeError(f"Parsing failed: {e}") from e

    @staticmethod
    async def _parse_text(path: Path) -> ParsedDocument:
        """
        Read ``path`` as UTF-8 text and wrap it in a ``ParsedDocument``.

        Decoding or I/O errors propagate to the caller unchanged.
        """
        async with aiofiles.open(path, mode='r', encoding='utf-8') as f:
            content = await f.read()

        return ParsedDocument(
            filename=path.name,
            content=content,
            # NOTE(review): the same label is used for .txt and .md inputs;
            # kept as-is because callers may rely on this value.
            metadata={"type": "text/markdown"},
            char_count=len(content),
        )

    @staticmethod
    async def _parse_pdf(path: Path) -> ParsedDocument:
        """
        Extract the text of every page of a PDF, joined with newlines.

        Extraction is synchronous library code, so it runs in the event
        loop's default executor to avoid stalling other coroutines.
        Pages with no extractable text contribute an empty string.

        Raises:
            ValueError: the PDF could not be opened or read; the original
                exception is attached as ``__cause__``.
        """
        # Local import keeps this block self-contained; asyncio is stdlib.
        import asyncio

        def _extract() -> ParsedDocument:
            reader = PdfReader(str(path))
            pages = [page.extract_text() or "" for page in reader.pages]
            full_text = "\n".join(pages)
            return ParsedDocument(
                filename=path.name,
                content=full_text,
                metadata={"pages": str(len(pages))},
                char_count=len(full_text),
            )

        try:
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(None, _extract)
        except Exception as e:
            raise ValueError(f"Corrupt or unreadable PDF: {e}") from e