File size: 2,725 Bytes
5bcb4ce
 
e59096a
5bcb4ce
 
 
 
e59096a
5bcb4ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import asyncio
from pathlib import Path
from typing import Dict

import aiofiles
from loguru import logger
from pydantic import BaseModel, Field
from pypdf import PdfReader

# Structured result produced by the parser for a single document
class ParsedDocument(BaseModel):
    # Pydantic model holding the text extracted from one file plus basic metadata.
    # The parsers in this file populate it as follows:
    #   filename   - base name of the parsed file (path.name), not the full path
    #   content    - the extracted text of the whole document
    #   metadata   - string-to-string dictionary; defaults to a fresh empty dict
    #   char_count - length of `content` in characters (len(content))
    filename: str
    content: str
    metadata: Dict[str, str] = Field(default_factory=dict)
    char_count: int

class RequirementParser:
    """
    Async parser for requirement documents (PDF/TXT/MD).

    Validates the given path, dispatches on the file extension and returns a
    ParsedDocument holding the extracted text plus basic metadata. All methods
    are static; the class only serves as a namespace.
    """

    SUPPORTED_EXTENSIONS = {'.pdf', '.txt', '.md'}

    @staticmethod
    def _validate_path(file_path: str) -> Path:
        """
        Check that ``file_path`` points at an existing, regular file with a
        supported extension and return it as a ``Path``.

        Raises:
            FileNotFoundError: the path does not exist.
            ValueError: the path is not a regular file (e.g. a directory), or
                its extension is not in ``SUPPORTED_EXTENSIONS``.
        """
        path = Path(file_path)
        if not path.exists():
            logger.error(f"File not found: {path}")
            raise FileNotFoundError(f"File not found: {path}")
        # exists() is also true for directories; reject them here so that a
        # directory called "spec.pdf" fails with a clear message instead of an
        # opaque error from the reader later on.
        if not path.is_file():
            logger.error(f"Not a regular file: {path}")
            raise ValueError(f"Not a regular file: {path}")
        if path.suffix.lower() not in RequirementParser.SUPPORTED_EXTENSIONS:
            logger.error(f"Unsupported file type: {path.suffix}")
            raise ValueError(f"Unsupported file type. Allowed: {RequirementParser.SUPPORTED_EXTENSIONS}")
        return path

    @staticmethod
    async def parse(file_path: str) -> ParsedDocument:
        """
        Asynchronously parse a file and return a structured ParsedDocument.

        Args:
            file_path: path to a .pdf, .txt or .md file.

        Returns:
            ParsedDocument with the extracted text, metadata and char count.

        Raises:
            FileNotFoundError: the path does not exist.
            ValueError: the path is not a regular file or has an unsupported
                extension (raised by validation, before any parsing starts).
            RuntimeError: reading or extracting the content failed; the
                underlying exception is attached as ``__cause__``.
        """
        path = RequirementParser._validate_path(file_path)
        logger.info(f"Parsing file: {path.name}")

        try:
            if path.suffix.lower() == '.pdf':
                return await RequirementParser._parse_pdf(path)
            return await RequirementParser._parse_text(path)
        except Exception as e:
            logger.exception(f"Failed to parse {path.name}")
            # Chain explicitly so the original traceback is reported as the cause.
            raise RuntimeError(f"Parsing failed: {str(e)}") from e

    @staticmethod
    async def _parse_text(path: Path) -> ParsedDocument:
        """
        Read a text/markdown file and wrap its content in a ParsedDocument.

        The file is decoded as UTF-8; a decoding failure propagates to the
        caller as ``UnicodeDecodeError``.
        """
        # 'utf-8-sig' decodes exactly like 'utf-8' but drops a leading BOM, so
        # files saved by Windows editors do not start with U+FEFF and do not
        # get an inflated char_count.
        async with aiofiles.open(path, mode='r', encoding='utf-8-sig') as f:
            content = await f.read()

        return ParsedDocument(
            filename=path.name,
            content=content,
            metadata={"type": "text/markdown"},
            char_count=len(content)
        )

    @staticmethod
    def _read_pdf(path: Path) -> tuple:
        """
        Synchronously extract text from a PDF.

        Returns a ``(full_text, page_count)`` tuple, where ``full_text`` is
        the per-page text joined with newlines. Pages for which
        ``extract_text()`` returns nothing contribute an empty string.
        """
        reader = PdfReader(str(path))
        pages = reader.pages
        full_text = "\n".join(page.extract_text() or "" for page in pages)
        return full_text, len(pages)

    @staticmethod
    async def _parse_pdf(path: Path) -> ParsedDocument:
        """
        Extract the text of a PDF without blocking the event loop.

        Raises:
            ValueError: the PDF could not be opened or read; the underlying
                exception is attached as ``__cause__``.
        """
        try:
            # PdfReader and extract_text are blocking calls; run them in the
            # default executor so other coroutines keep running meanwhile.
            loop = asyncio.get_running_loop()
            full_text, page_count = await loop.run_in_executor(
                None, RequirementParser._read_pdf, path
            )

            return ParsedDocument(
                filename=path.name,
                content=full_text,
                metadata={"pages": str(page_count)},
                char_count=len(full_text)
            )
        except Exception as e:
            raise ValueError(f"Corrupt or unreadable PDF: {str(e)}") from e