File size: 1,741 Bytes
adb221d
040da4c
 
adb221d
040da4c
adb221d
 
 
040da4c
adb221d
040da4c
 
 
 
 
 
 
 
 
 
a540238
 
adb221d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a540238
 
 
 
 
 
 
b4a5816
a540238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import asyncio
import io
import os
import time
from pathlib import Path
from typing import Dict, Tuple

import nest_asyncio
from fastapi import UploadFile
from llama_parse import LlamaParse

# API key comes from the environment; the value is None when the variable is unset.
LLAMAPARSE_API_KEY = os.environ.get("LLAMAPARSE_API_KEY")

# Module-level LlamaParse instance, created once at import time and used by
# DocumentParser.parse_file_content below.
parser = LlamaParse(
    language="en",
    result_type="markdown",
    verbose=True,
    num_workers=4,
    api_key=LLAMAPARSE_API_KEY,
)


class DocumentParser:
    """Asynchronous context manager that parses a document into per-page text.

    Entering and exiting the context acquires and releases nothing; the
    hooks exist so the class can be used with ``async with``.

    Args:
        backend: Optional object exposing ``load_data(file_path)``. When
            omitted, the module-level ``parser`` is used.
    """

    # Parsing backend override; None means "use the module-level ``parser``".
    _backend = None

    def __init__(self, backend=None) -> None:
        if backend is not None:
            self._backend = backend

    async def __aenter__(self) -> "DocumentParser":
        """Enter the async context and return this instance."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Exit the async context; exceptions are not suppressed."""
        return None

    async def parse_file_content(self, file_path: str) -> Tuple[Tuple[int, str], ...]:
        """Parse a document and return its non-empty pages.

        The backend's blocking ``load_data`` call runs in the event loop's
        default executor so the loop is not blocked while parsing.

        Args:
            file_path: Path to the file to parse.

        Returns:
            Tuple of ``(page_number, content)`` pairs. ``page_number`` is the
            1-based position of the entry in the backend's result, so numbers
            are not renumbered when an entry is dropped. Entries without a
            ``text`` attribute, or whose text is empty or whitespace-only,
            are omitted. An empty tuple is returned when the backend yields
            nothing.
        """
        # Resolve the global lazily so an injected backend never touches it.
        backend = self._backend if self._backend is not None else parser

        # Inside a coroutine the running loop is the one to use;
        # get_event_loop() is deprecated for this purpose.
        loop = asyncio.get_running_loop()
        documents = await loop.run_in_executor(None, backend.load_data, file_path)

        if not documents:
            return ()

        pages = []
        for number, document in enumerate(documents, start=1):
            raw_text = getattr(document, "text", None)
            if not raw_text:
                continue
            # Strip before filtering so whitespace-only pages are dropped
            # instead of being emitted as empty strings.
            content = raw_text.strip()
            if content:
                pages.append((number, content))
        return tuple(pages)