mandala-for-us / src /utils /_document_parser.py
kanha-upadhyay's picture
add documentation comments
adb221d
import asyncio
import io
import os
import time
from pathlib import Path
from typing import Dict, Tuple
import nest_asyncio
from fastapi import UploadFile
from llama_parse import LlamaParse
# API key for the LlamaParse service, read from the environment.
# The value is None when LLAMAPARSE_API_KEY is not set.
LLAMAPARSE_API_KEY = os.environ.get("LLAMAPARSE_API_KEY")

# Module-level LlamaParse client, created once at import time and used by
# DocumentParser.parse_file_content.
parser = LlamaParse(
    api_key=LLAMAPARSE_API_KEY,
    language="en",
    result_type="markdown",
    num_workers=4,
    verbose=True,
)
class DocumentParser:
    """Asynchronous context manager that parses a document into per-page text.

    Parsing is delegated to the module-level ``parser`` object. Entering and
    leaving the context neither acquires nor releases anything; the context
    manager protocol is provided so callers can write::

        async with DocumentParser() as document_parser:
            pages = await document_parser.parse_file_content(path)
    """

    async def __aenter__(self):
        """Enter the runtime context and return this instance."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Exit the runtime context.

        Nothing is cleaned up. Returning ``None`` lets any exception raised
        inside the ``async with`` body propagate to the caller.
        """
        return None

    async def parse_file_content(self, file_path: str) -> Tuple[Tuple[int, str], ...]:
        """Parse a document and return the text of each non-empty page.

        Args:
            file_path: Path to the file to parse.

        Returns:
            Tuple of ``(page_number, content)`` pairs. ``page_number`` is the
            1-based position of the page in the parser's result and
            ``content`` is the page text with surrounding whitespace removed.
            Pages without a ``text`` attribute, or whose text is empty or
            whitespace-only, are omitted, so page numbers may have gaps.
            An empty tuple is returned when the parser yields nothing.
        """
        # get_running_loop() is the documented way to obtain the loop from
        # inside a coroutine; get_event_loop() is meant for code that may run
        # outside of one.
        loop = asyncio.get_running_loop()

        # parser.load_data is invoked synchronously, so it is handed to the
        # loop's default executor to keep the event loop free while it runs.
        result = await loop.run_in_executor(None, parser.load_data, file_path)
        if not result:
            return tuple()

        pages = []
        for page_number, page in enumerate(result, start=1):
            text = getattr(page, "text", None)
            if not text:
                continue
            # Filter on the stripped text so whitespace-only pages are not
            # returned as (page_number, "").
            content = text.strip()
            if content:
                pages.append((page_number, content))
        return tuple(pages)