Spaces:
Runtime error
Runtime error
File size: 1,741 Bytes
adb221d 040da4c adb221d 040da4c adb221d 040da4c adb221d 040da4c a540238 adb221d a540238 b4a5816 a540238 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
import asyncio
import io
import os
import time
from pathlib import Path
from typing import Dict, Tuple
import nest_asyncio
from fastapi import UploadFile
from llama_parse import LlamaParse
# API key is read from the environment once, at import time; it is None when
# the LLAMAPARSE_API_KEY variable is not set.
LLAMAPARSE_API_KEY = os.environ.get("LLAMAPARSE_API_KEY")

# Single module-level LlamaParse client shared by everything in this module.
# It is configured to return markdown for English-language documents.
parser = LlamaParse(
    api_key=LLAMAPARSE_API_KEY,
    result_type="markdown",
    language="en",
    num_workers=4,
    verbose=True,
)
class DocumentParser:
    """Asynchronous context manager that parses the content of a document file.

    The class holds no state of its own: entering the context returns the
    instance and leaving it does nothing. Parsing is delegated to the
    module-level ``parser`` object.

    Methods:
        __aenter__() -> DocumentParser:
            Enter the runtime context and return this object.
        __aexit__(exc_type, exc_val, exc_tb) -> None:
            Exit the runtime context; exceptions are not suppressed.
        parse_file_content(file_path: str) -> Tuple[Tuple[int, str], ...]:
            Parse a document and return its non-empty pages.
    """

    async def __aenter__(self):
        """Enter the runtime context and return this parser instance."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Exit the runtime context.

        Nothing is released here, and returning ``None`` lets any exception
        raised inside the ``async with`` body propagate to the caller.
        """
        return None

    async def parse_file_content(self, file_path: str) -> Tuple[Tuple[int, str], ...]:
        """Parse document content using the module-level parser.

        Args:
            file_path: Path to the file to parse.

        Returns:
            Tuple of ``(page_number, content)`` pairs. Page numbers are
            1-based positions in the parser's result, so they keep their
            original value even when earlier pages are skipped. Entries with
            no ``text`` attribute, or whose text is empty or whitespace-only,
            are omitted. An empty tuple is returned when the parser yields
            nothing.
        """
        # Inside a coroutine a loop is always running; get_running_loop() is
        # the preferred accessor there and never creates a loop implicitly.
        loop = asyncio.get_running_loop()

        # parser.load_data is invoked through the loop's default executor so
        # the event loop is not blocked while the call runs.
        result = await loop.run_in_executor(None, parser.load_data, file_path)
        if not result:
            return tuple()

        pages = []
        for page_number, page in enumerate(result, start=1):
            text = getattr(page, "text", None)
            if not text:
                continue
            # Strip before deciding to keep the page, so whitespace-only
            # pages are dropped just like empty ones instead of being
            # returned with empty content.
            content = text.strip()
            if content:
                pages.append((page_number, content))
        return tuple(pages)
|