Spaces:
Sleeping
Sleeping
File size: 2,658 Bytes
1161dd2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import os
from llama_parse import LlamaParse
from pptx import Presentation
from server.logger.logger_config import my_logger as logger
# Feature flag read once at import time: a non-zero integer makes
# AsyncPptxLoader parse with LlamaParse, zero makes it parse locally with
# python-pptx. An unset or empty variable falls back to 0 instead of raising
# at import, so the local parser stays usable without any configuration.
USE_LLAMA_PARSE = int(os.getenv('USE_LLAMA_PARSE') or 0)
# API key passed to LlamaParse; None when the variable is not set.
LLAMA_CLOUD_API_KEY = os.getenv('LLAMA_CLOUD_API_KEY')
class AsyncPptxLoader:
    """Load a PowerPoint (.pptx) file and return its text as markdown.

    The module-level ``USE_LLAMA_PARSE`` flag selects the backend: LlamaParse
    when truthy, otherwise a local extraction with python-pptx.
    """

    def __init__(self, file_path: str) -> None:
        """Store the path of the presentation to load.

        Args:
            file_path: Path of the .pptx file. Nothing is opened here; the
                file is only read when ``get_content`` is awaited.
        """
        logger.info(f"[FILE LOADER] init pptx, file_path: '{file_path}'")
        self.file_path = file_path

    async def get_content(self) -> str:
        """Return the presentation's text as a markdown string.

        Returns:
            The markdown text. An empty string is returned when nothing was
            extracted (a warning is logged) or when any exception occurred
            (the error is logged and not re-raised).
        """
        try:
            if USE_LLAMA_PARSE:
                content = await self._parse_with_llama_parse()
            else:
                content = self._parse_with_python_pptx()

            if not content:
                logger.warning(f"file_path: '{self.file_path}' is empty!")
            return content
        except Exception as e:
            # Deliberate best-effort boundary: callers get '' on any failure.
            logger.error(f"get_content is failed, exception: {e}")
            return ''

    async def _parse_with_llama_parse(self) -> str:
        """Parse the file with LlamaParse and join the documents' text."""
        parser = LlamaParse(
            api_key=LLAMA_CLOUD_API_KEY,
            result_type="markdown",
        )
        # Await the async API directly. The synchronous load_data() would
        # block the event loop this coroutine runs on and only works inside a
        # running loop after a global nest_asyncio patch.
        documents = await parser.aload_data(self.file_path)
        return "\n\n".join(doc.text for doc in documents)

    def _parse_with_python_pptx(self) -> str:
        """Extract text locally: one header per slide, one bullet per paragraph."""
        presentation = Presentation(self.file_path)
        markdown_parts = []

        for slide_number, slide in enumerate(presentation.slides, start=1):
            markdown_parts.append(f"## Slide {slide_number}\n")

            for shape in slide.shapes:
                # Only shapes carrying a text frame contribute text.
                if not shape.has_text_frame:
                    continue

                for paragraph in shape.text_frame.paragraphs:
                    # A paragraph's text is the concatenation of its runs.
                    paragraph_text = ''.join(
                        run.text for run in paragraph.runs
                    ).strip()
                    if paragraph_text:
                        markdown_parts.append(f"- {paragraph_text}\n")

        return ''.join(markdown_parts)
|