Spaces:
Sleeping
Sleeping
| import os | |
| from llama_parse import LlamaParse | |
| from pptx import Presentation | |
| from server.logger.logger_config import my_logger as logger | |
# Integer feature flag read from the environment: a non-zero value makes the
# loader parse through LlamaParse, 0 makes it read the file with python-pptx.
# An unset or empty variable falls back to 0 so that importing this module
# does not crash with TypeError/ValueError when the flag is not configured.
USE_LLAMA_PARSE = int(os.getenv('USE_LLAMA_PARSE') or '0')
# API key passed to LlamaParse; None when the variable is not set.
LLAMA_CLOUD_API_KEY = os.getenv('LLAMA_CLOUD_API_KEY')
class AsyncPptxLoader:
    """Extract the text of a .pptx file as a markdown string.

    Depending on the module-level ``USE_LLAMA_PARSE`` flag, the file is either
    handed to ``LlamaParse`` or read locally with python-pptx's
    ``Presentation``.
    """

    def __init__(self, file_path: str) -> None:
        """Remember the path of the presentation to load.

        Args:
            file_path: Path of the .pptx file. The file is not opened here;
                it is only read when ``get_content`` is awaited.
        """
        logger.info(f"[FILE LOADER] init pptx, file_path: '{file_path}'")
        self.file_path = file_path

    async def get_content(self) -> str:
        """Return the presentation's text as markdown.

        Returns:
            The extracted markdown. An empty string is returned when nothing
            was extracted (a warning is logged) or when any exception is
            raised while loading (the error is logged, not propagated).
        """
        try:
            if USE_LLAMA_PARSE:
                content = await self._load_with_llama_parse()
            else:
                content = self._load_with_python_pptx()

            if not content:
                logger.warning(f"file_path: '{self.file_path}' is empty!")
            return content
        except Exception as e:
            # Best-effort contract: callers get '' instead of an exception.
            logger.error(f"get_content is failed, exception: {e}")
            return ''

    async def _load_with_llama_parse(self) -> str:
        """Parse the file with LlamaParse and join the documents' text."""
        parser = LlamaParse(
            api_key=LLAMA_CLOUD_API_KEY,
            result_type="markdown",
        )
        # Await the async API directly. The synchronous ``load_data`` would
        # block the running event loop and needs ``nest_asyncio`` to patch it,
        # which is a process-wide side effect.
        documents = await parser.aload_data(self.file_path)
        return "\n\n".join(doc.text for doc in documents)

    def _load_with_python_pptx(self) -> str:
        """Build markdown from the slides: a header per slide, a bullet per paragraph."""
        prs = Presentation(self.file_path)
        markdown_parts = []
        for slide_number, slide in enumerate(prs.slides, start=1):
            markdown_parts.append(f"## Slide {slide_number}\n")
            for shape in slide.shapes:
                # Only shapes that carry a text frame contribute text.
                if not shape.has_text_frame:
                    continue
                for paragraph in shape.text_frame.paragraphs:
                    # A paragraph's text is the concatenation of its runs.
                    paragraph_text = ''.join(
                        run.text for run in paragraph.runs
                    ).strip()
                    # Paragraphs that are empty after stripping are skipped.
                    if paragraph_text:
                        markdown_parts.append(f"- {paragraph_text}\n")
        return ''.join(markdown_parts)