Spaces:
Sleeping
Sleeping
| # src/preprocessing/file_processor.py | |
| from pptx import Presentation | |
| import nbformat | |
| import re | |
| import os | |
| import logging | |
logger = logging.getLogger(__name__)

# Patterns are compiled once at import time instead of on every shape/cell.
_WHITESPACE_RUN = re.compile(r'\s+')
# Anchored to the start of a line so that a '#' in running text
# (e.g. "C#", "issue #12") is left alone; only horizontal whitespace is
# consumed so the marker never swallows the following newline.
_HEADING_MARKER = re.compile(r'^[ \t]*#+[ \t]*', re.MULTILINE)
_EMPHASIS = re.compile(r'\*{1,2}(.*?)\*{1,2}')
_LINK = re.compile(r'\[(.*?)\]\(.*?\)')


class FileProcessor:
    """Convert presentation and notebook files into plain text.

    Both converters are static: they can be called on the class itself or
    on an instance. They are best-effort; any failure is logged and an
    empty string is returned instead of raising.
    """

    @staticmethod
    def process_slide_file(file_path: str) -> str:
        """Extract the text of every slide in a presentation.

        Each slide contributes a ``=== Slide N ===`` header (N starts at 1),
        one line per shape that has non-blank text (with every whitespace
        run collapsed to a single space), and a trailing empty line.

        Args:
            file_path: Path handed to ``Presentation``.

        Returns:
            The extracted lines joined with newlines, or ``""`` if any
            exception occurred while processing (the error is logged).
        """
        try:
            prs = Presentation(file_path)
            content = []
            for number, slide in enumerate(prs.slides, start=1):
                content.append(f"=== Slide {number} ===")
                for shape in slide.shapes:
                    # Not every shape exposes text; skip those and blank ones.
                    if not hasattr(shape, "text"):
                        continue
                    text = shape.text.strip()
                    if text:
                        content.append(_WHITESPACE_RUN.sub(' ', text))
                # Empty entry yields a blank separator line after each slide.
                content.append("")
            return "\n".join(content)
        except Exception as e:
            # Deliberate best-effort boundary: log and fall back to "".
            logger.error("Error processing presentation: %s", e)
            return ""

    @staticmethod
    def process_notebook_file(file_path: str) -> str:
        """Extract code and markdown cells of a notebook as plain text.

        Code cells are emitted as ``## CODE CELL ##`` followed by the
        stripped source. Markdown cells are emitted as
        ``## MARKDOWN CELL ##`` followed by the stripped source with
        line-leading ``#`` heading markers removed, ``*``/``**`` emphasis
        markers removed, and ``[label](target)`` links reduced to their
        label. Every emitted cell is followed by a ``----`` line. Cells of
        any other type are skipped.

        Args:
            file_path: Path of the notebook; opened as UTF-8 text and read
                with ``nbformat.read(..., as_version=4)``.

        Returns:
            The extracted lines joined with newlines, or ``""`` if any
            exception occurred while processing (the error is logged).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                nb = nbformat.read(f, as_version=4)
            content = []
            for cell in nb.cells:
                if cell.cell_type == 'code':
                    content.append("## CODE CELL ##")
                    content.append(cell.source.strip())
                    content.append("----")
                elif cell.cell_type == 'markdown':
                    content.append("## MARKDOWN CELL ##")
                    cleaned_text = cell.source.strip()
                    cleaned_text = _HEADING_MARKER.sub('', cleaned_text)
                    cleaned_text = _EMPHASIS.sub(r'\1', cleaned_text)
                    cleaned_text = _LINK.sub(r'\1', cleaned_text)
                    content.append(cleaned_text)
                    content.append("----")
            return "\n".join(content)
        except Exception as e:
            # Deliberate best-effort boundary: log and fall back to "".
            logger.error("Error processing notebook: %s", e)
            return ""