Spaces:
Sleeping
Sleeping
File size: 1,985 Bytes
d8fd28f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# src/preprocessing/file_processor.py
from pptx import Presentation
import nbformat
import re
import os
import logging
logger = logging.getLogger(__name__)


class FileProcessor:
    """Convert presentation and notebook files into plain-text dumps.

    Both converters are best-effort: any failure while reading or parsing
    the file is logged and reported to the caller as an empty string
    instead of an exception.
    """

    # Any run of whitespace inside a slide shape's text.
    _WHITESPACE_RE = re.compile(r"\s+")
    # Heading markers: a run of '#' at the START of a line only.  Anchoring
    # matters -- an unanchored pattern would also eat the '#' in "issue #12",
    # "C#" or a URL fragment.  Group 1 keeps any leading indentation.
    _HEADING_RE = re.compile(r"^([ \t]*)#+[ \t]*", re.MULTILINE)
    # *italic* / **bold** -> keep the inner text.
    _EMPHASIS_RE = re.compile(r"\*{1,2}(.*?)\*{1,2}")
    # [label](target) -> keep the label.
    _LINK_RE = re.compile(r"\[(.*?)\]\(.*?\)")

    @staticmethod
    def _strip_markdown(text: str) -> str:
        """Return *text* without heading markers, emphasis asterisks or link targets."""
        text = FileProcessor._HEADING_RE.sub(r"\1", text)
        text = FileProcessor._EMPHASIS_RE.sub(r"\1", text)
        return FileProcessor._LINK_RE.sub(r"\1", text)

    @staticmethod
    def process_slide_file(file_path: str) -> str:
        """Extract the text of every slide in a presentation.

        Each slide contributes a ``=== Slide N ===`` header (1-based), one
        line per non-empty text-bearing shape with whitespace runs collapsed
        to single spaces, and a trailing blank line.

        Args:
            file_path: Path passed to ``pptx.Presentation``.

        Returns:
            The lines joined with newlines, or ``""`` if anything fails.
        """
        try:
            presentation = Presentation(file_path)
            lines = []
            for number, slide in enumerate(presentation.slides, start=1):
                lines.append(f"=== Slide {number} ===")
                for shape in slide.shapes:
                    # Shapes without a ``text`` attribute are skipped; the
                    # attribute is read once instead of on every use.
                    text = getattr(shape, "text", "").strip()
                    if text:
                        lines.append(FileProcessor._WHITESPACE_RE.sub(" ", text))
                lines.append("")
            return "\n".join(lines)
        except Exception as e:
            # Deliberate best-effort boundary: keep the traceback in the log,
            # but never propagate to the caller.
            logger.exception("Error processing presentation: %s", e)
            return ""

    @staticmethod
    def process_notebook_file(file_path: str) -> str:
        """Extract the code and markdown cells of a notebook as text.

        Code cells are emitted verbatim (stripped) under a
        ``## CODE CELL ##`` header; markdown cells are emitted under a
        ``## MARKDOWN CELL ##`` header with heading markers, emphasis
        asterisks and link targets removed.  Every emitted cell is followed
        by a ``----`` separator.  Cells of any other type are ignored.

        Args:
            file_path: Path of the notebook, opened as UTF-8 and read with
                ``nbformat.read(..., as_version=4)``.

        Returns:
            The lines joined with newlines, or ``""`` if anything fails.
        """
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                notebook = nbformat.read(f, as_version=4)

            lines = []
            for cell in notebook.cells:
                if cell.cell_type == "code":
                    lines.append("## CODE CELL ##")
                    lines.append(cell.source.strip())
                    lines.append("----")
                elif cell.cell_type == "markdown":
                    lines.append("## MARKDOWN CELL ##")
                    lines.append(FileProcessor._strip_markdown(cell.source.strip()))
                    lines.append("----")
            return "\n".join(lines)
        except Exception as e:
            # Deliberate best-effort boundary: keep the traceback in the log,
            # but never propagate to the caller.
            logger.exception("Error processing notebook: %s", e)
            return ""