File size: 1,985 Bytes
d8fd28f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# src/preprocessing/file_processor.py
from pptx import Presentation
import nbformat
import re
import os
import logging

logger = logging.getLogger(__name__)

class FileProcessor:
    @staticmethod
    def process_slide_file(file_path: str) -> str:
        try:
            prs = Presentation(file_path)
            content = []
            for i, slide in enumerate(prs.slides):
                content.append(f"=== Slide {i+1} ===")
                for shape in slide.shapes:
                    if hasattr(shape, "text") and shape.text.strip():
                        cleaned_text = re.sub(r'\s+', ' ', shape.text.strip())
                        content.append(cleaned_text)
                content.append("")
            return "\n".join(content)
        except Exception as e:
            logger.error(f"Error processing presentation: {str(e)}")
            return ""

    @staticmethod
    def process_notebook_file(file_path: str) -> str:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                nb = nbformat.read(f, as_version=4)

            content = []
            for cell in nb.cells:
                if cell.cell_type == 'code':
                    content.append("## CODE CELL ##")
                    content.append(cell.source.strip())
                    content.append("----")
                elif cell.cell_type == 'markdown':
                    content.append("## MARKDOWN CELL ##")
                    cleaned_text = cell.source.strip()
                    cleaned_text = re.sub(r'#+\s*', '', cleaned_text)
                    cleaned_text = re.sub(r'\*{1,2}(.*?)\*{1,2}', r'\1', cleaned_text)
                    cleaned_text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', cleaned_text)
                    content.append(cleaned_text)
                    content.append("----")

            return "\n".join(content)
        except Exception as e:
            logger.error(f"Error processing notebook: {str(e)}")
            return ""