File size: 5,158 Bytes
cd9a871
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import json,re

class NotebookParser:
    """Load a Jupyter notebook (.ipynb JSON) and flatten it into content blocks.

    Every block produced by :meth:`extract` is a dict of one of two shapes:

    * ``{"type": "text", "text": <str>}``
    * ``{"type": "image", "source_type": "base64", "data": <str>,
      "mime_type": "image/png" | "image/jpeg"}``
    """

    # Markdown image syntax ``![alt](target)``; the single group captures the target.
    _IMAGE_PATTERN = re.compile(r"!\[.*?\]\((.*?)\)")

    # Image MIME types recognised in outputs and data URIs, in priority order
    # (PNG wins when an output bundle carries both).
    _IMAGE_MIME_TYPES = ("image/png", "image/jpeg")

    def __init__(self, notebook_path: str):
        """Read and parse the notebook JSON stored at ``notebook_path``.

        Args:
            notebook_path (str): path to the notebook file (read as UTF-8).

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if the file is not valid JSON.
        """
        self.notebook_path = notebook_path
        with open(notebook_path, "r", encoding="utf-8") as f:
            self.nb_json = json.load(f)

    def extract(self, code: bool = False, code_output: bool = False, markdown: bool = False, plots: bool = False) -> list:
        """Extract notebook content in order of appearance.

        Args:
            code (bool): include the source of code cells.
            code_output (bool): include code cell outputs (text results,
                stream output and errors).
            markdown (bool): include markdown cells.
            plots (bool): include images. For code cells this only has an
                effect together with ``code_output``; for markdown cells it
                only has an effect together with ``markdown``.

        Returns:
            List[dict]: text and image blocks (see the class docstring for
            their shape). Empty when no flag is set.
        """
        content = []

        for cell in self.nb_json.get("cells", []):
            cell_type = cell.get("cell_type")

            if cell_type == "markdown":
                if markdown:
                    content.extend(self._markdown_blocks(cell, plots))
            elif cell_type == "code":
                # Source first, then outputs, so blocks follow notebook order.
                if code:
                    code_text = self._as_text(cell.get("source"))
                    if code_text.strip():
                        content.append(self._text_block(code_text))
                if code_output:
                    content.extend(self._output_blocks(cell, plots))

        return content

    @staticmethod
    def _as_text(value) -> str:
        """Collapse a notebook multi-line field (str, list of str or None) to one str."""
        if value is None:
            return ""
        if isinstance(value, str):
            return value
        return "".join(value)

    @staticmethod
    def _text_block(text: str) -> dict:
        """Build a text content block."""
        return {"type": "text", "text": text}

    @staticmethod
    def _image_block(data: str, mime_type: str) -> dict:
        """Build a base64 image content block."""
        return {
            "type": "image",
            "source_type": "base64",
            "data": data,
            "mime_type": mime_type,
        }

    def _markdown_blocks(self, cell: dict, plots: bool) -> list:
        """Return the content blocks for one markdown cell."""
        text = self._as_text(cell.get("source"))
        if not text.strip():
            return []

        if not plots:
            # Images are not wanted: drop the image syntax from the text entirely.
            text_no_images = self._IMAGE_PATTERN.sub("", text).strip()
            return [self._text_block(text_no_images)] if text_no_images else []

        # With plots enabled the raw markdown is kept verbatim, followed by one
        # extra block per image reference found in it.
        blocks = [self._text_block(text)]
        for target in self._IMAGE_PATTERN.findall(text):
            for mime_type in self._IMAGE_MIME_TYPES:
                prefix = f"data:{mime_type};base64,"
                if target.startswith(prefix):
                    # Slice off only the leading prefix; the rest is the payload.
                    blocks.append(self._image_block(target[len(prefix):], mime_type))
                    break
            else:
                # Not an inline PNG/JPEG data URI (e.g. a URL or file path):
                # emit a textual placeholder instead of image data.
                blocks.append(self._text_block(f"[Image: {target}]"))
        return blocks

    def _output_blocks(self, cell: dict, plots: bool) -> list:
        """Return the content blocks for the outputs of one code cell."""
        blocks = []
        for output in cell.get("outputs") or []:
            if "data" in output:
                data = output["data"]
                image_mime = None
                if plots:
                    image_mime = next(
                        (m for m in self._IMAGE_MIME_TYPES if m in data), None
                    )

                if image_mime is not None:
                    # The payload may be stored as a list of string chunks;
                    # join it so consumers always receive one base64 string.
                    blocks.append(
                        self._image_block(self._as_text(data[image_mime]), image_mime)
                    )
                elif "text/plain" in data:
                    # Fallback when there is no image or images are disabled.
                    text_out = self._as_text(data["text/plain"])
                    if text_out.strip():
                        blocks.append(self._text_block(text_out))

            output_type = output.get("output_type")
            if output_type == "stream":
                text_out = self._as_text(output.get("text"))
                if text_out.strip():
                    blocks.append(self._text_block(text_out))
            elif output_type == "error":
                ename = output.get("ename", "")
                evalue = output.get("evalue", "")
                traceback_text = "\n".join(output.get("traceback") or [])
                blocks.append(self._text_block(f"{ename}: {evalue}\n{traceback_text}"))
        return blocks