# Great-Lens-D / src/notebook_parser.py
# HarishMaths's picture
# Upload 2 files
# cd9a871 verified
import json,re
class NotebookParser:
    """Load a Jupyter notebook (.ipynb JSON) and flatten it into content blocks.

    Two kinds of block are produced, both plain dicts:

    * text:  ``{"type": "text", "text": <str>}``
    * image: ``{"type": "image", "source_type": "base64",
      "data": <base64 str>, "mime_type": "image/png" | "image/jpeg"}``
    """

    # Markdown image syntax ``![alt](target)``; group 1 captures the target.
    _MD_IMAGE_RE = re.compile(r"!\[.*?\]\((.*?)\)")

    # Data-URI prefixes that are turned into image blocks, with their MIME type.
    _INLINE_IMAGE_PREFIXES = (
        ("data:image/png;base64,", "image/png"),
        ("data:image/jpeg;base64,", "image/jpeg"),
    )

    # Output MIME types emitted as image blocks, in order of preference.
    _OUTPUT_IMAGE_MIMES = ("image/png", "image/jpeg")

    def __init__(self, notebook_path: str):
        """Initialize with path to a Jupyter notebook file.

        Args:
            notebook_path (str): path to a UTF-8 encoded notebook file.

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if the file content is not valid JSON.
        """
        self.notebook_path = notebook_path
        with open(notebook_path, "r", encoding="utf-8") as f:
            self.nb_json = json.load(f)

    def extract(self, code: bool = False, code_output: bool = False, markdown: bool = False, plots: bool = False):
        """
        Extracts notebook content in order of appearance.

        Args:
            code (bool): include the source of code cells
            code_output (bool): include code cell outputs (rich results,
                stream text and errors)
            markdown (bool): include markdown cells
            plots (bool): include images. Applies to markdown images when
                ``markdown`` is set and to PNG/JPEG cell outputs when
                ``code_output`` is set; on its own it selects nothing.

        Returns:
            List[dict]: list of content blocks for LLM consumption
        """
        content = []
        for cell in self.nb_json.get("cells", []):
            cell_type = cell.get("cell_type")
            if cell_type == "markdown":
                if markdown:
                    content.extend(self._markdown_blocks(cell, plots))
            elif cell_type == "code":
                # Source first, then outputs, mirroring the notebook layout.
                if code:
                    content.extend(self._source_blocks(cell))
                if code_output:
                    content.extend(self._output_blocks(cell, plots))
        return content

    @staticmethod
    def _joined(value):
        """Return a notebook multi-line field as a single ``str``.

        Such fields may be stored either as one string or as a list of
        string fragments; both forms are accepted.
        """
        return value if isinstance(value, str) else "".join(value)

    @staticmethod
    def _text_block(text):
        """Build a text content block."""
        return {"type": "text", "text": text}

    @staticmethod
    def _image_block(data, mime_type):
        """Build a base64 image content block."""
        return {
            "type": "image",
            "source_type": "base64",
            "data": data,
            "mime_type": mime_type,
        }

    @classmethod
    def _split_data_uri(cls, target):
        """Return ``(payload, mime_type)`` for a PNG/JPEG base64 data URI, else ``None``."""
        for prefix, mime_type in cls._INLINE_IMAGE_PREFIXES:
            if target.startswith(prefix):
                # Slice instead of str.replace so only the leading prefix goes.
                return target[len(prefix):], mime_type
        return None

    @classmethod
    def _drop_embedded_image(cls, match):
        """``re.sub`` callback: erase embedded PNG/JPEG images, keep other images."""
        return "" if cls._split_data_uri(match.group(1)) else match.group(0)

    def _markdown_blocks(self, cell, plots):
        """Return the blocks for one markdown cell: its text, then its images."""
        text = self._joined(cell.get("source", []))
        if plots:
            # Embedded PNG/JPEG images become image blocks below; remove their
            # markup here so the base64 payload is not also sent as text.
            shown = self._MD_IMAGE_RE.sub(self._drop_embedded_image, text)
        else:
            shown = self._MD_IMAGE_RE.sub("", text).strip()

        blocks = []
        if shown.strip():
            blocks.append(self._text_block(shown))
        if plots:
            for target in self._MD_IMAGE_RE.findall(text):
                embedded = self._split_data_uri(target)
                if embedded is None:
                    # Not an embedded PNG/JPEG (e.g. a URL): emit a placeholder.
                    blocks.append(self._text_block(f"[Image: {target}]"))
                else:
                    blocks.append(self._image_block(*embedded))
        return blocks

    def _source_blocks(self, cell):
        """Return the source of a code cell as a text block (none if blank)."""
        code_text = self._joined(cell.get("source", []))
        return [self._text_block(code_text)] if code_text.strip() else []

    def _output_blocks(self, cell, plots):
        """Return the blocks for every output of one code cell, in order."""
        blocks = []
        for output in cell.get("outputs", []):
            if "data" in output:
                blocks.extend(self._bundle_blocks(output["data"], plots))
            output_type = output.get("output_type")
            if output_type == "stream":
                stream_text = self._joined(output.get("text", []))
                if stream_text.strip():
                    blocks.append(self._text_block(stream_text))
            elif output_type == "error":
                ename = output.get("ename", "")
                evalue = output.get("evalue", "")
                traceback = "\n".join(output.get("traceback", []))
                blocks.append(self._text_block(f"{ename}: {evalue}\n{traceback}"))
        return blocks

    def _bundle_blocks(self, data, plots):
        """Return at most one block for a MIME bundle.

        With ``plots`` set, a PNG (preferred) or JPEG representation wins;
        otherwise the non-blank ``text/plain`` representation is used.
        """
        if plots:
            for mime_type in self._OUTPUT_IMAGE_MIMES:
                if mime_type in data:
                    # Join so a payload stored as a list of fragments still
                    # yields a single base64 string.
                    return [self._image_block(self._joined(data[mime_type]), mime_type)]
        if "text/plain" in data:
            plain = self._joined(data["text/plain"])
            if plain.strip():
                return [self._text_block(plain)]
        return []