import json
import re


class NotebookParser:
    """Flatten a Jupyter notebook (.ipynb JSON) into ordered content blocks.

    The notebook is loaded once in ``__init__``; ``extract`` then walks the
    cells in document order and returns a list of dicts, each of which is
    either a text block ``{"type": "text", "text": ...}`` or an image block
    ``{"type": "image", "source_type": "base64", "data": ..., "mime_type": ...}``.
    """

    # Markdown image syntax ``![alt](target)``; group 1 captures the target.
    _IMAGE_PATTERN = re.compile(r"!\[.*?\]\((.*?)\)")

    # Data-URI prefixes that are lifted out of markdown into image blocks,
    # mapped to the MIME type reported on the resulting block.
    _INLINE_IMAGE_PREFIXES = {
        "data:image/png;base64,": "image/png",
        "data:image/jpeg;base64,": "image/jpeg",
    }

    # Output MIME types emitted as image blocks, in order of preference.
    _IMAGE_MIME_TYPES = ("image/png", "image/jpeg")

    def __init__(self, notebook_path: str):
        """Load the notebook JSON found at ``notebook_path``.

        Args:
            notebook_path: Path to a Jupyter notebook file (UTF-8 JSON).

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        self.notebook_path = notebook_path
        with open(notebook_path, "r", encoding="utf-8") as f:
            self.nb_json = json.load(f)

    def extract(self, code: bool = False, code_output: bool = False,
                markdown: bool = False, plots: bool = False) -> list:
        """Extract notebook content in order of appearance.

        Args:
            code: Include the source of code cells.
            code_output: Include code cell outputs (rich ``data`` bundles,
                stream text and error tracebacks).
            markdown: Include markdown cells.
            plots: Include images. For code outputs this selects PNG/JPEG
                data over ``text/plain``. For markdown cells, inline
                PNG/JPEG data-URI images become image blocks and any other
                image target becomes an ``[Image: <target>]`` text block.
                When False, image markup is removed from markdown text.

        Returns:
            A list of text/image block dicts (see the class docstring).
            Blocks whose text is empty or whitespace-only are omitted,
            except error blocks, which are always emitted.
        """
        content = []
        for cell in self.nb_json.get("cells", []):
            cell_type = cell.get("cell_type")
            if cell_type == "markdown":
                if markdown:
                    content.extend(self._markdown_blocks(cell, plots))
            elif cell_type == "code":
                if code:
                    code_text = self._as_text(cell.get("source", []))
                    if code_text.strip():
                        content.append({"type": "text", "text": code_text})
                if code_output:
                    for output in cell.get("outputs", []):
                        content.extend(self._output_blocks(output, plots))
        return content

    @staticmethod
    def _as_text(value) -> str:
        """Join a notebook multi-line string (a str or a list of str)."""
        if isinstance(value, str):
            return value
        return "".join(value)

    def _markdown_blocks(self, cell: dict, plots: bool) -> list:
        """Return the content blocks for one markdown cell."""
        text = self._as_text(cell.get("source", []))
        if not text.strip():
            return []

        if not plots:
            # Images are not wanted: drop the markup and keep what is left.
            without_images = self._IMAGE_PATTERN.sub("", text).strip()
            if without_images:
                return [{"type": "text", "text": without_images}]
            return []

        image_blocks = []

        def lift_image(match):
            target = match.group(1)
            for prefix, mime_type in self._INLINE_IMAGE_PREFIXES.items():
                if target.startswith(prefix):
                    image_blocks.append({
                        "type": "image",
                        "source_type": "base64",
                        # Slice off only the leading prefix.
                        "data": target[len(prefix):],
                        "mime_type": mime_type,
                    })
                    # The payload now lives in an image block; leaving it in
                    # the text would send the same base64 twice.
                    return ""
            image_blocks.append({"type": "text", "text": f"[Image: {target}]"})
            return match.group(0)

        remaining = self._IMAGE_PATTERN.sub(lift_image, text)
        blocks = []
        if remaining.strip():
            blocks.append({"type": "text", "text": remaining})
        blocks.extend(image_blocks)
        return blocks

    def _output_blocks(self, output: dict, plots: bool) -> list:
        """Return the content blocks for one code-cell output entry."""
        blocks = []

        if "data" in output:
            data = output["data"]
            mime_type = None
            if plots:
                mime_type = next(
                    (m for m in self._IMAGE_MIME_TYPES if m in data), None
                )
            if mime_type is not None:
                blocks.append({
                    "type": "image",
                    "source_type": "base64",
                    # A payload may be stored as a list of string chunks;
                    # normalise it to a single base64 string.
                    "data": self._as_text(data[mime_type]),
                    "mime_type": mime_type,
                })
            elif "text/plain" in data:
                text_out = self._as_text(data["text/plain"])
                if text_out.strip():
                    blocks.append({"type": "text", "text": text_out})

        output_type = output.get("output_type")
        if output_type == "stream":
            text_out = self._as_text(output.get("text", []))
            if text_out.strip():
                blocks.append({"type": "text", "text": text_out})
        elif output_type == "error":
            ename = output.get("ename", "")
            evalue = output.get("evalue", "")
            traceback = "\n".join(output.get("traceback", []))
            blocks.append({
                "type": "text",
                "text": f"{ename}: {evalue}\n{traceback}",
            })

        return blocks