Spaces:
Sleeping
Sleeping
| import json,re | |
class NotebookParser:
    """Load a Jupyter notebook file and flatten its cells into content blocks.

    The notebook JSON is read once in ``__init__`` and kept on ``nb_json``;
    ``extract`` walks its ``"cells"`` list and returns a flat list of
    ``{"type": "text", ...}`` / ``{"type": "image", ...}`` dictionaries.
    """

    # Markdown image syntax ``![alt](target)``; group 1 captures the target.
    _IMAGE_PATTERN = re.compile(r"!\[.*?\]\((.*?)\)")

    # Data-URI prefixes recognised for inline markdown images -> MIME type.
    _DATA_URI_PREFIXES = (
        ("data:image/png;base64,", "image/png"),
        ("data:image/jpeg;base64,", "image/jpeg"),
    )

    # Image MIME types looked up in a code-cell output bundle, in priority order.
    _OUTPUT_IMAGE_MIMES = ("image/png", "image/jpeg")

    def __init__(self, notebook_path: str):
        """Initialize with path to a Jupyter notebook file.

        The file is opened as UTF-8 and parsed with ``json.load``; any error
        raised by ``open`` or ``json.load`` propagates to the caller.
        """
        self.notebook_path = notebook_path
        with open(notebook_path, "r", encoding="utf-8") as f:
            self.nb_json = json.load(f)

    def extract(self, code: bool = False, code_output: bool = False, markdown: bool = False, plots: bool = False) -> list:
        """Extract notebook content in order of appearance.

        Args:
            code: include the source of code cells.
            code_output: include code cell outputs (rich data, streams, errors).
                Independent of ``code``.
            markdown: include markdown cells.
            plots: include images (PNG/JPEG). For code outputs this selects the
                image representation over ``text/plain``. For markdown cells,
                inline ``data:`` URI images become image blocks (and are
                removed from the text block so the payload is not sent twice);
                any other image target additionally yields a
                ``"[Image: <target>]"`` text block. When False, image markup
                is stripped from markdown text.

        Returns:
            A list of dicts. Text blocks are ``{"type": "text", "text": str}``;
            image blocks are ``{"type": "image", "source_type": "base64",
            "data": str, "mime_type": str}``.
        """
        content = []
        for cell in self.nb_json.get("cells", []):
            cell_type = cell.get("cell_type")
            if cell_type == "markdown":
                if markdown:
                    content.extend(self._markdown_blocks(cell, plots))
            elif cell_type == "code":
                if code:
                    content.extend(self._code_blocks(cell))
                if code_output:
                    content.extend(self._output_blocks(cell, plots))
        return content

    @staticmethod
    def _join(value) -> str:
        """Collapse a notebook multi-line value (str or list of str) into one str."""
        if isinstance(value, str):
            return value
        return "".join(value or ())

    @staticmethod
    def _text_block(text: str) -> dict:
        """Build a text content block."""
        return {"type": "text", "text": text}

    @staticmethod
    def _image_block(data: str, mime_type: str) -> dict:
        """Build a base64 image content block."""
        return {
            "type": "image",
            "source_type": "base64",
            "data": data,
            "mime_type": mime_type,
        }

    @classmethod
    def _parse_data_uri(cls, target: str):
        """Return ``(mime_type, payload)`` for a supported data URI, else None."""
        for prefix, mime_type in cls._DATA_URI_PREFIXES:
            if target.startswith(prefix):
                # Slice off only the leading prefix rather than replacing
                # every occurrence of it inside the payload.
                return mime_type, target[len(prefix):]
        return None

    def _markdown_blocks(self, cell: dict, plots: bool) -> list:
        """Return the content blocks for one markdown cell."""
        text = self._join(cell.get("source", []))
        if not text.strip():
            return []

        if not plots:
            # Images are not wanted: drop the image markup altogether.
            without_images = self._IMAGE_PATTERN.sub("", text).strip()
            return [self._text_block(without_images)] if without_images else []

        def drop_inline_payload(match):
            # Inline data-URI images are emitted as image blocks below, so
            # keeping their base64 payload in the text would duplicate it.
            if self._parse_data_uri(match.group(1)):
                return ""
            return match.group(0)

        blocks = []
        visible_text = self._IMAGE_PATTERN.sub(drop_inline_payload, text)
        if visible_text.strip():
            blocks.append(self._text_block(visible_text))

        for target in self._IMAGE_PATTERN.findall(text):
            parsed = self._parse_data_uri(target)
            if parsed:
                mime_type, payload = parsed
                blocks.append(self._image_block(payload, mime_type))
            else:
                # Not an embedded PNG/JPEG: emit a textual placeholder.
                blocks.append(self._text_block(f"[Image: {target}]"))
        return blocks

    def _code_blocks(self, cell: dict) -> list:
        """Return the source of one code cell as a text block (if non-blank)."""
        code_text = self._join(cell.get("source", []))
        return [self._text_block(code_text)] if code_text.strip() else []

    def _output_blocks(self, cell: dict, plots: bool) -> list:
        """Return the content blocks for the outputs of one code cell."""
        blocks = []
        for output in cell.get("outputs", []):
            if "data" in output:
                block = self._bundle_block(output["data"], plots)
                if block is not None:
                    blocks.append(block)

            output_type = output.get("output_type")
            if output_type == "stream":
                stream_text = self._join(output.get("text", []))
                if stream_text.strip():
                    blocks.append(self._text_block(stream_text))
            elif output_type == "error":
                ename = output.get("ename", "")
                evalue = output.get("evalue", "")
                traceback = "\n".join(output.get("traceback", []))
                blocks.append(self._text_block(f"{ename}: {evalue}\n{traceback}"))
        return blocks

    def _bundle_block(self, data: dict, plots: bool):
        """Pick one representation from an output's ``data`` bundle, or None."""
        if plots:
            for mime_type in self._OUTPUT_IMAGE_MIMES:
                if mime_type in data:
                    # The payload may be stored as a list of lines; join it
                    # so "data" is always a single base64 string.
                    return self._image_block(self._join(data[mime_type]), mime_type)
        if "text/plain" in data:
            text_out = self._join(data["text/plain"])
            if text_out.strip():
                return self._text_block(text_out)
        return None