Spaces:

HarishMaths
/

Great-Lens-D

Sleeping

App Files Files Community

Great-Lens-D / src /notebook_parser.py

HarishMaths

Upload 2 files

cd9a871 verified 5 months ago

raw

history blame contribute delete

5.16 kB

	import json,re

	class NotebookParser:
	    """Load a Jupyter notebook file and flatten it into ordered content blocks.

	    The notebook JSON is read once in ``__init__`` and kept on ``self.nb_json``.
	    ``extract`` walks ``nb_json["cells"]`` and returns a list of dicts that are
	    either text blocks (``{"type": "text", "text": ...}``) or image blocks
	    (``{"type": "image", "source_type": "base64", "data": ..., "mime_type": ...}``).
	    """

	    # Markdown inline image syntax ``![alt](target)``; group 1 is the target.
	    # The quantifiers must be ``.*?`` (non-greedy "any length"): a bare ``.?``
	    # only matches alt texts / targets of at most one character, which means
	    # real images would never be detected or stripped.
	    _IMAGE_PATTERN = re.compile(r"!\[.*?\]\((.*?)\)")

	    # Data-URI prefixes recognised for images embedded in markdown, mapped to
	    # the MIME type reported in the resulting image block.
	    _DATA_URI_MIME_TYPES = {
	        "data:image/png;base64,": "image/png",
	        "data:image/jpeg;base64,": "image/jpeg",
	    }

	    # Image MIME types looked up in code-cell output bundles, in priority
	    # order (PNG wins when an output carries both).
	    _OUTPUT_IMAGE_MIME_TYPES = ("image/png", "image/jpeg")

	    def __init__(self, notebook_path: str):
	        """Read and parse the notebook at ``notebook_path``.

	        Args:
	            notebook_path: path to a Jupyter notebook file (JSON, UTF-8).

	        Raises:
	            OSError: if the file cannot be opened.
	            json.JSONDecodeError: if the file is not valid JSON.
	        """
	        self.notebook_path = notebook_path
	        with open(notebook_path, "r", encoding="utf-8") as f:
	            self.nb_json = json.load(f)

	    def extract(
	        self,
	        code: bool = False,
	        code_output: bool = False,
	        markdown: bool = False,
	        plots: bool = False,
	    ) -> list:
	        """Extract notebook content in order of appearance.

	        Args:
	            code: include the source of code cells.
	            code_output: include code cell outputs (display data, streams
	                and errors).
	            markdown: include markdown cells.
	            plots: include images. For code outputs this selects PNG/JPEG
	                data over ``text/plain``; for markdown cells (only processed
	                when ``markdown`` is also true) the text is kept verbatim and
	                each inline image is additionally emitted as its own block.
	                When false, inline markdown images are removed from the text.

	        Returns:
	            list of dict content blocks (text or base64 image), in notebook
	            order. Empty if no flag is set or the notebook has no cells.
	        """
	        content = []

	        for cell in self.nb_json.get("cells", []):
	            cell_type = cell.get("cell_type")

	            if cell_type == "markdown":
	                if markdown:
	                    content.extend(self._markdown_blocks(cell, plots))
	            elif cell_type == "code":
	                if code:
	                    code_text = "".join(cell.get("source", []))
	                    if code_text.strip():
	                        content.append(self._text_block(code_text))
	                if code_output:
	                    for output in cell.get("outputs", []):
	                        content.extend(self._output_blocks(output, plots))

	        return content

	    @staticmethod
	    def _text_block(text):
	        """Build a text content block."""
	        return {"type": "text", "text": text}

	    @staticmethod
	    def _image_block(data, mime_type):
	        """Build a base64 image content block."""
	        return {
	            "type": "image",
	            "source_type": "base64",
	            "data": data,
	            "mime_type": mime_type,
	        }

	    def _markdown_blocks(self, cell, plots):
	        """Return the content blocks produced by one markdown cell."""
	        text = "".join(cell.get("source", []))
	        if not text.strip():
	            return []

	        if not plots:
	            # Images are unwanted: drop the image markup and keep what is left.
	            text_no_images = self._IMAGE_PATTERN.sub("", text).strip()
	            return [self._text_block(text_no_images)] if text_no_images else []

	        # With plots enabled the original text is kept untouched and every
	        # inline image follows it as a separate block.
	        blocks = [self._text_block(text)]
	        for target in self._IMAGE_PATTERN.findall(text):
	            for prefix, mime_type in self._DATA_URI_MIME_TYPES.items():
	                if target.startswith(prefix):
	                    # Slice off only the leading prefix; the rest is payload.
	                    blocks.append(self._image_block(target[len(prefix):], mime_type))
	                    break
	            else:
	                # Not an embedded PNG/JPEG (e.g. a URL): keep a text placeholder.
	                blocks.append(self._text_block(f"[Image: {target}]"))
	        return blocks

	    def _output_blocks(self, output, plots):
	        """Return the content blocks produced by one code-cell output."""
	        blocks = []

	        if "data" in output:
	            data = output["data"]
	            image_mime = None
	            if plots:
	                image_mime = next(
	                    (m for m in self._OUTPUT_IMAGE_MIME_TYPES if m in data), None
	                )

	            if image_mime is not None:
	                # Notebook bundle values may be a string or a list of strings;
	                # "".join() leaves a plain string unchanged.
	                blocks.append(
	                    self._image_block("".join(data[image_mime]), image_mime)
	                )
	            elif "text/plain" in data:
	                text_out = "".join(data["text/plain"])
	                if text_out.strip():
	                    blocks.append(self._text_block(text_out))

	        output_type = output.get("output_type")
	        if output_type == "stream":
	            text_out = "".join(output.get("text", []))
	            if text_out.strip():
	                blocks.append(self._text_block(text_out))
	        elif output_type == "error":
	            ename = output.get("ename", "")
	            evalue = output.get("evalue", "")
	            traceback = "\n".join(output.get("traceback", []))
	            blocks.append(self._text_block(f"{ename}: {evalue}\n{traceback}"))

	        return blocks