Spaces:

ziadmostafa
/

NoteGenie

Sleeping

App Files Files Community

NoteGenie / utils /notebook_helpers.py

ziadmostafa

initial commit

e60fb94 10 months ago

raw

history blame contribute delete

12.5 kB

	import re
	import json
	import functools

	# The cache lives on the public function itself. Wrapping a cached helper
	# inside the function body would rebuild an empty cache on every call.
	@functools.lru_cache(maxsize=32)
	def get_cached_pattern(pattern, flags=0):
	    """Return a compiled regex for *pattern*, memoised across calls.

	    Args:
	        pattern: Regular-expression source string.
	        flags: ``re`` flag bits passed to ``re.compile`` (default ``0``).

	    Returns:
	        The compiled pattern object. Repeated calls with the same arguments
	        return the same object until it is evicted from the 32-entry cache.

	    Raises:
	        re.error: If *pattern* is not a valid regular expression.
	    """
	    return re.compile(pattern, flags)

	def _strip_markdown_emphasis(text):
	    """Remove bold/italic markdown markers from *text*, keeping the inner text."""
	    text = re.sub(r"\*\*(.+?)\*\*", r"\1", text)  # **bold**
	    text = re.sub(r"\*(.+?)\*", r"\1", text)  # *italic*
	    # Underscore emphasis only counts at word boundaries, so identifiers such
	    # as read_csv keep their underscores.
	    text = re.sub(r"(?<!\w)__(.+?)__(?!\w)", r"\1", text)  # __bold__
	    text = re.sub(r"(?<!\w)_(.+?)_(?!\w)", r"\1", text)  # _italic_
	    return text


	def extract_notebook_info(content):
	    """Extract the notebook name and description from the AI response.

	    The response is searched for ``NOTEBOOK_NAME:`` and
	    ``NOTEBOOK_DESCRIPTION:`` fields. Each value runs until the next field,
	    a ``---`` separator line, or the end of the text.

	    Args:
	        content: Raw text returned by the model. ``None`` or an empty string
	            is treated as a response with no fields.

	    Returns:
	        A dict with the keys ``"name"`` and ``"description"``. A field that
	        is missing falls back to a fixed default string. Markdown emphasis
	        markers are stripped from both values.
	    """
	    content = content or ""

	    name_match = re.search(
	        r"NOTEBOOK_NAME:?\s*(.+?)(?=\n\s*NOTEBOOK_DESCRIPTION|\n\s*---|\n\s*$|$)",
	        content,
	        re.DOTALL,
	    )
	    desc_match = re.search(
	        r"NOTEBOOK_DESCRIPTION:?\s*(.+?)(?=\n\s*---|\n\s*$|$)",
	        content,
	        re.DOTALL,
	    )

	    name = name_match.group(1).strip() if name_match else "Generated Notebook"
	    description = (
	        desc_match.group(1).strip()
	        if desc_match
	        else "Notebook generated using NoteGenie"
	    )

	    return {
	        "name": _strip_markdown_emphasis(name),
	        "description": _strip_markdown_emphasis(description),
	    }

	# Cell markers emitted by the model, e.g. "--- MARKDOWN CELL ---".
	_CELL_MARKER_RE = re.compile(r"---\s*(MARKDOWN|CODE)\s*CELL\s*---")
	# A fenced code block. The "python" tag is optional, and an unterminated fence
	# (e.g. truncated model output) runs to the end of the cell.
	_CODE_FENCE_RE = re.compile(r"```(?:python)?\s*([\s\S]*?)(?:```|\Z)")
	# Fence delimiters used to split responses that contain no cell markers.
	_FENCE_SPLIT_RE = re.compile(r"```python|```")


	def _cells_from_markers(content, markers):
	    """Return one cell per marker, in document order, skipping empty bodies."""
	    cells = []
	    for index, marker in enumerate(markers):
	        # A cell body runs from its marker to the next marker (or end of text).
	        if index + 1 < len(markers):
	            body_end = markers[index + 1].start()
	        else:
	            body_end = len(content)
	        body = content[marker.end():body_end].strip()

	        if marker.group(1) == "CODE":
	            fence = _CODE_FENCE_RE.search(body)
	            # Keep unfenced code instead of silently dropping the cell.
	            source = fence.group(1).strip() if fence else body
	            cell_type = "code"
	        else:
	            source = body
	            cell_type = "markdown"

	        if source:
	            cells.append({"cell_type": cell_type, "source": source})
	    return cells


	def _cells_from_fences(content):
	    """Split marker-less text on code fences: even parts are markdown, odd are code."""
	    cells = []
	    for index, section in enumerate(_FENCE_SPLIT_RE.split(content)):
	        section = section.strip()
	        if section:
	            cell_type = "code" if index % 2 else "markdown"
	            cells.append({"cell_type": cell_type, "source": section})
	    return cells


	def _assemble_notebook(cells):
	    """Wrap ``{"cell_type", "source"}`` dicts in the nbformat 4 notebook structure."""
	    formatted_cells = []
	    for cell in cells:
	        formatted_cell = {
	            "cell_type": cell["cell_type"],
	            "metadata": {},
	            # NOTE(review): lines are emitted without trailing newlines, which
	            # keeps the existing output shape. Confirm the consumer joins them
	            # with "\n" when serialising.
	            "source": cell["source"].split("\n"),
	        }
	        if cell["cell_type"] == "code":
	            formatted_cell["execution_count"] = None
	            formatted_cell["outputs"] = []
	        formatted_cells.append(formatted_cell)

	    return {
	        "cells": formatted_cells,
	        "metadata": {
	            "kernelspec": {
	                "display_name": "Python 3",
	                "language": "python",
	                "name": "python3",
	            },
	            "language_info": {
	                "name": "python",
	                "version": "3.8.0",
	            },
	        },
	        "nbformat": 4,
	        "nbformat_minor": 4,
	    }


	def format_notebook(content):
	    """Convert the AI text response into a Jupyter notebook dict (nbformat 4).

	    Cells are taken from ``--- MARKDOWN CELL ---`` / ``--- CODE CELL ---``
	    markers when present. Each cell's body is the text between its marker
	    and the next one, so cells always keep their document order. Without
	    markers, the text is split on code fences into alternating markdown and
	    code cells. If nothing is extracted, a title cell built from
	    ``extract_notebook_info`` is emitted instead.

	    Args:
	        content: Raw text returned by the model.

	    Returns:
	        A dict with ``cells``, ``metadata``, ``nbformat`` and
	        ``nbformat_minor``. Each cell's ``source`` is a list of lines.
	        Inputs over roughly 200 KB or 50 cells, and any input that makes the
	        standard path raise, are delegated to ``format_large_notebook``.
	    """
	    markers = list(_CELL_MARKER_RE.finditer(content))

	    # Very large responses are handed to the dedicated formatter.
	    if len(content) // 1000 > 200 or len(markers) > 50:
	        return format_large_notebook(content)

	    try:
	        if markers:
	            cells = _cells_from_markers(content, markers)
	        else:
	            cells = _cells_from_fences(content)

	        # Ensure there is at least a title cell if nothing was extracted.
	        if not cells:
	            notebook_info = extract_notebook_info(content)
	            cells.append({
	                "cell_type": "markdown",
	                "source": f"# {notebook_info['name']}\n\n{notebook_info['description']}",
	            })
	            for block in re.findall(r"```python\s*(.*?)```", content, re.DOTALL):
	                if block.strip():
	                    cells.append({"cell_type": "code", "source": block.strip()})

	        return _assemble_notebook(cells)
	    except Exception as e:
	        # Best-effort boundary: any failure falls back to the other formatter.
	        print(f"Error in standard format_notebook: {e}. Using fallback method.")
	        return format_large_notebook(content)

	def _large_markdown_cell(text):
	    """Build a markdown cell dict whose source is *text* split into lines."""
	    return {
	        "cell_type": "markdown",
	        "metadata": {},
	        "source": text.split("\n"),
	    }


	def _large_code_cell(text):
	    """Build an unexecuted code cell dict whose source is *text* split into lines."""
	    return {
	        "cell_type": "code",
	        "metadata": {},
	        "source": text.split("\n"),
	        "execution_count": None,
	        "outputs": [],
	    }


	def format_large_notebook(content):
	    """Build a notebook dict (nbformat 4) by slicing *content* at its cell markers.

	    The notebook always starts with a title cell made from
	    ``extract_notebook_info``. When ``--- MARKDOWN CELL ---`` /
	    ``--- CODE CELL ---`` markers are present, each cell is the text between
	    one marker and the next. Otherwise, fenced ``python`` blocks become code
	    cells and the text around them becomes markdown cells.

	    Args:
	        content: Raw text returned by the model.

	    Returns:
	        A dict with ``cells``, ``metadata``, ``nbformat`` and
	        ``nbformat_minor``. Each cell's ``source`` is a list of lines.
	        Cells whose body is empty are omitted.
	    """
	    notebook_info = extract_notebook_info(content)

	    # The title cell always comes first.
	    cells = [{
	        "cell_type": "markdown",
	        "metadata": {},
	        "source": [f"# {notebook_info['name']}", "", notebook_info['description']],
	    }]

	    markers = list(re.finditer(r"---\s*(MARKDOWN|CODE)\s*CELL\s*---", content))

	    if not markers:
	        # No markers: code fences delimit code, everything else is markdown.
	        last_end = 0
	        for match in re.finditer(r"```python\s*(.*?)```", content, re.DOTALL):
	            markdown_text = content[last_end:match.start()].strip()
	            if markdown_text:
	                cells.append(_large_markdown_cell(markdown_text))

	            code_text = match.group(1).strip()
	            if code_text:
	                cells.append(_large_code_cell(code_text))

	            last_end = match.end()

	        # Text after the last code block is a final markdown cell.
	        trailing_text = content[last_end:].strip()
	        if trailing_text:
	            cells.append(_large_markdown_cell(trailing_text))
	    else:
	        for index, marker in enumerate(markers):
	            # A cell ends where the next marker starts (or at end of content).
	            if index + 1 < len(markers):
	                cell_end = markers[index + 1].start()
	            else:
	                cell_end = len(content)
	            cell_content = content[marker.end():cell_end].strip()
	            if not cell_content:
	                # Skip empty cells, as the marker-less branch already does.
	                continue

	            if marker.group(1) == "MARKDOWN":
	                cells.append(_large_markdown_cell(cell_content))
	            else:
	                code_match = re.search(r"```python\s*(.*?)```", cell_content, re.DOTALL)
	                # Keep unfenced code instead of silently dropping the cell.
	                code_text = code_match.group(1).strip() if code_match else cell_content
	                if code_text:
	                    cells.append(_large_code_cell(code_text))

	    return {
	        "cells": cells,
	        "metadata": {
	            "kernelspec": {
	                "display_name": "Python 3",
	                "language": "python",
	                "name": "python3",
	            },
	            "language_info": {
	                "name": "python",
	                "version": "3.8.0",
	            },
	        },
	        "nbformat": 4,
	        "nbformat_minor": 4,
	    }