NoteGenie / utils /notebook_helpers.py
ziadmostafa's picture
initial commit
e60fb94
import re
import json
import functools
# Compiled regex patterns are memoized per (pattern, flags) pair.
@functools.lru_cache(maxsize=32)
def get_cached_pattern(pattern, flags=0):
    """Return the compiled regex for ``pattern``, memoized across calls.

    The cache wraps this function directly so that it lives as long as the
    module does. A cache created inside the function body would be rebuilt
    (and therefore empty) on every call.

    Args:
        pattern: Regular expression source passed to ``re.compile``.
        flags: Optional ``re`` flags; part of the cache key.

    Returns:
        The compiled pattern object produced by ``re.compile``.
    """
    return re.compile(pattern, flags)
def _strip_markdown_emphasis(text):
    """Return ``text`` with Markdown bold/italic markers removed and trimmed."""
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)  # **bold**
    text = re.sub(r'\*(.*?)\*', r'\1', text)  # *italic*
    # Underscore emphasis is only recognised when the underscores are not
    # glued to a word character; otherwise identifiers such as
    # ``my_data_set`` would silently lose their underscores.
    text = re.sub(r'(?<!\w)__(.+?)__(?!\w)', r'\1', text)  # __bold__
    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)  # _italic_
    # Trim last: removing markers can expose leading/trailing whitespace.
    return text.strip()


def extract_notebook_info(content):
    """Extract the notebook name and description from the AI response.

    The name is the text following ``NOTEBOOK_NAME`` up to the
    ``NOTEBOOK_DESCRIPTION`` marker, a ``---`` line, or the end of the text.
    The description is the text following ``NOTEBOOK_DESCRIPTION`` up to a
    ``---`` line or the end of the text. Markdown emphasis markers are
    removed from both values.

    Args:
        content: Raw text of the AI response.

    Returns:
        A dict with the keys ``"name"`` and ``"description"``. A default
        value is used for a field that is missing or empty after cleanup.
    """
    default_name = "Generated Notebook"
    default_description = "Notebook generated using NoteGenie"

    name_match = re.search(
        r"NOTEBOOK_NAME:?\s*(.+?)(?=\n\s*NOTEBOOK_DESCRIPTION|\n\s*---|\n\s*$|$)",
        content,
        re.DOTALL,
    )
    desc_match = re.search(
        r"NOTEBOOK_DESCRIPTION:?\s*(.+?)(?=\n\s*---|\n\s*$|$)",
        content,
        re.DOTALL,
    )

    name = name_match.group(1) if name_match else default_name
    description = desc_match.group(1) if desc_match else default_description

    return {
        # Fall back to the defaults when only formatting markers were present.
        "name": _strip_markdown_emphasis(name) or default_name,
        "description": _strip_markdown_emphasis(description) or default_description,
    }
# Compiled once at import time and reused by every format_notebook call.
_CELL_MARKER_RE = re.compile(r"---\s*(MARKDOWN|CODE)\s*CELL\s*---")
_PYTHON_FENCE_RE = re.compile(r"```python\s*([\s\S]*?)```")


def _cells_from_markers(content, markers):
    """Build raw cells by slicing ``content`` between consecutive markers."""
    cells = []
    for index, marker in enumerate(markers):
        # A cell runs from the end of its marker to the next marker (or EOF).
        if index + 1 < len(markers):
            body_end = markers[index + 1].start()
        else:
            body_end = len(content)
        body = content[marker.end():body_end].strip()

        if marker.group(1) == "MARKDOWN":
            cells.append({"cell_type": "markdown", "source": body})
            continue

        # CODE cell: keep only the fenced python block found inside this
        # cell's own slice, so a cell without a fence can never consume the
        # code that belongs to a later marker.
        fenced = _PYTHON_FENCE_RE.search(body)
        if fenced:
            cells.append({"cell_type": "code", "source": fenced.group(1).strip()})
    return cells


def _cells_from_fences(content):
    """Fallback for marker-less text: alternate markdown/code on ``` fences."""
    cells = []
    for index, section in enumerate(re.split(r"```python|```", content)):
        section = section.strip()
        if not section:
            continue
        # Even-numbered pieces sit outside a fence, odd-numbered ones inside.
        cell_type = "markdown" if index % 2 == 0 else "code"
        cells.append({"cell_type": cell_type, "source": section})
    return cells


def _to_notebook_cell(cell):
    """Convert a raw ``{"cell_type", "source"}`` dict into a notebook cell."""
    source = cell["source"]
    # Always split on "\n" so cells are tokenised the same way whatever
    # their size. A source that is already a list is passed through.
    lines = source.split("\n") if isinstance(source, str) else source
    formatted = {
        "cell_type": cell["cell_type"],
        "metadata": {},
        "source": lines,
    }
    if cell["cell_type"] == "code":
        formatted["execution_count"] = None
        formatted["outputs"] = []
    return formatted


def format_notebook(content):
    """Convert the AI text response into a Jupyter notebook dict.

    Cells are delimited by ``---MARKDOWN CELL---`` / ``---CODE CELL---``
    markers. Each cell is parsed from the text between its own marker and
    the next one, which keeps cells in document order even when a CODE cell
    has no fenced python block (such a cell is skipped). When no cell can be
    built from markers, the text is split on triple-backtick fences instead.
    If that yields nothing either, a title cell built from
    ``extract_notebook_info`` is emitted, followed by any fenced python
    blocks.

    Inputs above roughly 200KB or with more than 50 markers are delegated to
    ``format_large_notebook``, which is also used as a fallback if anything
    raises during formatting.

    Args:
        content: Raw text of the AI response.

    Returns:
        A dict with the keys ``cells``, ``metadata``, ``nbformat`` and
        ``nbformat_minor``. Each cell's ``source`` is a list of lines
        without trailing newline characters.

    NOTE(review): nbformat readers are expected to concatenate ``source``
    lists without a separator; confirm that callers add line endings before
    writing an ``.ipynb`` file.
    """
    markers = list(_CELL_MARKER_RE.finditer(content))

    # If over ~200KB or 50 cells, hand over to the large-notebook formatter.
    if len(content) // 1000 > 200 or len(markers) > 50:
        return format_large_notebook(content)

    try:
        cells = _cells_from_markers(content, markers)

        # The AI did not use the expected markers: fall back to fences.
        if not cells:
            cells = _cells_from_fences(content)

        # Ensure there is at least a title cell if nothing was extracted.
        if not cells:
            notebook_info = extract_notebook_info(content)
            cells.append({
                "cell_type": "markdown",
                "source": f"# {notebook_info['name']}\n\n{notebook_info['description']}"
            })
            cells.extend(
                {"cell_type": "code", "source": block.strip()}
                for block in re.findall(r"```python\s*(.*?)```", content, re.DOTALL)
            )

        return {
            "cells": [_to_notebook_cell(cell) for cell in cells],
            "metadata": {
                "kernelspec": {
                    "display_name": "Python 3",
                    "language": "python",
                    "name": "python3"
                },
                "language_info": {
                    "name": "python",
                    "version": "3.8.0"
                }
            },
            "nbformat": 4,
            "nbformat_minor": 4
        }
    except Exception as e:
        # Deliberate best-effort: any failure falls back to the other formatter.
        print(f"Error in standard format_notebook: {e}. Using fallback method.")
        return format_large_notebook(content)
def format_large_notebook(content):
    """Build a notebook dict from ``content``, always led by a title cell.

    The first cell is a markdown title built from ``extract_notebook_info``.
    When ``---MARKDOWN CELL---`` / ``---CODE CELL---`` markers are present,
    every marker yields one cell taken from the text up to the next marker;
    CODE cells keep only their fenced python block and are skipped when no
    such block exists. Without markers, fenced python blocks become code
    cells and the text around them becomes markdown cells.

    Args:
        content: Raw text of the AI response.

    Returns:
        A dict with the keys ``cells``, ``metadata``, ``nbformat`` and
        ``nbformat_minor``.
    """

    def as_markdown(text):
        # Markdown cell whose source is ``text`` split into lines.
        return {
            "cell_type": "markdown",
            "metadata": {},
            "source": text.split("\n")
        }

    def as_code(text):
        # Code cell with no outputs and no execution count.
        return {
            "cell_type": "code",
            "metadata": {},
            "source": text.split("\n"),
            "execution_count": None,
            "outputs": []
        }

    info = extract_notebook_info(content)

    # The title cell always comes first.
    cells = [{
        "cell_type": "markdown",
        "metadata": {},
        "source": [f"# {info['name']}", "", info['description']]
    }]

    # (start, end, kind) for every cell marker, in document order.
    markers = [
        (found.start(), found.end(), found.group(1))
        for found in re.finditer(r"---\s*(MARKDOWN|CODE)\s*CELL\s*---", content)
    ]

    if markers:
        # Each cell stops where the next marker starts; the last one at EOF.
        stops = [start for start, _, _ in markers[1:]] + [len(content)]
        for (_, body_start, kind), body_stop in zip(markers, stops):
            body = content[body_start:body_stop].strip()
            if kind == "MARKDOWN":
                cells.append(as_markdown(body))
            elif kind == "CODE":
                # Only the code between the triple backticks is kept.
                fenced = re.search(r"```python\s*(.*?)```", body, re.DOTALL)
                if fenced is not None:
                    cells.append(as_code(fenced.group(1).strip()))
    else:
        # No markers: code blocks become code cells, the rest is markdown.
        cursor = 0
        for fenced in re.finditer(r"```python\s*(.*?)```", content, re.DOTALL):
            prose = content[cursor:fenced.start()].strip()
            if prose:
                cells.append(as_markdown(prose))
            code = fenced.group(1).strip()
            if code:
                cells.append(as_code(code))
            cursor = fenced.end()
        # Whatever follows the last code block is trailing markdown.
        trailing = content[cursor:].strip()
        if trailing:
            cells.append(as_markdown(trailing))

    kernelspec = {
        "display_name": "Python 3",
        "language": "python",
        "name": "python3"
    }
    language_info = {
        "name": "python",
        "version": "3.8.0"
    }
    return {
        "cells": cells,
        "metadata": {
            "kernelspec": kernelspec,
            "language_info": language_info
        },
        "nbformat": 4,
        "nbformat_minor": 4
    }