import re
import json
import functools

# Routing thresholds used by format_notebook(): content above either limit is
# handed to format_large_notebook() instead of the standard path.
_LARGE_CONTENT_KB = 200
_LARGE_CELL_COUNT = 50

# Marker the AI response uses to introduce a cell, e.g. "---MARKDOWN CELL---".
_CELL_MARKER = r"---\s*(MARKDOWN|CODE)\s*CELL\s*---"
# A fenced Python block; group 1 is the code between the fences.
_PYTHON_FENCE = r"```python\s*(.*?)```"

# Emphasis markers stripped from the notebook name/description, applied in
# order.  The underscore forms only match at word boundaries so identifiers
# such as "sales_data_2020" are left intact.
_EMPHASIS_PATTERNS = (
    re.compile(r"\*\*(.*?)\*\*"),           # **bold**
    re.compile(r"\*(.*?)\*"),               # *italic*
    re.compile(r"(?<!\w)__(.+?)__(?!\w)"),  # __bold__
    re.compile(r"(?<!\w)_(.+?)_(?!\w)"),    # _italic_
)


@functools.lru_cache(maxsize=32)
def _compile_pattern(pattern_str, pattern_flags):
    """Compile a regex; results are memoized at module level."""
    return re.compile(pattern_str, pattern_flags)


def get_cached_pattern(pattern, flags=0):
    """Return a compiled regex for *pattern*, reusing earlier compilations.

    Args:
        pattern: Regular expression source string.
        flags: ``re`` flags to compile with (default 0).

    Returns:
        The compiled pattern object.
    """
    # The cache must live outside this function: a cache created per call
    # would start empty every time and never produce a hit.
    return _compile_pattern(pattern, flags)


def _strip_emphasis(text):
    """Remove Markdown bold/italic markers from *text*, keeping the inner text."""
    for pattern in _EMPHASIS_PATTERNS:
        text = pattern.sub(r"\1", text)
    return text


def extract_notebook_info(content):
    """Extract notebook name and description from the AI response.

    Looks for ``NOTEBOOK_NAME`` and ``NOTEBOOK_DESCRIPTION`` fields in
    *content*.  Each value runs until the next field, a ``---`` line, or the
    end of the text, and has Markdown emphasis markers removed.

    Args:
        content: Raw text of the AI response.

    Returns:
        A dict with ``"name"`` and ``"description"`` keys.  Missing fields fall
        back to ``"Generated Notebook"`` and
        ``"Notebook generated using NoteGenie"`` respectively.
    """
    name_match = re.search(
        r"NOTEBOOK_NAME:?\s*(.+?)(?=\n\s*NOTEBOOK_DESCRIPTION|\n\s*---|\n\s*$|$)",
        content,
        re.DOTALL,
    )
    desc_match = re.search(
        r"NOTEBOOK_DESCRIPTION:?\s*(.+?)(?=\n\s*---|\n\s*$|$)",
        content,
        re.DOTALL,
    )

    name = name_match.group(1).strip() if name_match else "Generated Notebook"
    description = (
        desc_match.group(1).strip()
        if desc_match
        else "Notebook generated using NoteGenie"
    )

    return {
        "name": _strip_emphasis(name),
        "description": _strip_emphasis(description),
    }


def _make_cell(cell_type, text):
    """Build one nbformat-4 cell dict of *cell_type* from the string *text*."""
    cell = {
        "cell_type": cell_type,
        "metadata": {},
        # nbformat joins the entries of a list-valued "source" with '' when
        # loading, so every line must keep its own terminator.
        "source": text.splitlines(keepends=True),
    }
    if cell_type == "code":
        cell["execution_count"] = None
        cell["outputs"] = []
    return cell


def _title_cell(content):
    """Build a markdown cell holding the notebook name as a heading plus description."""
    info = extract_notebook_info(content)
    return _make_cell("markdown", f"# {info['name']}\n\n{info['description']}")


def _build_notebook(cells):
    """Wrap already-formatted *cells* in the nbformat 4.4 notebook structure."""
    return {
        "cells": cells,
        "metadata": {
            "kernelspec": {
                "display_name": "Python 3",
                "language": "python",
                "name": "python3",
            },
            "language_info": {
                "name": "python",
                "version": "3.8.0",
            },
        },
        "nbformat": 4,
        "nbformat_minor": 4,
    }


def _cells_from_markers(content, markers):
    """Build cells from explicit cell markers.

    *markers* is the non-empty list of ``_CELL_MARKER`` match objects found in
    *content*, in order.  Each cell's body is the text between its own marker
    and the next one (or the end of the content), so a malformed cell can
    never shift the content of the cells after it.
    """
    fence = get_cached_pattern(_PYTHON_FENCE, re.DOTALL)
    body_ends = [marker.start() for marker in markers[1:]] + [len(content)]

    cells = []
    for marker, body_end in zip(markers, body_ends):
        body = content[marker.end():body_end].strip()
        if marker.group(1) == "MARKDOWN":
            cells.append(_make_cell("markdown", body))
            continue

        fenced = fence.search(body)
        # A CODE cell without a ```python fence keeps its raw text rather
        # than being dropped.
        code = fenced.group(1).strip() if fenced else body
        if fenced or code:
            cells.append(_make_cell("code", code))
    return cells


def _cells_from_alternating_fences(content):
    """Build cells by splitting on code fences (marker-less input).

    Text outside fences becomes markdown, text inside becomes code; both
    ```` ```python ```` and bare ```` ``` ```` count as fence delimiters.
    Empty sections are skipped.
    """
    cells = []
    for index, section in enumerate(re.split(r"```python|```", content)):
        text = section.strip()
        if text:
            # Sections alternate: even indexes are prose, odd ones are code.
            cells.append(_make_cell("code" if index % 2 else "markdown", text))
    return cells


def _cells_from_python_fences(content):
    """Build cells from complete ```` ```python ... ``` ```` blocks (marker-less input).

    Each fenced block becomes a code cell and the text between blocks becomes
    markdown cells.  Empty pieces are skipped.
    """
    fence = get_cached_pattern(_PYTHON_FENCE, re.DOTALL)
    cells = []
    cursor = 0
    for match in fence.finditer(content):
        prose = content[cursor:match.start()].strip()
        if prose:
            cells.append(_make_cell("markdown", prose))
        code = match.group(1).strip()
        if code:
            cells.append(_make_cell("code", code))
        cursor = match.end()

    trailing = content[cursor:].strip()
    if trailing:
        cells.append(_make_cell("markdown", trailing))
    return cells


def format_notebook(content):
    """Convert the AI text response into a Jupyter notebook (nbformat 4) dict.

    Cells are taken from ``---MARKDOWN CELL---`` / ``---CODE CELL---`` markers
    when present; otherwise the text is split on code fences.  If nothing can
    be extracted, a title cell built from the notebook name and description is
    emitted, followed by any fenced Python blocks.

    Content longer than about 200 KB or with more than 50 cell markers is
    delegated to :func:`format_large_notebook`, which always prepends a title
    cell.

    Args:
        content: Raw text of the AI response.

    Returns:
        A dict in nbformat 4.4 layout (``cells``, ``metadata``, ``nbformat``,
        ``nbformat_minor``).  Each cell's ``source`` is a list of lines that
        keep their line terminators.
    """
    marker_pattern = get_cached_pattern(_CELL_MARKER, re.DOTALL)
    markers = list(marker_pattern.finditer(content))

    size_kb = len(content) // 1000  # rough estimate based on content length
    if size_kb > _LARGE_CONTENT_KB or len(markers) > _LARGE_CELL_COUNT:
        return format_large_notebook(content)

    try:
        if markers:
            cells = _cells_from_markers(content, markers)
        else:
            cells = _cells_from_alternating_fences(content)

        # Ensure there is at least a title cell if nothing was extracted.
        if not cells:
            cells = [_title_cell(content)]
            fence = get_cached_pattern(_PYTHON_FENCE, re.DOTALL)
            cells.extend(
                _make_cell("code", block.strip())
                for block in fence.findall(content)
            )

        return _build_notebook(cells)
    except Exception as e:
        # Deliberate best-effort: report the problem and try the other formatter.
        print(f"Error in standard format_notebook: {e}. Using fallback method.")
        return format_large_notebook(content)


def format_large_notebook(content):
    """Format an AI text response as a notebook that starts with a title cell.

    The first cell is always a markdown title built from
    :func:`extract_notebook_info`.  The remaining cells come from the cell
    markers when any are present; otherwise every complete
    ```` ```python ```` block becomes a code cell and the text around the
    blocks becomes markdown cells.

    Args:
        content: Raw text of the AI response.

    Returns:
        A dict in nbformat 4.4 layout (``cells``, ``metadata``, ``nbformat``,
        ``nbformat_minor``).  Each cell's ``source`` is a list of lines that
        keep their line terminators.
    """
    cells = [_title_cell(content)]

    marker_pattern = get_cached_pattern(_CELL_MARKER, re.DOTALL)
    markers = list(marker_pattern.finditer(content))
    if markers:
        cells.extend(_cells_from_markers(content, markers))
    else:
        cells.extend(_cells_from_python_fences(content))

    return _build_notebook(cells)