File size: 12,496 Bytes
e60fb94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
import re
import json
import functools

# Module-level memoized compiler.  BUG FIX: the old code applied
# @functools.lru_cache to a function defined *inside* get_cached_pattern(),
# which created a brand-new (empty) cache on every call — nothing was ever
# reused.  Hoisting the cached function to module level makes the cache
# actually persist across calls.
@functools.lru_cache(maxsize=32)
def _compile_pattern(pattern_str, pattern_flags):
    """Compile and memoize a regex pattern."""
    return re.compile(pattern_str, pattern_flags)

def get_cached_pattern(pattern, flags=0):
    """Return a compiled regex, cached across calls for better performance.

    Parameters:
        pattern: Regex source string.
        flags: ``re`` flag bits (default 0).

    Returns:
        re.Pattern: The compiled (and cached) pattern object.
    """
    return _compile_pattern(pattern, flags)

def extract_notebook_info(content):
    """Pull the notebook title and description out of the AI response.

    Looks for ``NOTEBOOK_NAME:`` / ``NOTEBOOK_DESCRIPTION:`` labels and
    strips any markdown emphasis from the captured values.  Falls back to
    generic defaults when a label is missing.
    """
    def _strip_emphasis(text):
        # Remove bold (**x**), italic (*x*) and underscore (_x_) wrappers.
        for pat in (r'\*\*(.*?)\*\*', r'\*(.*?)\*', r'_(.*?)_'):
            text = re.sub(pat, r'\1', text)
        return text

    # The lookaheads stop each capture at the next label, a --- divider,
    # or end of input (DOTALL lets the value span multiple lines).
    name_match = re.search(
        r"NOTEBOOK_NAME:?\s*(.+?)(?=\n\s*NOTEBOOK_DESCRIPTION|\n\s*---|\n\s*$|$)",
        content, re.DOTALL)
    desc_match = re.search(
        r"NOTEBOOK_DESCRIPTION:?\s*(.+?)(?=\n\s*---|\n\s*$|$)",
        content, re.DOTALL)

    name = name_match.group(1).strip() if name_match else "Generated Notebook"
    description = (desc_match.group(1).strip()
                   if desc_match else "Notebook generated using NoteGenie")

    return {
        "name": _strip_emphasis(name),
        "description": _strip_emphasis(description),
    }

def format_notebook(content):
    """Convert the AI text response into a Jupyter notebook (nbformat 4) dict.

    Cells are expected to be delimited with ``--- MARKDOWN CELL ---`` /
    ``--- CODE CELL ---`` markers, with code bodies inside ```python fences.
    Very large inputs, and any parsing failure, are routed through
    format_large_notebook() instead.

    Parameters:
        content: Raw text produced by the model.

    Returns:
        dict: Notebook structure ready for json.dump().
    """
    # Patterns are compiled once and reused across calls via the cache.
    markdown_pattern = get_cached_pattern(r"---\s*MARKDOWN\s*CELL\s*---\s*([\s\S]*?)(?=---\s*(?:MARKDOWN|CODE)\s*CELL\s*---|$)", re.DOTALL)
    code_pattern = get_cached_pattern(r"---\s*CODE\s*CELL\s*---\s*```python\s*([\s\S]*?)```", re.DOTALL)
    cell_marker_pattern = get_cached_pattern(r"---\s*(MARKDOWN|CODE)\s*CELL\s*---", re.DOTALL)

    # Quick scan to estimate size/complexity before doing the full parse.
    complexity = len(content) // 1000  # rough size in KB
    cell_count = len(cell_marker_pattern.findall(content))

    # Very large notebooks take the memory-efficient (but slower) path.
    if complexity > 200 or cell_count > 50:  # over ~200KB or 50 cells
        return format_large_notebook(content)

    try:
        # Extract cells from the content in a single pass if possible.
        markdown_cells = markdown_pattern.findall(content)
        code_cells = code_pattern.findall(content)

        if not markdown_cells and not code_cells:
            # The AI didn't use the expected marker format: split on code
            # fences and alternate markdown / code sections.
            sections = re.split(r"```python|```", content)
            cells = []

            for i, section in enumerate(sections):
                section = section.strip()
                if section and i % 2 == 0:
                    # Even-indexed sections are the text between fences.
                    cells.append({"cell_type": "markdown", "source": section})
                elif section:
                    # Odd-indexed sections are fenced code.
                    cells.append({"cell_type": "code", "source": section})
        else:
            # Interleave markdown and code cells in their original order.
            cells = []

            # Marker positions give the overall ordering of cells.
            all_matches = list(cell_marker_pattern.finditer(content))
            all_types = [m.group(1) for m in all_matches]

            md_idx = 0
            code_idx = 0

            for i, cell_type in enumerate(all_types):
                marker = all_matches[i]
                marker_end = marker.end()
                next_marker_start = all_matches[i+1].start() if i+1 < len(all_matches) else len(content)
                cell_content = content[marker_end:next_marker_start].strip()

                if cell_type == "MARKDOWN":
                    if md_idx < len(markdown_cells) or (i == len(all_types) - 1 and cell_content):
                        if md_idx < len(markdown_cells):
                            cell_source = markdown_cells[md_idx].strip()
                            md_idx += 1
                        else:
                            # Last markdown cell wasn't captured by the
                            # pattern; use the raw span after the marker.
                            cell_source = cell_content

                        cells.append({
                            "cell_type": "markdown",
                            "source": cell_source
                        })
                elif cell_type == "CODE":
                    if code_idx < len(code_cells) or (i == len(all_types) - 1 and "```python" in cell_content):
                        if code_idx < len(code_cells):
                            cell_source = code_cells[code_idx].strip()
                            code_idx += 1
                        else:
                            # Last code cell wasn't captured by the pattern;
                            # pull the fenced body out of the raw span.
                            code_match = re.search(r"```python\s*([\s\S]*?)```", cell_content, re.DOTALL)
                            cell_source = code_match.group(1).strip() if code_match else ""

                        cells.append({
                            "cell_type": "code",
                            "source": cell_source
                        })

        # Guarantee at least a title cell if nothing was extracted.
        if not cells:
            notebook_info = extract_notebook_info(content)
            cells.append({
                "cell_type": "markdown",
                "source": f"# {notebook_info['name']}\n\n{notebook_info['description']}"
            })

            # Salvage any fenced code blocks that might still be present.
            code_blocks = re.findall(r"```python\s*(.*?)```", content, re.DOTALL)
            for block in code_blocks:
                cells.append({
                    "cell_type": "code",
                    "source": block.strip()
                })

        # Shape cells into the Jupyter notebook structure.
        formatted_cells = []
        for cell in cells:
            cell_source = cell["source"]
            if isinstance(cell_source, str):
                # BUG FIX: cells over 10KB used to go through str.splitlines()
                # while smaller ones used str.split("\n").  The two are not
                # equivalent (splitlines drops a trailing empty line and also
                # splits on \r, \x0b, \x0c, ...), so the output silently
                # depended on cell size; the append loop also saved no memory.
                # Split uniformly instead.
                source_lines = cell_source.split("\n")
            else:
                # Already a list of lines — use as-is.
                source_lines = cell_source

            formatted_cell = {
                "cell_type": cell["cell_type"],
                "metadata": {},
                "source": source_lines
            }

            # Code cells additionally carry execution-state fields.
            if cell["cell_type"] == "code":
                formatted_cell["execution_count"] = None
                formatted_cell["outputs"] = []

            formatted_cells.append(formatted_cell)

        # Assemble the top-level notebook document.
        notebook = {
            "cells": formatted_cells,
            "metadata": {
                "kernelspec": {
                    "display_name": "Python 3",
                    "language": "python",
                    "name": "python3"
                },
                "language_info": {
                    "name": "python",
                    "version": "3.8.0"
                }
            },
            "nbformat": 4,
            "nbformat_minor": 4
        }

        return notebook
    except Exception as e:
        # Deliberate broad catch: any parse failure falls back to the more
        # robust (but slower) large-notebook path rather than crashing.
        print(f"Error in standard format_notebook: {e}. Using fallback method.")
        return format_large_notebook(content)

def format_large_notebook(content):
    """Memory-efficient formatter for very large notebooks.

    Walks the content span by span rather than collecting all cell bodies
    up front, so it stays cheap on big inputs.  Always begins with a title
    cell built from the extracted notebook metadata.
    """
    info = extract_notebook_info(content)

    # Seed the notebook with a title/description markdown cell.
    cells = [{
        "cell_type": "markdown",
        "metadata": {},
        "source": [f"# {info['name']}", "", info['description']]
    }]

    def _markdown_cell(text):
        # Build a markdown cell from raw text.
        return {
            "cell_type": "markdown",
            "metadata": {},
            "source": text.split("\n")
        }

    def _code_cell(text):
        # Build an unexecuted code cell from raw source text.
        return {
            "cell_type": "code",
            "metadata": {},
            "source": text.split("\n"),
            "execution_count": None,
            "outputs": []
        }

    # Locate every cell marker together with its type and span.
    markers = [
        (m.start(), m.end(), m.group(1))
        for m in re.finditer(r"---\s*(MARKDOWN|CODE)\s*CELL\s*---", content)
    ]

    if markers:
        # Marker format: classify each marker-to-marker span by its type.
        for idx, (_, body_start, kind) in enumerate(markers):
            body_end = markers[idx + 1][0] if idx + 1 < len(markers) else len(content)
            body = content[body_start:body_end].strip()

            if kind == "MARKDOWN":
                cells.append(_markdown_cell(body))
            elif kind == "CODE":
                # Only the fenced ```python body counts as code.
                fenced = re.search(r"```python\s*(.*?)```", body, re.DOTALL)
                if fenced:
                    cells.append(_code_cell(fenced.group(1).strip()))
    else:
        # No markers: fenced python blocks become code cells and everything
        # between them becomes markdown.
        cursor = 0
        for fenced in re.finditer(r"```python\s*(.*?)```", content, re.DOTALL):
            leading = content[cursor:fenced.start()].strip()
            if leading:
                cells.append(_markdown_cell(leading))

            snippet = fenced.group(1).strip()
            if snippet:
                cells.append(_code_cell(snippet))

            cursor = fenced.end()

        # Any text after the last fence is trailing markdown.
        trailing = content[cursor:].strip()
        if trailing:
            cells.append(_markdown_cell(trailing))

    # Wrap the cells in the standard nbformat-4 envelope.
    return {
        "cells": cells,
        "metadata": {
            "kernelspec": {
                "display_name": "Python 3",
                "language": "python",
                "name": "python3"
            },
            "language_info": {
                "name": "python",
                "version": "3.8.0"
            }
        },
        "nbformat": 4,
        "nbformat_minor": 4
    }