Spaces:
Sleeping
Sleeping
| import re | |
| import json | |
| import functools | |
| # Add a simple LRU cache for regex patterns | |
@functools.lru_cache(maxsize=128)
def _compile_pattern(pattern_str, pattern_flags):
    """Compile ``pattern_str`` with ``pattern_flags``; results are memoized."""
    return re.compile(pattern_str, pattern_flags)


def get_cached_pattern(pattern, flags=0):
    """Return a compiled regex for ``pattern``, reusing earlier compilations.

    Compiled patterns are kept in a bounded LRU cache keyed on the
    ``(pattern, flags)`` pair, so repeated calls with the same arguments
    return the very same pattern object.

    Args:
        pattern: Regular expression source string.
        flags: Optional ``re`` flags (for example ``re.DOTALL``).

    Returns:
        The compiled pattern object.

    Raises:
        re.error: If ``pattern`` is not a valid regular expression.
    """
    return _compile_pattern(pattern, flags)
def _strip_markdown_emphasis(text):
    """Remove ``**bold**``, ``*italic*`` and ``_italic_``/``__bold__`` markers."""
    text = re.sub(r"\*\*(.*?)\*\*", r"\1", text)
    text = re.sub(r"\*(.*?)\*", r"\1", text)
    # Underscores only count as emphasis when they sit on a word boundary;
    # this keeps identifiers such as ``my_data_set`` intact instead of
    # collapsing them to ``mydataset``.
    return re.sub(r"(?<!\w)_+(.*?)_+(?!\w)", r"\1", text)


def extract_notebook_info(content):
    """Extract the notebook name and description from the AI response.

    Looks for ``NOTEBOOK_NAME`` and ``NOTEBOOK_DESCRIPTION`` labels (the
    trailing colon is optional). Each value runs until the next label, a
    ``---`` separator line, or the end of the text, and is then stripped of
    surrounding whitespace and Markdown emphasis markers.

    Args:
        content: Raw text returned by the model.

    Returns:
        A dict with the keys ``"name"`` and ``"description"``. When a label
        is missing, ``"Generated Notebook"`` and
        ``"Notebook generated using NoteGenie"`` are used respectively.
    """
    name_match = re.search(
        r"NOTEBOOK_NAME:?\s*(.+?)(?=\n\s*NOTEBOOK_DESCRIPTION|\n\s*---|\n\s*$|$)",
        content,
        re.DOTALL,
    )
    desc_match = re.search(
        r"NOTEBOOK_DESCRIPTION:?\s*(.+?)(?=\n\s*---|\n\s*$|$)",
        content,
        re.DOTALL,
    )

    name = name_match.group(1).strip() if name_match else "Generated Notebook"
    description = (
        desc_match.group(1).strip()
        if desc_match
        else "Notebook generated using NoteGenie"
    )

    return {
        "name": _strip_markdown_emphasis(name),
        "description": _strip_markdown_emphasis(description),
    }
def _cells_from_markers(content, markers):
    """Build raw cells by slicing ``content`` between consecutive cell markers."""
    cells = []
    for index, marker in enumerate(markers):
        # A cell spans from the end of its own marker to the start of the
        # next marker, or to the end of the text for the last marker. Slicing
        # by position keeps every cell tied to its own marker even when a
        # neighbouring cell is malformed.
        if index + 1 < len(markers):
            stop = markers[index + 1].start()
        else:
            stop = len(content)
        body = content[marker.end():stop].strip()

        if marker.group(1) == "MARKDOWN":
            cells.append({"cell_type": "markdown", "source": body})
            continue

        # CODE cell: only the first fenced python block inside the cell is
        # used; a CODE marker without such a block yields no cell.
        fenced = re.search(r"```python\s*([\s\S]*?)```", body)
        if fenced:
            cells.append({"cell_type": "code", "source": fenced.group(1).strip()})
    return cells


def _cells_from_fences(content):
    """Build raw cells for marker-less text by alternating on code fences."""
    cells = []
    for index, section in enumerate(re.split(r"```python|```", content)):
        section = section.strip()
        if not section:
            continue
        # Splitting on fences alternates prose (even index) and code (odd).
        cell_type = "markdown" if index % 2 == 0 else "code"
        cells.append({"cell_type": cell_type, "source": section})
    return cells


def format_notebook(content):
    """Convert the AI text response into a Jupyter notebook (nbformat 4) dict.

    The text is expected to contain ``--- MARKDOWN CELL ---`` and
    ``--- CODE CELL ---`` markers, with code wrapped in python code fences.
    Parsing strategy, in order:

    1. Inputs whose length divided by 1000 exceeds 200, or that contain more
       than 50 markers, are delegated to ``format_large_notebook``.
    2. Otherwise cells are taken from the text between consecutive markers.
    3. If that yields nothing, the text is split on code fences, alternating
       markdown and code.
    4. If there is still nothing, a single title cell is built from
       ``extract_notebook_info`` plus any fenced python blocks.

    Each cell's ``source`` is a list of lines that keep their line
    terminators, so joining the list with an empty string reproduces the
    cell text.

    Args:
        content: Raw text returned by the model.

    Returns:
        A dict with the keys ``cells``, ``metadata``, ``nbformat`` and
        ``nbformat_minor``.
    """
    marker_pattern = get_cached_pattern(r"---\s*(MARKDOWN|CODE)\s*CELL\s*---", re.DOTALL)
    markers = list(marker_pattern.finditer(content))

    complexity = len(content) // 1000  # rough size estimate in thousands of characters
    if complexity > 200 or len(markers) > 50:
        return format_large_notebook(content)

    try:
        cells = _cells_from_markers(content, markers)
        if not cells:
            # No usable markers: fall back to alternating on code fences.
            cells = _cells_from_fences(content)

        if not cells:
            # Nothing could be extracted; emit at least a title cell.
            notebook_info = extract_notebook_info(content)
            cells.append({
                "cell_type": "markdown",
                "source": f"# {notebook_info['name']}\n\n{notebook_info['description']}",
            })
            for block in re.findall(r"```python\s*(.*?)```", content, re.DOTALL):
                cells.append({"cell_type": "code", "source": block.strip()})

        formatted_cells = []
        for cell in cells:
            formatted_cell = {
                "cell_type": cell["cell_type"],
                "metadata": {},
                # Keep the line terminators: notebook readers join the list
                # with an empty string, so bare lines would run together.
                "source": cell["source"].splitlines(keepends=True),
            }
            if cell["cell_type"] == "code":
                formatted_cell["execution_count"] = None
                formatted_cell["outputs"] = []
            formatted_cells.append(formatted_cell)

        return {
            "cells": formatted_cells,
            "metadata": {
                "kernelspec": {
                    "display_name": "Python 3",
                    "language": "python",
                    "name": "python3",
                },
                "language_info": {
                    "name": "python",
                    "version": "3.8.0",
                },
            },
            "nbformat": 4,
            "nbformat_minor": 4,
        }
    except Exception as e:
        # Deliberate best-effort boundary: any parsing failure falls back to
        # the alternative formatter instead of propagating to the caller.
        print(f"Error in standard format_notebook: {e}. Using fallback method.")
        return format_large_notebook(content)
def _nb_markdown_cell(text):
    """Return an nbformat markdown cell whose source lines keep their newlines."""
    return {
        "cell_type": "markdown",
        "metadata": {},
        "source": text.splitlines(keepends=True),
    }


def _nb_code_cell(text):
    """Return an unexecuted nbformat code cell whose source lines keep their newlines."""
    return {
        "cell_type": "code",
        "metadata": {},
        "source": text.splitlines(keepends=True),
        "execution_count": None,
        "outputs": [],
    }


def format_large_notebook(content):
    """Build a Jupyter notebook (nbformat 4) dict from the AI text response.

    The notebook always starts with a title cell made from
    ``extract_notebook_info``. The remaining cells come from one of two
    strategies:

    * If ``--- MARKDOWN CELL ---`` / ``--- CODE CELL ---`` markers are
      present, each cell is the text between its marker and the next one.
      For CODE cells only the first fenced python block is used, and a CODE
      marker without such a block produces no cell.
    * Otherwise every fenced python block becomes a code cell and the text
      around the blocks becomes markdown cells. Empty pieces are skipped.

    Each cell's ``source`` is a list of lines that keep their line
    terminators, so joining the list with an empty string reproduces the
    cell text.

    Args:
        content: Raw text returned by the model.

    Returns:
        A dict with the keys ``cells``, ``metadata``, ``nbformat`` and
        ``nbformat_minor``.
    """
    notebook_info = extract_notebook_info(content)
    cells = [
        _nb_markdown_cell(
            f"# {notebook_info['name']}\n\n{notebook_info['description']}"
        )
    ]

    markers = list(re.finditer(r"---\s*(MARKDOWN|CODE)\s*CELL\s*---", content))

    if not markers:
        # No markers: code fences delimit code, everything else is markdown.
        last_end = 0
        for match in re.finditer(r"```python\s*(.*?)```", content, re.DOTALL):
            prose = content[last_end:match.start()].strip()
            if prose:
                cells.append(_nb_markdown_cell(prose))
            code_text = match.group(1).strip()
            if code_text:
                cells.append(_nb_code_cell(code_text))
            last_end = match.end()

        trailing = content[last_end:].strip()
        if trailing:
            cells.append(_nb_markdown_cell(trailing))
    else:
        for index, marker in enumerate(markers):
            # The cell ends where the next marker starts, or at end of text.
            if index + 1 < len(markers):
                cell_end = markers[index + 1].start()
            else:
                cell_end = len(content)
            cell_content = content[marker.end():cell_end].strip()

            if marker.group(1) == "MARKDOWN":
                cells.append(_nb_markdown_cell(cell_content))
            else:
                code_match = re.search(r"```python\s*(.*?)```", cell_content, re.DOTALL)
                if code_match:
                    cells.append(_nb_code_cell(code_match.group(1).strip()))

    return {
        "cells": cells,
        "metadata": {
            "kernelspec": {
                "display_name": "Python 3",
                "language": "python",
                "name": "python3",
            },
            "language_info": {
                "name": "python",
                "version": "3.8.0",
            },
        },
        "nbformat": 4,
        "nbformat_minor": 4,
    }