File size: 12,496 Bytes
e60fb94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
import re
import json
import functools

# Module-level memoized compiler.  BUG FIX: the old code applied
# @functools.lru_cache to a function defined *inside* get_cached_pattern(),
# which created a brand-new (empty) cache on every call — nothing was ever
# reused.  Hoisting the cached function to module level makes the cache
# actually persist across calls.
@functools.lru_cache(maxsize=32)
def _compile_pattern(pattern_str, pattern_flags):
    """Compile and memoize a regex pattern."""
    return re.compile(pattern_str, pattern_flags)

def get_cached_pattern(pattern, flags=0):
    """Return a compiled regex, cached across calls for better performance.

    Parameters:
        pattern: Regex source string.
        flags: ``re`` flag bits (default 0).

    Returns:
        re.Pattern: The compiled (and cached) pattern object.
    """
    return _compile_pattern(pattern, flags)

def extract_notebook_info(content):
    """Pull the notebook title and description out of the AI response.

    Looks for ``NOTEBOOK_NAME:`` / ``NOTEBOOK_DESCRIPTION:`` labels and
    strips any markdown emphasis from the captured values.  Falls back to
    generic defaults when a label is missing.
    """
    def _strip_emphasis(text):
        # Remove bold (**x**), italic (*x*) and underscore (_x_) wrappers.
        for pat in (r'\*\*(.*?)\*\*', r'\*(.*?)\*', r'_(.*?)_'):
            text = re.sub(pat, r'\1', text)
        return text

    # The lookaheads stop each capture at the next label, a --- divider,
    # or end of input (DOTALL lets the value span multiple lines).
    name_match = re.search(
        r"NOTEBOOK_NAME:?\s*(.+?)(?=\n\s*NOTEBOOK_DESCRIPTION|\n\s*---|\n\s*$|$)",
        content, re.DOTALL)
    desc_match = re.search(
        r"NOTEBOOK_DESCRIPTION:?\s*(.+?)(?=\n\s*---|\n\s*$|$)",
        content, re.DOTALL)

    name = name_match.group(1).strip() if name_match else "Generated Notebook"
    description = (desc_match.group(1).strip()
                   if desc_match else "Notebook generated using NoteGenie")

    return {
        "name": _strip_emphasis(name),
        "description": _strip_emphasis(description),
    }

def format_notebook(content):
    """Convert the AI text response into a Jupyter notebook (nbformat 4) dict.

    Cells are expected to be delimited with ``--- MARKDOWN CELL ---`` /
    ``--- CODE CELL ---`` markers, with code bodies inside ```python fences.
    Very large inputs, and any parsing failure, are routed through
    format_large_notebook() instead.

    Parameters:
        content: Raw text produced by the model.

    Returns:
        dict: Notebook structure ready for json.dump().
    """
    # Patterns are compiled once and reused across calls via the cache.
    markdown_pattern = get_cached_pattern(r"---\s*MARKDOWN\s*CELL\s*---\s*([\s\S]*?)(?=---\s*(?:MARKDOWN|CODE)\s*CELL\s*---|$)", re.DOTALL)
    code_pattern = get_cached_pattern(r"---\s*CODE\s*CELL\s*---\s*```python\s*([\s\S]*?)```", re.DOTALL)
    cell_marker_pattern = get_cached_pattern(r"---\s*(MARKDOWN|CODE)\s*CELL\s*---", re.DOTALL)

    # Quick scan to estimate size/complexity before doing the full parse.
    complexity = len(content) // 1000  # rough size in KB
    cell_count = len(cell_marker_pattern.findall(content))

    # Very large notebooks take the memory-efficient (but slower) path.
    if complexity > 200 or cell_count > 50:  # over ~200KB or 50 cells
        return format_large_notebook(content)

    try:
        # Extract cells from the content in a single pass if possible.
        markdown_cells = markdown_pattern.findall(content)
        code_cells = code_pattern.findall(content)

        if not markdown_cells and not code_cells:
            # The AI didn't use the expected marker format: split on code
            # fences and alternate markdown / code sections.
            sections = re.split(r"```python|```", content)
            cells = []

            for i, section in enumerate(sections):
                section = section.strip()
                if section and i % 2 == 0:
                    # Even-indexed sections are the text between fences.
                    cells.append({"cell_type": "markdown", "source": section})
                elif section:
                    # Odd-indexed sections are fenced code.
                    cells.append({"cell_type": "code", "source": section})
        else:
            # Interleave markdown and code cells in their original order.
            cells = []

            # Marker positions give the overall ordering of cells.
            all_matches = list(cell_marker_pattern.finditer(content))
            all_types = [m.group(1) for m in all_matches]

            md_idx = 0
            code_idx = 0

            for i, cell_type in enumerate(all_types):
                marker = all_matches[i]
                marker_end = marker.end()
                next_marker_start = all_matches[i+1].start() if i+1 < len(all_matches) else len(content)
                cell_content = content[marker_end:next_marker_start].strip()

                if cell_type == "MARKDOWN":
                    if md_idx < len(markdown_cells) or (i == len(all_types) - 1 and cell_content):
                        if md_idx < len(markdown_cells):
                            cell_source = markdown_cells[md_idx].strip()
                            md_idx += 1
                        else:
                            # Last markdown cell wasn't captured by the
                            # pattern; use the raw span after the marker.
                            cell_source = cell_content

                        cells.append({
                            "cell_type": "markdown",
                            "source": cell_source
                        })
                elif cell_type == "CODE":
                    if code_idx < len(code_cells) or (i == len(all_types) - 1 and "```python" in cell_content):
                        if code_idx < len(code_cells):
                            cell_source = code_cells[code_idx].strip()
                            code_idx += 1
                        else:
                            # Last code cell wasn't captured by the pattern;
                            # pull the fenced body out of the raw span.
                            code_match = re.search(r"```python\s*([\s\S]*?)```", cell_content, re.DOTALL)
                            cell_source = code_match.group(1).strip() if code_match else ""

                        cells.append({
                            "cell_type": "code",
                            "source": cell_source
                        })

        # Guarantee at least a title cell if nothing was extracted.
        if not cells:
            notebook_info = extract_notebook_info(content)
            cells.append({
                "cell_type": "markdown",
                "source": f"# {notebook_info['name']}\n\n{notebook_info['description']}"
            })

            # Salvage any fenced code blocks that might still be present.
            code_blocks = re.findall(r"```python\s*(.*?)```", content, re.DOTALL)
            for block in code_blocks:
                cells.append({
                    "cell_type": "code",
                    "source": block.strip()
                })

        # Shape cells into the Jupyter notebook structure.
        formatted_cells = []
        for cell in cells:
            cell_source = cell["source"]
            if isinstance(cell_source, str):
                # BUG FIX: cells over 10KB used to go through str.splitlines()
                # while smaller ones used str.split("\n").  The two are not
                # equivalent (splitlines drops a trailing empty line and also
                # splits on \r, \x0b, \x0c, ...), so the output silently
                # depended on cell size; the append loop also saved no memory.
                # Split uniformly instead.
                source_lines = cell_source.split("\n")
            else:
                # Already a list of lines — use as-is.
                source_lines = cell_source

            formatted_cell = {
                "cell_type": cell["cell_type"],
                "metadata": {},
                "source": source_lines
            }

            # Code cells additionally carry execution-state fields.
            if cell["cell_type"] == "code":
                formatted_cell["execution_count"] = None
                formatted_cell["outputs"] = []

            formatted_cells.append(formatted_cell)

        # Assemble the top-level notebook document.
        notebook = {
            "cells": formatted_cells,
            "metadata": {
                "kernelspec": {
                    "display_name": "Python 3",
                    "language": "python",
                    "name": "python3"
                },
                "language_info": {
                    "name": "python",
                    "version": "3.8.0"
                }
            },
            "nbformat": 4,
            "nbformat_minor": 4
        }

        return notebook
    except Exception as e:
        # Deliberate broad catch: any parse failure falls back to the more
        # robust (but slower) large-notebook path rather than crashing.
        print(f"Error in standard format_notebook: {e}. Using fallback method.")
        return format_large_notebook(content)

def format_large_notebook(content):
    """Memory-efficient formatter for very large notebooks.

    Walks the content span by span rather than collecting all cell bodies
    up front, so it stays cheap on big inputs.  Always begins with a title
    cell built from the extracted notebook metadata.
    """
    info = extract_notebook_info(content)

    # Seed the notebook with a title/description markdown cell.
    cells = [{
        "cell_type": "markdown",
        "metadata": {},
        "source": [f"# {info['name']}", "", info['description']]
    }]

    def _markdown_cell(text):
        # Build a markdown cell from raw text.
        return {
            "cell_type": "markdown",
            "metadata": {},
            "source": text.split("\n")
        }

    def _code_cell(text):
        # Build an unexecuted code cell from raw source text.
        return {
            "cell_type": "code",
            "metadata": {},
            "source": text.split("\n"),
            "execution_count": None,
            "outputs": []
        }

    # Locate every cell marker together with its type and span.
    markers = [
        (m.start(), m.end(), m.group(1))
        for m in re.finditer(r"---\s*(MARKDOWN|CODE)\s*CELL\s*---", content)
    ]

    if markers:
        # Marker format: classify each marker-to-marker span by its type.
        for idx, (_, body_start, kind) in enumerate(markers):
            body_end = markers[idx + 1][0] if idx + 1 < len(markers) else len(content)
            body = content[body_start:body_end].strip()

            if kind == "MARKDOWN":
                cells.append(_markdown_cell(body))
            elif kind == "CODE":
                # Only the fenced ```python body counts as code.
                fenced = re.search(r"```python\s*(.*?)```", body, re.DOTALL)
                if fenced:
                    cells.append(_code_cell(fenced.group(1).strip()))
    else:
        # No markers: fenced python blocks become code cells and everything
        # between them becomes markdown.
        cursor = 0
        for fenced in re.finditer(r"```python\s*(.*?)```", content, re.DOTALL):
            leading = content[cursor:fenced.start()].strip()
            if leading:
                cells.append(_markdown_cell(leading))

            snippet = fenced.group(1).strip()
            if snippet:
                cells.append(_code_cell(snippet))

            cursor = fenced.end()

        # Any text after the last fence is trailing markdown.
        trailing = content[cursor:].strip()
        if trailing:
            cells.append(_markdown_cell(trailing))

    # Wrap the cells in the standard nbformat-4 envelope.
    return {
        "cells": cells,
        "metadata": {
            "kernelspec": {
                "display_name": "Python 3",
                "language": "python",
                "name": "python3"
            },
            "language_info": {
                "name": "python",
                "version": "3.8.0"
            }
        },
        "nbformat": 4,
        "nbformat_minor": 4
    }