h2i committed on
Commit
7f5c744
·
verified ·
1 Parent(s): 9678c70

Upload 13 files

Browse files
src/v2/.env ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ OPENAI_API_KEY=gl-U2FsdGVkX186Ze7PMRRd2oHk9V9gAmDv+vMuS3vneQ544WvS4bFhUA7Jfnj+/CYU
2
+ OPENAI_API_BASE=https://aibe.mygreatlearning.com/openai/v1
src/v2/assets.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ logo = "https://mma.prnewswire.com/media/1458111/Great_Learning_Logo.jpg?p=facebook"
2
+ icon = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAMAAABF0y+mAAAAY1BMVEVHcEz////////////////////////////////////////////1+P3v9PuwyetRjdkAac/S3fEyftUNcdJsnN2wvuLd5/aDodhFY7oAJqQAGKJ2ickDM6hGiNe+yecSU7yerNm6H2eSAAAADHRSTlMANIK95v9f4h7TQ0I4MwE6AAAA+ElEQVR4AWSRBQKEMAwEscWlcrj+/5WX0uCDM5WIc+J6fgAEvuc6b8IIJ1H4UHGCB0l8uRQf0sNlYPJiJweR8ZqsykpYJAx2ZbufVELviJL3NS6000gJ9SNqMCbmaJ8ntKgaPIgodxiUFi3euI7Hi1b44Dk+iEqL5it9JzD5Ca1gKGqDZBk4+z+hfzB0PTGMYB5yHMhN0yV52dk86mVZ6ktGNqD5Cqi4pG9Tqe26VvY1LJ4tQq71WrO84nG5fMs69W2+yw5MdBZ+66d+GjciBxNeLVtITpSkBJPcm51vK7kFB/F/aMmEV1AQLseNL4HhS5qEEzXe7AAAqbAXH67DSm4AAAAASUVORK5CYII="
src/v2/colab_handler.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import json
3
+ import os
4
+ from pathlib import Path
5
+ from typing import List, Dict, Any
6
+
7
class ColabNotebookProcessor:
    """Rewrite Google Colab specific notebook code so it can run locally.

    The processor scans ``notebook_dir`` for dataset files and then rewrites
    the code cells of a notebook: Colab imports, ``drive.mount`` and
    ``files.upload`` calls are neutralised, ``/content/...`` paths are turned
    into relative paths, and ``pd.read_csv`` / ``pd.read_excel`` arguments are
    pointed at a dataset found in ``notebook_dir``.
    """

    # Lower-cased file suffixes that are treated as datasets.
    _DATASET_EXTENSIONS = frozenset({'.csv', '.xlsx', '.xls', '.json', '.txt', '.parquet'})

    # Generic names that are aliased to the first available CSV file.
    _GENERIC_CSV_ALIASES = ('data.csv', 'dataset.csv', 'train.csv', 'test.csv')

    # Substrings that identify an import of the Colab package.
    _COLAB_IMPORT_MARKERS = ('from google.colab', 'import google.colab')

    _DRIVE_MOUNT_RE = re.compile(r'drive\.mount\s*\(')
    _FILES_UPLOAD_RE = re.compile(r'files\.upload\s*\(')

    # A Colab path prefix, longest alternative first.  The look-behind only
    # accepts the prefix at the start of a path (start of line, or after
    # whitespace, a quote, "(", "=", "," or "["), so URLs that merely contain
    # "/content/" are left untouched.
    _COLAB_PATH_RE = re.compile(
        r"""(?<![^\s'"(=,\[])"""
        r"/content/(?:drive/(?:My Drive/|MyDrive/)?)?"
    )

    def __init__(self, notebook_dir: str = "/tmp/Notebook"):
        """Index the dataset files found in ``notebook_dir``.

        Args:
            notebook_dir: Directory holding the notebook and its datasets.
        """
        self.notebook_dir = Path(notebook_dir)
        self.dataset_files = self._get_available_datasets()
        self.dataset_mapping = self._create_dataset_mapping()

    def _get_available_datasets(self) -> List[str]:
        """Return the dataset file names in the notebook directory, sorted.

        Sorting makes every "first available dataset" fallback deterministic
        instead of depending on directory iteration order.  A missing
        directory yields an empty list.
        """
        if not self.notebook_dir.is_dir():
            return []

        return sorted(
            entry.name
            for entry in self.notebook_dir.iterdir()
            if entry.suffix.lower() in self._DATASET_EXTENSIONS and entry.is_file()
        )

    def _create_dataset_mapping(self) -> Dict[str, str]:
        """Map common ways of referring to a dataset onto real file names.

        Exact file names always map to themselves.  Stems, lower-cased
        variants and the generic CSV aliases are only added when the key is
        still free, so an alias can never shadow a file that really exists.
        """
        mapping: Dict[str, str] = {}

        # Exact names first so nothing below can override them.
        for filename in self.dataset_files:
            mapping[filename] = filename

        for filename in self.dataset_files:
            stem = Path(filename).stem
            for key in (stem, filename.lower(), stem.lower()):
                mapping.setdefault(key, filename)

        first_csv = next(
            (name for name in self.dataset_files if name.lower().endswith('.csv')),
            None,
        )
        if first_csv is not None:
            for alias in self._GENERIC_CSV_ALIASES:
                mapping.setdefault(alias, first_csv)

        return mapping

    def process_notebook(self, notebook_path: str) -> str:
        """Rewrite the code cells of a notebook and save a modified copy.

        Args:
            notebook_path: Path of the notebook (JSON) to process.

        Returns:
            Path of the written copy, ``modified_<name>`` next to the input.
        """
        source_path = Path(notebook_path)
        with open(source_path, 'r', encoding='utf-8') as f:
            notebook = json.load(f)

        for cell in notebook.get('cells', []):
            if cell.get('cell_type') == 'code':
                cell['source'] = self._process_code_cell(cell.get('source', []))

        modified_path = str(source_path.parent / f"modified_{source_path.name}")
        with open(modified_path, 'w', encoding='utf-8') as f:
            json.dump(notebook, f, indent=2)

        return modified_path

    def _process_code_cell(self, source_lines: List[str]) -> List[str]:
        """Rewrite one code cell and return its new list of source lines.

        ``source_lines`` may also be a single string, which is split into
        lines (keeping line endings) first.
        """
        if isinstance(source_lines, str):
            source_lines = source_lines.splitlines(True)

        processed_lines = []
        for line in source_lines:
            if self._is_colab_drive_mount(line):
                processed_lines.append(self._comment_block(
                    line, ["Google Drive mount replaced with local file access"]))
            elif self._is_colab_files_upload(line):
                processed_lines.append(self._replace_file_upload(line))
            else:
                processed_lines.append(self._process_line(line))

        return processed_lines

    @staticmethod
    def _indent_of(line: str) -> str:
        """Return the leading spaces/tabs of ``line``."""
        return line[:len(line) - len(line.lstrip(' \t'))]

    def _comment_block(self, line: str, comments: List[str]) -> str:
        """Build the comment text that replaces a removed statement.

        The comments keep the indentation of ``line``.  When the statement was
        indented a ``pass`` is added, because the statement may have been the
        only one in its block and comments alone would leave the block empty.
        """
        indent = self._indent_of(line)
        block = "".join(f"{indent}# {text}\n" for text in comments)
        if indent:
            block += f"{indent}pass\n"
        return block

    def _process_line(self, line: str) -> str:
        """Apply the single-line rewrites to ``line`` and return the result."""
        if self._is_colab_import(line):
            stripped = line.strip()
            if stripped.startswith('#'):
                return line  # already a comment
            indent = self._indent_of(line)
            if not indent:
                return f"# {line}"
            # Indented import: keep the enclosing block syntactically valid.
            newline = "\n" if line.endswith("\n") else ""
            return f"{indent}pass  # {stripped}{newline}"

        line = self._replace_drive_paths(line)
        line = self._replace_file_operations(line)
        line = self._replace_uploaded_files(line)
        return line

    def _is_colab_import(self, line: str) -> bool:
        """Return True if the line imports anything from ``google.colab``."""
        line_stripped = line.strip()
        return any(marker in line_stripped for marker in self._COLAB_IMPORT_MARKERS)

    def _is_colab_drive_mount(self, line: str) -> bool:
        """Return True if the line contains a ``drive.mount(`` call."""
        return self._DRIVE_MOUNT_RE.search(line) is not None

    def _is_colab_files_upload(self, line: str) -> bool:
        """Return True if the line contains a ``files.upload(`` call."""
        return self._FILES_UPLOAD_RE.search(line) is not None

    def _replace_drive_paths(self, line: str) -> str:
        """Replace ``/content/...`` path prefixes with ``./``.

        Handles ``/content/drive/My Drive/``, ``/content/drive/MyDrive/``,
        ``/content/drive/`` and ``/content/``.  Only prefixes that start a
        path are rewritten, so a URL such as ``https://host/content/x.csv``
        is returned unchanged.
        """
        return self._COLAB_PATH_RE.sub('./', line)

    def _replace_file_operations(self, line: str) -> str:
        """Point pandas CSV/Excel reads on this line at a local dataset."""
        if 'pd.read_csv(' in line:
            line = self._replace_pandas_read(line, 'csv')
        elif 'pd.read_excel(' in line:
            line = self._replace_pandas_read(line, 'excel')

        return line

    def _replace_pandas_read(self, line: str, file_type: str) -> str:
        """Replace the path literal passed to a pandas read call.

        Only the string literal that is the first argument of the read call
        is rewritten; other literals on the line (column names, ``sep=','``)
        are left alone, and a call whose first argument is not a literal is
        returned unchanged.

        Args:
            line: Source line containing the read call.
            file_type: ``'csv'`` or ``'excel'``; any other value matches any
                ``pd.read_*`` call.
        """
        reader = {'csv': 'read_csv', 'excel': 'read_excel'}.get(file_type, r'read_\w+')
        pattern = r'pd\.' + reader + r'\(\s*[rRbBfFuU]{0,2}(["\'])([^"\']+)\1'
        match = re.search(pattern, line)
        if match is None:
            return line

        local_file = self._find_best_dataset_match(match.group(2), file_type)
        if not local_file:
            return line

        start, end = match.span(2)
        return line[:start] + local_file + line[end:]

    def _replace_uploaded_files(self, line: str) -> str:
        """Replace a line that indexes ``uploaded[...]`` with explanatory comments.

        The comments name the first available dataset and quote the original
        statement.  Lines are returned unchanged when no dataset is available.
        """
        if 'uploaded[' in line and self.dataset_files:
            first = self.dataset_files[0]
            # Capture the statement before building the replacement text.
            original = line.strip()
            return self._comment_block(line, [
                f"Uploaded file replaced with local dataset: {first}",
                f"Original: {original}",
                f"Use: '{first}' instead",
            ])

        return line

    def _replace_file_upload(self, line: str) -> str:
        """Return comments that replace a ``files.upload()`` statement."""
        if self.dataset_files:
            detail = f"Available datasets: {', '.join(self.dataset_files)}"
        else:
            detail = "No datasets found in directory"
        return self._comment_block(
            line, ["File upload replaced with local datasets", detail])

    def _find_dataset_match(self, quoted_path: str) -> str:
        """Resolve a quoted path to a double-quoted local dataset name.

        Not called by the line-rewriting methods of this class; kept as a
        helper for resolving a quoted path explicitly.  Tries, in order: exact
        file name, the alias mapping, a case-insensitive substring match, and
        finally the first available dataset.  Returns ``quoted_path``
        unchanged when there are no datasets.
        """
        path = quoted_path.strip('\'"')
        filename = os.path.basename(path)

        if filename in self.dataset_files:
            return f'"{filename}"'

        if filename in self.dataset_mapping:
            return f'"{self.dataset_mapping[filename]}"'

        wanted = filename.lower()
        for dataset in self.dataset_files:
            candidate = dataset.lower()
            if wanted in candidate or candidate in wanted:
                return f'"{dataset}"'

        if self.dataset_files:
            return f'"{self.dataset_files[0]}"'

        return quoted_path

    def _find_best_dataset_match(self, original_path: str, file_type: str) -> str:
        """Pick the local dataset that best matches ``original_path``.

        Candidates are limited to the requested ``file_type`` (``'csv'`` or
        ``'excel'``; anything else means all datasets).  Tries an exact file
        name, then an equal stem, then the first candidate of the right type,
        then the first dataset of any type.  With no datasets at all the bare
        file name of ``original_path`` is returned.
        """
        filename = os.path.basename(original_path)

        if file_type == 'csv':
            type_filtered = [f for f in self.dataset_files if f.lower().endswith('.csv')]
        elif file_type == 'excel':
            type_filtered = [f for f in self.dataset_files
                             if f.lower().endswith(('.xlsx', '.xls'))]
        else:
            type_filtered = self.dataset_files

        if filename in type_filtered:
            return filename

        name_without_ext = os.path.splitext(filename)[0]
        for dataset in type_filtered:
            if os.path.splitext(dataset)[0] == name_without_ext:
                return dataset

        if type_filtered:
            return type_filtered[0]

        if self.dataset_files:
            return self.dataset_files[0]

        return filename
src/v2/fact_prompt.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ prompt_fact = """
2
+ You are an expert reviewer specialized in verifying factual accuracy in Jupyter notebooks (machine learning and deep learning case studies).
3
+ You will be provided with a list of notebook cells.
4
+
5
+ Your task is to identify **only factual inconsistencies** in the text.
6
+
7
+ Important Rules:
8
+
9
+ 1. Code vs Markdown
10
+ - If the content is Python code, ignore it completely (do not analyze).
11
+ - Only review markdown/descriptive text.
12
+
13
+ 2. What counts as a factual error
14
+ - Incorrect explanations of functions, algorithms, or methods.
15
+ Examples:
16
+ * "np.mean() computes the median." → Incorrect (it computes the mean).
17
+ * "Logistic regression is used for regression tasks." → Incorrect (it is for classification).
18
+ * "ReLU outputs negative values unchanged." → Incorrect (it zeroes them).
19
+ - Wrong descriptions of standard ML/DL concepts or libraries.
20
+
21
+ 3. What does NOT count as a factual error
22
+ - Dataset-specific observations tied to EDA or plots.
23
+ Examples:
24
+ * "The plot shows a rising trend."
25
+ * "Most customers are between 20–30 years old."
26
+ * "Attrition is our target variable with 84% of records being 'No'."
27
+ - Subjective phrasing or stylistic choices.
28
+ - Grammar, punctuation, or clarity issues (ignore them here).
29
+
30
+ 4. Output rules
31
+ - Extract only the exact text fragment(s) that are factually incorrect.
32
+ - Provide the corrected version with the right fact.
33
+ - If no factual errors exist, return an empty JSON.
34
+
35
+ 5. Output format
36
+ - Return only a JSON object following this Pydantic model:
37
+
38
+ ```python
39
+ from typing import List
40
+ from pydantic import BaseModel, Field
41
+
42
+ class LLMFactualCheckOutput(BaseModel):
43
+ text: List[str] = Field(
44
+ ...,
45
+ description="Exact text fragments from the notebook that contain factual errors."
46
+ )
47
+ corrected_text: List[str] = Field(
48
+ ...,
49
+ description="Corrected factual statements aligned with `text`"
50
+ )
51
+ """
src/v2/grammar_chain.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from notebook_parser import NotebookParser
3
+ from grammar_prompt import prompt
4
+ from fact_prompt import prompt_fact
5
+ from langchain_core.runnables import RunnableLambda,RunnableParallel
6
+ from langchain_openai import ChatOpenAI
7
+ from output_schema import LLMCorrectionOutput,LLMFactualCheckOutput
8
+ import pandas as pd
9
+
10
+ from dotenv import load_dotenv
11
+ load_dotenv()
12
+
13
def grammar_pipeline():
    """Build the runnable chain that reviews a notebook for grammar and style.

    The returned chain takes a notebook path, extracts its code and markdown
    cells with ``NotebookParser``, sends them together with the grammar
    ``prompt`` to the chat model, and returns a dict with the keys
    ``'Is Grammar Error?'``, ``'Grammar_Text'`` and ``'Grammar_Suggestions'``.
    """
    # Chat model configured from the environment loaded at import time.
    llm = ChatOpenAI(
        model='gpt-4o-mini',
        temperature=0,
        api_key=os.getenv('OPENAI_API_KEY'),
        base_url=os.getenv('OPENAI_API_BASE'),
    )

    # Step 1: notebook path -> list of cell content parts.
    parse_notebook = RunnableLambda(
        lambda path: NotebookParser(notebook_path=path).extract(code=True, markdown=True)
    )

    # Step 2: cell parts -> a single user message (prompt, lead-in, cells).
    prepare_message = RunnableLambda(
        lambda cells: {
            "role": "user",
            "content": [{"type": "text", "text": prompt}]
            + [{"type": "text", "text": "The list of cells are : "}]
            + cells,
        },
    )

    # Step 3: message -> structured model output.
    invoke_llm = RunnableLambda(
        lambda message: llm.with_structured_output(LLMCorrectionOutput).invoke([message])
    )

    # Step 4: structured output -> plain dict of report columns.
    extract_suggestions = RunnableLambda(
        lambda result: {
            'Is Grammar Error?': result.is_grammar_error,
            'Grammar_Text': result.text,
            'Grammar_Suggestions': result.corrected_text,
        }
    )

    return parse_notebook | prepare_message | invoke_llm | extract_suggestions
42
+
43
def fact_pipeline():
    """Build the runnable chain that checks a notebook for factual errors.

    The returned chain takes a notebook path, extracts its code and markdown
    cells with ``NotebookParser``, sends them together with ``prompt_fact`` to
    the chat model, and returns a dict with the keys ``'Fact_Text'`` and
    ``'Fact_Suggestions'``.
    """
    # Chat model configured from the environment loaded at import time.
    llm = ChatOpenAI(
        model='gpt-4o-mini',
        temperature=0,
        api_key=os.getenv('OPENAI_API_KEY'),
        base_url=os.getenv('OPENAI_API_BASE'),
    )

    # Step 1: notebook path -> list of cell content parts.
    parse_notebook = RunnableLambda(
        lambda path: NotebookParser(notebook_path=path).extract(code=True, markdown=True)
    )

    # Step 2: cell parts -> a single user message (prompt, lead-in, cells).
    prepare_message = RunnableLambda(
        lambda cells: {
            "role": "user",
            "content": [{"type": "text", "text": prompt_fact}]
            + [{"type": "text", "text": "The list of cells are : "}]
            + cells,
        },
    )

    # Step 3: message -> structured model output.
    invoke_llm = RunnableLambda(
        lambda message: llm.with_structured_output(LLMFactualCheckOutput).invoke([message])
    )

    # Step 4: structured output -> plain dict of report columns.
    extract_suggestions = RunnableLambda(
        lambda result: {
            'Fact_Text': result.text,
            'Fact_Suggestions': result.corrected_text,
        }
    )

    return parse_notebook | prepare_message | invoke_llm | extract_suggestions
src/v2/grammar_exec.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from grammar_chain import grammar_pipeline,fact_pipeline
2
+ from utilities import safe_concurrent_batch
3
+ import pandas as pd
4
+ import os
5
+
6
def execute(path):
    """Run the grammar and fact pipelines over the notebooks in ``path``.

    Args:
        path: Directory that is searched (non-recursively) for ``.ipynb`` files.

    Returns:
        A DataFrame with the grammar columns and the fact columns side by
        side, built from the first batch result of each pipeline.  A DataFrame
        with the single column ``'Unable to Process'`` is returned when there
        is no notebook, a pipeline did not report ``"success"``, or the
        pipeline output cannot be laid out as a table.
    """
    unable = pd.DataFrame(columns=['Unable to Process'])

    # Sorted so that "the first notebook" does not depend on listing order.
    notebooks = [os.path.join(path, f) for f in sorted(os.listdir(path)) if f.endswith(".ipynb")]
    if not notebooks:
        # Nothing to run; avoids indexing into empty batch results below.
        return unable

    grammar_results = safe_concurrent_batch(grammar_pipeline(), notebooks, max_workers=1)
    fact_results = safe_concurrent_batch(fact_pipeline(), notebooks, max_workers=1)
    if not grammar_results or not fact_results:
        return unable

    # NOTE(review): only the first result of each batch is reported even when
    # several notebooks were processed - confirm this is intended.
    grammar = grammar_results[0]
    fact = fact_results[0]

    if grammar["status"] == "success" and fact['status'] == 'success':
        try:
            grammar_df = pd.DataFrame(grammar["output"])
            fact_df = pd.DataFrame(fact["output"])
        except ValueError:
            # pandas rejects columns of unequal length, which happens when the
            # model returns misaligned lists.
            return unable
        return pd.concat([grammar_df, fact_df], axis=1)

    return unable
src/v2/grammar_prompt.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ prompt = """
2
+ You are an expert editor specialized in reviewing Jupyter notebooks.
3
+ You will be provided with a list of notebook cells.
4
+
5
+ Your task is to analyze each cell for:
6
+ 1. Grammar corrections
7
+ 2. Stylistic improvements
8
+
9
+ Important Rules:
10
+
11
+ 1. Detect code vs markdown/descriptive text
12
+ - If the cell contains programming syntax such as `import`, variable assignments (`=`), function definitions (`def`), loops (`for`, `while`), conditional statements (`if`, `else`), or other common Python code patterns, treat it as code.
13
+ - Otherwise, treat it as markdown/descriptive text.
14
+
15
+ 2. For markdown/descriptive text
16
+ - Identify grammatical mistakes, punctuation errors, capitalization issues, spelling mistakes, and any problems with sentence structure or word choice.
17
+ - Check for clarity, conciseness, and readability while ensuring the tone and style remain consistent.
18
+ - Extract only the exact text fragment(s) that contain errors (do not include the entire cell if only a part is incorrect).
19
+ - Return the corrected version while preserving the original meaning and any markdown formatting (headings, bullet points, numbered lists, tables, links, HTML).
20
+
21
+ 3. For code cells
22
+ - Only check grammar in comments (lines starting with `#`).
23
+ - Do not check code syntax, logic, or variable names.
24
+ - Extract only the incorrect part of the comment (not the entire line unless fully incorrect).
25
+
26
+ 4. Strict inclusion rule
27
+ - Only include fragments that actually contain issues.
28
+ - Do NOT include fragments that are already correct.
29
+ - If no corrections are needed, return an empty JSON with all fields appropriately empty or `None`.
30
+
31
+ 5. Classification of corrections
32
+ - This is related to the boolean field is_grammar_error:
33
+ - True if the issue is a genuine grammatical, punctuation, capitalization, or spelling error.
34
+ - False if the issue is only a stylistic improvement (clarity, conciseness, readability, word choice).
35
+
36
+ 6. Output Format
37
+ - Return only a JSON object strictly following this Pydantic model:
38
+
39
+ ```python
40
+ from typing import List, Optional, Union
41
+ from pydantic import BaseModel, Field
42
+
43
+ class LLMCorrectionOutput(BaseModel):
44
+ text: List[str] = Field(
45
+ ...,
46
+ description="A list of exact text fragments from the Jupyter notebook cells where corrections need to be applied. Each fragment must be minimal and only include the part with issues."
47
+ )
48
+ corrected_text: List[str] = Field(
49
+ ...,
50
+ description="A list of corrected text fragments, aligned by index with `text`. Each entry must contain only the corrected version."
51
+ )
52
+ is_grammar_error: List[bool] = Field(
53
+ ...,
54
+ description="A list of booleans aligned by index with `text`. True if the issue is a grammatical/punctuation/capitalization/spelling error, False if it is a stylistic enhancement."
55
+ )
56
+ """
src/v2/notebook.py ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os,sys,shutil,subprocess,json,re,ast,tempfile,nbformat,logging,argparse
2
+ import pandas as pd
3
+ from pathlib import Path
4
+ from colab_handler import ColabNotebookProcessor
5
+
6
+ # -------- Logging --------
7
+ logging.basicConfig(
8
+ level=logging.INFO,
9
+ format='[%(asctime)s] %(levelname)s: %(message)s',
10
+ handlers=[logging.StreamHandler(sys.stdout)]
11
+ )
12
+ logger = logging.getLogger(__name__)
13
+
14
+ # -------- Executor Class --------
15
+ class NotebookExecutor:
16
def __init__(self, working_dir: str | None = None, verbose: bool = False, output_csv: str = "/tmp/Notebook/notebook_execution_report.csv"):
    """Store the executor settings and create the Colab processor.

    Args:
        working_dir: Directory to work in; ``/tmp/Notebook`` when falsy.
        verbose: Stored on the instance as ``self.verbose``.
        output_csv: Stored on the instance as ``self.output_csv``.
    """
    base_dir = working_dir if working_dir else "/tmp/Notebook"
    self.working_dir = Path(base_dir)
    self.verbose = verbose
    self.results = []
    self.output_csv = output_csv
    # The processor scans the same directory for dataset files.
    self.colab_processor = ColabNotebookProcessor(str(self.working_dir))
22
+
23
def setup_working_dir(self):
    """Ensure the working directory exists, creating parents as needed."""
    target = self.working_dir
    target.mkdir(parents=True, exist_ok=True)
    if not self.verbose:
        return
    logger.info(f"Working directory: {self.working_dir}")
28
+
29
def clean_working_dir(self):
    """Delete the working directory tree if it exists.

    Any failure is logged as a warning instead of being raised.
    """
    target = self.working_dir
    try:
        if not target.exists():
            return
        shutil.rmtree(target)
        if self.verbose:
            logger.info(f"Cleaned working directory: {self.working_dir}")
    except Exception as e:
        logger.warning(f"Unable to fully clean working dir: {e}")
38
+
39
+ def _read_notebook_json(self, notebook_path: Path):
40
+ try:
41
+ with open(str(notebook_path), "r", encoding="utf-8") as f:
42
+ return json.load(f)
43
+ except Exception as e:
44
+ raise RuntimeError(f"Failed to read notebook JSON: {e}")
45
+
46
def list_available_datasets(self):
    """Return the names of the dataset files in the working directory.

    A file counts as a dataset when its lower-cased suffix is one of
    ``.csv``, ``.xlsx``, ``.xls``, ``.json``, ``.txt`` or ``.parquet``.
    Names are returned sorted; a missing working directory yields an empty
    list instead of raising.
    """
    dataset_extensions = {'.csv', '.xlsx', '.xls', '.json', '.txt', '.parquet'}

    if not self.working_dir.is_dir():
        return []

    datasets = sorted(
        entry.name
        for entry in self.working_dir.iterdir()
        if entry.suffix.lower() in dataset_extensions and entry.is_file()
    )

    if self.verbose and datasets:
        logger.info(f"Available datasets: {', '.join(datasets)}")

    return datasets
56
+
57
def extract_imports_from_notebook(self, notebook_path: Path):
    """Return the top-level names of third-party modules a notebook imports.

    Code cells are parsed with ``ast`` (best-effort).  A cell that does not
    parse as-is is retried with IPython magic / shell lines (those starting
    with ``%`` or ``!``) replaced by ``pass``, so one ``!pip install`` line
    no longer hides every import in its cell.  Cells that still fail to
    parse are skipped.  Standard-library modules, relative imports and the
    ``google`` / ``colab`` names are excluded from the result.

    Args:
        notebook_path: Path of the notebook to inspect.

    Returns:
        A set of module names; empty if the notebook cannot be read.
    """
    try:
        nb_json = self._read_notebook_json(notebook_path)
    except Exception:
        return set()

    imports = set()
    for cell in nb_json.get("cells", []):
        if cell.get("cell_type") != "code":
            continue
        source = cell.get("source", "")
        if isinstance(source, list):
            # Stored lines normally carry their own newline; add one only
            # where missing so backslash continuations are not broken by
            # doubled line breaks.
            source = "".join(
                line if line.endswith("\n") else line + "\n" for line in source
            )

        try:
            tree = ast.parse(source)
        except SyntaxError:
            neutralised = []
            for line in source.splitlines():
                stripped = line.lstrip()
                if stripped.startswith(("%", "!")):
                    # Keep the indentation so enclosing blocks stay valid.
                    neutralised.append(line[:len(line) - len(stripped)] + "pass")
                else:
                    neutralised.append(line)
            try:
                tree = ast.parse("\n".join(neutralised))
            except Exception:
                continue
        except Exception:
            # ignore other parsing errors in individual cells
            continue

        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                for alias in node.names:
                    imports.add(alias.name.split(".")[0])
            elif isinstance(node, ast.ImportFrom):
                # level > 0 is a relative import, never a pip package.
                if node.module and node.level == 0:
                    imports.add(node.module.split(".")[0])

    # Standard-library modules: the interpreter's own list when available
    # (Python 3.10+), plus a fallback list for older interpreters.
    stdlib = set(getattr(sys, "stdlib_module_names", ())) | {
        'os','sys','json','re','math','random','datetime','time','collections',
        'itertools','functools','operator','pathlib','urllib','http','xml','html',
        'csv','sqlite3','pickle','logging','unittest','argparse','configparser',
        'io','typing','warnings','copy','string','textwrap','unicodedata','struct',
        'codecs','calendar','hashlib','hmac','secrets','statistics','subprocess',
        'glob','shutil','tempfile','zipfile','tarfile','gzip','gc','abc','ast',
        'base64','dataclasses','enum','inspect','multiprocessing','threading',
        'concurrent','contextlib','platform','pprint','queue','traceback',
        'types','uuid','decimal','fractions','heapq','bisect','array',
    }

    # Google Colab specific imports are handled separately.
    colab_modules = {'google', 'colab'}

    return imports - stdlib - colab_modules
98
+
99
+ def install_packages(self, python_executable: Path, packages: set | list):
100
+ """Install packages into the environment (best-effort). Returns (success, stderr_text)"""
101
+ if not packages:
102
+ return True, ""
103
+
104
+ # map common names -> pip packages
105
+ package_mapping = {
106
+ 'sklearn': 'scikit-learn',
107
+ 'cv2': 'opencv-python',
108
+ 'PIL': 'Pillow',
109
+ 'bs4': 'beautifulsoup4',
110
+ }
111
+
112
+ mapped = [package_mapping.get(p, p) for p in packages]
113
+
114
+ # Install packages one-by-one so errors are isolated
115
+ for pkg in mapped:
116
+ try:
117
+ proc = subprocess.run([str(python_executable), "-m", "pip", "install", pkg],
118
+ capture_output=True, text=True, timeout=600)
119
+ if proc.returncode != 0:
120
+ stderr = proc.stderr or proc.stdout or f"pip install returned {proc.returncode}"
121
+ logger.warning(f"Failed to install {pkg}: {stderr.strip()[:400]}")
122
+ return False, stderr
123
+ except subprocess.TimeoutExpired:
124
+ msg = f"Timeout while installing {pkg}"
125
+ logger.warning(msg)
126
+ return False, msg
127
+ except Exception as e:
128
+ msg = f"Error while installing {pkg}: {e}"
129
+ logger.warning(msg)
130
+ return False, msg
131
+
132
+ return True, ""
133
+
134
def extract_notebook_error(self, stderr_text: str):
    """Reduce captured process output to a single descriptive line.

    Scanning from the end, the first line mentioning a traceback, error or
    exception wins; failing that, the last non-blank line is used.

    Returns:
        The chosen line with surrounding whitespace removed,
        ``"Unknown error occurred"`` for empty input, or ``"Unknown error"``
        when the input contains only whitespace.
    """
    if not stderr_text:
        return "Unknown error occurred"

    markers = ("Traceback", "Error", "Exception", "ModuleNotFoundError", "ImportError")
    lines = stderr_text.strip().splitlines()
    newest_first = lines[::-1]

    flagged = next(
        (entry.strip() for entry in newest_first if any(m in entry for m in markers)),
        None,
    )
    if flagged is not None:
        return flagged

    # No marker found: fall back to the last line that has any content.
    last_text = next((entry.strip() for entry in newest_first if entry.strip()), None)
    if last_text is not None:
        return last_text

    return lines[-1] if lines else "Unknown error"
148
+
149
def display_cell_execution_details(self, output_notebook_path: Path):
    """Log a short summary of the first code cells of an executed notebook.

    For up to 10 code cells this logs the first six source lines and, from
    the last two outputs of the cell, the tail of stream output and any
    error name/value.  If the notebook cannot be read, that is logged and
    the method returns.
    """
    try:
        nb = nbformat.read(str(output_notebook_path), as_version=4)
    except Exception as e:
        logger.info(f"Could not read output notebook for cell details: {e}")
        return

    logger.info("CELL-BY-CELL EXECUTION DETAILS (showing up to 10 code cells)")
    shown = 0
    for position, cell in enumerate(nb.cells, start=1):
        if cell.cell_type != "code":
            continue
        shown += 1
        logger.info(f"--- CELL {position} ---")

        source_lines = cell.source.splitlines()
        src_preview = "\n".join(source_lines[:6])
        if len(source_lines) > 6:
            src_preview += "\n..."
        logger.info("SOURCE (first lines):\n" + src_preview)

        if getattr(cell, "outputs", None):
            # Only the last two outputs of each cell are reported.
            for output in cell.outputs[-2:]:
                kind = output.output_type
                if kind == "stream":
                    text = getattr(output, "text", "")
                    logger.info("STREAM OUTPUT:\n" + ("\n".join(text.splitlines()[-4:])))
                elif kind == "error":
                    ename = getattr(output, "ename", "")
                    evalue = getattr(output, "evalue", "")
                    logger.info(f"ERROR: {ename}: {evalue}")

        if shown >= 10:
            break
177
+
178
+ def run_notebook(self, notebook_path: str | Path, timeout: int = 1800):
179
+ """
180
+ Run a single notebook with Colab code replacement and dataset support.
181
+ Returns a dict: {'notebook': <name>, 'status': 'Pass'|'Fail', 'error_message': <msg>}
182
+ """
183
+ try:
184
+ if isinstance(notebook_path, str):
185
+ if notebook_path.startswith('/'):
186
+ notebook_full_path = Path(notebook_path)
187
+ else:
188
+ notebook_full_path = Path('/tmp/Notebook') / notebook_path
189
+ else:
190
+ notebook_full_path = Path(notebook_path)
191
+ except Exception as e:
192
+ return {"notebook": str(notebook_path), "status": "Fail", "error_message": f"Invalid path: {e}"}
193
+
194
+ notebook_name = notebook_full_path.name
195
+ notebook_dir = Path('/tmp/Notebook')
196
+
197
+ # Check if notebook exists
198
+ if not notebook_full_path.exists():
199
+ return {"notebook": notebook_name, "status": "Fail", "error_message": f"Notebook not found at: {notebook_full_path}"}
200
+
201
+ # List available datasets
202
+ datasets = self.list_available_datasets()
203
+ if datasets:
204
+ logger.info(f"Processing notebook with {len(datasets)} available dataset(s)")
205
+
206
+ # Process notebook for Colab compatibility
207
+ try:
208
+ processed_notebook_path = self.colab_processor.process_notebook(str(notebook_full_path))
209
+ if self.verbose:
210
+ logger.info(f"Processed notebook for Colab compatibility: {processed_notebook_path}")
211
+ except Exception as e:
212
+ logger.warning(f"Failed to process Colab compatibility: {e}")
213
+ processed_notebook_path = str(notebook_full_path)
214
+
215
+ # create fresh venv in the notebook folder
216
+ env_path = notebook_dir / "venv"
217
+ if env_path.exists():
218
+ try:
219
+ shutil.rmtree(env_path)
220
+ except Exception:
221
+ pass
222
+
223
+ # create venv
224
+ try:
225
+ venv_proc = subprocess.run([sys.executable, "-m", "venv", str(env_path)], capture_output=True, text=True, timeout=120)
226
+ if venv_proc.returncode != 0:
227
+ stderr = venv_proc.stderr or venv_proc.stdout
228
+ return {"notebook": notebook_name, "status": "Fail",
229
+ "error_message": f"Failed to create venv: {stderr.strip()[:400]}"}
230
+ except subprocess.TimeoutExpired:
231
+ return {"notebook": notebook_name, "status": "Fail",
232
+ "error_message": "Timeout while creating virtual environment"}
233
+ except Exception as e:
234
+ return {"notebook": notebook_name, "status": "Fail",
235
+ "error_message": f"Error creating venv: {e}"}
236
+
237
+ # python executable inside venv
238
+ if os.name == "nt":
239
+ python_exec = env_path / "Scripts" / "python.exe"
240
+ else:
241
+ python_exec = env_path / "bin" / "python"
242
+
243
+ if not python_exec.exists():
244
+ return {"notebook": notebook_name, "status": "Fail",
245
+ "error_message": f"Python executable not found in venv: {python_exec}"}
246
+
247
+ # Upgrade pip and install pinned minimal tooling
248
+ try:
249
+ # Upgrade pip
250
+ up_proc = subprocess.run([str(python_exec), "-m", "pip", "install", "--upgrade", "pip"],
251
+ capture_output=True, text=True, timeout=120)
252
+ if up_proc.returncode != 0:
253
+ logger.warning("pip upgrade returned non-zero; continuing if possible")
254
+
255
+ # Install pinned papermill / ipykernel / jupyter (stable versions)
256
+ pinned = [
257
+ "papermill==2.5.0",
258
+ "ipykernel==6.29.5",
259
+ "jupyter==1.0.0"
260
+ ]
261
+ install_proc = subprocess.run([str(python_exec), "-m", "pip", "install"] + pinned,
262
+ capture_output=True, text=True, timeout=600)
263
+ if install_proc.returncode != 0:
264
+ stderr_text = install_proc.stderr or install_proc.stdout or "pip install returned non-zero"
265
+ return {"notebook": notebook_name, "status": "Fail",
266
+ "error_message": f"Failed to setup environment (pip install core packages): {stderr_text.strip()[:800]}"}
267
+ except subprocess.TimeoutExpired:
268
+ return {"notebook": notebook_name, "status": "Fail",
269
+ "error_message": "Timeout installing core packages"}
270
+ except Exception as e:
271
+ return {"notebook": notebook_name, "status": "Fail",
272
+ "error_message": f"Error installing core packages: {e}"}
273
+
274
+ # Install common data-science packages (helps many notebooks run without per-notebook pip)
275
+ common_packages = ["numpy", "pandas", "matplotlib", "seaborn", "scikit-learn", "plotly"]
276
+ try:
277
+ common_proc = subprocess.run([str(python_exec), "-m", "pip", "install"] + common_packages,
278
+ capture_output=True, text=True, timeout=600)
279
+ if common_proc.returncode != 0:
280
+ logger.warning("Installing common packages returned non-zero; continuing")
281
+ except Exception:
282
+ logger.warning("Unexpected error during common package install; continuing")
283
+
284
+ # Extract inferred imports and try to install them (best-effort)
285
+ # Use the original notebook for import detection, not the processed one
286
+ inferred = self.extract_imports_from_notebook(notebook_full_path)
287
+ if inferred:
288
+ success, stderr_text = self.install_packages(python_exec, inferred)
289
+ if not success:
290
+ return {"notebook": notebook_name, "status": "Fail",
291
+ "error_message": f"Failed to install inferred packages: {stderr_text.strip()[:800]}"}
292
+
293
+ # Create kernel name and install kernel
294
+ kernel_name = f"nb_{re.sub(r'[^A-Za-z0-9_]', '_', notebook_name)}"
295
+ try:
296
+ kernel_proc = subprocess.run([str(python_exec), "-m", "ipykernel", "install", "--user",
297
+ "--name", kernel_name, "--display-name", f"Python ({kernel_name})"],
298
+ capture_output=True, text=True, timeout=60)
299
+ if kernel_proc.returncode != 0:
300
+ stderr_text = kernel_proc.stderr or kernel_proc.stdout or "ipykernel install returned non-zero"
301
+ return {"notebook": notebook_name, "status": "Fail",
302
+ "error_message": f"Failed to install kernel: {stderr_text.strip()[:800]}"}
303
+ except subprocess.TimeoutExpired:
304
+ return {"notebook": notebook_name, "status": "Fail",
305
+ "error_message": "Timeout while installing kernel"}
306
+ except Exception as e:
307
+ return {"notebook": notebook_name, "status": "Fail",
308
+ "error_message": f"Error installing kernel: {e}"}
309
+
310
+ # Execute notebook with papermill (use the processed notebook)
311
+ output_path = notebook_dir / "output.ipynb"
312
+ try:
313
+ exec_proc = subprocess.run([str(python_exec), "-m", "papermill",
314
+ processed_notebook_path, str(output_path),
315
+ "--kernel", kernel_name, "--no-progress-bar"],
316
+ capture_output=True, text=True, timeout=timeout, cwd=str(notebook_dir))
317
+ if exec_proc.returncode == 0:
318
+ status = "Pass"
319
+ error_message = ""
320
+ if self.verbose:
321
+ logger.info("Notebook executed successfully")
322
+ if output_path.exists():
323
+ self.display_cell_execution_details(output_path)
324
+ else:
325
+ status = "Fail"
326
+ stderr_text = (exec_proc.stderr or "") + "\n" + (exec_proc.stdout or "")
327
+ concise = self.extract_notebook_error(stderr_text)
328
+ error_message = f"Execution failed: {concise}"
329
+ if self.verbose:
330
+ logger.error(error_message)
331
+ if output_path.exists():
332
+ self.display_cell_execution_details(output_path)
333
+ except subprocess.TimeoutExpired:
334
+ status = "Fail"
335
+ error_message = f"Execution timed out after {timeout} seconds"
336
+ except Exception as e:
337
+ status = "Fail"
338
+ error_message = f"Papermill execution error: {e}"
339
+
340
+ # cleanup processed notebook
341
+ try:
342
+ if processed_notebook_path != str(notebook_full_path) and os.path.exists(processed_notebook_path):
343
+ os.remove(processed_notebook_path)
344
+ except Exception:
345
+ pass
346
+
347
+ # cleanup venv if present (best-effort)
348
+ try:
349
+ if env_path.exists():
350
+ shutil.rmtree(env_path)
351
+ except Exception:
352
+ logger.info("Could not remove venv directory (non-fatal)")
353
+
354
+ result = {"notebook": notebook_name, "status": status, "error_message": error_message}
355
+ # store result
356
+ self.results.append(result)
357
+ # update CSV incrementally
358
+ self._update_csv_report()
359
+ return result
360
+
361
+ def _update_csv_report(self):
362
+ """Write incremental CSV with columns notebook,status,error_message"""
363
+ try:
364
+ df = pd.DataFrame(self.results)
365
+ # Ensure consistent column ordering
366
+ cols = ['notebook', 'status', 'error_message']
367
+ for c in cols:
368
+ if c not in df.columns:
369
+ df[c] = ""
370
+ df = df[cols]
371
+ df.to_csv(self.output_csv, index=False)
372
+ if self.verbose:
373
+ logger.info(f"Wrote report to {self.output_csv}")
374
+ except Exception as e:
375
+ logger.warning(f"Failed to write CSV report: {e}")
376
+
377
+ # -------- Public entrypoint --------
378
def execute_notebook(path: str, timeout: int = 1800, verbose: bool = False, output_csv: str = "notebook_execution_report.csv"):
    """
    Execute a single notebook through a freshly created NotebookExecutor.

    Args:
        path: location of the .ipynb file to execute (string)
        timeout: execution timeout in seconds, forwarded to run_notebook (default 1800)
        verbose: forwarded to the executor to enable verbose logging
        output_csv: path of the CSV report the executor writes
            (default notebook_execution_report.csv)

    Returns:
        The dict returned by run_notebook:
        {'notebook': <name>, 'status': 'Pass'|'Fail', 'error_message': <msg>}
    """
    runner = NotebookExecutor(verbose=verbose, output_csv=output_csv)
    # The working directory has to be prepared before the run starts.
    runner.setup_working_dir()
    return runner.run_notebook(path, timeout=timeout)
395
+
396
+ # -------- CLI main (optional) --------
397
def main_call(notebook):
    """Run `notebook` with a verbose NotebookExecutor and a fixed 1800-second timeout.

    Returns whatever `run_notebook` returns for that notebook.
    """
    # Verbose mode is always on here to make debugging easier.
    runner = NotebookExecutor(verbose=True)
    runner.setup_working_dir()
    return runner.run_notebook(notebook, timeout=1800)
src/v2/notebook_parser.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json,re
2
+
3
class NotebookParser:
    """Load a Jupyter notebook file and flatten it into ordered content blocks."""

    # Markdown inline image syntax ``![alt](target)``; group 1 is the target.
    _IMAGE_RE = re.compile(r"!\[.*?\]\((.*?)\)")

    # Data-URI prefixes that are emitted as base64 image blocks, with their MIME type.
    _DATA_URI_PREFIXES = (
        ("data:image/png;base64,", "image/png"),
        ("data:image/jpeg;base64,", "image/jpeg"),
    )

    # Output MIME types emitted as images, in order of preference.
    _IMAGE_MIME_TYPES = ("image/png", "image/jpeg")

    def __init__(self, notebook_path: str):
        """Read and JSON-decode the notebook stored at `notebook_path` (UTF-8)."""
        self.notebook_path = notebook_path
        with open(notebook_path, "r", encoding="utf-8") as handle:
            self.nb_json = json.load(handle)

    @staticmethod
    def _text_block(text):
        """Build a text content block."""
        return {"type": "text", "text": f"{text}"}

    @staticmethod
    def _image_block(payload, mime_type):
        """Build a base64 image content block."""
        return {
            "type": "image",
            "source_type": "base64",
            "data": payload,
            "mime_type": mime_type
        }

    def _markdown_blocks(self, cell, plots):
        """Return the content blocks contributed by one markdown cell."""
        text = "".join(cell.get("source", []))
        if not text.strip():
            return []

        if not plots:
            # Images are not wanted: drop the image markup and keep what is left.
            remainder = self._IMAGE_RE.sub("", text).strip()
            return [self._text_block(remainder)] if remainder else []

        # Images are wanted: keep the full text, then add one block per image.
        blocks = [self._text_block(text)]
        for target in self._IMAGE_RE.findall(text):
            for prefix, mime_type in self._DATA_URI_PREFIXES:
                if target.startswith(prefix):
                    blocks.append(self._image_block(target.replace(prefix, ""), mime_type))
                    break
            else:
                # Not an embedded PNG/JPEG: reference the image by its target.
                blocks.append(self._text_block(f"[Image: {target}]"))
        return blocks

    def _output_blocks(self, output, plots):
        """Return the content blocks contributed by one code-cell output."""
        blocks = []

        if "data" in output:
            data = output["data"]
            image_mime = None
            if plots:
                image_mime = next((m for m in self._IMAGE_MIME_TYPES if m in data), None)
            if image_mime is not None:
                blocks.append(self._image_block(data[image_mime], image_mime))
            elif "text/plain" in data:
                rendered = "".join(data["text/plain"])
                if rendered.strip():
                    blocks.append(self._text_block(rendered))

        output_type = output.get("output_type")
        if output_type == "stream":
            streamed = "".join(output.get("text", []))
            if streamed.strip():
                blocks.append(self._text_block(streamed))
        elif output_type == "error":
            ename = output.get("ename", "")
            evalue = output.get("evalue", "")
            traceback = "\n".join(output.get("traceback", []))
            blocks.append(self._text_block(f"{ename}: {evalue}\n{traceback}"))

        return blocks

    def extract(self, code: bool = False, code_output: bool = False, markdown: bool = False, plots: bool = False):
        """
        Collect notebook content in order of appearance.

        Args:
            code (bool): include the source of code cells
            code_output (bool): include the outputs of code cells
            markdown (bool): include markdown cells
            plots (bool): include image outputs (PNG/JPEG, including markdown images)

        Returns:
            List[dict]: text blocks ``{"type": "text", "text": ...}`` and image blocks
            ``{"type": "image", "source_type": "base64", "data": ..., "mime_type": ...}``
        """
        content = []
        for cell in self.nb_json.get("cells", []):
            kind = cell.get("cell_type")

            if markdown and kind == "markdown":
                content.extend(self._markdown_blocks(cell, plots))
            elif code and kind == "code":
                source = "".join(cell.get("source", []))
                if source.strip():
                    content.append(self._text_block(source))

            # Outputs follow the source of the same cell.
            if code_output and kind == "code":
                for output in cell.get("outputs", []):
                    content.extend(self._output_blocks(output, plots))

        return content
116
+
117
+
src/v2/output_schema.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Union
2
+ from pydantic import BaseModel, Field
3
+
4
class LLMCorrectionOutput(BaseModel):
    # Three index-aligned lists describing proposed text corrections: entry i of
    # `corrected_text` and `is_grammar_error` refers to entry i of `text`.
    # All fields are required (`...`); nothing in this model enforces equal lengths.
    # NOTE(review): documented with comments rather than a class docstring because
    # pydantic copies a model docstring into the generated schema description.
    # Presumably this is the structured-output schema of an LLM call; confirm against caller.
    text: List[str] = Field(
        ...,
        description="A list of exact text fragments from the Jupyter notebook cells where corrections need to be applied. Each fragment must be minimal and only include the part with issues."
    )
    corrected_text: List[str] = Field(
        ...,
        description="A list of corrected text fragments, aligned by index with `text`. Each entry must contain only the corrected version."
    )
    is_grammar_error: List[bool] = Field(
        ...,
        description="A list of booleans aligned by index with `text`. True if the issue is a grammatical/punctuation/capitalization/spelling error, False if it is a stylistic enhancement."
    )
17
+
18
class LLMFactualCheckOutput(BaseModel):
    # Two index-aligned lists: entry i of `corrected_text` is the correction for
    # entry i of `text`. Both fields are required (`...`); equal lengths are not
    # enforced by this model.
    # NOTE(review): documented with comments rather than a class docstring because
    # pydantic copies a model docstring into the generated schema description.
    text: List[str] = Field(
        ...,
        description="Exact text fragments from the notebook that contain factual errors."
    )
    corrected_text: List[str] = Field(
        ...,
        description="Corrected factual statements aligned with `text`."
    )
27
+
28
class EvaluationSuggestions(BaseModel):
    # Single list of suggestion strings.
    # Unlike the sibling models, this Field passes no explicit `...` default;
    # NOTE(review): pydantic should still treat it as required. Confirm for the pinned version.
    key_suggestions: List[str] = Field(
        description="A list of actionable suggestions that highlight the most critical improvements across business context, objectives, conclusions & recommendations, and alignment."
    )
32
+
33
class NotebookFlowEvaluation(BaseModel):
    # Single required list (`...`) of suggestion strings about the notebook's flow.
    # NOTE(review): documented with comments rather than a class docstring because
    # pydantic copies a model docstring into the generated schema description.
    suggestions: List[str] = Field(
        ...,
        description="Actionable suggestions to improve the notebook's logical flow, instructional design, or overall quality."
    )
src/v2/setup.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from assets import logo,icon
3
+
4
def page_setup():
    """Set the page title and icon, then display the logo linking to the Great Learning site."""
    st.set_page_config(page_title="Great Lens", page_icon=icon)
    st.logo(logo, size='large', link='https://www.mygreatlearning.com/')
src/v2/streamlit_app_modified.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from setup import page_setup
4
+ from utilities import save_notebook, save_datasets
5
+ from grammar_exec import execute
6
+ from notebook import main_call
7
+ import pandas as pd
8
+ from pathlib import Path
9
+
10
+
11
# Apply the page title, icon and logo. `page_setup` was imported but never
# invoked, so none of that configuration took effect. It calls
# st.set_page_config, which must be the first Streamlit command of a script
# run, hence it sits ahead of every other st.* call.
page_setup()

# Centre the title by rendering it in the wide middle column of three.
_, title_col, _ = st.columns([2, 4, 1])
with title_col:
    st.title(":blue[Great] Lens 🕵️‍♂️")

# File upload section
st.subheader("📁 Upload Files")

col1, col2 = st.columns(2)

with col1:
    st.markdown("**Upload Notebook**")
    # A single .ipynb file; None until the user uploads one.
    notebook = st.file_uploader(
        label='Select Jupyter Notebook',
        accept_multiple_files=False,
        type=['ipynb'],
        help="Upload a .ipynb file to analyze"
    )

with col2:
    st.markdown("**Upload Datasets (Optional)**")
    # Zero or more data files that the notebook reads.
    datasets = st.file_uploader(
        label='Select Dataset Files',
        accept_multiple_files=True,
        type=['csv', 'xlsx', 'xls', 'json', 'txt', 'parquet'],
        help="Upload datasets that your notebook references"
    )
37
+
38
# Display uploaded files info
if datasets:
    st.info(f"📊 {len(datasets)} dataset(s) uploaded: {', '.join([f.name for f in datasets])}")

# Everything below runs only once a notebook has been uploaded.
# NOTE(review): Streamlit re-runs the script on each interaction, so the notebook
# is presumably saved, executed and analysed again on every rerun; confirm
# whether caching is wanted.
if notebook:
    st.success(f"📓 Notebook uploaded: {notebook.name}")

    # Save files to /tmp/Notebook
    # (save_notebook recreates the directory, so it must run before save_datasets)
    save_notebook(notebook)
    if datasets:
        save_datasets(datasets)
        st.info("✅ Datasets saved to notebook directory")

    results_tab, grammar_tab = st.tabs(['Execution', 'Grammar/Fact'])

    # ---- Execution tab: run the saved notebook and report Pass/Fail ----
    with results_tab:
        with st.spinner("🔄 Executing notebook..."):
            try:
                notebook_dir_path = Path("/tmp/Notebook")
                notebook_files = [f for f in notebook_dir_path.iterdir() if f.suffix == '.ipynb']

                if not notebook_files:
                    st.error("No notebook found in directory")
                else:
                    # The directory was just recreated, so the uploaded notebook
                    # is expected to be the only .ipynb present.
                    notebook_path = notebook_files[0]
                    st.write(f'🚀 Processing notebook: {notebook_path.name}')

                    # Show available datasets
                    dataset_files = [f for f in notebook_dir_path.iterdir()
                                   if f.suffix.lower() in ['.csv', '.xlsx', '.xls', '.json', '.txt', '.parquet']]

                    if dataset_files:
                        st.info(f"📁 Available datasets: {', '.join([f.name for f in dataset_files])}")

                    results = main_call(notebook_path)

                    # Display results in a more user-friendly way
                    if isinstance(results, dict):
                        # Narrow status column on the left, details on the right.
                        col1, col2 = st.columns([1, 3])
                        with col1:
                            if results['status'] == 'Pass':
                                st.success("✅ **Status: PASSED**")
                            else:
                                st.error("❌ **Status: FAILED**")

                        with col2:
                            st.write(f"**Notebook:** {results['notebook']}")
                            if results['error_message']:
                                st.error(f"**Error:** {results['error_message']}")
                    else:
                        # Any non-dict result is shown as a table.
                        st.dataframe(results)

            except Exception as e:
                # Keep the app alive and surface the failure inside the tab.
                st.error(f"❌ Error processing notebook: {str(e)}")

    # ---- Grammar/Fact tab: analyse the saved notebook's text ----
    with grammar_tab:
        try:
            with st.spinner("🔍 Analyzing grammar and facts..."):
                # `results` is rebound here; the code below uses it as a DataFrame.
                results = execute("/tmp/Notebook")

            if not results.empty:
                # Display grammar results in a more readable format
                st.subheader("📝 Grammar & Style Analysis")

                # Render only when the column exists and has at least one non-null entry.
                if 'Grammar_Text' in results.columns and len(results['Grammar_Text'].dropna()) > 0:
                    grammar_issues = results[results['Grammar_Text'].notna()]

                    for idx, row in grammar_issues.iterrows():
                        # Truthy flag -> grammar error; otherwise a style suggestion.
                        if row['Is Grammar Error?']:
                            st.warning(f"**Grammar Error:** {row['Grammar_Text']}")
                            st.info(f"**Suggestion:** {row['Grammar_Suggestions']}")
                        else:
                            st.info(f"**Style Suggestion:** {row['Grammar_Text']}")
                            st.success(f"**Improvement:** {row['Grammar_Suggestions']}")
                        st.divider()

                st.subheader("🎯 Factual Accuracy Analysis")

                # Same existence / non-null check as for the grammar column.
                if 'Fact_Text' in results.columns and len(results['Fact_Text'].dropna()) > 0:
                    fact_issues = results[results['Fact_Text'].notna()]

                    for idx, row in fact_issues.iterrows():
                        st.error(f"**Factual Error:** {row['Fact_Text']}")
                        st.success(f"**Correction:** {row['Fact_Suggestions']}")
                        st.divider()

                # Show raw dataframe as well
                with st.expander("📊 View Raw Results"):
                    st.dataframe(results)
            else:
                st.success("✅ No grammar or factual issues found!")

        except Exception as e:
            # Covers failures in the analysis itself and missing columns while rendering.
            st.error(f"❌ Unable to process grammar/facts: {str(e)}")
132
+
133
# Static help content for the sidebar, as (heading, body) pairs in display order.
sidebar_sections = [
    ("## 💡 How to Use", """
1. **Upload Notebook**: Select your .ipynb file
2. **Upload Datasets**: Add any CSV, Excel, or other data files your notebook uses
3. **Execution Tab**: See if your notebook runs successfully
4. **Grammar/Fact Tab**: Check for text quality and factual accuracy

### 🔧 Colab Support
The tool automatically handles Google Colab specific code:
- Replaces Drive mounts with local file access
- Uses your uploaded datasets instead of Colab file uploads
- Skips Colab-specific imports that won't work locally
"""),
    ("## 📋 Supported Formats", """
**Notebooks:** .ipynb
**Datasets:** .csv, .xlsx, .xls, .json, .txt, .parquet
"""),
]

# Each section is rendered as a heading followed by its body.
for heading, body in sidebar_sections:
    st.sidebar.markdown(heading)
    st.sidebar.markdown(body)
src/v2/utilities.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from concurrent.futures import ThreadPoolExecutor, as_completed
2
+ from tqdm import tqdm
3
+ from pathlib import Path
4
+ import os
5
+ import shutil
6
+
7
def _safe_upload_name(raw_name):
    """Return only the final path component of an uploaded file's name.

    Upload names come from the client, so directory parts (POSIX or Windows
    separators) are dropped to keep the file inside the target directory.

    Raises:
        ValueError: if nothing usable remains (empty name, "." or "..").
    """
    candidate = Path(str(raw_name).replace("\\", "/")).name
    if candidate in ("", ".", ".."):
        raise ValueError(f"Invalid upload file name: {raw_name!r}")
    return candidate


def save_notebook(notebook, notebook_dir="/tmp/Notebook"):
    """Save the uploaded notebook into a freshly recreated directory.

    Args:
        notebook: uploaded file object exposing ``name`` and ``getbuffer()``;
            a falsy value only recreates the empty directory.
        notebook_dir: target directory (default /tmp/Notebook). Any existing
            directory at this path is deleted first.

    Raises:
        ValueError: if the upload's name has no usable file-name component.
    """
    folder_path = Path(notebook_dir)
    # Validate the name before wiping anything, so a bad upload leaves the
    # previous contents untouched.
    safe_name = _safe_upload_name(notebook.name) if notebook else None

    if folder_path.is_dir():
        shutil.rmtree(folder_path)

    folder_path.mkdir(parents=True, exist_ok=True)

    if safe_name is not None:
        with open(folder_path / safe_name, "wb") as file:
            file.write(notebook.getbuffer())


def save_datasets(datasets, notebook_dir="/tmp/Notebook"):
    """Save uploaded datasets into the notebook directory, keeping existing files.

    Args:
        datasets: iterable of uploaded file objects exposing ``name`` and
            ``getbuffer()``; a falsy value only ensures the directory exists.
        notebook_dir: target directory (default /tmp/Notebook).

    Raises:
        ValueError: if an upload's name has no usable file-name component.
    """
    folder_path = Path(notebook_dir)

    # Ensure directory exists
    folder_path.mkdir(parents=True, exist_ok=True)

    if datasets:
        for dataset in datasets:
            with open(folder_path / _safe_upload_name(dataset.name), "wb") as file:
                file.write(dataset.getbuffer())
33
+
34
def get_dataset_files(directory_path):
    """Return the dataset files directly inside `directory_path`, sorted by path.

    A dataset file is a regular file whose suffix (case-insensitive) is one of
    .csv, .xlsx, .xls, .json, .txt or .parquet. Sub-directories are not searched.

    Args:
        directory_path: directory to scan.

    Returns:
        A sorted list of ``Path`` objects; empty when the path is missing or is
        not a directory.
    """
    folder_path = Path(directory_path)
    # is_dir() also covers "exists but is a regular file", which would make
    # iterdir() raise.
    if not folder_path.is_dir():
        return []

    dataset_extensions = {'.csv', '.xlsx', '.xls', '.json', '.txt', '.parquet'}
    # iterdir() yields entries in arbitrary order; sorting keeps the result
    # deterministic for callers that treat the first match specially.
    return sorted(
        entry for entry in folder_path.iterdir()
        if entry.suffix.lower() in dataset_extensions and entry.is_file()
    )
45
+
46
def create_dataset_mapping(directory_path):
    """Map the names a notebook might use for a dataset to the actual file names.

    Exact file names always map to themselves. Every other key is an alias
    and is only added when that key is still free, so an alias can never
    redirect the name of a file that really exists:

    - the stem, the lower-cased file name and the lower-cased stem;
    - 'data.csv' and 'dataset.csv' for files whose name starts with 'data'
      (case-insensitive);
    - 'default.csv' for the first CSV file encountered.

    Args:
        directory_path: directory scanned through get_dataset_files.

    Returns:
        dict mapping each reference string to a file name in the directory.
    """
    dataset_files = get_dataset_files(directory_path)

    # Pass 1: exact names take precedence over every alias.
    mapping = {file.name: file.name for file in dataset_files}

    # Pass 2: aliases only fill keys that are still unclaimed (first file wins).
    for file in dataset_files:
        filename = file.name
        stem = file.stem
        aliases = [stem, filename.lower(), stem.lower()]

        # Common generic names
        if filename.lower().startswith('data'):
            aliases.extend(['data.csv', 'dataset.csv'])

        # The first CSV becomes the default
        if file.suffix.lower() == '.csv':
            aliases.append('default.csv')

        for alias in aliases:
            mapping.setdefault(alias, filename)

    return mapping
72
+
73
def safe_concurrent_batch(chain, inputs, max_workers=2):
    """Call ``chain.invoke`` on every input in a thread pool, collecting failures.

    Args:
        chain: object exposing ``invoke(input)``.
        inputs: sized collection of inputs (its length sets the progress-bar total).
        max_workers: size of the thread pool (default 2).

    Returns:
        One dict per input, in completion order, with keys ``input``,
        ``output`` (None on failure) and ``status`` ("success" or
        "failed: <message>").
    """
    outcomes = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which input each future belongs to.
        pending = {}
        for item in inputs:
            pending[pool.submit(chain.invoke, item)] = item

        for finished in tqdm(as_completed(pending), total=len(inputs), desc="Processing"):
            record = {"input": pending[finished]}
            try:
                record["output"] = finished.result()
                record["status"] = "success"
            except Exception as e:
                # A failing input is recorded; it does not abort the batch.
                record["output"] = None
                record["status"] = f"failed: {str(e)}"
            outcomes.append(record)
    return outcomes