""" GitHub Notebook Fixer — Hugging Face Space Fixes .ipynb files so they render correctly on GitHub. Common issues fixed: 1. Missing 'state' key in metadata.widgets 2. Oversized cell outputs (GitHub has a ~1MB render limit) 3. Invalid/missing notebook metadata (kernelspec, language_info) 4. Large base64-encoded images in outputs 5. Widget metadata without required 'state' field """ import gradio as gr import json import copy import base64 import uuid import tempfile from pathlib import Path # --------------------------------------------------------------------------- # Analysis helpers # --------------------------------------------------------------------------- MAX_OUTPUT_SIZE = 500_000 # ~500 KB per cell output is risky for GitHub MAX_TOTAL_SIZE = 10_000_000 # ~10 MB total notebook size warning MAX_IMAGE_SIZE = 1_000_000 # ~1 MB per embedded image GITHUB_ISSUES = { "widget_state_missing": { "severity": "critical", "title": "Missing 'state' in metadata.widgets", "desc": ( "GitHub requires a 'state' key inside metadata.widgets. " "Without it the notebook preview shows 'Invalid Notebook'." 
), }, "widgets_empty_state": { "severity": "info", "title": "metadata.widgets exists but 'state' is empty", "desc": "The widget state dict is present but empty — GitHub renders this fine.", }, "no_kernelspec": { "severity": "warning", "title": "Missing kernelspec in metadata", "desc": "GitHub may not detect the notebook language correctly.", }, "no_language_info": { "severity": "info", "title": "Missing language_info in metadata", "desc": "Minor — GitHub can usually infer the language from kernelspec.", }, "oversized_output": { "severity": "warning", "title": "Cell output exceeds ~500 KB", "desc": "Very large outputs can cause GitHub to skip rendering the notebook.", }, "oversized_notebook": { "severity": "critical", "title": "Notebook exceeds ~10 MB", "desc": "GitHub will refuse to render notebooks over ~10 MB.", }, "large_embedded_image": { "severity": "warning", "title": "Large base64 image embedded in output", "desc": "Images over ~1 MB bloat the notebook and slow GitHub rendering.", }, "invalid_nbformat": { "severity": "critical", "title": "Missing or invalid nbformat version", "desc": "GitHub needs nbformat >= 4 to render the notebook.", }, } def _sizeof(obj) -> int: """Rough byte size of a JSON-serializable object.""" return len(json.dumps(obj, ensure_ascii=False).encode()) def analyze_notebook(nb: dict) -> list[dict]: """Return a list of issue dicts found in the notebook.""" issues: list[dict] = [] # ---- nbformat version ---- nbf = nb.get("nbformat") if nbf is None or (isinstance(nbf, int) and nbf < 4): issues.append({**GITHUB_ISSUES["invalid_nbformat"], "location": "root"}) # ---- metadata.widgets ---- meta = nb.get("metadata", {}) widgets = meta.get("widgets") if widgets is not None: if "application/vnd.jupyter.widget-state+json" in widgets: ws = widgets["application/vnd.jupyter.widget-state+json"] if "state" not in ws: issues.append({**GITHUB_ISSUES["widget_state_missing"], "location": "metadata.widgets"}) elif not ws["state"]: 
issues.append({**GITHUB_ISSUES["widgets_empty_state"], "location": "metadata.widgets"}) else: # widgets key exists but no standard widget-state key for key, val in widgets.items(): if isinstance(val, dict) and "state" not in val: issues.append({ **GITHUB_ISSUES["widget_state_missing"], "location": f"metadata.widgets['{key}']", }) # ---- kernelspec / language_info ---- if "kernelspec" not in meta: issues.append({**GITHUB_ISSUES["no_kernelspec"], "location": "metadata"}) if "language_info" not in meta: issues.append({**GITHUB_ISSUES["no_language_info"], "location": "metadata"}) # ---- per-cell checks ---- for idx, cell in enumerate(nb.get("cells", [])): for out in cell.get("outputs", []): out_size = _sizeof(out) if out_size > MAX_OUTPUT_SIZE: issues.append({ **GITHUB_ISSUES["oversized_output"], "location": f"cells[{idx}]", "detail": f"{out_size / 1_000_000:.2f} MB", }) # check base64 images data = out.get("data", {}) for mime, content in data.items(): if mime.startswith("image/") and isinstance(content, str): try: img_bytes = len(base64.b64decode(content, validate=False)) except Exception: img_bytes = len(content) if img_bytes > MAX_IMAGE_SIZE: issues.append({ **GITHUB_ISSUES["large_embedded_image"], "location": f"cells[{idx}] ({mime})", "detail": f"{img_bytes / 1_000_000:.2f} MB", }) # ---- total size ---- total = _sizeof(nb) if total > MAX_TOTAL_SIZE: issues.append({ **GITHUB_ISSUES["oversized_notebook"], "location": "entire file", "detail": f"{total / 1_000_000:.2f} MB", }) return issues def predict_github_render(issues: list[dict]) -> str: """Return a human-readable prediction.""" crits = [i for i in issues if i["severity"] == "critical"] warns = [i for i in issues if i["severity"] == "warning"] if crits: return "❌ Will NOT render on GitHub" if warns: return "⚠️ Might render, but with issues" return "✅ Should render fine on GitHub" # --------------------------------------------------------------------------- # Fixer # 
# ---------------------------------------------------------------------------
# Fixer
# ---------------------------------------------------------------------------


def fix_notebook(nb: dict, strip_widgets: bool = False, strip_large_outputs: bool = True) -> dict:
    """Return a fixed copy of the notebook.

    The input dict is deep-copied and never mutated.  Fixes applied:
    nbformat/nbformat_minor normalisation, widget-state repair (or removal
    when strip_widgets is True), default kernelspec / language_info,
    oversized-output stripping (when strip_large_outputs is True),
    oversized-image removal, and nbformat-4.5 cell ids.
    """
    nb = copy.deepcopy(nb)

    # ---- Ensure nbformat ----
    # isinstance guard: a non-int nbformat (e.g. the string "4") would make
    # a bare `< 4` comparison raise TypeError.
    if not isinstance(nb.get("nbformat"), int) or nb["nbformat"] < 4:
        nb["nbformat"] = 4
    # Cell ids below are an nbformat-4.5 feature, so always make sure a
    # minor version is present (previously it was only set when the major
    # version had been invalid).
    nb.setdefault("nbformat_minor", 5)

    # ---- metadata ----
    meta = nb.setdefault("metadata", {})

    # Fix widgets.  A non-dict widgets value cannot be repaired (and would
    # crash .items()), so it is removed like strip_widgets would.
    widgets = meta.get("widgets")
    if widgets is not None:
        if strip_widgets or not isinstance(widgets, dict):
            del meta["widgets"]
        else:
            # Add missing 'state' key to every widget-state entry
            for key, val in list(widgets.items()):
                if isinstance(val, dict) and "state" not in val:
                    val["state"] = {}
            # Also handle the standard key specifically
            if "application/vnd.jupyter.widget-state+json" in widgets:
                ws = widgets["application/vnd.jupyter.widget-state+json"]
                ws.setdefault("state", {})

    # Ensure kernelspec
    if "kernelspec" not in meta:
        meta["kernelspec"] = {
            "display_name": "Python 3",
            "language": "python",
            "name": "python3",
        }

    # Ensure language_info
    if "language_info" not in meta:
        meta["language_info"] = {
            "name": "python",
            "version": "3.10.0",
        }

    # ---- Per-cell fixes ----
    for cell in nb.get("cells", []):
        new_outputs = []
        for out in cell.get("outputs", []):
            out_size = _sizeof(out)
            # Strip very large outputs if requested, leaving a marker so the
            # reader knows something was removed.
            if strip_large_outputs and out_size > MAX_OUTPUT_SIZE:
                new_outputs.append({
                    "output_type": "stream",
                    "name": "stdout",
                    "text": ["[Output removed — too large for GitHub rendering]\n"],
                })
                continue
            # Always strip oversized base64 images regardless of the checkbox.
            # Remove the mime key entirely (an empty-string payload would
            # leave an invalid zero-byte image in the output) and leave a
            # text/plain placeholder unless one already exists.
            data = out.get("data", {})
            for mime in list(data.keys()):
                if mime.startswith("image/") and isinstance(data[mime], str):
                    try:
                        img_bytes = len(base64.b64decode(data[mime], validate=False))
                    except Exception:
                        img_bytes = len(data[mime])
                    if img_bytes > MAX_IMAGE_SIZE:
                        del data[mime]
                        data.setdefault("text/plain", ["[Large image removed for GitHub compatibility]"])
            new_outputs.append(out)
        cell["outputs"] = new_outputs

        # Ensure every cell has a valid nbformat 4.5+ id (8-char hex).
        # isinstance guard: len() on a non-str id (e.g. an int) would raise.
        existing_id = cell.get("id", "")
        if not isinstance(existing_id, str) or len(existing_id) < 8:
            cell["id"] = uuid.uuid4().hex[:8]

    return nb


# ---------------------------------------------------------------------------
# Report builder
# ---------------------------------------------------------------------------


def build_report(issues: list[dict], prediction: str) -> str:
    """Render the issue list and render-prediction as a Markdown report."""
    lines = [f"## {prediction}\n"]
    if not issues:
        lines.append("No issues detected — this notebook looks good for GitHub!\n")
        return "\n".join(lines)

    severity_emoji = {"critical": "🔴", "warning": "🟡", "info": "🔵"}
    lines.append(f"**Found {len(issues)} issue(s):**\n")
    for i, issue in enumerate(issues, 1):
        emoji = severity_emoji.get(issue["severity"], "⚪")
        detail = f" — {issue.get('detail', '')}" if "detail" in issue else ""
        lines.append(f"{i}. {emoji} **{issue['title']}**{detail}")
        lines.append(f"   *Location:* `{issue['location']}`")
        lines.append(f"   {issue['desc']}\n")
    return "\n".join(lines)


# ---------------------------------------------------------------------------
# Gradio handler
# ---------------------------------------------------------------------------


def process_notebook(file, strip_widgets: bool, strip_large_outputs: bool):
    """Main handler: analyse → predict → fix → return.

    Returns a (markdown_report, fixed_file_path_or_None) tuple for the two
    Gradio outputs.
    """
    if file is None:
        return "Upload a `.ipynb` file first.", None

    # gr.File(type="filepath") passes a plain string path; older Gradio
    # versions pass a tempfile wrapper exposing .name.  Accept both —
    # `file.name` alone raises AttributeError on a str and made every
    # upload fail with "Could not read file".
    nb_path = file if isinstance(file, str) else file.name

    # Read the notebook
    try:
        with open(nb_path, "r", encoding="utf-8") as f:
            nb = json.load(f)
    except json.JSONDecodeError:
        return "❌ The uploaded file is not valid JSON. Are you sure it's a `.ipynb`?", None
    except Exception as e:
        return f"❌ Could not read file: {e}", None

    # Analyse
    issues = analyze_notebook(nb)
    prediction = predict_github_render(issues)
    report = build_report(issues, prediction)

    # Fix
    fixed_nb = fix_notebook(
        nb,
        strip_widgets=strip_widgets,
        strip_large_outputs=strip_large_outputs,
    )

    # Re-analyse fixed version
    fixed_issues = analyze_notebook(fixed_nb)
    fixed_prediction = predict_github_render(fixed_issues)
    report += "\n---\n"
    report += f"### After fix: {fixed_prediction}\n"
    if fixed_issues:
        remaining = [i for i in fixed_issues if i["severity"] in ("critical", "warning")]
        if remaining:
            report += f"⚠️ {len(remaining)} issue(s) remain (may need manual attention).\n"
        else:
            report += "Only informational notes remain — notebook should render on GitHub.\n"
    else:
        report += "All issues resolved! ✅\n"

    # Write fixed notebook to a temp file.  Close the NamedTemporaryFile
    # handle before re-opening the path: the original kept it open, leaking
    # the descriptor and failing outright on Windows (a file open via
    # NamedTemporaryFile cannot be opened again by name there).
    tmp = tempfile.NamedTemporaryFile(
        suffix=".ipynb", delete=False, prefix="fixed_", dir=tempfile.gettempdir()
    )
    tmp.close()
    with open(tmp.name, "w", encoding="utf-8") as f:
        json.dump(fixed_nb, f, ensure_ascii=False, indent=1)

    return report, tmp.name


# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------

DESCRIPTION = """
# 🔧 GitHub Notebook Fixer

**Upload a `.ipynb` file** and this tool will:

1. **Predict** whether it will render on GitHub
2. **Diagnose** all issues (missing widget state, oversized outputs, bad metadata…)
3. **Fix** the problems and return a clean `.ipynb` you can push to GitHub

### Common issues fixed
- `metadata.widgets` missing the `state` key → **"Invalid Notebook"** on GitHub
- Oversized cell outputs (>500 KB) that block rendering
- Missing `kernelspec` / `language_info` metadata
- Giant base64-encoded images bloating the file
"""

with gr.Blocks(
    title="GitHub Notebook Fixer",
    theme=gr.themes.Soft(primary_hue="blue", neutral_hue="slate"),
) as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload .ipynb file",
                file_types=[".ipynb"],
                type="filepath",
            )
            strip_widgets = gr.Checkbox(
                label="Remove widget metadata entirely (instead of fixing it)",
                value=False,
            )
            strip_large = gr.Checkbox(
                label="Strip oversized outputs (>500 KB per cell)",
                value=True,
            )
            btn = gr.Button("🔍 Analyze & Fix", variant="primary", size="lg")
        with gr.Column(scale=2):
            report_output = gr.Markdown(label="Diagnosis Report")
            file_output = gr.File(label="Download Fixed Notebook")

    btn.click(
        fn=process_notebook,
        inputs=[file_input, strip_widgets, strip_large],
        outputs=[report_output, file_output],
    )

    gr.Markdown(
        '---\n*Built to solve the classic GitHub "Invalid Notebook" error. '
        "Works for Colab, Jupyter, and any nbformat-4 notebook.*"
    )

    with gr.Accordion("🔒 Privacy & Data — your files are never stored", open=False):
        gr.Markdown("""
### How this tool works
1. You upload a `.ipynb` file — it is read **entirely in memory** on the server.
2. The tool analyses the JSON structure, fixes any issues, and writes a temporary output file.
3. You download the fixed file.
4. The temporary file is managed by the OS and is **automatically deleted** — it is never persisted beyond your session.

### What we do NOT do
- ❌ We do **not** save, log, or store your notebook file anywhere.
- ❌ We do **not** collect any personal information or usage metadata linked to your file.
- ❌ There is **no database** — nothing is written to permanent storage.
- ❌ Your notebook content is **never shared** with third parties.

### What happens to your data
Your file lives only in the server's RAM and a short-lived OS temp file for the duration of your request.
Once you close the session or the Space restarts, every trace of it is gone.

**You are the only person who ever sees your notebook.**
""")

if __name__ == "__main__":
    demo.launch()