Spaces:
Sleeping
fix(kaggle): align pip-managed numpy with kernel's loaded numpy
Browse files
Symptom: Cell 5 (post-restart verify) blew up with
RuntimeError: numpy was upgraded mid-session
(loaded: 2.4.4, installed: 2.4.3)
because Kaggle ships numpy 2.4.4 at /usr/lib/python3/dist-packages while
pip writes to /usr/local/lib/python3.12/dist-packages — and torch's
cu128 install pulled numpy 2.4.3 as a transitive dep into the latter.
unsloth_zoo's startup does a strict string-equality check between
`numpy.__version__` (loaded via Python's import path priority) and
`importlib.metadata.version("numpy")` (reads pip records) and bails on any
mismatch, even patch-level.
Fix in REPAIR cell:
* Detect the kernel's actual loaded numpy version up front and append
`numpy=={detected}` to the pip constraints file, so unsloth/trl/bnb
cannot resolve a different one.
* Add a 5b realign step that does
pip install --force-reinstall --no-deps numpy=={detected}
to overwrite whatever pip put in /usr/local/lib with the version the
kernel actually loads. After this, importlib.metadata and numpy.__version__
agree regardless of which dist-packages path takes precedence.
Made-with: Cursor
- kaggle/build_notebook.py +22 -1
- kaggle/train_ermap_grpo_kaggle.ipynb +22 -1
|
@@ -176,8 +176,19 @@ get_ipython().system('pip install -q --no-cache-dir --force-reinstall '
|
|
| 176 |
# can NEVER pull a different torch from default PyPI. Without this, step 3's
|
| 177 |
# `--force-reinstall bitsandbytes` and step 4's `unsloth` upgrade re-resolve
|
| 178 |
# torch from PyPI (currently 2.11.0), which breaks the cu128 torchvision pair.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
with open("/tmp/ermap_constraints.txt", "w") as _cf:
|
| 180 |
-
_cf.write("torch==2.10.0\\ntorchvision==0.25.0\\n")
|
| 181 |
|
| 182 |
# 3. Reinstall bitsandbytes against the now-pinned torch.
|
| 183 |
# --no-deps because bnb just needs torch at RUNTIME (it dlopens torch's
|
|
@@ -202,6 +213,16 @@ get_ipython().system('pip install -q --no-cache-dir '
|
|
| 202 |
'"groq>=0.18.0" "huggingface_hub>=0.25.0" '
|
| 203 |
'"gymnasium>=0.29.0" "openenv-core>=0.1.0"')
|
| 204 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
# 6. Verify in a SUBPROCESS (so the parent kernel never imports any of these
|
| 206 |
# while pip is mid-flight, which is what causes the
|
| 207 |
# 'numpy was upgraded mid-session (loaded: X, installed: Y)' RuntimeError
|
|
|
|
| 176 |
# can NEVER pull a different torch from default PyPI. Without this, step 3's
|
| 177 |
# `--force-reinstall bitsandbytes` and step 4's `unsloth` upgrade re-resolve
|
| 178 |
# torch from PyPI (currently 2.11.0), which breaks the cu128 torchvision pair.
|
| 179 |
+
#
|
| 180 |
+
# Also pin numpy to whatever Kaggle's kernel already has loaded — Kaggle's
|
| 181 |
+
# image puts numpy at /usr/lib/python3/dist-packages while pip writes to
|
| 182 |
+
# /usr/local/lib/python3.12/dist-packages, so any version drift between the
|
| 183 |
+
# two paths trips unsloth_zoo's strict loaded-vs-installed check at import.
|
| 184 |
+
import subprocess as _sp
|
| 185 |
+
_kernel_numpy = _sp.check_output(
|
| 186 |
+
[sys.executable, "-c", "import numpy; print(numpy.__version__)"],
|
| 187 |
+
text=True,
|
| 188 |
+
).strip()
|
| 189 |
+
print(f" detected kernel numpy = {_kernel_numpy} (will pin)")
|
| 190 |
with open("/tmp/ermap_constraints.txt", "w") as _cf:
|
| 191 |
+
_cf.write(f"torch==2.10.0\\ntorchvision==0.25.0\\nnumpy=={_kernel_numpy}\\n")
|
| 192 |
|
| 193 |
# 3. Reinstall bitsandbytes against the now-pinned torch.
|
| 194 |
# --no-deps because bnb just needs torch at RUNTIME (it dlopens torch's
|
|
|
|
| 213 |
'"groq>=0.18.0" "huggingface_hub>=0.25.0" '
|
| 214 |
'"gymnasium>=0.29.0" "openenv-core>=0.1.0"')
|
| 215 |
|
| 216 |
+
# 5b. Realign the pip-managed numpy with whatever the Kaggle kernel actually
|
| 217 |
+
# has loaded. This force-rewrites /usr/local/lib/.../numpy at the exact
|
| 218 |
+
# version reported by the running interpreter, so importlib.metadata
|
| 219 |
+
# and `numpy.__version__` agree even if Kaggle ships its base numpy at
|
| 220 |
+
# a different dist-packages path.
|
| 221 |
+
print(f"[5b/6] Realigning pip-managed numpy to {_kernel_numpy}...")
|
| 222 |
+
get_ipython().system(
|
| 223 |
+
f'pip install -q --force-reinstall --no-deps "numpy=={_kernel_numpy}"'
|
| 224 |
+
)
|
| 225 |
+
|
| 226 |
# 6. Verify in a SUBPROCESS (so the parent kernel never imports any of these
|
| 227 |
# while pip is mid-flight, which is what causes the
|
| 228 |
# 'numpy was upgraded mid-session (loaded: X, installed: Y)' RuntimeError
|
|
@@ -124,8 +124,19 @@
|
|
| 124 |
"# can NEVER pull a different torch from default PyPI. Without this, step 3's\n",
|
| 125 |
"# `--force-reinstall bitsandbytes` and step 4's `unsloth` upgrade re-resolve\n",
|
| 126 |
"# torch from PyPI (currently 2.11.0), which breaks the cu128 torchvision pair.\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
"with open(\"/tmp/ermap_constraints.txt\", \"w\") as _cf:\n",
|
| 128 |
-
" _cf.write(\"torch==2.10.0\\ntorchvision==0.25.0\\n\")\n",
|
| 129 |
"\n",
|
| 130 |
"# 3. Reinstall bitsandbytes against the now-pinned torch.\n",
|
| 131 |
"# --no-deps because bnb just needs torch at RUNTIME (it dlopens torch's\n",
|
|
@@ -150,6 +161,16 @@
|
|
| 150 |
" '\"groq>=0.18.0\" \"huggingface_hub>=0.25.0\" '\n",
|
| 151 |
" '\"gymnasium>=0.29.0\" \"openenv-core>=0.1.0\"')\n",
|
| 152 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
"# 6. Verify in a SUBPROCESS (so the parent kernel never imports any of these\n",
|
| 154 |
"# while pip is mid-flight, which is what causes the\n",
|
| 155 |
"# 'numpy was upgraded mid-session (loaded: X, installed: Y)' RuntimeError\n",
|
|
|
|
| 124 |
"# can NEVER pull a different torch from default PyPI. Without this, step 3's\n",
|
| 125 |
"# `--force-reinstall bitsandbytes` and step 4's `unsloth` upgrade re-resolve\n",
|
| 126 |
"# torch from PyPI (currently 2.11.0), which breaks the cu128 torchvision pair.\n",
|
| 127 |
+
"#\n",
|
| 128 |
+
"# Also pin numpy to whatever Kaggle's kernel already has loaded — Kaggle's\n",
|
| 129 |
+
"# image puts numpy at /usr/lib/python3/dist-packages while pip writes to\n",
|
| 130 |
+
"# /usr/local/lib/python3.12/dist-packages, so any version drift between the\n",
|
| 131 |
+
"# two paths trips unsloth_zoo's strict loaded-vs-installed check at import.\n",
|
| 132 |
+
"import subprocess as _sp\n",
|
| 133 |
+
"_kernel_numpy = _sp.check_output(\n",
|
| 134 |
+
" [sys.executable, \"-c\", \"import numpy; print(numpy.__version__)\"],\n",
|
| 135 |
+
" text=True,\n",
|
| 136 |
+
").strip()\n",
|
| 137 |
+
"print(f\" detected kernel numpy = {_kernel_numpy} (will pin)\")\n",
|
| 138 |
"with open(\"/tmp/ermap_constraints.txt\", \"w\") as _cf:\n",
|
| 139 |
+
" _cf.write(f\"torch==2.10.0\\ntorchvision==0.25.0\\nnumpy=={_kernel_numpy}\\n\")\n",
|
| 140 |
"\n",
|
| 141 |
"# 3. Reinstall bitsandbytes against the now-pinned torch.\n",
|
| 142 |
"# --no-deps because bnb just needs torch at RUNTIME (it dlopens torch's\n",
|
|
|
|
| 161 |
" '\"groq>=0.18.0\" \"huggingface_hub>=0.25.0\" '\n",
|
| 162 |
" '\"gymnasium>=0.29.0\" \"openenv-core>=0.1.0\"')\n",
|
| 163 |
"\n",
|
| 164 |
+
"# 5b. Realign the pip-managed numpy with whatever the Kaggle kernel actually\n",
|
| 165 |
+
"# has loaded. This force-rewrites /usr/local/lib/.../numpy at the exact\n",
|
| 166 |
+
"# version reported by the running interpreter, so importlib.metadata\n",
|
| 167 |
+
"# and `numpy.__version__` agree even if Kaggle ships its base numpy at\n",
|
| 168 |
+
"# a different dist-packages path.\n",
|
| 169 |
+
"print(f\"[5b/6] Realigning pip-managed numpy to {_kernel_numpy}...\")\n",
|
| 170 |
+
"get_ipython().system(\n",
|
| 171 |
+
" f'pip install -q --force-reinstall --no-deps \"numpy=={_kernel_numpy}\"'\n",
|
| 172 |
+
")\n",
|
| 173 |
+
"\n",
|
| 174 |
"# 6. Verify in a SUBPROCESS (so the parent kernel never imports any of these\n",
|
| 175 |
"# while pip is mid-flight, which is what causes the\n",
|
| 176 |
"# 'numpy was upgraded mid-session (loaded: X, installed: Y)' RuntimeError\n",
|