Uddiii committed on
Commit
27cf9cd
·
1 Parent(s): 112679c

fix(kaggle): align pip-managed numpy with kernel's loaded numpy

Browse files

Symptom: Cell 5 (post-restart verify) blew up with
RuntimeError: numpy was upgraded mid-session
(loaded: 2.4.4, installed: 2.4.3)
because Kaggle ships numpy 2.4.4 at /usr/lib/python3/dist-packages while
pip writes to /usr/local/lib/python3.12/dist-packages — and torch's
cu128 install pulled numpy 2.4.3 as a transitive dep into the latter.
unsloth_zoo's startup does a strict string-equality check between
`numpy.__version__` (the module actually imported, per Python's import path
priority) and `importlib.metadata.version("numpy")` (read from the installed
distribution metadata that pip writes) and raises on any mismatch, even a
patch-level one.

Fix in REPAIR cell:

* Detect the kernel's actual loaded numpy version up front and append
`numpy=={detected}` to the pip constraints file, so unsloth/trl/bnb
cannot resolve a different one.
* Add a step 5b ("realign") that runs
pip install --force-reinstall --no-deps numpy=={detected}
to overwrite whatever pip put in /usr/local/lib with the version the
kernel actually loads. After this, importlib.metadata and numpy.__version__
agree regardless of which dist-packages path takes precedence.

Made-with: Cursor

kaggle/build_notebook.py CHANGED
@@ -176,8 +176,19 @@ get_ipython().system('pip install -q --no-cache-dir --force-reinstall '
176
  # can NEVER pull a different torch from default PyPI. Without this, step 3's
177
  # `--force-reinstall bitsandbytes` and step 4's `unsloth` upgrade re-resolve
178
  # torch from PyPI (currently 2.11.0), which breaks the cu128 torchvision pair.
 
 
 
 
 
 
 
 
 
 
 
179
  with open("/tmp/ermap_constraints.txt", "w") as _cf:
180
- _cf.write("torch==2.10.0\\ntorchvision==0.25.0\\n")
181
 
182
  # 3. Reinstall bitsandbytes against the now-pinned torch.
183
  # --no-deps because bnb just needs torch at RUNTIME (it dlopens torch's
@@ -202,6 +213,16 @@ get_ipython().system('pip install -q --no-cache-dir '
202
  '"groq>=0.18.0" "huggingface_hub>=0.25.0" '
203
  '"gymnasium>=0.29.0" "openenv-core>=0.1.0"')
204
 
 
 
 
 
 
 
 
 
 
 
205
  # 6. Verify in a SUBPROCESS (so the parent kernel never imports any of these
206
  # while pip is mid-flight, which is what causes the
207
  # 'numpy was upgraded mid-session (loaded: X, installed: Y)' RuntimeError
 
176
  # can NEVER pull a different torch from default PyPI. Without this, step 3's
177
  # `--force-reinstall bitsandbytes` and step 4's `unsloth` upgrade re-resolve
178
  # torch from PyPI (currently 2.11.0), which breaks the cu128 torchvision pair.
179
+ #
180
+ # Also pin numpy to whatever Kaggle's kernel already has loaded — Kaggle's
181
+ # image puts numpy at /usr/lib/python3/dist-packages while pip writes to
182
+ # /usr/local/lib/python3.12/dist-packages, so any version drift between the
183
+ # two paths trips unsloth_zoo's strict loaded-vs-installed check at import.
184
+ import subprocess as _sp
185
+ _kernel_numpy = _sp.check_output(
186
+ [sys.executable, "-c", "import numpy; print(numpy.__version__)"],
187
+ text=True,
188
+ ).strip()
189
+ print(f" detected kernel numpy = {_kernel_numpy} (will pin)")
190
  with open("/tmp/ermap_constraints.txt", "w") as _cf:
191
+ _cf.write(f"torch==2.10.0\\ntorchvision==0.25.0\\nnumpy=={_kernel_numpy}\\n")
192
 
193
  # 3. Reinstall bitsandbytes against the now-pinned torch.
194
  # --no-deps because bnb just needs torch at RUNTIME (it dlopens torch's
 
213
  '"groq>=0.18.0" "huggingface_hub>=0.25.0" '
214
  '"gymnasium>=0.29.0" "openenv-core>=0.1.0"')
215
 
216
+ # 5b. Realign the pip-managed numpy with whatever the Kaggle kernel actually
217
+ # has loaded. This force-rewrites /usr/local/lib/.../numpy at the exact
218
+ # version reported by the running interpreter, so importlib.metadata
219
+ # and `numpy.__version__` agree even if Kaggle ships its base numpy at
220
+ # a different dist-packages path.
221
+ print(f"[5b/6] Realigning pip-managed numpy to {_kernel_numpy}...")
222
+ get_ipython().system(
223
+ f'pip install -q --force-reinstall --no-deps "numpy=={_kernel_numpy}"'
224
+ )
225
+
226
  # 6. Verify in a SUBPROCESS (so the parent kernel never imports any of these
227
  # while pip is mid-flight, which is what causes the
228
  # 'numpy was upgraded mid-session (loaded: X, installed: Y)' RuntimeError
kaggle/train_ermap_grpo_kaggle.ipynb CHANGED
@@ -124,8 +124,19 @@
124
  "# can NEVER pull a different torch from default PyPI. Without this, step 3's\n",
125
  "# `--force-reinstall bitsandbytes` and step 4's `unsloth` upgrade re-resolve\n",
126
  "# torch from PyPI (currently 2.11.0), which breaks the cu128 torchvision pair.\n",
 
 
 
 
 
 
 
 
 
 
 
127
  "with open(\"/tmp/ermap_constraints.txt\", \"w\") as _cf:\n",
128
- " _cf.write(\"torch==2.10.0\\ntorchvision==0.25.0\\n\")\n",
129
  "\n",
130
  "# 3. Reinstall bitsandbytes against the now-pinned torch.\n",
131
  "# --no-deps because bnb just needs torch at RUNTIME (it dlopens torch's\n",
@@ -150,6 +161,16 @@
150
  " '\"groq>=0.18.0\" \"huggingface_hub>=0.25.0\" '\n",
151
  " '\"gymnasium>=0.29.0\" \"openenv-core>=0.1.0\"')\n",
152
  "\n",
 
 
 
 
 
 
 
 
 
 
153
  "# 6. Verify in a SUBPROCESS (so the parent kernel never imports any of these\n",
154
  "# while pip is mid-flight, which is what causes the\n",
155
  "# 'numpy was upgraded mid-session (loaded: X, installed: Y)' RuntimeError\n",
 
124
  "# can NEVER pull a different torch from default PyPI. Without this, step 3's\n",
125
  "# `--force-reinstall bitsandbytes` and step 4's `unsloth` upgrade re-resolve\n",
126
  "# torch from PyPI (currently 2.11.0), which breaks the cu128 torchvision pair.\n",
127
+ "#\n",
128
+ "# Also pin numpy to whatever Kaggle's kernel already has loaded — Kaggle's\n",
129
+ "# image puts numpy at /usr/lib/python3/dist-packages while pip writes to\n",
130
+ "# /usr/local/lib/python3.12/dist-packages, so any version drift between the\n",
131
+ "# two paths trips unsloth_zoo's strict loaded-vs-installed check at import.\n",
132
+ "import subprocess as _sp\n",
133
+ "_kernel_numpy = _sp.check_output(\n",
134
+ " [sys.executable, \"-c\", \"import numpy; print(numpy.__version__)\"],\n",
135
+ " text=True,\n",
136
+ ").strip()\n",
137
+ "print(f\" detected kernel numpy = {_kernel_numpy} (will pin)\")\n",
138
  "with open(\"/tmp/ermap_constraints.txt\", \"w\") as _cf:\n",
139
+ " _cf.write(f\"torch==2.10.0\\ntorchvision==0.25.0\\nnumpy=={_kernel_numpy}\\n\")\n",
140
  "\n",
141
  "# 3. Reinstall bitsandbytes against the now-pinned torch.\n",
142
  "# --no-deps because bnb just needs torch at RUNTIME (it dlopens torch's\n",
 
161
  " '\"groq>=0.18.0\" \"huggingface_hub>=0.25.0\" '\n",
162
  " '\"gymnasium>=0.29.0\" \"openenv-core>=0.1.0\"')\n",
163
  "\n",
164
+ "# 5b. Realign the pip-managed numpy with whatever the Kaggle kernel actually\n",
165
+ "# has loaded. This force-rewrites /usr/local/lib/.../numpy at the exact\n",
166
+ "# version reported by the running interpreter, so importlib.metadata\n",
167
+ "# and `numpy.__version__` agree even if Kaggle ships its base numpy at\n",
168
+ "# a different dist-packages path.\n",
169
+ "print(f\"[5b/6] Realigning pip-managed numpy to {_kernel_numpy}...\")\n",
170
+ "get_ipython().system(\n",
171
+ " f'pip install -q --force-reinstall --no-deps \"numpy=={_kernel_numpy}\"'\n",
172
+ ")\n",
173
+ "\n",
174
  "# 6. Verify in a SUBPROCESS (so the parent kernel never imports any of these\n",
175
  "# while pip is mid-flight, which is what causes the\n",
176
  "# 'numpy was upgraded mid-session (loaded: X, installed: Y)' RuntimeError\n",