dp1812 committed
Commit e752fd3 · verified · 1 Parent(s): 7828436

fix: pin transformers stack and force slow tokenizer by default to avoid fast-tokenizer errors
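
As a quick check that the pinned stack actually took effect, a minimal sketch (assumes the install cell in this commit has already run; the expected versions are simply the ones pinned there):

    import transformers, tokenizers
    # Expected under this commit's pins: transformers 4.46.2, tokenizers 0.20.1
    print('transformers', transformers.__version__)
    print('tokenizers', tokenizers.__version__)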

Files changed (1)
  1. CELESTIAL_Training_Notebook.ipynb +56 -0
CELESTIAL_Training_Notebook.ipynb CHANGED
@@ -1,5 +1,61 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "# 🔐 Hugging Face Authentication for Google Colab\n",
+    "try:\n",
+    "    from google.colab import userdata\n",
+    "    import os\n",
+    "    hf_token = userdata.get('HF_TOKEN')\n",
+    "    os.environ['HUGGINGFACE_HUB_TOKEN'] = hf_token\n",
+    "    print('✅ HF token loaded from Colab secrets')\n",
+    "except ImportError:\n",
+    "    print('⚠️ Not running in Colab, skipping token setup')\n",
+    "except Exception as e:\n",
+    "    print(f'⚠️ Could not load HF_TOKEN from Colab secrets: {e}')\n",
+    "    print('💡 Add HF_TOKEN to Colab secrets: Secrets tab → Add new secret → Name: HF_TOKEN')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "# 🔧 Install pinned versions for stable training\n",
+    "!pip install -q transformers==4.46.2 tokenizers==0.20.1\n",
+    "!pip install -q peft==0.14.0 datasets==2.20.0 bitsandbytes==0.43.3 accelerate==0.34.2 huggingface_hub==0.24.6 trl==0.11.4\n",
+    "import os; os.environ['TOKENIZERS_PARALLELISM'] = 'false'\n",
+    "print('✅ Pinned HF stack installed')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "# 🩹 Force safe defaults to avoid fast-tokenizer and remote code import issues\n",
+    "from transformers import AutoTokenizer as _AutoTokenizer, AutoModelForCausalLM as _AutoModelForCausalLM\n",
+    "_orig_tok_from_pretrained = _AutoTokenizer.from_pretrained\n",
+    "def _patched_tok_from_pretrained(*args, **kwargs):\n",
+    "    kwargs.setdefault('use_fast', False)\n",
+    "    kwargs.setdefault('trust_remote_code', False)\n",
+    "    return _orig_tok_from_pretrained(*args, **kwargs)\n",
+    "_AutoTokenizer.from_pretrained = staticmethod(_patched_tok_from_pretrained)\n",
+    "\n",
+    "_orig_model_from_pretrained = _AutoModelForCausalLM.from_pretrained\n",
+    "def _patched_model_from_pretrained(*args, **kwargs):\n",
+    "    kwargs.setdefault('trust_remote_code', False)\n",
+    "    return _orig_model_from_pretrained(*args, **kwargs)\n",
+    "_AutoModelForCausalLM.from_pretrained = staticmethod(_patched_model_from_pretrained)\n",
+    "print('✅ Patched: AutoTokenizer(use_fast=False, trust_remote_code=False) and AutoModel(trust_remote_code=False) by default')\n"
+   ]
+  },
   {
    "cell_type": "code",
    "metadata": {},