dp1812 committed on
Commit
9cffd85
·
verified ·
1 Parent(s): 4d8db9e

fix: pin transformers stack and force slow tokenizer by default to avoid fast-tokenizer errors

Browse files
Files changed (1) hide show
  1. CELESTIAL_Training_Notebook.ipynb +58 -0
CELESTIAL_Training_Notebook.ipynb CHANGED
@@ -1,5 +1,63 @@
1
  {
2
  "cells": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  {
4
  "cell_type": "code",
5
  "metadata": {},
 
1
  {
2
  "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "metadata": {},
6
+ "execution_count": null,
7
+ "outputs": [],
8
+ "source": [
9
+ "# 🔐 Hugging Face Authentication for Google Colab\n",
10
+ "try:\n",
11
+ " from google.colab import userdata\n",
12
+ " import os\n",
13
+ " hf_token = userdata.get('HF_TOKEN')\n",
14
+ " os.environ['HUGGINGFACE_HUB_TOKEN'] = hf_token\n",
15
+ " print('✅ HF token loaded from Colab secrets')\n",
16
+ "except ImportError:\n",
17
+ " print('⚠️ Not running in Colab, skipping token setup')\n",
18
+ "except Exception as e:\n",
19
+ " print(f'⚠️ Could not load HF_TOKEN from Colab secrets: {e}')\n",
20
+ " print('💡 Add HF_TOKEN to Colab secrets: Secrets tab → Add new secret → Name: HF_TOKEN')\n"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "metadata": {},
26
+ "execution_count": null,
27
+ "outputs": [],
28
+ "source": [
29
+ "# 🔧 Install compatible versions for stable training\n",
30
+ "!pip install -q 'transformers>=4.36.0' 'tokenizers>=0.15.0'\n",
31
+ "!pip install -q 'peft>=0.8.0' 'datasets>=2.16.0' 'bitsandbytes>=0.42.0' 'accelerate>=0.26.0' huggingface_hub trl\n",
32
+ "import os; os.environ['TOKENIZERS_PARALLELISM'] = 'false'\n",
33
+ "print('✅ Compatible HF stack installed')\n"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "metadata": {},
39
+ "execution_count": null,
40
+ "outputs": [],
41
+ "source": [
42
+ "# 🛡️ Safe loading functions to avoid tokenizer and import errors\n",
43
+ "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
44
+ "\n",
45
+ "def safe_load_tokenizer(model_name, **kwargs):\n",
46
+ " \"\"\"Load tokenizer with safe defaults\"\"\"\n",
47
+ " kwargs.setdefault('use_fast', False)\n",
48
+ " kwargs.setdefault('trust_remote_code', False)\n",
49
+ " return AutoTokenizer.from_pretrained(model_name, **kwargs)\n",
50
+ "\n",
51
+ "def safe_load_model(model_name, **kwargs):\n",
52
+ " \"\"\"Load model with safe defaults\"\"\"\n",
53
+ " kwargs.setdefault('trust_remote_code', False)\n",
54
+ " return AutoModelForCausalLM.from_pretrained(model_name, **kwargs)\n",
55
+ "\n",
56
+ "print('✅ Safe loading functions ready')\n",
57
+ "print('💡 Use: tokenizer = safe_load_tokenizer(MODEL_NAME)')\n",
58
+ "print('💡 Use: model = safe_load_model(MODEL_NAME, quantization_config=bnb_config, device_map=\"auto\")')\n"
59
+ ]
60
+ },
61
  {
62
  "cell_type": "code",
63
  "metadata": {},