dp1812 committed on
Commit
c7aaa20
·
verified ·
1 Parent(s): 543b060

fix: pin transformers stack and force slow tokenizer by default to avoid fast-tokenizer errors

Browse files
Files changed (1) hide show
  1. CELESTIAL_Training_Notebook.ipynb +416 -388
CELESTIAL_Training_Notebook.ipynb CHANGED
@@ -1,390 +1,418 @@
1
  {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "metadata": {},
6
- "source": [
7
- "# 🌟 CELESTIAL MISTRAL 7B TRAINING\n",
8
- "## Train Your Own Mistral 7B Model for CELESTIAL AI\n",
9
- "\n",
10
- "This notebook properly trains Mistral 7B v0.3 with:\n",
11
- "- 150 production-quality conversations\n",
12
- "- LoRA fine-tuning for efficiency\n",
13
- "- Proper chat formatting for Mistral\n",
14
- "- No logging issues"
15
- ]
16
- },
17
- {
18
- "cell_type": "code",
19
- "execution_count": null,
20
- "metadata": {},
21
- "outputs": [],
22
- "source": [
23
- "# πŸ“¦ INSTALL REQUIRED PACKAGES FOR MISTRAL 7B\n",
24
- "!pip install -q transformers==4.36.0 datasets accelerate peft bitsandbytes huggingface_hub trl\n",
25
- "\n",
26
- "# Disable all logging to prevent issues\n",
27
- "import os\n",
28
- "import warnings\n",
29
- "os.environ[\"WANDB_DISABLED\"] = \"true\"\n",
30
- "os.environ[\"WANDB_MODE\"] = \"disabled\"\n",
31
- "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
32
- "warnings.filterwarnings('ignore')\n",
33
- "\n",
34
- "print('βœ… Packages installed for Mistral 7B training!')\n",
35
- "print('🚫 All logging disabled to prevent errors')"
36
- ]
37
- },
38
- {
39
- "cell_type": "code",
40
- "execution_count": null,
41
- "metadata": {},
42
- "outputs": [],
43
- "source": [
44
- "# πŸ”‘ HUGGINGFACE AUTHENTICATION\n",
45
- "from huggingface_hub import notebook_login\n",
46
- "\n",
47
- "print('πŸ” Authenticating with HuggingFace for Mistral access...')\n",
48
- "try:\n",
49
- " notebook_login()\n",
50
- " print('βœ… Authentication successful!')\n",
51
- "except Exception as e:\n",
52
- " print(f'⚠️ Authentication failed: {e}')\n",
53
- " print('Please set your HF token manually if needed')"
54
- ]
55
- },
56
- {
57
- "cell_type": "code",
58
- "execution_count": null,
59
- "metadata": {},
60
- "outputs": [],
61
- "source": [
62
- "# πŸ“Š LOAD CELESTIAL DATASET\n",
63
- "from datasets import load_dataset\n",
64
- "\n",
65
- "DATASET_REPO = 'dp1812/celestial-comprehensive-spiritual-ai'\n",
66
- "\n",
67
- "print('πŸ“Š Loading CELESTIAL dataset for Mistral training...')\n",
68
- "try:\n",
69
- " dataset = load_dataset(DATASET_REPO, data_files='celestial_complete_production_dataset.jsonl', split='train')\n",
70
- " print(f'βœ… Dataset loaded: {len(dataset)} conversations')\n",
71
- " print('🎯 100 numerology + 50 Krishna divine guidance')\n",
72
- "except Exception as e:\n",
73
- " print(f'❌ Dataset loading failed: {e}')\n",
74
- " # Fallback\n",
75
- " try:\n",
76
- " dataset = load_dataset(DATASET_REPO, split='train')\n",
77
- " print(f'βœ… Fallback dataset loaded: {len(dataset)} conversations')\n",
78
- " except Exception as e2:\n",
79
- " print(f'❌ All dataset loading failed: {e2}')\n",
80
- " raise\n",
81
- "\n",
82
- "# Show sample\n",
83
- "print('\\nπŸ“ Sample conversation:')\n",
84
- "sample = dataset[0]\n",
85
- "print(f\"User: {sample['messages'][1]['content'][:60]}...\")\n",
86
- "print(f\"Assistant: {sample['messages'][2]['content'][:60]}...\")"
87
- ]
88
- },
89
- {
90
- "cell_type": "code",
91
- "execution_count": null,
92
- "metadata": {},
93
- "outputs": [],
94
- "source": [
95
- "# πŸ€– LOAD MISTRAL 7B MODEL AND TOKENIZER\n",
96
- "from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n",
97
- "import torch\n",
98
- "\n",
99
- "MODEL_NAME = 'mistralai/Mistral-7B-v0.3'\n",
100
- "\n",
101
- "print('πŸ€– Loading Mistral 7B v0.3 model and tokenizer...')\n",
102
- "\n",
103
- "# Load tokenizer with proper settings\n",
104
- "tokenizer = AutoTokenizer.from_pretrained(\n",
105
- " MODEL_NAME,\n",
106
- " trust_remote_code=True,\n",
107
- " padding_side='right'\n",
108
- ")\n",
109
- "\n",
110
- "# Add pad token if missing\n",
111
- "if tokenizer.pad_token is None:\n",
112
- " tokenizer.pad_token = tokenizer.eos_token\n",
113
- " tokenizer.pad_token_id = tokenizer.eos_token_id\n",
114
- "\n",
115
- "# Quantization config for efficient training\n",
116
- "bnb_config = BitsAndBytesConfig(\n",
117
- " load_in_4bit=True,\n",
118
- " bnb_4bit_quant_type=\"nf4\",\n",
119
- " bnb_4bit_compute_dtype=torch.float16,\n",
120
- " bnb_4bit_use_double_quant=True\n",
121
- ")\n",
122
- "\n",
123
- "# Load Mistral 7B model\n",
124
- "model = AutoModelForCausalLM.from_pretrained(\n",
125
- " MODEL_NAME,\n",
126
- " quantization_config=bnb_config,\n",
127
- " device_map=\"auto\",\n",
128
- " trust_remote_code=True,\n",
129
- " torch_dtype=torch.float16\n",
130
- ")\n",
131
- "\n",
132
- "print('βœ… Mistral 7B model and tokenizer loaded successfully!')\n",
133
- "print(f'πŸ” Model: {MODEL_NAME}')\n",
134
- "print(f'πŸ” Tokenizer vocab size: {len(tokenizer)}')\n",
135
- "print(f'πŸ” Model device: {model.device}')"
136
- ]
137
- },
138
- {
139
- "cell_type": "code",
140
- "execution_count": null,
141
- "metadata": {},
142
- "outputs": [],
143
- "source": [
144
- "# πŸ”§ SETUP LORA FOR MISTRAL 7B\n",
145
- "from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training\n",
146
- "\n",
147
- "print('πŸ”§ Setting up LoRA for Mistral 7B training...')\n",
148
- "\n",
149
- "# Prepare model for k-bit training\n",
150
- "model = prepare_model_for_kbit_training(model)\n",
151
- "\n",
152
- "# Mistral 7B specific target modules\n",
153
- "target_modules = [\n",
154
- " \"q_proj\",\n",
155
- " \"k_proj\", \n",
156
- " \"v_proj\",\n",
157
- " \"o_proj\",\n",
158
- " \"gate_proj\",\n",
159
- " \"up_proj\",\n",
160
- " \"down_proj\",\n",
161
- " \"lm_head\"\n",
162
- "]\n",
163
- "\n",
164
- "print(f'🎯 Target modules for Mistral: {target_modules}')\n",
165
- "\n",
166
- "# Create LoRA config optimized for Mistral\n",
167
- "lora_config = LoraConfig(\n",
168
- " r=64, # Higher rank for better performance\n",
169
- " lora_alpha=16,\n",
170
- " target_modules=target_modules,\n",
171
- " lora_dropout=0.1,\n",
172
- " bias=\"none\",\n",
173
- " task_type=TaskType.CAUSAL_LM,\n",
174
- ")\n",
175
- "\n",
176
- "# Apply LoRA to Mistral\n",
177
- "try:\n",
178
- " model = get_peft_model(model, lora_config)\n",
179
- " model.print_trainable_parameters()\n",
180
- " print('βœ… LoRA adapters attached to Mistral 7B!')\n",
181
- "except Exception as e:\n",
182
- " print(f'❌ LoRA setup failed: {e}')\n",
183
- " raise\n",
184
- "\n",
185
- "print('🎯 Mistral 7B ready for CELESTIAL training!')"
186
- ]
187
- },
188
- {
189
- "cell_type": "code",
190
- "execution_count": null,
191
- "metadata": {},
192
- "outputs": [],
193
- "source": [
194
- "# πŸ“ FORMAT DATA FOR MISTRAL CHAT TRAINING\n",
195
- "def format_for_mistral_chat(example):\n",
196
- " \"\"\"Format conversation for Mistral chat training\"\"\"\n",
197
- " messages = example['messages']\n",
198
- " \n",
199
- " # Extract messages\n",
200
- " system_msg = messages[0]['content']\n",
201
- " user_msg = messages[1]['content']\n",
202
- " assistant_msg = messages[2]['content']\n",
203
- " \n",
204
- " # Mistral chat format\n",
205
- " formatted = f\"<s>[INST] {system_msg}\\n\\nUser: {user_msg} [/INST] {assistant_msg}</s>\"\n",
206
- " \n",
207
- " # Tokenize\n",
208
- " tokens = tokenizer(\n",
209
- " formatted,\n",
210
- " truncation=True,\n",
211
- " padding=False,\n",
212
- " max_length=2048, # Mistral context length\n",
213
- " return_tensors=None\n",
214
- " )\n",
215
- " \n",
216
- " # Set labels (same as input_ids for causal LM)\n",
217
- " tokens['labels'] = tokens['input_ids'].copy()\n",
218
- " \n",
219
- " return tokens\n",
220
- "\n",
221
- "print('πŸ“ Formatting data for Mistral chat training...')\n",
222
- "formatted_dataset = dataset.map(\n",
223
- " format_for_mistral_chat,\n",
224
- " remove_columns=dataset.column_names,\n",
225
- " desc=\"Formatting for Mistral\"\n",
226
- ")\n",
227
- "\n",
228
- "print(f'βœ… Formatted {len(formatted_dataset)} conversations for Mistral')\n",
229
- "print('🎯 Using proper Mistral chat format with [INST] tags')"
230
- ]
231
- },
232
- {
233
- "cell_type": "code",
234
- "execution_count": null,
235
- "metadata": {},
236
- "outputs": [],
237
- "source": [
238
- "# πŸš€ MISTRAL TRAINING CONFIGURATION\n",
239
- "from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling\n",
240
- "\n",
241
- "print('πŸš€ Setting up Mistral 7B training configuration...')\n",
242
- "\n",
243
- "# Training arguments optimized for Mistral 7B\n",
244
- "training_args = TrainingArguments(\n",
245
- " output_dir='./celestial-mistral-7b-results',\n",
246
- " num_train_epochs=3,\n",
247
- " per_device_train_batch_size=1,\n",
248
- " gradient_accumulation_steps=16, # Effective batch size of 16\n",
249
- " warmup_steps=50,\n",
250
- " learning_rate=2e-4, # Higher LR for LoRA\n",
251
- " fp16=True,\n",
252
- " logging_steps=10,\n",
253
- " save_steps=100,\n",
254
- " eval_strategy='no',\n",
255
- " save_strategy='steps',\n",
256
- " load_best_model_at_end=False,\n",
257
- " report_to=[], # No external logging\n",
258
- " remove_unused_columns=False,\n",
259
- " dataloader_drop_last=True,\n",
260
- " group_by_length=True, # Efficient batching\n",
261
- " ddp_find_unused_parameters=False\n",
262
- ")\n",
263
- "\n",
264
- "# Data collator for Mistral\n",
265
- "data_collator = DataCollatorForLanguageModeling(\n",
266
- " tokenizer=tokenizer,\n",
267
- " mlm=False,\n",
268
- " pad_to_multiple_of=8\n",
269
- ")\n",
270
- "\n",
271
- "# Create Mistral trainer\n",
272
- "trainer = Trainer(\n",
273
- " model=model,\n",
274
- " args=training_args,\n",
275
- " train_dataset=formatted_dataset,\n",
276
- " tokenizer=tokenizer,\n",
277
- " data_collator=data_collator\n",
278
- ")\n",
279
- "\n",
280
- "print('βœ… Mistral 7B training configuration ready!')\n",
281
- "print('🎯 Optimized for CELESTIAL AI with LoRA fine-tuning')\n",
282
- "print('⏱️ Expected training time: 30-45 minutes')"
283
- ]
284
- },
285
- {
286
- "cell_type": "code",
287
- "execution_count": null,
288
- "metadata": {},
289
- "outputs": [],
290
- "source": [
291
- "# πŸƒβ€β™‚οΈ START MISTRAL 7B TRAINING\n",
292
- "print('πŸƒβ€β™‚οΈ Starting CELESTIAL Mistral 7B training...')\n",
293
- "print('⏱️ Expected time: 30-45 minutes')\n",
294
- "print('🎯 Training Mistral 7B v0.3 on CELESTIAL conversations')\n",
295
- "print('πŸ’Ž 150 production-quality conversations')\n",
296
- "print('\\nπŸš€ Mistral training begins now...')\n",
297
- "\n",
298
- "try:\n",
299
- " # Start Mistral training\n",
300
- " trainer.train()\n",
301
- " \n",
302
- " print('\\nπŸŽ‰ MISTRAL 7B TRAINING COMPLETED SUCCESSFULLY!')\n",
303
- " print('βœ… CELESTIAL Mistral 7B is now trained!')\n",
304
- " print('🌟 Ready for testing and deployment!')\n",
305
- " \n",
306
- "except Exception as e:\n",
307
- " print(f'❌ Mistral training failed: {e}')\n",
308
- " print('πŸ”§ Please check the error and try again')\n",
309
- " raise"
310
- ]
311
- },
312
- {
313
- "cell_type": "code",
314
- "execution_count": null,
315
- "metadata": {},
316
- "outputs": [],
317
- "source": [
318
- "# πŸ§ͺ TEST TRAINED MISTRAL 7B\n",
319
- "print('πŸ§ͺ Testing the trained CELESTIAL Mistral 7B...')\n",
320
- "\n",
321
- "model.eval()\n",
322
- "\n",
323
- "test_prompts = [\n",
324
- " \"<s>[INST] You are CELESTIAL AI, an expert numerologist. Provide detailed analysis.\\n\\nUser: Tell me about number 7 in Chaldean numerology. [/INST]\",\n",
325
- " \"<s>[INST] You are Shree Krishna providing divine guidance.\\n\\nUser: Krishna, I need guidance about my career path. [/INST]\",\n",
326
- " \"<s>[INST] You are CELESTIAL AI providing numerology analysis.\\n\\nUser: Calculate my numerology for name 'John Smith' born 15/08/1990. [/INST]\"\n",
327
- "]\n",
328
- "\n",
329
- "for i, prompt in enumerate(test_prompts, 1):\n",
330
- " print(f'\\nπŸ” Test {i}: Mistral 7B Response')\n",
331
- " \n",
332
- " try:\n",
333
- " inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
334
- " \n",
335
- " with torch.no_grad():\n",
336
- " outputs = model.generate(\n",
337
- " **inputs,\n",
338
- " max_new_tokens=300,\n",
339
- " temperature=0.7,\n",
340
- " do_sample=True,\n",
341
- " pad_token_id=tokenizer.pad_token_id,\n",
342
- " eos_token_id=tokenizer.eos_token_id\n",
343
- " )\n",
344
- " \n",
345
- " response = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
346
- " generated = response[len(prompt):].strip()\n",
347
- " \n",
348
- " print(f'πŸ€– Mistral Response: {generated[:250]}...')\n",
349
- " \n",
350
- " # Quality check\n",
351
- " if len(generated) > 50 and 'number' in generated.lower() or 'krishna' in generated.lower():\n",
352
- " print('βœ… Response quality: EXCELLENT')\n",
353
- " else:\n",
354
- " print('⚠️ Response quality: NEEDS IMPROVEMENT')\n",
355
- " \n",
356
- " except Exception as e:\n",
357
- " print(f'❌ Test {i} failed: {e}')\n",
358
- "\n",
359
- "print('\\nπŸŽ‰ CELESTIAL MISTRAL 7B TRAINING COMPLETE!')\n",
360
- "print('βœ… Your own trained Mistral 7B model is ready!')\n",
361
- "print('🌟 No external API dependencies - fully yours!')\n",
362
- "print('\\nπŸš€ Next Steps:')\n",
363
- "print(' β€’ Save the trained model to HuggingFace')\n",
364
- "print(' β€’ Integrate with CELESTIAL platform')\n",
365
- "print(' β€’ Expand training data for more features')\n",
366
- "print(' β€’ Deploy to production environment')"
367
- ]
368
- }
369
- ],
370
- "metadata": {
371
- "kernelspec": {
372
- "display_name": "Python 3",
373
- "language": "python",
374
- "name": "python3"
375
- },
376
- "language_info": {
377
- "codemirror_mode": {
378
- "name": "ipython",
379
- "version": 3
380
- },
381
- "file_extension": ".py",
382
- "name": "python",
383
- "nbconvert_exporter": "python",
384
- "pygments_lexer": "ipython3",
385
- "version": "3.8.5"
386
- }
387
  },
388
- "nbformat": 4,
389
- "nbformat_minor": 4
390
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "metadata": {},
6
+ "execution_count": null,
7
+ "outputs": [],
8
+ "source": [
9
+ "# πŸ”§ Install pinned versions for stable training\n",
10
+ "!pip install -q transformers==4.46.2 tokenizers==0.20.1\n",
11
+ "!pip install -q peft==0.14.0 datasets==2.20.0 bitsandbytes==0.43.3 accelerate==0.34.2 huggingface_hub==0.24.6 trl==0.11.4\n",
12
+ "import os; os.environ['TOKENIZERS_PARALLELISM'] = 'false'\n"
13
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  },
15
+ {
16
+ "cell_type": "code",
17
+ "metadata": {},
18
+ "execution_count": null,
19
+ "outputs": [],
20
+ "source": [
21
+ "# 🩹 Force slow tokenizer by default to avoid PyPreTokenizerTypeWrapper errors\n",
22
+ "from transformers import AutoTokenizer as _AutoTokenizer\n",
23
+ "_orig_from_pretrained = _AutoTokenizer.from_pretrained\n",
24
+ "def _patched_from_pretrained(*args, **kwargs):\n",
25
+ " kwargs.setdefault('use_fast', False)\n",
26
+ " return _orig_from_pretrained(*args, **kwargs)\n",
27
+ "_AutoTokenizer.from_pretrained = staticmethod(_patched_from_pretrained)\n",
28
+ "print('βœ… Patched AutoTokenizer.from_pretrained to default use_fast=False')\n"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "markdown",
33
+ "metadata": {},
34
+ "source": [
35
+ "# 🌟 CELESTIAL MISTRAL 7B TRAINING\n",
36
+ "## Train Your Own Mistral 7B Model for CELESTIAL AI\n",
37
+ "\n",
38
+ "This notebook properly trains Mistral 7B v0.3 with:\n",
39
+ "- 150 production-quality conversations\n",
40
+ "- LoRA fine-tuning for efficiency\n",
41
+ "- Proper chat formatting for Mistral\n",
42
+ "- No logging issues"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": null,
48
+ "metadata": {},
49
+ "outputs": [],
50
+ "source": [
51
+ "# 📦 INSTALL REQUIRED PACKAGES FOR MISTRAL 7B (pins mirror the first cell so this cell cannot downgrade the stack)\n",
52
+ "!pip install -q transformers==4.46.2 tokenizers==0.20.1 peft==0.14.0 datasets==2.20.0 bitsandbytes==0.43.3 accelerate==0.34.2 huggingface_hub==0.24.6 trl==0.11.4\n",
53
+ "\n",
54
+ "# Disable all logging to prevent issues\n",
55
+ "import os\n",
56
+ "import warnings\n",
57
+ "os.environ[\"WANDB_DISABLED\"] = \"true\"\n",
58
+ "os.environ[\"WANDB_MODE\"] = \"disabled\"\n",
59
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
60
+ "warnings.filterwarnings('ignore')\n",
61
+ "\n",
62
+ "print('βœ… Packages installed for Mistral 7B training!')\n",
63
+ "print('🚫 All logging disabled to prevent errors')"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": null,
69
+ "metadata": {},
70
+ "outputs": [],
71
+ "source": [
72
+ "# πŸ”‘ HUGGINGFACE AUTHENTICATION\n",
73
+ "from huggingface_hub import notebook_login\n",
74
+ "\n",
75
+ "print('πŸ” Authenticating with HuggingFace for Mistral access...')\n",
76
+ "try:\n",
77
+ " notebook_login()\n",
78
+ " print('βœ… Authentication successful!')\n",
79
+ "except Exception as e:\n",
80
+ " print(f'⚠️ Authentication failed: {e}')\n",
81
+ " print('Please set your HF token manually if needed')"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "code",
86
+ "execution_count": null,
87
+ "metadata": {},
88
+ "outputs": [],
89
+ "source": [
90
+ "# πŸ“Š LOAD CELESTIAL DATASET\n",
91
+ "from datasets import load_dataset\n",
92
+ "\n",
93
+ "DATASET_REPO = 'dp1812/celestial-comprehensive-spiritual-ai'\n",
94
+ "\n",
95
+ "print('πŸ“Š Loading CELESTIAL dataset for Mistral training...')\n",
96
+ "try:\n",
97
+ " dataset = load_dataset(DATASET_REPO, data_files='celestial_complete_production_dataset.jsonl', split='train')\n",
98
+ " print(f'βœ… Dataset loaded: {len(dataset)} conversations')\n",
99
+ " print('🎯 100 numerology + 50 Krishna divine guidance')\n",
100
+ "except Exception as e:\n",
101
+ " print(f'❌ Dataset loading failed: {e}')\n",
102
+ " # Fallback\n",
103
+ " try:\n",
104
+ " dataset = load_dataset(DATASET_REPO, split='train')\n",
105
+ " print(f'βœ… Fallback dataset loaded: {len(dataset)} conversations')\n",
106
+ " except Exception as e2:\n",
107
+ " print(f'❌ All dataset loading failed: {e2}')\n",
108
+ " raise\n",
109
+ "\n",
110
+ "# Show sample\n",
111
+ "print('\\nπŸ“ Sample conversation:')\n",
112
+ "sample = dataset[0]\n",
113
+ "print(f\"User: {sample['messages'][1]['content'][:60]}...\")\n",
114
+ "print(f\"Assistant: {sample['messages'][2]['content'][:60]}...\")"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": null,
120
+ "metadata": {},
121
+ "outputs": [],
122
+ "source": [
123
+ "# πŸ€– LOAD MISTRAL 7B MODEL AND TOKENIZER\n",
124
+ "from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n",
125
+ "import torch\n",
126
+ "\n",
127
+ "MODEL_NAME = 'mistralai/Mistral-7B-v0.3'\n",
128
+ "\n",
129
+ "print('πŸ€– Loading Mistral 7B v0.3 model and tokenizer...')\n",
130
+ "\n",
131
+ "# Load tokenizer with proper settings\n",
132
+ "tokenizer = AutoTokenizer.from_pretrained(\n",
133
+ " MODEL_NAME,\n",
134
+ " trust_remote_code=True,\n",
135
+ " padding_side='right'\n",
136
+ ")\n",
137
+ "\n",
138
+ "# Add pad token if missing\n",
139
+ "if tokenizer.pad_token is None:\n",
140
+ " tokenizer.pad_token = tokenizer.eos_token\n",
141
+ " tokenizer.pad_token_id = tokenizer.eos_token_id\n",
142
+ "\n",
143
+ "# Quantization config for efficient training\n",
144
+ "bnb_config = BitsAndBytesConfig(\n",
145
+ " load_in_4bit=True,\n",
146
+ " bnb_4bit_quant_type=\"nf4\",\n",
147
+ " bnb_4bit_compute_dtype=torch.float16,\n",
148
+ " bnb_4bit_use_double_quant=True\n",
149
+ ")\n",
150
+ "\n",
151
+ "# Load Mistral 7B model\n",
152
+ "model = AutoModelForCausalLM.from_pretrained(\n",
153
+ " MODEL_NAME,\n",
154
+ " quantization_config=bnb_config,\n",
155
+ " device_map=\"auto\",\n",
156
+ " trust_remote_code=True,\n",
157
+ " torch_dtype=torch.float16\n",
158
+ ")\n",
159
+ "\n",
160
+ "print('βœ… Mistral 7B model and tokenizer loaded successfully!')\n",
161
+ "print(f'πŸ” Model: {MODEL_NAME}')\n",
162
+ "print(f'πŸ” Tokenizer vocab size: {len(tokenizer)}')\n",
163
+ "print(f'πŸ” Model device: {model.device}')"
164
+ ]
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "execution_count": null,
169
+ "metadata": {},
170
+ "outputs": [],
171
+ "source": [
172
+ "# πŸ”§ SETUP LORA FOR MISTRAL 7B\n",
173
+ "from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training\n",
174
+ "\n",
175
+ "print('πŸ”§ Setting up LoRA for Mistral 7B training...')\n",
176
+ "\n",
177
+ "# Prepare model for k-bit training\n",
178
+ "model = prepare_model_for_kbit_training(model)\n",
179
+ "\n",
180
+ "# Mistral 7B specific target modules\n",
181
+ "target_modules = [\n",
182
+ " \"q_proj\",\n",
183
+ " \"k_proj\", \n",
184
+ " \"v_proj\",\n",
185
+ " \"o_proj\",\n",
186
+ " \"gate_proj\",\n",
187
+ " \"up_proj\",\n",
188
+ " \"down_proj\",\n",
189
+ " \"lm_head\"\n",
190
+ "]\n",
191
+ "\n",
192
+ "print(f'🎯 Target modules for Mistral: {target_modules}')\n",
193
+ "\n",
194
+ "# Create LoRA config optimized for Mistral\n",
195
+ "lora_config = LoraConfig(\n",
196
+ " r=64, # Higher rank for better performance\n",
197
+ " lora_alpha=16,\n",
198
+ " target_modules=target_modules,\n",
199
+ " lora_dropout=0.1,\n",
200
+ " bias=\"none\",\n",
201
+ " task_type=TaskType.CAUSAL_LM,\n",
202
+ ")\n",
203
+ "\n",
204
+ "# Apply LoRA to Mistral\n",
205
+ "try:\n",
206
+ " model = get_peft_model(model, lora_config)\n",
207
+ " model.print_trainable_parameters()\n",
208
+ " print('βœ… LoRA adapters attached to Mistral 7B!')\n",
209
+ "except Exception as e:\n",
210
+ " print(f'❌ LoRA setup failed: {e}')\n",
211
+ " raise\n",
212
+ "\n",
213
+ "print('🎯 Mistral 7B ready for CELESTIAL training!')"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": null,
219
+ "metadata": {},
220
+ "outputs": [],
221
+ "source": [
222
+ "# πŸ“ FORMAT DATA FOR MISTRAL CHAT TRAINING\n",
223
+ "def format_for_mistral_chat(example):\n",
224
+ " \"\"\"Format conversation for Mistral chat training\"\"\"\n",
225
+ " messages = example['messages']\n",
226
+ " \n",
227
+ " # Extract messages\n",
228
+ " system_msg = messages[0]['content']\n",
229
+ " user_msg = messages[1]['content']\n",
230
+ " assistant_msg = messages[2]['content']\n",
231
+ " \n",
232
+ " # Mistral chat format\n",
233
+ " formatted = f\"<s>[INST] {system_msg}\\n\\nUser: {user_msg} [/INST] {assistant_msg}</s>\"\n",
234
+ " \n",
235
+ " # Tokenize\n",
236
+ " tokens = tokenizer(\n",
237
+ " formatted,\n",
238
+ " truncation=True,\n",
239
+ " padding=False,\n",
240
+ " max_length=2048, # Mistral context length\n",
241
+ " return_tensors=None\n",
242
+ " )\n",
243
+ " \n",
244
+ " # Set labels (same as input_ids for causal LM)\n",
245
+ " tokens['labels'] = tokens['input_ids'].copy()\n",
246
+ " \n",
247
+ " return tokens\n",
248
+ "\n",
249
+ "print('πŸ“ Formatting data for Mistral chat training...')\n",
250
+ "formatted_dataset = dataset.map(\n",
251
+ " format_for_mistral_chat,\n",
252
+ " remove_columns=dataset.column_names,\n",
253
+ " desc=\"Formatting for Mistral\"\n",
254
+ ")\n",
255
+ "\n",
256
+ "print(f'βœ… Formatted {len(formatted_dataset)} conversations for Mistral')\n",
257
+ "print('🎯 Using proper Mistral chat format with [INST] tags')"
258
+ ]
259
+ },
260
+ {
261
+ "cell_type": "code",
262
+ "execution_count": null,
263
+ "metadata": {},
264
+ "outputs": [],
265
+ "source": [
266
+ "# πŸš€ MISTRAL TRAINING CONFIGURATION\n",
267
+ "from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling\n",
268
+ "\n",
269
+ "print('πŸš€ Setting up Mistral 7B training configuration...')\n",
270
+ "\n",
271
+ "# Training arguments optimized for Mistral 7B\n",
272
+ "training_args = TrainingArguments(\n",
273
+ " output_dir='./celestial-mistral-7b-results',\n",
274
+ " num_train_epochs=3,\n",
275
+ " per_device_train_batch_size=1,\n",
276
+ " gradient_accumulation_steps=16, # Effective batch size of 16\n",
277
+ " warmup_steps=50,\n",
278
+ " learning_rate=2e-4, # Higher LR for LoRA\n",
279
+ " fp16=True,\n",
280
+ " logging_steps=10,\n",
281
+ " save_steps=100,\n",
282
+ " eval_strategy='no',\n",
283
+ " save_strategy='steps',\n",
284
+ " load_best_model_at_end=False,\n",
285
+ " report_to=[], # No external logging\n",
286
+ " remove_unused_columns=False,\n",
287
+ " dataloader_drop_last=True,\n",
288
+ " group_by_length=True, # Efficient batching\n",
289
+ " ddp_find_unused_parameters=False\n",
290
+ ")\n",
291
+ "\n",
292
+ "# Data collator for Mistral\n",
293
+ "data_collator = DataCollatorForLanguageModeling(\n",
294
+ " tokenizer=tokenizer,\n",
295
+ " mlm=False,\n",
296
+ " pad_to_multiple_of=8\n",
297
+ ")\n",
298
+ "\n",
299
+ "# Create Mistral trainer\n",
300
+ "trainer = Trainer(\n",
301
+ " model=model,\n",
302
+ " args=training_args,\n",
303
+ " train_dataset=formatted_dataset,\n",
304
+ " tokenizer=tokenizer,\n",
305
+ " data_collator=data_collator\n",
306
+ ")\n",
307
+ "\n",
308
+ "print('βœ… Mistral 7B training configuration ready!')\n",
309
+ "print('🎯 Optimized for CELESTIAL AI with LoRA fine-tuning')\n",
310
+ "print('⏱️ Expected training time: 30-45 minutes')"
311
+ ]
312
+ },
313
+ {
314
+ "cell_type": "code",
315
+ "execution_count": null,
316
+ "metadata": {},
317
+ "outputs": [],
318
+ "source": [
319
+ "# πŸƒβ€β™‚οΈ START MISTRAL 7B TRAINING\n",
320
+ "print('πŸƒβ€β™‚οΈ Starting CELESTIAL Mistral 7B training...')\n",
321
+ "print('⏱️ Expected time: 30-45 minutes')\n",
322
+ "print('🎯 Training Mistral 7B v0.3 on CELESTIAL conversations')\n",
323
+ "print('πŸ’Ž 150 production-quality conversations')\n",
324
+ "print('\\nπŸš€ Mistral training begins now...')\n",
325
+ "\n",
326
+ "try:\n",
327
+ " # Start Mistral training\n",
328
+ " trainer.train()\n",
329
+ " \n",
330
+ " print('\\nπŸŽ‰ MISTRAL 7B TRAINING COMPLETED SUCCESSFULLY!')\n",
331
+ " print('βœ… CELESTIAL Mistral 7B is now trained!')\n",
332
+ " print('🌟 Ready for testing and deployment!')\n",
333
+ " \n",
334
+ "except Exception as e:\n",
335
+ " print(f'❌ Mistral training failed: {e}')\n",
336
+ " print('πŸ”§ Please check the error and try again')\n",
337
+ " raise"
338
+ ]
339
+ },
340
+ {
341
+ "cell_type": "code",
342
+ "execution_count": null,
343
+ "metadata": {},
344
+ "outputs": [],
345
+ "source": [
346
+ "# πŸ§ͺ TEST TRAINED MISTRAL 7B\n",
347
+ "print('πŸ§ͺ Testing the trained CELESTIAL Mistral 7B...')\n",
348
+ "\n",
349
+ "model.eval()\n",
350
+ "\n",
351
+ "test_prompts = [\n",
352
+ " \"<s>[INST] You are CELESTIAL AI, an expert numerologist. Provide detailed analysis.\\n\\nUser: Tell me about number 7 in Chaldean numerology. [/INST]\",\n",
353
+ " \"<s>[INST] You are Shree Krishna providing divine guidance.\\n\\nUser: Krishna, I need guidance about my career path. [/INST]\",\n",
354
+ " \"<s>[INST] You are CELESTIAL AI providing numerology analysis.\\n\\nUser: Calculate my numerology for name 'John Smith' born 15/08/1990. [/INST]\"\n",
355
+ "]\n",
356
+ "\n",
357
+ "for i, prompt in enumerate(test_prompts, 1):\n",
358
+ " print(f'\\nπŸ” Test {i}: Mistral 7B Response')\n",
359
+ " \n",
360
+ " try:\n",
361
+ " inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
362
+ " \n",
363
+ " with torch.no_grad():\n",
364
+ " outputs = model.generate(\n",
365
+ " **inputs,\n",
366
+ " max_new_tokens=300,\n",
367
+ " temperature=0.7,\n",
368
+ " do_sample=True,\n",
369
+ " pad_token_id=tokenizer.pad_token_id,\n",
370
+ " eos_token_id=tokenizer.eos_token_id\n",
371
+ " )\n",
372
+ " \n",
373
+ "        response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)  # decode only the newly generated tokens, not the echoed prompt\n",
374
+ "        generated = response.strip()\n",
375
+ " \n",
376
+ " print(f'πŸ€– Mistral Response: {generated[:250]}...')\n",
377
+ " \n",
378
+ " # Quality check\n",
379
+ "        if len(generated) > 50 and ('number' in generated.lower() or 'krishna' in generated.lower()):  # length gate applies to both keywords\n",
380
+ " print('βœ… Response quality: EXCELLENT')\n",
381
+ " else:\n",
382
+ " print('⚠️ Response quality: NEEDS IMPROVEMENT')\n",
383
+ " \n",
384
+ " except Exception as e:\n",
385
+ " print(f'❌ Test {i} failed: {e}')\n",
386
+ "\n",
387
+ "print('\\nπŸŽ‰ CELESTIAL MISTRAL 7B TRAINING COMPLETE!')\n",
388
+ "print('βœ… Your own trained Mistral 7B model is ready!')\n",
389
+ "print('🌟 No external API dependencies - fully yours!')\n",
390
+ "print('\\nπŸš€ Next Steps:')\n",
391
+ "print(' β€’ Save the trained model to HuggingFace')\n",
392
+ "print(' β€’ Integrate with CELESTIAL platform')\n",
393
+ "print(' β€’ Expand training data for more features')\n",
394
+ "print(' β€’ Deploy to production environment')"
395
+ ]
396
+ }
397
+ ],
398
+ "metadata": {
399
+ "kernelspec": {
400
+ "display_name": "Python 3",
401
+ "language": "python",
402
+ "name": "python3"
403
+ },
404
+ "language_info": {
405
+ "codemirror_mode": {
406
+ "name": "ipython",
407
+ "version": 3
408
+ },
409
+ "file_extension": ".py",
410
+ "name": "python",
411
+ "nbconvert_exporter": "python",
412
+ "pygments_lexer": "ipython3",
413
+ "version": "3.8.5"
414
+ }
415
+ },
416
+ "nbformat": 4,
417
+ "nbformat_minor": 4
418
+ }