algorythmtechnologies committed on
Commit
7a05dc1
·
verified ·
1 Parent(s): 064be48

Upload Zenith model files using large folder upload

Browse files
Files changed (2) hide show
  1. README.md +27 -0
  2. finetune_zenith.py +350 -0
README.md CHANGED
@@ -1,3 +1,30 @@
1
  ---
2
  license: apache-2.0
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: apache-2.0
3
+ tags:
4
+ - zenith
5
+ - lora
6
+ - finetune
7
+ - conversational
8
+ - coding-agent
9
+ library_name: transformers
10
  ---
11
+
12
+ # Zenith: AlgoRythm Technologies Autonomous Coding Agent
13
+
14
+ Zenith is a fine-tuned conversational coding agent based on DeepSeek, enhanced with LoRA for efficient and fast adaptation. It is designed to be a flagship autonomous coding partner, blending technical expertise, philosophical curiosity, and collaborative mentorship.
15
+
16
+ ## Model Details
17
+ - Base: DeepSeek (7B or as specified)
18
+ - Fine-tuning: LoRA (PEFT)
19
+ - Data: Custom conversational coding dataset
20
+
21
+ ## Intended Use
22
+ - Coding assistance
23
+ - Technical Q&A
24
+ - Mentorship and collaborative problem solving
25
+
26
+ ## Training
27
+ See `finetune_zenith.py` for full training and evaluation pipeline.
28
+
29
+ ## License
30
+ Apache 2.0
finetune_zenith.py ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os
import random

import numpy as np
import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    prepare_model_for_kbit_training,
)
# BUGFIX: HfApi and HfFolder are part of huggingface_hub, not transformers;
# importing them from transformers raises ImportError on current versions.
# (Duplicate HfApi import also merged into this single line.)
from huggingface_hub import login as hf_login, HfApi, HfFolder
24
+
25
# Configuration
MODEL_NAME = "./deepseek-model"  # local path to the base DeepSeek checkpoint
OUTPUT_DIR = "./zenith-model"  # where adapter weights, tokenizer and metrics are written
DATASET_FILE = "zenith_training_data.json"  # custom conversational dataset (JSON list of {"conversations": [...]})
29
+
30
def load_and_prepare_data():
    """Read the JSON training file and wrap its conversations in a Dataset.

    Returns a `datasets.Dataset` with a single "conversations" column,
    one entry per training example.
    """
    print("Loading training data...")

    # Parse the custom dataset from disk.
    with open(DATASET_FILE, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    # Keep only the conversation turns from each record.
    convo_column = [record["conversations"] for record in records]

    return Dataset.from_dict({"conversations": convo_column})
45
+
46
def format_conversation(example, tokenizer):
    """Render one conversation as ChatML-style text and tokenize it.

    Only "system", "user" and "assistant" turns are emitted; any other
    role is silently skipped (same as the original branch chain).
    Returns the tokenized encoding with `labels` mirroring `input_ids`.
    """
    pieces = []
    for turn in example["conversations"]:
        role = turn["role"]
        if role in ("system", "user", "assistant"):
            pieces.append(f"<|im_start|>{role}\n{turn['content']}<|im_end|>\n")
    text = "".join(pieces)

    encoded = tokenizer(
        text,
        truncation=True,
        max_length=4096,
        padding=False,
    )

    # Causal-LM objective: the model predicts its own input, so the labels
    # are a copy of the input token ids.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded
72
+
73
def setup_model_and_tokenizer():
    """Build the 4-bit quantized base model with LoRA adapters, plus tokenizer.

    Returns a `(model, tokenizer)` pair ready for Trainer-based fine-tuning.
    """
    print("Loading model and tokenizer...")

    # Tokenizer first; fall back to EOS as the pad token when the checkpoint
    # defines none.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # 4-bit NF4 quantization with double quantization for memory efficiency.
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    # Load the quantized base model across available devices.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quant_cfg,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.float16,
    )
    model = prepare_model_for_kbit_training(model)

    # Low-rank adapters on the attention projections only.
    adapter_cfg = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=16,  # rank
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        bias="none",
    )
    return get_peft_model(model, adapter_cfg), tokenizer
118
+
119
def train_zenith():
    """Run the full LoRA fine-tuning pipeline and save the resulting model.

    Loads the dataset, prepares the quantized base model with LoRA adapters,
    trains with periodic evaluation, then writes metrics, adapter weights and
    tokenizer to OUTPUT_DIR.
    """
    print("Starting Zenith fine-tuning process...")
    # Reproducibility: seed every RNG the pipeline touches.
    torch.manual_seed(42)
    np.random.seed(42)
    random.seed(42)

    # Load data
    dataset = load_and_prepare_data()

    # Setup model and tokenizer
    model, tokenizer = setup_model_and_tokenizer()

    # Tokenize/format every conversation; drop the raw columns afterwards.
    print("Formatting dataset...")
    formatted_dataset = dataset.map(
        lambda x: format_conversation(x, tokenizer),
        remove_columns=dataset.column_names,
        batched=False
    )

    # 80/20 train/eval split.
    train_test = formatted_dataset.train_test_split(test_size=0.2)
    train_dataset = train_test["train"]
    eval_dataset = train_test["test"]

    # Causal-LM collator (mlm=False): labels are shifted input_ids.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # NOTE(review): bf16=True here while BitsAndBytesConfig uses a float16
    # compute dtype — mixed but usually tolerated; confirm on the target GPU.
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=3,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=8,
        warmup_steps=100,
        learning_rate=1e-4,  # Lowered for stability
        max_grad_norm=1.0,  # Gradient clipping
        logging_steps=10,
        eval_steps=50,
        save_steps=100,
        evaluation_strategy="steps",
        save_strategy="steps",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        bf16=True,  # Use bfloat16 for better performance
        dataloader_pin_memory=False,
        remove_unused_columns=False,
        # BUGFIX: report_to=None means "all installed integrations" in
        # TrainingArguments; the string "none" is what actually disables
        # wandb/tensorboard reporting.
        report_to="none",
        save_total_limit=2,
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    # Start training
    print("Beginning training...")
    train_result = trainer.train()

    # BUGFIX: OUTPUT_DIR may not exist yet when no checkpoint has been saved
    # (short runs never reach save_steps); create it before writing metrics.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    metrics = train_result.metrics
    with open(os.path.join(OUTPUT_DIR, "train_metrics.json"), "w") as f:
        json.dump(metrics, f, indent=2)

    # Save the final model (adapter weights) and tokenizer.
    print("Saving Zenith model...")
    trainer.save_model()
    tokenizer.save_pretrained(OUTPUT_DIR)

    print(f"✅ Zenith model training completed! Model saved to {OUTPUT_DIR}")
202
+
203
def push_to_hub(repo_id, hf_token=None):
    """Upload OUTPUT_DIR to the given Hugging Face Hub repository.

    Falls back to the HF_TOKEN environment variable when no token is passed;
    prints an error and returns without uploading if neither is available.
    """
    from huggingface_hub import HfApi, create_repo, upload_folder

    # Resolve the auth token: explicit argument wins, then the environment.
    token = os.environ.get("HF_TOKEN") if hf_token is None else hf_token
    if not token:
        print("❌ Hugging Face token not found. Set HF_TOKEN env variable or pass as argument.")
        return

    api = HfApi()
    print(f"Creating repo {repo_id} if it doesn't exist...")
    create_repo(repo_id, token=token, exist_ok=True)

    print(f"Uploading model from {OUTPUT_DIR} to {repo_id}...")
    upload_folder(
        repo_id=repo_id,
        folder_path=OUTPUT_DIR,
        path_in_repo=".",
        token=token,
    )
    print(f"✅ Model pushed to https://huggingface.co/{repo_id}")
222
+
223
def test_zenith():
    """Smoke-test the fine-tuned model with a single sampled generation.

    Loads the model and tokenizer from OUTPUT_DIR, generates a response to a
    fixed ChatML-style coding prompt, and prints the assistant continuation.
    """
    print("\n🧪 Testing Zenith...")

    # Load the fine-tuned model.
    # NOTE(review): trainer.save_model on a PEFT model writes adapter weights;
    # loading them via plain AutoModelForCausalLM requires a transformers/peft
    # version that auto-detects adapters — confirm on the deployed versions.
    tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(OUTPUT_DIR, trust_remote_code=True)

    # Test prompt: system persona + user request, ending at the assistant turn
    # so generation continues as the assistant.
    test_prompt = """<|im_start|>system
You are Zenith, the flagship autonomous coding partner of AlgoRythm Technologies' Aspetos platform. Your identity is a fusion of advanced technical expertise, philosophical curiosity, and collaborative mentorship.
<|im_end|>
<|im_start|>user
Help me create a simple Python function to calculate fibonacci numbers
<|im_end|>
<|im_start|>assistant
"""

    # Tokenize and generate (sampled, up to 300 new tokens).
    inputs = tokenizer(test_prompt, return_tensors="pt")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode response.
    # NOTE(review): slicing with len(test_prompt) assumes the decoded text
    # starts byte-for-byte with the prompt; special-token re-rendering by the
    # tokenizer can break that alignment — verify with the actual tokenizer.
    response = tokenizer.decode(outputs[0], skip_special_tokens=False)
    print("Zenith Response:")
    print("=" * 50)
    print(response[len(test_prompt):])
    print("=" * 50)
259
+
260
import sys

def run_smoke_test():
    """Run a tiny end-to-end training pass (≤10 samples, 10 steps).

    Temporarily redirects OUTPUT_DIR to a throwaway directory so the smoke
    test never overwrites a real training run.

    BUGFIX: the original patched the OUTPUT_DIR/DATASET_FILE globals but never
    restored them (and contained a no-op `DATASET_FILE = DATASET_FILE` plus an
    unused saved reference to train_zenith); the global is now restored in a
    finally block so a later full run writes to the real output directory.
    """
    print("\n🚦 Running smoke test (10 samples, 10 steps)...")
    global OUTPUT_DIR
    output_dir_orig = OUTPUT_DIR
    OUTPUT_DIR = "./zenith-smoke-test"

    def _smoke_train():
        """Scaled-down copy of train_zenith: 10 samples, max_steps=10."""
        print("Starting Zenith smoke test...")
        dataset = load_and_prepare_data()
        model, tokenizer = setup_model_and_tokenizer()
        formatted_dataset = dataset.map(
            lambda x: format_conversation(x, tokenizer),
            remove_columns=dataset.column_names,
            batched=False
        )
        # Use only 10 samples (or fewer if the dataset is smaller).
        small_dataset = formatted_dataset.select(range(min(10, len(formatted_dataset))))
        train_test = small_dataset.train_test_split(test_size=0.2)
        train_dataset = train_test["train"]
        eval_dataset = train_test["test"]
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False,
        )
        training_args = TrainingArguments(
            output_dir=OUTPUT_DIR,
            num_train_epochs=1,
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=1,
            warmup_steps=0,
            learning_rate=1e-4,
            max_grad_norm=1.0,
            logging_steps=1,
            eval_steps=2,
            save_steps=5,
            evaluation_strategy="steps",
            save_strategy="steps",
            load_best_model_at_end=False,
            bf16=True,
            dataloader_pin_memory=False,
            remove_unused_columns=False,
            # BUGFIX: the string "none" disables reporting integrations;
            # None means "all installed" in TrainingArguments.
            report_to="none",
            save_total_limit=1,
            max_steps=10,
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
            tokenizer=tokenizer,
        )
        print("Beginning smoke test training...")
        trainer.train()
        print("Smoke test complete!")

    try:
        _smoke_train()
        print("\n✅ Smoke test finished. If no errors, you can run full training.")
    finally:
        # Restore the patched global regardless of success or failure.
        OUTPUT_DIR = output_dir_orig
324
+
325
if __name__ == "__main__":
    import argparse

    # CLI: pick between a quick smoke test and the full pipeline, with an
    # optional push to the Hugging Face Hub afterwards.
    parser = argparse.ArgumentParser()
    parser.add_argument("--smoke_test", action="store_true", help="Run a quick smoke test (10 samples, 10 steps)")
    parser.add_argument("--push_to_hub", action="store_true", help="Push model to Hugging Face Hub after training")
    parser.add_argument("--hf_token", type=str, default=None, help="Hugging Face token (or set HF_TOKEN env variable)")
    args = parser.parse_args()

    # Report GPU availability up front.
    cuda_ok = torch.cuda.is_available()
    print(f"CUDA available: {cuda_ok}")
    if cuda_ok:
        print(f"CUDA device: {torch.cuda.get_device_name()}")

    try:
        if args.smoke_test:
            run_smoke_test()
        else:
            train_zenith()
            test_zenith()
            if args.push_to_hub:
                push_to_hub("algorythmtechnologies/Zenith", hf_token=args.hf_token)
    except Exception as e:
        print(f"❌ Training failed: {e}")
        print("This might be due to insufficient GPU memory. Consider:")
        print("1. Reducing batch_size")
        print("2. Using gradient_checkpointing")
        print("3. Reducing LoRA rank")
        raise