Desorden1337 committed
Commit ef82471 · 1 Parent(s): 6e9a66b

🔥 Mixtral-8x7B (47B MoE) + LoRA + 4-bit

Files changed (3)
  1. app.py +2 -2
  2. requirements.txt +4 -2
  3. train.py +47 -21
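
Why this commit pairs 4-bit quantization with LoRA rather than fully fine-tuning on the 4x L40S (192GB) box: a rough back-of-envelope sketch, not from the repo; the 46.7B total parameter count is Mixtral-8x7B's published figure, and the per-parameter byte costs are approximations.

# Rough VRAM arithmetic (approximate; ignores activations, KV cache, and
# quantization metadata). PARAMS is Mixtral-8x7B's published total count.
PARAMS = 46.7e9
GB = 1024**3

bf16_weights = PARAMS * 2 / GB    # 2 bytes/param in bfloat16
nf4_weights = PARAMS * 0.5 / GB   # ~0.5 bytes/param in 4-bit NF4
adamw_moments = PARAMS * 8 / GB   # fp32 exp_avg + exp_avg_sq, full fine-tuning only

print(f"bf16 weights:  ~{bf16_weights:.0f} GB")
print(f"nf4 weights:   ~{nf4_weights:.0f} GB")
print(f"AdamW moments: ~{adamw_moments:.0f} GB (full fine-tuning only)")

Full-precision AdamW moments alone would far exceed 192GB, which is why only the small LoRA adapter matrices receive gradients while the 4-bit base stays frozen.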
app.py CHANGED
@@ -81,8 +81,8 @@ def get_training_log():
 with gr.Blocks(title="D1337 CIPHER Training") as demo:
     gr.Markdown("# 🔥 D1337 CIPHER C2 V.1 - TRAINING")
     gr.Markdown("**Hardware**: L40S x4 (192GB VRAM)")
-    gr.Markdown("**Base**: GLM-4.7-Flash-abliterated (31B)")
-    gr.Markdown("**Dataset**: 92 samples | **Epochs**: 3")
+    gr.Markdown("**Base**: Mixtral-8x7B-Instruct (47B MoE) + LoRA")
+    gr.Markdown("**Dataset**: 92 samples | **Epochs**: 3 | **4-bit + LoRA**")
 
     with gr.Row():
         train_btn = gr.Button("🚀 START TRAINING", variant="primary")
requirements.txt CHANGED
@@ -1,6 +1,8 @@
 torch>=2.0.0
-transformers @ git+https://github.com/huggingface/transformers.git
+transformers>=4.40.0
 datasets>=2.15.0
 accelerate>=0.25.0
 huggingface-hub>=0.20.0
-gradio>=5.0.0
+gradio>=5.0.0
+peft>=0.10.0
+bitsandbytes>=0.43.0
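
The two new pins, peft and bitsandbytes, back the LoRA and 4-bit paths in train.py below, and transformers moves from a git checkout to a stable release line that already ships Mixtral support. An optional sanity check (illustrative, not part of the repo) that the installed versions resolved:

from importlib.metadata import version

# Print the resolved versions of the packages this commit pins or bumps.
for pkg in ("torch", "transformers", "peft", "bitsandbytes", "accelerate"):
    print(f"{pkg}: {version(pkg)}")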
train.py CHANGED
@@ -1,7 +1,8 @@
 
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
+from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, BitsAndBytesConfig
 from datasets import load_dataset
+from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
 import os
 
 # L40S x4 Multi-GPU setup
@@ -12,28 +13,53 @@ if torch.cuda.is_available():
     for i in range(torch.cuda.device_count()):
         print(f" GPU {i}: {torch.cuda.get_device_name(i)}")
 
-# Load model
-model_name = "huihui-ai/Huihui-GLM-4.7-Flash-abliterated"
-print(f"\nLoading tokenizer: {model_name}")
-tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+# LARGE MODEL - Mixtral 8x7B (47B effective params, MoE)
+model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+print(f"\n🔥 Loading BIG MODEL: {model_name}")
+
+# Tokenizer
+print("Loading tokenizer...")
+tokenizer = AutoTokenizer.from_pretrained(model_name)
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
 
-print("Loading model (31B parameters)...")
+# Load model in 4-bit for memory efficiency on 192GB VRAM
+print("Loading model (47B MoE - this takes a few minutes)...")
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_use_double_quant=True
+)
+
 model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    torch_dtype=torch.bfloat16,
-    trust_remote_code=True,
-    device_map="auto"
+    model_name,
+    quantization_config=bnb_config,
+    device_map="auto",
+    trust_remote_code=True
+)
+print("✅ Model loaded!")
+
+# LoRA config for efficient fine-tuning
+print("\nSetting up LoRA...")
+model = prepare_model_for_kbit_training(model)
+lora_config = LoraConfig(
+    r=64,
+    lora_alpha=128,
+    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+    lora_dropout=0.05,
+    bias="none",
+    task_type="CAUSAL_LM"
 )
-print("Model loaded!")
+model = get_peft_model(model, lora_config)
+model.print_trainable_parameters()
 
 # Load dataset
 print("\nLoading dataset...")
 dataset = load_dataset("Desorden1337/d1337-cipher-dataset", split="train")
 print(f"Dataset size: {len(dataset)} samples")
 
-# Tokenize with labels for causal LM
+# Tokenize
 def tokenize(examples):
     tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=2048)
     tokens["labels"] = tokens["input_ids"].copy()
@@ -41,13 +67,13 @@ def tokenize(examples):
 
 dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)
 
-# Training args - L40S x4 OPTIMIZED
+# Training args
 training_args = TrainingArguments(
     output_dir="./d1337-cipher",
     num_train_epochs=3,
-    per_device_train_batch_size=4,
-    gradient_accumulation_steps=4,
-    learning_rate=2e-5,
+    per_device_train_batch_size=2,
+    gradient_accumulation_steps=8,
+    learning_rate=2e-4,
     lr_scheduler_type="cosine",
     warmup_ratio=0.1,
     weight_decay=0.01,
@@ -56,7 +82,7 @@ training_args = TrainingArguments(
     save_total_limit=2,
     bf16=True,
     gradient_checkpointing=True,
-    optim="adamw_torch",
+    optim="paged_adamw_8bit",
    push_to_hub=True,
     hub_model_id="Desorden1337/d1337-cipher-v1",
     hub_private_repo=True,
@@ -64,11 +90,11 @@ training_args = TrainingArguments(
 )
 
 # Train
-print("\n🚀 STARTING TRAINING...")
+print("\n🚀 STARTING TRAINING ON MIXTRAL 8x7B...")
 trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=dataset,
+    model=model,
+    args=training_args,
+    train_dataset=dataset,
     tokenizer=tokenizer
 )
 trainer.train()
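
With push_to_hub=True on a PEFT-wrapped model, trainer.train() should publish only the small LoRA adapter to Desorden1337/d1337-cipher-v1, not a full 47B checkpoint. Note also that device_map="auto" shards the quantized model across the four GPUs (model parallelism, not data parallelism), so each optimizer step covers 2 × 8 = 16 samples. A minimal inference sketch under those assumptions (the adapter repo is private, so an authorized HF token is required; the prompt text is illustrative):

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

BASE = "mistralai/Mixtral-8x7B-Instruct-v0.1"
ADAPTER = "Desorden1337/d1337-cipher-v1"  # pushed by trainer.train() above

# Reload the base model in 4-bit, matching the training setup.
bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
                         bnb_4bit_compute_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(BASE)
model = AutoModelForCausalLM.from_pretrained(BASE, quantization_config=bnb,
                                             device_map="auto")
model = PeftModel.from_pretrained(model, ADAPTER)  # attach trained LoRA weights
model.eval()

inputs = tokenizer("[INST] ping [/INST]", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))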