KevinKeller committed
Commit 7c9b6bf · verified · 1 Parent(s): 27d693a

Upload train_question_generator.py with huggingface_hub

Files changed (1)
  1. train_question_generator.py +85 -0
train_question_generator.py ADDED
@@ -0,0 +1,85 @@
+ # /// script
+ # dependencies = ["trl>=0.17.0", "peft>=0.15.0", "datasets", "transformers", "accelerate", "bitsandbytes"]
+ # ///
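+ # The block above is PEP 723 inline script metadata: a PEP 723-aware runner
+ # (e.g. `uv run train_question_generator.py`) installs these dependencies automatically.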
+
+ import torch
+ from datasets import load_dataset
+ from peft import LoraConfig
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+ from trl import SFTTrainer, SFTConfig
+
+ print("Loading dataset...")
+ dataset = load_dataset("KevinKeller/cognitive-question-generator-v1")
+ train_dataset = dataset["train"]
+ eval_dataset = dataset.get("validation")
+
+ print(f"Train samples: {len(train_dataset)}")
+ if eval_dataset:
+     print(f"Eval samples: {len(eval_dataset)}")
+
+ # Using Qwen2.5-7B-Instruct for question generation (good reasoning capabilities)
+ print("Loading model: Qwen/Qwen2.5-7B-Instruct...")
+ model_id = "Qwen/Qwen2.5-7B-Instruct"
+
+ # 4-bit NF4 quantization so the 7B model fits on a single A10G (24 GB)
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.bfloat16,
+ )
+
+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+ tokenizer.pad_token = tokenizer.eos_token
+
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     quantization_config=bnb_config,
+     device_map="auto",
+     trust_remote_code=True,
+ )
+
+ # LoRA config - rank 32 (with alpha = 2*r) across all attention and MLP projections,
+ # slightly higher than the usual r=8-16 because question generation is a more complex task
+ peft_config = LoraConfig(
+     r=32,
+     lora_alpha=64,
+     lora_dropout=0.05,
+     target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+     bias="none",
+     task_type="CAUSAL_LM",
+ )
+
+ # Training config - fewer epochs due to larger dataset
+ training_args = SFTConfig(
+     output_dir="./question-generator-output",
+     num_train_epochs=2,
+     per_device_train_batch_size=1,
+     gradient_accumulation_steps=8,  # effective batch size: 1 x 8 = 8 sequences per optimizer step
+     learning_rate=1e-4,
+     logging_steps=50,
+     save_strategy="steps",
+     save_steps=500,
+     eval_strategy="steps" if eval_dataset else "no",
+     eval_steps=500,
+     bf16=True,
+     push_to_hub=True,
+     hub_model_id="KevinKeller/cognitive-question-generator-qwen2.5-7b",
+     report_to="none",
+     max_length=8192,  # renamed from max_seq_length in recent trl releases
+     gradient_checkpointing=True,
+ )
+
+ print("Starting training...")
+ trainer = SFTTrainer(
+     model=model,
+     train_dataset=train_dataset,
+     eval_dataset=eval_dataset,
+     peft_config=peft_config,
+     processing_class=tokenizer,
+     args=training_args,
+ )
+
+ trainer.train()
+ print("Training complete! Pushing to Hub...")
+ trainer.push_to_hub()
+ print("Done!")