canonica1 committed on
Commit
cb09f4d
·
verified ·
1 Parent(s): 97d0d55

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -0
app.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """rl_training.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1LJmxNlZNnQCGQOFJYCr-KcA7Q-uvk7gK
8
+ """
9
+
10
# The notebook installed its dependencies with `!pip install ...` cells. That
# syntax is an IPython shell escape and a SyntaxError in a plain .py file, so
# the same commands are issued through pip as subprocesses instead.
import subprocess
import sys

# check=False keeps the notebook's best-effort behaviour: a failed install does
# not abort the script at this point.
subprocess.run(
    [
        sys.executable, "-m", "pip", "install", "-qqq",
        "datasets==3.2.0",
        "transformers==4.47.1",
        "trl==0.14.0",
        "peft==0.14.0",
        "accelerate==1.2.1",
        "bitsandbytes==0.45.2",
        "wandb==0.19.7",
        "--progress-bar", "off",
    ],
    check=False,
)
subprocess.run(
    [
        sys.executable, "-m", "pip", "install", "-qqq",
        "flash-attn",
        "--no-build-isolation",
        "--progress-bar", "off",
    ],
    check=False,
)
12
+
13
+ import torch
14
+ from datasets import load_dataset
15
+ from peft import LoraConfig, get_peft_model
16
+ from transformers import AutoModelForCausalLM, AutoTokenizer
17
+ from trl import GRPOConfig, GRPOTrainer
18
+
19
+ import wandb
20
+
21
# Authenticate with Weights & Biases before any run is created.
wandb.login()

# Load the training data and show its splits/columns.
dataset = load_dataset("mlabonne/smoltldr")
print(dataset)

import os

os.environ["FLASH_ATTENTION_FORCE_DISABLED"] = "1"

# Base model and its tokenizer.
model_id = "HuggingFaceTB/SmolLM-135M-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load LoRA
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    target_modules="all-linear",
)
model = get_peft_model(model, lora_config)
# print_trainable_parameters() writes the summary itself and returns None, so
# wrapping it in print() only added a stray "None" line to the output.
model.print_trainable_parameters()
46
+
47
+ # Reward function
48
# Target completion length used by reward_len; compared against len(completion).
ideal_length = 50


def reward_len(completions, **kwargs):
    """Score each completion by how close its length is to ``ideal_length``.

    Args:
        completions: Iterable of sized objects. ``len()`` is applied to each
            one, so for plain strings the length is a character count.
        **kwargs: Accepted so the trainer can pass extra columns; ignored.

    Returns:
        A list with one reward per completion, in input order. Each reward is
        ``-abs(ideal_length - len(completion))``: 0 when the length matches
        exactly and increasingly negative as it deviates in either direction.
    """
    return [-abs(ideal_length - len(completion)) for completion in completions]
53
+
54
# GRPO training configuration.
training_args = GRPOConfig(
    output_dir="GRPO",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    max_prompt_length=512,
    max_completion_length=96,
    num_generations=8,
    optim="adamw_8bit",
    num_train_epochs=1,
    bf16=True,
    report_to=["wandb"],
    remove_unused_columns=False,
    logging_steps=1,
)

# The trainer optimises the LoRA-wrapped model against reward_len only.
trainer = GRPOTrainer(
    model=model,
    reward_funcs=[reward_len],
    args=training_args,
    train_dataset=dataset["train"],
)

# Train model
# The W&B run is opened explicitly under the "GRPO" project before training.
wandb.init(project="GRPO")
trainer.train()