girish00 committed on
Commit
eb09e6b
·
verified ·
1 Parent(s): 971cb4b

update endpoint helper files

Browse files
Files changed (1) hide show
  1. finetune_coding_llm_colab.py +246 -0
finetune_coding_llm_colab.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Local LoRA fine-tuning script for a small coding model.
3
+
4
+ Quick start (Windows/Linux local):
5
+ 1) pip install transformers datasets peft accelerate bitsandbytes huggingface_hub
6
+ 2) python finetune_coding_llm_colab.py --dataset-size 8000
7
+ 3) Optional upload:
8
+ python finetune_coding_llm_colab.py --skip-dataset-gen --skip-train --upload --hf-repo your-user/your-model
9
+ """
10
+
11
+ import argparse
12
+ import json
13
+ import os
14
+ import random
15
+ import torch
16
+ from datasets import load_dataset
17
+ from huggingface_hub import upload_folder
18
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
19
+ from transformers import (
20
+ AutoModelForCausalLM,
21
+ AutoTokenizer,
22
+ BitsAndBytesConfig,
23
+ Trainer,
24
+ TrainingArguments,
25
+ )
26
+
27
+
28
# Base checkpoint to fine-tune; used as the default for --model-name.
DEFAULT_MODEL_NAME = "Qwen/Qwen2.5-Coder-0.5B-Instruct"
# Directory the trained model and tokenizer are saved to; default for --output-dir.
DEFAULT_OUTPUT_DIR = "./model"
# JSON file the generated dataset is written to and read from; default for --train-file.
DEFAULT_TRAIN_FILE = "train.json"
# Default for --hf-repo. NOTE(review): looks like a placeholder id - replace before uploading.
HF_REPO_ID = "your-username/coding-llm-model"

# Keep dataset size in the requested 5k-10k window.
# Used as the default for --dataset-size and for build_dataset's size parameter.
DATASET_SIZE = 8000
35
+
36
+
37
# Field names shared by every seed example, in the order the row tuples use.
_TEMPLATE_KEYS = ("instruction", "input", "output", "explanation")

# Seed examples as (instruction, input, output, explanation) rows.
_TEMPLATE_ROWS = (
    (
        "Fix the Python code",
        "def add(a,b) return a+b",
        "def add(a, b): return a + b",
        "Added missing colon and corrected syntax.",
    ),
    (
        "Fix loop syntax",
        "for i in range(5 print(i)",
        "for i in range(5): print(i)",
        "Added missing parenthesis and colon.",
    ),
    (
        "Fix condition",
        "if x = 10: print(x)",
        "if x == 10: print(x)",
        "Corrected assignment to comparison operator.",
    ),
    (
        "Explain code",
        "for i in range(3): print(i)",
        "Prints numbers from 0 to 2.",
        "Loop iterates from 0 to 2 and prints values.",
    ),
)

# The templates the dataset generator samples from: one dict per row above.
TEMPLATES = [dict(zip(_TEMPLATE_KEYS, row)) for row in _TEMPLATE_ROWS]
63
+
64
+
65
def format_training_text(template):
    """Render one example as the prompt/response text used for training.

    Args:
        template: Mapping with the keys ``instruction``, ``input``,
            ``output`` and ``explanation``.

    Returns:
        Four newline-terminated lines: the instruction, the input, a fixed
        directive asking for JSON, and a ``JSON:`` line holding the target
        object serialized with the keys ``code`` and ``explanation``.
    """
    # Non-ASCII characters are written as-is rather than \u-escaped.
    payload = json.dumps(
        {"code": template["output"], "explanation": template["explanation"]},
        ensure_ascii=False,
    )
    lines = [
        f"Instruction: {template['instruction']}",
        f"Input: {template['input']}",
        "Return only valid JSON with keys code and explanation.",
        f"JSON: {payload}",
    ]
    return "\n".join(lines) + "\n"
76
+
77
+
78
def generate_sample():
    """Build one training record from a randomly chosen template.

    Returns:
        A dict holding the template's ``instruction``, ``input``, ``output``
        and ``explanation``, the rendered ``text`` from
        ``format_training_text``, and two random scores rounded to two
        decimals: ``confidence`` drawn from 0.9-0.99 and ``relevancy`` drawn
        from 0.85-0.99.
    """
    chosen = random.choice(TEMPLATES)
    record = {
        key: chosen[key]
        for key in ("instruction", "input", "output", "explanation")
    }
    record["text"] = format_training_text(chosen)
    # Draw confidence before relevancy so a seeded run yields the same values.
    record["confidence"] = round(random.uniform(0.9, 0.99), 2)
    record["relevancy"] = round(random.uniform(0.85, 0.99), 2)
    return record
90
+
91
+
92
def build_dataset(train_file, size=DATASET_SIZE):
    """Generate ``size`` samples and write them to ``train_file`` as JSON.

    Args:
        train_file: Path of the JSON file to create or overwrite.
        size: Number of samples to generate via ``generate_sample``.
    """
    samples = []
    for _ in range(size):
        samples.append(generate_sample())

    with open(train_file, "w", encoding="utf-8") as handle:
        json.dump(samples, handle, indent=2)

    print(f"Dataset created: {len(samples)} samples -> {train_file}")
97
+
98
+
99
def run_training(
    model_name,
    train_file,
    output_dir,
    epochs,
    batch_size,
    learning_rate,
    max_length,
    max_train_samples,
    use_4bit,
):
    """Fine-tune ``model_name`` with LoRA adapters on the JSON dataset in ``train_file``.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        train_file: Path to the JSON training file. Records should carry a
            ``text`` field; records without one are rendered with
            ``format_training_text``.
        output_dir: Directory for checkpoints and the final model/tokenizer.
        epochs: Number of training epochs.
        batch_size: Per-device training batch size.
        learning_rate: Optimizer learning rate.
        max_length: Every example is truncated or padded to this many tokens.
        max_train_samples: If greater than 0, train on at most this many
            leading records; otherwise use the whole file.
        use_4bit: Request 4-bit quantized loading. Ignored, with a warning,
            when CUDA is not available.

    Raises:
        FileNotFoundError: If ``train_file`` does not exist.
    """
    if not os.path.exists(train_file):
        raise FileNotFoundError(
            f"Training file not found: {train_file}. "
            "Generate it by running this script without --skip-dataset-gen first."
        )

    cuda_available = torch.cuda.is_available()

    dataset = load_dataset("json", data_files=train_file)
    train_split = dataset["train"]
    # Subsample before tokenizing so discarded records are never processed.
    if max_train_samples > 0:
        keep = min(max_train_samples, len(train_split))
        train_split = train_split.select(range(keep))

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Only fall back to EOS when the tokenizer ships without a pad token;
    # overwriting an existing pad token would conflate padding with EOS.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def format_data(example):
        text = example.get("text") or format_training_text(example)
        tokens = tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )
        input_ids = tokens["input_ids"]
        attention_mask = tokens.get("attention_mask")
        if attention_mask is None:
            tokens["labels"] = list(input_ids)
        else:
            # -100 is the ignore index of the causal-LM loss: padded positions
            # must not contribute, otherwise the model is trained to emit padding.
            tokens["labels"] = [
                token_id if is_real else -100
                for token_id, is_real in zip(input_ids, attention_mask)
            ]
        return tokens

    tokenized_train = train_split.map(
        format_data,
        remove_columns=train_split.column_names,
        desc="Tokenizing training dataset",
    )

    quantize_4bit = use_4bit and cuda_available
    if use_4bit and not cuda_available:
        print("Warning: --use-4bit requested but CUDA not available. Falling back to standard loading.")

    if quantize_4bit:
        bnb_config = BitsAndBytesConfig(load_in_4bit=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto",
        )
        model = prepare_model_for_kbit_training(model)
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto" if cuda_available else None,
        )

    # LoRA adapters on the attention query/value projections only.
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=batch_size,
        num_train_epochs=epochs,
        gradient_accumulation_steps=2,
        logging_steps=10,
        save_steps=100,
        learning_rate=learning_rate,
        fp16=cuda_available,  # mixed precision only when a GPU is present
        dataloader_pin_memory=cuda_available,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
    )
    trainer.train()

    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model and tokenizer saved to: {output_dir}")
192
+
193
+
194
def upload_to_hf(repo_id, output_dir):
    """Upload the contents of ``output_dir`` to the Hugging Face model repo ``repo_id``.

    Args:
        repo_id: Target repository id, e.g. ``"user/model-name"``.
        output_dir: Local folder holding the saved model and tokenizer.

    Raises:
        FileNotFoundError: If ``output_dir`` is missing or is not a directory.
    """
    # isdir rather than exists: a regular file at this path is not an
    # uploadable model folder and should fail here with the clear message.
    if not os.path.isdir(output_dir):
        raise FileNotFoundError(
            f"Model output folder not found: {output_dir}. Run training before upload."
        )

    upload_folder(
        folder_path=output_dir,
        repo_id=repo_id,
        repo_type="model",
    )
    print(f"Uploaded to Hugging Face repo: {repo_id}")
206
+
207
+
208
def main(argv=None):
    """Command-line entry point: generate the dataset, train, and optionally upload.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read ``sys.argv``.

    Each stage can be disabled independently with --skip-dataset-gen and
    --skip-train; the upload only runs when --upload is given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset-size", type=int, default=DATASET_SIZE)
    parser.add_argument("--train-file", type=str, default=DEFAULT_TRAIN_FILE)
    parser.add_argument("--output-dir", type=str, default=DEFAULT_OUTPUT_DIR)
    parser.add_argument("--model-name", type=str, default=DEFAULT_MODEL_NAME)
    parser.add_argument("--epochs", type=float, default=1)
    parser.add_argument("--batch-size", type=int, default=2)
    parser.add_argument("--learning-rate", type=float, default=2e-4)
    parser.add_argument("--max-length", type=int, default=512)
    parser.add_argument("--max-train-samples", type=int, default=0)
    parser.add_argument("--use-4bit", action="store_true")
    parser.add_argument("--skip-dataset-gen", action="store_true")
    parser.add_argument("--skip-train", action="store_true")
    parser.add_argument("--upload", action="store_true")
    parser.add_argument("--hf-repo", type=str, default=HF_REPO_ID)
    args = parser.parse_args(argv)

    if not args.skip_dataset_gen:
        # The size only matters when a dataset is generated, so it is checked
        # here; parser.error prints usage and exits instead of a traceback.
        if not (5000 <= args.dataset_size <= 10000):
            parser.error("dataset-size must be between 5000 and 10000")
        build_dataset(train_file=args.train_file, size=args.dataset_size)

    if not args.skip_train:
        run_training(
            model_name=args.model_name,
            train_file=args.train_file,
            output_dir=args.output_dir,
            epochs=args.epochs,
            batch_size=args.batch_size,
            learning_rate=args.learning_rate,
            max_length=args.max_length,
            max_train_samples=args.max_train_samples,
            use_4bit=args.use_4bit,
        )

    if args.upload:
        upload_to_hf(repo_id=args.hf_repo, output_dir=args.output_dir)


if __name__ == "__main__":
    main()