Percy3822 committed on
Commit
aa25c1f
·
verified ·
1 Parent(s): bc4d02d

Create train.py

Browse files
Files changed (1) hide show
  1. train.py +55 -0
train.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# train.py
#
# Fine-tune distilgpt2 as a causal language model on prompt/completion pairs
# read from a local JSON Lines file, save the result locally, and upload the
# saved folder to the Hugging Face Hub.

import os

from datasets import load_dataset, Dataset
from huggingface_hub import HfApi
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

# Every example is padded or truncated to this many tokens.
MAX_LENGTH = 512

# Label value skipped by the language-modeling loss (used for padded positions).
IGNORE_INDEX = -100

# Load dataset
dataset = load_dataset("json", data_files="python_train_100.jsonl")

# Load tokenizer and model
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# GPT-2 tokenizers ship without a padding token, so padding="max_length"
# would raise. Reuse the end-of-sequence token; padded positions are excluded
# from attention and from the loss below, so the choice of id is harmless.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id


def tokenize_function(example):
    """Tokenize one prompt/completion record into fixed-length training features.

    Args:
        example: A dataset row providing ``prompt`` and ``completion`` fields.

    Returns:
        The tokenizer output for the formatted text, padded or truncated to
        ``MAX_LENGTH`` tokens, with an added ``labels`` list. Labels copy
        ``input_ids`` except at padded positions, which are set to
        ``IGNORE_INDEX`` so padding does not contribute to the loss.
    """
    full_text = f"### Prompt:\n{example['prompt']}\n### Completion:\n{example['completion']}"
    encoded = tokenizer(
        full_text,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )
    # Without labels the model returns no loss and Trainer.train() fails.
    # The attention mask (not the token id) marks padding, because the pad
    # token shares its id with the end-of-sequence token.
    encoded["labels"] = [
        token_id if is_real else IGNORE_INDEX
        for token_id, is_real in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded


tokenized_dataset = dataset["train"].map(tokenize_function)

# Training arguments
# No evaluation strategy is passed: "no" is the default, and the older
# `evaluation_strategy` keyword is deprecated in favor of `eval_strategy`.
training_args = TrainingArguments(
    output_dir="trained_model",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=1,
    logging_dir="./logs",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Train
trainer.train()

# Save model and tokenizer locally. The Hub repo id doubles as the local
# folder path, so the files land in ./Percy3822/python_coder_100.
repo_name = "Percy3822/python_coder_100"
trainer.save_model(repo_name)
tokenizer.save_pretrained(repo_name)

# Push the saved folder to the Hub (requires an authenticated Hub session).
api = HfApi()
# upload_folder needs an existing repo; create it on first run and do
# nothing if it is already there.
api.create_repo(repo_id=repo_name, repo_type="model", exist_ok=True)
api.upload_folder(
    folder_path=repo_name,
    path_in_repo="",
    repo_id=repo_name,
    repo_type="model",
)