Percy3822 committed on
Commit
2c1c9e4
·
1 Parent(s): 1e61809

Initial training setup

Browse files
Files changed (3) hide show
  1. app.py +8 -0
  2. requirements.txt +4 -1
  3. train.py +33 -25
app.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
import subprocess
import sys

import gradio as gr


def start_training():
    """Run train.py as a subprocess and report whether it succeeded.

    Returns:
        A human-readable status string shown in the Gradio text output.
    """
    # Use an argument list with subprocess.run (shell=False) instead of
    # os.system("python train.py"): no shell-injection surface, and the
    # exit code is available so failures are no longer reported as success.
    # sys.executable guarantees the same interpreter that runs this app.
    result = subprocess.run([sys.executable, "train.py"])
    if result.returncode == 0:
        return "Training complete! Check model on Hugging Face."
    return f"Training failed (exit code {result.returncode}). Check the Space logs."


# Single-button UI: no inputs, one text output carrying the status message.
gr.Interface(fn=start_training, inputs=None, outputs="text").launch()
requirements.txt CHANGED
@@ -1 +1,4 @@
1
- tk
 
 
 
 
1
+ transformers==4.54.0
2
+ datasets==4.0.0
3
+ accelerate==1.9.0
4
+ torch>=2.6.0
train.py CHANGED
@@ -1,52 +1,60 @@
1
- import json
2
  from datasets import load_dataset
3
- from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
4
- import huggingface_hub
5
 
6
- # 1. Login to Hugging Face (do once)
7
- # huggingface_hub.login(token="YOUR_HF_TOKEN")
8
 
9
- # 2. Load JSONL dataset (local)
10
- dataset = load_dataset("json", data_files="sample_dataset.jsonl", split="train")
11
-
12
- # 3. Convert dict completion → string
13
  def format_for_training(example):
14
- completion_str = json.dumps(example["completion"])
15
- return {"text": example["prompt"] + "\n" + completion_str}
 
 
16
 
17
  dataset = dataset.map(format_for_training)
18
 
19
- # 4. Load small model for low VRAM (good for testing)
20
- model_name = "distilgpt2" # Small, works on Hugging Face free GPU
21
  tokenizer = AutoTokenizer.from_pretrained(model_name)
 
22
 
23
- def tokenize_function(examples):
24
- return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)
 
25
 
26
- tokenized_dataset = dataset.map(tokenize_function, batched=True)
27
 
28
- # 5. Load model
29
  model = AutoModelForCausalLM.from_pretrained(model_name)
30
 
31
- # 6. Training Arguments (small batch size for low memory)
 
 
 
32
  training_args = TrainingArguments(
33
  output_dir="./results",
34
  overwrite_output_dir=True,
35
- evaluation_strategy="no",
 
36
  per_device_train_batch_size=2,
37
- num_train_epochs=2,
38
- save_steps=20,
 
39
  logging_steps=5,
40
- push_to_hub=True, # Upload to Hugging Face Hub
41
- hub_model_id="Percy3822/quiz_model_test" # Change to your repo name
42
  )
43
 
44
  trainer = Trainer(
45
  model=model,
46
  args=training_args,
47
- train_dataset=tokenized_dataset,
 
 
 
48
  )
49
 
50
- # 7. Train & push to Hugging Face
51
  trainer.train()
 
 
52
  trainer.push_to_hub()
 
 
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Load dataset from the Hugging Face Hub.
# NOTE(review): assumes a DatasetDict with a "train" split whose examples
# carry "prompt" and "completion" fields — confirm against the Hub repo.
dataset = load_dataset("Percy3822/quiz_model")


# Preprocess: combine prompt + completion into a single string for causal LM.
def format_for_training(example):
    """Build a "text" field by joining prompt and completion with a newline."""
    # Some rows store the completion as a dict; stringify it so the
    # concatenation below cannot raise TypeError.
    if isinstance(example["completion"], dict):
        example["completion"] = str(example["completion"])
    return {"text": example["prompt"] + "\n" + example["completion"]}


dataset = dataset.map(format_for_training)

# Load tokenizer and model (small model for low VRAM).
model_name = "distilgpt2"  # Small and fast for testing
tokenizer = AutoTokenizer.from_pretrained(model_name)
# GPT-2 family has no pad token; reuse EOS so padding="max_length" works.
tokenizer.pad_token = tokenizer.eos_token


def tokenize(batch):
    """Tokenize the combined text, padded/truncated to a fixed 128 tokens."""
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)


dataset = dataset.map(tokenize, batched=True)

# Load model.
model = AutoModelForCausalLM.from_pretrained(model_name)

# mlm=False -> collator produces causal-LM labels (inputs shifted), not masked-LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training args.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    # BUGFIX: `evaluation_strategy` was renamed `eval_strategy` and the old
    # name was removed in transformers 4.46+; requirements.txt pins 4.54.0,
    # so the old keyword raises TypeError at construction time.
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    num_train_epochs=1,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=5,
    push_to_hub=True,
    hub_model_id="Percy3822/quiz_model",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["train"],  # Use train for eval in testing
    # `tokenizer=` is deprecated on the pinned transformers version;
    # `processing_class` is the supported parameter and still ensures the
    # tokenizer is saved/pushed alongside the model.
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()

# Push trained model (and tokenizer) to the Hub.
trainer.push_to_hub()