obx0x3 committed
Commit c4a5e63 · verified · 1 Parent(s): 451ae93

Update train_finetune.py

Files changed (1):
  train_finetune.py  +53 -105
train_finetune.py CHANGED
@@ -1,107 +1,54 @@
- # train_finetune.py
- import os
- from huggingface_hub import login
  from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
- from datasets import load_dataset, Dataset, DatasetDict
- import pandas as pd
-
- # Step 1: Log in to Hugging Face (use HF_TOKEN environment variable or prompt)
- HF_TOKEN = os.getenv("HF_TOKEN")
- if HF_TOKEN:
-     login(token=HF_TOKEN)
- else:
-     login()  # Enter token manually if not set
-
- # Step 2: Load existing dementia datasets
- try:
-     data_files = {
-         "train": "dementia_train_split.json",
-         "validation": "dementia_validation_split.json",
-         "test": "dementia_test_multilang.json"
-     }
-     train_df = pd.read_json(data_files["train"])
-     validation_df = pd.read_json(data_files["validation"])
-     test_df = pd.read_json(data_files["test"])
-     print(f"Dementia datasets loaded: Train={len(train_df)}, Validation={len(validation_df)}, Test={len(test_df)}")
- except Exception as e:
-     print(f"Error loading dementia datasets: {e}")
-     raise
-
- # Step 3: Load go_emotions dataset (small sample to manage resources)
- try:
-     go_emotions = load_dataset("google-research-datasets/go_emotions", split="train[:1000]")
-     print(f"Go_emotions loaded: {len(go_emotions)} samples")
- except Exception as e:
-     print(f"Error loading go_emotions: {e}")
-     raise
-
- # Step 4: Map go_emotions to your dataset format (placeholder responses)
- emotion_labels = [
-     "admiration", "amusement", "anger", "annoyance", "approval", "caring", "confusion",
-     "curiosity", "desire", "disappointment", "disapproval", "disgust", "embarrassment",
-     "excitement", "fear", "gratitude", "grief", "joy", "love", "nervousness", "optimism",
-     "pride", "realization", "relief", "remorse", "sadness", "surprise", "neutral"
- ]
-
- def generate_placeholder_response(text, emotion_idx):
-     emotion = emotion_labels[emotion_idx]
-     return f"I hear you're feeling {emotion}. I'm here to support you."
-
- augmented_data = []
- for example in go_emotions:
-     text = example["text"]
-     emotion_idx = example["labels"][0]
-     response = generate_placeholder_response(text, emotion_idx)
-     augmented_data.append({
-         "input": text,
-         "response": response,
-         "language": "en",
-         "emotion": emotion_labels[emotion_idx]
-     })

- # Step 5: Combine datasets
- augmented_df = pd.DataFrame(augmented_data)
- combined_train_df = pd.concat([train_df, augmented_df], ignore_index=True)

- # Step 6: Create DatasetDict
- dataset = DatasetDict({
-     "train": Dataset.from_pandas(combined_train_df),
-     "validation": Dataset.from_pandas(validation_df),
-     "test": Dataset.from_pandas(test_df)
- })

- # Step 7: Initialize tokenizer and model
- try:
-     tokenizer = T5Tokenizer.from_pretrained("t5-base")
-     model = T5ForConditionalGeneration.from_pretrained("t5-base")
-     print("Model and tokenizer loaded successfully.")
- except Exception as e:
-     print(f"Error loading model/tokenizer: {e}")
-     raise

- # Step 8: Preprocess function
  def preprocess(example):
      prefix = "émotion: " if example.get("language", "en") == "fr" else "emotion: "
-     input_enc = tokenizer(prefix + example["input"], padding="max_length", truncation=True, max_length=128)
-     target_enc = tokenizer(example["response"], padding="max_length", truncation=True, max_length=128)
      input_enc["labels"] = target_enc["input_ids"]
      return input_enc

- # Step 9: Tokenize dataset
- try:
-     tokenized = dataset.map(preprocess, remove_columns=["input", "response", "emotion", "language"])
-     print("Dataset tokenized successfully.")
- except Exception as e:
-     print(f"Error tokenizing dataset: {e}")
-     raise

- # Step 10: Training arguments
  args = TrainingArguments(
      output_dir="./model",
-     num_train_epochs=2,  # Reduced for testing
-     per_device_train_batch_size=2,  # Reduced for resource constraints
-     per_device_eval_batch_size=2,
-     evaluation_strategy="epoch",
      save_strategy="epoch",
      logging_dir="./logs",
      logging_steps=10,
@@ -110,24 +57,25 @@ args = TrainingArguments(
      metric_for_best_model="eval_loss"
  )

- # Step 11: Initialize and train
  trainer = Trainer(
      model=model,
-     args=args,
-     train_dataset=tokenized["train"],
-     eval_dataset=tokenized["validation"]
  )

- try:
-     trainer.train()
-     print("Training completed successfully.")
- except Exception as e:
-     print(f"Training error: {e}")
-     raise

- # Step 12: Save and push to Hugging Face Hub
  trainer.save_model("./model")
  tokenizer.save_pretrained("./model")
- model.push_to_hub("obx0x3/empathy-dementia")
- tokenizer.push_to_hub("obx0x3/empathy-dementia")
- print("✅ Model pushed to Hugging Face Hub.")

  from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
+ from datasets import load_dataset
+ from datasets import DatasetDict
+ import os

+ # Load tokenizer and model
+ model = T5ForConditionalGeneration.from_pretrained("t5-base")
+ tokenizer = T5Tokenizer.from_pretrained("t5-base")

+ # Load JSON datasets from local files
+ data_files = {
+     "train": "dementia_train_split.json",
+     "validation": "dementia_validation_split.json",
+     "test": "dementia_test_multilang.json"
+ }
+ dataset = load_dataset("json", data_files=data_files)

+ # Convert to DatasetDict (required for .map with remove_columns)
+ dataset = DatasetDict(dataset)

+ # Preprocessing function to tokenize inputs and outputs
  def preprocess(example):
      prefix = "émotion: " if example.get("language", "en") == "fr" else "emotion: "
+     input_enc = tokenizer(
+         prefix + example["input"],
+         padding="max_length",
+         truncation=True,
+         max_length=128
+     )
+     target_enc = tokenizer(
+         example["response"],
+         padding="max_length",
+         truncation=True,
+         max_length=128
+     )
      input_enc["labels"] = target_enc["input_ids"]
      return input_enc

+ # Tokenize and clean up metadata
+ tokenized_dataset = dataset.map(
+     preprocess,
+     remove_columns=["input", "response", "emotion", "intent", "tags", "care_mode", "language", "difficulty", "is_dementia_related"]
+ )

+ # Define training arguments
  args = TrainingArguments(
      output_dir="./model",
+     num_train_epochs=4,
+     per_device_train_batch_size=4,
+     per_device_eval_batch_size=4,
+     eval_strategy="epoch",
      save_strategy="epoch",
      logging_dir="./logs",
      logging_steps=10,

      metric_for_best_model="eval_loss"
  )

+ # Define the Trainer
  trainer = Trainer(
      model=model,
+     args=args,
+     train_dataset=tokenized_dataset["train"],
+     eval_dataset=tokenized_dataset["validation"]
  )

+ # Start training
+ trainer.train()

+ # Save and push the final model
  trainer.save_model("./model")
  tokenizer.save_pretrained("./model")
+
+ # Optional: Push to HF hub (requires `huggingface-cli login`)
+ if args.push_to_hub:
+     trainer.push_to_hub()
+     tokenizer.push_to_hub("obx0x3/empathy-dementia")
+
+ print("✅ Model trained and saved!")
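
A minimal usage sketch (not part of this commit), assuming the updated script has been run and the fine-tuned weights were written to ./model by trainer.save_model / tokenizer.save_pretrained; the input sentence is invented for illustration and reuses the "emotion: " prefix that preprocess() applies to English examples:

from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the fine-tuned checkpoint saved to ./model by the training script
tokenizer = T5Tokenizer.from_pretrained("./model")
model = T5ForConditionalGeneration.from_pretrained("./model")

# Hypothetical input; English inputs use the "emotion: " prefix, French ones "émotion: "
text = "emotion: I keep forgetting where I put my keys and it frightens me."
inputs = tokenizer(text, return_tensors="pt")
output_ids = model.generate(**inputs, max_length=128)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))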