Spaces:

obx0x3
/

empathy-api

Sleeping

App Files Files Community

obx0x3 committed on Jul 24, 2025

Commit

c4a5e63

verified ·

1 Parent(s): 451ae93

Update train_finetune.py

Browse files

Files changed (1) hide show

train_finetune.py +53 -105

train_finetune.py CHANGED Viewed

@@ -1,107 +1,54 @@
-# train_finetune.py
-import os
-from huggingface_hub import login
 from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
-from datasets import load_dataset, Dataset, DatasetDict
-import pandas as pd
-# Step 1: Log in to Hugging Face (use HF_TOKEN environment variable or prompt)
-HF_TOKEN = os.getenv("HF_TOKEN")
-if HF_TOKEN:
-    login(token=HF_TOKEN)
-else:
-    login()  # Enter token manually if not set
-# Step 2: Load existing dementia datasets
-try:
-    data_files = {
-        "train": "dementia_train_split.json",
-        "validation": "dementia_validation_split.json",
-        "test": "dementia_test_multilang.json"
-    }
-    train_df = pd.read_json(data_files["train"])
-    validation_df = pd.read_json(data_files["validation"])
-    test_df = pd.read_json(data_files["test"])
-    print(f"Dementia datasets loaded: Train={len(train_df)}, Validation={len(validation_df)}, Test={len(test_df)}")
-except Exception as e:
-    print(f"Error loading dementia datasets: {e}")
-    raise
-# Step 3: Load go_emotions dataset (small sample to manage resources)
-try:
-    go_emotions = load_dataset("google-research-datasets/go_emotions", split="train[:1000]")
-    print(f"Go_emotions loaded: {len(go_emotions)} samples")
-except Exception as e:
-    print(f"Error loading go_emotions: {e}")
-    raise
-# Step 4: Map go_emotions to your dataset format (placeholder responses)
-emotion_labels = [
-    "admiration", "amusement", "anger", "annoyance", "approval", "caring", "confusion",
-    "curiosity", "desire", "disappointment", "disapproval", "disgust", "embarrassment",
-    "excitement", "fear", "gratitude", "grief", "joy", "love", "nervousness", "optimism",
-    "pride", "realization", "relief", "remorse", "sadness", "surprise", "neutral"
-]
-def generate_placeholder_response(text, emotion_idx):
-    emotion = emotion_labels[emotion_idx]
-    return f"I hear you're feeling {emotion}. I'm here to support you."
-augmented_data = []
-for example in go_emotions:
-    text = example["text"]
-    emotion_idx = example["labels"][0]
-    response = generate_placeholder_response(text, emotion_idx)
-    augmented_data.append({
-        "input": text,
-        "response": response,
-        "language": "en",
-        "emotion": emotion_labels[emotion_idx]
-    })
-# Step 5: Combine datasets
-augmented_df = pd.DataFrame(augmented_data)
-combined_train_df = pd.concat([train_df, augmented_df], ignore_index=True)
-# Step 6: Create DatasetDict
-dataset = DatasetDict({
-    "train": Dataset.from_pandas(combined_train_df),
-    "validation": Dataset.from_pandas(validation_df),
-    "test": Dataset.from_pandas(test_df)
-})
-# Step 7: Initialize tokenizer and model
-try:
-    tokenizer = T5Tokenizer.from_pretrained("t5-base")
-    model = T5ForConditionalGeneration.from_pretrained("t5-base")
-    print("Model and tokenizer loaded successfully.")
-except Exception as e:
-    print(f"Error loading model/tokenizer: {e}")
-    raise
-# Step 8: Preprocess function
 def preprocess(example):
     prefix = "émotion: " if example.get("language", "en") == "fr" else "emotion: "
-    input_enc = tokenizer(prefix + example["input"], padding="max_length", truncation=True, max_length=128)
-    target_enc = tokenizer(example["response"], padding="max_length", truncation=True, max_length=128)
     input_enc["labels"] = target_enc["input_ids"]
     return input_enc
-# Step 9: Tokenize dataset
-try:
-    tokenized = dataset.map(preprocess, remove_columns=["input", "response", "emotion", "language"])
-    print("Dataset tokenized successfully.")
-except Exception as e:
-    print(f"Error tokenizing dataset: {e}")
-    raise
-# Step 10: Training arguments
 args = TrainingArguments(
     output_dir="./model",
-    num_train_epochs=2,  # Reduced for testing
-    per_device_train_batch_size=2,  # Reduced for resource constraints
-    per_device_eval_batch_size=2,
-    evaluation_strategy="epoch",
     save_strategy="epoch",
     logging_dir="./logs",
     logging_steps=10,
@@ -110,24 +57,25 @@ args = TrainingArguments(
     metric_for_best_model="eval_loss"
 )
-# Step 11: Initialize and train
 trainer = Trainer(
     model=model,
-    args=args,
-    train_dataset=tokenized["train"],
-    eval_dataset=tokenized["validation"]
 )
-try:
-    trainer.train()
-    print("Training completed successfully.")
-except Exception as e:
-    print(f"Training error: {e}")
-    raise
-# Step 12: Save and push to Hugging Face Hub
 trainer.save_model("./model")
 tokenizer.save_pretrained("./model")
-model.push_to_hub("obx0x3/empathy-dementia")
-tokenizer.push_to_hub("obx0x3/empathy-dementia")
-print("✅ Model pushed to Hugging Face Hub.")

 from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
+from datasets import load_dataset
+from datasets import DatasetDict
+import os
+# Load tokenizer and model
+# Model and tokenizer are both loaded from the same "t5-base" checkpoint name.
+model = T5ForConditionalGeneration.from_pretrained("t5-base")
+tokenizer = T5Tokenizer.from_pretrained("t5-base")
+# Load JSON datasets from local files
+# Each key names a split; the values are bare filenames with no directory component.
+data_files = {
+    "train": "dementia_train_split.json",
+    "validation": "dementia_validation_split.json",
+    "test": "dementia_test_multilang.json"
+}
+dataset = load_dataset("json", data_files=data_files)
+# Convert to DatasetDict (required for .map with remove_columns)
+# NOTE(review): load_dataset called with a dict of data_files and no `split`
+# presumably already returns a DatasetDict, so this wrap looks redundant —
+# confirm against the datasets documentation before removing it.
+dataset = DatasetDict(dataset)
+# Preprocessing function to tokenize inputs and outputs
 def preprocess(example):
+    """Tokenize one example into encoder inputs and decoder labels.
+
+    Reads ``example["input"]`` and ``example["response"]``; the optional
+    ``example["language"]`` selects the task prefix ("émotion: " for "fr",
+    "emotion: " otherwise). Both sides are padded/truncated to 128 tokens.
+
+    Returns the tokenizer output for the prefixed input with an added
+    ``"labels"`` list built from the tokenized response.
+    """
     prefix = "émotion: " if example.get("language", "en") == "fr" else "emotion: "
+    input_enc = tokenizer(
+        prefix + example["input"],
+        padding="max_length",
+        truncation=True,
+        max_length=128
+    )
+    target_enc = tokenizer(
+        example["response"],
+        padding="max_length",
+        truncation=True,
+        max_length=128
+    )
+    # The targets are padded to max_length, so mask the padding positions with
+    # -100; otherwise the loss is also computed on pad tokens and the model is
+    # trained to emit padding.
+    pad_id = tokenizer.pad_token_id
+    input_enc["labels"] = [
+        token_id if token_id != pad_id else -100
+        for token_id in target_enc["input_ids"]
+    ]
     return input_enc
+# Tokenize and clean up metadata
+# preprocess is mapped without batched=True, so it receives one example at a
+# time; the raw columns listed below are dropped from the result.
+# NOTE(review): presumably every split contains all of these columns — verify
+# against the JSON files, since a column missing from a split would likely
+# make .map fail.
+tokenized_dataset = dataset.map(
+    preprocess,
+    remove_columns=["input", "response", "emotion", "intent", "tags", "care_mode", "language", "difficulty", "is_dementia_related"]
+)
+# Define training arguments
 args = TrainingArguments(
     output_dir="./model",
+    num_train_epochs=4,
+    per_device_train_batch_size=4,
+    per_device_eval_batch_size=4,
+    eval_strategy="epoch",
     save_strategy="epoch",
     logging_dir="./logs",
     logging_steps=10,
     metric_for_best_model="eval_loss"
 )
+# Define the Trainer
+# The TrainingArguments instance is bound to `args`; no `training_args` name
+# exists in this script, so `args` is what must be passed and queried below.
 trainer = Trainer(
     model=model,
+    args=args,
+    train_dataset=tokenized_dataset["train"],
+    eval_dataset=tokenized_dataset["validation"]
 )
+# Start training
+trainer.train()
+# Save and push the final model
 trainer.save_model("./model")
 tokenizer.save_pretrained("./model")
+# Optional: Push to HF hub (requires `huggingface-cli login`)
+# Only runs when push_to_hub was enabled on the TrainingArguments.
+if args.push_to_hub:
+    trainer.push_to_hub()
+    tokenizer.push_to_hub("obx0x3/empathy-dementia")
+print("✅ Model trained and saved!")