#!/bin/bash
#
# Train NFQA Model with Pre-Split Datasets
#
# This script trains the NFQA classification model using manually split
# train/validation/test datasets for balanced training.
#
# Usage (run from the directory containing this script; all paths are relative):
#   bash run_training_manual.sh
#
# Or with custom parameters:
#   bash run_training_manual.sh --epochs 15 --batch-size 32
#

set -o errexit  # Long form of `set -e`: abort as soon as a command fails

# Dataset splits and the artifact directory. All locations are relative to
# the current working directory and share one root.
output_root="../output"
TRAIN_FILE="${output_root}/train_balanced.jsonl"
VAL_FILE="${output_root}/val_balanced.jsonl"
TEST_FILE="${output_root}/test_balanced.jsonl"
OUTPUT_DIR="${output_root}/training/nfqa_model_balanced"

# Base checkpoint to fine-tune
MODEL_NAME="xlm-roberta-base"

# Default hyperparameters handed to the training script
EPOCHS=6
BATCH_SIZE=16
LEARNING_RATE=2e-5
MAX_LENGTH=128
WARMUP_RATIO=0.1
WEIGHT_DECAY=0.1
DROPOUT=0.2

# Print the configuration banner in a single call (one argument per output line).
# The values shown are the script's own settings; any extra command-line
# arguments are forwarded to the trainer untouched and are not reflected here.
printf '%s\n' \
    "================================================================================" \
    "NFQA Model Training - Manual Split Mode" \
    "================================================================================" \
    "" \
    "Training Configuration:" \
    "  Train file:       $TRAIN_FILE" \
    "  Validation file:  $VAL_FILE" \
    "  Test file:        $TEST_FILE" \
    "  Output directory: $OUTPUT_DIR" \
    "  Model:            $MODEL_NAME" \
    "  Epochs:           $EPOCHS" \
    "  Batch size:       $BATCH_SIZE" \
    "  Learning rate:    $LEARNING_RATE" \
    "  Max length:       $MAX_LENGTH" \
    "  Warmup ratio:     $WARMUP_RATIO" \
    "  Weight decay:     $WEIGHT_DECAY" \
    "  Dropout:          $DROPOUT" \
    "" \
    "================================================================================" \
    ""

# Verify that every dataset split exists before starting a training run.
# Diagnostics are written to stderr so they remain visible when stdout is
# redirected to a log file or a pipe.
if [ ! -f "$TRAIN_FILE" ]; then
    {
        echo "❌ Error: Training file not found: $TRAIN_FILE"
        echo ""
        echo "Please run the data splitting script first:"
        echo "  cd ../cleaning"
        echo "  python split_train_test_val.py --input ../output/webfaq_nfqa_combined_highquality.jsonl"
    } >&2
    exit 1
fi

if [ ! -f "$VAL_FILE" ]; then
    echo "❌ Error: Validation file not found: $VAL_FILE" >&2
    exit 1
fi

if [ ! -f "$TEST_FILE" ]; then
    echo "❌ Error: Test file not found: $TEST_FILE" >&2
    exit 1
fi

# Create the output directory (and any missing parents); `--` keeps a path
# that starts with a dash from being parsed as an option.
mkdir -p -- "$OUTPUT_DIR"

# Run training and report the outcome.
#
# The trainer is invoked directly as the `if` condition rather than being
# followed by a `$?` check: with `set -e` active, a bare failing command
# aborts the script immediately, so a later status check would never see a
# non-zero value and the failure branch would be unreachable.
#
# Any extra command-line arguments are appended after the defaults via "$@".
# NOTE(review): whether a repeated flag overrides the default depends on how
# train_nfqa_model.py parses its arguments — confirm.
if python train_nfqa_model.py \
    --train "$TRAIN_FILE" \
    --val "$VAL_FILE" \
    --test "$TEST_FILE" \
    --output-dir "$OUTPUT_DIR" \
    --model-name "$MODEL_NAME" \
    --epochs "$EPOCHS" \
    --batch-size "$BATCH_SIZE" \
    --learning-rate "$LEARNING_RATE" \
    --max-length "$MAX_LENGTH" \
    --warmup-ratio "$WARMUP_RATIO" \
    --weight-decay "$WEIGHT_DECAY" \
    --dropout "$DROPOUT" \
    "$@"; then
    echo ""
    echo "================================================================================"
    echo "✅ Training completed successfully!"
    echo "================================================================================"
    echo ""
    echo "Model saved to: $OUTPUT_DIR"
    echo ""
    echo "Generated files:"
    echo "  - best_model/                  (best checkpoint based on validation F1)"
    echo "  - final_model/                 (final epoch checkpoint)"
    echo "  - training_history.json        (training metrics)"
    echo "  - training_curves.png          (loss/accuracy/F1 plots)"
    echo "  - test_results.json            (final test metrics)"
    echo "  - classification_report.txt    (per-category performance)"
    echo "  - confusion_matrix.png         (confusion matrix visualization)"
    echo ""
    echo "Next steps:"
    echo "  1. Review training curves: $OUTPUT_DIR/training_curves.png"
    echo "  2. Check test results: $OUTPUT_DIR/test_results.json"
    echo "  3. Analyze confusion matrix: $OUTPUT_DIR/confusion_matrix.png"
    echo "  4. Deploy model from: $OUTPUT_DIR/best_model/"
    echo ""
else
    # Failure banner goes to stderr, alongside the trainer's own error output.
    {
        echo ""
        echo "================================================================================"
        echo "❌ Training failed!"
        echo "================================================================================"
        echo ""
        echo "Please check the error messages above and try again."
    } >&2
    exit 1
fi