#!/bin/bash
#
# Train NFQA Model with Automatic Data Splitting
#
# This script trains the NFQA classification model using a single combined
# dataset that will be automatically split into train/val/test sets.
#
# Usage:
#   bash run_training_auto.sh
#
# Or with custom parameters:
#   bash run_training_auto.sh --epochs 15 --batch-size 32
#

# Stop at the first command that returns a non-zero status.
set -o errexit

# --- Data locations -----------------------------------------------------------
# Both paths are relative, so they resolve against the directory the script
# is launched from.
INPUT_FILE="../output/webfaq_nfqa_combined_highquality.jsonl"
OUTPUT_DIR="../output/training/nfqa_model_auto"

# --- Base model ---------------------------------------------------------------
MODEL_NAME="xlm-roberta-base"

# --- Training hyperparameters (forwarded as flags to train_nfqa_model.py) -----
EPOCHS="6"
BATCH_SIZE="16"
LEARNING_RATE="2e-5"
MAX_LENGTH="128"
WARMUP_STEPS="500"
WEIGHT_DECAY="0.1"
DROPOUT="0.2"

# --- Automatic split sizes, expressed as fractions ----------------------------
TEST_SIZE="0.2"
VAL_SIZE="0.1"

# Print the configuration banner.
#
# Every value handed to the trainer is listed, including the warmup steps.
# The percentages next to the split sizes are derived from the variables so
# they cannot drift from the configured fractions; if awk is unavailable a
# "?" placeholder is shown rather than aborting the run under `set -e`.
# Extra command-line arguments are echoed as well, because they are forwarded
# to the trainer in addition to the values listed here.
test_pct=$(awk -v f="$TEST_SIZE" 'BEGIN { printf "%g", f * 100 }' 2>/dev/null) || test_pct="?"
val_pct=$(awk -v f="$VAL_SIZE" 'BEGIN { printf "%g", f * 100 }' 2>/dev/null) || val_pct="?"

echo "================================================================================"
echo "NFQA Model Training - Automatic Split Mode"
echo "================================================================================"
echo ""
echo "Training Configuration:"
printf '  %-17s %s\n' "Input file:"       "$INPUT_FILE"
printf '  %-17s %s\n' "Output directory:" "$OUTPUT_DIR"
printf '  %-17s %s\n' "Model:"            "$MODEL_NAME"
printf '  %-17s %s\n' "Epochs:"           "$EPOCHS"
printf '  %-17s %s\n' "Batch size:"       "$BATCH_SIZE"
printf '  %-17s %s\n' "Learning rate:"    "$LEARNING_RATE"
printf '  %-17s %s\n' "Max length:"       "$MAX_LENGTH"
printf '  %-17s %s\n' "Warmup steps:"     "$WARMUP_STEPS"
printf '  %-17s %s\n' "Weight decay:"     "$WEIGHT_DECAY"
printf '  %-17s %s\n' "Dropout:"          "$DROPOUT"
printf '  %-17s %s\n' "Test split:"       "$TEST_SIZE (${test_pct}%)"
printf '  %-17s %s\n' "Val split:"        "$VAL_SIZE (${val_pct}%)"
if [ "$#" -gt 0 ]; then
    # Arguments given on the command line are passed through to the trainer.
    printf '  %-17s %s\n' "Extra arguments:" "$*"
fi
echo ""
echo "================================================================================"
echo ""

# Refuse to start without the combined dataset, and tell the user how to
# build it.
if [[ ! -f "$INPUT_FILE" ]]; then
    cat <<EOF
❌ Error: Input file not found: $INPUT_FILE

Please ensure the combined dataset exists.
You can create it by running:
  cd ../annotator
  python combine_datasets.py
EOF
    exit 1
fi

# Make sure the destination for the training artefacts exists.
mkdir -p "$OUTPUT_DIR"

# Run training.
#
# The python command is guarded with `||` so that a non-zero exit does not
# trigger `set -e` and terminate the script before the failure report below
# can be printed. The exit status is captured and later propagated to the
# caller.
train_status=0
python train_nfqa_model.py \
    --input "$INPUT_FILE" \
    --output-dir "$OUTPUT_DIR" \
    --model-name "$MODEL_NAME" \
    --epochs "$EPOCHS" \
    --batch-size "$BATCH_SIZE" \
    --learning-rate "$LEARNING_RATE" \
    --max-length "$MAX_LENGTH" \
    --warmup-steps "$WARMUP_STEPS" \
    --weight-decay "$WEIGHT_DECAY" \
    --dropout "$DROPOUT" \
    --test-size "$TEST_SIZE" \
    --val-size "$VAL_SIZE" \
    "$@" || train_status=$?  # "$@" forwards any additional command-line arguments

# On failure: report on stderr and exit with the trainer's own status.
if [ "$train_status" -ne 0 ]; then
    cat >&2 <<'EOF'

================================================================================
❌ Training failed!
================================================================================

Please check the error messages above and try again.
EOF
    exit "$train_status"
fi

# On success: summarise where the artefacts are and what to look at next.
cat <<EOF

================================================================================
✅ Training completed successfully!
================================================================================

Model saved to: $OUTPUT_DIR

Generated files:
  - best_model/                  (best checkpoint based on validation F1)
  - final_model/                 (final epoch checkpoint)
  - training_history.json        (training metrics)
  - training_curves.png          (loss/accuracy/F1 plots)
  - test_results.json            (final test metrics)
  - classification_report.txt    (per-category performance)
  - confusion_matrix.png         (confusion matrix visualization)

Next steps:
  1. Review training curves: $OUTPUT_DIR/training_curves.png
  2. Check test results: $OUTPUT_DIR/test_results.json
  3. Analyze confusion matrix: $OUTPUT_DIR/confusion_matrix.png
  4. Deploy model from: $OUTPUT_DIR/best_model/

EOF