#!/bin/bash
#
# Seriguela v3 model training launcher.
#
# Runs a series of pre-flight checks (project directory, virtual environment,
# GPU availability, training/validation CSVs, end-of-expression markers,
# config file), then invokes scripts/train.py and reports the outcome.
#
# Usage: run without arguments; every path is derived from $HOME/seriguela.
#
# Exit status:
#   0         training finished successfully
#   non-zero  a pre-flight check failed, or scripts/train.py failed (in which
#             case its exit code is propagated)

# Abort on any unhandled command failure. Commands whose failure is expected
# are guarded explicitly below (if-conditions or `|| ...`).
set -e

echo "=================================================="
echo "Seriguela v3 Model Training"
echo "=================================================="
echo "Start time: $(date)"
echo ""

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
PROJECT_DIR="${HOME}/seriguela"
OUTPUT_DIR="${PROJECT_DIR}/output/Se124M_700K_infix_v3"
CONFIG_FILE="${PROJECT_DIR}/configs/training_v3.json"
DATA_DIR="${PROJECT_DIR}/data/processed/700K_fixed"
TRAIN_FILE="${DATA_DIR}/train_700K.csv"
VALIDATION_FILE="${DATA_DIR}/validation_700K.csv"

# ---------------------------------------------------------------------------
# Project directory
# ---------------------------------------------------------------------------
if [ ! -d "$PROJECT_DIR" ]; then
  echo "ERROR: Project directory not found: $PROJECT_DIR"
  exit 1
fi

# All relative paths below (venv, scripts/train.py) are resolved from here.
cd "$PROJECT_DIR"

# ---------------------------------------------------------------------------
# Virtual environment: prefer ./venv, fall back to ./.seriguela
# ---------------------------------------------------------------------------
echo "Activating virtual environment..."
if [ -d "venv" ]; then
  source venv/bin/activate
elif [ -d ".seriguela" ]; then
  source .seriguela/bin/activate
else
  echo "ERROR: Virtual environment not found!"
  exit 1
fi

# ---------------------------------------------------------------------------
# GPU check
# ---------------------------------------------------------------------------
echo ""
echo "Checking GPU availability..."
python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}'); print(f'GPU count: {torch.cuda.device_count()}'); print(f'GPU name: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else \"N/A\"}')"

# sys.exit is used instead of the bare exit() helper, which is only injected
# by the `site` module and is not guaranteed to exist.
if ! python -c "import sys, torch; sys.exit(0 if torch.cuda.is_available() else 1)"; then
  echo "WARNING: GPU not detected! Training will be slow on CPU."
  # If stdin is closed or not a terminal, `read` fails on EOF; treat that as
  # "no" explicitly instead of letting `set -e` abort silently.
  read -r -n 1 -p "Continue anyway? (y/n) " || REPLY=""
  echo
  if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    exit 1
  fi
fi

# ---------------------------------------------------------------------------
# Training / validation data
# ---------------------------------------------------------------------------
echo ""
echo "Verifying training data..."
if [ ! -f "$TRAIN_FILE" ]; then
  echo "ERROR: Training data not found: $TRAIN_FILE"
  echo "Please ensure data preparation step was completed."
  exit 1
fi

if [ ! -f "$VALIDATION_FILE" ]; then
  echo "ERROR: Validation data not found: $VALIDATION_FILE"
  exit 1
fi

# ---------------------------------------------------------------------------
# End-marker sanity check (first 100 lines only)
# ---------------------------------------------------------------------------
echo "Checking for end markers in training data..."
# -F: the marker is a literal string, not a regex.
# `|| true`: grep -c prints 0 but exits 1 when nothing matches, which would
# otherwise trip `set -e` before the explicit check below.
MARKER_COUNT=$(head -n 100 "$TRAIN_FILE" | grep -cF "<|endofex|>" || true)
if [ "$MARKER_COUNT" -eq 0 ]; then
  echo "ERROR: No <|endofex|> markers found in training data!"
  echo "Please run data preparation script first."
  exit 1
else
  echo "✓ End markers detected in training data"
fi

# ---------------------------------------------------------------------------
# Config file
# ---------------------------------------------------------------------------
if [ ! -f "$CONFIG_FILE" ]; then
  echo "ERROR: Config file not found: $CONFIG_FILE"
  exit 1
fi

echo ""
echo "Configuration:"
echo " Config file: $CONFIG_FILE"
echo " Output directory: $OUTPUT_DIR"
echo " Training data: $TRAIN_FILE"
echo " Validation data: $VALIDATION_FILE"
echo ""

mkdir -p "$OUTPUT_DIR"

# ---------------------------------------------------------------------------
# Weights & Biases
# ---------------------------------------------------------------------------
export WANDB_PROJECT="seriguela_v3"
export WANDB_RUN_NAME="v3_proper_markers_$(date +%Y%m%d_%H%M%S)"

# The probe must exit non-zero when no key is available; merely evaluating
# the attribute would succeed whenever wandb is importable.
# NOTE(review): assumes wandb.api.api_key is falsy when no login / API key is
# configured -- confirm against the installed wandb version.
# stderr is silenced on purpose: a missing wandb package is reported by the
# warning below rather than by a Python traceback.
if ! python -c "import sys, wandb; sys.exit(0 if wandb.api.api_key else 1)" 2>/dev/null; then
  echo "WARNING: Weights & Biases not configured. Training will proceed without W&B logging."
  echo "To enable W&B: wandb login"
fi

# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
echo ""
echo "=================================================="
echo "Starting training..."
echo "=================================================="
echo ""

# Capture the exit status with `||` instead of reading $? on the following
# line: under `set -e` a failing command would terminate the script at once,
# so the failure report at the bottom would never be reached.
TRAIN_EXIT_CODE=0
python scripts/train.py \
  --config "$CONFIG_FILE" \
  --output_dir "$OUTPUT_DIR" \
  --use_local_csvs \
  --train_file "$TRAIN_FILE" \
  --validation_file "$VALIDATION_FILE" \
  --wandb_project "$WANDB_PROJECT" \
  --wandb_run_name "$WANDB_RUN_NAME" || TRAIN_EXIT_CODE=$?

# ---------------------------------------------------------------------------
# Report
# ---------------------------------------------------------------------------
echo ""
echo "=================================================="
echo "Training completed"
echo "=================================================="
echo "End time: $(date)"
echo "Exit code: $TRAIN_EXIT_CODE"
echo ""

if [ "$TRAIN_EXIT_CODE" -eq 0 ]; then
  echo "✓ Training completed successfully!"
  echo ""
  echo "Model saved to: $OUTPUT_DIR"
  echo ""
  echo "Next steps:"
  echo "1. Run evaluation: python scripts/evaluate.py --model_path $OUTPUT_DIR"
  echo "2. Test generation: python scripts/generate.py --model_path $OUTPUT_DIR --num_generations 50 --validate"
  echo "3. Push to Hub (if configured): huggingface-cli upload augustocsc/Se124M_700K_infix_v3 $OUTPUT_DIR"
else
  echo "✗ Training failed with exit code $TRAIN_EXIT_CODE"
  echo "Check logs above for error details."
  exit "$TRAIN_EXIT_CODE"
fi