#!/bin/bash
# Data Filtering Pipeline for GUI-Shift
#
# Parses command-line options, prints a summary of the run configuration,
# creates the parent directory of the output file, and then invokes
# src/filtering/filter_data.py, forwarding every option to it.
#
# Usage:
#   bash scripts/filter_data.sh --input_file FILE --output_file FILE [options]
#
# Options (each takes exactly one value and is forwarded to filter_data.py):
#   --input_file PATH       required
#   --output_file PATH      required
#   --model_path NAME       default: Qwen/Qwen2.5-VL-7B-Instruct
#   --num_generations N     default: 8
#   --temperature T         default: 0.9
#   --device DEV            default: cuda
#   --seed N                default: 42
#   -h, --help              print the usage line and exit 0
#
# Exit status: 1 on an unknown option, an option without a value, or a
# missing required option; otherwise whatever `python` returns.
#
# NOTE: the Python entry point is referenced by a relative path, so the
# script has to be started from the directory that contains src/.

# -e: stop on the first failing command; -u: treat unset variables as errors;
# pipefail: a pipeline fails if any stage fails.
set -euo pipefail

# Default values
INPUT_FILE=""
OUTPUT_FILE=""
MODEL_PATH="Qwen/Qwen2.5-VL-7B-Instruct"
NUM_GENERATIONS=8
TEMPERATURE=0.9
DEVICE="cuda"
SEED=42

# Print the one-line usage example to stdout (callers redirect as needed).
usage() {
  echo "Usage: bash scripts/filter_data.sh --input_file ./data/k1_transition.jsonl --output_file ./data/filtered/k1_transition_filtered.jsonl --model_path Qwen/Qwen2.5-VL-7B-Instruct"
}

# Abort with a diagnostic when an option is the last argument and therefore
# has no value. Without this check `shift 2` would fail and `set -e` would
# terminate the script without any message.
#   $1 - option name being parsed
#   $2 - number of arguments still unparsed (including the option itself)
require_value() {
  if (( $2 < 2 )); then
    echo "Error: $1 requires a value" >&2
    usage >&2
    exit 1
  fi
}

# Parse arguments
while [[ $# -gt 0 ]]; do
  case "$1" in
    --input_file)
      require_value "$1" "$#"
      INPUT_FILE="$2"
      shift 2
      ;;
    --output_file)
      require_value "$1" "$#"
      OUTPUT_FILE="$2"
      shift 2
      ;;
    --model_path)
      require_value "$1" "$#"
      MODEL_PATH="$2"
      shift 2
      ;;
    --num_generations)
      require_value "$1" "$#"
      NUM_GENERATIONS="$2"
      shift 2
      ;;
    --temperature)
      require_value "$1" "$#"
      TEMPERATURE="$2"
      shift 2
      ;;
    --device)
      require_value "$1" "$#"
      DEVICE="$2"
      shift 2
      ;;
    --seed)
      require_value "$1" "$#"
      SEED="$2"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

# Both paths are mandatory; an explicitly empty value counts as missing.
if [[ -z "$INPUT_FILE" || -z "$OUTPUT_FILE" ]]; then
  echo "Error: --input_file and --output_file are required" >&2
  usage >&2
  exit 1
fi

echo "=== GUI-Shift Data Filtering ==="
echo "Input: $INPUT_FILE"
echo "Output: $OUTPUT_FILE"
echo "Model: $MODEL_PATH"
echo "N generations: $NUM_GENERATIONS"
echo "Temperature: $TEMPERATURE"
echo "Device: $DEVICE"
echo "Seed: $SEED"
echo ""

# Make sure the destination directory exists before the Python script runs.
mkdir -p "$(dirname "$OUTPUT_FILE")"

python src/filtering/filter_data.py \
  --input_file "$INPUT_FILE" \
  --output_file "$OUTPUT_FILE" \
  --model_path "$MODEL_PATH" \
  --num_generations "$NUM_GENERATIONS" \
  --temperature "$TEMPERATURE" \
  --device "$DEVICE" \
  --seed "$SEED"

echo ""
echo "Filtering complete! Output: $OUTPUT_FILE"