#!/usr/bin/env bash
# Data Filtering Pipeline for GUI-Shift
#
# Thin wrapper that parses command-line options and forwards them to
# src/filtering/filter_data.py, creating the output file's parent directory
# first.
#
# The Python entry point is referenced by a relative path, so run this script
# from the directory that contains src/.
#
# Options (each takes exactly one value and is passed through unchanged):
#   --input_file FILE        required
#   --output_file FILE       required
#   --model_path PATH        default: Qwen/Qwen2.5-VL-7B-Instruct
#   --num_generations N      default: 8
#   --temperature T          default: 0.9
#   --device DEVICE          default: cuda
#   --seed SEED              default: 42
#   -h, --help               print the usage line and exit 0
#
# Exit status: 1 on an unknown option, an option missing its value, or a
# missing required option; otherwise any failing command aborts the script
# with that command's status (set -e).
set -euo pipefail

# Default values
INPUT_FILE=""
OUTPUT_FILE=""
MODEL_PATH="Qwen/Qwen2.5-VL-7B-Instruct"
NUM_GENERATIONS=8
TEMPERATURE=0.9
DEVICE="cuda"
SEED=42

# Print the one-line usage example to stdout.
usage() {
  echo "Usage: bash scripts/filter_data.sh --input_file ./data/k1_transition.jsonl --output_file ./data/filtered/k1_transition_filtered.jsonl --model_path Qwen/Qwen2.5-VL-7B-Instruct"
}

# Abort with a diagnostic when an option is the last argument and therefore
# has no value. Without this check `shift 2` fails and `set -e` terminates
# the script silently.
# Arguments: the remaining positional parameters ("$@"), option name first.
require_value() {
  if [[ $# -lt 2 ]]; then
    echo "Error: option $1 requires a value" >&2
    usage >&2
    exit 1
  fi
}

# Parse arguments
while [[ $# -gt 0 ]]; do
  case "$1" in
    --input_file)
      require_value "$@"
      INPUT_FILE="$2"
      shift 2
      ;;
    --output_file)
      require_value "$@"
      OUTPUT_FILE="$2"
      shift 2
      ;;
    --model_path)
      require_value "$@"
      MODEL_PATH="$2"
      shift 2
      ;;
    --num_generations)
      require_value "$@"
      NUM_GENERATIONS="$2"
      shift 2
      ;;
    --temperature)
      require_value "$@"
      TEMPERATURE="$2"
      shift 2
      ;;
    --device)
      require_value "$@"
      DEVICE="$2"
      shift 2
      ;;
    --seed)
      require_value "$@"
      SEED="$2"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

# Both file options are mandatory; everything else has a default.
if [[ -z "$INPUT_FILE" || -z "$OUTPUT_FILE" ]]; then
  echo "Error: --input_file and --output_file are required" >&2
  usage >&2
  exit 1
fi

echo "=== GUI-Shift Data Filtering ==="
echo "Input: $INPUT_FILE"
echo "Output: $OUTPUT_FILE"
echo "Model: $MODEL_PATH"
echo "N generations: $NUM_GENERATIONS"
echo "Temperature: $TEMPERATURE"
echo "Device: $DEVICE"
echo "Seed: $SEED"
echo ""

# Make sure the destination directory exists before the Python step runs.
# `--` keeps paths that start with '-' from being parsed as options.
mkdir -p -- "$(dirname -- "$OUTPUT_FILE")"

python src/filtering/filter_data.py \
  --input_file "$INPUT_FILE" \
  --output_file "$OUTPUT_FILE" \
  --model_path "$MODEL_PATH" \
  --num_generations "$NUM_GENERATIONS" \
  --temperature "$TEMPERATURE" \
  --device "$DEVICE" \
  --seed "$SEED"

# Only reached when the Python step exited 0 (set -e aborts otherwise).
echo ""
echo "Filtering complete! Output: $OUTPUT_FILE"