File size: 1,848 Bytes
274c457
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/bin/bash
# Data Filtering Pipeline for GUI-Shift

# Abort on any command failure, on use of an unset variable, and on a failure
# anywhere inside a pipeline.
set -euo pipefail

# Default values. Each one is overridden by the command-line flag of the same
# (lower-case) name; INPUT_FILE and OUTPUT_FILE start out empty.
INPUT_FILE=""
OUTPUT_FILE=""
MODEL_PATH="Qwen/Qwen2.5-VL-7B-Instruct"
NUM_GENERATIONS=8
TEMPERATURE=0.9
DEVICE="cuda"
SEED=42

#######################################
# Parse "--flag value" pairs into the configuration globals above.
# Globals:   INPUT_FILE, OUTPUT_FILE, MODEL_PATH, NUM_GENERATIONS,
#            TEMPERATURE, DEVICE, SEED (written)
# Arguments: the command-line arguments to parse
# Outputs:   an error message on stderr for an unknown flag or for a flag
#            that is not followed by a value
# Returns:   0 on success; exits the script with status 1 on a usage error
#######################################
parse_args() {
    local target
    while [[ $# -gt 0 ]]; do
        # Map the flag to the name of the global it configures.
        case "$1" in
            --input_file) target=INPUT_FILE ;;
            --output_file) target=OUTPUT_FILE ;;
            --model_path) target=MODEL_PATH ;;
            --num_generations) target=NUM_GENERATIONS ;;
            --temperature) target=TEMPERATURE ;;
            --device) target=DEVICE ;;
            --seed) target=SEED ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac

        # Every flag takes exactly one value. Without this check a trailing
        # flag would make `shift 2` fail and `set -e` would end the script
        # without any explanation.
        if [[ $# -lt 2 ]]; then
            echo "Error: option $1 requires a value" >&2
            exit 1
        fi

        # Assign by name; the value is stored verbatim (spaces preserved).
        printf -v "$target" '%s' "$2"
        shift 2
    done
}

parse_args "$@"

# Both paths are mandatory: stop before doing any work if either is empty.
# The diagnostics go to stderr so they are not mixed into captured stdout.
if [[ -z "$INPUT_FILE" || -z "$OUTPUT_FILE" ]]; then
    {
        echo "Error: --input_file and --output_file are required"
        echo "Usage: bash scripts/filter_data.sh --input_file ./data/k1_transition.jsonl --output_file ./data/filtered/k1_transition_filtered.jsonl --model_path Qwen/Qwen2.5-VL-7B-Instruct"
    } >&2
    exit 1
fi

#######################################
# Print the effective run configuration as a banner.
# Globals: INPUT_FILE, OUTPUT_FILE, MODEL_PATH, NUM_GENERATIONS,
#          TEMPERATURE, DEVICE, SEED (read)
# Outputs: the banner on stdout, followed by one blank line
#######################################
print_config() {
    echo "=== GUI-Shift Data Filtering ==="
    echo "Input: $INPUT_FILE"
    echo "Output: $OUTPUT_FILE"
    echo "Model: $MODEL_PATH"
    echo "N generations: $NUM_GENERATIONS"
    echo "Temperature: $TEMPERATURE"
    echo "Device: $DEVICE"
    # The seed is forwarded to the filter like every other setting, so it is
    # reported here as well.
    echo "Seed: $SEED"
    echo ""
}

print_config

# Make sure the directory that will receive the output file exists.
output_dir="$(dirname "$OUTPUT_FILE")"
mkdir -p "$output_dir"

# Arguments forwarded to the Python filtering script, one flag/value pair
# per line.
filter_args=(
    --input_file "$INPUT_FILE"
    --output_file "$OUTPUT_FILE"
    --model_path "$MODEL_PATH"
    --num_generations "$NUM_GENERATIONS"
    --temperature "$TEMPERATURE"
    --device "$DEVICE"
    --seed "$SEED"
)
python src/filtering/filter_data.py "${filter_args[@]}"

# Reached only when the Python step succeeded (the script runs under set -e).
echo ""
echo "Filtering complete! Output: $OUTPUT_FILE"