gui-shift / scripts /filter_data.sh
luanns's picture
Upload scripts/filter_data.sh
274c457 verified
#!/bin/bash
# Data Filtering Pipeline for GUI-Shift
#
# This section sets the option defaults and parses the command line into the
# shell variables that the rest of the script reads.
#
# Options (each takes exactly one value):
#   --input_file PATH       no default (checked later in the script)
#   --output_file PATH      no default (checked later in the script)
#   --model_path NAME       default: Qwen/Qwen2.5-VL-7B-Instruct
#   --num_generations N     default: 8
#   --temperature T         default: 0.9
#   --device DEVICE         default: cuda
#   --seed SEED             default: 42
#
# -e: stop on the first failing command; -u: unset variables are errors;
# pipefail: a pipeline fails if any of its stages fails.
set -euo pipefail

# Default values
INPUT_FILE=""
OUTPUT_FILE=""
MODEL_PATH="Qwen/Qwen2.5-VL-7B-Instruct"
NUM_GENERATIONS=8
TEMPERATURE=0.9
DEVICE="cuda"
SEED=42

#######################################
# Abort with a clear message when an option is missing its value.
# Without this check, `shift 2` fails on a trailing flag and errexit
# terminates the script without printing anything.
# Arguments: $1 - option name
#            $2 - number of arguments remaining, the option itself included
# Outputs:   error message on stderr when the value is missing
# Returns:   0 when a value is present; exits 1 otherwise
#######################################
require_value() {
  if (( $2 < 2 )); then
    echo "Error: $1 requires a value" >&2
    exit 1
  fi
}

#######################################
# Parse command-line options into the global configuration variables.
# Globals:   INPUT_FILE, OUTPUT_FILE, MODEL_PATH, NUM_GENERATIONS,
#            TEMPERATURE, DEVICE, SEED (written)
# Arguments: the script's command-line arguments
# Outputs:   error message on stderr for an unknown option or missing value
# Returns:   0 on success; exits 1 on any parse error
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --input_file)
        require_value "$1" "$#"
        INPUT_FILE="$2"
        shift 2
        ;;
      --output_file)
        require_value "$1" "$#"
        OUTPUT_FILE="$2"
        shift 2
        ;;
      --model_path)
        require_value "$1" "$#"
        MODEL_PATH="$2"
        shift 2
        ;;
      --num_generations)
        require_value "$1" "$#"
        NUM_GENERATIONS="$2"
        shift 2
        ;;
      --temperature)
        require_value "$1" "$#"
        TEMPERATURE="$2"
        shift 2
        ;;
      --device)
        require_value "$1" "$#"
        DEVICE="$2"
        shift 2
        ;;
      --seed)
        require_value "$1" "$#"
        SEED="$2"
        shift 2
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
}

# Parse arguments
parse_args "$@"
# Both --input_file and --output_file are mandatory: stop with a usage hint
# before any work is started. The messages go to stderr so that they are not
# mixed into stdout when the caller captures or pipes the script's output.
if [[ -z "$INPUT_FILE" || -z "$OUTPUT_FILE" ]]; then
  {
    echo "Error: --input_file and --output_file are required"
    echo "Usage: bash scripts/filter_data.sh --input_file ./data/k1_transition.jsonl --output_file ./data/filtered/k1_transition_filtered.jsonl --model_path Qwen/Qwen2.5-VL-7B-Instruct"
  } >&2
  exit 1
fi
# Print the effective configuration so a run can be reproduced from its log.
# The seed is included because it is forwarded to the filter like every other
# setting shown here.
echo "=== GUI-Shift Data Filtering ==="
echo "Input: $INPUT_FILE"
echo "Output: $OUTPUT_FILE"
echo "Model: $MODEL_PATH"
echo "N generations: $NUM_GENERATIONS"
echo "Temperature: $TEMPERATURE"
echo "Device: $DEVICE"
echo "Seed: $SEED"
echo ""

# Make sure the directory that will hold the output file exists.
mkdir -p "$(dirname "$OUTPUT_FILE")"

# Forward every setting to the Python filter. An array keeps each value a
# single argument even when it contains spaces.
filter_args=(
  --input_file "$INPUT_FILE"
  --output_file "$OUTPUT_FILE"
  --model_path "$MODEL_PATH"
  --num_generations "$NUM_GENERATIONS"
  --temperature "$TEMPERATURE"
  --device "$DEVICE"
  --seed "$SEED"
)

# The script path is relative, so this must be run from the directory that
# contains src/ (the usage text invokes it as `bash scripts/filter_data.sh`).
python src/filtering/filter_data.py "${filter_args[@]}"

# errexit is enabled at the top of the script, so this point is reached only
# when the Python step exited with status 0.
echo ""
echo "Filtering complete! Output: $OUTPUT_FILE"