File size: 6,429 Bytes
766ea9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/bin/bash
#
# SLURM batch job: activates the ./mse_env virtual environment and runs
# mse_ollama_timer.py. All paths are relative to the job's working directory,
# so submit it from the directory that contains mse_env and the mse_*.py
# scripts.
#
# NOTE: every #SBATCH directive must stay above the first executable command;
# sbatch stops reading directives once it reaches one.

#SBATCH --time=1:00:00   # walltime.  hours:minutes:seconds
#SBATCH --ntasks=8   # number of processor cores (i.e. tasks)
#SBATCH --nodes=1   # number of nodes
#SBATCH --gpus=1
#SBATCH --mem=80G   # 80G of memory per node (use --mem-per-cpu for a per-core limit)
#SBATCH --mail-user=aw742@byu.edu   # email address
#SBATCH --mail-type=BEGIN
#SBATCH --mail-type=END
#SBATCH --mail-type=FAIL
#SBATCH --qos=cs
#SBATCH --partition=cs

# Strict mode: stop on the first failing command, on use of an unset
# variable, and on a failure in any stage of a pipeline.
set -euo pipefail

# LOAD MODULES, INSERT CODE, AND RUN YOUR PROGRAMS HERE
# module load python/3.11

# Activate the project's virtual environment. A venv created on Linux keeps
# its activate script under bin/; Scripts/ is the Windows layout and is kept
# only as a fallback because it is the path this script used originally.
venv_dir=./mse_env
if [[ -f "${venv_dir}/bin/activate" ]]; then
  activate_script="${venv_dir}/bin/activate"
elif [[ -f "${venv_dir}/Scripts/activate" ]]; then
  activate_script="${venv_dir}/Scripts/activate"
else
  printf 'error: no activate script under %s (tried bin/ and Scripts/)\n' \
    "${venv_dir}" >&2
  exit 1
fi

# Activation scripts may expand variables that are not set, which would abort
# the job under nounset; relax it only while sourcing.
set +u
# shellcheck source=/dev/null
source "${activate_script}"
set -u

# json config = "max_samples": 500,

# python mse_text_img_process.py
# python convert_mse.py

# pip install jsonlines
# pip install deepeval

# Number of test cases passed as --num to the (currently disabled)
# mse_ollama_run.py invocations further down.
# shellcheck disable=SC2034
NUM_TEST_CASES=100

# python mse_ollama_run.py --num $NUM_TEST_CASES --test f --shot 0 --out_file metric_test_orig_100_f.txt
# echo "Test case faithfulness finished"

# Value passed as --shot to the (currently disabled) mse_ollama_run.py
# invocations; reassigned before each group of runs further down.
# shellcheck disable=SC2034
NUM_SHOT=0

# Windows cmd syntax; the bash equivalent is: export DEEPEVAL_RESULTS_FOLDER=./data
# set DEEPEVAL_RESULTS_FOLDER=.\data

# The only step currently enabled. Under `set -e` a non-zero exit from the
# Python script aborts the job before the message below is printed.
python mse_ollama_timer.py
echo "Test time calculated"

# ---------------------------------------------------------------------------
# Archive of experiment invocations. Every command below is commented out;
# the only live statements in this section are the NUM_SHOT assignments, which
# set the value passed as --shot by the group of commands that follows each one.
#
# Metric codes passed to --test, as named by the accompanying echo messages:
#   ar = answer relevancy, crec = contextual recall, cp = contextual precision,
#   crel = contextual relevancy, f = faithfulness.
# NOTE(review): --num / --begin look like the number of test cases and the
# index of the first one (see the "start 25/50/75" runs) — confirm against
# mse_ollama_run.py.
# ---------------------------------------------------------------------------

# One-time evaluator model setup for deepeval / ollama.
# deepeval set-local-model --model-name Hudson/llemma:7b
# ollama pull Hudson/llemma:7b
# deepeval set-ollama Hudson/llemma:7b

# --- 0-shot runs (use the NUM_SHOT=0 assigned earlier in the script) ---
# export DEEPEVAL_RESULTS_FOLDER="./metric_test_0_shot_100_ar"
# python mse_ollama_run.py --num $NUM_TEST_CASES --begin 0 --test ar --shot $NUM_SHOT #--out_file metric_test_0_shot_100_ar.txt
# echo "Test case answer relevancy finished"
# export DEEPEVAL_RESULTS_FOLDER="./metric_test_0_shot_100_crec"
# python mse_ollama_run.py --num $NUM_TEST_CASES --begin 0 --test crec --shot $NUM_SHOT #--out_file metric_test_0_shot_100_crec.txt
# echo "Test case contextual recall finished"
# export DEEPEVAL_RESULTS_FOLDER="./metric_test_0_shot_100_cp"
# python mse_ollama_run.py --num $NUM_TEST_CASES --begin 0 --test cp --shot $NUM_SHOT #--out_file metric_test_0_shot_100_cp.txt
# echo "Test case contextual precision finished"

# --- 1-shot runs ---
NUM_SHOT=1

# export DEEPEVAL_RESULTS_FOLDER="./metric_test_1_shot_100_ar"
# python mse_ollama_run.py --num $NUM_TEST_CASES --begin 0 --test ar --shot $NUM_SHOT #--out_file metric_test_1_shot_100_ar.txt
# echo "Test case answer relevancy finished"
# export DEEPEVAL_RESULTS_FOLDER="./metric_test_1_shot_100_crec"
# python mse_ollama_run.py --num $NUM_TEST_CASES --begin 0 --test crec --shot $NUM_SHOT #--out_file metric_test_1_shot_100_crec.txt
# echo "Test case contextual recall finished"
# export DEEPEVAL_RESULTS_FOLDER="./metric_test_1_shot_100_cp"
# python mse_ollama_run.py --num $NUM_TEST_CASES --begin 0 --test cp --shot $NUM_SHOT #--out_file metric_test_1_shot_100_cp.txt
# echo "Test case contextual precision finished"



# --- 5-shot runs ---
NUM_SHOT=5
# export DEEPEVAL_RESULTS_FOLDER="./metric_test_5_shot_100_ar"
# python mse_ollama_run.py --num $NUM_TEST_CASES --begin 0 --test ar --shot $NUM_SHOT #--out_file metric_test_5_shot_100_ar.txt
# echo "Test case answer relevancy finished"
# export DEEPEVAL_RESULTS_FOLDER="./metric_test_5_shot_100_crec"
# python mse_ollama_run.py --num $NUM_TEST_CASES --begin 0 --test crec --shot $NUM_SHOT #--out_file metric_test_5_shot_100_crec.txt
# echo "Test case contextual recall finished"
# export DEEPEVAL_RESULTS_FOLDER="./metric_test_5_shot_100_cp"
# python mse_ollama_run.py --num $NUM_TEST_CASES --begin 0 --test cp --shot $NUM_SHOT #--out_file metric_test_5_shot_100_cp.txt
# echo "Test case contextual precision finished"

# 5-shot contextual precision split into four batches of 25 cases
# (--begin 0, 25, 50, 75), each written to its own output file.
# python mse_ollama_run.py --num 25 --begin 0 --test cp --shot $NUM_SHOT --out_file metric_test_5_shot_25_cp.txt
# echo "Test case contextual precision finished"

# python mse_ollama_run.py --num 25 --begin 25 --test cp --shot $NUM_SHOT --out_file metric_test_5_shot_25_b25_cp.txt
# echo "Test case contextual precision finished (start 25)"
# python mse_ollama_run.py --num 25 --begin 50 --test cp --shot $NUM_SHOT --out_file metric_test_5_shot_25_b50_cp.txt
# echo "Test case contextual precision finished (start 50)"
# python mse_ollama_run.py --num 25 --begin 75 --test cp --shot $NUM_SHOT --out_file metric_test_5_shot_25_b75_cp.txt
# echo "Test case contextual precision finished (start 75)"


# --- 10-shot runs ---
# NOTE(review): these three commands originally read "-out_file" (single dash);
# written here as "--out_file" to match the batch runs above. The other shot
# groups disable that flag with "#" and rely on DEEPEVAL_RESULTS_FOLDER
# instead — confirm which form is intended before re-enabling.
NUM_SHOT=10
# export DEEPEVAL_RESULTS_FOLDER="./metric_test_10_shot_100_ar"
# python mse_ollama_run.py --num $NUM_TEST_CASES --begin 0 --test ar --shot $NUM_SHOT --out_file metric_test_10_shot_100_ar.txt
# echo "Test case answer relevancy finished"
# export DEEPEVAL_RESULTS_FOLDER="./metric_test_10_shot_100_crec"
# python mse_ollama_run.py --num $NUM_TEST_CASES --begin 0 --test crec --shot $NUM_SHOT --out_file metric_test_10_shot_100_crec.txt
# echo "Test case contextual recall finished"
# export DEEPEVAL_RESULTS_FOLDER="./metric_test_10_shot_100_cp"
# python mse_ollama_run.py --num $NUM_TEST_CASES --begin 0 --test cp --shot $NUM_SHOT --out_file metric_test_10_shot_100_cp.txt
# echo "Test case contextual precision finished"

# --- Fine-tuned model runs (0-shot) ---
# NOTE(review): nothing in this script selects the fine-tuned model; presumably
# it is configured elsewhere — confirm before re-enabling.
NUM_SHOT=0
# export DEEPEVAL_RESULTS_FOLDER="metric_test_ft_100_ar"
# python mse_ollama_run.py --num $NUM_TEST_CASES --begin 0 --test ar --shot $NUM_SHOT #> metric_test_ft_100_ar.txt
# echo "Test case answer relevancy finished"
# export DEEPEVAL_RESULTS_FOLDER="metric_test_ft_100_crec"
# python mse_ollama_run.py --num $NUM_TEST_CASES --begin 0 --test crec --shot $NUM_SHOT #> metric_test_ft_100_crec.txt
# echo "Test case contextual recall finished"
# export DEEPEVAL_RESULTS_FOLDER="metric_test_ft_100_cp"
# python mse_ollama_run.py --num $NUM_TEST_CASES --begin 0 --test cp --shot $NUM_SHOT > metric_test_ft_100_cp.txt
# echo "Test case contextual precision finished"


# --- Remaining metrics on the original model ---
# python mse_ollama_run.py --num $NUM_TEST_CASES --test crel --out_file metric_test_orig_100_crel.txt
# echo "Test case contextual relevancy finished"


# python mse_ollama_run.py --num $NUM_TEST_CASES --test f --out_file metric_test_orig_100_f.txt
# echo "Test case faithfulness finished"

# --- Data preparation and fine-tuning steps ---
# python mse_jsonl_resize.py

# python finetune.py

# --- Dataset generation with the original Llemma model, pinned to GPU 0 ---
# echo "Original Llemma Model"
# echo "Processing 0 shot 100 test cases"
# CUDA_VISIBLE_DEVICES=0 python mse_deepeval_dataset.py --num 100 --shot 0 --dataset mse_llemma_orig_100_case_0_shot
# echo "Processing 1 shot 100 test cases"
# CUDA_VISIBLE_DEVICES=0 python mse_deepeval_dataset.py --num 100 --shot 1 --dataset mse_llemma_orig_100_case_1_shot
# echo "Processing 5 shot 100 test cases"
# CUDA_VISIBLE_DEVICES=0 python mse_deepeval_dataset.py --num 100 --shot 5 --dataset mse_llemma_orig_100_case_5_shot
# echo "Processing 10 shot 100 test cases"
# CUDA_VISIBLE_DEVICES=0 python mse_deepeval_dataset.py --num 100 --shot 10 --dataset mse_llemma_orig_100_case_10_shot