File size: 5,682 Bytes
766ea9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/bin/bash
#
# SLURM batch script: activates the ./mse_env virtual environment and runs
# the mse_ollama_run_ft.py evaluation once per selected test (ar, crec, cp).
# Submit with `sbatch` from the directory that contains mse_env and the
# Python scripts, since every path below is relative.

#SBATCH --time=1:00:00   # walltime.  hours:minutes:seconds
#SBATCH --ntasks=8   # number of processor cores (i.e. tasks)
#SBATCH --nodes=1   # number of nodes
#SBATCH --gpus=1   # total number of GPUs for the job
#SBATCH --mem=80G   # memory requested per node (not per CPU core)
#SBATCH --mail-user=aw742@byu.edu   # email address
#SBATCH --mail-type=BEGIN
#SBATCH --mail-type=END
#SBATCH --mail-type=FAIL
#SBATCH --qos=cs
#SBATCH --partition=cs

# Strict mode: stop at the first failing command (-e), treat use of an unset
# variable as an error (-u), and make a pipeline fail when any of its stages
# fails (pipefail) instead of only the last one.
set -euo pipefail

# LOAD MODULES, INSERT CODE, AND RUN YOUR PROGRAMS HERE
# module load python/3.11

# Activate the project's Python virtual environment before running anything.
# A venv created on Windows keeps its activate script under Scripts/, while
# one created on Linux/macOS keeps it under bin/. Accept either layout; the
# original Scripts/ location is tried first so existing setups behave the
# same. Paths are relative to the directory the job starts in.
venv_dir=./mse_env
if [[ -f "${venv_dir}/Scripts/activate" ]]; then
  venv_activate="${venv_dir}/Scripts/activate"
elif [[ -f "${venv_dir}/bin/activate" ]]; then
  venv_activate="${venv_dir}/bin/activate"
else
  printf 'error: no activate script under %s/Scripts or %s/bin\n' \
    "$venv_dir" "$venv_dir" >&2
  exit 1
fi
# shellcheck source=/dev/null
source "$venv_activate"

# json config = "max_samples": 500,

# python mse_text_img_process.py
# python convert_mse.py

# pip install jsonlines
# pip install deepeval

# Evaluation settings. Within this script both values are referenced only by
# commented-out commands (`--num $NUM_TEST_CASES`, `--shot $NUM_SHOT`); they
# take effect once one of those commands is re-enabled.
NUM_TEST_CASES="100"

# python mse_ollama_run.py --num $NUM_TEST_CASES --test f --shot 0 --out_file metric_test_orig_100_f.txt
# echo "Test case faithfulness finished"

# Shot count for the 0-shot runs that follow.
NUM_SHOT="0"

# export DEEPEVAL_RESULTS_FOLDER=./data   # bash form of the Windows cmd line: set DEEPEVAL_RESULTS_FOLDER=.\data

# deepeval set-local-model --model-name Hudson/llemma:7b
# ollama pull Hudson/llemma:7b
# deepeval set-ollama Hudson/llemma:7b

# python mse_ollama_run.py --test ar --dataset ./deepeval-test-dataset/mse_llemma_orig_100_case_0_shot.json > metric_test_0_shot_100_ar.txt 
# echo "Test case answer relevancy finished"
# python mse_ollama_run.py --test crec --dataset ./deepeval-test-dataset/mse_llemma_orig_100_case_0_shot.json #> metric_test_0_shot_100_crec.txt 
# echo "Test case contexual recall finished"
# python mse_ollama_run.py --test cp --dataset ./deepeval-test-dataset/mse_llemma_orig_100_case_0_shot.json > metric_test_0_shot_100_cp.txt
# echo "Test case contextual precision finished"

# Shot count for the 1-shot runs. The commands that follow are currently
# commented out and pick the *_1_shot.json dataset directly rather than
# reading this variable.
NUM_SHOT="1"

# python mse_ollama_run.py --test ar --dataset ./deepeval-test-dataset/mse_llemma_orig_100_case_1_shot.json > metric_test_1_shot_100_ar.txt
# echo "Test case answer relevancy finished"
# python mse_ollama_run.py --test crec --dataset ./deepeval-test-dataset/mse_llemma_orig_100_case_1_shot.json #> metric_test_1_shot_100_crec.txt
# echo "Test case contexual recall finished"
# python mse_ollama_run.py --test cp --dataset ./deepeval-test-dataset/mse_llemma_orig_100_case_1_shot.json #> metric_test_1_shot_100_cp.txt
# echo "Test case contextual precision finished"



# Shot count for the 5-shot runs. Only the commented-out `--shot $NUM_SHOT`
# commands further down read this value.
NUM_SHOT="5"
# python mse_ollama_run.py --test ar --dataset ./deepeval-test-dataset/mse_llemma_orig_100_case_5_shot.json > metric_test_5_shot_100_ar.txt 
# echo "Test case answer relevancy finished"
# python mse_ollama_run.py --test crec --dataset ./deepeval-test-dataset/mse_llemma_orig_100_case_5_shot.json #> metric_test_5_shot_100_crec.txt
# echo "Test case contexual recall finished"
# python mse_ollama_run.py --test cp --dataset ./deepeval-test-dataset/mse_llemma_orig_100_case_5_shot.json #> metric_test_5_shot_100_cp.txt
# echo "Test case contextual precision finished"

# # python mse_ollama_run.py --num 25 --begin 0 --test cp --shot $NUM_SHOT --out_file metric_test_5_shot_25_cp.txt
# # echo "Test case contextual precision finished"

# # python mse_ollama_run.py --num 25 --begin 25 --test cp --shot $NUM_SHOT --out_file metric_test_5_shot_25_b25_cp.txt
# # echo "Test case contextual precision finished (start 25)"
# # python mse_ollama_run.py --num 25 --begin 50 --test cp --shot $NUM_SHOT --out_file metric_test_5_shot_25_b50_cp.txt
# # echo "Test case contextual precision finished (start 50)"
# # python mse_ollama_run.py --num 25 --begin 75 --test cp --shot $NUM_SHOT --out_file metric_test_5_shot_25_b75_cp.txt
# # echo "Test case contextual precision finished (start 75)"


# NUM_SHOT=10
# python mse_ollama_run.py --test ar --dataset ./deepeval-test-dataset/mse_llemma_orig_100_case_10_shot.json > metric_test_10_shot_100_ar.txt
# echo "Test case answer relevancy finished"
# python mse_ollama_run.py --test crec --dataset ./deepeval-test-dataset/mse_llemma_orig_100_case_10_shot.json #> metric_test_10_shot_100_crec.txt
# echo "Test case contexual recall finished"
# python mse_ollama_run.py --test cp --dataset ./deepeval-test-dataset/mse_llemma_orig_100_case_10_shot.json #> metric_test_10_shot_100_cp.txt
# echo "Test case contextual precision finished"

# finetuned
# Shot count for the fine-tuned model runs. It is read only by the
# commented-out `--shot $NUM_SHOT` commands that follow; the active commands
# select the 0-shot dataset file directly.
NUM_SHOT="0"
# export DEEPEVAL_RESULTS_FOLDER="metric_test_ft_100_ar"
# python mse_ollama_run.py --num $NUM_TEST_CASES --begin 0 --test ar --shot $NUM_SHOT --out_file metric_test_ft_100_ar.docx
# echo "Test case answer relevancy finished"
# export DEEPEVAL_RESULTS_FOLDER="metric_test_ft_100_crec"
# python mse_ollama_run.py --num $NUM_TEST_CASES --begin 0 --test crec --shot $NUM_SHOT --out_file metric_test_ft_100_crec.docx
# echo "Test case contexual recall finished"
# export DEEPEVAL_RESULTS_FOLDER="metric_test_ft_100_cp"
# python mse_ollama_run.py --num $NUM_TEST_CASES --begin 0 --test cp --shot $NUM_SHOT --out_file metric_test_ft_100_cp.docx
# echo "Test case contextual precision finished"

# Run mse_ollama_run_ft.py on the fine-tuned 0-shot, 100-case dataset, once
# per `--test` selector (ar, crec, cp). Each run is followed by a progress
# message on stdout. Under `set -e`, a failing run stops the script before
# the remaining ones start.
ft_dataset=./deepeval-test-dataset/mse_llemma_ft_100_case_0_shot.json

# ar: output stays on the job's stdout; the redirect to
# metric_test_ft_100_ar.txt is disabled.
python mse_ollama_run_ft.py --test ar --dataset "$ft_dataset"
printf '%s\n' "Test case answer relevancy finished"

# crec: stdout is captured in metric_test_ft_100_crec.txt.
python mse_ollama_run_ft.py --test crec --dataset "$ft_dataset" \
  > metric_test_ft_100_crec.txt
printf '%s\n' "Test case contexual recall finished"

# cp: stdout is captured in metric_test_ft_100_cp.txt.
python mse_ollama_run_ft.py --test cp --dataset "$ft_dataset" \
  > metric_test_ft_100_cp.txt
printf '%s\n' "Test case contextual precision finished"

# python mse_ollama_run.py --num $NUM_TEST_CASES --test crel --out_file metric_test_orig_100_crel.txt
# echo "Test case contextual relevancy finished"


# python mse_ollama_run.py --num $NUM_TEST_CASES --test f --out_file metric_test_orig_100_f.txt
# echo "Test case faithfulness finished"

# python mse_jsonl_resize.py

# python finetune.py