Add score: 0.224 and score: 0.26
- LLaVA-MOSS2/add_RUC.py +52 -0
- LLaVA-MOSS2/add_cmmlu.py +52 -0
- LLaVA-MOSS2/llava/serve/submit.py +13 -25
- LLaVA-MOSS2/scripts/finetune.sh +6 -7
- LLaVA-MOSS2/scripts/pretrain.sh +2 -2
- LLaVA-MOSS2/test.py +92 -50
LLaVA-MOSS2/add_RUC.py
ADDED
@@ -0,0 +1,52 @@
+import json
+import jsonlines
+import os
+
+dir_path = '/root/.cache/huggingface/hub/datasets--RUCAIBox--gaokao-bench/snapshots/49877cf53b6db9c24d7d285161fc12bba2f85d29/test'
+files = os.listdir(dir_path)
+
+subjects = ['Chemistry', 'Geography', 'Math', 'History', 'Biology', 'Political', 'Chinese', 'Physics']
+
+num_items = 0  # only used by the commented-out conversation block below
+data = []
+for file_name in files:
+    if 'English' in file_name:
+        continue
+    data_item = {}
+    for sub in subjects:
+        if sub in file_name:
+            data_item['keyword'] = sub
+    data_item_list = []
+    path = os.path.join(dir_path, file_name)
+    with open(path, 'r', encoding='utf-8') as f:
+        for line in jsonlines.Reader(f):
+            dict_item = {}
+            dict_item['question'] = line['question']
+            dict_item['answer'] = line['answer']
+            dict_item['analysis'] = line['analysis']
+            # num_items += 1
+
+            # dict_item['image'] = ""
+
+            # conversation = []
+            # human = {}
+            # human['from'] = 'human'
+            # human['value'] = line['question']
+            # gpt = {}
+            # gpt['from'] = 'gpt'
+            # result = line['analysis']
+            # result += "答案是:" + ''.join(line['answer'])
+            # gpt['value'] = result
+            # conversation.append(human)
+            # conversation.append(gpt)
+            # dict_item['conversations'] = conversation
+
+            print(dict_item)
+
+            data_item_list.append(dict_item)
+    data_item['question'] = data_item_list
+    data.append(data_item)
+
+with open('RUC_RAG.json', 'w', encoding='utf-8') as f:
+    # Write the grouped records to disk with json.dump()
+    json.dump(data, f, ensure_ascii=False, indent=4)
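Note: each record add_RUC.py writes groups one subject's Gaokao-Bench items under a keyword, which test.py later uses to filter the retrieval pool. A hypothetical excerpt of the resulting RUC_RAG.json (field values invented for illustration; real entries come from RUCAIBox/gaokao-bench):

[
    {
        "keyword": "Chemistry",
        "question": [
            {
                "question": "…题干与A/B/C/D选项…",
                "answer": ["A"],
                "analysis": "…解析文字…"
            }
        ]
    }
]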
LLaVA-MOSS2/add_cmmlu.py
ADDED
@@ -0,0 +1,52 @@
+import pandas as pd
+import os
+import json
+
+with open('./political_data_with_extra_data.json', 'r', encoding='utf-8') as f:
+    data = json.load(f)
+next_id = len(data)  # ids continue after the existing records
+
+final_folder = 'playground/data/cmmlu'
+
+files = os.listdir(final_folder)
+
+selected_files = ['combined_anatomy.csv', 'combined_ancient_chinese.csv', 'combined_arts.csv', 'combined_chinese_civil_service_exam.csv', 'combined_chinese_foreign_policy.csv',
+                  'combined_chinese_history.csv', 'combined_college_education.csv', 'combined_college_engineering_hydrology.csv', 'combined_college_mathematics.csv', 'combined_college_medicine.csv',
+                  'combined_conceptual_physics.csv', 'combined_electrical_engineering.csv', 'combined_elementary_mathematics.csv', 'combined_food_science.csv',
+                  'combined_genetics.csv', 'combined_high_school_biology.csv', 'combined_high_school_chemistry.csv', 'combined_high_school_geography.csv', 'combined_high_school_mathematics.csv',
+                  'combined_high_school_physics.csv', 'combined_high_school_politics.csv', 'combined_legal_and_moral_basis.csv', 'combined_management.csv', 'combined_marxist_theory.csv',
+                  'combined_modern_chinese.csv', 'combined_philosophy.csv', 'combined_virology.csv', 'combined_world_history.csv']
+cmmlu_list = []
+for file_name in selected_files:
+    path = os.path.join(final_folder, file_name)
+    df = pd.read_csv(path)
+
+    for index, row in df.iterrows():
+        dict_item = {}
+        dict_item['id'] = str(next_id)
+        next_id += 1
+
+        dict_item['image'] = ""
+
+        conversation = []
+        human = {}
+        human['from'] = 'human'
+        question = row['Question'] + '\nA.' + row['A'] + '\nB.' + row['B'] + '\nC.' + row['C'] + '\nD.' + row['D'] + '\n'
+        human['value'] = question
+        gpt = {}
+        gpt['from'] = 'gpt'
+        result = "答案是:" + row['Answer']
+        gpt['value'] = result
+        conversation.append(human)
+        conversation.append(gpt)
+        dict_item['conversations'] = conversation
+
+        print(dict_item)
+
+        cmmlu_list.append(dict_item)
+
+data = cmmlu_list + data
+
+with open('cmmlu_political_data_gaokao.json', 'w', encoding='utf-8') as f:
+    # Write the merged list to disk with json.dump()
+    json.dump(data, f, ensure_ascii=False, indent=4)
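Note: each CMMLU row becomes a text-only record in the LLaVA conversation format (empty image field, one human/gpt turn pair). A sketch of one emitted record, with invented values:

{
    "id": "12345",
    "image": "",
    "conversations": [
        {"from": "human", "value": "题干\nA.选项一\nB.选项二\nC.选项三\nD.选项四\n"},
        {"from": "gpt", "value": "答案是:B"}
    ]
}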
LLaVA-MOSS2/llava/serve/submit.py
CHANGED
@@ -57,29 +57,14 @@ def get_prompt(key, question, len_of_pictures, image_token):
    for _ in range(len_of_pictures):
        question = image_token + question

-    prompt = f"""你是一个{key}专家,擅长解决{key}问题。以下是一个{key}
-
-## 示例
-### 题目:
-根据欧几里得算法,计算6和7的最大公约数
-
-### 选项:
-A.1
-B.2
-C.3
-D.4
-
-### 回答:
-答案是:A.
-
-题目如下:
-### 问题:
+    prompt = f"""你是一个{key}专家,擅长解决{key}问题。以下是一个{key}的题目,形式为单项选择题。请逐步分析问题并在最后一行输出答案,最后一行的格式为"答案是:A"。
+问题:
{question}

+选项:
{options}

+回答:
"""
    return prompt

@@ -139,7 +124,7 @@ def main(args):

    answers = []

-    for i in tqdm.tqdm(range(0,
+    for i in tqdm.tqdm(range(0, 1), desc="Voting Processing"):
        questions = copy.deepcopy(questions_origin)
        for subject in questions:
            example = subject['example']

@@ -212,9 +197,9 @@ def main(args):
                outputs = tokenizer.decode(output_ids[0]).strip()
                outputs = re.sub(r'\([^()]*\)', '', outputs)
                outputs = re.sub(r'<s>|</s>', '', outputs)
-                outputs = extract(outputs, answer_dic)
+                # outputs = extract(outputs, answer_dic)
                conv.messages[-1][-1] = outputs
-                question_itme['model_answer'] =
+                question_itme['model_answer'] = outputs
                question_itme.pop('picture')
                question_itme.pop('question')

@@ -223,11 +208,14 @@ def main(args):
        answers.append(questions)

    final_ans = answers[0]
-    for ans in answers:
+    for i, ans in enumerate(answers):
+        file_name = f'output_{i}.json'
+        with open(file_name, 'w', encoding='utf-8') as file:
+            json.dump(ans, file, ensure_ascii=False, indent=4)
        for i, sub in enumerate(ans):
            example = sub['example']
            for j, item in enumerate(example):
-                item_ans = item['model_answer']
+                item_ans = extract(item['model_answer'], answer_dic)
                index = ord(item_ans[0]) - 65
                if 'count' not in final_ans:
                    final_ans[i]['example'][j]['count'] = [0] * 4

@@ -297,7 +285,7 @@ def main(args):

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument("--model-path", type=str, default="checkpoints/llava-moss2-2_5b-chat-finetune")
+    parser.add_argument("--model-path", type=str, default="checkpoints/llava-moss2-2_5b-chat-finetune-224")
    parser.add_argument("--model-base", type=str, default=None)
    # parser.add_argument("--image-file", type=str, required=True)
    parser.add_argument("--device", type=str, default="cuda")
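Note: the aggregation loop maps each extracted letter to an option index via ord(item_ans[0]) - 65 (so A..D become 0..3) and accumulates per-option counts across voting rounds. A minimal self-contained sketch of that majority-vote step, using hypothetical per-run answers:

# Majority vote over per-run answers; mirrors the ord(...) - 65 indexing in submit.py
run_answers = ['A', 'B', 'A']          # hypothetical extracted letters from three runs
count = [0] * 4                        # one slot per option A-D
for ans in run_answers:
    count[ord(ans[0]) - 65] += 1       # 'A' -> 0, 'B' -> 1, ...
final = chr(65 + max(range(4), key=count.__getitem__))  # most-voted option
print(final)                           # 'A'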
LLaVA-MOSS2/scripts/finetune.sh
CHANGED
@@ -15,7 +15,7 @@ deepspeed llava/train/train_mem.py \
    --deepspeed ./scripts/zero2.json \
    --model_name_or_path /root/.cache/huggingface/hub/models--fnlp--moss2-2_5b-chat/snapshots/3eda5a066c519990bf5f9ba056f5f8ef81531c83 \
    --version $PROMPT_VERSION \
-    --data_path ./
+    --data_path ./playground/data/llava_v1_5_mix665k.json \
    --image_folder ./playground/data \
    --vision_tower openai/clip-vit-large-patch14 \
    --pretrain_mm_mlp_adapter ./checkpoints/llava-moss2-2_5b-chat-pretrain/mm_projector.bin \

@@ -23,13 +23,13 @@ deepspeed llava/train/train_mem.py \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --bf16 True \
-    --max_steps
-    --per_device_train_batch_size
-    --per_device_eval_batch_size
+    --max_steps 20000 \
+    --per_device_train_batch_size 4 \
+    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 2 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
-    --save_steps
+    --save_steps 5000 \
    --save_total_limit 5 \
    --learning_rate 2e-5 \
    --weight_decay 0. \

@@ -43,5 +43,4 @@ deepspeed llava/train/train_mem.py \
    --lazy_preprocess True \
    --report_to wandb \
    --run_name llava-moss2-finetune \
-    --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \
-
+    --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \
LLaVA-MOSS2/scripts/pretrain.sh
CHANGED
@@ -25,13 +25,13 @@ deepspeed llava/train/train_mem.py \
    --mm_use_im_patch_token False \
    --bf16 True \
    --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \
-    --max_steps
+    --max_steps 9000 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 2 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
-    --save_steps
+    --save_steps 3000 \
    --save_total_limit 5 \
    --learning_rate 2e-3 \
    --weight_decay 0. \
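Note: with DeepSpeed, the effective global batch size is per_device_train_batch_size x gradient_accumulation_steps x number of GPUs. A quick sanity check, assuming a hypothetical 8-GPU node (the GPU count is not stated in either script):

# Hypothetical effective-batch-size check; num_gpus=8 is an assumption, not from the scripts
def effective_batch(per_device, grad_accum, num_gpus=8):
    return per_device * grad_accum * num_gpus

print(effective_batch(16, 2))  # pretrain.sh: 256
print(effective_batch(4, 2))   # finetune.sh: 64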
LLaVA-MOSS2/test.py
CHANGED
@@ -1,52 +1,94 @@
-
-import
+# Load model directly
+from transformers import AutoTokenizer, AutoModel
+import numpy as np
import json
+import heapq
+import re

+from FlagEmbedding import FlagModel
+
+
+# Function to retrieve similar questions
+def retrieve_similar_questions(bge_model, subject, input_question, question_pool, top_k=5):
+    # Encode the input question and compare it against the question pool
+    min_heap = []
+    input_embedding = bge_model.encode(input_question)
+    for i, pool in enumerate(question_pool):
+        if pool['keyword'] == subject:
+            for j, question in enumerate(pool['question']):
+                question_embedding = bge_model.encode(question['question'])
+                similarity = input_embedding @ question_embedding
+                heapq.heappush(min_heap, (similarity, i, j))
+
+                if len(min_heap) > top_k:
+                    heapq.heappop(min_heap)
+
+    result = []
+    while len(min_heap) != 0:
+        top = heapq.heappop(min_heap)
+        i = top[1]
+        j = top[2]
+        result.append(question_pool[i]['question'][j])
+    return result
+
+
+def generate_prompt(subject, input_question, len_of_pictures, image_token):
+    bge_model = FlagModel('BAAI/bge-large-zh-v1.5',
+                          query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
+                          use_fp16=True)
+
+    # Sample question pool
+    with open('./RUC_RAG.json', 'r', encoding='utf-8') as file:
+        question_pool = json.load(file)
+
+    # Example usage
+    similar_questions = retrieve_similar_questions(bge_model, subject, input_question, question_pool)
+
+    similar_questions_prompt = ''
+    for i, question in enumerate(similar_questions):
+        answer = ''.join(question['answer'])
+        item = f"""
+{i}.
+问题:{question['question']}
+回答:{question['analysis']}
+答案是:{answer}
+
+"""
+        similar_questions_prompt += item
+
+    pattern = re.compile(r'\s([A-D]\.\s.*[^\n])')
+    # Use findall to collect every option that matches
+    options = pattern.findall(input_question)
+    if len_of_pictures >= 4:
+        options = '\n'.join(f"{'ABCDEFG'[i]}. {image_token}" for i in range(0, 4))
+        len_of_pictures -= 4
+    else:
+        options = '\n'.join(options)
+    input_question = input_question.split('A.')[0]
+    for _ in range(len_of_pictures):
+        input_question = image_token + input_question
+    input_question += options
+
+    prompt = f"""
+你将参与一个{subject}学科的高中选择题测试,这些题目将涵盖{subject}学科。每个题目都可能包含以下类型的图像:示意图、折线图、地图、照片和几何图形等,以增强题目的多模态特性。
+
+你的任务是:
+1. 仔细阅读每个题目的描述性问题,这些问题将涉及学科知识和图像分析。
+2. 分析提供的图像,它们将帮助你更好地理解问题并指导你选择答案。
+3. 从四个选项(A, B, C, D)中,选择最合适的答案。
+
+为了帮助你准备,这里有一些相似的示例题目:
+{similar_questions_prompt}
+
+例如,对于一个{subject}题目,你可能需要识别图片中的相关信息,并根据图像中的信息选择正确的答案。
+
+现在,让我们开始提供一些示例题目,以便你能够熟悉测试的格式和要求。
+问题:{input_question}
+"""
+    return prompt
+
+# Generate prompt with similar questions
+subject = "化学"
+image_type = "示意图"
+prompt = generate_prompt('Geography', '日本某汽车公司在中国建有多个整车生产厂和零件生产厂.2011 年 3 月 11 日东 日本大地震及随后的海啸、核辐射灾难,使该公司在灾区的工厂停产.受其 影响,该公司在中国的整车生产厂也被迫减产.据此完成 1~2 题. 1.(4 分)该公司在中国建零部件生产厂,主要目的是( ) A.避免自然灾害对本土汽车生产的影响 B.为其中国整车厂配套,降低整车生产成本 C.利用中国廉价劳动力,为其日本整车厂服务 D.建立其全球整车生产的零部件工业基地 2.(4 分)中国整车生产厂被迫减产是由于该公司在灾区有( ) A.研发中心 B.一般零部件厂 C.核心零部件厂 D.整车厂 ', 0, "<Image>")
+print(prompt)