| from openai import OpenAI |
| import time |
| import random |
| random.seed(0) |
| from tqdm import tqdm |
| import os |
| import json |
| import argparse |
|
|
| from data_utils import response, save_json, save_json_once |
|
|
def _parse_list_response(text_res):
    """Extract a list from a model reply that should contain a list literal.

    The reply is parsed with ``ast.literal_eval`` so that only Python
    literals are accepted; model output is untrusted text and must never be
    handed to ``eval``. The whole reply is tried first, then the span from
    the first ``[`` to the last ``]``, then the span from the first ``[`` to
    the first ``]`` after it (covers replies that wrap the list in prose).

    Args:
        text_res: Raw reply text.

    Returns:
        The parsed list.

    Raises:
        ValueError: If no candidate span parses to a ``list``.
    """
    import ast  # function-scope import keeps this block self-contained

    if not isinstance(text_res, str):
        print(f"Got exception non-string reply, text_res:\n{text_res}")
        raise ValueError("text res must be list")

    literal_errors = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)

    def _literal_list(candidate):
        parsed = ast.literal_eval(candidate)
        # A bare string/tuple would otherwise be spliced element-by-element
        # into the question list by the caller's ``+=``.
        if not isinstance(parsed, list):
            raise ValueError(f"expected a list literal, got {type(parsed).__name__}")
        return parsed

    text = text_res.strip()
    candidates = [text]
    start_index = text.find('[')
    if start_index != -1:
        for end_index in (text.rfind(']'), text.find(']', start_index)):
            if end_index > start_index:
                candidates.append(text[start_index:end_index + 1])

    last_error = None
    for candidate in candidates:
        try:
            return _literal_list(candidate)
        except literal_errors as err:
            last_error = err

    print(f"Got exception {last_error}, text_res:\n{text_res}")
    raise ValueError("text res must be list") from last_error


def generate_questions(english=False, question_number=15):
    """Ask the model for paraphrases of the seed self-cognition questions.

    For every seed question one request is sent through ``response`` (imported
    from ``data_utils``) asking for ``question_number`` rephrasings formatted
    as a Python-style list; the parsed lists are concatenated.

    Args:
        english: Use the English seed questions and prompt instead of the
            Chinese ones.
        question_number: How many paraphrases to request per seed question.

    Returns:
        A flat list with the paraphrases of all seed questions, in seed order.

    Raises:
        ValueError: If a reply does not contain a parseable list.
    """
    if english:
        question_types = ["Who are you?", "Hello there, what can you do?"]
    else:
        question_types = ["你是谁呀?", "你好呀,你有啥功能?"]

    questions = []
    print("start generating questions...")
    for question_type in tqdm(question_types):
        if english:
            question_prompt = f"Please help me rephrase '{question_type}' in different ways without changing its meaning. Return {question_number} similar sentences, and format the output as ['sentence', 'sentence', ...]."
        else:
            question_prompt = f"请帮我把'{question_type}'换不同方式讲,不改变其本意,只返回{question_number}个类似的句子,输出格式为['句子', '句子', ...]。"
        messages = [{"role": "system", "content": question_prompt}]
        text_res = response(messages=messages, temperature=0.7)
        list_res = _parse_list_response(text_res)
        print(f"type:{question_type} \n {str(list_res)}")
        questions += list_res
    return questions
|
|
def _generate_qa_pairs(questions, base_system_propmt, english=False):
    """Sample questions and collect the model's in-persona answer to each.

    At most 15 questions are drawn with ``random.sample``. Each one is sent
    through ``response`` (imported from ``data_utils``) together with a
    system prompt carrying the persona and a primed first assistant turn.

    Args:
        questions: Candidate user questions.
        base_system_propmt: Persona description written in the second person.
        english: Use the English instruction text instead of the Chinese one.

    Returns:
        A list of ``(question, answer)`` tuples in sampling order; newlines
        are removed from every answer.
    """
    used_number = min(15, len(questions))
    questions_sampled = random.sample(questions, used_number)
    print("Start generating answers...")

    if english:
        answer_prompt = (
            f"{base_system_propmt}Remember your setting at all times. "
            "No matter what the user inquires about,\n"
            "simply introduce yourself based on these settings, and feel free to "
            "rephrase your introduction. Reply with 'Understood' when prepared."
        )
        # Turn the second-person persona into a first-person statement.
        first_person = (
            base_system_propmt
            .replace('You are', 'I am')
            .replace('Your', 'My')
            .replace('You', 'I')
            .replace('your', 'my')
            .replace('you', 'I')
        )
        first_answer = f"Understood. {first_person}"
    else:
        answer_prompt = f"{base_system_propmt}请牢记你的这些设定。无论用户问你什么,你只按设定简单介绍自己,可以重新组织语言来介绍。准备好了回复明白。"
        first_answer = f"明白。{base_system_propmt.replace('你','我')}"

    qa_pairs = []
    for question in tqdm(questions_sampled):
        text_res = response(
            messages=[
                {"role": "system", "content": answer_prompt},
                {"role": "assistant", "content": first_answer},
                {"role": "user", "content": question},
            ],
            temperature=0.5,
        )
        qa_pairs.append((question, text_res.replace("\n", "")))

    answers = [answer for _, answer in qa_pairs]
    # The original message interpolated ``answers`` twice; show both sides.
    print(f"questions:{questions_sampled} \n answers:{answers}")
    return qa_pairs


def generate_answers(questions, base_system_propmt, english=False):
    """Generate in-persona answers for a random sample of ``questions``.

    Args:
        questions: Candidate user questions; at most 15 are sampled.
        base_system_propmt: Persona description written in the second person.
        english: Use the English instruction text instead of the Chinese one.

    Returns:
        A list of answer strings, one per sampled question. The answers
        follow the sampling order, NOT the order of ``questions``.
    """
    return [answer for _, answer in _generate_qa_pairs(questions, base_system_propmt, english)]


def generate_selfcognition_data(save_path="./self_cognition.jsonl", ai_name="悟了悟了", author="xzyun2011", english=False):
    """Build the self-cognition conversations and write them to ``save_path``.

    Questions come from ``generate_questions``; every sampled question is
    stored together with the answer that was generated for that very
    question. Each conversation record is handed to ``save_json_once``
    (imported from ``data_utils``) as soon as it is built.

    Args:
        save_path: Destination file passed to ``save_json_once``.
        ai_name: Assistant name used in the persona prompt.
        author: Developer name used in the persona prompt.
        english: Generate the English dataset instead of the Chinese one.
    """
    if english:
        # The default Chinese name keeps its established transliteration;
        # an explicitly chosen name is used as given instead of being ignored.
        english_name = "Wulewule" if ai_name == "悟了悟了" else ai_name
        base_system_propmt = (
            f"You are {english_name}, an AI assistant developed by {author}. "
            "Your primary focus is to answer questions related to the game 'Black Myth: Wukong'. "
            "You aim to assist players in learning more about the game's storyline, "
            "cultural significance, and background."
        )
    else:
        base_system_propmt = f"你是{ai_name},由{author}开发的AI助手,专注于回答和《黑神话:悟空》这款游戏相关的问题,你想帮助玩家了解更多这款游戏背后的故事和文化知识。"

    questions = generate_questions(english)
    # Keep each answer attached to its own question: zipping the full
    # question list with answers of a shuffled sample mismatches the pairs.
    qa_pairs = _generate_qa_pairs(questions, base_system_propmt, english)
    print("Start generating conversations...")

    for question, answer in qa_pairs:
        conversation_i = {
            "conversation": [
                {
                    "system": base_system_propmt,
                    "input": question,
                    "output": answer,
                }
            ]
        }
        save_json_once(conversation_i, save_path)

    print(f"Done, conversations saved in {save_path}")
|
|
|
|
def parse_args():
    """Parse the command-line options of the dataset generator.

    Returns:
        An ``argparse.Namespace`` with ``save_path``, ``ai_name``, ``author``
        (strings) and ``en`` (bool, set by ``--en``/``--English``/``--english``).
    """
    cli = argparse.ArgumentParser(description='Generate self cognition dataset')

    # (flag, default, help) for every plain string option.
    string_options = (
        ('--save-path', "./self_cognition.jsonl", 'json file save path'),
        ('--ai-name', "悟了悟了", 'ai name for system prompt'),
        ('--author', "xzyun2011", 'author name for system prompt'),
    )
    for flag, default_value, help_text in string_options:
        cli.add_argument(flag, type=str, default=default_value, help=help_text)

    cli.add_argument(
        "--en", "--English", "--english",
        action="store_true",
        help="generate English self cognition data",
    )
    return cli.parse_args()
|
|
def main():
    """Script entry point: read the CLI options and generate the dataset."""
    options = parse_args()
    english_requested = options.en
    if english_requested:
        print("================== Generating English dataset ==================")
    generate_selfcognition_data(
        save_path=options.save_path,
        ai_name=options.ai_name,
        author=options.author,
        english=english_requested,
    )


# Run the generator only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|
|