#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Convert a directory of raw prompt/response samples into a JSONL dataset.

Every sub-directory of ``--raw_dataset`` is read as one sample and must
contain ``system_prompt.txt``, ``user_prompt.txt`` and ``response.txt``.
The user prompt is split into three structured parts:

* the dialogue between ``*Conversation starts*`` and ``*Conversation ends*``;
* the worked examples between ``*Conversation ends*`` and ``**Output**``;
* the ``If ... output <letter>`` choice rules that follow ``**Output**``.

One JSON object per sample is written to ``--dataset``.
"""
import argparse
import json
import os
from pathlib import Path
import re
import sys
import time
from typing import Any, Dict, List

pwd = os.path.abspath(os.path.dirname(__file__))
# The project root must be importable before ``project_settings`` is loaded.
sys.path.append(os.path.join(pwd, "../../"))

from project_settings import environment, project_path


# Dialogue section of the user prompt (DOTALL: it spans several lines).
_CONVERSATION_BLOCK_RE = re.compile(
    r"\*Conversation starts\*(.*)\*Conversation ends\*", flags=re.DOTALL
)
# One dialogue turn: a speaker tag followed by the rest of that line.
_CONVERSATION_TURN_RE = re.compile(r'(client:|customer service:)([^\n]*)')
# Section holding the worked examples (end of dialogue up to "**Output**").
_EXAMPLES_BLOCK_RE = re.compile(
    r"\*Conversation ends\*(.*)\*\*Output\*\*", flags=re.DOTALL
)
# One example line; exactly one alternative matches per line:
#   [用户]/[你]: "<text>"  -> groups 1-2 (用户 = user, 你 = you/the agent)
#   输出: <token>          -> group 3   (输出 = output)
#   解释: <text>           -> group 4   (解释 = explanation)
_EXAMPLE_LINE_RE = re.compile(
    r'(?m)^\[(用户|你)\]:\s*"([^"]*)"\s*$|^输出:\s*(\S+)\s*$|^解释:\s*(.+)\s*$'
)
# One choice rule: "If <condition>output <letter A-F>".
_CHOICE_RE = re.compile(r'If (.*?)output ([A-F])', flags=re.DOTALL)

_CONVERSATION_ROLES = {"customer service:": "assistant", "client:": "user"}
_EXAMPLE_ROLES = {"你": "assistant", "用户": "user"}


def get_args():
    """Parse the command-line arguments.

    Returns:
        The parsed namespace with two string attributes: ``raw_dataset``
        (directory of raw samples) and ``dataset`` (JSONL file to write).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--raw_dataset",
        default=(project_path / "data/raw_dataset/finished/agent-lingoace-zh-375-choice-v2").as_posix(),
        type=str,
        help="directory containing one sub-directory per raw sample",
    )
    parser.add_argument(
        "--dataset",
        default=(project_path / "data/dataset/agent-lingoace-zh-375-choice-v2.jsonl").as_posix(),
        type=str,
        help="path of the JSONL file to write",
    )
    args = parser.parse_args()
    return args


def _parse_conversation(user_prompt: str, sample_id: str = "") -> List[Dict[str, str]]:
    """Extract the dialogue turns from the conversation section.

    Raises:
        AssertionError: if the conversation start/end markers are missing.
    """
    block = _CONVERSATION_BLOCK_RE.search(user_prompt)
    if block is None:
        raise AssertionError(
            f"sample {sample_id!r}: '*Conversation starts*' ... "
            f"'*Conversation ends*' section not found in user prompt"
        )
    # NOTE(review): the content is everything after the speaker tag on that
    # line, so any space following the colon is kept -- confirm this is wanted.
    return [
        {"role": _CONVERSATION_ROLES[speaker], "content": content}
        for speaker, content in _CONVERSATION_TURN_RE.findall(block.group(1))
    ]


def _parse_examples(user_prompt: str) -> List[Dict[str, Any]]:
    """Extract the worked examples that precede the ``**Output**`` marker.

    Turns and the output line are accumulated until an explanation line is
    seen; the explanation closes the current example. Lines accumulated after
    the last explanation are therefore not emitted. A prompt without an
    examples section yields an empty list.
    """
    block = _EXAMPLES_BLOCK_RE.search(user_prompt)
    text = block.group(0) if block is not None else ""

    examples = []
    turns = []
    outputs = {}
    for line in _EXAMPLE_LINE_RE.finditer(text):
        speaker, content, out, explanation = line.groups()
        if speaker:
            turns.append({"role": _EXAMPLE_ROLES[speaker], "content": content})
        elif out:
            outputs["output"] = out
        elif explanation:
            outputs["explanation"] = explanation
            examples.append({"conversation": turns, "outputs": outputs})
            # Start fresh accumulators for the next example.
            turns = []
            outputs = {}
    return examples


def _parse_choices(user_prompt: str, sample_id: str = "") -> List[Dict[str, str]]:
    """Extract the ``If ... output <letter>`` rules following ``**Output**``.

    Raises:
        AssertionError: if the ``**Output**`` marker is missing.
    """
    splits = user_prompt.split("**Output**")
    if len(splits) < 2:
        raise AssertionError(
            f"sample {sample_id!r}: '**Output**' marker not found in user prompt"
        )
    choice_text = splits[1].strip()
    # The last two characters of the captured condition are dropped;
    # presumably the ", " separating it from "output" -- TODO confirm.
    return [
        {"condition": f"If {condition[:-2]}", "choice_letter": letter}
        for condition, letter in _CHOICE_RE.findall(choice_text)
    ]


def _build_row(sample_dir: Path) -> Dict[str, Any]:
    """Read one sample directory and return its dataset record."""
    idx = sample_dir.name
    system_prompt = (sample_dir / "system_prompt.txt").read_text(encoding="utf-8")
    user_prompt = (sample_dir / "user_prompt.txt").read_text(encoding="utf-8")
    response = (sample_dir / "response.txt").read_text(encoding="utf-8")

    return {
        "idx": idx,
        "system_prompt": system_prompt,
        "conversation": _parse_conversation(user_prompt, idx),
        "examples": _parse_examples(user_prompt),
        "choices": _parse_choices(user_prompt, idx),
        "response": response,
    }


def main():
    """Build the JSONL dataset from every sample directory.

    Raises:
        AssertionError: if a user prompt lacks the conversation section or
            the ``**Output**`` marker (the message names the sample).
        OSError: if a sample directory lacks one of the three text files.
    """
    args = get_args()

    raw_dataset = Path(args.raw_dataset)
    dataset = Path(args.dataset)
    dataset.parent.mkdir(parents=True, exist_ok=True)

    with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
        # Sorted so the output row order is the same on every run.
        for sample_dir in sorted(raw_dataset.glob("*")):
            # Stray files next to the sample directories are not samples.
            if not sample_dir.is_dir():
                continue
            row = json.dumps(_build_row(sample_dir), ensure_ascii=False)
            fout.write(f"{row}\n")
            # Push each record to disk as soon as it is produced.
            fout.flush()


if __name__ == "__main__":
    main()