| | import json |
| | import random |
| | import re |
| | from tqdm import tqdm |
| | from glob import glob |
| |
|
| | |
def get_system_content(assistant_content):
    """Pick a system prompt based on what the given reply text looks like.

    The rules are tried in order and the first one that matches wins:

    1. The text contains one of the whole words ``int``, ``float``, ``char``,
       ``struct``, ``for``, ``while``, ``if`` or ``else``.
    2. The text contains a ``*...*`` span (two asterisks with at least one
       non-asterisk character between them).

    If neither rule matches, the prompt echoes the first three
    whitespace-separated words of the text (fewer if the text is shorter).

    Args:
        assistant_content: Text to inspect.

    Returns:
        The system prompt string selected for this text.
    """
    # (pattern, prompt) pairs, checked top to bottom.
    prompt_rules = (
        (r'\b(?:int|float|char|struct|for|while|if|else)\b', "you are a genius!"),
        (r"\*[^*]+\*", "lets tell a story"),
    )
    for pattern, prompt in prompt_rules:
        if re.search(pattern, assistant_content):
            return prompt

    # Fallback: quote the opening words of the text.
    opening_words = assistant_content.split()[:3]
    opening = ' '.join(opening_words)
    return f"start like {opening}"
| |
|
| | |
def add_system_role(conversation, total_turns):
    """Make sure the conversation opens with a system turn.

    When ``total_turns`` is even, a new system turn is inserted at the front;
    its text is chosen by ``get_system_content`` from the second turn's
    ``"value"``. When ``total_turns`` is odd, the existing first turn is
    relabelled as the system turn instead. The list (and, in the odd case,
    its first dict) is modified in place.

    Args:
        conversation: List of turn dicts with ``"from"`` and ``"value"`` keys.
        total_turns: Turn count whose parity selects the branch; the caller
            in this file passes ``len(conversation)``.

    Returns:
        The same ``conversation`` list object, after modification.

    Raises:
        IndexError: If ``total_turns`` is even but ``conversation`` has no
            second turn, or odd but ``conversation`` is empty.
    """
    if total_turns % 2 == 0:
        # Only this branch needs the second turn. Reading it before the
        # parity check would raise IndexError for single-turn conversations,
        # which take the other branch and never use it.
        system_content = get_system_content(conversation[1]["value"])
        conversation.insert(0, {"from": "system", "value": system_content})
    else:
        conversation[0]["from"] = "system"
    return conversation
| |
|
| | |
def reformat_conversation(conversation):
    """Convert raw turns into ``content`` / ``do_train`` / ``role`` records.

    The conversation is first passed through ``add_system_role`` (which
    modifies the list in place). Each turn then becomes a dict holding:

    * ``"content"``: the turn's ``"value"``;
    * ``"do_train"``: ``True`` or ``False``, drawn with ``random.choice``
      independently for every turn;
    * ``"role"``: ``"System"`` when the turn's ``"from"`` is ``"system"``,
      otherwise ``"User"`` at odd positions and ``"Assistant"`` at even ones.

    Args:
        conversation: List of turn dicts with ``"from"`` and ``"value"`` keys.

    Returns:
        A new list of reformatted turn dicts, in the original order.
    """
    turns = add_system_role(conversation, len(conversation))

    def to_record(position, turn):
        # Resolve the role label first, then assemble the output record.
        if turn["from"] == "system":
            label = "System"
        elif position % 2 == 1:
            label = "User"
        else:
            label = "Assistant"
        return {
            "content": turn["value"],
            "do_train": random.choice([True, False]),
            "role": label,
        }

    return [to_record(position, turn) for position, turn in enumerate(turns)]
| |
|
| | |
def load_and_reformat_conversations(pattern="*.jsonl", exclude=("combined_conversations.jsonl",)):
    """Load every matching JSONL file and reformat each conversation in it.

    Each non-blank line is parsed as a JSON object, its ``"conversations"``
    list is run through ``reformat_conversation``, and the result is wrapped
    as ``{"conversation": ...}``. The collected list is shuffled before it is
    returned.

    Args:
        pattern: Glob pattern selecting the input files. Defaults to every
            ``.jsonl`` file in the current working directory.
        exclude: File names to skip, compared against the paths exactly as
            ``glob`` returns them. The default skips the output file this
            script writes; it also matches ``*.jsonl``, and its records have
            no ``"conversations"`` key, so reading it back on a second run
            would raise ``KeyError``.

    Returns:
        A shuffled list of ``{"conversation": [...]}`` dicts.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"conversations"`` key.
    """
    all_conversations = []
    skipped = set(exclude)
    # Filter before handing the list to tqdm so the progress total is right.
    file_names = [name for name in glob(pattern) if name not in skipped]

    for file_name in tqdm(file_names, desc="Processing files"):
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(file_name, 'r', encoding='utf-8') as file:
            for line in tqdm(file, desc=f"Processing {file_name}", leave=False):
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                data = json.loads(line)
                reformatted_convo = reformat_conversation(data['conversations'])
                all_conversations.append({"conversation": reformatted_convo})

    random.shuffle(all_conversations)
    return all_conversations
| |
|
| | |
# Load and reformat all conversations.
reformatted_conversations = load_and_reformat_conversations()

# Every reformatted conversation must have an odd number of turns; abort
# before writing anything if one does not.
odd_turns_check = not any(
    len(convo["conversation"]) % 2 == 0
    for convo in reformatted_conversations
)
if not odd_turns_check:
    raise ValueError("Some conversations have an even number of turns after reformatting.")

# Write the result as JSON Lines: one conversation object per line.
output_file = 'combined_conversations.jsonl'
with open(output_file, 'w') as outfile:
    for convo in reformatted_conversations:
        outfile.write(json.dumps(convo) + '\n')

# Bare expression kept as in the original; it has no effect in a plain script
# (presumably there so a notebook/REPL echoes the output path).
output_file
| |
|