| import json |
| import random |
| import re |
| from tqdm import tqdm |
| from glob import glob |
|
|
| |
def get_system_content(assistant_content):
    """Pick a system prompt string based on the text of an assistant reply.

    The rules are tried in order and the first match wins:

    1. The text contains one of the whole words ``int``, ``float``, ``char``,
       ``struct``, ``for``, ``while``, ``if`` or ``else``.
    2. The text contains a span wrapped in asterisks (``*like this*``).
    3. Otherwise the prompt echoes the first three whitespace-separated
       words of the text (fewer if the text is shorter).

    Args:
        assistant_content: The reply text to inspect.

    Returns:
        The selected system prompt string.
    """
    # (pattern, prompt) pairs, checked top to bottom.
    prompt_rules = (
        (r'\b(?:int|float|char|struct|for|while|if|else)\b', "you are a genius!"),
        (r"\*[^*]+\*", "lets tell a story"),
    )
    for pattern, prompt in prompt_rules:
        if re.search(pattern, assistant_content) is not None:
            return prompt

    # No rule matched: fall back to quoting the opening words.
    leading_words = assistant_content.split()[:3]
    return "start like " + ' '.join(leading_words)
|
|
| |
def add_system_role(conversation, total_turns):
    """Make the conversation begin with a system turn, modifying it in place.

    Only the parity of ``total_turns`` matters:

    * Even: a new ``{"from": "system", ...}`` turn is inserted at the front.
      Its text comes from ``get_system_content`` applied to the ``"value"``
      of the second turn (index 1), which is treated as the assistant reply.
    * Odd: the existing first turn is relabelled by setting its ``"from"``
      field to ``"system"``; its ``"value"`` is left untouched.

    Args:
        conversation: List of turn dicts with ``"from"`` and ``"value"`` keys.
            The list (and, in the odd case, its first dict) is mutated.
        total_turns: Turn count used to choose the branch; the visible caller
            passes ``len(conversation)``.

    Returns:
        The same ``conversation`` list object that was passed in.

    Raises:
        IndexError: If ``total_turns`` is even but the conversation has fewer
            than two turns, or odd but the conversation is empty.
    """
    if total_turns % 2 == 0:
        # Look up the second turn only here: the odd branch does not need it,
        # and reading it unconditionally crashed on single-turn conversations.
        assistant_content = conversation[1]["value"]
        system_content = get_system_content(assistant_content)
        conversation.insert(0, {"from": "system", "value": system_content})
    else:
        conversation[0]["from"] = "system"
    return conversation
|
|
| |
def reformat_conversation(conversation):
    """Convert a list of raw turns into ``content`` / ``do_train`` / ``role`` records.

    The conversation is first passed through ``add_system_role`` together
    with its length. Each resulting turn then becomes one dict:

    * ``"content"``: the turn's ``"value"``.
    * ``"do_train"``: ``True`` or ``False``, drawn at random for every turn.
    * ``"role"``: ``"System"`` when the turn's ``"from"`` is ``"system"``;
      otherwise ``"User"`` at odd positions and ``"Assistant"`` at even ones.

    Args:
        conversation: List of turn dicts with ``"from"`` and ``"value"`` keys.
            The same list object is handed to ``add_system_role``.

    Returns:
        A new list with one record dict per turn, in the original order.
    """
    turns = add_system_role(conversation, len(conversation))

    records = []
    for position, turn in enumerate(turns):
        # Resolve the role label first, then assemble the record.
        if turn["from"] == "system":
            speaker = "System"
        elif position % 2 == 1:
            speaker = "User"
        else:
            speaker = "Assistant"

        record = {
            "content": turn["value"],
            "do_train": random.choice([True, False]),
            "role": speaker,
        }
        records.append(record)
    return records
|
|
| |
def load_and_reformat_conversations(exclude=("combined_conversations.jsonl",)):
    """Load every ``*.jsonl`` file in the working directory and reformat it.

    Each non-blank line is parsed as a JSON object whose ``"conversations"``
    entry is passed to ``reformat_conversation``. The results are wrapped as
    ``{"conversation": ...}`` dicts, collected across all files, and shuffled.

    Args:
        exclude: Collection of file names (as returned by ``glob``) to skip.
            The default skips ``combined_conversations.jsonl``, the file this
            script writes into the same directory; without this, a re-run
            would try to ingest its own output, which has no
            ``"conversations"`` key.

    Returns:
        A shuffled list of ``{"conversation": [...]}`` dicts.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed line has no ``"conversations"`` entry.
    """
    all_conversations = []

    input_files = [name for name in glob("*.jsonl") if name not in exclude]
    for file_name in tqdm(input_files, desc="Processing files"):
        # Decode as UTF-8 explicitly so the result does not depend on the
        # platform's default locale encoding.
        with open(file_name, 'r', encoding="utf-8") as file:
            for line in tqdm(file, desc=f"Processing {file_name}", leave=False):
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                data = json.loads(line)
                reformatted_convo = reformat_conversation(data['conversations'])
                all_conversations.append({"conversation": reformatted_convo})

    random.shuffle(all_conversations)
    return all_conversations
|
|
| |
# Build the shuffled, reformatted dataset from the input files.
reformatted_conversations = load_and_reformat_conversations()

# Sanity check: every reformatted conversation must have an odd number of
# turns; abort before writing anything if one does not.
odd_turns_check = not any(
    len(convo["conversation"]) % 2 == 0 for convo in reformatted_conversations
)
if not odd_turns_check:
    raise ValueError("Some conversations have an even number of turns after reformatting.")

# Write the dataset as JSON Lines: one conversation object per line.
output_file = 'combined_conversations.jsonl'
with open(output_file, 'w') as outfile:
    for convo in reformatted_conversations:
        outfile.write(json.dumps(convo))
        outfile.write('\n')

# Bare expression with no effect when run as a script; presumably left over
# from a notebook cell that displayed the output path.
output_file
|
|