Alignment-Lab-AI
/

lfs-enable-largefiles

Model card Files Files and versions

lfs-enable-largefiles / script.py

Alignment-Lab-AI's picture

Alignment-Lab-AI

Upload folder using huggingface_hub

cf4acc9 over 2 years ago

3.55 kB

	import json
	import random
	import re
	from tqdm import tqdm
	from glob import glob

	# Function to check for special content and return appropriate system content
	def get_system_content(assistant_content):
	    """Choose a system prompt from the text of an assistant turn.

	    Args:
	        assistant_content: Text of the assistant turn to inspect.

	    Returns:
	        ``"you are a genius!"`` when the text contains one of a small set
	        of C-style keywords as a whole word; ``"lets tell a story"`` when
	        it contains a span wrapped in asterisks (``*like this*``);
	        otherwise ``"start like "`` followed by the first three
	        whitespace-separated words of the text (fewer if the text is
	        shorter, none if it is empty).
	    """
	    # The alternatives must be separated by a bare "|". With escaped pipes
	    # the pattern only matched the literal text "int|float|...", so this
	    # branch was effectively dead.
	    # NOTE(review): "for", "if" and "else" are also ordinary English words,
	    # so this branch fires on a lot of plain prose - confirm that is wanted.
	    if re.search(r'\b(?:int|float|char|struct|for|while|if|else)\b', assistant_content):
	        return "you are a genius!"

	    # NOTE(review): the previous pattern required a "[" *before* the
	    # start-of-string anchor and so could never match. An asterisk-delimited
	    # span (role-play style "*action*") is the assumed intent - confirm.
	    if re.search(r"\*[^*]+\*", assistant_content):
	        return "lets tell a story"

	    # Fall back to echoing the first three words of the assistant's turn.
	    first_three_words = ' '.join(assistant_content.split()[:3])
	    return f"start like {first_three_words}"

	# Function to add a System role to the conversation
	def add_system_role(conversation, total_turns):
	    """Make sure the conversation begins with a system turn.

	    The list (and, in the odd case, its first turn dict) is modified in
	    place; the same list object is returned for convenience.

	    Args:
	        conversation: List of turn dicts carrying ``"from"`` and
	            ``"value"`` keys.
	        total_turns: Number of turns in ``conversation``; its parity
	            selects the strategy.

	    Returns:
	        ``conversation`` itself. When ``total_turns`` is even, a new
	        system turn has been inserted at index 0, with its text derived
	        from the second turn via ``get_system_content``. When odd, the
	        first turn has been relabelled as the system turn.
	    """
	    if total_turns % 2 != 0:
	        # Odd: reuse the first turn as the system turn. The second turn is
	        # not needed here, so a single-turn conversation must not fail.
	        conversation[0]["from"] = "system"
	        return conversation

	    # Even: derive the system text from the second turn (the first
	    # assistant reply). An empty conversation has no such turn, so fall
	    # back to an empty string instead of raising IndexError.
	    assistant_content = conversation[1]["value"] if len(conversation) > 1 else ""
	    system_content = get_system_content(assistant_content)
	    conversation.insert(0, {"from": "system", "value": system_content})
	    return conversation

	# Function to reformat a single conversation
	def reformat_conversation(conversation):
	    """Convert one raw conversation into the output turn format.

	    The conversation is first passed through ``add_system_role`` (which
	    works on the list in place). Every turn is then turned into a dict
	    with ``"content"``, ``"do_train"`` and ``"role"`` keys.

	    Args:
	        conversation: List of turn dicts carrying ``"from"`` and
	            ``"value"`` keys.

	    Returns:
	        A new list with one output dict per turn. ``"do_train"`` is chosen
	        at random (``True`` or ``False``) independently for each turn.
	    """
	    def build_turn(position, raw_turn):
	        # A turn tagged "system" keeps that role; every other role is
	        # decided purely by position: odd index -> User, even -> Assistant.
	        if raw_turn["from"] == "system":
	            label = "System"
	        elif position % 2 == 1:
	            label = "User"
	        else:
	            label = "Assistant"
	        return {
	            "content": raw_turn["value"],
	            "do_train": random.choice([True, False]),
	            "role": label,
	        }

	    with_system = add_system_role(conversation, len(conversation))
	    return [
	        build_turn(position, raw_turn)
	        for position, raw_turn in enumerate(with_system)
	    ]

	# Function to load all .jsonl files, reformat them, and ensure odd number of turns
	def load_and_reformat_conversations(pattern="*.jsonl", exclude=("combined_conversations.jsonl",)):
	    """Load, reformat and shuffle every conversation found on disk.

	    Each matching file is read as JSON Lines: one JSON object per line,
	    with the turns stored under its ``"conversations"`` key.

	    Args:
	        pattern: Glob pattern selecting the input files. Defaults to every
	            ``.jsonl`` file in the current directory.
	        exclude: File names (exactly as ``glob`` returns them) to skip.
	            The default skips the combined output written by this script,
	            which would otherwise be picked up by ``*.jsonl`` on a second
	            run and fail for lack of a ``"conversations"`` key.

	    Returns:
	        A shuffled list of ``{"conversation": [...]}`` dicts, one per
	        input line, each holding the result of ``reformat_conversation``.
	    """
	    # Filter before handing the list to tqdm so the progress total is right.
	    file_names = [name for name in glob(pattern) if name not in exclude]

	    all_conversations = []
	    for file_name in tqdm(file_names, desc="Processing files"):
	        # JSON text is UTF-8; do not depend on the platform default encoding.
	        with open(file_name, 'r', encoding="utf-8") as file:
	            for line in tqdm(file, desc=f"Processing {file_name}", leave=False):
	                # Blank lines (e.g. a trailing newline) are not valid JSON.
	                if not line.strip():
	                    continue
	                data = json.loads(line)
	                reformatted_convo = reformat_conversation(data['conversations'])
	                all_conversations.append({"conversation": reformatted_convo})

	    # Mix conversations from all files together.
	    random.shuffle(all_conversations)
	    return all_conversations

	# Execute the reformatting function and save the result
	reformatted_conversations = load_and_reformat_conversations()

	# Sanity check: add_system_role inserts a turn only when the count is even
	# and otherwise leaves the length alone, so every reformatted conversation
	# is expected to have an odd number of turns.
	odd_turns_check = all(len(convo["conversation"]) % 2 != 0 for convo in reformatted_conversations)
	if not odd_turns_check:
	    raise ValueError("Some conversations have an even number of turns after reformatting.")

	# Save to a new .jsonl file, one conversation per line.
	output_file = 'combined_conversations.jsonl'
	with open(output_file, 'w', encoding="utf-8") as outfile:
	    for convo in reformatted_conversations:
	        json.dump(convo, outfile)
	        outfile.write('\n')

	# Report the name of the output file. A bare expression is only echoed in
	# an interactive session or notebook; in a script it does nothing.
	print(output_file)