| import os |
| import json |
| import random |
| from glob import glob |
| from huggingface_hub import Repository |
|
|
| |
# File name of the shuffled, merged JSONL written to the working directory.
output_file_name = "combined_conversations.jsonl"

# Hugging Face Hub repository id. The script also uses this string as the
# relative path of the local clone that files are copied into.
repo_id = "AlignmentLab-AI/idonteven"
|
|
| |
def shuffle_and_combine_jsonls(output_file_name):
    """Merge every ``*.jsonl`` file in the working directory into one shuffled file.

    Records (lines) from all matching files are pooled, shuffled with
    ``random.shuffle`` and written to ``output_file_name``.

    Args:
        output_file_name: Path of the combined file to write. If a file with
            this path already exists in the working directory it is excluded
            from the inputs and then overwritten.

    Returns:
        ``output_file_name``, unchanged, for convenient chaining.
    """
    output_abs = os.path.abspath(output_file_name)
    all_lines = []
    # Sorted so the result is reproducible when the caller seeds ``random``;
    # glob order is otherwise filesystem-dependent.
    for jsonl_file in sorted(glob("*.jsonl")):
        # The output of a previous run also matches the pattern; reading it
        # back would duplicate every record on each re-run.
        if os.path.abspath(jsonl_file) == output_abs:
            continue
        with open(jsonl_file, "r", encoding="utf-8") as file:
            for line in file:
                record = line.rstrip("\r\n")
                # Blank lines are not valid JSONL records; after shuffling
                # they would end up scattered through the output.
                if not record.strip():
                    continue
                # Re-terminate every record: a final line without a newline
                # would otherwise be fused with whichever record follows it
                # after the shuffle.
                all_lines.append(record + "\n")
    random.shuffle(all_lines)
    with open(output_file_name, "w", encoding="utf-8") as outfile:
        outfile.writelines(all_lines)
    return output_file_name
|
|
| |
def clone_repository(repo_id, repo_type=None):
    """Clone a Hugging Face Hub repository into a local directory.

    Args:
        repo_id: Hub repository id. It is passed both as the remote to clone
            from and as the local directory argument of ``Repository``, so
            the clone lives at the relative path ``repo_id``.
        repo_type: Optional repository type forwarded to ``Repository``
            (for example ``"dataset"``). ``None`` keeps the library default,
            which matches the previous behaviour of this function.

    Returns:
        The ``Repository`` object wrapping the local clone.
    """
    return Repository(repo_id, clone_from=repo_id, repo_type=repo_type)
|
|
| |
def copy_files_to_repo(combined_jsonl_path, dest_dir=None):
    """Copy the combined file and every regular file in the cwd into the clone.

    Args:
        combined_jsonl_path: Path of the combined JSONL file to copy.
        dest_dir: Directory to copy into. Defaults to the module-level
            ``repo_id``, i.e. the local clone directory.

    Raises:
        FileNotFoundError: If ``dest_dir`` is not an existing directory.
            Without this check ``shutil.copy`` would create a plain file
            named like the last path component instead of failing.
    """
    # Local import: shutil is not among the file's top-level imports.
    import shutil

    if dest_dir is None:
        dest_dir = repo_id
    if not os.path.isdir(dest_dir):
        raise FileNotFoundError(f"destination directory does not exist: {dest_dir}")

    # shutil.copy instead of a shell `cp` string: file names containing
    # spaces or shell metacharacters are handled, and failures raise instead
    # of being silently ignored.
    shutil.copy(combined_jsonl_path, dest_dir)
    combined_abs = os.path.abspath(combined_jsonl_path)

    for entry in glob("*"):
        # Directories (including the top-level folder that contains the
        # clone) are skipped; plain `cp` without -r never copied them either.
        if not os.path.isfile(entry):
            continue
        # Already copied above.
        if os.path.abspath(entry) == combined_abs:
            continue
        shutil.copy(entry, dest_dir)
|
|
| |
def push_to_hub(repo, commit_message="Update dataset"):
    """Stage, commit and push the local clone's changes.

    Args:
        repo: Object exposing ``git_add()``, ``git_commit(message)`` and
            ``git_push()``; the script passes the value returned by
            ``clone_repository``.
        commit_message: Message for the commit. Defaults to the message
            this function previously hard-coded.
    """
    repo.git_add()
    repo.git_commit(commit_message)
    repo.git_push()
|
|
| |
# Run the pipeline only when executed as a script, so importing this module
# does not clone a repository or push to the Hub as a side effect.
if __name__ == "__main__":
    # Build the combined file first so it exists when files are copied.
    combined_jsonl_path = shuffle_and_combine_jsonls(output_file_name)
    # The clone must exist before anything is copied into it.
    repo = clone_repository(repo_id)
    copy_files_to_repo(combined_jsonl_path)
    push_to_hub(repo)
|
|