import json
import os
import random
import shutil
from glob import glob

from huggingface_hub import Repository
| |
|
| | |
# File name of the merged, shuffled JSONL written to the working directory.
output_file_name = "combined_conversations.jsonl"

# Hub repository identifier; the script also uses it as the local path of
# the clone and as the destination directory for copied files.
repo_id = "AlignmentLab-AI/idonteven"
| |
|
| | |
def shuffle_and_combine_jsonls(output_file_name):
    """Merge every ``*.jsonl`` file in the working directory into one shuffled file.

    Lines from all matching files are pooled, shuffled with ``random.shuffle``
    and written to ``output_file_name``.

    Args:
        output_file_name: Path of the combined file to write. If a file with
            this path already exists in the working directory it is
            overwritten and is NOT used as an input.

    Returns:
        ``output_file_name``, unchanged, for convenient chaining.
    """
    output_path = os.path.abspath(output_file_name)
    all_lines = []
    # Sorting makes the pre-shuffle order independent of directory listing
    # order, so seeding ``random`` yields a reproducible result.
    for jsonl_file in sorted(glob("*.jsonl")):
        # A combined file left over from an earlier run matches the pattern
        # too; reading it back would duplicate every record.
        if os.path.abspath(jsonl_file) == output_path:
            continue
        with open(jsonl_file, "r", encoding="utf-8") as file:
            for line in file:
                # The last line of a file may lack a newline; without one the
                # shuffle would fuse it with whichever record follows it.
                all_lines.append(line if line.endswith("\n") else line + "\n")
    random.shuffle(all_lines)
    with open(output_file_name, "w", encoding="utf-8") as outfile:
        outfile.writelines(all_lines)
    return output_file_name
| |
|
| | |
def clone_repository(repo_id, repo_type=None):
    """Clone a Hub repository into a local directory named after ``repo_id``.

    Args:
        repo_id: Hub repository identifier. It is passed both as the local
            directory and as the ``clone_from`` source, so the checkout lands
            at the relative path ``repo_id``.
        repo_type: Optional repository type forwarded to ``Repository``
            (for example ``"dataset"``). ``None`` keeps the library default.
            NOTE(review): the commit message used elsewhere in this script
            mentions a dataset -- confirm whether callers should pass
            ``repo_type="dataset"``.

    Returns:
        The ``Repository`` handle for the local checkout.
    """
    return Repository(repo_id, clone_from=repo_id, repo_type=repo_type)
| |
|
| | |
def copy_files_to_repo(combined_jsonl_path, dest_dir=None):
    """Copy the combined file and all regular files in the working directory into the repo checkout.

    Args:
        combined_jsonl_path: Path of the combined JSONL file to copy.
        dest_dir: Directory to copy into. Defaults to the module-level
            ``repo_id``, which doubles as the local checkout path.

    Raises:
        NotADirectoryError: If the destination directory does not exist,
            e.g. because the repository was never cloned.
    """
    target_dir = repo_id if dest_dir is None else dest_dir
    if not os.path.isdir(target_dir):
        raise NotADirectoryError(f"Repository directory not found: {target_dir}")

    # shutil.copy replaces the shell "cp" command: file names containing
    # spaces or shell metacharacters are handled, and failures raise instead
    # of being silently ignored.
    shutil.copy(combined_jsonl_path, target_dir)

    combined_abs = os.path.abspath(combined_jsonl_path)
    for file in glob("*"):
        # glob("*") also yields directories -- including the top-level folder
        # of the checkout itself -- which a plain "cp" could never copy.
        if not os.path.isfile(file):
            continue
        # The combined file has already been copied above.
        if os.path.abspath(file) == combined_abs:
            continue
        shutil.copy(file, target_dir)
| |
|
| | |
def push_to_hub(repo, commit_message="Update dataset"):
    """Stage, commit and push the contents of a repository checkout.

    Args:
        repo: Object exposing ``git_add()``, ``git_commit(message)`` and
            ``git_push()``; the script passes the handle returned by
            ``clone_repository``.
        commit_message: Message for the commit. Defaults to the message the
            script has always used.

    Returns:
        None.
    """
    repo.git_add()
    repo.git_commit(commit_message)
    repo.git_push()
| |
|
| | |
# Pipeline, executed in order at module level:
# 1. merge and shuffle the local JSONL files,
combined_jsonl_path = shuffle_and_combine_jsonls(output_file_name=output_file_name)
# 2. clone the target repository,
repo = clone_repository(repo_id=repo_id)
# 3. copy the combined file and the other working-directory files into it,
copy_files_to_repo(combined_jsonl_path=combined_jsonl_path)
# 4. commit and push the result.
push_to_hub(repo=repo)
| |
|