import json
from tempfile import NamedTemporaryFile
|
|
import argilla as rg
from huggingface_hub import HfApi
|
|
from defaults import REMOTE_CODE_PATHS, SEED_DATA_PATH
|
|
|
|
hf_api = HfApi()
|
|
| with open("DATASET_README_BASE.md") as f: |
| DATASET_README_BASE = f.read() |
|
|
|
|
def create_readme(domain_seed_data, project_name, domain):
    """Render a dataset card from the seed data and return the path to a temp file."""
    readme = DATASET_README_BASE
    readme += f"# {project_name}\n\n## Domain: {domain}"
    perspectives = domain_seed_data.get("perspectives")
    topics = domain_seed_data.get("topics")
    examples = domain_seed_data.get("examples")
    if perspectives:
        readme += "\n\n## Perspectives\n\n"
        for p in perspectives:
            readme += f"- {p}\n"
    if topics:
        readme += "\n\n## Topics\n\n"
        for t in topics:
            readme += f"- {t}\n"
    if examples:
        readme += "\n\n## Examples\n\n"
        for example in examples:
            readme += f"### {example['question']}\n\n{example['answer']}\n\n"
    # Write the card to a named temp file. NamedTemporaryFile replaces the
    # deprecated, race-prone tempfile.mktemp().
    with NamedTemporaryFile("w", delete=False, suffix=".md") as f:
        f.write(readme)
    return f.name
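
# The seed data passed to create_readme() is expected to look roughly like
# this; a hypothetical example, inferred from the keys accessed above:
#
#   {
#       "perspectives": ["organic farmer"],
#       "topics": ["soil health"],
#       "examples": [{"question": "...", "answer": "..."}],
#   }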
|
|
|
|
def setup_dataset_on_hub(repo_id, hub_token):
    """Create the dataset repo on the Hub if it does not already exist."""
    hf_api.create_repo(
        repo_id=repo_id,
        token=hub_token,
        repo_type="dataset",
        exist_ok=True,
    )
|
|
|
|
def push_dataset_to_hub(
    domain_seed_data_path,
    project_name,
    domain,
    pipeline_path,
    hub_username,
    hub_token: str,
):
    """Create the dataset repo, then upload the seed data and a generated README."""
    repo_id = f"{hub_username}/{project_name}"

    setup_dataset_on_hub(repo_id=repo_id, hub_token=hub_token)

    # Upload the raw seed data alongside the dataset.
    hf_api.upload_file(
        path_or_fileobj=domain_seed_data_path,
        path_in_repo="seed_data.json",
        token=hub_token,
        repo_id=repo_id,
        repo_type="dataset",
    )

    # Generate a dataset card from the seed data and upload it as the README.
    with open(domain_seed_data_path) as f:
        domain_seed_data = json.load(f)
    hf_api.upload_file(
        path_or_fileobj=create_readme(
            domain_seed_data=domain_seed_data, project_name=project_name, domain=domain
        ),
        path_in_repo="README.md",
        token=hub_token,
        repo_id=repo_id,
        repo_type="dataset",
    )
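
# Usage sketch for push_dataset_to_hub; all values below are hypothetical
# placeholders, not defaults of this module:
#
#   push_dataset_to_hub(
#       domain_seed_data_path="seed_data.json",
#       project_name="my-project",
#       domain="farming",
#       pipeline_path="pipeline.py",
#       hub_username="my-username",
#       hub_token="hf_...",
#   )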
|
|
|
|
def push_pipeline_to_hub(
    pipeline_path,
    hub_username,
    hub_token: str,
    project_name,
):
    """Upload the pipeline script and its supporting code files to the dataset repo."""
    repo_id = f"{hub_username}/{project_name}"

    # Upload the main pipeline script.
    hf_api.upload_file(
        path_or_fileobj=pipeline_path,
        path_in_repo="pipeline.py",
        token=hub_token,
        repo_id=repo_id,
        repo_type="dataset",
    )

    # Upload any additional code files the pipeline needs at runtime.
    for code_path in REMOTE_CODE_PATHS:
        hf_api.upload_file(
            path_or_fileobj=code_path,
            path_in_repo=code_path,
            token=hub_token,
            repo_id=repo_id,
            repo_type="dataset",
        )

    print(f"Pipeline uploaded to {repo_id}")
|
|
|
|
def pull_seed_data_from_repo(repo_id, hub_token):
    """Download the seed data file from the Hub and return its parsed contents."""
    # hf_hub_download returns the path of the file in the local cache; read
    # from that path rather than assuming the file landed in the working dir.
    local_path = hf_api.hf_hub_download(
        repo_id=repo_id, token=hub_token, repo_type="dataset", filename=SEED_DATA_PATH
    )
    with open(local_path) as f:
        return json.load(f)
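
# Usage sketch (hypothetical values); returns the parsed seed data dict:
#
#   seed_data = pull_seed_data_from_repo(
#       repo_id="my-username/my-project", hub_token="hf_..."
#   )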
|
|
|
|
def push_argilla_dataset_to_hub(
    name: str,
    repo_id: str,
    url: str,
    api_key: str,
    hub_token: str,
    workspace: str = "admin",
):
    """Pull a FeedbackDataset from an Argilla instance and push it to the Hub."""
    rg.init(api_url=url, api_key=api_key)
    feedback_dataset = rg.FeedbackDataset.from_argilla(name=name, workspace=workspace)
    local_dataset = feedback_dataset.pull()
    local_dataset.push_to_huggingface(repo_id=repo_id, token=hub_token)
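
# Usage sketch for push_argilla_dataset_to_hub; the URL, API key, and token
# below are hypothetical placeholders:
#
#   push_argilla_dataset_to_hub(
#       name="my-dataset",
#       repo_id="my-username/my-project",
#       url="https://my-argilla-instance.hf.space",
#       api_key="owner.apikey",
#       hub_token="hf_...",
#   )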
|
|
|
|
def push_pipeline_params(
    pipeline_params,
    hub_username,
    hub_token: str,
    project_name,
):
    """Serialize the pipeline parameters to JSON and upload them to the dataset repo."""
    repo_id = f"{hub_username}/{project_name}"
    # Write the params to a named temp file; NamedTemporaryFile replaces the
    # deprecated tempfile.mktemp().
    with NamedTemporaryFile("w", delete=False, suffix=".json") as f:
        json.dump(pipeline_params, f)
    hf_api.upload_file(
        path_or_fileobj=f.name,
        path_in_repo="pipeline_params.json",
        token=hub_token,
        repo_id=repo_id,
        repo_type="dataset",
    )

    print(f"Pipeline params uploaded to {repo_id}")
|
|