import json
import os
from tempfile import NamedTemporaryFile, mktemp

import argilla as rg
from huggingface_hub import HfApi

from defaults import REMOTE_CODE_PATHS, SEED_DATA_PATH
| |
|
| |
|
# Shared Hugging Face Hub client used by every helper in this module.
hf_api = HfApi()

# Base dataset-card template prepended to every generated README.
# NOTE(review): read at import time — "DATASET_README_BASE.md" must exist in
# the current working directory or importing this module raises.
with open("DATASET_README_BASE.md") as f:
    DATASET_README_BASE = f.read()
| |
|
| |
|
def create_readme(domain_seed_data, project_name, domain):
    """Render a dataset-card README and write it to a temporary file.

    Args:
        domain_seed_data: Mapping that may contain ``"perspectives"`` and
            ``"topics"`` (iterables of strings) and ``"examples"`` (iterable
            of dicts with ``"question"`` / ``"answer"`` keys).
        project_name: Project title used for the top-level heading.
        domain: Domain label shown under the title.

    Returns:
        Path to a temporary Markdown file containing the rendered README;
        the caller is responsible for uploading/removing it.
    """
    parts = [DATASET_README_BASE, f"# {project_name}\n\n## Domain: {domain}"]

    perspectives = domain_seed_data.get("perspectives")
    topics = domain_seed_data.get("topics")
    examples = domain_seed_data.get("examples")
    if perspectives:
        parts.append("\n\n## Perspectives\n\n")
        parts.extend(f"- {p}\n" for p in perspectives)
    if topics:
        parts.append("\n\n## Topics\n\n")
        parts.extend(f"- {t}\n" for t in topics)
    if examples:
        parts.append("\n\n## Examples\n\n")
        parts.extend(
            f"### {example['question']}\n\n{example['answer']}\n\n"
            for example in examples
        )

    # Fix: mktemp() is deprecated and race-prone — the returned path can be
    # claimed by another process before we open it. NamedTemporaryFile creates
    # the file atomically; delete=False keeps it on disk for the caller.
    with NamedTemporaryFile("w", suffix=".md", delete=False, encoding="utf-8") as tmp:
        tmp.write("".join(parts))
        return tmp.name
| |
|
| |
|
def setup_dataset_on_hub(repo_id, hub_token):
    """Ensure a dataset repository with the given id exists on the Hub.

    ``exist_ok=True`` makes this idempotent: an already-existing repo is
    left untouched instead of raising.
    """
    hf_api.create_repo(
        repo_id=repo_id,
        repo_type="dataset",
        token=hub_token,
        exist_ok=True,
    )
| |
|
| |
|
def push_dataset_to_hub(
    domain_seed_data_path,
    project_name,
    domain,
    pipeline_path,
    hub_username,
    hub_token: str,
):
    """Create (if needed) a dataset repo and upload its seed data and README.

    Args:
        domain_seed_data_path: Local path to the seed-data JSON file.
        project_name: Project name; becomes the repo name on the Hub.
        domain: Human-readable domain label used in the generated README.
        pipeline_path: Unused here; kept for backward-compatible signature.
        hub_username: Hub namespace (user or org) that owns the repo.
        hub_token: Hugging Face Hub access token.
    """
    repo_id = f"{hub_username}/{project_name}"

    setup_dataset_on_hub(repo_id=repo_id, hub_token=hub_token)

    # Version the raw seed data alongside the dataset.
    hf_api.upload_file(
        path_or_fileobj=domain_seed_data_path,
        path_in_repo="seed_data.json",
        token=hub_token,
        repo_id=repo_id,
        repo_type="dataset",
    )

    # Fix: the original used json.load(open(...)), which never closes the
    # file handle; use a context manager instead.
    with open(domain_seed_data_path) as f:
        domain_seed_data = json.load(f)

    hf_api.upload_file(
        path_or_fileobj=create_readme(
            domain_seed_data=domain_seed_data, project_name=project_name, domain=domain
        ),
        path_in_repo="README.md",
        token=hub_token,
        repo_id=repo_id,
        repo_type="dataset",
    )
| |
|
| |
|
def push_pipeline_to_hub(
    pipeline_path,
    hub_username,
    hub_token: str,
    project_name,
):
    """Upload the pipeline script and its remote-code modules to the Hub.

    The main pipeline file is stored as ``pipeline.py``; each path in
    ``REMOTE_CODE_PATHS`` is uploaded under its own name.
    """
    repo_id = f"{hub_username}/{project_name}"

    # Build the full upload plan first: (local path, path inside the repo).
    uploads = [(pipeline_path, "pipeline.py")]
    uploads.extend((code_path, code_path) for code_path in REMOTE_CODE_PATHS)

    for local_path, remote_path in uploads:
        hf_api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=remote_path,
            token=hub_token,
            repo_id=repo_id,
            repo_type="dataset",
        )

    print(f"Dataset uploaded to {repo_id}")
| |
|
| |
|
def pull_seed_data_from_repo(repo_id, hub_token):
    """Download the seed-data JSON from a Hub dataset repo and parse it.

    Fix: ``hf_hub_download`` stores the file in the local HF cache and
    returns its path; the original discarded that return value and re-opened
    ``SEED_DATA_PATH`` relative to the CWD, which only worked if a copy
    happened to exist there. Also closes the file via a context manager.

    Returns:
        The parsed JSON content of the repo's seed-data file.
    """
    local_path = hf_api.hf_hub_download(
        repo_id=repo_id, token=hub_token, repo_type="dataset", filename=SEED_DATA_PATH
    )
    with open(local_path) as f:
        return json.load(f)
| |
|
| |
|
def push_argilla_dataset_to_hub(
    name: str,
    repo_id: str,
    url: str,
    api_key: str,
    hub_token: str,
    workspace: str = "admin",
):
    """Pull an Argilla feedback dataset and publish it to the Hugging Face Hub.

    Connects to the Argilla server at *url*, fetches the dataset *name* from
    *workspace*, pulls a local snapshot, and pushes it to *repo_id*.
    """
    rg.init(api_url=url, api_key=api_key)
    remote_dataset = rg.FeedbackDataset.from_argilla(name=name, workspace=workspace)
    snapshot = remote_dataset.pull()
    snapshot.push_to_huggingface(repo_id=repo_id, token=hub_token)
| |
|
| |
|
def push_pipeline_params(
    pipeline_params,
    hub_username,
    hub_token: str,
    project_name,
):
    """Serialize pipeline params to JSON and upload them to the dataset repo.

    Args:
        pipeline_params: JSON-serializable object with the pipeline settings.
        hub_username: Hub namespace (user or org) that owns the repo.
        hub_token: Hugging Face Hub access token.
        project_name: Project name; the repo is ``{hub_username}/{project_name}``.
    """
    repo_id = f"{hub_username}/{project_name}"

    # Fix: mktemp() is deprecated and race-prone; NamedTemporaryFile creates
    # the file securely. delete=False keeps it on disk for the upload below.
    with NamedTemporaryFile("w", suffix=".json", delete=False, encoding="utf-8") as tmp:
        json.dump(pipeline_params, tmp)
        temp_path = tmp.name

    try:
        hf_api.upload_file(
            path_or_fileobj=temp_path,
            path_in_repo="pipeline_params.json",
            token=hub_token,
            repo_id=repo_id,
            repo_type="dataset",
        )
    finally:
        # Fix: the original leaked the temp file; remove it even on failure.
        os.remove(temp_path)

    print(f"Pipeline params uploaded to {repo_id}")
| |
|