|
|
def upload_to_hf_dataset(file_path, dataset_name, token, repo_type="dataset"):
    """
    Upload a local file to a Hugging Face repository.

    The file is stored in the repository under its base name (any directory
    components of ``file_path`` are dropped). The commit message and commit
    description are generated from that same base name.

    Errors are reported on stdout instead of being raised, so callers should
    check the return value to find out whether the upload went through.

    Args:
        file_path (str): Path to the file to upload
        dataset_name (str): Name of the dataset in format 'username/dataset-name'
        token (str): Hugging Face API token
        repo_type (str): Repository type, defaults to 'dataset'

    Returns:
        bool: True if the upload completed without raising, False otherwise.
    """
    from huggingface_hub import HfApi
    import os

    api = HfApi()

    try:
        # Computed once and reused for the in-repo path and both commit texts.
        # Kept inside the try so an unusable file_path is reported like any
        # other upload failure instead of escaping as an exception.
        file_name = os.path.basename(file_path)
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=file_name,
            repo_id=dataset_name,
            repo_type=repo_type,
            token=token,
            commit_message=f"Upload {file_name}",
            commit_description=f"Automated upload of {file_name} to dataset",
        )
    except Exception as e:
        # Deliberately best-effort: report the failure and signal it through
        # the return value rather than propagating the exception.
        print(f"Error uploading file: {e}")
        return False

    print(f"Successfully uploaded {file_path} to {dataset_name}")
    return True
|
|
|
|
|
|
|
|
def download_from_hf_dataset(file_path, dataset_name, token, repo_type="dataset"):
    """
    Download a file from a Hugging Face repository.

    The download is requested with ``local_dir="."``, i.e. relative to the
    current working directory. Errors are reported on stdout instead of being
    raised, so callers should check the return value.

    Args:
        file_path (str): Path in the repository to download from
        dataset_name (str): Name of the dataset in format 'username/dataset-name'
        token (str): Hugging Face API token
        repo_type (str): Repository type, defaults to 'dataset'

    Returns:
        The value returned by ``HfApi.hf_hub_download`` (documented by
        huggingface_hub as the local path of the downloaded file), or None
        if the download failed.
    """
    from huggingface_hub import HfApi

    api = HfApi()

    try:
        local_path = api.hf_hub_download(
            repo_id=dataset_name,
            filename=file_path,
            repo_type=repo_type,
            local_dir=".",
            token=token,
        )
    except Exception as e:
        # Deliberately best-effort: report the failure and return None
        # rather than propagating the exception.
        print(f"Error downloading file: {e}")
        return None

    print(f"Successfully downloaded {file_path} from {dataset_name}")
    return local_path
|
|
|
|
|
|
|
|
def load_hf_dataset(csv_filename, token, dataset_name_input):
    """
    Fetch one CSV file of a Hugging Face dataset as a pandas DataFrame.

    The file is loaded through ``datasets.load_dataset`` using the "train"
    split and then converted with ``to_pandas()``.

    Args:
        csv_filename (str): Name of the CSV file in the dataset
        token (str): Hugging Face authentication token
        dataset_name_input (str): Dataset identifier passed to ``load_dataset``

    Returns:
        pandas.DataFrame: DataFrame containing the dataset, or None when
        loading or conversion raises (the error is printed to stdout).
    """
    from datasets import load_dataset

    # Stays None unless both the load and the conversion succeed.
    frame = None
    try:
        train_split = load_dataset(
            dataset_name_input,
            data_files=csv_filename,
            split="train",
            token=token,
        )
        frame = train_split.to_pandas()
    except Exception as err:
        print(f"Error loading dataset: {err}")
    return frame
|
|
|