rakuten / src /upload_to_hub.py
Demosthene-OR's picture
Configure LFS for images and update code
eb5ec73
import os
import glob
from filesplit.merge import Merge
from huggingface_hub import HfApi, create_repo, upload_folder, notebook_login
import shutil
from pathlib import Path
# Configuration: local input locations, given as relative paths.
MODELS_DIR = "models"  # passed to reassemble_models() and upload_models() by the script entry point
DATA_DIR = "data/preprocessed"  # passed to upload_data() by the script entry point
# Hugging Face account name.
# NOTE(review): not referenced by the functions visible in this file, which
# take the namespace from HfApi().whoami() instead -- confirm whether this
# constant is still needed.
HF_USERNAME = "Demosthene-OR"
# No login is performed by this script; it relies on the user already being
# logged in (e.g. via the CLI).
def reassemble_models(models_dir):
    """Reassemble the model files that were split using filesplit.

    For each known model name (``best_rnn_model``, ``best_vgg16_model``) this
    looks for a sub-directory of ``models_dir`` with that name. When the
    sub-directory is present, its contents are merged into ``<name>.h5``
    written directly inside ``models_dir``. The merge is run with
    ``cleanup=False``.

    Args:
        models_dir: Directory that holds the split-part sub-directories and
            receives the merged ``.h5`` files.

    Returns:
        None. Progress is reported on stdout.
    """
    print(f"Checking for split files in {models_dir}...")

    for model_name in ("best_rnn_model", "best_vgg16_model"):
        split_dir = os.path.join(models_dir, model_name)
        # isdir rather than exists: a stray regular file carrying the model's
        # name is not a directory of parts and must not be handed to Merge.
        if not os.path.isdir(split_dir):
            continue

        output_name = f"{model_name}.h5"
        print(f"Reassembling {output_name}...")
        Merge(split_dir, models_dir, output_name).merge(cleanup=False)
def upload_models(models_dir, repo_name, files=None):
    """Upload model files to a model repository on the Hugging Face Hub.

    The repository id is ``<user>/<repo_name>``, where ``<user>`` is the
    ``name`` field returned by ``HfApi().whoami()``. The repository is created
    if needed (``exist_ok=True``) and each file is uploaded to the repository
    root under its own file name.

    Local files are resolved before any network call. A warning is printed for
    every file that is missing, and when none of the files is present the
    function returns without contacting the Hub, so no empty repository is
    created.

    Args:
        models_dir: Local directory containing the model files.
        repo_name: Repository name, without the user namespace.
        files: Optional iterable of file names (relative to ``models_dir``)
            to upload. Defaults to the standard set of model artifacts.

    Returns:
        None. Progress, warnings and repository errors are reported on stdout.
    """
    print(f"Uploading models to {repo_name}...")

    if files is None:
        files = (
            "best_rnn_model.h5",
            "best_vgg16_model.h5",
            "tokenizer_config.json",
            "best_weights.json",
            "mapper.json",
        )

    # Split the requested names into those present locally and those missing.
    present = []
    for file_name in files:
        file_path = os.path.join(models_dir, file_name)
        if os.path.isfile(file_path):
            present.append((file_name, file_path))
        else:
            print(f"Warning: {file_name} not found in {models_dir}")

    if not present:
        # Nothing to send: avoid creating an empty remote repository.
        return

    api = HfApi()
    user = api.whoami()['name']
    repo_id = f"{user}/{repo_name}"

    # Create repo if it doesn't exist. Failures are reported and abort the
    # upload rather than propagating (best-effort script behaviour).
    try:
        create_repo(repo_id, repo_type="model", exist_ok=True)
    except Exception as e:
        print(f"Error creating/accessing repo: {e}")
        return
    print(f"Repository {repo_id} ready.")

    for file_name, file_path in present:
        print(f"Uploading {file_name}...")
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=file_name,
            repo_id=repo_id,
            repo_type="model",
        )
def upload_data(data_dir, repo_name):
    """Upload a local directory to a dataset repository on the Hugging Face Hub.

    The local directory is validated first. If it is missing or is not a
    directory, an error is printed and the function returns without
    contacting the Hub, so a bad path never creates a remote repository.
    Otherwise the repository ``<user>/<repo_name>`` is created if needed
    (``exist_ok=True``), where ``<user>`` is the ``name`` field returned by
    ``HfApi().whoami()``, and the whole directory is uploaded with
    ``upload_folder``.

    Args:
        data_dir: Local directory whose contents are uploaded.
        repo_name: Repository name, without the user namespace.

    Returns:
        None. Progress and errors are reported on stdout.
    """
    print(f"Uploading data to {repo_name}...")

    # Guard clause: check the local input before any remote side effect.
    if not os.path.isdir(data_dir):
        print(f"Error: Data directory {data_dir} does not exist.")
        return

    api = HfApi()
    user = api.whoami()['name']
    repo_id = f"{user}/{repo_name}"

    # Create repo if it doesn't exist. Failures are reported and abort the
    # upload rather than propagating (best-effort script behaviour).
    try:
        create_repo(repo_id, repo_type="dataset", exist_ok=True)
    except Exception as e:
        print(f"Error creating/accessing repo: {e}")
        return
    print(f"Repository {repo_id} ready.")

    # Upload the entire directory
    upload_folder(
        folder_path=data_dir,
        repo_id=repo_id,
        repo_type="dataset",
    )
if __name__ == "__main__":
    # Requires the filesplit package (for the legacy merge) and huggingface_hub.

    # Target repository names; change these to publish elsewhere.
    model_repo = "rakuten-models"
    data_repo = "rakuten-data"

    # Step 1: rebuild the split model files locally.
    reassemble_models(MODELS_DIR)

    # Step 2: push the model artifacts.
    upload_models(MODELS_DIR, model_repo)

    # Step 3: push the preprocessed data.
    upload_data(DATA_DIR, data_repo)

    print("Migration complete!")