# learnable-speech/scripts/upload_to_hf.py
#!/usr/bin/env python3
"""Upload trained Learnable-Speech models to Hugging Face Hub"""
import os
import argparse
from huggingface_hub import HfApi, create_repo, upload_file, upload_folder
import torch
import json
from pathlib import Path
def create_model_card(model_name, training_info):
    """Build the Markdown model card that is uploaded as README.md.

    Args:
        model_name: Either "llm" or "flow"; selects the per-model description.
        training_info: Pre-formatted Markdown block with training details.

    Returns:
        The complete model card (YAML front matter + Markdown body) as one string.
    """
    # Hoist the per-model text out of the literal so the template stays readable.
    is_llm = model_name == "llm"
    model_type = "Language Model" if is_llm else "Flow Matching Decoder"
    architecture = (
        "Qwen2-based transformer for BPE→FSQ token mapping"
        if is_llm
        else "Causal conditional flow matching for FSQ→DAC latent mapping"
    )
    return f"""---
license: apache-2.0
tags:
- text-to-speech
- speech-synthesis
- learnable-speech
- cosyvoice
- pytorch
pipeline_tag: text-to-speech
library_name: pytorch
---
# Learnable-Speech {model_name.upper()}
This is a trained {model_name} model from the Learnable-Speech project, an unofficial implementation based on improvements of CosyVoice with learnable encoder and DAC-VAE.
## Model Description
- **Model Type**: {model_name.upper()} ({model_type})
- **Architecture**: {architecture}
- **Sample Rate**: 24kHz
- **Framework**: PyTorch
## Training Details
{training_info}
## Usage
```python
import torch
from learnable_speech import LearnableSpeech
# Load the model
model = LearnableSpeech.from_pretrained("your-username/learnable-speech-{model_name}")
# Generate speech
text = "Hello, this is Learnable-Speech!"
audio = model.synthesize(text)
```
## Citation
If you use this model, please cite:
```bibtex
@article{{learnable-speech,
title={{Learnable-Speech}},
author={{Learnable team}},
year={{2025}},
url={{https://arxiv.org/pdf/2505.07916}}
}}
```
## Links
- [GitHub Repository](https://github.com/primepake/learnable-speech)
- [Original Paper](https://arxiv.org/pdf/2505.07916)
- [Hugging Face Space Demo](https://huggingface.co/spaces/mnhatdaous/learnable-speech)
"""
def upload_model_to_hf(checkpoint_path, model_name, repo_name, token=None, private=False):
    """Upload a trained checkpoint, model card, and config to the Hugging Face Hub.

    Args:
        checkpoint_path: Path to the ``.pt`` checkpoint file to upload.
        model_name: Either "llm" or "flow"; used in the card text and temp file names.
        repo_name: Target repo id, e.g. "user/learnable-speech-llm".
        token: Hugging Face API token; when None, huggingface_hub falls back to
            its ambient credential resolution.
        private: Whether to create the repository as private.

    Returns:
        True when every upload succeeded, False otherwise (errors are printed,
        never raised).
    """
    # Create (or reuse) the target repository.
    try:
        create_repo(
            repo_id=repo_name,
            token=token,
            private=private,
            exist_ok=True
        )
        print(f"✅ Repository {repo_name} created/found")
    except Exception as e:
        print(f"❌ Failed to create repository: {e}")
        return False

    # Best-effort: read training metadata out of the checkpoint for the card.
    # NOTE(review): torch.load unpickles arbitrary objects — only run this on
    # checkpoints you trust. weights_only=True would be safer but is left off
    # to stay compatible with older torch versions that lack the parameter.
    try:
        checkpoint = torch.load(checkpoint_path, map_location='cpu')
        training_info = f"""
- **Training Steps**: {checkpoint.get('step', 'Unknown')}
- **Training Epochs**: {checkpoint.get('epoch', 'Unknown')}
- **Training Framework**: PyTorch DDP with AMP
- **Optimizer**: AdamW
- **Learning Rate**: {checkpoint.get('lr', 'Unknown')}
"""
    except Exception as e:
        print(f"⚠️ Could not load checkpoint info: {e}")
        training_info = "Training information not available"

    model_card = create_model_card(model_name, training_info)

    # Temp files written locally, uploaded, then always removed (see finally).
    readme_path = Path(f"README_{model_name}.md")
    config_path = Path(f"config_{model_name}.json")
    try:
        # encoding is pinned: the card contains non-ASCII characters (→) that
        # would break under a non-UTF-8 platform default encoding.
        readme_path.write_text(model_card, encoding="utf-8")

        # Upload checkpoint under the conventional filename.
        upload_file(
            path_or_fileobj=checkpoint_path,
            path_in_repo="pytorch_model.bin",
            repo_id=repo_name,
            token=token
        )
        print("✅ Model checkpoint uploaded")

        # Upload model card.
        upload_file(
            path_or_fileobj=str(readme_path),
            path_in_repo="README.md",
            repo_id=repo_name,
            token=token
        )
        print("✅ Model card uploaded")

        # Create and upload a minimal config.
        config = {
            "model_type": "learnable_speech",
            "architecture": model_name,
            "sample_rate": 24000,
            "framework": "pytorch"
        }
        config_path.write_text(json.dumps(config, indent=2), encoding="utf-8")
        upload_file(
            path_or_fileobj=str(config_path),
            path_in_repo="config.json",
            repo_id=repo_name,
            token=token
        )
        print("✅ Config uploaded")

        print(f"🎉 Model successfully uploaded to: https://huggingface.co/{repo_name}")
        return True
    except Exception as e:
        print(f"❌ Failed to upload: {e}")
        return False
    finally:
        # Clean up even when an upload fails — the original version leaked the
        # temp files on every failure path.
        readme_path.unlink(missing_ok=True)
        config_path.unlink(missing_ok=True)
def main():
    """CLI entry point: find the newest checkpoint per model and push it to the Hub."""
    parser = argparse.ArgumentParser(description="Upload Learnable-Speech models to Hugging Face")
    parser.add_argument("--checkpoint_dir", required=True, help="Directory containing trained checkpoints")
    parser.add_argument("--username", required=True, help="Your Hugging Face username")
    parser.add_argument("--token", help="Hugging Face API token (or set HF_TOKEN env var)")
    parser.add_argument("--private", action="store_true", help="Make repositories private")
    parser.add_argument("--models", nargs="+", choices=["llm", "flow", "both"], default=["both"],
                        help="Which models to upload")
    args = parser.parse_args()

    # Resolve the API token from the CLI flag, falling back to the environment.
    token = args.token or os.getenv("HF_TOKEN")
    if not token:
        print("❌ Please provide Hugging Face token via --token or HF_TOKEN env var")
        return

    checkpoint_dir = Path(args.checkpoint_dir)
    # "both" expands to the two concrete model names.
    selected = ["llm", "flow"] if "both" in args.models else args.models

    uploaded = 0
    for name in selected:
        print(f"\n🚀 Uploading {name.upper()} model...")

        model_dir = checkpoint_dir / name
        if not model_dir.exists():
            print(f"❌ Model directory not found: {model_dir}")
            continue

        candidates = list(model_dir.glob("*.pt"))
        if not candidates:
            print(f"❌ No checkpoint files found in {model_dir}")
            continue

        # The most recently modified checkpoint is taken as "latest".
        newest = max(candidates, key=os.path.getmtime)
        print(f"📁 Using checkpoint: {newest}")

        if upload_model_to_hf(
            checkpoint_path=str(newest),
            model_name=name,
            repo_name=f"{args.username}/learnable-speech-{name}",
            token=token,
            private=args.private,
        ):
            uploaded += 1

    print(f"\n🎉 Upload complete! {uploaded}/{len(selected)} models uploaded successfully")
    if uploaded > 0:
        print("\n📝 Next steps:")
        print("1. Update your Gradio app to use the uploaded models")
        print("2. Test the models in your Hugging Face Space")
        print("3. Share your trained models with the community!")
# Run the CLI only when executed as a script (not when imported as a module).
if __name__ == "__main__":
    main()