Spaces:

mnhatdaous
/

learnable-speech

Sleeping

File size: 7,185 Bytes

248479c

#!/usr/bin/env python3
"""Upload trained Learnable-Speech models to Hugging Face Hub"""

import os
import argparse
from huggingface_hub import HfApi, create_repo, upload_file, upload_folder
import torch
import json
from pathlib import Path

def create_model_card(model_name, training_info):
    """Render the README.md (model card) text for an uploaded model.

    Args:
        model_name: Short model identifier. ``"llm"`` selects the language
            model wording; any other value gets the flow-matching decoder
            wording.
        training_info: Pre-formatted Markdown inserted verbatim under the
            "Training Details" heading.

    Returns:
        The full model card as a string: YAML front matter followed by the
        Markdown body.
    """
    heading = model_name.upper()

    # Only two descriptions exist; everything that is not "llm" is described
    # as the flow-matching decoder.
    if model_name == "llm":
        role = "Language Model"
        architecture = "Qwen2-based transformer for BPE→FSQ token mapping"
    else:
        role = "Flow Matching Decoder"
        architecture = "Causal conditional flow matching for FSQ→DAC latent mapping"

    # Doubled braces in the BibTeX section are literal braces in the output.
    return f"""---
license: apache-2.0
tags:
- text-to-speech
- speech-synthesis
- learnable-speech
- cosyvoice
- pytorch
pipeline_tag: text-to-speech
library_name: pytorch
---

# Learnable-Speech {heading}

This is a trained {model_name} model from the Learnable-Speech project, an unofficial implementation based on improvements of CosyVoice with learnable encoder and DAC-VAE.

## Model Description

- **Model Type**: {heading} ({role})
- **Architecture**: {architecture}
- **Sample Rate**: 24kHz
- **Framework**: PyTorch

## Training Details

{training_info}

## Usage

```python
import torch
from learnable_speech import LearnableSpeech

# Load the model
model = LearnableSpeech.from_pretrained("your-username/learnable-speech-{model_name}")

# Generate speech
text = "Hello, this is Learnable-Speech!"
audio = model.synthesize(text)
```

## Citation

If you use this model, please cite:

```bibtex
@article{{learnable-speech,
  title={{Learnable-Speech}},
  author={{Learnable team}},
  year={{2025}},
  url={{https://arxiv.org/pdf/2505.07916}}
}}
```

## Links

- [GitHub Repository](https://github.com/primepake/learnable-speech)
- [Original Paper](https://arxiv.org/pdf/2505.07916)
- [Hugging Face Space Demo](https://huggingface.co/spaces/mnhatdaous/learnable-speech)
"""

def upload_model_to_hf(checkpoint_path, model_name, repo_name, token=None, private=False):
    """Upload a trained checkpoint, a generated model card and a config to the Hub.

    The checkpoint file is uploaded as ``pytorch_model.bin``, the model card
    as ``README.md`` and a small JSON config as ``config.json``. The card and
    config are sent as in-memory UTF-8 bytes, so nothing is written to the
    working directory and nothing is left behind when an upload fails.

    Args:
        checkpoint_path: Path to the checkpoint file on disk.
        model_name: Short model identifier; used in the model card and stored
            as ``architecture`` in the uploaded config.
        repo_name: Target repository id, e.g. ``"user/learnable-speech-llm"``.
        token: Hugging Face API token forwarded to every Hub call.
        private: Whether the repository is created as private.

    Returns:
        ``True`` when all three files were uploaded; ``False`` when repository
        creation or any upload failed (the error is printed, not raised).
    """
    # Create repository
    try:
        create_repo(
            repo_id=repo_name,
            token=token,
            private=private,
            exist_ok=True,
        )
        print(f"✅ Repository {repo_name} created/found")
    except Exception as e:
        print(f"❌ Failed to create repository: {e}")
        return False

    # Load checkpoint to get training info. This is best-effort: any failure
    # (unreadable file, checkpoint that is not a dict, ...) only degrades the
    # model card text and never aborts the upload.
    try:
        # NOTE(review): torch.load unpickles the file, which can execute
        # arbitrary code; only point this at checkpoints you trust.
        checkpoint = torch.load(checkpoint_path, map_location='cpu')
        training_info = f"""
- **Training Steps**: {checkpoint.get('step', 'Unknown')}
- **Training Epochs**: {checkpoint.get('epoch', 'Unknown')}
- **Training Framework**: PyTorch DDP with AMP
- **Optimizer**: AdamW
- **Learning Rate**: {checkpoint.get('lr', 'Unknown')}
"""
        # Only the metadata was needed; release the weights before uploading.
        del checkpoint
    except Exception as e:
        print(f"⚠️  Could not load checkpoint info: {e}")
        training_info = "Training information not available"

    # Build the card and config in memory. Encoding explicitly as UTF-8 keeps
    # non-ASCII characters in the card from depending on the platform's
    # default text encoding.
    model_card = create_model_card(model_name, training_info)
    config = {
        "model_type": "learnable_speech",
        "architecture": model_name,
        "sample_rate": 24000,
        "framework": "pytorch",
    }

    try:
        # Upload checkpoint
        upload_file(
            path_or_fileobj=checkpoint_path,
            path_in_repo="pytorch_model.bin",
            repo_id=repo_name,
            token=token,
        )
        print("✅ Model checkpoint uploaded")

        # Upload model card
        upload_file(
            path_or_fileobj=model_card.encode("utf-8"),
            path_in_repo="README.md",
            repo_id=repo_name,
            token=token,
        )
        print("✅ Model card uploaded")

        # Upload config
        upload_file(
            path_or_fileobj=json.dumps(config, indent=2).encode("utf-8"),
            path_in_repo="config.json",
            repo_id=repo_name,
            token=token,
        )
        print("✅ Config uploaded")

        print(f"🎉 Model successfully uploaded to: https://huggingface.co/{repo_name}")
        return True

    except Exception as e:
        print(f"❌ Failed to upload: {e}")
        return False

def main():
    """Command-line entry point: upload the newest checkpoint of each selected model.

    For every selected model the newest ``*.pt`` file (by modification time)
    in ``<checkpoint_dir>/<model>`` is uploaded to the repository
    ``<username>/learnable-speech-<model>``. A model whose directory or
    checkpoints are missing is reported and skipped.

    Raises:
        SystemExit: With status 1 when no token is available or when at least
            one selected model was not uploaded, so shell scripts and CI can
            detect the failure. ``argparse`` itself also raises ``SystemExit``
            on invalid arguments.
    """
    parser = argparse.ArgumentParser(description="Upload Learnable-Speech models to Hugging Face")
    parser.add_argument("--checkpoint_dir", required=True, help="Directory containing trained checkpoints")
    parser.add_argument("--username", required=True, help="Your Hugging Face username")
    parser.add_argument("--token", help="Hugging Face API token (or set HF_TOKEN env var)")
    parser.add_argument("--private", action="store_true", help="Make repositories private")
    parser.add_argument("--models", nargs="+", choices=["llm", "flow", "both"], default=["both"],
                       help="Which models to upload")

    args = parser.parse_args()

    # Get token: the command-line flag wins over the environment variable.
    token = args.token or os.getenv("HF_TOKEN")
    if not token:
        print("❌ Please provide Hugging Face token via --token or HF_TOKEN env var")
        raise SystemExit(1)

    checkpoint_dir = Path(args.checkpoint_dir)

    if "both" in args.models:
        models_to_upload = ["llm", "flow"]
    else:
        # Drop repeated names (e.g. "--models llm llm") while keeping order,
        # so a model is not uploaded twice and the final tally stays honest.
        models_to_upload = list(dict.fromkeys(args.models))

    success_count = 0

    for model_name in models_to_upload:
        print(f"\n🚀 Uploading {model_name.upper()} model...")

        # Find latest checkpoint
        model_dir = checkpoint_dir / model_name
        if not model_dir.exists():
            print(f"❌ Model directory not found: {model_dir}")
            continue

        checkpoint_files = list(model_dir.glob("*.pt"))
        if not checkpoint_files:
            print(f"❌ No checkpoint files found in {model_dir}")
            continue

        # Get the latest checkpoint (by modification time)
        latest_checkpoint = max(checkpoint_files, key=os.path.getmtime)
        print(f"📁 Using checkpoint: {latest_checkpoint}")

        # Upload to HF
        repo_name = f"{args.username}/learnable-speech-{model_name}"
        success = upload_model_to_hf(
            checkpoint_path=str(latest_checkpoint),
            model_name=model_name,
            repo_name=repo_name,
            token=token,
            private=args.private,
        )

        if success:
            success_count += 1

    print(f"\n🎉 Upload complete! {success_count}/{len(models_to_upload)} models uploaded successfully")

    if success_count > 0:
        print("\n📝 Next steps:")
        print("1. Update your Gradio app to use the uploaded models")
        print("2. Test the models in your Hugging Face Space")
        print("3. Share your trained models with the community!")

    # Signal partial or total failure through the process exit status.
    if success_count < len(models_to_upload):
        raise SystemExit(1)

# Run the uploader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()