File size: 7,185 Bytes
248479c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
#!/usr/bin/env python3
"""Upload trained Learnable-Speech models to Hugging Face Hub"""

import os
import argparse
from huggingface_hub import HfApi, create_repo, upload_file, upload_folder
import torch
import json
from pathlib import Path

def create_model_card(model_name, training_info):
    """Create a model card for the uploaded model"""
    return f"""---
license: apache-2.0
tags:
- text-to-speech
- speech-synthesis
- learnable-speech
- cosyvoice
- pytorch
pipeline_tag: text-to-speech
library_name: pytorch
---

# Learnable-Speech {model_name.upper()}

This is a trained {model_name} model from the Learnable-Speech project, an unofficial implementation based on improvements of CosyVoice with learnable encoder and DAC-VAE.

## Model Description

- **Model Type**: {model_name.upper()} ({"Language Model" if model_name == "llm" else "Flow Matching Decoder"})
- **Architecture**: {"Qwen2-based transformer for BPE→FSQ token mapping" if model_name == "llm" else "Causal conditional flow matching for FSQ→DAC latent mapping"}
- **Sample Rate**: 24kHz
- **Framework**: PyTorch

## Training Details

{training_info}

## Usage

```python
import torch
from learnable_speech import LearnableSpeech

# Load the model
model = LearnableSpeech.from_pretrained("your-username/learnable-speech-{model_name}")

# Generate speech
text = "Hello, this is Learnable-Speech!"
audio = model.synthesize(text)
```

## Citation

If you use this model, please cite:

```bibtex
@article{{learnable-speech,
  title={{Learnable-Speech}},
  author={{Learnable team}},
  year={{2025}},
  url={{https://arxiv.org/pdf/2505.07916}}
}}
```

## Links

- [GitHub Repository](https://github.com/primepake/learnable-speech)
- [Original Paper](https://arxiv.org/pdf/2505.07916)
- [Hugging Face Space Demo](https://huggingface.co/spaces/mnhatdaous/learnable-speech)
"""

def upload_model_to_hf(checkpoint_path, model_name, repo_name, token=None, private=False):
    """Upload trained model to Hugging Face Hub"""
    
    api = HfApi(token=token)
    
    # Create repository
    try:
        create_repo(
            repo_id=repo_name,
            token=token,
            private=private,
            exist_ok=True
        )
        print(f"✅ Repository {repo_name} created/found")
    except Exception as e:
        print(f"❌ Failed to create repository: {e}")
        return False
    
    # Load checkpoint to get training info
    try:
        checkpoint = torch.load(checkpoint_path, map_location='cpu')
        training_info = f"""
- **Training Steps**: {checkpoint.get('step', 'Unknown')}
- **Training Epochs**: {checkpoint.get('epoch', 'Unknown')}
- **Training Framework**: PyTorch DDP with AMP
- **Optimizer**: AdamW
- **Learning Rate**: {checkpoint.get('lr', 'Unknown')}
"""
    except Exception as e:
        print(f"⚠️  Could not load checkpoint info: {e}")
        training_info = "Training information not available"
    
    # Create model card
    model_card = create_model_card(model_name, training_info)
    
    # Save model card to temporary file
    with open(f"README_{model_name}.md", "w") as f:
        f.write(model_card)
    
    try:
        # Upload checkpoint
        upload_file(
            path_or_fileobj=checkpoint_path,
            path_in_repo="pytorch_model.bin",
            repo_id=repo_name,
            token=token
        )
        print(f"✅ Model checkpoint uploaded")
        
        # Upload model card
        upload_file(
            path_or_fileobj=f"README_{model_name}.md",
            path_in_repo="README.md",
            repo_id=repo_name,
            token=token
        )
        print(f"✅ Model card uploaded")
        
        # Create and upload config
        config = {
            "model_type": "learnable_speech",
            "architecture": model_name,
            "sample_rate": 24000,
            "framework": "pytorch"
        }
        
        with open(f"config_{model_name}.json", "w") as f:
            json.dump(config, f, indent=2)
        
        upload_file(
            path_or_fileobj=f"config_{model_name}.json",
            path_in_repo="config.json",
            repo_id=repo_name,
            token=token
        )
        print(f"✅ Config uploaded")
        
        # Cleanup
        os.remove(f"README_{model_name}.md")
        os.remove(f"config_{model_name}.json")
        
        print(f"🎉 Model successfully uploaded to: https://huggingface.co/{repo_name}")
        return True
        
    except Exception as e:
        print(f"❌ Failed to upload: {e}")
        return False

def main():
    parser = argparse.ArgumentParser(description="Upload Learnable-Speech models to Hugging Face")
    parser.add_argument("--checkpoint_dir", required=True, help="Directory containing trained checkpoints")
    parser.add_argument("--username", required=True, help="Your Hugging Face username")
    parser.add_argument("--token", help="Hugging Face API token (or set HF_TOKEN env var)")
    parser.add_argument("--private", action="store_true", help="Make repositories private")
    parser.add_argument("--models", nargs="+", choices=["llm", "flow", "both"], default=["both"],
                       help="Which models to upload")
    
    args = parser.parse_args()
    
    # Get token
    token = args.token or os.getenv("HF_TOKEN")
    if not token:
        print("❌ Please provide Hugging Face token via --token or HF_TOKEN env var")
        return
    
    checkpoint_dir = Path(args.checkpoint_dir)
    
    models_to_upload = []
    if "both" in args.models:
        models_to_upload = ["llm", "flow"]
    else:
        models_to_upload = args.models
    
    success_count = 0
    
    for model_name in models_to_upload:
        print(f"\n🚀 Uploading {model_name.upper()} model...")
        
        # Find latest checkpoint
        model_dir = checkpoint_dir / model_name
        if not model_dir.exists():
            print(f"❌ Model directory not found: {model_dir}")
            continue
            
        checkpoint_files = list(model_dir.glob("*.pt"))
        if not checkpoint_files:
            print(f"❌ No checkpoint files found in {model_dir}")
            continue
            
        # Get the latest checkpoint (by modification time)
        latest_checkpoint = max(checkpoint_files, key=os.path.getmtime)
        print(f"📁 Using checkpoint: {latest_checkpoint}")
        
        # Upload to HF
        repo_name = f"{args.username}/learnable-speech-{model_name}"
        success = upload_model_to_hf(
            checkpoint_path=str(latest_checkpoint),
            model_name=model_name,
            repo_name=repo_name,
            token=token,
            private=args.private
        )
        
        if success:
            success_count += 1
    
    print(f"\n🎉 Upload complete! {success_count}/{len(models_to_upload)} models uploaded successfully")
    
    if success_count > 0:
        print("\n📝 Next steps:")
        print("1. Update your Gradio app to use the uploaded models")
        print("2. Test the models in your Hugging Face Space")
        print("3. Share your trained models with the community!")

if __name__ == "__main__":
    main()