humair025 committed on
Commit
c9c6cc6
·
verified ·
1 Parent(s): ca7a45e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -5
app.py CHANGED
@@ -1,15 +1,114 @@
1
  import gradio as gr
2
  import torch
3
  import numpy as np
4
- from linacodec.codec import LinaCodec
5
  import torchaudio
6
  import tempfile
7
  import os
8
 
9
- # Initialize the model
10
- print("Loading LinaCodec model...")
11
- lina_tokenizer = LinaCodec()
12
- print("Model loaded successfully!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  def encode_decode_audio(audio_input):
15
  """Encode and decode audio to demonstrate compression."""
@@ -51,7 +150,9 @@ def encode_decode_audio(audio_input):
51
  # Convert to numpy for Gradio
52
  decoded_audio = decoded_audio.cpu().squeeze().numpy()
53
 
 
54
  info = f"βœ… Success!\n"
 
55
  info += f"Original sample rate: {sr} Hz\n"
56
  info += f"Output sample rate: 48000 Hz\n"
57
  info += f"Speech tokens shape: {speech_tokens.shape}\n"
@@ -133,6 +234,7 @@ with gr.Blocks(title="LinaCodec Audio Tool", theme=gr.themes.Soft()) as demo:
133
  ### Features:
134
  - 🔄 **Encode & Decode**: Compress and reconstruct audio at 48kHz
135
  - 🎭 **Voice Conversion**: Transfer timbre/style from one speaker to another
 
136
  """)
137
 
138
  with gr.Tabs():
@@ -214,6 +316,7 @@ with gr.Blocks(title="LinaCodec Audio Tool", theme=gr.themes.Soft()) as demo:
214
  - Output sample rate: 48 kHz
215
  - Supports various input formats
216
  - Neural compression with high reconstruction quality
 
217
  """)
218
 
219
  # Launch the app
 
1
  import gradio as gr
2
  import torch
3
  import numpy as np
 
4
  import torchaudio
5
  import tempfile
6
  import os
7
 
8
+ # Patch LinaCodec to work on CPU
9
+ print("Setting up LinaCodec for CPU...")
10
+
11
+ # Import and patch before initializing
12
+ from linacodec.tokenizer import LinaCodecModel
13
+ from huggingface_hub import hf_hub_download
14
+ import torch.nn as nn
15
+
16
class CPULinaCodec:
    """Device-agnostic wrapper around ``LinaCodecModel``.

    Downloads the model config and weights from the Hugging Face Hub, loads
    the model in eval mode, and places it on CUDA when available, otherwise
    on CPU. Audio files are resampled to ``INPUT_SAMPLE_RATE`` and downmixed
    to a single channel before being passed to the model.
    """

    # Hub repository holding ``config.yaml`` and ``model.safetensors``.
    REPO_ID = "YatharthS/LinaCodec"
    # Sample rate (Hz) every input waveform is resampled to before encoding.
    INPUT_SAMPLE_RATE = 24000

    def __init__(self):
        print("Loading LinaCodec model on CPU...")

        # Download model files
        config_path = hf_hub_download(repo_id=self.REPO_ID, filename="config.yaml")
        weights_path = hf_hub_download(repo_id=self.REPO_ID, filename="model.safetensors")

        # Prefer CUDA when it is available; otherwise fall back to CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        self.model = LinaCodecModel.from_pretrained(
            config_path=config_path,
            weights_path=weights_path,
        ).eval()

        # Move to appropriate device
        self.model = self.model.to(self.device)

        # Output sample rate in Hz, as reported to users of this wrapper.
        self.sample_rate = 48000
        print(f"Model loaded successfully on {self.device}!")

    def _load_mono(self, audio_path):
        """Load an audio file as a mono waveform at ``INPUT_SAMPLE_RATE``.

        Args:
            audio_path: Path of the audio file, passed to ``torchaudio.load``.

        Returns:
            The waveform tensor on ``self.device``, resampled when the file's
            rate differs from ``INPUT_SAMPLE_RATE`` and averaged over its
            first dimension when that dimension is larger than one.
        """
        # Function-scope import, mirroring the original methods, so the class
        # does not depend on the module-level import order.
        import torchaudio

        wav, sr = torchaudio.load(audio_path)
        wav = wav.to(self.device)

        # Resample if needed
        if sr != self.INPUT_SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(
                sr, self.INPUT_SAMPLE_RATE
            ).to(self.device)
            wav = resampler(wav)

        # Ensure mono
        if wav.shape[0] > 1:
            wav = wav.mean(dim=0, keepdim=True)

        return wav

    def encode(self, audio_path):
        """Encode an audio file to tokens and an embedding.

        Args:
            audio_path: Path of the audio file to encode.

        Returns:
            The ``(codes, embedding)`` pair returned by ``self.model.encode``.
        """
        wav = self._load_mono(audio_path)

        # The model is called with an extra leading (batch) dimension.
        with torch.no_grad():
            codes, embedding = self.model.encode(wav.unsqueeze(0))

        return codes, embedding

    def decode(self, codes, embedding):
        """Decode tokens and an embedding back to audio.

        Args:
            codes: Codes as produced by :meth:`encode`.
            embedding: Embedding as produced by :meth:`encode`.

        Returns:
            The output of ``self.model.decode`` with dimension 0 squeezed.
        """
        with torch.no_grad():
            wav = self.model.decode(codes, embedding)

        return wav.squeeze(0)

    def convert_voice(self, source_path, reference_path):
        """Convert voice using source content and reference timbre.

        Args:
            source_path: Audio file whose codes (content) are kept.
            reference_path: Audio file whose embedding (timbre) is applied.

        Returns:
            Audio decoded from the source codes and the reference embedding,
            with dimension 0 squeezed.
        """
        # Encode source for content
        source_codes, _ = self.encode(source_path)

        # Encode reference for timbre
        _, ref_embedding = self.encode(reference_path)

        # Decode with source codes but reference embedding
        return self.decode(source_codes, ref_embedding)
109
+
110
+ # Initialize the CPU-compatible model
111
+ lina_tokenizer = CPULinaCodec()
112
 
113
  def encode_decode_audio(audio_input):
114
  """Encode and decode audio to demonstrate compression."""
 
150
  # Convert to numpy for Gradio
151
  decoded_audio = decoded_audio.cpu().squeeze().numpy()
152
 
153
+ device_info = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"
154
  info = f"βœ… Success!\n"
155
+ info += f"Device: {device_info}\n"
156
  info += f"Original sample rate: {sr} Hz\n"
157
  info += f"Output sample rate: 48000 Hz\n"
158
  info += f"Speech tokens shape: {speech_tokens.shape}\n"
 
234
  ### Features:
235
  - 🔄 **Encode & Decode**: Compress and reconstruct audio at 48kHz
236
  - 🎭 **Voice Conversion**: Transfer timbre/style from one speaker to another
237
+ - 💻 **CPU Compatible**: Works on both CPU and GPU
238
  """)
239
 
240
  with gr.Tabs():
 
316
  - Output sample rate: 48 kHz
317
  - Supports various input formats
318
  - Neural compression with high reconstruction quality
319
+ - Works on both CPU and GPU (GPU recommended for faster processing)
320
  """)
321
 
322
  # Launch the app