humair025 committed on
Commit
87e044e
·
verified ·
1 Parent(s): 91fcfb1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +221 -0
app.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import numpy as np
4
+ from linacodec.codec import LinaCodec
5
+ import torchaudio
6
+ import tempfile
7
+ import os
8
+
9
# Instantiate the codec once at import time; the request handlers in this
# module all use this single shared instance. The prints bracket the load so
# start-up progress is visible on stdout.
print("Loading LinaCodec model...")
lina_tokenizer = LinaCodec()
print("Model loaded successfully!")
13
+
14
# Sample rate this app reports for every waveform the codec returns.
_OUTPUT_SAMPLE_RATE = 48000


def _remove_temp_file(path):
    """Delete *path* if it is set; filesystem errors are ignored.

    Cleanup is best-effort: it must never replace the handler's real result
    or hide the error that the handler is about to report.
    """
    if path is None:
        return
    try:
        os.unlink(path)
    except OSError:
        pass


def _write_temp_wav(sample_rate, samples, suffix=".wav"):
    """Write a NumPy waveform to a new temporary WAV file and return its path.

    Args:
        sample_rate: Sample rate passed through to ``torchaudio.save``.
        samples: NumPy array of audio samples. ``int16`` / ``int32`` input is
            scaled to float32 by its full-scale value; any other dtype is
            handed to ``torch.FloatTensor`` unchanged. A 1-D array becomes a
            single-channel tensor; a 2-D array is transposed before saving
            (assumes Gradio's (frames, channels) layout -- TODO confirm).
        suffix: Filename suffix for the temporary file.

    Returns:
        Path of the written file. The caller owns it and must delete it.

    Raises:
        Exception: Whatever the conversion or ``torchaudio.save`` raises; the
            temporary file is removed before the error propagates.
    """
    if samples.dtype == np.int16:
        samples = samples.astype(np.float32) / 32768.0
    elif samples.dtype == np.int32:
        samples = samples.astype(np.float32) / 2147483648.0

    if len(samples.shape) == 1:
        waveform = torch.FloatTensor(samples).unsqueeze(0)
    else:
        waveform = torch.FloatTensor(samples.T)

    # delete=False: only the path is needed here; the file is reopened by
    # torchaudio and later by the codec, then removed explicitly.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        path = tmp.name
    try:
        torchaudio.save(path, waveform, sample_rate)
    except Exception:
        _remove_temp_file(path)
        raise
    return path


def encode_decode_audio(audio_input):
    """Round-trip audio through the codec: encode to tokens, then decode.

    Args:
        audio_input: ``(sample_rate, samples)`` tuple as delivered by a
            ``gr.Audio(type="numpy")`` component, or ``None`` when nothing
            was provided.

    Returns:
        ``((48000, decoded_samples), info_text)`` on success, or
        ``(None, message)`` when the input is missing or any step fails.
        Errors are reported as text rather than raised, so the UI can show
        them in the info box.
    """
    if audio_input is None:
        return None, "Please upload an audio file."

    temp_path = None
    try:
        sr, audio_data = audio_input
        temp_path = _write_temp_wav(sr, audio_data, suffix=".wav")

        # The codec is driven via a file path, hence the temporary WAV.
        speech_tokens, global_embedding = lina_tokenizer.encode(temp_path)
        decoded_audio = lina_tokenizer.decode(speech_tokens, global_embedding)

        # detach() is a no-op without grad history, but avoids a RuntimeError
        # from .numpy() should the codec return a tensor that requires grad.
        decoded_audio = decoded_audio.detach().cpu().squeeze().numpy()

        info = "\n".join([
            "✅ Success!",
            f"Original sample rate: {sr} Hz",
            f"Output sample rate: {_OUTPUT_SAMPLE_RATE} Hz",
            f"Speech tokens shape: {speech_tokens.shape}",
            f"Global embedding shape: {global_embedding.shape}",
        ])
        return (_OUTPUT_SAMPLE_RATE, decoded_audio), info

    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing.
        return None, f"❌ Error: {str(e)}"
    finally:
        # Runs on success and on failure, so the temp file never leaks.
        _remove_temp_file(temp_path)


def voice_conversion(source_audio, reference_audio):
    """Convert voice using source content and reference timbre.

    Args:
        source_audio: ``(sample_rate, samples)`` tuple supplying the speech
            content, or ``None``.
        reference_audio: ``(sample_rate, samples)`` tuple supplying the
            timbre/style, or ``None``.

    Returns:
        ``((48000, converted_samples), info_text)`` on success, or
        ``(None, message)`` when either input is missing or any step fails.
    """
    if source_audio is None or reference_audio is None:
        return None, "Please upload both source and reference audio files."

    # Every temp file created so far, so all of them are removed even when a
    # later step (e.g. writing the reference) fails.
    temp_paths = []
    try:
        sr_source, audio_source = source_audio
        source_path = _write_temp_wav(
            sr_source, audio_source, suffix="_source.wav"
        )
        temp_paths.append(source_path)

        sr_ref, audio_ref = reference_audio
        ref_path = _write_temp_wav(sr_ref, audio_ref, suffix="_ref.wav")
        temp_paths.append(ref_path)

        converted_audio = lina_tokenizer.convert_voice(source_path, ref_path)

        # See encode_decode_audio: detach() guards .numpy() against tensors
        # that still carry grad history.
        converted_audio = converted_audio.detach().cpu().squeeze().numpy()

        info = "\n".join([
            "✅ Voice conversion successful!",
            f"Source sample rate: {sr_source} Hz",
            f"Reference sample rate: {sr_ref} Hz",
            f"Output sample rate: {_OUTPUT_SAMPLE_RATE} Hz",
            "Content taken from source, timbre/style from reference",
        ])
        return (_OUTPUT_SAMPLE_RATE, converted_audio), info

    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing.
        return None, f"❌ Error: {str(e)}"
    finally:
        for path in temp_paths:
            _remove_temp_file(path)
125
+
126
# Build the Gradio UI: an intro, two tabs (codec round-trip and voice
# conversion) that each wire one button to one handler function, and an
# "About" footer. Emoji in labels/headings are restored from mis-decoded
# UTF-8 so they render correctly.
with gr.Blocks(title="LinaCodec Audio Tool", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎡 LinaCodec Audio Tool

    **LinaCodec** is a neural audio codec for high-quality speech compression and voice conversion.

    ### Features:
    - 🔄 **Encode & Decode**: Compress and reconstruct audio at 48kHz
    - 🎭 **Voice Conversion**: Transfer timbre/style from one speaker to another
    """)

    with gr.Tabs():
        # Tab 1: Encode/Decode
        with gr.Tab("🔄 Encode & Decode"):
            gr.Markdown("""
            Upload an audio file to encode it into speech tokens and then decode it back.
            This demonstrates the codec's compression and reconstruction capabilities.
            """)

            with gr.Row():
                with gr.Column():
                    # type="numpy" hands the handler a (sample_rate, ndarray) tuple.
                    audio_input = gr.Audio(
                        label="Upload Audio",
                        type="numpy",
                        sources=["upload", "microphone"]
                    )
                    encode_btn = gr.Button("🚀 Encode & Decode", variant="primary")

                with gr.Column():
                    audio_output = gr.Audio(label="Decoded Audio")
                    info_output = gr.Textbox(label="Info", lines=6)

            encode_btn.click(
                fn=encode_decode_audio,
                inputs=[audio_input],
                outputs=[audio_output, info_output]
            )

            # No bundled samples: the list is intentionally empty and the
            # label tells users to supply their own audio.
            gr.Examples(
                examples=[],
                inputs=[audio_input],
                label="Examples (upload your own audio)"
            )

        # Tab 2: Voice Conversion
        with gr.Tab("🎭 Voice Conversion"):
            gr.Markdown("""
            Convert voice by taking content from **source audio** and timbre/style from **reference audio**.

            - **Source**: The speech content you want to keep
            - **Reference**: The voice style/timbre you want to apply
            """)

            with gr.Row():
                with gr.Column():
                    source_input = gr.Audio(
                        label="Source Audio (Content)",
                        type="numpy",
                        sources=["upload", "microphone"]
                    )
                    reference_input = gr.Audio(
                        label="Reference Audio (Timbre/Style)",
                        type="numpy",
                        sources=["upload", "microphone"]
                    )
                    convert_btn = gr.Button("✨ Convert Voice", variant="primary")

                with gr.Column():
                    converted_output = gr.Audio(label="Converted Audio")
                    convert_info = gr.Textbox(label="Info", lines=6)

            convert_btn.click(
                fn=voice_conversion,
                inputs=[source_input, reference_input],
                outputs=[converted_output, convert_info]
            )

    gr.Markdown("""
    ---
    ### 📚 About LinaCodec

    LinaCodec is a neural audio codec designed for high-quality speech compression and voice conversion.
    It encodes audio into discrete tokens and a global embedding, enabling efficient storage and manipulation of speech.

    **Model**: [YatharthS/LinaCodec](https://huggingface.co/YatharthS/LinaCodec)

    ### ⚙️ Technical Details
    - Output sample rate: 48 kHz
    - Supports various input formats
    - Neural compression with high reconstruction quality
    """)

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()