artificialguybr committed on
Commit
d733ada
·
verified ·
1 Parent(s): 510c33c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -34
app.py CHANGED
@@ -109,20 +109,14 @@ def tts_inference(
109
  traceback.print_exc()
110
  raise gr.Error(f"Erro na Inferência: {str(e)}")
111
 
112
-
113
  custom_theme = gr.themes.Soft(
114
  primary_hue="blue",
115
  secondary_hue="indigo",
116
  font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
117
- ).set(
118
- block_title_text_weight="600",
119
- block_border_width="1px",
120
- block_shadow="0px 2px 4px rgba(0, 0, 0, 0.05)",
121
- button_shadow="0px 2px 4px rgba(0, 0, 0, 0.1)",
122
  )
123
 
124
  with gr.Blocks(theme=custom_theme, title="Fish Audio S2 Pro") as app:
125
-
126
  gr.Markdown(
127
  """
128
  <div style="text-align: center; max-width: 800px; margin: 0 auto; padding: 20px 0;">
@@ -131,55 +125,55 @@ with gr.Blocks(theme=custom_theme, title="Fish Audio S2 Pro") as app:
131
  </h1>
132
  <p style="font-size: 1.1rem; color: #4B5563;">
133
  State-of-the-Art Dual-Autoregressive Text-to-Speech.<br>
134
- Suporta mais de 80 idiomas, controle emocional no texto (ex: <code>[laugh]</code>, <code>[whisper]</code>) e clonagem de voz Zero-Shot.
135
  </p>
136
  </div>
137
  """
138
  )
139
-
140
  with gr.Row():
141
  with gr.Column(scale=5):
142
- gr.Markdown("### ✍️ Texto de Entrada")
143
  text_input = gr.Textbox(
144
  show_label=False,
145
- placeholder="Digite o texto que você deseja sintetizar aqui.\nTente adicionar tags como [laugh], [whisper], ou [angry]!",
146
  lines=7
147
  )
148
-
149
- with gr.Accordion("🎙️ Clonagem de Voz (Referência Opcional)", open=False):
150
- gr.Markdown("Faça upload de um áudio limpo de 5 a 10 segundos e digite exatamente o que é dito nele para clonar a voz.")
151
- ref_audio = gr.Audio(label="Áudio de Referência", type="filepath")
152
- ref_text = gr.Textbox(label="Texto do Áudio", placeholder="Transcrição exata do áudio de referência...")
153
-
154
- with gr.Accordion("⚙️ Configurações Avançadas", open=False):
155
  with gr.Row():
156
- max_new_tokens = gr.Slider(0, 2048, 1024, step=8, label="Max New Tokens (0 = sem limite)")
157
- chunk_length = gr.Slider(100, 400, 200, step=8, label="Tamanho do Chunk")
158
  with gr.Row():
159
  top_p = gr.Slider(0.1, 1.0, 0.7, step=0.01, label="Top-P")
160
- repetition_penalty = gr.Slider(0.9, 2.0, 1.2, step=0.01, label="Penalidade de Repetição")
161
- temperature = gr.Slider(0.1, 1.0, 0.7, step=0.01, label="Temperatura")
162
-
163
- generate_btn = gr.Button("🚀 Gerar Áudio", variant="primary", size="lg")
164
-
165
  with gr.Column(scale=4):
166
- gr.Markdown("### 🎧 Resultado")
167
- audio_output = gr.Audio(label="Áudio Gerado", type="numpy", interactive=False, autoplay=True)
168
-
169
  gr.Markdown(
170
  """
171
  <div style="background-color: #EFF6FF; padding: 15px; border-radius: 8px; margin-top: 20px;">
172
- <h4 style="margin-top: 0; color: #1D4ED8;">💡 Dicas Profissionais</h4>
173
  <ul style="margin-bottom: 0; color: #1E3A8A; font-size: 0.95rem;">
174
- <li>O modelo compreende texto natural perfeitamente, sem necessidade de fonemas manuais.</li>
175
- <li>Envolva palavras com colchetes para ditar emoções. Ex: <i>[pitch up] Uau! [laugh]</i>.</li>
176
- <li>Para clonagem, quanto mais exata a transcrição do áudio de base, melhor o resultado.</li>
177
  </ul>
178
  </div>
179
  """
180
  )
181
-
182
- gr.Markdown("### 🌟 Exemplos")
183
  gr.Examples(
184
  examples=[
185
  ["Hello world! This is a test of the Fish Audio S2 Pro model.", None, "", 1024, 200, 0.7, 1.2, 0.7],
@@ -192,6 +186,7 @@ with gr.Blocks(theme=custom_theme, title="Fish Audio S2 Pro") as app:
192
  cache_examples=False,
193
  )
194
 
 
195
  generate_btn.click(
196
  fn=tts_inference,
197
  inputs=[text_input, ref_audio, ref_text, max_new_tokens, chunk_length, top_p, repetition_penalty, temperature],
 
109
  traceback.print_exc()
110
  raise gr.Error(f"Erro na Inferência: {str(e)}")
111
 
 
112
  custom_theme = gr.themes.Soft(
113
  primary_hue="blue",
114
  secondary_hue="indigo",
115
  font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
 
 
 
 
 
116
  )
117
 
118
  with gr.Blocks(theme=custom_theme, title="Fish Audio S2 Pro") as app:
119
+
120
  gr.Markdown(
121
  """
122
  <div style="text-align: center; max-width: 800px; margin: 0 auto; padding: 20px 0;">
 
125
  </h1>
126
  <p style="font-size: 1.1rem; color: #4B5563;">
127
  State-of-the-Art Dual-Autoregressive Text-to-Speech.<br>
128
+ Supports over 80 languages, emotional control via text tags (e.g. <code>[laugh]</code>, <code>[whisper]</code>) and Zero-Shot voice cloning.
129
  </p>
130
  </div>
131
  """
132
  )
133
+
134
  with gr.Row():
135
  with gr.Column(scale=5):
136
+ gr.Markdown("### ✍️ Input Text")
137
  text_input = gr.Textbox(
138
  show_label=False,
139
+ placeholder="Type the text you want to synthesize here.\nTry adding tags like [laugh], [whisper], or [angry]!",
140
  lines=7
141
  )
142
+
143
+ with gr.Accordion("🎙️ Voice Cloning (Optional Reference)", open=False):
144
+ gr.Markdown("Upload a clean 5–10 second audio clip and type exactly what is said in it to clone the voice.")
145
+ ref_audio = gr.Audio(label="Reference Audio", type="filepath")
146
+ ref_text = gr.Textbox(label="Reference Audio Text", placeholder="Exact transcription of the reference audio...")
147
+
148
+ with gr.Accordion("⚙️ Advanced Settings", open=False):
149
  with gr.Row():
150
+ max_new_tokens = gr.Slider(0, 2048, 1024, step=8, label="Max New Tokens (0 = no limit)")
151
+ chunk_length = gr.Slider(100, 400, 200, step=8, label="Chunk Length")
152
  with gr.Row():
153
  top_p = gr.Slider(0.1, 1.0, 0.7, step=0.01, label="Top-P")
154
+ repetition_penalty = gr.Slider(0.9, 2.0, 1.2, step=0.01, label="Repetition Penalty")
155
+ temperature = gr.Slider(0.1, 1.0, 0.7, step=0.01, label="Temperature")
156
+
157
+ generate_btn = gr.Button("🚀 Generate Audio", variant="primary", size="lg")
158
+
159
  with gr.Column(scale=4):
160
+ gr.Markdown("### 🎧 Result")
161
+ audio_output = gr.Audio(label="Generated Audio", type="numpy", interactive=False, autoplay=True)
162
+
163
  gr.Markdown(
164
  """
165
  <div style="background-color: #EFF6FF; padding: 15px; border-radius: 8px; margin-top: 20px;">
166
+ <h4 style="margin-top: 0; color: #1D4ED8;">💡 Pro Tips</h4>
167
  <ul style="margin-bottom: 0; color: #1E3A8A; font-size: 0.95rem;">
168
+ <li>The model understands natural text perfectly no need for manual phonemes.</li>
169
+ <li>Wrap words in brackets to control emotion. Example: <i>[pitch up] Wow! [laugh]</i></li>
170
+ <li>For cloning, the more accurate the transcription of the reference audio, the better the result.</li>
171
  </ul>
172
  </div>
173
  """
174
  )
175
+
176
+ gr.Markdown("### 🌟 Examples")
177
  gr.Examples(
178
  examples=[
179
  ["Hello world! This is a test of the Fish Audio S2 Pro model.", None, "", 1024, 200, 0.7, 1.2, 0.7],
 
186
  cache_examples=False,
187
  )
188
 
189
+ # Button click event
190
  generate_btn.click(
191
  fn=tts_inference,
192
  inputs=[text_input, ref_audio, ref_text, max_new_tokens, chunk_length, top_p, repetition_penalty, temperature],