Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -202,6 +202,7 @@ def do_inference_tts( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
|
|
| 202 |
# I'm very sure I can procedurally generate this list
|
| 203 |
parser.add_argument("--text", type=str, default=kwargs["text"])
|
| 204 |
parser.add_argument("--task", type=str, default="tts")
|
|
|
|
| 205 |
parser.add_argument("--references", type=str, default=kwargs["reference"])
|
| 206 |
parser.add_argument("--language", type=str, default=kwargs["language"])
|
| 207 |
parser.add_argument("--input-prompt-length", type=float, default=kwargs["input-prompt-length"])
|
|
@@ -258,16 +259,7 @@ def do_inference_tts( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
|
|
| 258 |
|
| 259 |
tts = init_tts()
|
| 260 |
|
| 261 |
-
gr.Info("Inferencing...")
|
| 262 |
-
|
| 263 |
-
# icky
|
| 264 |
-
modality = kwargs.get("modality")
|
| 265 |
-
if modality:
|
| 266 |
-
for name, engine in tts.engines.items():
|
| 267 |
-
if modality == "AR+NAR":
|
| 268 |
-
engine.hyper_config.capabilities = ["ar", "nar"]
|
| 269 |
-
elif modality == "NAR-len":
|
| 270 |
-
engine.hyper_config.capabilities = ["nar", "len"]
|
| 271 |
|
| 272 |
sampling_kwargs = dict(
|
| 273 |
max_steps=args.max_steps,
|
|
@@ -293,12 +285,13 @@ def do_inference_tts( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
|
|
| 293 |
input_prompt_length=args.input_prompt_length,
|
| 294 |
cfg_strength=args.cfg_strength,
|
| 295 |
)
|
| 296 |
-
|
| 297 |
with timer("Inferenced in", callback=lambda msg: gr.Info( msg )) as t:
|
| 298 |
wav, sr = tts.inference(
|
| 299 |
text=args.text,
|
| 300 |
language=args.language,
|
| 301 |
task=args.task,
|
|
|
|
| 302 |
references=args.references.split(";") if args.references is not None else [],
|
| 303 |
**sampling_kwargs,
|
| 304 |
)
|
|
@@ -438,8 +431,9 @@ with ui:
|
|
| 438 |
layout["inference_tts"]["inputs"]["ar-temperature"] = gr.Slider(value=1.0, minimum=0.0, maximum=1.5, step=0.05, label="Temperature (AR)", info="Modifies the randomness from the samples in the AR. (0 to greedy* sample)")
|
| 439 |
layout["inference_tts"]["inputs"]["nar-temperature"] = gr.Slider(value=0.0, minimum=0.0, maximum=1.5, step=0.05, label="Temperature (NAR)", info="Modifies the randomness from the samples in the NAR. (0 to greedy sample)")
|
| 440 |
with gr.Row():
|
| 441 |
-
layout["inference_tts"]["inputs"]["cfg-strength"] = gr.Slider(value=1.0, minimum=0.0, maximum=14.0, step=0.05, label="CFG Strength", info="Classifier Free Guidance scale")
|
| 442 |
layout["inference_tts"]["inputs"]["language"] = gr.Dropdown(choices=get_languages(), label="Language", value="en")
|
|
|
|
| 443 |
with gr.Tab("Sampler Settings"):
|
| 444 |
with gr.Row():
|
| 445 |
layout["inference_tts"]["inputs"]["top-p"] = gr.Slider(value=1.0, minimum=0.0, maximum=1.0, step=0.05, label="Top P", info=r"Limits the samples that are outside the top P% of probabilities.")
|
|
@@ -464,7 +458,6 @@ with ui:
|
|
| 464 |
with gr.Row():
|
| 465 |
layout["inference_tts"]["inputs"]["input-prompt-prefix"] = gr.Checkbox(label="Input Prompt as Prefix", info="Treats the input prompt clip as the prefix of the generated sequence.")
|
| 466 |
layout["inference_tts"]["inputs"]["prefix-silence"] = gr.Slider(value=0.0, minimum=0.0, maximum=1.0, step=0.05, label="Silence Prefix Duration", info="Amount of silence to prefix to the output response before beginning inference.")
|
| 467 |
-
layout["inference_tts"]["inputs"]["modality"] = gr.Dropdown(value="Auto", choices=["Auto", "AR+NAR", "NAR-len"], label="Modality", info="Whether to inference with the AR+NAR or through the NAR-len.")
|
| 468 |
with gr.Row():
|
| 469 |
layout["inference_tts"]["inputs"]["beam-width"] = gr.Slider(value=0, minimum=0, maximum=32, step=1, label="Beam Width", info="Number of branches to search through for beam search sampling.")
|
| 470 |
layout["inference_tts"]["inputs"]["dynamic-sampling"] = gr.Checkbox(label="Dynamic Temperature", info="Dynamically adjusts the temperature based on the highest confident predicted token per sampling step.")
|
|
|
|
| 202 |
# I'm very sure I can procedurally generate this list
|
| 203 |
parser.add_argument("--text", type=str, default=kwargs["text"])
|
| 204 |
parser.add_argument("--task", type=str, default="tts")
|
| 205 |
+
parser.add_argument("--modality", type=str, default=kwargs["modality"])
|
| 206 |
parser.add_argument("--references", type=str, default=kwargs["reference"])
|
| 207 |
parser.add_argument("--language", type=str, default=kwargs["language"])
|
| 208 |
parser.add_argument("--input-prompt-length", type=float, default=kwargs["input-prompt-length"])
|
|
|
|
| 259 |
|
| 260 |
tts = init_tts()
|
| 261 |
|
| 262 |
+
gr.Info(f"Inferencing... (Modality: {tts.modality(args.modality.lower())})")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
|
| 264 |
sampling_kwargs = dict(
|
| 265 |
max_steps=args.max_steps,
|
|
|
|
| 285 |
input_prompt_length=args.input_prompt_length,
|
| 286 |
cfg_strength=args.cfg_strength,
|
| 287 |
)
|
| 288 |
+
|
| 289 |
with timer("Inferenced in", callback=lambda msg: gr.Info( msg )) as t:
|
| 290 |
wav, sr = tts.inference(
|
| 291 |
text=args.text,
|
| 292 |
language=args.language,
|
| 293 |
task=args.task,
|
| 294 |
+
modality=args.modality.lower(),
|
| 295 |
references=args.references.split(";") if args.references is not None else [],
|
| 296 |
**sampling_kwargs,
|
| 297 |
)
|
|
|
|
| 431 |
layout["inference_tts"]["inputs"]["ar-temperature"] = gr.Slider(value=1.0, minimum=0.0, maximum=1.5, step=0.05, label="Temperature (AR)", info="Modifies the randomness from the samples in the AR. (0 to greedy* sample)")
|
| 432 |
layout["inference_tts"]["inputs"]["nar-temperature"] = gr.Slider(value=0.0, minimum=0.0, maximum=1.5, step=0.05, label="Temperature (NAR)", info="Modifies the randomness from the samples in the NAR. (0 to greedy sample)")
|
| 433 |
with gr.Row():
|
| 434 |
+
layout["inference_tts"]["inputs"]["cfg-strength"] = gr.Slider(value=1.0, minimum=0.0, maximum=14.0, step=0.05, label="CFG Strength", info="Classifier Free Guidance scale")
|
| 435 |
layout["inference_tts"]["inputs"]["language"] = gr.Dropdown(choices=get_languages(), label="Language", value="en")
|
| 436 |
+
layout["inference_tts"]["inputs"]["modality"] = gr.Dropdown(value="Auto", choices=["Auto", "AR+NAR", "NAR-len"], label="Modality", info="Whether to inference with the AR+NAR or through the NAR-len.")
|
| 437 |
with gr.Tab("Sampler Settings"):
|
| 438 |
with gr.Row():
|
| 439 |
layout["inference_tts"]["inputs"]["top-p"] = gr.Slider(value=1.0, minimum=0.0, maximum=1.0, step=0.05, label="Top P", info=r"Limits the samples that are outside the top P% of probabilities.")
|
|
|
|
| 458 |
with gr.Row():
|
| 459 |
layout["inference_tts"]["inputs"]["input-prompt-prefix"] = gr.Checkbox(label="Input Prompt as Prefix", info="Treats the input prompt clip as the prefix of the generated sequence.")
|
| 460 |
layout["inference_tts"]["inputs"]["prefix-silence"] = gr.Slider(value=0.0, minimum=0.0, maximum=1.0, step=0.05, label="Silence Prefix Duration", info="Amount of silence to prefix to the output response before beginning inference.")
|
|
|
|
| 461 |
with gr.Row():
|
| 462 |
layout["inference_tts"]["inputs"]["beam-width"] = gr.Slider(value=0, minimum=0, maximum=32, step=1, label="Beam Width", info="Number of branches to search through for beam search sampling.")
|
| 463 |
layout["inference_tts"]["inputs"]["dynamic-sampling"] = gr.Checkbox(label="Dynamic Temperature", info="Dynamically adjusts the temperature based on the highest confident predicted token per sampling step.")
|