release 14/05
Browse files- .gitignore +4 -0
- __pycache__/S3_bucket.cpython-310.pyc +0 -0
- __pycache__/utils.cpython-310.pyc +0 -0
- app.py +188 -142
.gitignore
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
__pycache__/S3_bucket.cpython-310.pyc
|
| 4 |
+
test.py
|
__pycache__/S3_bucket.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/S3_bucket.cpython-310.pyc and b/__pycache__/S3_bucket.cpython-310.pyc differ
|
|
|
__pycache__/utils.cpython-310.pyc
CHANGED
|
Binary files a/__pycache__/utils.cpython-310.pyc and b/__pycache__/utils.cpython-310.pyc differ
|
|
|
app.py
CHANGED
|
@@ -431,9 +431,9 @@ def tts_inference(
|
|
| 431 |
refine_generation: bool = False,
|
| 432 |
stream: bool = False,
|
| 433 |
) -> Union[Generator[Tuple[int, np.ndarray], None, None], Tuple[int, np.ndarray]]:
|
| 434 |
-
|
| 435 |
-
user_id = parameters.user_id
|
| 436 |
|
|
|
|
|
|
|
| 437 |
if agent is None and recording_data is not None:
|
| 438 |
audio_manager.update_current_recording(recording_data)
|
| 439 |
clone_voice_name = process_voice_clone(recording_data, user_id)
|
|
@@ -441,132 +441,70 @@ def tts_inference(
|
|
| 441 |
voice_name = str(clone_voice_name)
|
| 442 |
print(f"The voice name, get from voice clone API:::--{voice_name}")
|
| 443 |
else:
|
| 444 |
-
gr.
|
| 445 |
-
print("Did not get any voice name from voice clone api
|
|
|
|
| 446 |
else:
|
| 447 |
voice_name = [agents[agent]]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 448 |
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
print("language", [language_codes[language]])
|
| 453 |
-
print("voice_name", voice_name)
|
| 454 |
-
print("encoding", "default")
|
| 455 |
-
print("expressive", expressiveness)
|
| 456 |
-
print("stability", stability)
|
| 457 |
-
print("clarity", clarity)
|
| 458 |
-
print("speech_rate", speech_rate)
|
| 459 |
-
print("loudness", loudness)
|
| 460 |
-
print("refine_grneration", refine_generation)
|
| 461 |
-
print("\n\n")
|
| 462 |
-
|
| 463 |
-
try:
|
| 464 |
-
if not text or text.strip() == "":
|
| 465 |
-
raise gr.Error("Text input cannot be empty")
|
| 466 |
-
|
| 467 |
-
if len(text) > 1000:
|
| 468 |
-
raise gr.Error(
|
| 469 |
-
f"Text length must be less than 1000 characters. Current length: {len(text)}"
|
| 470 |
-
)
|
| 471 |
-
|
| 472 |
-
payload = json.dumps(
|
| 473 |
-
{
|
| 474 |
-
"speechReqId": session_id,
|
| 475 |
-
"user_id": user_id,
|
| 476 |
-
"text": [text],
|
| 477 |
-
"language": [language_codes[language]],
|
| 478 |
-
"voice_name": voice_name,
|
| 479 |
-
"encoding": "default",
|
| 480 |
-
"expressive": expressiveness,
|
| 481 |
-
"stability": stability,
|
| 482 |
-
"clarity": clarity,
|
| 483 |
-
"speech_rate": speech_rate,
|
| 484 |
-
"refine_generation": refine_generation,
|
| 485 |
-
"pronunciation_dict":pronunc_dict
|
| 486 |
-
}
|
| 487 |
-
)
|
| 488 |
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
data=payload,
|
| 493 |
-
verify=False,
|
| 494 |
-
stream=True,
|
| 495 |
-
) as response:
|
| 496 |
-
if response.status_code != 200:
|
| 497 |
-
print(
|
| 498 |
-
f"API request failed with status {response.status_code} and error: {response.reason}"
|
| 499 |
-
)
|
| 500 |
-
log_initial_submission(
|
| 501 |
-
code=response.status_code,
|
| 502 |
-
session_id=session_id,
|
| 503 |
-
language=language,
|
| 504 |
-
input_method=None,
|
| 505 |
-
agent_used=voice_name,
|
| 506 |
-
voice_path=recording_data,
|
| 507 |
-
text_input=text,
|
| 508 |
-
expressiveness=expressiveness,
|
| 509 |
-
stability=stability,
|
| 510 |
-
clarity=clarity,
|
| 511 |
-
speech_rate=speech_rate,
|
| 512 |
-
loudness=loudness,
|
| 513 |
-
refine_generation=refine_generation,
|
| 514 |
-
err_code=response.status_code,
|
| 515 |
-
err_msg=response.reason,
|
| 516 |
)
|
| 517 |
-
raise gr.Error(f"API Error: {response.status_code} - {response.reason}")
|
| 518 |
-
sample_rate = 24000
|
| 519 |
-
|
| 520 |
-
if stream:
|
| 521 |
-
# Streaming mode
|
| 522 |
-
last_chunk_time = time.time()
|
| 523 |
-
for chunk in response.iter_content(chunk_size=32768):
|
| 524 |
-
chunk_received_time = time.time()
|
| 525 |
-
chunk_delay = chunk_received_time - last_chunk_time
|
| 526 |
-
last_chunk_time = chunk_received_time
|
| 527 |
-
|
| 528 |
-
if chunk:
|
| 529 |
-
start_processing_time = time.time()
|
| 530 |
-
audio_chunk = np.frombuffer(chunk, dtype=np.int16)
|
| 531 |
-
processing_time = time.time() - start_processing_time
|
| 532 |
-
|
| 533 |
-
if len(audio_chunk) > 0:
|
| 534 |
-
yield_time_start = time.time()
|
| 535 |
-
yield (sample_rate, audio_chunk)
|
| 536 |
-
yield_time_complete = time.time()
|
| 537 |
-
|
| 538 |
-
print(
|
| 539 |
-
f"Streaming chunk of size {len(audio_chunk)} - "
|
| 540 |
-
f"Chunk delay: {chunk_delay:.4f}s, "
|
| 541 |
-
f"Processing time: {processing_time:.4f}s, "
|
| 542 |
-
f"Yielding time: {(yield_time_complete - yield_time_start):.4f}s"
|
| 543 |
-
)
|
| 544 |
-
else:
|
| 545 |
-
# Non-streaming mode
|
| 546 |
-
start_time = time.time()
|
| 547 |
-
audio_bytes = b""
|
| 548 |
-
|
| 549 |
-
for chunk in response.iter_content(chunk_size=32768):
|
| 550 |
-
if chunk:
|
| 551 |
-
print(f"Streaming chunk of size {len(chunk)}")
|
| 552 |
-
audio_bytes += chunk
|
| 553 |
-
|
| 554 |
-
if len(audio_bytes) > 0:
|
| 555 |
-
complete_audio = np.frombuffer(audio_bytes, dtype=np.int16)
|
| 556 |
-
processing_time = time.time() - start_time
|
| 557 |
-
|
| 558 |
-
complete_audio = increase_volume(complete_audio, factor=loudness)
|
| 559 |
-
yield (sample_rate, complete_audio)
|
| 560 |
|
| 561 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 562 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 563 |
log_initial_submission(
|
| 564 |
code=response.status_code,
|
| 565 |
session_id=session_id,
|
| 566 |
language=language,
|
| 567 |
-
input_method=
|
| 568 |
agent_used=voice_name,
|
| 569 |
-
voice_path=
|
| 570 |
text_input=text,
|
| 571 |
expressiveness=expressiveness,
|
| 572 |
stability=stability,
|
|
@@ -574,15 +512,78 @@ def tts_inference(
|
|
| 574 |
speech_rate=speech_rate,
|
| 575 |
loudness=loudness,
|
| 576 |
refine_generation=refine_generation,
|
|
|
|
|
|
|
| 577 |
)
|
| 578 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 579 |
else:
|
| 580 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 581 |
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
|
|
|
|
|
|
|
|
|
| 586 |
|
| 587 |
|
| 588 |
def disable_rating_box():
|
|
@@ -599,7 +600,7 @@ def disable_rating_box():
|
|
| 599 |
|
| 600 |
def tts_tab():
|
| 601 |
with aws.fs.open(parameters.GLOBAL_PRONUNCIATION_DICT_PATH,'r') as f:
|
| 602 |
-
global_pronunc_dict = json.loads(f.read())
|
| 603 |
|
| 604 |
pronunc_dict_state = gr.State(value=global_pronunc_dict)
|
| 605 |
session_id = generate_session_id()
|
|
@@ -631,33 +632,33 @@ def tts_tab():
|
|
| 631 |
<div class="info-heading">🎯 Quick Start Guide</div>
|
| 632 |
""")
|
| 633 |
gr.Markdown("""
|
| 634 |
-
|
| 635 |
🌐 **Select Language**
|
| 636 |
* Choose from our listed languages
|
| 637 |
-
|
| 638 |
🎤 **Choose Voice**
|
| 639 |
* Select from the curated collection of high-quality voices
|
| 640 |
* Each voice is optimized for natural speech patterns
|
| 641 |
* You can give your own voice by clicking on Voice clone
|
| 642 |
-
|
| 643 |
✍️ **Enter Your Text**
|
| 644 |
* Type or paste your text in the input box
|
| 645 |
* Or you can give input by clicking Random Sentence
|
| 646 |
-
|
| 647 |
⚙️ **Customize Voice Parameters**
|
| 648 |
* Fine-tune expressiveness for emotional depth
|
| 649 |
* Adjust stability for consistent output
|
| 650 |
* Control clarity for precise articulation
|
| 651 |
-
|
| 652 |
🎵 **Generate Audio**
|
| 653 |
* Click the generate button to create your audio
|
| 654 |
* Processing typically takes a few seconds
|
| 655 |
-
|
| 656 |
⭐ **Provide Feedback**
|
| 657 |
* Rate the generated audio quality
|
| 658 |
* Give us your feedback
|
| 659 |
* Your feedback helps improve our system
|
| 660 |
-
|
| 661 |
💾 **Access Your Audio**
|
| 662 |
* Download the generated audio for offline use
|
| 663 |
""")
|
|
@@ -805,7 +806,7 @@ def tts_tab():
|
|
| 805 |
with gr.Row():
|
| 806 |
pronunc_dict_key_in = gr.Textbox(label="Pronunciation key",placeholder="Enter word")
|
| 807 |
pronunc_dict_key_out = gr.Textbox(label="Pronunciation Value",placeholder="Enter word with correct pronunciation")
|
| 808 |
-
|
| 809 |
add_btn = gr.Button("Add pronunciation pair", variant="primary")
|
| 810 |
|
| 811 |
add_btn.click(
|
|
@@ -875,6 +876,7 @@ def tts_tab():
|
|
| 875 |
<li>Add your pronunciation of any word that doesn't sound well</li>
|
| 876 |
<li>If you don't like the pronunciation of any word, then add your word with the key and value pair</li>
|
| 877 |
<li><b style = "color:red">Note:-</b>Pronunciation pairs are <i style="color:red">case sensitive</i></li>
|
|
|
|
| 878 |
</ul>
|
| 879 |
""")
|
| 880 |
|
|
@@ -1094,7 +1096,43 @@ def about_tab():
|
|
| 1094 |
line-height: 1.4;
|
| 1095 |
margin-bottom: 20px;
|
| 1096 |
}
|
| 1097 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1098 |
/* For mobile responsiveness */
|
| 1099 |
@media (max-width: 768px) {
|
| 1100 |
.features-container {
|
|
@@ -1103,7 +1141,7 @@ def about_tab():
|
|
| 1103 |
}
|
| 1104 |
</style>
|
| 1105 |
<div style="text-align: center; font-size: 3em; font-weight: bold; margin-bottom: 20px;"> 🚀 Welcome to ORI Text-to-Speech </div>
|
| 1106 |
-
|
| 1107 |
<div class="section-header">🌟 About Our Technology</div>
|
| 1108 |
<div class="intro-text">Greetings from Oriserve! We're excited to showcase our refined Text-to-Speech capabilities—powered by generative voice synthesis to deliver <strong>natural-sounding</strong> and <strong>professionally tuned</strong> speech output.</div>
|
| 1109 |
|
|
@@ -1114,12 +1152,12 @@ def about_tab():
|
|
| 1114 |
<div class="feature-title">🎯 Core Capabilities</div>
|
| 1115 |
<ul class="feature-list">
|
| 1116 |
<li><strong>Robust voice models suited for production use</strong></li>
|
| 1117 |
-
<li><strong>Optimized for English and Hindi, with multilingual expansion underway</strong></li>
|
| 1118 |
<li><strong>Diverse voice styles for varied use cases</strong></li>
|
| 1119 |
<li><strong>Responsive audio generation with practical latency</strong></li>
|
| 1120 |
</ul>
|
| 1121 |
</div>
|
| 1122 |
-
|
| 1123 |
<div class="feature-block">
|
| 1124 |
<div class="feature-title">🛠️ Advanced Controls</div>
|
| 1125 |
<ul class="feature-list">
|
|
@@ -1128,7 +1166,7 @@ def about_tab():
|
|
| 1128 |
<li><strong>Balance tuning for clarity and stability</strong></li>
|
| 1129 |
</ul>
|
| 1130 |
</div>
|
| 1131 |
-
|
| 1132 |
<div class="feature-block">
|
| 1133 |
<div class="feature-title">💫 Special Features</div>
|
| 1134 |
<ul class="feature-list">
|
|
@@ -1137,7 +1175,7 @@ def about_tab():
|
|
| 1137 |
<li><strong>Improved handling of common pronunciation cases</strong></li>
|
| 1138 |
</ul>
|
| 1139 |
</div>
|
| 1140 |
-
|
| 1141 |
<div class="feature-block">
|
| 1142 |
<div class="feature-title">⚡ Processing Capabilities</div>
|
| 1143 |
<ul class="feature-list">
|
|
@@ -1146,7 +1184,7 @@ def about_tab():
|
|
| 1146 |
<li><strong>Audio streaming with first-byte latency as low as ~150 ms</strong></li>
|
| 1147 |
</ul>
|
| 1148 |
</div>
|
| 1149 |
-
|
| 1150 |
<div class="feature-block">
|
| 1151 |
<div class="feature-title">🔊 Audio Quality</div>
|
| 1152 |
<ul class="feature-list">
|
|
@@ -1155,7 +1193,7 @@ def about_tab():
|
|
| 1155 |
<li><strong>Consistent synthesis across sessions</strong></li>
|
| 1156 |
</ul>
|
| 1157 |
</div>
|
| 1158 |
-
|
| 1159 |
<div class="feature-block">
|
| 1160 |
<div class="feature-title">📈 Future Development</div>
|
| 1161 |
<ul class="feature-list">
|
|
@@ -1164,6 +1202,15 @@ def about_tab():
|
|
| 1164 |
<li><strong>Expanded language and dialect support coming soon</strong></li>
|
| 1165 |
</ul>
|
| 1166 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1167 |
</div>
|
| 1168 |
|
| 1169 |
"""
|
|
@@ -1171,12 +1218,11 @@ def about_tab():
|
|
| 1171 |
|
| 1172 |
return gr.Markdown("")
|
| 1173 |
|
| 1174 |
-
|
| 1175 |
def initialize_app():
|
| 1176 |
# await audio_manager.load_agent_voices(agents)
|
| 1177 |
|
| 1178 |
try:
|
| 1179 |
-
with gr.Blocks(js=js) as demo:
|
| 1180 |
with gr.Tabs() as tabs:
|
| 1181 |
with gr.Tab("🗣️ TTS"):
|
| 1182 |
tts_tab()
|
|
@@ -1189,4 +1235,4 @@ def initialize_app():
|
|
| 1189 |
|
| 1190 |
|
| 1191 |
demo = initialize_app()
|
| 1192 |
-
demo.launch()
|
|
|
|
| 431 |
refine_generation: bool = False,
|
| 432 |
stream: bool = False,
|
| 433 |
) -> Union[Generator[Tuple[int, np.ndarray], None, None], Tuple[int, np.ndarray]]:
|
|
|
|
|
|
|
| 434 |
|
| 435 |
+
user_id = parameters.user_id
|
| 436 |
+
make_request = True
|
| 437 |
if agent is None and recording_data is not None:
|
| 438 |
audio_manager.update_current_recording(recording_data)
|
| 439 |
clone_voice_name = process_voice_clone(recording_data, user_id)
|
|
|
|
| 441 |
voice_name = str(clone_voice_name)
|
| 442 |
print(f"The voice name, get from voice clone API:::--{voice_name}")
|
| 443 |
else:
|
| 444 |
+
gr.Info("Sorry, we are facing some issues with cloning this voice.\nPlease reload the app and try again.", title='Error')
|
| 445 |
+
print("Did not get any voice name from voice clone api")
|
| 446 |
+
make_request=False
|
| 447 |
else:
|
| 448 |
voice_name = [agents[agent]]
|
| 449 |
+
if make_request:
|
| 450 |
+
print("\nParameters Recieved:\n")
|
| 451 |
+
print("speechReqId", session_id)
|
| 452 |
+
print("text", [text])
|
| 453 |
+
print("language", [language_codes[language]])
|
| 454 |
+
print("voice_name", voice_name)
|
| 455 |
+
print("encoding", "default")
|
| 456 |
+
print("expressive", expressiveness)
|
| 457 |
+
print("stability", stability)
|
| 458 |
+
print("clarity", clarity)
|
| 459 |
+
print("speech_rate", speech_rate)
|
| 460 |
+
print("loudness", loudness)
|
| 461 |
+
print("refine_grneration", refine_generation)
|
| 462 |
+
print("\n\n")
|
| 463 |
|
| 464 |
+
try:
|
| 465 |
+
if not text or text.strip() == "":
|
| 466 |
+
raise gr.Error("Text input cannot be empty")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
|
| 468 |
+
if len(text) > 1000:
|
| 469 |
+
raise gr.Error(
|
| 470 |
+
f"Text length must be less than 1000 characters. Current length: {len(text)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 471 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 472 |
|
| 473 |
+
payload = json.dumps(
|
| 474 |
+
{
|
| 475 |
+
"speechReqId": session_id,
|
| 476 |
+
"user_id": user_id,
|
| 477 |
+
"text": [text],
|
| 478 |
+
"language": [language_codes[language]],
|
| 479 |
+
"voice_name": voice_name,
|
| 480 |
+
"encoding": "default",
|
| 481 |
+
"expressive": expressiveness,
|
| 482 |
+
"stability": stability,
|
| 483 |
+
"clarity": clarity,
|
| 484 |
+
"speech_rate": speech_rate,
|
| 485 |
+
"refine_generation": refine_generation,
|
| 486 |
+
"pronunciation_dict":pronunc_dict
|
| 487 |
+
}
|
| 488 |
+
)
|
| 489 |
|
| 490 |
+
with requests.post(
|
| 491 |
+
url=parameters.TTS_URL,
|
| 492 |
+
headers={"Authorization": f"Bearer {parameters.TTS_secret_key}"},
|
| 493 |
+
data=payload,
|
| 494 |
+
verify=False,
|
| 495 |
+
stream=True,
|
| 496 |
+
) as response:
|
| 497 |
+
if response.status_code != 200:
|
| 498 |
+
print(
|
| 499 |
+
f"API request failed with status {response.status_code} and error: {response.reason}"
|
| 500 |
+
)
|
| 501 |
log_initial_submission(
|
| 502 |
code=response.status_code,
|
| 503 |
session_id=session_id,
|
| 504 |
language=language,
|
| 505 |
+
input_method=None,
|
| 506 |
agent_used=voice_name,
|
| 507 |
+
voice_path=recording_data,
|
| 508 |
text_input=text,
|
| 509 |
expressiveness=expressiveness,
|
| 510 |
stability=stability,
|
|
|
|
| 512 |
speech_rate=speech_rate,
|
| 513 |
loudness=loudness,
|
| 514 |
refine_generation=refine_generation,
|
| 515 |
+
err_code=response.status_code,
|
| 516 |
+
err_msg=response.reason,
|
| 517 |
)
|
| 518 |
+
raise gr.Error(f"API Error: {response.status_code} - {response.reason}")
|
| 519 |
+
sample_rate = 24000
|
| 520 |
+
|
| 521 |
+
if stream:
|
| 522 |
+
# Streaming mode
|
| 523 |
+
last_chunk_time = time.time()
|
| 524 |
+
for chunk in response.iter_content(chunk_size=32768):
|
| 525 |
+
chunk_received_time = time.time()
|
| 526 |
+
chunk_delay = chunk_received_time - last_chunk_time
|
| 527 |
+
last_chunk_time = chunk_received_time
|
| 528 |
+
|
| 529 |
+
if chunk:
|
| 530 |
+
start_processing_time = time.time()
|
| 531 |
+
audio_chunk = np.frombuffer(chunk, dtype=np.int16)
|
| 532 |
+
processing_time = time.time() - start_processing_time
|
| 533 |
+
|
| 534 |
+
if len(audio_chunk) > 0:
|
| 535 |
+
yield_time_start = time.time()
|
| 536 |
+
yield (sample_rate, audio_chunk)
|
| 537 |
+
yield_time_complete = time.time()
|
| 538 |
+
|
| 539 |
+
print(
|
| 540 |
+
f"Streaming chunk of size {len(audio_chunk)} - "
|
| 541 |
+
f"Chunk delay: {chunk_delay:.4f}s, "
|
| 542 |
+
f"Processing time: {processing_time:.4f}s, "
|
| 543 |
+
f"Yielding time: {(yield_time_complete - yield_time_start):.4f}s"
|
| 544 |
+
)
|
| 545 |
else:
|
| 546 |
+
# Non-streaming mode
|
| 547 |
+
start_time = time.time()
|
| 548 |
+
audio_bytes = b""
|
| 549 |
+
|
| 550 |
+
for chunk in response.iter_content(chunk_size=32768):
|
| 551 |
+
if chunk:
|
| 552 |
+
print(f"Streaming chunk of size {len(chunk)}")
|
| 553 |
+
audio_bytes += chunk
|
| 554 |
+
|
| 555 |
+
if len(audio_bytes) > 0:
|
| 556 |
+
complete_audio = np.frombuffer(audio_bytes, dtype=np.int16)
|
| 557 |
+
processing_time = time.time() - start_time
|
| 558 |
+
|
| 559 |
+
complete_audio = increase_volume(complete_audio, factor=loudness)
|
| 560 |
+
yield (sample_rate, complete_audio)
|
| 561 |
+
|
| 562 |
+
saved_path = save_generated_audio(complete_audio, session_id)
|
| 563 |
+
|
| 564 |
+
log_initial_submission(
|
| 565 |
+
code=response.status_code,
|
| 566 |
+
session_id=session_id,
|
| 567 |
+
language=language,
|
| 568 |
+
input_method="Select Voice" if agent else "Voice clone",
|
| 569 |
+
agent_used=voice_name,
|
| 570 |
+
voice_path=saved_path,
|
| 571 |
+
text_input=text,
|
| 572 |
+
expressiveness=expressiveness,
|
| 573 |
+
stability=stability,
|
| 574 |
+
clarity=clarity,
|
| 575 |
+
speech_rate=speech_rate,
|
| 576 |
+
loudness=loudness,
|
| 577 |
+
refine_generation=refine_generation,
|
| 578 |
+
)
|
| 579 |
|
| 580 |
+
else:
|
| 581 |
+
raise ValueError("No audio data received from API")
|
| 582 |
+
|
| 583 |
+
except requests.RequestException as e:
|
| 584 |
+
raise gr.Error(f"Network Error: Failed to connect to the API server - {str(e)}")
|
| 585 |
+
except Exception as e:
|
| 586 |
+
raise gr.Error(f"An unexpected error occurred: {str(e)}")
|
| 587 |
|
| 588 |
|
| 589 |
def disable_rating_box():
|
|
|
|
| 600 |
|
| 601 |
def tts_tab():
|
| 602 |
with aws.fs.open(parameters.GLOBAL_PRONUNCIATION_DICT_PATH,'r') as f:
|
| 603 |
+
global_pronunc_dict = json.loads(f.read())
|
| 604 |
|
| 605 |
pronunc_dict_state = gr.State(value=global_pronunc_dict)
|
| 606 |
session_id = generate_session_id()
|
|
|
|
| 632 |
<div class="info-heading">🎯 Quick Start Guide</div>
|
| 633 |
""")
|
| 634 |
gr.Markdown("""
|
| 635 |
+
|
| 636 |
🌐 **Select Language**
|
| 637 |
* Choose from our listed languages
|
| 638 |
+
|
| 639 |
🎤 **Choose Voice**
|
| 640 |
* Select from the curated collection of high-quality voices
|
| 641 |
* Each voice is optimized for natural speech patterns
|
| 642 |
* You can give your own voice by clicking on Voice clone
|
| 643 |
+
|
| 644 |
✍️ **Enter Your Text**
|
| 645 |
* Type or paste your text in the input box
|
| 646 |
* Or you can give input by clicking Random Sentence
|
| 647 |
+
|
| 648 |
⚙️ **Customize Voice Parameters**
|
| 649 |
* Fine-tune expressiveness for emotional depth
|
| 650 |
* Adjust stability for consistent output
|
| 651 |
* Control clarity for precise articulation
|
| 652 |
+
|
| 653 |
🎵 **Generate Audio**
|
| 654 |
* Click the generate button to create your audio
|
| 655 |
* Processing typically takes a few seconds
|
| 656 |
+
|
| 657 |
⭐ **Provide Feedback**
|
| 658 |
* Rate the generated audio quality
|
| 659 |
* Give us your feedback
|
| 660 |
* Your feedback helps improve our system
|
| 661 |
+
|
| 662 |
💾 **Access Your Audio**
|
| 663 |
* Download the generated audio for offline use
|
| 664 |
""")
|
|
|
|
| 806 |
with gr.Row():
|
| 807 |
pronunc_dict_key_in = gr.Textbox(label="Pronunciation key",placeholder="Enter word")
|
| 808 |
pronunc_dict_key_out = gr.Textbox(label="Pronunciation Value",placeholder="Enter word with correct pronunciation")
|
| 809 |
+
|
| 810 |
add_btn = gr.Button("Add pronunciation pair", variant="primary")
|
| 811 |
|
| 812 |
add_btn.click(
|
|
|
|
| 876 |
<li>Add your pronunciation of any word that doesn't sound well</li>
|
| 877 |
<li>If you don't like the pronunciation of any word, then add your word with the key and value pair</li>
|
| 878 |
<li><b style = "color:red">Note:-</b>Pronunciation pairs are <i style="color:red">case sensitive</i></li>
|
| 879 |
+
<li>If the model mispronounces a word, you can correct it by adding the term as the Pronunciation Key and its phonetic spelling as the Pronunciation Value. For example, if <i><b style="color:red">AI/Cholestrol</b></i> isn't pronounced correctly, respell it as <i><b style = "color:green">ए आई/colestrol</b></i>: enter <i><b style="color:red">AI/Cholestrol</b></i> in the Pronunciation Key field and <i><b style = "color:green">ए आई/colestrol</b></i> in the Pronunciation Value field, then click Add Pronunciation Pair.</li>
|
| 880 |
</ul>
|
| 881 |
""")
|
| 882 |
|
|
|
|
| 1096 |
line-height: 1.4;
|
| 1097 |
margin-bottom: 20px;
|
| 1098 |
}
|
| 1099 |
+
.footer {
|
| 1100 |
+
margin-top:10px;
|
| 1101 |
+
padding: 15px;
|
| 1102 |
+
border-radius: 8px;
|
| 1103 |
+
transition: background-color 0.3s ease;
|
| 1104 |
+
min-height: 200px; /* Consistent height */
|
| 1105 |
+
display: flex;
|
| 1106 |
+
flex-direction: column;
|
| 1107 |
+
justify-content: flex-start;
|
| 1108 |
+
border: 1px solid #e0e0e0;
|
| 1109 |
+
}
|
| 1110 |
+
.footer:hover{
|
| 1111 |
+
background-color: #3f3f46;
|
| 1112 |
+
}
|
| 1113 |
+
.footer .feature-list a.hf-link {
|
| 1114 |
+
color: #FFFF;
|
| 1115 |
+
text-decoration: none;
|
| 1116 |
+
transition: all 0.3s ease;
|
| 1117 |
+
display: inline-block;
|
| 1118 |
+
}
|
| 1119 |
+
.footer .feature-list a.hf-link:hover {
|
| 1120 |
+
color: #EA580C;
|
| 1121 |
+
font-weight: 600;
|
| 1122 |
+
transform: translateX(10px);
|
| 1123 |
+
}
|
| 1124 |
+
.footer .feature-list spam {
|
| 1125 |
+
color: #FFFF;
|
| 1126 |
+
text-decoration: none;
|
| 1127 |
+
transition: all 0.3s ease;
|
| 1128 |
+
display: inline-block;
|
| 1129 |
+
}
|
| 1130 |
+
.footer .feature-list spam:hover {
|
| 1131 |
+
color: #EA580C;
|
| 1132 |
+
font-weight: 600;
|
| 1133 |
+
text-decoration: underline;
|
| 1134 |
+
}
|
| 1135 |
+
|
| 1136 |
/* For mobile responsiveness */
|
| 1137 |
@media (max-width: 768px) {
|
| 1138 |
.features-container {
|
|
|
|
| 1141 |
}
|
| 1142 |
</style>
|
| 1143 |
<div style="text-align: center; font-size: 3em; font-weight: bold; margin-bottom: 20px;"> 🚀 Welcome to ORI Text-to-Speech </div>
|
| 1144 |
+
|
| 1145 |
<div class="section-header">🌟 About Our Technology</div>
|
| 1146 |
<div class="intro-text">Greetings from Oriserve! We're excited to showcase our refined Text-to-Speech capabilities—powered by generative voice synthesis to deliver <strong>natural-sounding</strong> and <strong>professionally tuned</strong> speech output.</div>
|
| 1147 |
|
|
|
|
| 1152 |
<div class="feature-title">🎯 Core Capabilities</div>
|
| 1153 |
<ul class="feature-list">
|
| 1154 |
<li><strong>Robust voice models suited for production use</strong></li>
|
| 1155 |
+
<li><strong>Optimized for English and Hindi, with multilingual expansion underway</strong></li>
|
| 1156 |
<li><strong>Diverse voice styles for varied use cases</strong></li>
|
| 1157 |
<li><strong>Responsive audio generation with practical latency</strong></li>
|
| 1158 |
</ul>
|
| 1159 |
</div>
|
| 1160 |
+
|
| 1161 |
<div class="feature-block">
|
| 1162 |
<div class="feature-title">🛠️ Advanced Controls</div>
|
| 1163 |
<ul class="feature-list">
|
|
|
|
| 1166 |
<li><strong>Balance tuning for clarity and stability</strong></li>
|
| 1167 |
</ul>
|
| 1168 |
</div>
|
| 1169 |
+
|
| 1170 |
<div class="feature-block">
|
| 1171 |
<div class="feature-title">💫 Special Features</div>
|
| 1172 |
<ul class="feature-list">
|
|
|
|
| 1175 |
<li><strong>Improved handling of common pronunciation cases</strong></li>
|
| 1176 |
</ul>
|
| 1177 |
</div>
|
| 1178 |
+
|
| 1179 |
<div class="feature-block">
|
| 1180 |
<div class="feature-title">⚡ Processing Capabilities</div>
|
| 1181 |
<ul class="feature-list">
|
|
|
|
| 1184 |
<li><strong>Audio streaming with first-byte latency as low as ~150 ms</strong></li>
|
| 1185 |
</ul>
|
| 1186 |
</div>
|
| 1187 |
+
|
| 1188 |
<div class="feature-block">
|
| 1189 |
<div class="feature-title">🔊 Audio Quality</div>
|
| 1190 |
<ul class="feature-list">
|
|
|
|
| 1193 |
<li><strong>Consistent synthesis across sessions</strong></li>
|
| 1194 |
</ul>
|
| 1195 |
</div>
|
| 1196 |
+
|
| 1197 |
<div class="feature-block">
|
| 1198 |
<div class="feature-title">📈 Future Development</div>
|
| 1199 |
<ul class="feature-list">
|
|
|
|
| 1202 |
<li><strong>Expanded language and dialect support coming soon</strong></li>
|
| 1203 |
</ul>
|
| 1204 |
</div>
|
| 1205 |
+
<div class = "footer">
|
| 1206 |
+
<div class="feature-title">How to Reach Us</div>
|
| 1207 |
+
<ul class="feature-list">
|
| 1208 |
+
<li><strong>Email : <spam>ai-team@oriserve.com</spam></strong></li>
|
| 1209 |
+
<li><strong>Huggingface : <a href="https://huggingface.co/Oriserve" class="hf-link">Oriserve huggingface</a></strong></li>
|
| 1210 |
+
<li><strong>GitHub : <a href="https://github.com/OriserveAI" class="hf-link">OriserveAI github</a></strong></li>
|
| 1211 |
+
<li><strong>Website : <a href="https://oriserve.com/" class="hf-link">Oriserve website</a></strong></li>
|
| 1212 |
+
</ul>
|
| 1213 |
+
</div>
|
| 1214 |
</div>
|
| 1215 |
|
| 1216 |
"""
|
|
|
|
| 1218 |
|
| 1219 |
return gr.Markdown("")
|
| 1220 |
|
|
|
|
| 1221 |
def initialize_app():
|
| 1222 |
# await audio_manager.load_agent_voices(agents)
|
| 1223 |
|
| 1224 |
try:
|
| 1225 |
+
with gr.Blocks(js=js,css="footer{display:none !important}") as demo:
|
| 1226 |
with gr.Tabs() as tabs:
|
| 1227 |
with gr.Tab("🗣️ TTS"):
|
| 1228 |
tts_tab()
|
|
|
|
| 1235 |
|
| 1236 |
|
| 1237 |
demo = initialize_app()
|
| 1238 |
+
demo.launch(show_api=False)
|