Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -14,7 +14,7 @@ import numpy as np
|
|
| 14 |
from pydub import AudioSegment
|
| 15 |
from docx import Document
|
| 16 |
import PyPDF2
|
| 17 |
-
import
|
| 18 |
|
| 19 |
# Initialize logging
|
| 20 |
logging.basicConfig(level=logging.INFO)
|
|
@@ -45,7 +45,11 @@ app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
|
|
| 45 |
app.layout = dbc.Container([
|
| 46 |
dbc.Row([
|
| 47 |
dbc.Col([
|
| 48 |
-
html.H1("Orpheus Text-to-Speech", className="mb-4"),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
dbc.Input(id="host1-name", placeholder="Enter name of first host", className="mb-2"),
|
| 50 |
dbc.Input(id="host2-name", placeholder="Enter name of second host", className="mb-2"),
|
| 51 |
dbc.Input(id="podcast-name", placeholder="Enter podcast name", className="mb-2"),
|
|
@@ -65,7 +69,9 @@ app.layout = dbc.Container([
|
|
| 65 |
'margin': '10px 0'
|
| 66 |
},
|
| 67 |
),
|
|
|
|
| 68 |
dcc.Slider(id="duration", min=1, max=60, value=5, step=1, marks={1: '1', 30: '30', 60: '60'}, className="mb-2"),
|
|
|
|
| 69 |
dbc.RadioItems(
|
| 70 |
id="num-hosts",
|
| 71 |
options=[{"label": i, "value": i} for i in ["1", "2"]],
|
|
@@ -77,18 +83,24 @@ app.layout = dbc.Container([
|
|
| 77 |
], width=6),
|
| 78 |
dbc.Col([
|
| 79 |
dbc.Textarea(id="script-output", placeholder="Generated script will appear here...", rows=10, className="mb-2"),
|
|
|
|
|
|
|
| 80 |
dcc.Dropdown(id="voice1", options=[{"label": v, "value": v} for v in VOICES], value="tara", className="mb-2"),
|
|
|
|
| 81 |
dcc.Dropdown(id="voice2", options=[{"label": v, "value": v} for v in VOICES], value="zac", className="mb-2"),
|
| 82 |
dbc.Button("Generate Audio", id="generate-audio-btn", color="success", className="mb-2"),
|
| 83 |
html.Div(id="audio-output"),
|
| 84 |
-
dbc.Button("
|
| 85 |
dbc.Collapse([
|
|
|
|
| 86 |
dcc.Slider(id="temperature", min=0.1, max=1.5, value=0.6, step=0.05, marks={0.1: '0.1', 0.8: '0.8', 1.5: '1.5'}, className="mb-2"),
|
|
|
|
| 87 |
dcc.Slider(id="top-p", min=0.1, max=1.0, value=0.9, step=0.05, marks={0.1: '0.1', 0.5: '0.5', 1.0: '1.0'}, className="mb-2"),
|
|
|
|
| 88 |
dcc.Slider(id="repetition-penalty", min=1.0, max=2.0, value=1.2, step=0.1, marks={1.0: '1.0', 1.5: '1.5', 2.0: '2.0'}, className="mb-2"),
|
|
|
|
| 89 |
dcc.Slider(id="max-new-tokens", min=100, max=16384, value=4096, step=100, marks={100: '100', 8192: '8192', 16384: '16384'}, className="mb-2"),
|
| 90 |
], id="advanced-settings", is_open=False),
|
| 91 |
-
dbc.Button("Advanced Settings", id="advanced-settings-toggle", color="info", className="mb-2"),
|
| 92 |
], width=6),
|
| 93 |
]),
|
| 94 |
dcc.Store(id='generated-script'),
|
|
@@ -126,13 +138,12 @@ def detect_silence(audio, threshold=0.01, min_silence_len=1000):
|
|
| 126 |
silent_regions.append((silent_start, len(audio)))
|
| 127 |
return silent_regions
|
| 128 |
|
| 129 |
-
@spaces.GPU()
|
| 130 |
def generate_audio(script_output, voice1, voice2, num_hosts, temperature, top_p, repetition_penalty, max_new_tokens):
|
| 131 |
try:
|
| 132 |
paragraphs = script_output.split('\n\n') # Split by double newline
|
| 133 |
audio_samples = []
|
| 134 |
|
| 135 |
-
for i, paragraph in enumerate(paragraphs):
|
| 136 |
if not paragraph.strip():
|
| 137 |
continue
|
| 138 |
|
|
@@ -151,6 +162,7 @@ def generate_audio(script_output, voice1, voice2, num_hosts, temperature, top_p,
|
|
| 151 |
max_new_tokens=max_new_tokens,
|
| 152 |
num_return_sequences=1,
|
| 153 |
eos_token_id=128258,
|
|
|
|
| 154 |
)
|
| 155 |
|
| 156 |
code_list = parse_output(generated_ids)
|
|
@@ -265,52 +277,4 @@ def combined_callback(generate_script_clicks, generate_audio_clicks, advanced_se
|
|
| 265 |
Start a new paragraph only when switching to a different speaker if the number of hosts is not 1.
|
| 266 |
Maintain natural conversation flow and speech patterns within each monologue.
|
| 267 |
Use context clues or subtle references to indicate who is speaking without explicit labels if the number of hosts is not 1.
|
| 268 |
-
Use speaker names ({host1_name} and/or {
|
| 269 |
-
Rely more on context and speech patterns to indicate who is speaking, rather than always stating names.
|
| 270 |
-
Use names primarily for transitions sparingly, definitely with agreements, or to draw attention to a specific point, not as a constant form of address.
|
| 271 |
-
{'Make sure the script is a monologue for one person.' if num_hosts == 1 else f'Ensure the dialogue alternates between two distinct voices, with {host1_name} speaking on odd-numbered lines and {host2_name} on even-numbered lines.'}
|
| 272 |
-
Always include intro with the speaker name and its the podcast name "{podcast_name}" in intoduce the topic of the podcast with "{podcast_topic}".
|
| 273 |
-
Incorporate the podcast name and topic naturally into the intro and outro, and ensure the content stays relevant to the specified topic throughout the script.
|
| 274 |
-
"""
|
| 275 |
-
|
| 276 |
-
response = model.generate_content(prompt_template)
|
| 277 |
-
return re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', response.text), dash.no_update, dash.no_update, dash.no_update
|
| 278 |
-
except Exception as e:
|
| 279 |
-
logger.error(f"Error generating podcast script: {str(e)}")
|
| 280 |
-
return f"Error: {str(e)}", dash.no_update, dash.no_update, dash.no_update
|
| 281 |
-
|
| 282 |
-
elif trigger_id == "generate-audio-btn":
|
| 283 |
-
if not script_output.strip():
|
| 284 |
-
return dash.no_update, html.Div("No audio generated yet."), dash.no_update, dash.no_update
|
| 285 |
-
|
| 286 |
-
final_audio = generate_audio(script_output, voice1, voice2, num_hosts, temperature, top_p, repetition_penalty, max_new_tokens)
|
| 287 |
-
|
| 288 |
-
if final_audio is not None:
|
| 289 |
-
# Convert to base64 for audio playback
|
| 290 |
-
audio_base64 = base64.b64encode(final_audio.tobytes()).decode('utf-8')
|
| 291 |
-
src = f"data:audio/wav;base64,{audio_base64}"
|
| 292 |
-
|
| 293 |
-
# Create a download link for the audio
|
| 294 |
-
download_link = html.A("Download Audio", href=src, download="generated_audio.wav")
|
| 295 |
-
|
| 296 |
-
return dash.no_update, html.Div([
|
| 297 |
-
html.Audio(src=src, controls=True),
|
| 298 |
-
html.Br(),
|
| 299 |
-
download_link
|
| 300 |
-
]), dash.no_update, dash.no_update
|
| 301 |
-
else:
|
| 302 |
-
return dash.no_update, html.Div("Error generating audio"), dash.no_update, dash.no_update
|
| 303 |
-
|
| 304 |
-
elif trigger_id == "advanced-settings-toggle":
|
| 305 |
-
return dash.no_update, dash.no_update, not is_advanced_open, dash.no_update
|
| 306 |
-
|
| 307 |
-
elif trigger_id == "clear-btn":
|
| 308 |
-
return "", html.Div("No audio generated yet."), dash.no_update, ""
|
| 309 |
-
|
| 310 |
-
return dash.no_update, dash.no_update, dash.no_update, dash.no_update
|
| 311 |
-
|
| 312 |
-
# Run the app
|
| 313 |
-
if __name__ == '__main__':
|
| 314 |
-
print("Starting the Dash application...")
|
| 315 |
-
app.run(debug=True, host='0.0.0.0', port=7860)
|
| 316 |
-
print("Dash application has finished running.")
|
|
|
|
| 14 |
from pydub import AudioSegment
|
| 15 |
from docx import Document
|
| 16 |
import PyPDF2
|
| 17 |
+
from tqdm import tqdm
|
| 18 |
|
| 19 |
# Initialize logging
|
| 20 |
logging.basicConfig(level=logging.INFO)
|
|
|
|
| 45 |
app.layout = dbc.Container([
|
| 46 |
dbc.Row([
|
| 47 |
dbc.Col([
|
| 48 |
+
html.H1("Orpheus Text-to-Speech", className="text-center mb-4"),
|
| 49 |
+
], width=12),
|
| 50 |
+
]),
|
| 51 |
+
dbc.Row([
|
| 52 |
+
dbc.Col([
|
| 53 |
dbc.Input(id="host1-name", placeholder="Enter name of first host", className="mb-2"),
|
| 54 |
dbc.Input(id="host2-name", placeholder="Enter name of second host", className="mb-2"),
|
| 55 |
dbc.Input(id="podcast-name", placeholder="Enter podcast name", className="mb-2"),
|
|
|
|
| 69 |
'margin': '10px 0'
|
| 70 |
},
|
| 71 |
),
|
| 72 |
+
html.Label("Duration (minutes)", className="mt-2"),
|
| 73 |
dcc.Slider(id="duration", min=1, max=60, value=5, step=1, marks={1: '1', 30: '30', 60: '60'}, className="mb-2"),
|
| 74 |
+
html.Label("Number of Hosts", className="mt-2"),
|
| 75 |
dbc.RadioItems(
|
| 76 |
id="num-hosts",
|
| 77 |
options=[{"label": i, "value": i} for i in ["1", "2"]],
|
|
|
|
| 83 |
], width=6),
|
| 84 |
dbc.Col([
|
| 85 |
dbc.Textarea(id="script-output", placeholder="Generated script will appear here...", rows=10, className="mb-2"),
|
| 86 |
+
dbc.Button("Clear", id="clear-btn", color="secondary", className="mb-2"),
|
| 87 |
+
html.Label("Voice 1", className="mt-2"),
|
| 88 |
dcc.Dropdown(id="voice1", options=[{"label": v, "value": v} for v in VOICES], value="tara", className="mb-2"),
|
| 89 |
+
html.Label("Voice 2", className="mt-2"),
|
| 90 |
dcc.Dropdown(id="voice2", options=[{"label": v, "value": v} for v in VOICES], value="zac", className="mb-2"),
|
| 91 |
dbc.Button("Generate Audio", id="generate-audio-btn", color="success", className="mb-2"),
|
| 92 |
html.Div(id="audio-output"),
|
| 93 |
+
dbc.Button("Advanced Settings", id="advanced-settings-toggle", color="info", className="mb-2"),
|
| 94 |
dbc.Collapse([
|
| 95 |
+
html.Label("Temperature", className="mt-2"),
|
| 96 |
dcc.Slider(id="temperature", min=0.1, max=1.5, value=0.6, step=0.05, marks={0.1: '0.1', 0.8: '0.8', 1.5: '1.5'}, className="mb-2"),
|
| 97 |
+
html.Label("Top P", className="mt-2"),
|
| 98 |
dcc.Slider(id="top-p", min=0.1, max=1.0, value=0.9, step=0.05, marks={0.1: '0.1', 0.5: '0.5', 1.0: '1.0'}, className="mb-2"),
|
| 99 |
+
html.Label("Repetition Penalty", className="mt-2"),
|
| 100 |
dcc.Slider(id="repetition-penalty", min=1.0, max=2.0, value=1.2, step=0.1, marks={1.0: '1.0', 1.5: '1.5', 2.0: '2.0'}, className="mb-2"),
|
| 101 |
+
html.Label("Max New Tokens", className="mt-2"),
|
| 102 |
dcc.Slider(id="max-new-tokens", min=100, max=16384, value=4096, step=100, marks={100: '100', 8192: '8192', 16384: '16384'}, className="mb-2"),
|
| 103 |
], id="advanced-settings", is_open=False),
|
|
|
|
| 104 |
], width=6),
|
| 105 |
]),
|
| 106 |
dcc.Store(id='generated-script'),
|
|
|
|
| 138 |
silent_regions.append((silent_start, len(audio)))
|
| 139 |
return silent_regions
|
| 140 |
|
|
|
|
| 141 |
def generate_audio(script_output, voice1, voice2, num_hosts, temperature, top_p, repetition_penalty, max_new_tokens):
|
| 142 |
try:
|
| 143 |
paragraphs = script_output.split('\n\n') # Split by double newline
|
| 144 |
audio_samples = []
|
| 145 |
|
| 146 |
+
for i, paragraph in tqdm(enumerate(paragraphs), total=len(paragraphs), desc="Generating audio"):
|
| 147 |
if not paragraph.strip():
|
| 148 |
continue
|
| 149 |
|
|
|
|
| 162 |
max_new_tokens=max_new_tokens,
|
| 163 |
num_return_sequences=1,
|
| 164 |
eos_token_id=128258,
|
| 165 |
+
pad_token_id=128258,
|
| 166 |
)
|
| 167 |
|
| 168 |
code_list = parse_output(generated_ids)
|
|
|
|
| 277 |
Start a new paragraph only when switching to a different speaker if the number of hosts is not 1.
|
| 278 |
Maintain natural conversation flow and speech patterns within each monologue.
|
| 279 |
Use context clues or subtle references to indicate who is speaking without explicit labels if the number of hosts is not 1.
|
| 280 |
+
Use speaker names ({host1_name} and/or {host2_
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|