# EverySpeech / app.py
# AK97GAMERZ's picture
# Update app.py
# 8701484 verified
import tempfile
import time
from pathlib import Path
from typing import Optional, Tuple, List, Dict
import spaces # Not strictly needed if not using @spaces.GPU decorator for Piper
import re
import os
import json # For Piper voice config (though Piper library handles it mostly)
import gradio as gr
import numpy as np
import soundfile as sf
# import torch # Keep if other parts might need it
# --- Piper TTS ---
PIPER_TTS_AVAILABLE = False
PiperVoice = None  # Replaced by the real class below if the import succeeds
piper_voice_instance = None  # Loaded default voice; assigned later at startup
PIPER_VOICE_DIR = Path("./piper_voices")  # Local cache directory for downloaded Piper voices
DEFAULT_PIPER_VOICE_REPO_ID = "rhasspy/piper-voices"
# Choose a specific voice: e.g., en_US-lessac-medium from rhasspy/piper-voices
# The path within the repo is often like: en/en_US/lessac/medium/
DEFAULT_PIPER_VOICE_SUBPATH = "en/en_US/lessac/medium/"  # Trailing slash is important
DEFAULT_PIPER_VOICE_NAME = "en_US-lessac-medium"  # Filename base

try:
    from piper import PiperVoice
    from huggingface_hub import hf_hub_download  # For downloading voices

    PIPER_TTS_AVAILABLE = True
    print("Piper TTS library and huggingface_hub loaded.")
except ImportError:
    print("CRITICAL ERROR: piper-tts or huggingface_hub library not found. Piper TTS features will be disabled. Ensure 'piper-tts', 'onnxruntime', 'huggingface_hub' are in requirements.txt")
def ensure_piper_voice(voice_name_base=DEFAULT_PIPER_VOICE_NAME, voice_subpath=DEFAULT_PIPER_VOICE_SUBPATH):
    """Ensure the Piper voice files (.onnx model and .onnx.json config) exist locally.

    Looks in PIPER_VOICE_DIR first; if either file is missing, downloads both
    from DEFAULT_PIPER_VOICE_REPO_ID on the Hugging Face Hub.

    Args:
        voice_name_base: Filename base of the voice (e.g. "en_US-lessac-medium").
        voice_subpath: Directory of the voice inside the repo
            (e.g. "en/en_US/lessac/medium/").

    Returns:
        (onnx_path, json_path) as Path objects pointing at existing files,
        or (None, None) if the download fails.
    """
    PIPER_VOICE_DIR.mkdir(parents=True, exist_ok=True)
    onnx_filename = f"{voice_name_base}.onnx"
    json_filename = f"{voice_name_base}.onnx.json"
    onnx_path = PIPER_VOICE_DIR / onnx_filename
    json_path = PIPER_VOICE_DIR / json_filename
    if onnx_path.exists() and json_path.exists():
        print(f"Piper voice '{voice_name_base}' found locally at {PIPER_VOICE_DIR}.")
        return onnx_path, json_path
    print(f"Downloading Piper voice '{voice_name_base}' from repo '{DEFAULT_PIPER_VOICE_REPO_ID}' subpath '{voice_subpath}'...")
    try:
        # BUGFIX: hf_hub_download(local_dir=...) preserves the repo's directory
        # structure, so the files land at PIPER_VOICE_DIR/<subpath>/<name> — NOT
        # at PIPER_VOICE_DIR/<name>. The original code returned the latter, so a
        # fresh download was never found by the caller. Use the paths that
        # hf_hub_download actually returns. (The deprecated
        # local_dir_use_symlinks argument has been dropped.)
        onnx_path = Path(hf_hub_download(
            repo_id=DEFAULT_PIPER_VOICE_REPO_ID,
            filename=f"{voice_subpath.strip('/')}/{onnx_filename}",
            local_dir=PIPER_VOICE_DIR,
            repo_type="model",  # Explicitly model type
        ))
        onnx_path = Path(onnx_path)
        json_path = Path(hf_hub_download(
            repo_id=DEFAULT_PIPER_VOICE_REPO_ID,
            filename=f"{voice_subpath.strip('/')}/{json_filename}",
            local_dir=PIPER_VOICE_DIR,
            repo_type="model",
        ))
        print(f"Piper voice '{voice_name_base}' downloaded successfully.")
    except Exception as e:
        print(f"ERROR downloading Piper voice '{voice_name_base}': {e}")
        gr.Warning(f"Failed to download Piper voice {voice_name_base}. TTS might not work.")
        return None, None  # Return None if download fails
    return onnx_path, json_path
if PIPER_TTS_AVAILABLE:
    onnx_path, json_path = ensure_piper_voice()
    # Double check existence after potential download.
    voice_files_ok = bool(onnx_path and json_path and onnx_path.exists() and json_path.exists())
    if not voice_files_ok:
        print("Piper voice files not found or not downloaded correctly. Piper TTS disabled.")
        PIPER_TTS_AVAILABLE = False
    else:
        try:
            piper_voice_instance = PiperVoice.load(str(onnx_path), config_path=str(json_path))
            print("Default Piper TTS voice loaded successfully.")
        except Exception as e:
            print(f"ERROR loading Piper voice from {onnx_path} and {json_path}: {e}")
            # Disable the feature entirely rather than leave a half-loaded voice.
            PIPER_TTS_AVAILABLE = False
from pydub import AudioSegment # For background sound mixing
# --- Gemini Configuration (Keep your existing Gemini setup) ---
try:
    import google.generativeai as genai
    GOOGLE_GEMINI_AVAILABLE = True
except ImportError:
    GOOGLE_GEMINI_AVAILABLE = False
    print("Warning: google-generativeai library not found. Gemini features will be disabled.")

GEMINI_API_KEY = os.environ.get("GOOGLE_API_KEY")
gemini_configured_successfully = False
gemini_model_instance = None
GEMINI_MODEL_NAME = "gemini-1.5-flash-latest"  # Or your preferred Gemini model

if GOOGLE_GEMINI_AVAILABLE and GEMINI_API_KEY:
    try:
        genai.configure(api_key=GEMINI_API_KEY)
        gemini_model_instance = genai.GenerativeModel(GEMINI_MODEL_NAME)
        # Optional: Test call to confirm API key and model access if desired (can slow startup)
        # _ = gemini_model_instance.generate_content("test", generation_config={'candidate_count': 1})
        gemini_configured_successfully = True
        print(f"Google Gemini configured successfully with model: {GEMINI_MODEL_NAME}")
    except Exception as e:
        print(f"Error configuring Google Gemini at startup: {e}. Gemini features might not work correctly.")
elif GOOGLE_GEMINI_AVAILABLE and not GEMINI_API_KEY:
    print("INFO: GOOGLE_API_KEY not found in environment variables/secrets for Gemini. Manual input will be required if feature is used.")
else:
    # Reached only when the SDK import failed; the inner check is kept
    # from the original code even though the first branch is unreachable here.
    if GOOGLE_GEMINI_AVAILABLE:  # SDK is there, but no key
        print("INFO: Gemini SDK is available, but no API key found (neither in secrets nor manually provided yet).")
    else:  # SDK itself is missing
        print("INFO: Gemini features disabled (SDK not available).")
def enhance_text_with_gemini(text_to_enhance: str, enhancement_prompt: str, api_key_manual: Optional[str] = None) -> str:
    """Run the user's text through Gemini with the given enhancement prompt.

    Returns ``text_to_enhance`` unchanged whenever the SDK is missing, no
    usable API key/model is available, the request errors out, or the
    response is blocked/empty.

    Args:
        text_to_enhance: Original input text.
        enhancement_prompt: Instruction prepended to the text.
        api_key_manual: Optional per-request API key; temporarily reconfigures
            the SDK for this call and reverts afterwards when a global key
            was previously set.
    """
    if not GOOGLE_GEMINI_AVAILABLE:
        gr.Warning("Gemini SDK not available. Cannot enhance text.")
        return text_to_enhance

    model = None
    used_manual_key = False
    # NOTE(review): genai.API_KEY is guarded by hasattr; if the installed SDK
    # does not expose that attribute this stays None, which makes the revert
    # below a no-op — confirm against the SDK version in use.
    saved_global_key = genai.API_KEY if hasattr(genai, 'API_KEY') and genai.API_KEY else None

    if api_key_manual:
        gr.Info("Attempting to use manually provided API key for this Gemini request...")
        try:
            genai.configure(api_key=api_key_manual)
            model = genai.GenerativeModel(GEMINI_MODEL_NAME)
            _ = model.generate_content("test", generation_config={'candidate_count': 1})  # Test call
            used_manual_key = True
            print("Gemini temporarily configured and tested with manual API key for this request.")
        except Exception as e:
            gr.Error(f"Failed to configure or test Gemini with provided manual API key: {e}")
            # Restore the original global key, if there was one.
            if saved_global_key and hasattr(genai, 'configure'):
                try:
                    genai.configure(api_key=saved_global_key)
                except Exception:
                    pass
            return text_to_enhance
    elif gemini_configured_successfully and gemini_model_instance:
        model = gemini_model_instance
    else:
        gr.Warning("Google API Key for Gemini not provided or Gemini not configured correctly. Cannot enhance text.")
        return text_to_enhance

    if not model:
        gr.Error("Gemini model instance could not be initialized. Cannot enhance text.")
        return text_to_enhance

    full_prompt = f"{enhancement_prompt}:\n\n---\n{text_to_enhance}\n---"
    print(f"Sending to Gemini: First 100 chars of prompt: {full_prompt[:100]}...")
    try:
        response = model.generate_content(full_prompt, generation_config={'candidate_count': 1})
        if response.parts:
            enhanced_text = response.text
            print(f"Gemini enhanced text (first 100 chars): {enhanced_text[:100]}")
            return enhanced_text
        # No parts: the prompt was blocked or produced no usable candidate.
        block_reason = "Unknown"
        safety_ratings_str = "N/A"
        if hasattr(response, 'prompt_feedback') and response.prompt_feedback:
            block_reason = response.prompt_feedback.block_reason or "Not Blocked but no parts"
            if hasattr(response.prompt_feedback, 'safety_ratings'):
                safety_ratings_str = ", ".join([f"{rating.category.name}: {rating.probability.name}" for rating in response.prompt_feedback.safety_ratings])
        elif hasattr(response, 'candidates') and response.candidates and hasattr(response.candidates[0], 'finish_reason') and response.candidates[0].finish_reason != 1:  # 1 is typically "STOP"
            block_reason = f"Finish Reason: {response.candidates[0].finish_reason.name if hasattr(response.candidates[0].finish_reason, 'name') else response.candidates[0].finish_reason}"
        gr.Warning(f"Gemini returned no content. Block Reason: {block_reason}. Safety: {safety_ratings_str}. Using original text.")
        return text_to_enhance
    except Exception as e:
        gr.Error(f"Error calling Gemini API: {type(e).__name__}: {e}")
        return text_to_enhance
    finally:
        # Revert manual key config if it was used and there was an original key.
        if used_manual_key and saved_global_key and hasattr(genai, 'configure'):
            try:
                genai.configure(api_key=saved_global_key)
                print("Reverted Gemini configuration to use original API key (if it was set).")
            except Exception as revert_e:
                print(f"Warning: Could not revert Gemini config: {revert_e}")
        elif used_manual_key and not saved_global_key and hasattr(genai, 'configure'):
            # A manual key cannot easily be "unset" without a prior global key;
            # the next call without a manual key will fail unless secrets are set.
            print("Manual Gemini API key was used. Global API key (from secrets) remains unset if it wasn't there before.")
# --- Text Parsing for Piper TTS and Background Sounds ---
def parse_text_for_piper_and_sounds(text: str) -> Tuple[str, List[Dict]]:
    """Prepare raw input text for Piper TTS.

    Strips speaker tags ``S1=(...)``/``S2=(...)`` and ``{characteristic}``
    tags (Piper would read them literally), and extracts ``[sound: name,
    volume: X]`` directives into a request list. Non-verbal tags such as
    ``(laughs)`` are left in place for Piper to read as plain words.

    Returns:
        (text_for_tts, sound_requests) where sound_requests is a list of
        dicts with keys "name" (lowercased) and "volume_factor" (clamped
        to [0.0, 1.0], default 0.5).
    """
    sound_requests = []
    # Remove speaker/characteristic tags so Piper doesn't speak them literally.
    # (Comment these two lines out if Piper should attempt to interpret them.)
    cleaned = re.sub(r"S[12]=\([^)]+\)", "", text)
    cleaned = re.sub(r"\{[^}]+\}", "", cleaned)

    def _consume_sound_tag(match):
        inner = match.group(1)  # Everything between "[sound:" and "]"
        try:
            # Rebuild the full tag so both sub-patterns parse consistently.
            tag = f"[sound:{inner}]"
            name_match = re.search(r"sound:\s*([\w\-\_]+)", tag)
            if name_match is None:
                gr.Warning(f"Malformed sound tag (missing sound name): '[sound:{inner}]'. Will be spoken as text.")
                return match.group(0)  # Leave malformed tag in the text
            volume_match = re.search(r"volume:\s*([0-9\.]+)", tag)
            volume = float(volume_match.group(1)) if volume_match else 0.5
            sound_requests.append({
                "name": name_match.group(1).lower(),
                "volume_factor": max(0.0, min(1.0, volume)),
            })
            return ""  # Strip the tag from the text sent to Piper
        except Exception as e:
            gr.Warning(f"Could not parse sound tag '[sound:{inner}]': {e}. Will be spoken as text.")
            return match.group(0)  # Keep original tag when parsing fails

    text_for_tts = re.sub(r"\[sound:([^\]]+)\]", _consume_sound_tag, cleaned)
    return text_for_tts.strip(), sound_requests
# --- Background Sound Configuration ---
# Directory searched for files referenced by [sound: name] tags.
SOUND_EFFECTS_DIR = Path("./sounds")
# Extensions tried (in order) when resolving a [sound:] name to a file.
SUPPORTED_SOUND_FORMATS_BG = [".wav", ".mp3"]
# Extra gain in dB always applied to the background track during mixing.
DEFAULT_BACKGROUND_VOLUME_ADJUST_DB = -18 # Made background sounds a bit quieter
# --- Main Inference Function ---
# @spaces.GPU # Not needed for Piper as it's CPU-fast
def run_inference(
    text_input_original: str,
    piper_length_scale: float,
    piper_noise_scale: float,
    piper_noise_w: float,
    speed_factor_post_tts: float,
    enable_gemini_enhancement: bool,
    gemini_enhancement_prompt: str,
    gemini_api_key_manual: Optional[str],
):
    """Full synthesis pipeline for the Gradio UI.

    Steps: optional Gemini rewrite -> tag parsing -> Piper TTS ->
    post-TTS speed resampling -> background-sound mixing.

    Returns:
        (sample_rate, int16 numpy array) as expected by gr.Audio(type="numpy").

    Raises:
        gr.Error: when the input is empty and no sound tag was given.
    """
    if not PIPER_TTS_AVAILABLE or not piper_voice_instance:
        gr.Error("Piper TTS is not available or a voice model is not loaded. Cannot synthesize speech.")
        return (22050, np.zeros(100, dtype=np.int16))  # Default SR, short silence

    # 1. Gemini Enhancement (if enabled)
    processed_text = text_input_original
    if enable_gemini_enhancement:
        if gemini_enhancement_prompt and gemini_enhancement_prompt.strip():
            gr.Info("Enhancing text with Gemini...")
            processed_text = enhance_text_with_gemini(
                text_input_original,
                gemini_enhancement_prompt,
                gemini_api_key_manual,
            )
        else:
            gr.Warning("Gemini enhancement enabled, but the enhancement prompt is empty. Using original text.")

    # 2. Parse for Piper TTS text and [sound:...] directives
    text_for_piper, sound_requests = parse_text_for_piper_and_sounds(processed_text)
    print(f"Text for Piper TTS: '{text_for_piper}'")
    print(f"Sound requests: {sound_requests}")

    # 3. Piper TTS Generation
    generated_speech_np = np.array([], dtype=np.int16)  # Piper outputs int16 PCM
    # Fall back to 22050 Hz if the voice config is missing.
    output_sr = piper_voice_instance.config.sample_rate if piper_voice_instance.config else 22050

    if text_for_piper:
        gr.Info(f"Synthesizing with Piper TTS...")
        synthesis_kwargs = {
            'length_scale': piper_length_scale,
            'noise_scale': piper_noise_scale,
            'noise_w': piper_noise_w,
        }
        start_time = time.time()
        # Piper's synthesize method yields chunks of raw audio bytes.
        try:
            audio_bytes = b"".join(piper_voice_instance.synthesize(text_for_piper, **synthesis_kwargs))
            end_time = time.time()
            print(f"Piper TTS Generation finished in {end_time - start_time:.2f} seconds.")
            if audio_bytes:
                generated_speech_np = np.frombuffer(audio_bytes, dtype=np.int16)
            else:
                gr.Warning("Piper TTS produced no audio bytes for the input text.")
        except Exception as e_piper:
            gr.Error(f"Error during Piper TTS synthesis: {e_piper}")
            # Fallback to silence if Piper fails
            generated_speech_np = np.zeros(int(output_sr * 0.1), dtype=np.int16)
    elif sound_requests:  # Only sound requests, no text for Piper
        gr.Info("No text for Piper TTS. Will generate background sound over silence if requested.")
        generated_speech_np = np.zeros(int(output_sr * 0.1), dtype=np.int16)  # 0.1 sec silence for mixing
    else:  # No text and no sound requests
        if not (text_input_original and text_input_original.strip()):
            raise gr.Error("Input text is empty and no sound was requested.")
        else:  # Original text got stripped to nothing and no sounds requested
            gr.Warning("After processing, no text remained for TTS and no background sound was requested.")
            generated_speech_np = np.zeros(int(output_sr * 0.1), dtype=np.int16)

    # 4. Post-TTS Speed Adjustment (simple linear resampling of the int16 signal)
    if speed_factor_post_tts != 1.0 and len(generated_speech_np) > 0:
        # Work in float32 in [-1, 1], then clip and convert back to int16.
        max_val = np.iinfo(np.int16).max
        generated_speech_float = generated_speech_np.astype(np.float32) / max_val
        original_len = len(generated_speech_float)
        speed_factor_clamped = max(0.5, min(2.0, speed_factor_post_tts))
        target_len = int(original_len / speed_factor_clamped)
        if target_len != original_len and target_len > 0:
            x_original = np.arange(original_len)
            x_resampled = np.linspace(0, original_len - 1, target_len)
            resampled_float = np.interp(x_resampled, x_original, generated_speech_float)
            generated_speech_np = (np.clip(resampled_float, -1.0, 1.0) * max_val).astype(np.int16)
            print(f"Applied post-TTS speed factor {speed_factor_clamped}")
        else:
            print("Skipped post-TTS speed adjustment (target length invalid or no change).")

    # 5. Background Sound Mixing — only the FIRST [sound:] tag is honored.
    final_audio_data_np = generated_speech_np
    final_sr = output_sr
    if sound_requests:
        overall_sound_effect_to_mix = sound_requests[0]
        sound_name = overall_sound_effect_to_mix["name"]
        volume_factor = overall_sound_effect_to_mix["volume_factor"]
        # Resolve the sound name against supported extensions, first hit wins.
        bg_sound_file_path = next(
            (p for fmt in SUPPORTED_SOUND_FORMATS_BG
             for p in [SOUND_EFFECTS_DIR / (sound_name + fmt)] if p.exists()),
            None,
        )
        if bg_sound_file_path:
            print(f"Mixing with background sound: {bg_sound_file_path}")
            try:
                # Wrap the speech samples in an AudioSegment for pydub mixing.
                speech_segment = AudioSegment(
                    data=final_audio_data_np.tobytes(),
                    sample_width=final_audio_data_np.dtype.itemsize,  # 2 for int16
                    frame_rate=final_sr,
                    channels=1,  # Piper output is mono
                )
                bg_segment = AudioSegment.from_file(bg_sound_file_path).set_channels(1).set_frame_rate(final_sr)
                # Translate the linear volume factor into a dB gain change.
                db_reduction = 0.0
                if volume_factor > 0.001 and volume_factor < 1.0:
                    db_reduction = 20 * np.log10(volume_factor)
                elif volume_factor <= 0.001:
                    db_reduction = -60  # Effectively very quiet
                bg_segment_adjusted = bg_segment + db_reduction + DEFAULT_BACKGROUND_VOLUME_ADJUST_DB
                # If speech is effectively just the placeholder silence, stretch
                # it so the background sound is audible for up to 5 seconds.
                if len(speech_segment) < 100 and len(bg_segment_adjusted) > 0:
                    target_duration_ms = max(len(speech_segment), min(len(bg_segment_adjusted), 5000))
                    speech_segment = AudioSegment.silent(duration=target_duration_ms, frame_rate=final_sr)
                # Loop the background under the speech; guard the division.
                loop_times = int(len(speech_segment) / len(bg_segment_adjusted)) + 1 if len(bg_segment_adjusted) > 0 else 1
                mixed_segment = speech_segment.overlay(bg_segment_adjusted, loop=True, times=loop_times)
                final_audio_data_np = np.array(mixed_segment.get_array_of_samples()).astype(np.int16)
                final_sr = mixed_segment.frame_rate  # Should match output_sr
            except Exception as e:
                gr.Warning(f"Could not mix background sound '{sound_name}': {e}")
                print(f"Error mixing sound: {e}")
                import traceback
                traceback.print_exc()
                # Fallback to speech only (already in final_audio_data_np)
        else:
            gr.Warning(f"Background sound '{sound_name}' not found in {SOUND_EFFECTS_DIR}. Using speech only.")

    # Never return a zero-length buffer to gr.Audio.
    if len(final_audio_data_np) == 0:
        print("Warning: Final audio data is empty. Returning short silence.")
        final_audio_data_np = np.zeros(int(final_sr * 0.1), dtype=np.int16)
    return (final_sr, final_audio_data_np)
# --- Gradio UI ---
css = """
#col-container {max-width: 90%; margin-left: auto; margin-right: auto;}
.gr-prose {font-size: 100% !important;} .gr-prose h1 {font-size: 2.5em !important;}
.gr-prose h2 {font-size: 1.8em !important;} .gr-prose p {font-size: 1.1em !important; margin-bottom: 0.5em !important;}
.gr-prose ul {font-size: 1.0em !important; margin-left: 20px !important;} .gr-prose li { margin-bottom: 0.3em !important;}
"""

default_text_piper = """Welcome to EverySpeech with the speedy Piper TTS!
S1=(This speaker tag will be removed) {excited} This characteristic tag will also be removed.
(laughs) This (laughs) tag will be read out by Piper if not stripped by a more aggressive parser.
You can add background sound too: [sound: city, volume: 0.3] I am speaking over city noise.
"""

# Optional override: an example.txt next to the app replaces the default text.
example_txt_path = Path("./example.txt")
if example_txt_path.exists():
    try:
        loaded_default_text = example_txt_path.read_text(encoding="utf-8").strip()
        if loaded_default_text:
            default_text_piper = loaded_default_text
    except Exception as e:
        print(f"Warning: Could not read example.txt: {e}")

how_to_use_text_piper = f"""
## How to Use EverySpeech (Piper TTS Edition):
**Fast CPU-based Text-to-Speech!**
- Write text in **Input Text**.
- `S1=(description)` and `{{characteristic}}` tags are currently **stripped** before sending to Piper TTS.
- Non-verbal tags like `(laughs)` or `(coughs)` will be **read as text** by Piper (e.g., it will say the word "laughs").
- **Background Music/Sound:** Use `[sound: name, volume: X]` (e.g., `[sound: rain, volume:0.3]`). The first such tag applies to the whole output. Sound files from `sounds` directory.
**Optional Gemini Text Enhancement:**
- Enable, then provide a prompt like "Make this text more formal:". Your original text is appended.
- Provide API key if not in Space secrets.
**Piper TTS Parameters:** Control voice speed/style. Current voice: `{DEFAULT_PIPER_VOICE_NAME}`
"""

# Surface a loud warning at the top of the help text when TTS is unusable.
if not PIPER_TTS_AVAILABLE:
    how_to_use_text_piper = "## WARNING: Piper TTS FAILED TO LOAD. Speech synthesis will not work. Check Space logs and `requirements.txt` for `piper-tts` and `onnxruntime`.\n\n" + how_to_use_text_piper
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# EverySpeech by Anand (Piper TTS - Fast CPU)")
    gr.Markdown(how_to_use_text_piper)

    with gr.Row(equal_height=False):
        with gr.Column(scale=2):
            text_input_original_ui = gr.Textbox(
                label="Input Text",
                placeholder="Enter text here...",
                value=default_text_piper,
                lines=7,
            )
            with gr.Accordion("✨ Optional: Gemini Text Enhancement", open=False):
                enable_gemini_ui = gr.Checkbox(
                    label="Enable Gemini Text Enhancement",
                    value=False,
                    visible=GOOGLE_GEMINI_AVAILABLE,
                    info="If checked, your input text will first be processed by Gemini.",
                )
                gemini_enhancement_prompt_ui = gr.Textbox(
                    label="Gemini Enhancement Prompt",
                    placeholder="e.g., Make this more descriptive:",
                    lines=2,
                    info="Your 'Input Text' above will be appended to this prompt for Gemini.",
                    visible=GOOGLE_GEMINI_AVAILABLE,
                )
                gemini_api_key_manual_ui = gr.Textbox(
                    label="Google API Key (if not in secrets or startup config failed)",
                    type="password",
                    placeholder="Paste Google AI API key if needed",
                    visible=GOOGLE_GEMINI_AVAILABLE,  # Show always if SDK there
                    info="Needed if GOOGLE_API_KEY secret isn't set or failed at startup.",
                )
            # No audio_prompt_input for this basic Piper setup
        with gr.Column(scale=1):
            audio_output_ui = gr.Audio(label="Generated Audio", type="numpy", autoplay=False)
            run_button = gr.Button("Generate Audio", variant="primary", scale=1)

    with gr.Accordion("Piper TTS Generation Parameters", open=True, visible=PIPER_TTS_AVAILABLE):
        gr.Markdown(f"Using Piper voice: `{DEFAULT_PIPER_VOICE_NAME}`. Adjust parameters for style.")
        piper_length_scale_ui = gr.Slider(label="Length Scale (Prosody/Speed)", minimum=0.5, maximum=2.0, value=1.0, step=0.05, info="Smaller is faster. Affects prosody.")
        piper_noise_scale_ui = gr.Slider(label="Noise Scale (Variability)", minimum=0.0, maximum=1.5, value=0.667, step=0.05, info="Adds noise for expressiveness. Default: 0.667")
        piper_noise_w_ui = gr.Slider(label="Noise W (Phoneme Duration Noise)", minimum=0.0, maximum=1.5, value=0.8, step=0.05, info="Noise for phoneme pronunciation/duration. Default: 0.8")
        speed_factor_post_tts_ui = gr.Slider(label="Global Speed Factor (Post-TTS Resample)", minimum=0.5, maximum=2.0, value=1.0, step=0.05, info="Overall speed change after TTS generation by resampling.")

    # Input order must match run_inference's signature exactly.
    gradio_inputs_piper = [
        text_input_original_ui,
        piper_length_scale_ui, piper_noise_scale_ui, piper_noise_w_ui,
        speed_factor_post_tts_ui,
        enable_gemini_ui, gemini_enhancement_prompt_ui, gemini_api_key_manual_ui,
    ]

    run_button.click(
        fn=run_inference,
        inputs=gradio_inputs_piper,
        outputs=[audio_output_ui],
        api_name="generate_audio_every_speech_piper",
    )

    gr.Examples(
        examples=[
            ["Hello, this is Piper TTS. It should be very fast on CPU.", 1.0, 0.667, 0.8, 1.0, False, "", ""],
            ["S1=(this will be stripped) {excited} This will also be stripped. Piper will say this part. (laughs) And it will say laughs.", 1.0, 0.7, 0.9, 1.0, False, "", ""],
            ["This text will be made more formal by Gemini. [sound: city, volume:0.2]", 1.2, 0.6, 0.7, 0.9, True, "Make this text extremely formal and verbose:", ""],
            ["[sound: rain]", 1.0, 0.667, 0.8, 1.0, False, "", ""],  # Only sound
        ],
        inputs=gradio_inputs_piper,  # Ensure this matches the function inputs
        outputs=[audio_output_ui],
        fn=run_inference,
        cache_examples=False,  # "lazy" or False is safer with dynamic/file-based inputs
        label="Examples (Piper TTS - Click to Run)",
    )
if __name__ == "__main__":
    # Make sure the sound-effects directory exists before launch.
    print(f"Looking for sound effects in: {SOUND_EFFECTS_DIR.resolve()}")
    if not SOUND_EFFECTS_DIR.exists():
        try:
            SOUND_EFFECTS_DIR.mkdir(parents=True, exist_ok=True)
            print(f"Created sound effects directory: '{SOUND_EFFECTS_DIR}'. Please upload sound files there for [sound:] tags.")
        except Exception as e_mkdir:
            print(f"Error creating sound directory {SOUND_EFFECTS_DIR}: {e_mkdir}")
    else:
        print(f"Sound effects directory '{SOUND_EFFECTS_DIR}' found.")

    # Startup status summary for the logs.
    if not PIPER_TTS_AVAILABLE:
        print("\nCRITICAL WARNING: Piper TTS FAILED TO LOAD or voice files are missing. SPEECH SYNTHESIS WILL NOT WORK.")
        print("Ensure 'piper-tts' and 'onnxruntime' are in requirements.txt and that voice files can be downloaded/found.")
    if not GOOGLE_GEMINI_AVAILABLE:
        print("\nWARNING: 'google-generativeai' package not installed. Gemini features disabled.")
    elif not GEMINI_API_KEY and not gemini_configured_successfully:
        print("\nINFO: GOOGLE_API_KEY secret not found. Gemini features will require manual API key input in the UI if used.")
    elif not gemini_configured_successfully:
        print("\nWARNING: Gemini was not configured successfully at startup (e.g. API key invalid or model access issue). Check logs. Gemini features may not work as expected if used.")
    elif gemini_configured_successfully:
        print("\nINFO: Gemini configured with API key from secrets (or at least key was found).")

    demo.launch()