Spaces:
Running
Running
| import tempfile | |
| import time | |
| from pathlib import Path | |
| from typing import Optional, Tuple, List, Dict | |
| import spaces # Not strictly needed if not using @spaces.GPU decorator for Piper | |
| import re | |
| import os | |
| import json # For Piper voice config (though Piper library handles it mostly) | |
| import gradio as gr | |
| import numpy as np | |
| import soundfile as sf | |
| # import torch # Keep if other parts might need it | |
# --- Piper TTS ---
# Module-level Piper state: these defaults are flipped/populated below once the
# library imports cleanly and (later) a voice model loads.
PIPER_TTS_AVAILABLE = False
PiperVoice = None  # Placeholder for the class
piper_voice_instance = None  # Placeholder for the loaded voice instance
PIPER_VOICE_DIR = Path("./piper_voices")  # Directory to store Piper voices
DEFAULT_PIPER_VOICE_REPO_ID = "rhasspy/piper-voices"
# Choose a specific voice: e.g., en_US-lessac-medium from rhasspy/piper-voices
# The path within the repo is often like: en/en_US/lessac/medium/
DEFAULT_PIPER_VOICE_SUBPATH = "en/en_US/lessac/medium/"  # Trailing slash is important
DEFAULT_PIPER_VOICE_NAME = "en_US-lessac-medium"  # Filename base
try:
    from piper import PiperVoice
    from huggingface_hub import hf_hub_download  # For downloading voices
    PIPER_TTS_AVAILABLE = True
    print("Piper TTS library and huggingface_hub loaded.")
except ImportError:
    print("CRITICAL ERROR: piper-tts or huggingface_hub library not found. Piper TTS features will be disabled. Ensure 'piper-tts', 'onnxruntime', 'huggingface_hub' are in requirements.txt")
def ensure_piper_voice(voice_name_base=DEFAULT_PIPER_VOICE_NAME, voice_subpath=DEFAULT_PIPER_VOICE_SUBPATH):
    """Ensure the Piper voice files (.onnx model and .onnx.json config) exist locally.

    Looks for the files in PIPER_VOICE_DIR (both the flat layout and the
    repo-subpath layout that hf_hub_download produces) and downloads them from
    the Hugging Face Hub if they are missing.

    Args:
        voice_name_base: Filename base of the voice, e.g. "en_US-lessac-medium".
        voice_subpath: Path of the voice inside the repo, e.g. "en/en_US/lessac/medium/".

    Returns:
        (onnx_path, json_path) as Path objects pointing at existing files, or
        (None, None) if the download failed.
    """
    PIPER_VOICE_DIR.mkdir(parents=True, exist_ok=True)
    onnx_filename = f"{voice_name_base}.onnx"
    json_filename = f"{voice_name_base}.onnx.json"
    subpath = voice_subpath.strip("/")
    # BUGFIX: hf_hub_download(local_dir=...) preserves the repo-relative path,
    # so downloaded files land under PIPER_VOICE_DIR/<subpath>/, NOT directly
    # in PIPER_VOICE_DIR. The old code returned the flat paths, which never
    # existed after a download, so the caller's existence check always failed.
    # Check both layouts (flat = manually placed files, nested = hub download).
    candidate_pairs = [
        (PIPER_VOICE_DIR / onnx_filename, PIPER_VOICE_DIR / json_filename),
        (PIPER_VOICE_DIR / subpath / onnx_filename, PIPER_VOICE_DIR / subpath / json_filename),
    ]
    for onnx_path, json_path in candidate_pairs:
        if onnx_path.exists() and json_path.exists():
            print(f"Piper voice '{voice_name_base}' found locally at {onnx_path.parent}.")
            return onnx_path, json_path
    print(f"Downloading Piper voice '{voice_name_base}' from repo '{DEFAULT_PIPER_VOICE_REPO_ID}' subpath '{voice_subpath}'...")
    try:
        # hf_hub_download returns the actual on-disk path of each file; return
        # those so the caller gets paths that really exist.
        onnx_local = hf_hub_download(repo_id=DEFAULT_PIPER_VOICE_REPO_ID,
                                     filename=f"{subpath}/{onnx_filename}",
                                     local_dir=PIPER_VOICE_DIR,
                                     local_dir_use_symlinks=False,  # Recommended for Spaces
                                     repo_type="model")  # Explicitly model type
        json_local = hf_hub_download(repo_id=DEFAULT_PIPER_VOICE_REPO_ID,
                                     filename=f"{subpath}/{json_filename}",
                                     local_dir=PIPER_VOICE_DIR,
                                     local_dir_use_symlinks=False,
                                     repo_type="model")
        print(f"Piper voice '{voice_name_base}' downloaded successfully.")
        return Path(onnx_local), Path(json_local)
    except Exception as e:
        print(f"ERROR downloading Piper voice '{voice_name_base}': {e}")
        gr.Warning(f"Failed to download Piper voice {voice_name_base}. TTS might not work.")
        return None, None  # Signal failure to the caller
# Load the default Piper voice once at import time so the UI can synthesize
# immediately. Any failure here flips PIPER_TTS_AVAILABLE off for the whole
# process (run_inference and the UI both key off this flag).
if PIPER_TTS_AVAILABLE:
    onnx_path, json_path = ensure_piper_voice()
    if onnx_path and json_path and onnx_path.exists() and json_path.exists():  # Double check existence after potential download
        try:
            piper_voice_instance = PiperVoice.load(str(onnx_path), config_path=str(json_path))
            print("Default Piper TTS voice loaded successfully.")
        except Exception as e:
            print(f"ERROR loading Piper voice from {onnx_path} and {json_path}: {e}")
            PIPER_TTS_AVAILABLE = False  # Disable if voice loading fails
    else:
        print("Piper voice files not found or not downloaded correctly. Piper TTS disabled.")
        PIPER_TTS_AVAILABLE = False
from pydub import AudioSegment  # For background sound mixing
# --- Gemini Configuration (Keep your existing Gemini setup) ---
try:
    import google.generativeai as genai
    GOOGLE_GEMINI_AVAILABLE = True
except ImportError:
    GOOGLE_GEMINI_AVAILABLE = False
    print("Warning: google-generativeai library not found. Gemini features will be disabled.")
# API key comes from the environment (Space secrets); when absent the UI
# offers a manual key field instead.
GEMINI_API_KEY = os.environ.get("GOOGLE_API_KEY")
gemini_configured_successfully = False  # True once configure + model creation succeed below
gemini_model_instance = None  # Shared GenerativeModel instance reused across requests
GEMINI_MODEL_NAME = "gemini-1.5-flash-latest"  # Or your preferred Gemini model
if GOOGLE_GEMINI_AVAILABLE and GEMINI_API_KEY:
    try:
        genai.configure(api_key=GEMINI_API_KEY)
        gemini_model_instance = genai.GenerativeModel(GEMINI_MODEL_NAME)
        # Optional: Test call to confirm API key and model access if desired (can slow startup)
        # _ = gemini_model_instance.generate_content("test", generation_config={'candidate_count': 1})
        gemini_configured_successfully = True
        print(f"Google Gemini configured successfully with model: {GEMINI_MODEL_NAME}")
    except Exception as e:
        print(f"Error configuring Google Gemini at startup: {e}. Gemini features might not work correctly.")
elif GOOGLE_GEMINI_AVAILABLE and not GEMINI_API_KEY:
    print("INFO: GOOGLE_API_KEY not found in environment variables/secrets for Gemini. Manual input will be required if feature is used.")
else:
    # NOTE(review): this branch is only reachable when GOOGLE_GEMINI_AVAILABLE
    # is False (the two branches above cover the SDK-available cases), so the
    # first sub-branch below is dead code in practice.
    if GOOGLE_GEMINI_AVAILABLE:  # SDK is there, but no key
        print("INFO: Gemini SDK is available, but no API key found (neither in secrets nor manually provided yet).")
    else:  # SDK itself is missing
        print("INFO: Gemini features disabled (SDK not available).")
def enhance_text_with_gemini(text_to_enhance: str, enhancement_prompt: str, api_key_manual: Optional[str] = None) -> str:
    """Enhance text using Google Gemini, falling back to the original text on any failure.

    Args:
        text_to_enhance: The user's raw input text.
        enhancement_prompt: Instruction prepended to the text (e.g. "Make this formal:").
        api_key_manual: Optional API key to use for this single request,
            temporarily overriding the key configured at startup.

    Returns:
        The Gemini-enhanced text, or ``text_to_enhance`` unchanged if the SDK
        is missing, no key is available, the request fails, or the response is
        blocked/empty.
    """
    if not GOOGLE_GEMINI_AVAILABLE:
        gr.Warning("Gemini SDK not available. Cannot enhance text.")
        return text_to_enhance
    active_gemini_model_instance = None
    temp_gemini_configured_manually = False
    # BUGFIX: the previous code read genai.API_KEY, an attribute the SDK does
    # not expose, so the revert key was always None and the post-request revert
    # never ran -- the global SDK config stayed stuck on the manual key. Revert
    # instead to the key this module actually configured at startup (if any).
    original_global_api_key_for_revert = GEMINI_API_KEY if gemini_configured_successfully else None
    if api_key_manual:
        gr.Info("Attempting to use manually provided API key for this Gemini request...")
        try:
            # genai.configure is process-global; reverted in the finally below.
            genai.configure(api_key=api_key_manual)
            active_gemini_model_instance = genai.GenerativeModel(GEMINI_MODEL_NAME)
            _ = active_gemini_model_instance.generate_content("test", generation_config={'candidate_count': 1})  # Test call
            temp_gemini_configured_manually = True
            print("Gemini temporarily configured and tested with manual API key for this request.")
        except Exception as e:
            gr.Error(f"Failed to configure or test Gemini with provided manual API key: {e}")
            if original_global_api_key_for_revert and hasattr(genai, 'configure'):  # Revert if there was an original key
                try: genai.configure(api_key=original_global_api_key_for_revert)
                except Exception: pass
            return text_to_enhance
    elif gemini_configured_successfully and gemini_model_instance:
        active_gemini_model_instance = gemini_model_instance
    else:
        gr.Warning("Google API Key for Gemini not provided or Gemini not configured correctly. Cannot enhance text.")
        return text_to_enhance
    if not active_gemini_model_instance:
        gr.Error("Gemini model instance could not be initialized. Cannot enhance text.")
        return text_to_enhance
    full_prompt = f"{enhancement_prompt}:\n\n---\n{text_to_enhance}\n---"
    print(f"Sending to Gemini: First 100 chars of prompt: {full_prompt[:100]}...")
    try:
        gen_config = {'candidate_count': 1}
        response = active_gemini_model_instance.generate_content(full_prompt, generation_config=gen_config)
        if response.parts:
            enhanced_text = response.text
            print(f"Gemini enhanced text (first 100 chars): {enhanced_text[:100]}")
            return enhanced_text
        else:  # Handle no parts / blocked response
            block_reason = "Unknown"; safety_ratings_str = "N/A"
            if hasattr(response, 'prompt_feedback') and response.prompt_feedback:
                block_reason = response.prompt_feedback.block_reason or "Not Blocked but no parts"
                if hasattr(response.prompt_feedback, 'safety_ratings'):
                    safety_ratings_str = ", ".join([f"{rating.category.name}: {rating.probability.name}" for rating in response.prompt_feedback.safety_ratings])
            elif hasattr(response, 'candidates') and response.candidates and hasattr(response.candidates[0], 'finish_reason') and response.candidates[0].finish_reason != 1:  # 1 is typically "STOP"
                block_reason = f"Finish Reason: {response.candidates[0].finish_reason.name if hasattr(response.candidates[0].finish_reason, 'name') else response.candidates[0].finish_reason}"
            gr.Warning(f"Gemini returned no content. Block Reason: {block_reason}. Safety: {safety_ratings_str}. Using original text.")
            return text_to_enhance
    except Exception as e:
        gr.Error(f"Error calling Gemini API: {type(e).__name__}: {e}")
        return text_to_enhance
    finally:  # Revert manual key config if it was used and there was an original key
        if temp_gemini_configured_manually and original_global_api_key_for_revert and hasattr(genai, 'configure'):
            try:
                genai.configure(api_key=original_global_api_key_for_revert)
                print("Reverted Gemini configuration to use original API key (if it was set).")
            except Exception as revert_e:
                print(f"Warning: Could not revert Gemini config: {revert_e}")
        elif temp_gemini_configured_manually and not original_global_api_key_for_revert and hasattr(genai, 'configure'):
            # With no original global key there is nothing to restore; the
            # manual key simply remains the active global configuration until
            # another configure() call replaces it.
            print("Manual Gemini API key was used. Global API key (from secrets) remains unset if it wasn't there before.")
# --- Text Parsing for Piper TTS and Background Sounds ---
def parse_text_for_piper_and_sounds(text: str) -> Tuple[str, List[Dict]]:
    """Prepare raw input text for Piper TTS.

    Removes `S1=(...)`/`S2=(...)` speaker tags and `{characteristic}` tags
    (Piper would read them out literally), and extracts every
    `[sound: name, volume: X]` directive into a request list. Non-verbal cues
    such as `(laughs)` are deliberately left in the text, so Piper will speak
    them as plain words.

    Returns:
        `(tts_text, sound_requests)` -- the cleaned, stripped text plus a list
        of dicts with keys "name" (lowercased) and "volume_factor" (clamped to
        [0, 1], defaulting to 0.5 when no volume is given).
    """
    collected_sounds: List[Dict] = []
    # Drop the tags Piper has no concept of.
    without_speakers = re.sub(r"S[12]=\([^)]+\)", "", text)
    cleaned = re.sub(r"\{[^}]+\}", "", without_speakers)

    def _consume_sound_tag(match):
        inner = match.group(1)  # everything between "[sound:" and "]"
        try:
            # Rebuild the full tag so the name/volume patterns anchor on the
            # literal "sound:"/"volume:" markers, exactly as typed.
            rebuilt_tag = f"[sound:{inner}]"
            name_hit = re.search(r"sound:\s*([\w\-\_]+)", rebuilt_tag)
            if not name_hit:
                gr.Warning(f"Malformed sound tag (missing sound name): '[sound:{inner}]'. Will be spoken as text.")
                return match.group(0)  # keep the malformed tag so it is spoken
            vol_hit = re.search(r"volume:\s*([0-9\.]+)", rebuilt_tag)
            requested_volume = float(vol_hit.group(1)) if vol_hit else 0.5
            collected_sounds.append({
                "name": name_hit.group(1).lower(),
                "volume_factor": max(0.0, min(1.0, requested_volume)),
            })
            return ""  # strip the directive from the spoken text
        except Exception as e:
            gr.Warning(f"Could not parse sound tag '[sound:{inner}]': {e}. Will be spoken as text.")
            return match.group(0)  # fall back to speaking the tag verbatim

    tts_text = re.sub(r"\[sound:([^\]]+)\]", _consume_sound_tag, cleaned)
    return tts_text.strip(), collected_sounds
# --- Background Sound Configuration ---
SOUND_EFFECTS_DIR = Path("./sounds")  # Directory scanned for [sound: name] files
SUPPORTED_SOUND_FORMATS_BG = [".wav", ".mp3"]  # Extensions tried, in this order
DEFAULT_BACKGROUND_VOLUME_ADJUST_DB = -18  # Made background sounds a bit quieter
# --- Main Inference Function ---
# @spaces.GPU # Not needed for Piper as it's CPU-fast
def run_inference(
    text_input_original: str,
    piper_length_scale: float,
    piper_noise_scale: float,
    piper_noise_w: float,
    speed_factor_post_tts: float,
    enable_gemini_enhancement: bool,
    gemini_enhancement_prompt: str,
    gemini_api_key_manual: Optional[str]
) -> Tuple[int, np.ndarray]:
    """Full pipeline: optional Gemini rewrite -> tag parsing -> Piper TTS ->
    post-TTS speed resample -> optional background-sound mixing.

    Returns a Gradio-compatible ``(sample_rate, int16 numpy audio)`` tuple.
    Raises gr.Error only when the input is empty and no sound was requested.
    """
    if not PIPER_TTS_AVAILABLE or not piper_voice_instance:
        gr.Error("Piper TTS is not available or a voice model is not loaded. Cannot synthesize speech.")
        return (22050, np.zeros(100, dtype=np.int16))  # Default SR, short silence
    # 1. Gemini Enhancement (if enabled)
    processed_text = text_input_original
    if enable_gemini_enhancement:
        if gemini_enhancement_prompt and gemini_enhancement_prompt.strip():
            gr.Info("Enhancing text with Gemini...")
            processed_text = enhance_text_with_gemini(
                text_input_original,
                gemini_enhancement_prompt,
                gemini_api_key_manual
            )
        else:
            gr.Warning("Gemini enhancement enabled, but the enhancement prompt is empty. Using original text.")
    # 2. Parse for Piper TTS text and [sound:...] directives
    text_for_piper, sound_requests = parse_text_for_piper_and_sounds(processed_text)
    print(f"Text for Piper TTS: '{text_for_piper}'")
    print(f"Sound requests: {sound_requests}")
    # 3. Piper TTS Generation
    generated_speech_np = np.array([], dtype=np.int16)  # Piper outputs int16
    output_sr = piper_voice_instance.config.sample_rate if piper_voice_instance.config else 22050  # Default if config missing
    if text_for_piper:
        gr.Info(f"Synthesizing with Piper TTS...")
        synthesis_kwargs = {
            'length_scale': piper_length_scale,
            'noise_scale': piper_noise_scale,
            'noise_w': piper_noise_w,
        }
        start_time = time.time()
        # Piper's synthesize method directly gives audio bytes or chunks of bytes
        # NOTE(review): this assumes PiperVoice.synthesize yields raw int16 PCM
        # chunks -- confirm against the installed piper-tts version (some
        # versions instead write to a wav file and expose a separate
        # synthesize_stream_raw for raw bytes).
        try:
            audio_bytes = b"".join(piper_voice_instance.synthesize(text_for_piper, **synthesis_kwargs))
            end_time = time.time()
            print(f"Piper TTS Generation finished in {end_time - start_time:.2f} seconds.")
            if audio_bytes:
                generated_speech_np = np.frombuffer(audio_bytes, dtype=np.int16)
            else:
                gr.Warning("Piper TTS produced no audio bytes for the input text.")
        except Exception as e_piper:
            gr.Error(f"Error during Piper TTS synthesis: {e_piper}")
            # Fallback to silence if Piper fails
            generated_speech_np = np.zeros(int(output_sr * 0.1), dtype=np.int16)
    elif sound_requests:  # Only sound requests, no text for Piper
        gr.Info("No text for Piper TTS. Will generate background sound over silence if requested.")
        generated_speech_np = np.zeros(int(output_sr * 0.1), dtype=np.int16)  # 0.1 sec silence for mixing
    else:  # No text and no sound requests
        if not (text_input_original and text_input_original.strip()):  # Check if original input was also empty
            raise gr.Error("Input text is empty and no sound was requested.")
        else:  # Original text was there but got stripped to nothing, and no sound requests
            gr.Warning("After processing, no text remained for TTS and no background sound was requested.")
            generated_speech_np = np.zeros(int(output_sr * 0.1), dtype=np.int16)
    # 4. Post-TTS Speed Adjustment (Applied to the speech generated by Piper)
    # Linear-interpolation resample: changes duration AND pitch together.
    if speed_factor_post_tts != 1.0 and len(generated_speech_np) > 0:
        # Convert to float for interpolation, then back to int16
        # Ensure normalization to avoid clipping if values are already near max int16
        max_val = np.iinfo(np.int16).max
        generated_speech_float = generated_speech_np.astype(np.float32) / max_val
        original_len = len(generated_speech_float)
        speed_factor_clamped = max(0.5, min(2.0, speed_factor_post_tts))
        target_len = int(original_len / speed_factor_clamped)
        if target_len != original_len and target_len > 0:
            x_original = np.arange(original_len)
            x_resampled = np.linspace(0, original_len - 1, target_len)
            resampled_float = np.interp(x_resampled, x_original, generated_speech_float)
            # Clip before converting back to int16
            generated_speech_np = (np.clip(resampled_float, -1.0, 1.0) * max_val).astype(np.int16)
            print(f"Applied post-TTS speed factor {speed_factor_clamped}")
        else:
            print("Skipped post-TTS speed adjustment (target length invalid or no change).")
    # 5. Background Sound Mixing
    final_audio_data_np = generated_speech_np  # Start with (potentially speed-adjusted) speech
    final_sr = output_sr
    if sound_requests:  # Only one overall background sound is handled from the first [sound:] tag
        overall_sound_effect_to_mix = sound_requests[0]
        sound_name = overall_sound_effect_to_mix["name"]
        volume_factor = overall_sound_effect_to_mix["volume_factor"]
        # First existing <name>.wav / <name>.mp3 in SOUND_EFFECTS_DIR, else None
        bg_sound_file_path = next((p for fmt in SUPPORTED_SOUND_FORMATS_BG for p in [(SOUND_EFFECTS_DIR / (sound_name + fmt))] if p.exists()), None)
        if bg_sound_file_path:
            print(f"Mixing with background sound: {bg_sound_file_path}")
            try:
                # Convert current final_audio_data_np (which is speech) to AudioSegment
                speech_segment = AudioSegment(
                    data=final_audio_data_np.tobytes(),
                    sample_width=final_audio_data_np.dtype.itemsize,  # Should be 2 for int16
                    frame_rate=final_sr,
                    channels=1  # Piper output is mono
                )
                # Load background sound
                bg_segment = AudioSegment.from_file(bg_sound_file_path).set_channels(1).set_frame_rate(final_sr)
                # Adjust background sound volume: convert the 0-1 volume factor
                # to a dB gain (20*log10), with a -60 dB floor for near-zero.
                db_reduction = 0.0
                if volume_factor > 0.001 and volume_factor < 1.0:
                    db_reduction = 20 * np.log10(volume_factor)
                elif volume_factor <= 0.001:  # Make it very quiet if volume is near zero
                    db_reduction = -60  # Effectively very quiet
                bg_segment_adjusted = bg_segment + db_reduction + DEFAULT_BACKGROUND_VOLUME_ADJUST_DB
                # Ensure speech_segment is not shorter than bg_segment if only bg sound is effectively playing
                if len(speech_segment) < 100 and len(bg_segment_adjusted) > 0:  # If speech is very short (e.g. default silence)
                    # Make the "speech" part long enough to hear the background sound for a bit
                    target_duration_ms = max(len(speech_segment), min(len(bg_segment_adjusted), 5000))  # cap at 5s
                    speech_segment = AudioSegment.silent(duration=target_duration_ms, frame_rate=final_sr)
                # Overlay (loop background if shorter than speech)
                # Ensure bg_segment_adjusted is not zero length before division
                loop_times = int(len(speech_segment) / len(bg_segment_adjusted)) + 1 if len(bg_segment_adjusted) > 0 else 1
                mixed_segment = speech_segment.overlay(bg_segment_adjusted, loop=True, times=loop_times)
                final_audio_data_np = np.array(mixed_segment.get_array_of_samples()).astype(np.int16)
                final_sr = mixed_segment.frame_rate  # Should be same as output_sr
            except Exception as e:
                gr.Warning(f"Could not mix background sound '{sound_name}': {e}")
                print(f"Error mixing sound: {e}"); import traceback; traceback.print_exc()
                # Fallback to just the speech if mixing fails (already in final_audio_data_np)
        else:
            gr.Warning(f"Background sound '{sound_name}' not found in {SOUND_EFFECTS_DIR}. Using speech only.")
    # Ensure there's some audio, even if it's short silence, if all inputs were valid but produced no sound
    if len(final_audio_data_np) == 0:
        print("Warning: Final audio data is empty. Returning short silence.")
        final_audio_data_np = np.zeros(int(final_sr * 0.1), dtype=np.int16)
    return (final_sr, final_audio_data_np)
# --- Gradio UI ---
css = """
#col-container {max-width: 90%; margin-left: auto; margin-right: auto;}
.gr-prose {font-size: 100% !important;} .gr-prose h1 {font-size: 2.5em !important;}
.gr-prose h2 {font-size: 1.8em !important;} .gr-prose p {font-size: 1.1em !important; margin-bottom: 0.5em !important;}
.gr-prose ul {font-size: 1.0em !important; margin-left: 20px !important;} .gr-prose li { margin-bottom: 0.3em !important;}
"""
# Built-in demo text; overridden by ./example.txt below if that file exists.
default_text_piper = """Welcome to EverySpeech with the speedy Piper TTS!
S1=(This speaker tag will be removed) {excited} This characteristic tag will also be removed.
(laughs) This (laughs) tag will be read out by Piper if not stripped by a more aggressive parser.
You can add background sound too: [sound: city, volume: 0.3] I am speaking over city noise.
"""
example_txt_path = Path("./example.txt")  # You can create this if you want
if example_txt_path.exists():
    try:
        loaded_default_text = example_txt_path.read_text(encoding="utf-8").strip()
        if loaded_default_text: default_text_piper = loaded_default_text
    except Exception as e: print(f"Warning: Could not read example.txt: {e}")
# Markdown help text shown at the top of the UI (f-string: interpolates the
# default voice name).
how_to_use_text_piper = f"""
## How to Use EverySpeech (Piper TTS Edition):
**Fast CPU-based Text-to-Speech!**
- Write text in **Input Text**.
- `S1=(description)` and `{{characteristic}}` tags are currently **stripped** before sending to Piper TTS.
- Non-verbal tags like `(laughs)` or `(coughs)` will be **read as text** by Piper (e.g., it will say the word "laughs").
- **Background Music/Sound:** Use `[sound: name, volume: X]` (e.g., `[sound: rain, volume:0.3]`). The first such tag applies to the whole output. Sound files from `sounds` directory.
**Optional Gemini Text Enhancement:**
- Enable, then provide a prompt like "Make this text more formal:". Your original text is appended.
- Provide API key if not in Space secrets.
**Piper TTS Parameters:** Control voice speed/style. Current voice: `{DEFAULT_PIPER_VOICE_NAME}`
"""
# Prepend a prominent warning when TTS failed to initialize at import time.
if not PIPER_TTS_AVAILABLE:
    how_to_use_text_piper = "## WARNING: Piper TTS FAILED TO LOAD. Speech synthesis will not work. Check Space logs and `requirements.txt` for `piper-tts` and `onnxruntime`.\n\n" + how_to_use_text_piper
# Declarative UI layout: left column = text input + Gemini accordion,
# right column = audio output + run button, then Piper parameter sliders.
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# EverySpeech by Anand (Piper TTS - Fast CPU)")
    gr.Markdown(how_to_use_text_piper)
    with gr.Row(equal_height=False):
        with gr.Column(scale=2):
            text_input_original_ui = gr.Textbox(
                label="Input Text",
                placeholder="Enter text here...", value=default_text_piper, lines=7,
            )
            # Gemini controls are hidden entirely when the SDK is missing.
            with gr.Accordion("✨ Optional: Gemini Text Enhancement", open=False):
                enable_gemini_ui = gr.Checkbox(label="Enable Gemini Text Enhancement", value=False,
                                               visible=GOOGLE_GEMINI_AVAILABLE,
                                               info="If checked, your input text will first be processed by Gemini.")
                gemini_enhancement_prompt_ui = gr.Textbox(label="Gemini Enhancement Prompt",
                                                          placeholder="e.g., Make this more descriptive:", lines=2,
                                                          info="Your 'Input Text' above will be appended to this prompt for Gemini.",
                                                          visible=GOOGLE_GEMINI_AVAILABLE)
                gemini_api_key_manual_ui = gr.Textbox(label="Google API Key (if not in secrets or startup config failed)", type="password",
                                                      placeholder="Paste Google AI API key if needed",
                                                      visible=GOOGLE_GEMINI_AVAILABLE,  # Show always if SDK there
                                                      info="Needed if GOOGLE_API_KEY secret isn't set or failed at startup.")
            # No audio_prompt_input for this basic Piper setup
        with gr.Column(scale=1):
            # type="numpy" matches run_inference's (sr, ndarray) return value.
            audio_output_ui = gr.Audio(label="Generated Audio", type="numpy", autoplay=False)
            run_button = gr.Button("Generate Audio", variant="primary", scale=1)
    with gr.Accordion("Piper TTS Generation Parameters", open=True, visible=PIPER_TTS_AVAILABLE):
        gr.Markdown(f"Using Piper voice: `{DEFAULT_PIPER_VOICE_NAME}`. Adjust parameters for style.")
        piper_length_scale_ui = gr.Slider(label="Length Scale (Prosody/Speed)", minimum=0.5, maximum=2.0, value=1.0, step=0.05, info="Smaller is faster. Affects prosody.")
        piper_noise_scale_ui = gr.Slider(label="Noise Scale (Variability)", minimum=0.0, maximum=1.5, value=0.667, step=0.05, info="Adds noise for expressiveness. Default: 0.667")
        piper_noise_w_ui = gr.Slider(label="Noise W (Phoneme Duration Noise)", minimum=0.0, maximum=1.5, value=0.8, step=0.05, info="Noise for phoneme pronunciation/duration. Default: 0.8")
        speed_factor_post_tts_ui = gr.Slider(label="Global Speed Factor (Post-TTS Resample)", minimum=0.5, maximum=2.0, value=1.0, step=0.05, info="Overall speed change after TTS generation by resampling.")
    # Order here must match run_inference's positional parameters exactly.
    gradio_inputs_piper = [
        text_input_original_ui,
        piper_length_scale_ui, piper_noise_scale_ui, piper_noise_w_ui,
        speed_factor_post_tts_ui,
        enable_gemini_ui, gemini_enhancement_prompt_ui, gemini_api_key_manual_ui
    ]
    run_button.click(
        fn=run_inference,
        inputs=gradio_inputs_piper,
        outputs=[audio_output_ui],
        api_name="generate_audio_every_speech_piper",
    )
    # Each example row supplies one value per component in gradio_inputs_piper.
    gr.Examples(
        examples=[
            ["Hello, this is Piper TTS. It should be very fast on CPU.", 1.0, 0.667, 0.8, 1.0, False, "", ""],
            ["S1=(this will be stripped) {excited} This will also be stripped. Piper will say this part. (laughs) And it will say laughs.", 1.0, 0.7, 0.9, 1.0, False, "", ""],
            ["This text will be made more formal by Gemini. [sound: city, volume:0.2]", 1.2, 0.6, 0.7, 0.9, True, "Make this text extremely formal and verbose:", ""],
            ["[sound: rain]", 1.0, 0.667, 0.8, 1.0, False, "", ""],  # Only sound
        ],
        inputs=gradio_inputs_piper,  # Ensure this matches the function inputs
        outputs=[audio_output_ui],
        fn=run_inference,
        cache_examples=False,  # "lazy" or False is safer with dynamic/file-based inputs
        label="Examples (Piper TTS - Click to Run)",
    )
# Entry point: report environment status (sound dir, Piper, Gemini) then
# launch the Gradio app. All heavy initialization already happened at import.
if __name__ == "__main__":
    print(f"Looking for sound effects in: {SOUND_EFFECTS_DIR.resolve()}")
    if not SOUND_EFFECTS_DIR.exists():
        try:
            SOUND_EFFECTS_DIR.mkdir(parents=True, exist_ok=True)
            print(f"Created sound effects directory: '{SOUND_EFFECTS_DIR}'. Please upload sound files there for [sound:] tags.")
        except Exception as e_mkdir:
            print(f"Error creating sound directory {SOUND_EFFECTS_DIR}: {e_mkdir}")
    else:
        print(f"Sound effects directory '{SOUND_EFFECTS_DIR}' found.")
    if not PIPER_TTS_AVAILABLE:
        print("\nCRITICAL WARNING: Piper TTS FAILED TO LOAD or voice files are missing. SPEECH SYNTHESIS WILL NOT WORK.")
        print("Ensure 'piper-tts' and 'onnxruntime' are in requirements.txt and that voice files can be downloaded/found.")
    # Gemini status summary, most severe condition first.
    if not GOOGLE_GEMINI_AVAILABLE:
        print("\nWARNING: 'google-generativeai' package not installed. Gemini features disabled.")
    elif not GEMINI_API_KEY and not gemini_configured_successfully :
        print("\nINFO: GOOGLE_API_KEY secret not found. Gemini features will require manual API key input in the UI if used.")
    elif not gemini_configured_successfully:
        print("\nWARNING: Gemini was not configured successfully at startup (e.g. API key invalid or model access issue). Check logs. Gemini features may not work as expected if used.")
    elif gemini_configured_successfully:
        print("\nINFO: Gemini configured with API key from secrets (or at least key was found).")
    demo.launch()