Spaces:

johnbridges
/

NetMonTTS

Running

App Files Files Community

johnbridges committed on Jun 3, 2025

Commit

df3ba3c

verified ·

1 Parent(s): db0a2ce

Update app.py

Browse files

Files changed (1) hide show

app.py +148 -83

app.py CHANGED Viewed

@@ -11,15 +11,14 @@ import uuid
 import logging
 from flask_cors import CORS
 import threading
 import tempfile
 from huggingface_hub import snapshot_download
-from huggingface_hub.utils import RepositoryNotFoundError, HfHubHTTPError
-import time
 from tts_processor import preprocess_all
 import hashlib
 # Configure logging
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 app = Flask(__name__)
@@ -38,13 +37,52 @@ SERVE_DIR = os.environ.get("SERVE_DIR", "./files")  # Default to './files' if no
 os.makedirs(SERVE_DIR, exist_ok=True)
 def validate_audio_file(file):
-    if file.content_type not in ["audio/wav", "audio/x-wav", "audio/mpeg", "audio/mp3"]:
-        raise ValueError("Unsupported file type")
     file.seek(0, os.SEEK_END)
     file_size = file.tell()
     file.seek(0)  # Reset file pointer
-    if file_size > 10 * 1024 * 1024:  # 10 MB limit
-        raise ValueError("File is too large (max 10 MB)")
 def validate_text_input(text):
     if not isinstance(text, str):
@@ -66,72 +104,59 @@ def is_cached(cached_file_path):
     exists = os.path.exists(cached_file_path)  # Perform disk check
     file_cache[cached_file_path] = exists  # Update the cache
     return exists
-import time
-from huggingface_hub import snapshot_download
-from huggingface_hub.utils import RepositoryNotFoundError, HfHubHTTPError
 def initialize_models():
     global sess, voice_style, processor, whisper_model
-    max_retries = 5  # Maximum number of retries
-    retry_delay = 2  # Initial delay in seconds (will double after each retry)
-    for attempt in range(max_retries):
-        try:
-            # Download the ONNX model if not already downloaded
-            if not os.path.exists(model_path):
-                logger.info(f"Attempt {attempt + 1} to download and load Kokoro model...")
-                kokoro_dir = snapshot_download(kokoro_model_id, cache_dir=model_path)
-                logger.info(f"Kokoro model directory: {kokoro_dir}")
-            else:
-                kokoro_dir = model_path
-                logger.info(f"Using cached Kokoro model directory: {kokoro_dir}")
-            # Validate ONNX file path
-            onnx_path = None
-            for root, _, files in os.walk(kokoro_dir):
-                if 'model.onnx' in files:
-                    onnx_path = os.path.join(root, 'model.onnx')
-                    break
-            if not onnx_path or not os.path.exists(onnx_path):
-                raise FileNotFoundError(f"ONNX file not found after redownload at {kokoro_dir}")
-            logger.info("Loading ONNX session...")
-            sess = InferenceSession(onnx_path)
-            logger.info(f"ONNX session loaded successfully from {onnx_path}")
-            # Load the voice style vector
-            voice_style_path = None
-            for root, _, files in os.walk(kokoro_dir):
-                if f'{voice_name}.bin' in files:
-                    voice_style_path = os.path.join(root, f'{voice_name}.bin')
-                    break
-            if not voice_style_path or not os.path.exists(voice_style_path):
-                raise FileNotFoundError(f"Voice style file not found at {voice_style_path}")
-            logger.info("Loading voice style vector...")
-            voice_style = np.fromfile(voice_style_path, dtype=np.float32).reshape(-1, 1, 256)
-            logger.info(f"Voice style vector loaded successfully from {voice_style_path}")
-            # Initialize Whisper model for S2T
-            logger.info("Downloading and loading Whisper model...")
-            processor = WhisperProcessor.from_pretrained("openai/whisper-base")
-            whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
-            whisper_model.config.forced_decoder_ids = None
-            logger.info("Whisper model loaded successfully")
-            # If everything succeeds, break out of the retry loop
-            break
-        except (RepositoryNotFoundError, HfHubHTTPError, FileNotFoundError) as e:
-            logger.error(f"Attempt {attempt + 1} failed: {str(e)}")
-            if attempt == max_retries - 1:
-                logger.error("Max retries reached. Failed to initialize models.")
-                raise  # Re-raise the exception if max retries are reached
-            time.sleep(retry_delay)
-            retry_delay *= 2  # Exponential backoff
 # Initialize models
 initialize_models()
@@ -221,24 +246,60 @@ def generate_audio():
             return jsonify({"status": "error", "message": str(e)}), 500
 # Speech-to-Text (S2T) Endpoint
 @app.route('/transcribe_audio', methods=['POST'])
 def transcribe_audio():
-    """Speech-to-Text (S2T) Endpoint"""
     with global_lock:  # Acquire global lock to ensure only one instance runs
-        audio_path = None
         try:
             logger.debug("Received request to /transcribe_audio")
             file = request.files['file']
-            validate_audio_file(file)
-            # Generate a unique filename using uuid
-            unique_filename = f"{uuid.uuid4().hex}_{file.filename}"
-            audio_path = os.path.join("/tmp", unique_filename)
-            file.save(audio_path)
-            logger.debug(f"Audio file saved to {audio_path}")
-            # Load and preprocess audio
             logger.debug("Processing audio for transcription...")
-            audio_array, sampling_rate = librosa.load(audio_path, sr=16000)
             input_features = processor(
                 audio_array,
@@ -257,10 +318,14 @@ def transcribe_audio():
             logger.error(f"Error transcribing audio: {str(e)}")
             return jsonify({"status": "error", "message": str(e)}), 500
         finally:
-            # Ensure temporary file is removed
-            if audio_path and os.path.exists(audio_path):
-                os.remove(audio_path)
-                logger.debug(f"Temporary file {audio_path} removed")
 @app.route('/files/<filename>', methods=['GET'])
 def serve_wav_file(filename):

 import logging
 from flask_cors import CORS
 import threading
+import werkzeug
 import tempfile
 from huggingface_hub import snapshot_download
 from tts_processor import preprocess_all
 import hashlib
 # Configure logging
+logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 app = Flask(__name__)
 os.makedirs(SERVE_DIR, exist_ok=True)
 def validate_audio_file(file):
+    """Validates audio files including WebM/Opus format"""
+    if not isinstance(file, werkzeug.datastructures.FileStorage):
+        raise ValueError("Invalid file type")
+    # Supported MIME types (add WebM/Opus)
+    supported_types = [
+        "audio/wav",
+        "audio/x-wav",
+        "audio/mpeg",
+        "audio/mp3",
+        "audio/webm",
+        "audio/ogg"  # For Opus in Ogg container
+    ]
+    # Check MIME type
+    if file.content_type not in supported_types:
+        raise ValueError(f"Unsupported file type. Must be one of: {', '.join(supported_types)}")
+    # Check file size
     file.seek(0, os.SEEK_END)
     file_size = file.tell()
     file.seek(0)  # Reset file pointer
+    max_size = 10 * 1024 * 1024  # 10 MB
+    if file_size > max_size:
+        raise ValueError(f"File is too large (max {max_size//(1024*1024)} MB)")
+    # Optional: Verify file header matches content_type
+    if not verify_audio_header(file):
+        raise ValueError("File header doesn't match declared content type")
def verify_audio_header(file):
    """Quickly check that the file's leading bytes match its declared type.

    Reads the first four bytes, rewinds the file to offset 0, and compares
    the bytes with the signature expected for ``file.content_type``.
    Parameters on the content type (e.g. ``;codecs=opus``) and letter case
    are ignored.

    Args:
        file: A file-like object exposing ``read``, ``seek`` and a
            ``content_type`` attribute.

    Returns:
        bool: True if the header matches the declared type, or if the type
        is not one this function knows how to verify; False otherwise.
    """
    header = file.read(4)
    file.seek(0)  # Rewind after reading

    # Compare on the bare MIME type so "audio/webm;codecs=opus" is still
    # verified as WebM instead of silently skipping the check.
    mime = (file.content_type or "").split(";", 1)[0].strip().lower()

    if mime == "audio/webm":
        # EBML magic number used by WebM/Matroska
        return header.startswith(b'\x1aE\xdf\xa3')
    if mime == "audio/ogg":
        return header.startswith(b'OggS')
    if mime in ("audio/wav", "audio/x-wav"):
        return header.startswith(b'RIFF')
    if mime in ("audio/mpeg", "audio/mp3"):
        # Many MP3 files begin with an ID3v2 tag rather than an audio frame.
        if header.startswith(b'ID3'):
            return True
        # MPEG audio frame sync: 11 set bits (0xFF, then the top 3 bits of
        # the next byte). This covers \xff\xfb as well as \xff\xfa,
        # \xff\xf3, \xff\xf2 and the MPEG-2.5 variants.
        return len(header) >= 2 and header[0] == 0xFF and (header[1] & 0xE0) == 0xE0
    return True  # Skip verification for other types
 def validate_text_input(text):
     if not isinstance(text, str):
     exists = os.path.exists(cached_file_path)  # Perform disk check
     file_cache[cached_file_path] = exists  # Update the cache
     return exists
+# Initialize models
 def initialize_models():
     global sess, voice_style, processor, whisper_model
+    try:
+        # Download the ONNX model if not already downloaded
+        if not os.path.exists(model_path):
+            logger.info("Downloading and loading Kokoro model...")
+            kokoro_dir = snapshot_download(kokoro_model_id, cache_dir=model_path)
+            logger.info(f"Kokoro model directory: {kokoro_dir}")
+        else:
+            kokoro_dir = model_path
+            logger.info(f"Using cached Kokoro model directory: {kokoro_dir}")
+        # Validate ONNX file path
+        onnx_path = None
+        for root, _, files in os.walk(kokoro_dir):
+            if 'model.onnx' in files:
+                onnx_path = os.path.join(root, 'model.onnx')
+                break
+        if not onnx_path or not os.path.exists(onnx_path):
+            raise FileNotFoundError(f"ONNX file not found after redownload at {kokoro_dir}")
+        logger.info("Loading ONNX session...")
+        sess = InferenceSession(onnx_path)
+        logger.info(f"ONNX session loaded successfully from {onnx_path}")
+        # Load the voice style vector
+        voice_style_path = None
+        for root, _, files in os.walk(kokoro_dir):
+            if f'{voice_name}.bin' in files:
+                voice_style_path = os.path.join(root, f'{voice_name}.bin')
+                break
+        if not voice_style_path or not os.path.exists(voice_style_path):
+            raise FileNotFoundError(f"Voice style file not found at {voice_style_path}")
+        logger.info("Loading voice style vector...")
+        voice_style = np.fromfile(voice_style_path, dtype=np.float32).reshape(-1, 1, 256)
+        logger.info(f"Voice style vector loaded successfully from {voice_style_path}")
+        # Initialize Whisper model for S2T
+        logger.info("Downloading and loading Whisper model...")
+        processor = WhisperProcessor.from_pretrained("openai/whisper-base")
+        whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
+        whisper_model.config.forced_decoder_ids = None
+        logger.info("Whisper model loaded successfully")
+    except Exception as e:
+        logger.error(f"Error initializing models: {str(e)}")
+        raise
 # Initialize models
 initialize_models()
             return jsonify({"status": "error", "message": str(e)}), 500
 # Speech-to-Text (S2T) Endpoint
+# Add these imports at the top with the other imports
+import subprocess
+import tempfile
+from pathlib import Path
+# Then update the transcribe_audio function:
 @app.route('/transcribe_audio', methods=['POST'])
 def transcribe_audio():
+    """Speech-to-Text (S2T) Endpoint with automatic format conversion"""
     with global_lock:  # Acquire global lock to ensure only one instance runs
+        input_audio_path = None
+        converted_audio_path = None
         try:
             logger.debug("Received request to /transcribe_audio")
             file = request.files['file']
+            # Create temporary files for both input and output
+            with tempfile.NamedTemporaryFile(delete=False, suffix=Path(file.filename).suffix) as input_temp:
+                input_audio_path = input_temp.name
+                file.save(input_audio_path)
+                logger.debug(f"Original audio file saved to {input_audio_path}")
+            # Create a temporary file for the converted WAV
+            with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as output_temp:
+                converted_audio_path = output_temp.name
+            # Convert to WAV with ffmpeg (16kHz, mono)
+            logger.debug(f"Converting audio to 16kHz mono WAV format...")
+            conversion_command = [
+                'ffmpeg',
+                '-y',                  # Force overwrite without prompting
+                '-i', input_audio_path,
+                '-acodec', 'pcm_s16le', # 16-bit PCM
+                '-ac', '1',             # mono
+                '-ar', '16000',         # 16kHz sample rate
+                '-af', 'highpass=f=80,lowpass=f=7500,afftdn=nr=10:nf=-25,loudnorm=I=-16:TP=-1.5:LRA=11',  # Audio cleanup filters
+                converted_audio_path
+            ]
+            result = subprocess.run(
+                conversion_command,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+            if result.returncode != 0:
+                logger.error(f"FFmpeg conversion error: {result.stderr}")
+                raise Exception(f"Audio conversion failed: {result.stderr}")
+            logger.debug(f"Audio successfully converted to {converted_audio_path}")
+            # Load and process the converted audio
             logger.debug("Processing audio for transcription...")
+            audio_array, sampling_rate = librosa.load(converted_audio_path, sr=16000)
             input_features = processor(
                 audio_array,
             logger.error(f"Error transcribing audio: {str(e)}")
             return jsonify({"status": "error", "message": str(e)}), 500
         finally:
+            # Clean up temporary files
+            for path in [input_audio_path, converted_audio_path]:
+                if path and os.path.exists(path):
+                    try:
+                        os.remove(path)
+                        logger.debug(f"Temporary file {path} removed")
+                    except Exception as e:
+                        logger.warning(f"Failed to remove temporary file {path}: {e}")
 @app.route('/files/<filename>', methods=['GET'])
 def serve_wav_file(filename):