Spaces:

johnbridges
/

NetMonTTS2

Running

App Files Files Community

johnbridges committed on Sep 16, 2025

Commit

25d7670

1 Parent(s): 58a6828

init commit

Browse files

Files changed (6) hide show

Dockerfile +38 -0
app.py +377 -0
commit +3 -0
kokoro.py +165 -0
requirements.txt +17 -0
tts_processor.py +163 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,38 @@

+FROM python:3.11-slim
+# Install system dependencies: libsndfile1 (soundfile), espeak-ng (phonemizer's
+# EspeakBackend) and ffmpeg (audio conversion in app.py), plus git and wget.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libsndfile1 \
+    espeak-ng \
+    ffmpeg \
+    git \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+RUN useradd -m -u 1000 user
+# Switch to the "user" user
+USER user
+# Set home to the user's home directory
+ENV HOME=/home/user \
+	PATH=/home/user/.local/bin:$PATH
+# Set the working directory to the user's home directory
+WORKDIR $HOME/app
+# Create the files directory
+RUN mkdir -p $HOME/app/files
+# Copy and install Python dependencies. pip is upgraded *before* the
+# requirements are installed so the upgraded pip performs the install, and
+# both steps skip the wheel cache to keep the layer small.
+COPY --chown=user requirements.txt $HOME/app/
+RUN pip install --no-cache-dir --upgrade pip && pip install --no-cache-dir -r requirements.txt
+# Copy the current directory contents into the container at $HOME/app setting the owner to the user
+COPY --chown=user . $HOME/app
+# Expose port
+EXPOSE 7860
+# Run the application
+CMD ["python", "app.py"]

app.py ADDED Viewed

	@@ -0,0 +1,377 @@

+from flask import Flask, request, jsonify, send_from_directory, abort
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+import librosa
+import torch
+import numpy as np
+from onnxruntime import InferenceSession
+import soundfile as sf
+import os
+import sys
+import uuid
+import logging
+from flask_cors import CORS
+import threading
+import werkzeug
+import tempfile
+from huggingface_hub import snapshot_download
+from tts_processor import preprocess_all
+import hashlib
+import os
+import torch
+import numpy as np
+import onnxruntime as ort
+# ---------------------------
+# THREAD LIMIT CONFIG
+# ---------------------------
+MAX_THREADS = 2  # <-- change this number to control all thread usage
+# ---------------------------
+# Limit NumPy / BLAS / MKL threads.
+# NOTE(review): numpy and torch are imported above, before these variables are
+# set; presumably only libraries initialised later honour them - confirm, or
+# move these assignments ahead of the imports.
+os.environ["OMP_NUM_THREADS"] = str(MAX_THREADS)
+os.environ["OPENBLAS_NUM_THREADS"] = str(MAX_THREADS)
+os.environ["MKL_NUM_THREADS"] = str(MAX_THREADS)
+os.environ["VECLIB_MAXIMUM_THREADS"] = str(MAX_THREADS)
+os.environ["NUMEXPR_NUM_THREADS"] = str(MAX_THREADS)
+# Torch thread limits
+torch.set_num_threads(MAX_THREADS)
+torch.set_num_interop_threads(1)  # keep inter-op small to avoid overhead
+# ONNXRuntime session options (use when creating the session)
+sess_options = ort.SessionOptions()
+sess_options.intra_op_num_threads = MAX_THREADS
+sess_options.inter_op_num_threads = 1
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+app = Flask(__name__)
+CORS(app, resources={r"/*": {"origins": "*"}})
+# Global lock to ensure one method runs at a time
+global_lock = threading.Lock()
+# Repository ID and paths
+kokoro_model_id = 'onnx-community/Kokoro-82M-v1.0-ONNX'
+model_path = 'kokoro_model'
+voice_name = 'am_adam'  # Example voice: af (adjust as needed)
+# ---------------------------
+# STORAGE ROOT
+# ---------------------------
+# Single definition of the directory generated audio is written to and served
+# from. It comes from the SERVE_DIR environment variable (default './files')
+# and is made absolute so that os.path checks (relative to the working
+# directory) and Flask's send_from_directory (relative to the application
+# root) always refer to the same place.
+SERVE_DIR = os.path.abspath(os.environ.get("SERVE_DIR", "./files"))
+os.makedirs(SERVE_DIR, exist_ok=True)
+def validate_audio_file(file):
+    """Check an uploaded audio file's type, size and header.
+
+    Raises ValueError when ``file`` is not a werkzeug FileStorage, when its
+    declared content type is not one of the supported audio types, when it is
+    larger than 10 MB, or when verify_audio_header() rejects its first bytes.
+    The file pointer is left at the start of the file.
+    """
+    if not isinstance(file, werkzeug.datastructures.FileStorage):
+        raise ValueError("Invalid file type")
+    # Accepted MIME types; audio/ogg covers Opus in an Ogg container.
+    accepted = (
+        "audio/wav",
+        "audio/x-wav",
+        "audio/mpeg",
+        "audio/mp3",
+        "audio/webm",
+        "audio/ogg",
+    )
+    if file.content_type not in accepted:
+        raise ValueError(f"Unsupported file type. Must be one of: {', '.join(accepted)}")
+    # Measure the upload by seeking to its end, then rewind for later readers.
+    file.seek(0, os.SEEK_END)
+    size_in_bytes = file.tell()
+    file.seek(0)
+    limit_mb = 10
+    if size_in_bytes > limit_mb * 1024 * 1024:
+        raise ValueError(f"File is too large (max {limit_mb} MB)")
+    # Finally make sure the leading bytes agree with the declared type.
+    header_ok = verify_audio_header(file)
+    if not header_ok:
+        raise ValueError("File header doesn't match declared content type")
+def verify_audio_header(file):
+    """Return True when the file's leading bytes match its declared audio type.
+
+    ``file`` must expose ``read``, ``seek`` and a ``content_type`` attribute.
+    The first four bytes are read and the file is rewound to offset 0.
+    Content types without a known signature are accepted unchecked.
+    """
+    header = file.read(4)
+    file.seek(0)  # Rewind after reading
+    content_type = file.content_type
+    if content_type == "audio/webm":
+        # EBML magic number used by WebM/Matroska containers.
+        return header.startswith(b'\x1aE\xdf\xa3')
+    if content_type == "audio/ogg":
+        return header.startswith(b'OggS')
+    if content_type in ("audio/wav", "audio/x-wav"):
+        return header.startswith(b'RIFF')
+    if content_type in ("audio/mpeg", "audio/mp3"):
+        # MP3 files commonly begin with an ID3v2 tag rather than audio data.
+        if header.startswith(b'ID3'):
+            return True
+        # Otherwise expect an MPEG audio frame: 11 sync bits set (0xFFE).
+        # Checking only b'\xff\xfb' would reject other valid version/layer
+        # combinations such as 0xFFF3 or 0xFFFA.
+        return len(header) >= 2 and header[0] == 0xFF and (header[1] & 0xE0) == 0xE0
+    return True  # Skip verification for other types
+def validate_text_input(text):
+    """Raise ValueError unless ``text`` is a non-blank str of at most 1024 characters."""
+    max_chars = 1024
+    if not isinstance(text, str):
+        raise ValueError("Text input must be a string")
+    if not text.strip():
+        raise ValueError("Text input cannot be empty")
+    if len(text) > max_chars:
+        raise ValueError("Text input is too long (max 1024 characters)")
+# Paths already confirmed to exist on disk (path -> True).
+file_cache = {}
+def is_cached(cached_file_path):
+    """
+    Return True when ``cached_file_path`` exists.
+
+    Positive results are remembered in ``file_cache`` so later calls skip the
+    disk check. Misses are deliberately NOT remembered: the file is usually
+    created right after a miss, and memoising ``False`` would make every
+    subsequent lookup report the file as absent.
+    """
+    if file_cache.get(cached_file_path):
+        return True
+    exists = os.path.exists(cached_file_path)  # Perform disk check
+    if exists:
+        file_cache[cached_file_path] = True
+    return exists
+# Initialize models
+def _find_file(root_dir, filename):
+    """Return the path of the first ``filename`` found under ``root_dir``, or None."""
+    for root, _, files in os.walk(root_dir):
+        if filename in files:
+            return os.path.join(root, filename)
+    return None
+def initialize_models():
+    """Load the Kokoro ONNX session, the voice style table and the Whisper model.
+
+    Populates the module globals ``sess``, ``voice_style``, ``processor`` and
+    ``whisper_model``. The Kokoro snapshot is downloaded into ``model_path``
+    only when that directory does not exist yet.
+
+    Raises:
+        FileNotFoundError: if ``model.onnx`` or ``<voice_name>.bin`` cannot be
+            found under the model directory.
+        Exception: any download/load error is logged and re-raised.
+    """
+    global sess, voice_style, processor, whisper_model
+    try:
+        # Download the ONNX model if not already downloaded
+        if not os.path.exists(model_path):
+            logger.info("Downloading and loading Kokoro model...")
+            kokoro_dir = snapshot_download(kokoro_model_id, cache_dir=model_path)
+            logger.info(f"Kokoro model directory: {kokoro_dir}")
+        else:
+            kokoro_dir = model_path
+            logger.info(f"Using cached Kokoro model directory: {kokoro_dir}")
+        # Locate the ONNX graph anywhere below the model directory
+        onnx_path = _find_file(kokoro_dir, 'model.onnx')
+        if onnx_path is None:
+            raise FileNotFoundError(
+                f"model.onnx not found under {kokoro_dir}; "
+                f"delete '{model_path}' to force a fresh download"
+            )
+        logger.info("Loading ONNX session...")
+        sess = InferenceSession(onnx_path, sess_options)
+        logger.info(f"ONNX session loaded successfully from {onnx_path}")
+        # Load the voice style vector
+        voice_file = f'{voice_name}.bin'
+        voice_style_path = _find_file(kokoro_dir, voice_file)
+        if voice_style_path is None:
+            # Report what was searched for; the path itself is None here.
+            raise FileNotFoundError(f"Voice style file '{voice_file}' not found under {kokoro_dir}")
+        logger.info("Loading voice style vector...")
+        # Flat float32 file viewed as a table of (1, 256) style rows.
+        voice_style = np.fromfile(voice_style_path, dtype=np.float32).reshape(-1, 1, 256)
+        logger.info(f"Voice style vector loaded successfully from {voice_style_path}")
+        # Initialize Whisper model for S2T
+        logger.info("Downloading and loading Whisper model...")
+        processor = WhisperProcessor.from_pretrained("openai/whisper-base")
+        whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
+        whisper_model.config.forced_decoder_ids = None
+        logger.info("Whisper model loaded successfully")
+    except Exception as e:
+        # logger.exception keeps the traceback, which startup failures need.
+        logger.exception(f"Error initializing models: {str(e)}")
+        raise
+# Initialize models
+initialize_models()
+# Health check endpoint
+@app.route('/health', methods=['GET'])
+def health_check():
+    """Liveness probe: answers 200 with ``{"status": "healthy"}``."""
+    healthy_body = {"status": "healthy"}
+    try:
+        response = jsonify(healthy_body)
+        return response, 200
+    except Exception as exc:
+        logger.error(f"Health check failed: {str(exc)}")
+        return jsonify({"status": "unhealthy"}), 500
+# Text-to-Speech (T2S) Endpoint
+@app.route('/generate_audio', methods=['POST'])
+def generate_audio():
+    """Text-to-Speech (T2S) Endpoint.
+
+    Expects a JSON body ``{"text": "..."}``. The text is validated,
+    preprocessed and hashed; the hash names the WAV file in SERVE_DIR so
+    identical requests reuse the same file.
+
+    Returns JSON ``{"status": "success", "filename": ...}`` on success,
+    HTTP 400 for invalid requests and HTTP 500 for unexpected failures.
+    """
+    with global_lock:
+        try:
+            logger.debug("Received request to /generate_audio")
+            # silent=True yields None instead of raising on a missing/invalid body
+            data = request.get_json(silent=True)
+            if not isinstance(data, dict) or 'text' not in data:
+                return jsonify({"status": "error", "message": "Request body must be JSON with a 'text' field"}), 400
+            text = data['text']
+            try:
+                validate_text_input(text)
+            except ValueError as e:
+                # Bad input is the client's fault, not a server error.
+                return jsonify({"status": "error", "message": str(e)}), 400
+            # Preprocess & stable hash
+            text = preprocess_all(text)
+            text_hash = hashlib.sha256(text.encode('utf-8')).hexdigest()
+            filename = f"{text_hash}.wav"
+            cached_file_path = os.path.join(SERVE_DIR, filename)
+            # Cache hit
+            if is_cached(cached_file_path):
+                logger.info("Returning cached audio")
+                return jsonify({"status": "success", "filename": filename})
+            # Tokenize
+            from kokoro import phonemize, tokenize  # lazy import is fine
+            tokens = tokenize(phonemize(text, 'a'))
+            if not tokens:
+                # Same outcome as kokoro.generate(), which yields nothing here.
+                return jsonify({"status": "error", "message": "Text contains no speakable content"}), 400
+            if len(tokens) > 510:
+                logger.warning("Text too long; truncating to 510 tokens.")
+                tokens = tokens[:510]
+            # Style vector is selected by token count. Clamp to the last row:
+            # presumably the table has about 510 rows, so a full-length input
+            # could otherwise index past the end - confirm against the voice file.
+            style_index = min(len(tokens), len(voice_style) - 1)
+            ref_s = voice_style[style_index]  # (1,256)
+            # Wrap with the pad token (0) at both ends, batch of one
+            tokens = [[0, *tokens, 0]]
+            # ONNX inference
+            audio = sess.run(None, dict(
+                input_ids=np.array(tokens, dtype=np.int64),
+                style=ref_s,
+                speed=np.ones(1, dtype=np.float32),
+            ))[0]
+            # Save
+            audio = np.squeeze(audio).astype(np.float32)
+            sf.write(cached_file_path, audio, 24000)
+            # Record the new file so the next identical request is a cache hit
+            # even if an earlier lookup memoised this path as missing.
+            file_cache[cached_file_path] = True
+            logger.info(f"Audio saved: {cached_file_path}")
+            return jsonify({"status": "success", "filename": filename})
+        except Exception as e:
+            logger.error(f"Error generating audio: {str(e)}")
+            return jsonify({"status": "error", "message": str(e)}), 500
+# Speech-to-Text (S2T) Endpoint
+# Imports used by the transcription endpoint below (they belong with the other imports at the top of the file)
+import subprocess
+import tempfile
+from pathlib import Path
+# Transcription endpoint: converts the upload with ffmpeg, then runs Whisper
+@app.route('/transcribe_audio', methods=['POST'])
+def transcribe_audio():
+    """Speech-to-Text (S2T) Endpoint with automatic format conversion.
+
+    Expects a multipart upload under the field name ``file``. The upload is
+    written to a temporary file, converted by ffmpeg to 16 kHz mono 16-bit
+    WAV (with clean-up filters) and transcribed with the Whisper model.
+
+    Returns JSON ``{"status": "success", "transcription": ...}``, HTTP 400
+    when no file was uploaded, or HTTP 500 on conversion/transcription errors.
+    Temporary files are always removed.
+    """
+    with global_lock:  # Acquire global lock to ensure only one instance runs
+        input_audio_path = None
+        converted_audio_path = None
+        try:
+            logger.debug("Received request to /transcribe_audio")
+            file = request.files.get('file')
+            if file is None:
+                # A missing upload is a client error, not a server failure.
+                return jsonify({"status": "error", "message": "Missing 'file' upload"}), 400
+            # NOTE(review): validate_audio_file() exists in this module but is
+            # not called here; confirm whether uploads should be validated.
+            # filename may be None/empty; fall back to no suffix in that case.
+            suffix = Path(file.filename or '').suffix
+            # Create temporary files for both input and output
+            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as input_temp:
+                input_audio_path = input_temp.name
+                file.save(input_audio_path)
+                logger.debug(f"Original audio file saved to {input_audio_path}")
+            # Create a temporary file for the converted WAV
+            with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as output_temp:
+                converted_audio_path = output_temp.name
+            # Convert to WAV with ffmpeg (16kHz, mono)
+            logger.debug("Converting audio to 16kHz mono WAV format...")
+            conversion_command = [
+                'ffmpeg',
+                '-y',                  # Force overwrite without prompting
+                '-i', input_audio_path,
+                '-acodec', 'pcm_s16le', # 16-bit PCM
+                '-ac', '1',             # mono
+                '-ar', '16000',         # 16kHz sample rate
+                '-af', 'highpass=f=80,lowpass=f=7500,afftdn=nr=10:nf=-25,loudnorm=I=-16:TP=-1.5:LRA=11',  # Audio cleanup filters
+                converted_audio_path
+            ]
+            # Argument list (no shell), so the temp paths are never interpreted
+            result = subprocess.run(
+                conversion_command,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+            if result.returncode != 0:
+                logger.error(f"FFmpeg conversion error: {result.stderr}")
+                raise Exception(f"Audio conversion failed: {result.stderr}")
+            logger.debug(f"Audio successfully converted to {converted_audio_path}")
+            # Load and process the converted audio
+            logger.debug("Processing audio for transcription...")
+            audio_array, sampling_rate = librosa.load(converted_audio_path, sr=16000)
+            input_features = processor(
+                audio_array,
+                sampling_rate=sampling_rate,
+                return_tensors="pt"
+            ).input_features
+            # Generate transcription
+            logger.debug("Generating transcription...")
+            predicted_ids = whisper_model.generate(input_features)
+            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+            logger.info(f"Transcription: {transcription}")
+            return jsonify({"status": "success", "transcription": transcription})
+        except Exception as e:
+            logger.error(f"Error transcribing audio: {str(e)}")
+            return jsonify({"status": "error", "message": str(e)}), 500
+        finally:
+            # Clean up temporary files
+            for path in [input_audio_path, converted_audio_path]:
+                if path and os.path.exists(path):
+                    try:
+                        os.remove(path)
+                        logger.debug(f"Temporary file {path} removed")
+                    except Exception as e:
+                        logger.warning(f"Failed to remove temporary file {path}: {e}")
+@app.route('/files/<filename>', methods=['GET'])
+def serve_wav_file(filename):
+    """
+    Serve a .wav file from the configured directory.
+    Only serves files ending with '.wav'.
+
+    Aborts with 400 for any other extension and 404 when the file is absent.
+    """
+    # Ensure only .wav files are allowed
+    if not filename.lower().endswith('.wav'):
+        abort(400, "Only .wav files are allowed.")
+    # Resolve the directory once: os.path resolves a relative path against the
+    # working directory while send_from_directory resolves it against the
+    # application root, so an absolute path keeps both in agreement.
+    serve_root = os.path.abspath(SERVE_DIR)
+    # Check if the file exists in the directory
+    file_path = os.path.join(serve_root, filename)
+    logger.debug(f"Looking for file at: {file_path}")
+    if not os.path.isfile(file_path):
+        logger.error(f"File not found: {file_path}")
+        abort(404, "File not found.")
+    # Serve the file
+    return send_from_directory(serve_root, filename)
+# Error handlers
+@app.errorhandler(400)
+def bad_request(error):
+    """Return a JSON body describing an HTTP 400 error."""
+    body = {"error": "Bad Request", "message": str(error)}
+    return body, 400
+@app.errorhandler(404)
+def not_found(error):
+    """Return a JSON body describing an HTTP 404 error."""
+    body = {"error": "Not Found", "message": str(error)}
+    return body, 404
+@app.errorhandler(500)
+def internal_error(error):
+    """Return a generic JSON body for HTTP 500; the error detail is not exposed."""
+    body = {
+        "error": "Internal Server Error",
+        "message": "An unexpected error occurred.",
+    }
+    return body, 500
+# When executed directly, start Flask's built-in server on all interfaces,
+# port 7860, as a single process with request threading disabled.
+if __name__ == "__main__":
+    app.run(host="0.0.0.0", port=7860, threaded=False, processes=1)

commit ADDED Viewed

	@@ -0,0 +1,3 @@

+#!/bin/sh
+# Stage everything, commit using all script arguments as the message, push.
+# Usage: ./commit <message words...>
+# Stop at the first failing step so a failed commit is never followed by a push.
+set -e
+if [ -z "$*" ]; then
+    echo "usage: $0 <commit message>" >&2
+    exit 1
+fi
+git add .
+git commit -m "$*"
+git push

kokoro.py ADDED Viewed

	@@ -0,0 +1,165 @@

+import phonemizer
+import re
+import torch
+import numpy as np
+def split_num(num):
+    """Regex callback that spells times and 4-digit years the way they are read.
+
+    ``num`` is a match object. Decimals are returned untouched, ``h:mm`` times
+    become "h o'clock" / "h oh m" / "h mm", and year-like numbers are split
+    into two halves ("1999" -> "19 99"), keeping a trailing plural "s".
+    """
+    token = num.group()
+    if '.' in token:
+        return token
+    if ':' in token:
+        hours, minutes = map(int, token.split(':'))
+        if minutes == 0:
+            return f"{hours} o'clock"
+        spoken_minutes = f'oh {minutes}' if minutes < 10 else f'{minutes}'
+        return f'{hours} {spoken_minutes}'
+    year = int(token[:4])
+    within_millennium = year % 1000
+    # Small numbers and round millennia (e.g. 2005) are read as plain numbers.
+    if year < 1100 or within_millennium < 10:
+        return token
+    century = token[:2]
+    tail = int(token[2:4])
+    plural = 's' if token.endswith('s') else ''
+    if within_millennium >= 100:
+        if tail == 0:
+            return f'{century} hundred{plural}'
+        if tail < 10:
+            return f'{century} oh {tail}{plural}'
+    return f'{century} {tail}{plural}'
+def flip_money(m):
+    """Regex callback that moves the currency word after the amount.
+
+    ``m`` is a match whose text starts with ``$`` or ``£``. Amounts ending in
+    a word ("$5 million") get a plural unit; whole amounts get a singular or
+    plural unit; decimal amounts are split into units and cents/pence.
+    """
+    matched = m.group()
+    symbol, amount = matched[0], matched[1:]
+    in_dollars = symbol == '$'
+    unit = 'dollar' if in_dollars else 'pound'
+    if matched[-1].isalpha():
+        return f'{amount} {unit}s'
+    if '.' not in matched:
+        plural = 's' if amount != '1' else ''
+        return f'{amount} {unit}{plural}'
+    whole, fraction = amount.split('.')
+    plural = 's' if whole != '1' else ''
+    # A single fractional digit means tens: "$2.5" is 50 cents.
+    minor = int(fraction.ljust(2, '0'))
+    if in_dollars:
+        coins = 'cent' if minor == 1 else 'cents'
+    else:
+        coins = 'penny' if minor == 1 else 'pence'
+    return f'{whole} {unit}{plural} and {minor} {coins}'
+def point_num(num):
+    """Regex callback: read a decimal as "<whole> point <d> <d> ..." digit by digit."""
+    whole, fraction = num.group().split('.')
+    spaced_digits = ' '.join(fraction)
+    return f'{whole} point {spaced_digits}'
+def normalize_text(text):
+    """Rewrite raw text into a form that is easier to phonemize.
+
+    Applies an ordered series of replacements (quotes, brackets, CJK
+    punctuation, whitespace, titles, numbers, times, money, acronyms) and
+    returns the stripped result. The steps depend on each other's output, so
+    their order must be preserved.
+    """
+    # Curly single quotes (U+2018 / U+2019) -> straight apostrophe
+    text = text.replace(chr(8216), "'").replace(chr(8217), "'")
+    # Guillemets -> curly double quotes (U+201C / U+201D) -> straight quotes
+    text = text.replace('«', chr(8220)).replace('»', chr(8221))
+    text = text.replace(chr(8220), '"').replace(chr(8221), '"')
+    # Parentheses are re-encoded as guillemets
+    text = text.replace('(', '«').replace(')', '»')
+    # CJK punctuation -> ASCII equivalent followed by a space
+    for a, b in zip('、。！，：；？', ',.!,:;?'):
+        text = text.replace(a, b+' ')
+    # Any whitespace other than space/newline becomes a space
+    text = re.sub(r'[^\S \n]', ' ', text)
+    # Collapse runs of spaces
+    text = re.sub(r'  +', ' ', text)
+    # Drop spaces that sit alone between two newlines
+    text = re.sub(r'(?<=\n) +(?=\n)', '', text)
+    # Expand titles and "etc."
+    text = re.sub(r'\bD[Rr]\.(?= [A-Z])', 'Doctor', text)
+    text = re.sub(r'\b(?:Mr\.|MR\.(?= [A-Z]))', 'Mister', text)
+    text = re.sub(r'\b(?:Ms\.|MS\.(?= [A-Z]))', 'Miss', text)
+    text = re.sub(r'\b(?:Mrs\.|MRS\.(?= [A-Z]))', 'Mrs', text)
+    text = re.sub(r'\betc\.(?! [A-Z])', 'etc', text)
+    # "yeah" / "yea" -> "ye'a" (case of the first letter kept)
+    text = re.sub(r'(?i)\b(y)eah?\b', r"\1e'a", text)
+    # Decimals, 4-digit numbers (optionally plural) and h:mm times -> split_num
+    text = re.sub(r'\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)', split_num, text)
+    # Remove commas between digits (thousands separators)
+    text = re.sub(r'(?<=\d),(?=\d)', '', text)
+    # $ / £ amounts -> flip_money
+    text = re.sub(r'(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b', flip_money, text)
+    # Remaining decimals -> point_num
+    text = re.sub(r'\d*\.\d+', point_num, text)
+    # A hyphen between digits is read as a range
+    text = re.sub(r'(?<=\d)-(?=\d)', ' to ', text)
+    # Separate a capital S that directly follows a digit
+    text = re.sub(r'(?<=\d)S', ' S', text)
+    # "s" / "'s" after a capital consonant becomes "'S"
+    text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
+    # ...except after X', where a capital S is lowered
+    text = re.sub(r"(?<=X')S\b", 's', text)
+    # Dotted abbreviations followed by a lowercase word: dots -> hyphens
+    text = re.sub(r'(?:[A-Za-z]\.){2,} [a-z]', lambda m: m.group().replace('.', '-'), text)
+    # A dot directly between two letters -> hyphen
+    text = re.sub(r'(?i)(?<=[A-Z])\.(?=[A-Z])', '-', text)
+    return text.strip()
+def get_vocab():
+    """Build the symbol -> token id table.
+
+    Ids follow the position of each symbol in the concatenation of the pad
+    symbol, punctuation, ASCII letters and IPA letters. When a symbol occurs
+    more than once, its last position wins.
+    """
+    _pad = "$"
+    _punctuation = ';:,.!?¡¿—…"«»“” '
+    _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
+    _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
+    symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
+    return {symbol: index for index, symbol in enumerate(symbols)}
+VOCAB = get_vocab()
+def tokenize(ps):
+    """Map each symbol of ``ps`` to its VOCAB id, skipping unknown symbols."""
+    token_ids = []
+    for symbol in ps:
+        token_id = VOCAB.get(symbol)
+        if token_id is not None:
+            token_ids.append(token_id)
+    return token_ids
+# espeak backends keyed by language code: 'a' -> en-us, 'b' -> en-gb.
+# Both keep punctuation and emit stress marks. They are created at import time.
+phonemizers = dict(
+    a=phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True),
+    b=phonemizer.backend.EspeakBackend(language='en-gb', preserve_punctuation=True, with_stress=True),
+)
+def phonemize(text, lang, norm=True):
+    """Convert ``text`` to a phoneme string restricted to VOCAB symbols.
+
+    Args:
+        text: input text.
+        lang: key into ``phonemizers`` ('a' or 'b').
+        norm: when true, run normalize_text() on the text first.
+
+    Returns:
+        The stripped phoneme string; empty if the backend returned nothing.
+    """
+    if norm:
+        text = normalize_text(text)
+    # The backend takes a list of strings and returns a list of results
+    ps = phonemizers[lang].phonemize([text])
+    ps = ps[0] if ps else ''
+    # https://en.wiktionary.org/wiki/kokoro#English
+    # Override the backend's rendering of the word "kokoro"
+    ps = ps.replace('kəkˈoːɹoʊ', 'kˈoʊkəɹoʊ').replace('kəkˈɔːɹəʊ', 'kˈəʊkəɹəʊ')
+    # Substitute individual symbols before the VOCAB filter below
+    ps = ps.replace('ʲ', 'j').replace('r', 'ɹ').replace('x', 'k').replace('ɬ', 'l')
+    # Insert a space before "hundred" when it is glued to the previous sound
+    ps = re.sub(r'(?<=[a-zɹː])(?=hˈʌndɹɪd)', ' ', ps)
+    # Attach a free-standing "z" (before punctuation or end) to the previous word
+    ps = re.sub(r' z(?=[;:,.!?¡¿—…"«»“” ]|$)', 'z', ps)
+    if lang == 'a':
+        # en-us only: "ti" after "nine" becomes "di"
+        ps = re.sub(r'(?<=nˈaɪn)ti(?!ː)', 'di', ps)
+    # Drop every symbol the tokenizer does not know
+    ps = ''.join(filter(lambda p: p in VOCAB, ps))
+    return ps.strip()
+def length_to_mask(lengths):
+    """Return a boolean mask that is True at positions beyond each length.
+
+    ``lengths`` is a 1-D tensor; the result has one row per entry and
+    ``lengths.max()`` columns.
+    """
+    batch_size = lengths.shape[0]
+    positions = torch.arange(lengths.max()).unsqueeze(0).expand(batch_size, -1)
+    positions = positions.type_as(lengths)
+    # Position i (0-based) is padding when i + 1 exceeds the row's length.
+    return torch.gt(positions + 1, lengths.unsqueeze(1))
+@torch.no_grad()
+def forward(model, tokens, ref_s, speed):
+    """Run one synthesis pass and return the decoder output as a NumPy array.
+
+    Args:
+        model: object exposing ``bert``, ``bert_encoder``, ``predictor``,
+            ``text_encoder`` and ``decoder`` callables.
+        tokens: list of token ids, without the surrounding pad tokens.
+        ref_s: style tensor; columns 128+ feed the predictor and columns
+            0-127 feed the decoder. Its device is used for all tensors.
+        speed: divisor applied to the predicted durations.
+    """
+    device = ref_s.device
+    # Wrap the sequence with token 0 at both ends, batch of one
+    tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
+    input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
+    text_mask = length_to_mask(input_lengths).to(device)
+    # The attention mask is the inverse of the padding mask
+    bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
+    d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
+    s = ref_s[:, 128:]
+    d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
+    x, _ = model.predictor.lstm(d)
+    duration = model.predictor.duration_proj(x)
+    # Per-token duration: summed sigmoid outputs, scaled by speed
+    duration = torch.sigmoid(duration).sum(axis=-1) / speed
+    # Every token gets at least one frame
+    pred_dur = torch.round(duration).clamp(min=1).long()
+    # Alignment matrix (tokens x frames): each row marks that token's frames
+    pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
+    c_frame = 0
+    for i in range(pred_aln_trg.size(0)):
+        pred_aln_trg[i, c_frame:c_frame + pred_dur[0,i].item()] = 1
+        c_frame += pred_dur[0,i].item()
+    # Expand token-level features to frame level through the alignment
+    en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
+    F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
+    t_en = model.text_encoder(tokens, input_lengths, text_mask)
+    asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
+    return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
+def generate(model, text, voicepack, lang='a', speed=1, ps=None):
+    """Synthesize ``text`` (or the given phonemes ``ps``) in a single pass.
+
+    At most 510 tokens are used; longer input is truncated. The style row is
+    picked from ``voicepack`` by token count.
+
+    Returns:
+        ``(audio, phonemes)`` where ``phonemes`` is the string actually
+        synthesized, or ``None`` when nothing could be tokenized.
+    """
+    ps = ps or phonemize(text, lang)
+    tokens = tokenize(ps)
+    if not tokens:
+        return None
+    elif len(tokens) > 510:
+        tokens = tokens[:510]
+        print('Truncated to 510 tokens')
+    ref_s = voicepack[len(tokens)]
+    out = forward(model, tokens, ref_s, speed)
+    # Invert VOCAB once instead of scanning it for every token.
+    id_to_symbol = {token_id: symbol for symbol, token_id in VOCAB.items()}
+    ps = ''.join(id_to_symbol[i] for i in tokens)
+    return out, ps
+def generate_full(model, text, voicepack, lang='a', speed=1, ps=None):
+    """Synthesize ``text`` (or phonemes ``ps``) of any length in 510-token chunks.
+
+    Each chunk is synthesized independently, with the style row picked from
+    ``voicepack`` by that chunk's token count, and the audio is concatenated.
+
+    Returns:
+        ``(audio, phonemes)``, or ``None`` when nothing could be tokenized.
+    """
+    ps = ps or phonemize(text, lang)
+    tokens = tokenize(ps)
+    if not tokens:
+        return None
+    chunk_size = 510
+    outs = []
+    # Stepping by chunk_size also covers the final, shorter chunk.
+    for start in range(0, len(tokens), chunk_size):
+        chunk = tokens[start:start + chunk_size]
+        ref_s = voicepack[len(chunk)]
+        outs.append(forward(model, chunk, ref_s, speed))
+    outs = np.concatenate(outs)
+    # Invert VOCAB once instead of scanning it for every token.
+    id_to_symbol = {token_id: symbol for symbol, token_id in VOCAB.items()}
+    ps = ''.join(id_to_symbol[i] for i in tokens)
+    return outs, ps

requirements.txt ADDED Viewed

	@@ -0,0 +1,17 @@

+# Index options apply to the whole requirements file and go on their own line;
+# pip documents --index-url / --extra-index-url as global options, not
+# per-requirement ones. The extra index provides the CPU-only PyTorch wheels.
+--extra-index-url https://download.pytorch.org/whl/cpu
+flask
+flask-cors
+transformers
+librosa
+numpy
+soundfile
+huggingface_hub
+phonemizer
+munch
+werkzeug
+num2words
+dateparser
+# tts_processor.py imports dateutil directly, so list it explicitly
+python-dateutil
+inflect
+ftfy
+sentencepiece
+torch
+onnxruntime

tts_processor.py ADDED Viewed

	@@ -0,0 +1,163 @@

+import re
+from dateutil.parser import parse
+from num2words import num2words
+import inflect
+from ftfy import fix_text
+# Initialize the inflect engine
+# NOTE(review): inflect_engine is not referenced anywhere else in the visible code.
+inflect_engine = inflect.engine()
+# Define alphabet pronunciation mapping: each capital letter maps to a
+# spelled-out pronunciation padded with spaces on both sides, so expanded
+# letters stay separated when joined. Used by replace_abbreviations().
+alphabet_map = {
+    "A": " Eh ", "B": " Bee ", "C": " See ", "D": " Dee ", "E": " Eee ",
+    "F": " Eff ", "G": " Jee ", "H": " Aitch ", "I": " Eye ", "J": " Jay ",
+    "K": " Kay ", "L": " El ", "M": " Emm ", "N": " Enn ", "O": " Ohh ",
+    "P": " Pee ", "Q": " Queue ", "R": " Are ", "S": " Ess ", "T": " Tee ",
+    "U": " You ", "V": " Vee ", "W": " Double You ", "X": " Ex ", "Y": " Why ", "Z": " Zed "
+}
+# Function to add ordinal suffix to a number
+def add_ordinal_suffix(day):
+    """Adds ordinal suffix to a number (e.g., 13 -> 13th, 22 -> 22nd).
+
+    Any number ending in 11, 12 or 13 takes "th" (11th, 112th), not only the
+    values 11-13 themselves.
+    """
+    if 11 <= day % 100 <= 13:
+        return f"{day}th"
+    suffix = {1: "st", 2: "nd", 3: "rd"}.get(day % 10, "th")
+    return f"{day}{suffix}"
+# Function to format dates in a human-readable form
+def format_date(parsed_date, include_time=True):
+    """Formats a parsed date into a human-readable string.
+
+    Args:
+        parsed_date: a datetime, or a falsy value.
+        include_time: when true, append the time unless it is exactly midnight.
+
+    Returns:
+        e.g. "January 5th, 2024 at 10:00 AM" or "January 5th, 2024";
+        None when ``parsed_date`` is falsy.
+    """
+    if not parsed_date:
+        return None
+    # Convert the day into an ordinal (e.g., 13 -> 13th)
+    day = add_ordinal_suffix(parsed_date.day)
+    # Midnight is treated as "no time given". The time must be kept when
+    # either component is non-zero, otherwise 10:00 or 00:30 would be dropped.
+    has_time = (parsed_date.hour, parsed_date.minute) != (0, 0)
+    if include_time and has_time:
+        # 12-hour clock without a leading zero, computed here because the
+        # "%-I" strftime directive is not available on every platform.
+        hour_12 = parsed_date.hour % 12 or 12
+        return parsed_date.strftime(f"%B {day}, %Y at {hour_12}:%M %p")
+    return parsed_date.strftime(f"%B {day}, %Y")  # Only date
+# Normalize dates in the text
+def normalize_dates(text):
+    """
+    Finds and replaces date strings with a nicely formatted, TTS-friendly version.
+
+    Recognised shapes: ``YYYY-MM-DD`` (optionally followed by `` HH:MM:SS`` or
+    ``THH:MM:SS``), ``DD/DD/YYYY`` and ``D Word YYYY``. Matches the parser
+    cannot interpret are left unchanged.
+    """
+    def replace_date(match):
+        raw_date = match.group(0)
+        try:
+            parsed_date = parse(raw_date)
+        except (ValueError, OverflowError):
+            # The loose "D Word YYYY" shape also matches non-dates, and
+            # out-of-range values overflow; keep such text as it is.
+            return raw_date
+        # Only the ISO form with an explicit clock part contains a colon.
+        include_time = ":" in raw_date
+        return format_date(parsed_date, include_time) or raw_date
+    # Match common date formats
+    date_pattern = r"\b(\d{4}-\d{2}-\d{2}(?:[ T]\d{2}:\d{2}:\d{2})?|\d{2}/\d{2}/\d{4}|\d{1,2} \w+ \d{4})\b"
+    return re.sub(date_pattern, replace_date, text)
+# Replace invalid characters and clean text
+def replace_invalid_chars(string):
+    """Repair the text with ftfy, then apply a fixed list of literal substitutions."""
+    cleaned = fix_text(string)
+    # (needle, replacement) pairs, applied in this order.
+    substitutions = (
+        ("**", ""),
+        ('&#x27;', "'"),
+        ('AI;', 'Artificial Intelligence!'),
+        ('iddqd;', 'Immortality cheat code'),
+        ('😉;', 'wink wink!'),
+        (':D', '*laughs* Ahahaha!'),
+        (';D', '*laughs* Ahahaha!'),
+    )
+    for needle, replacement in substitutions:
+        cleaned = cleaned.replace(needle, replacement)
+    return cleaned
+# Replace numbers with their word equivalents
+def replace_numbers(string):
+    """Spell out standalone integers, unless the text contains a protected pattern.
+
+    When an IPv4/IPv6 address, a numeric range, an ISO date or an
+    alphanumeric token appears anywhere in ``string``, the whole string is
+    returned unchanged.
+    """
+    protected_patterns = (
+        r'(\b\d{1,3}(\.\d{1,3}){3}\b)',                      # IPv4
+        r'([0-9a-fA-F]{1,4}:){2,7}[0-9a-fA-F]{1,4}',         # IPv6
+        r'\b\d+-\d+\b',                                      # ranges like 1-4
+        r'\b\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2})?\b',     # ISO dates
+        r'\b[A-Za-z]+\d+|\d+[A-Za-z]+\b',                    # alphanumerics
+    )
+    if any(re.search(pattern, string) for pattern in protected_patterns):
+        return string
+    # Convert standalone numbers and port numbers
+    def spell_out(match):
+        digits = match.group()
+        if digits.isdigit():
+            return num2words(int(digits))
+        return digits
+    return re.sub(re.compile(r'\b\d+\b'), spell_out, string)
+# Replace abbreviations with expanded form
+def replace_abbreviations(string):
+    """Spell out short all-caps words letter by letter.
+
+    A word is expanded when it is upper-case, at most 4 characters long,
+    contains no digit and is not one of "ID", "AM", "PM". Words are split on
+    whitespace and re-joined with single spaces.
+    """
+    keep_as_is = ["ID", "AM", "PM"]
+    def expand(word):
+        if not word.isupper() or len(word) > 4:
+            return word
+        if any(char.isdigit() for char in word) or word in keep_as_is:
+            return word
+        return ''.join([alphabet_map.get(char, char) for char in word])
+    return ' '.join(expand(word) for word in string.split())
+def clean_whitespace(string):
+    """Tidy spacing: no whitespace before . , ? ! and no runs of whitespace."""
+    # Pull punctuation back onto the preceding word
+    tightened = re.sub(r'\s+([.,?!])', r'\1', string)
+    # Squeeze any run of two or more whitespace characters into one space;
+    # tokens such as "test.com" contain no whitespace and are left alone.
+    squeezed = re.sub(r'\s{2,}', ' ', tightened)
+    return squeezed.strip()
+def make_dots_tts_friendly(text):
+    """Replace dots that should be spoken with the words "dot" or "point".
+
+    IPv4 addresses and known domain/file endings use " dot ", decimals use
+    " point ", and any remaining dot directly followed by a word character
+    becomes "dot". Sentence-ending dots (followed by whitespace or the end of
+    the text) are left alone.
+    """
+    # Handle IP addresses (force "dot")
+    ipv4_pattern = r'\b\d{1,3}(\.\d{1,3}){3}\b'
+    text = re.sub(ipv4_pattern, lambda m: m.group(0).replace('.', ' dot '), text)
+    # Handle domain-like endings (force "dot")
+    domain_pattern = r'\b([\w-]+)\.(com|net|org|io|gov|edu|exe|dll|local)\b'
+    text = re.sub(domain_pattern, lambda m: m.group(0).replace('.', ' dot '), text)
+    # Handle decimals (use "point")
+    decimal_pattern = r'\b\d+\.\d+\b'
+    text = re.sub(decimal_pattern, lambda m: m.group(0).replace('.', ' point '), text)
+    # Dots inside a token (file.txt -> file dot txt): add a space on both
+    # sides so "dot" is not glued onto the preceding word.
+    text = re.sub(r'(?<=\S)\.(?=\w)', ' dot ', text)
+    # Handle leading dot words (.Net → dot Net)
+    text = re.sub(r'\.(?=\w)', 'dot ', text)
+    return text
+# Main preprocessing pipeline
+def preprocess_all(string):
+    """Run every text-cleaning stage over ``string``, in order, and return the result."""
+    stages = (
+        normalize_dates,
+        replace_invalid_chars,
+        replace_numbers,
+        replace_abbreviations,
+        make_dots_tts_friendly,
+        clean_whitespace,
+    )
+    for stage in stages:
+        string = stage(string)
+    return string
+# Expose a testing function for external use
+def test_preprocessing(file_path):
+    """Print every line of ``file_path`` next to its preprocessed form.
+
+    The file is read as UTF-8 regardless of the platform's default encoding,
+    so non-ASCII sample text behaves the same everywhere.
+    """
+    with open(file_path, 'r', encoding='utf-8') as file:
+        # Iterate lazily rather than loading the whole file first
+        for line in file:
+            original = line.strip()
+            processed = preprocess_all(original)
+            print(f"Original: {original}")
+            print(f"Processed: {processed}\n")
+# Command-line entry point: run the preprocessing demo on the file named by
+# the first argument, or print a usage hint when no argument is given.
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) > 1:
+        test_file = sys.argv[1]
+        test_preprocessing(test_file)
+    else:
+        print("Please provide a file path as an argument.")