johnbridges committed on
Commit
c153cff
·
1 Parent(s): 35df04b

added new models and normalizations

Browse files
Files changed (2) hide show
  1. app.py +264 -27
  2. tts_processor.py +287 -7
app.py CHANGED
@@ -1,5 +1,6 @@
1
  from flask import Flask, request, jsonify, send_from_directory, abort
2
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
 
3
  import librosa
4
  import torch
5
  import numpy as np
@@ -10,6 +11,7 @@ import sys
10
  import uuid
11
  import logging
12
  from flask_cors import CORS
 
13
  import threading
14
  import werkzeug
15
  import tempfile
@@ -26,13 +28,6 @@ import onnxruntime as ort
26
  # ---------------------------
27
  MAX_THREADS = 2 # <-- change this number to control all thread usage
28
 
29
- # ---------------------------
30
- # ---------------------------
31
- # STORAGE ROOT
32
- # ---------------------------
33
- SERVE_DIR = "/home/user/app/files"
34
- os.makedirs(SERVE_DIR, exist_ok=True)
35
-
36
  # Limit NumPy / BLAS / MKL threads
37
  os.environ["OMP_NUM_THREADS"] = str(MAX_THREADS)
38
  os.environ["OPENBLAS_NUM_THREADS"] = str(MAX_THREADS)
@@ -66,7 +61,8 @@ model_path = 'kokoro_model'
66
  voice_name = 'am_adam' # Example voice: af (adjust as needed)
67
 
68
  # Directory to serve files from
69
- SERVE_DIR = os.environ.get("SERVE_DIR", "./files") # Default to './files' if not provided
 
70
 
71
  os.makedirs(SERVE_DIR, exist_ok=True)
72
  def validate_audio_file(file):
@@ -138,9 +134,18 @@ def is_cached(cached_file_path):
138
  file_cache[cached_file_path] = exists # Update the cache
139
  return exists
140
 
 
 
 
 
 
 
 
 
141
  # Initialize models
142
  def initialize_models():
143
- global sess, voice_style, processor, whisper_model
 
144
 
145
  try:
146
  # Download the ONNX model if not already downloaded
@@ -180,12 +185,64 @@ def initialize_models():
180
  voice_style = np.fromfile(voice_style_path, dtype=np.float32).reshape(-1, 1, 256)
181
  logger.info(f"Voice style vector loaded successfully from {voice_style_path}")
182
 
183
- # Initialize Whisper model for S2T
184
- logger.info("Downloading and loading Whisper model...")
185
- processor = WhisperProcessor.from_pretrained("openai/whisper-base")
186
- whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
187
- whisper_model.config.forced_decoder_ids = None
188
- logger.info("Whisper model loaded successfully")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
  except Exception as e:
191
  logger.error(f"Error initializing models: {str(e)}")
@@ -194,6 +251,150 @@ def initialize_models():
194
  # Initialize models
195
  initialize_models()
196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  # Health check endpoint
198
  @app.route('/health', methods=['GET'])
199
  def health_check():
@@ -310,17 +511,54 @@ def transcribe_audio():
310
  logger.debug("Processing audio for transcription...")
311
  audio_array, sampling_rate = librosa.load(converted_audio_path, sr=16000)
312
 
313
- input_features = processor(
314
- audio_array,
315
- sampling_rate=sampling_rate,
316
- return_tensors="pt"
317
- ).input_features
318
-
319
- # Generate transcription
320
- logger.debug("Generating transcription...")
321
- predicted_ids = whisper_model.generate(input_features)
322
- transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
323
- logger.info(f"Transcription: {transcription}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
 
325
  return jsonify({"status": "success", "transcription": transcription})
326
  except Exception as e:
@@ -374,4 +612,3 @@ def internal_error(error):
374
 
375
  if __name__ == "__main__":
376
  app.run(host="0.0.0.0", port=7860, threaded=False, processes=1)
377
-
 
1
  from flask import Flask, request, jsonify, send_from_directory, abort
2
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
3
+ from transformers import Wav2Vec2Processor, AutoTokenizer, AutoModelForTokenClassification
4
  import librosa
5
  import torch
6
  import numpy as np
 
11
  import uuid
12
  import logging
13
  from flask_cors import CORS
14
+ import re
15
  import threading
16
  import werkzeug
17
  import tempfile
 
28
  # ---------------------------
29
  MAX_THREADS = 2 # <-- change this number to control all thread usage
30
 
 
 
 
 
 
 
 
31
  # Limit NumPy / BLAS / MKL threads
32
  os.environ["OMP_NUM_THREADS"] = str(MAX_THREADS)
33
  os.environ["OPENBLAS_NUM_THREADS"] = str(MAX_THREADS)
 
61
  voice_name = 'am_adam' # Example voice: af (adjust as needed)
62
 
63
  # Directory to serve files from
64
+ default_serve_dir = os.path.join(os.path.expanduser("~"), "app", "files")
65
+ SERVE_DIR = os.environ.get("SERVE_DIR", default_serve_dir)
66
 
67
  os.makedirs(SERVE_DIR, exist_ok=True)
68
  def validate_audio_file(file):
 
134
  file_cache[cached_file_path] = exists # Update the cache
135
  return exists
136
 
137
# ---------------------------
# ASR / post-processing configuration (all overridable via environment)
# ---------------------------

def _env_flag(name, default="0"):
    """Return True when env var *name* holds a truthy flag value ("1"/"true"/"yes"/"on")."""
    return os.environ.get(name, default).lower() in {"1", "true", "yes", "on"}

# Legacy toggle kept for backward compatibility; only used to pick the default engine.
use_wav2vec2 = _env_flag("USE_WAV2VEC2", "")
# Engine selection: "wav2vec2_onnx" (ONNX CTC model) or "whisper_pt" (PyTorch Whisper).
ASR_ENGINE = os.environ.get("ASR_ENGINE", "wav2vec2_onnx" if use_wav2vec2 else "whisper_pt").lower()
# HF model id for the Wav2Vec2 processor (feature extraction + CTC labels).
ASR_MODEL_NAME = os.environ.get("ASR_MODEL_NAME", "facebook/wav2vec2-base-960h")
# HF repo holding a ready-made ONNX export of the ASR model.
ASR_ONNX_REPO = os.environ.get("ASR_ONNX_REPO", "onnx-community/wav2vec2-base-960h-ONNX")
# Optional transcription post-processing steps.
PUNCTUATE_TEXT = _env_flag("PUNCTUATE_TEXT")
TECH_NORMALIZE = _env_flag("TECH_NORMALIZE")
# Token-classification model used to restore punctuation.
PUNCTUATION_MODEL = os.environ.get("PUNCTUATION_MODEL", "kredor/punctuate-all")
145
  # Initialize models
146
  def initialize_models():
147
+ global sess, voice_style, processor, whisper_model, asr_session, asr_processor
148
+ global punctuation_model, punctuation_tokenizer
149
 
150
  try:
151
  # Download the ONNX model if not already downloaded
 
185
  voice_style = np.fromfile(voice_style_path, dtype=np.float32).reshape(-1, 1, 256)
186
  logger.info(f"Voice style vector loaded successfully from {voice_style_path}")
187
 
188
+ # Initialize ASR engine
189
+ if ASR_ENGINE == "wav2vec2_onnx":
190
+ logger.info(f"Loading Wav2Vec2 ONNX ASR model ({ASR_MODEL_NAME})...")
191
+ # Load processor for feature extraction + CTC labels
192
+ asr_processor = Wav2Vec2Processor.from_pretrained(ASR_MODEL_NAME)
193
+
194
+ # Try to locate/download ONNX model; if not present, download a ready-made ONNX repo.
195
+ default_onnx_path = f"asr_onnx/{ASR_MODEL_NAME.replace('/', '_')}.onnx"
196
+ asr_onnx_path_env = os.environ.get("ASR_ONNX_PATH", default_onnx_path)
197
+ if not os.path.exists(asr_onnx_path_env):
198
+ logger.info(f"ASR ONNX not found at {asr_onnx_path_env}. Attempting to download from {ASR_ONNX_REPO}...")
199
+ try:
200
+ cache_dir = os.environ.get("ASR_ONNX_CACHE_DIR", "asr_onnx_cache")
201
+ repo_dir = snapshot_download(ASR_ONNX_REPO, cache_dir=cache_dir)
202
+ # Look for common ONNX filenames
203
+ onnx_path = None
204
+ for root, _, files in os.walk(repo_dir):
205
+ for cand in ["model.onnx", "wav2vec2.onnx", "onnx/model.onnx"]:
206
+ if cand in files:
207
+ onnx_path = os.path.join(root, cand if cand != "onnx/model.onnx" else "model.onnx")
208
+ break
209
+ if onnx_path:
210
+ break
211
+ if not onnx_path:
212
+ # Fallback: pick first .onnx file found
213
+ for root, _, files in os.walk(repo_dir):
214
+ for f in files:
215
+ if f.endswith(".onnx"):
216
+ onnx_path = os.path.join(root, f)
217
+ break
218
+ if onnx_path:
219
+ break
220
+ if not onnx_path:
221
+ raise FileNotFoundError("No .onnx file found in downloaded repo")
222
+ os.makedirs(os.path.dirname(asr_onnx_path_env), exist_ok=True)
223
+ # Copy to stable location
224
+ import shutil
225
+ shutil.copyfile(onnx_path, asr_onnx_path_env)
226
+ logger.info(f"Downloaded ASR ONNX to {asr_onnx_path_env}")
227
+ except Exception as de:
228
+ logger.error(f"Failed to download ASR ONNX: {de}")
229
+ logger.warning("Falling back to Whisper PT engine.")
230
+ raise
231
+ asr_session = InferenceSession(asr_onnx_path_env, sess_options)
232
+ logger.info("Wav2Vec2 ONNX ASR model loaded")
233
+ else:
234
+ logger.info("ASR_ENGINE set to whisper_pt; loading Whisper model...")
235
+ processor = WhisperProcessor.from_pretrained("openai/whisper-base")
236
+ whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
237
+ whisper_model.config.forced_decoder_ids = None
238
+ logger.info("Whisper model loaded successfully")
239
+
240
+ if PUNCTUATE_TEXT:
241
+ logger.info(f"Loading punctuation model ({PUNCTUATION_MODEL})...")
242
+ punctuation_tokenizer = AutoTokenizer.from_pretrained(PUNCTUATION_MODEL)
243
+ punctuation_model = AutoModelForTokenClassification.from_pretrained(PUNCTUATION_MODEL)
244
+ punctuation_model.eval()
245
+ logger.info("Punctuation model loaded successfully")
246
 
247
  except Exception as e:
248
  logger.error(f"Error initializing models: {str(e)}")
 
251
  # Initialize models
252
  initialize_models()
253
 
254
def restore_punctuation(text, max_words=120):
    """
    Re-insert punctuation and sentence-initial capitalization into *text*.

    No-op when PUNCTUATE_TEXT is disabled or the global punctuation model is
    missing / failed to load. The input is lower-cased and split on
    whitespace, then scored in chunks of at most ``max_words`` words so each
    chunk stays within the token-classification model's sequence limit.
    """
    if not PUNCTUATE_TEXT:
        return text
    # Guard against initialize_models() having failed before loading the model.
    if "punctuation_model" not in globals() or punctuation_model is None:
        return text
    words = text.strip().lower().split()
    if not words:
        return text

    # NOTE(review): label names assumed to match the punctuation model's
    # id2label mapping (default kredor/punctuate-all) — confirm if
    # PUNCTUATION_MODEL is changed; unknown labels fall back to "" below.
    label_to_punct = {
        "O": "",
        "COMMA": ",",
        "PERIOD": ".",
        "QUESTION": "?",
        "EXCLAMATION": "!",
        "COLON": ":",
        "SEMICOLON": ";",
    }

    def process_chunk(chunk_words, capitalize_next):
        # Score one chunk; returns (decoded text, carry-over capitalization flag).
        inputs = punctuation_tokenizer(
            chunk_words,
            is_split_into_words=True,
            return_tensors="pt",
            truncation=True,
        )
        with torch.no_grad():
            logits = punctuation_model(**inputs).logits
        pred_ids = torch.argmax(logits, dim=-1)[0].tolist()
        word_ids = inputs.word_ids()
        last_word = -1
        word_end_labels = {}
        # Use the prediction of the FIRST sub-token of each word as that
        # word's punctuation label.
        for idx, word_id in enumerate(word_ids):
            if word_id is None:
                continue
            if word_id != last_word:
                last_word = word_id
                word_end_labels[word_id] = pred_ids[idx]

        decoded = []
        for i, word in enumerate(chunk_words):
            label_id = word_end_labels.get(i)
            label = punctuation_model.config.id2label.get(label_id, "O")
            punct = label_to_punct.get(label, "")
            if capitalize_next and word:
                word = word[0].upper() + word[1:]
                capitalize_next = False
            decoded.append(word + punct)
            # Sentence-ending punctuation capitalizes the following word.
            if punct in {".", "?", "!"}:
                capitalize_next = True
        return " ".join(decoded), capitalize_next

    out_parts = []
    capitalize_next = True  # the first word of the transcript starts a sentence
    for i in range(0, len(words), max_words):
        chunk = words[i:i + max_words]
        chunk_text, capitalize_next = process_chunk(chunk, capitalize_next)
        out_parts.append(chunk_text)
    return " ".join(out_parts).strip()
313
+
314
def normalize_tech_text(text):
    """
    Normalize spoken "tech" tokens (dot/com/slash/etc.) into symbols.
    Intended for wav2vec2 output; Whisper already handles this better.

    BUGFIX: many of the original patterns were double-escaped inside raw
    strings (e.g. r"\\bcomma\\b"), which matches a literal backslash and
    therefore never fired on ordinary text. All patterns now use single
    escapes, matching the correctly written domain-suffix rules.
    """
    normalized = text

    # Common domain suffixes (the "come"/"comm" variants catch ASR mishearings)
    normalized = re.sub(r"\bdot com\b", ".com", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\bdot come\b", ".com", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\bdot comm\b", ".com", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\bdot net\b", ".net", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\bdot org\b", ".org", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\bdot io\b", ".io", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\bdot ai\b", ".ai", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\bdot co\b", ".co", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\bdot uk\b", ".uk", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\bdot dev\b", ".dev", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\bdot local\b", ".local", normalized, flags=re.IGNORECASE)
    # Stitch stray whitespace around the produced suffixes back together.
    normalized = re.sub(r"\.\s+(com|net|org|io|ai|co|uk|dev|local)\b", r".\1", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"(\w)\s+\.(com|net|org|io|ai|co|uk|dev|local)\b", r"\1.\2", normalized, flags=re.IGNORECASE)

    # Symbols between tokens (lookarounds keep the neighboring words intact)
    normalized = re.sub(r"(?<=\w)\s+dot\s+(?=\w)", ".", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"(?<=\w)\s+at\s+(?=\w)", "@", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"(?<=\w)\s+colon\s+(?=\w)", ":", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"(?<=\w)\s+dash\s+(?=\w)", "-", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"(?<=\w)\s+hyphen\s+(?=\w)", "-", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\bhyphen\b", "-", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\bunderscore\b", "_", normalized, flags=re.IGNORECASE)

    # Slashes ("bash" is kept as a deliberate mishearing of "backslash")
    normalized = re.sub(r"\bback\s+slash\b", r"\\", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\bbackslash\b", r"\\", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\bbash\b", r"\\", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\bforward\s+slash\b", "/", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\bslash\b", "/", normalized, flags=re.IGNORECASE)

    # Spoken punctuation tokens
    normalized = re.sub(r"\bcomma\b", ",", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\bperiod\b", ".", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\bquestion\s+mark\b", "?", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\bexclamation\s+point\b", "!", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\bexclamation\s+mark\b", "!", normalized, flags=re.IGNORECASE)
    normalized = re.sub(r"\bhash\b", "#", normalized, flags=re.IGNORECASE)

    # Collapse sequences of spoken digits into numbers (useful for IPs/ports).
    num_map = {
        "zero": "0",
        "oh": "0",
        "one": "1",
        "two": "2",
        "three": "3",
        "four": "4",
        "five": "5",
        "six": "6",
        "seven": "7",
        "eight": "8",
        "nine": "9",
    }
    parts = normalized.split()
    out = []
    buffer = []
    for token in parts:
        lower = token.lower()
        if lower in num_map:
            buffer.append(num_map[lower])
            continue
        if lower == ".":
            buffer.append(".")
            continue
        if lower == "dot":
            buffer.append(".")
            continue
        # Any other token flushes the pending digit run.
        if buffer:
            out.append("".join(buffer))
            buffer = []
        out.append(token)
    if buffer:
        out.append("".join(buffer))
    normalized = " ".join(out)

    return normalized
397
+
398
  # Health check endpoint
399
  @app.route('/health', methods=['GET'])
400
  def health_check():
 
511
  logger.debug("Processing audio for transcription...")
512
  audio_array, sampling_rate = librosa.load(converted_audio_path, sr=16000)
513
 
514
+ if ASR_ENGINE == "wav2vec2_onnx" and 'asr_session' in globals() and asr_session is not None:
515
+ # Prepare input for Wav2Vec2 ONNX: float32 PCM, shape (batch, samples)
516
+ inputs = asr_processor(audio_array, sampling_rate=16000, return_tensors="np")
517
+ # Some exports expect input as (batch, sequence); adjust key as needed
518
+ ort_inputs = {}
519
+ # Common input name variants
520
+ for name in ["input_values", "input_features", "inputs"]:
521
+ if name in [i.name for i in asr_session.get_inputs()]:
522
+ ort_inputs[name] = inputs["input_values"].astype(np.float32)
523
+ break
524
+ else:
525
+ # Fall back to first input name
526
+ first_name = asr_session.get_inputs()[0].name
527
+ ort_inputs[first_name] = inputs["input_values"].astype(np.float32)
528
+
529
+ logits = asr_session.run(None, ort_inputs)[0] # (batch, time, vocab)
530
+ # Greedy CTC decode
531
+ pred_ids = np.argmax(logits, axis=-1)
532
+ # Collapse repeats and remove CTC blank (id 0 for many models; rely on processor)
533
+ transcription = asr_processor.batch_decode(pred_ids)[0]
534
+ transcription = transcription.strip()
535
+ logger.info(f"Transcription (Wav2Vec2 ONNX): {transcription}")
536
+ else:
537
+ # Whisper fallback
538
+ input_features = processor(
539
+ audio_array,
540
+ sampling_rate=sampling_rate,
541
+ return_tensors="pt"
542
+ ).input_features
543
+
544
+ logger.debug("Generating transcription (Whisper)...")
545
+ predicted_ids = whisper_model.generate(input_features)
546
+ transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
547
+ logger.info(f"Transcription (Whisper): {transcription}")
548
+
549
+ if PUNCTUATE_TEXT:
550
+ try:
551
+ transcription = restore_punctuation(transcription)
552
+ logger.info(f"Transcription (Punctuated): {transcription}")
553
+ except Exception as pe:
554
+ logger.warning(f"Punctuation restore failed: {pe}")
555
+
556
+ if TECH_NORMALIZE:
557
+ try:
558
+ transcription = normalize_tech_text(transcription)
559
+ logger.info(f"Transcription (Normalized): {transcription}")
560
+ except Exception as ne:
561
+ logger.warning(f"Tech normalization failed: {ne}")
562
 
563
  return jsonify({"status": "success", "transcription": transcription})
564
  except Exception as e:
 
612
 
613
  if __name__ == "__main__":
614
  app.run(host="0.0.0.0", port=7860, threaded=False, processes=1)
 
tts_processor.py CHANGED
@@ -16,6 +16,129 @@ alphabet_map = {
16
  "U": " You ", "V": " Vee ", "W": " Double You ", "X": " Ex ", "Y": " Why ", "Z": " Zed "
17
  }
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  # Function to add ordinal suffix to a number
20
  def add_ordinal_suffix(day):
21
  """Adds ordinal suffix to a day (e.g., 13 -> 13th)."""
@@ -82,20 +205,26 @@ def replace_invalid_chars(string):
82
 
83
  # Replace numbers with their word equivalents
84
  def replace_numbers(string):
85
- ipv4_pattern = r'(\b\d{1,3}(\.\d{1,3}){3}\b)'
86
- ipv6_pattern = r'([0-9a-fA-F]{1,4}:){2,7}[0-9a-fA-F]{1,4}'
87
  range_pattern = r'\b\d+-\d+\b' # Detect ranges like 1-4
88
  date_pattern = r'\b\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2})?\b'
89
  alphanumeric_pattern = r'\b[A-Za-z]+\d+|\d+[A-Za-z]+\b'
90
 
91
- # Do not process IP addresses, date patterns, or alphanumerics
92
- if re.search(ipv4_pattern, string) or re.search(ipv6_pattern, string) or re.search(range_pattern, string) or re.search(date_pattern, string) or re.search(alphanumeric_pattern, string):
93
- return string
 
 
 
 
94
 
95
  # Convert standalone numbers and port numbers
96
  def convert_number(match):
97
  number = match.group()
98
- return num2words(int(number)) if number.isdigit() else number
 
 
99
 
100
  pattern = re.compile(r'\b\d+\b')
101
  return re.sub(pattern, convert_number, string)
@@ -133,11 +262,163 @@ def make_dots_tts_friendly(text):
133
 
134
  return text
135
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  # Main preprocessing pipeline
137
  def preprocess_all(string):
138
  string = normalize_dates(string)
139
  string = replace_invalid_chars(string)
140
  string = replace_numbers(string)
 
141
  string = replace_abbreviations(string)
142
  string = make_dots_tts_friendly(string)
143
  string = clean_whitespace(string)
@@ -160,4 +441,3 @@ if __name__ == "__main__":
160
  test_preprocessing(test_file)
161
  else:
162
  print("Please provide a file path as an argument.")
163
-
 
16
  "U": " You ", "V": " Vee ", "W": " Double You ", "X": " Ex ", "Y": " Why ", "Z": " Zed "
17
  }
18
 
19
# NOTE: apply_replacements() applies these rules sequentially, so a
# more-specific pattern must come BEFORE any rule matching its prefix
# (e.g. "poe+" before "poe", "aes-gcm"/"aes-cbc" before "aes").
# Otherwise the shorter rule rewrites the token first and the specific
# rule can never fire — that ordering bug is fixed here.
TECH_ACRONYM_REPLACEMENTS = [
    (r"\bhttps\b", "H T T P S"),
    (r"\bhttp\b", "H T T P"),
    (r"\bssh\b", "S S H"),
    (r"\bdns\b", "D N S"),
    (r"\bntp\b", "N T P"),
    (r"\bsnmp\b", "S N M P"),
    (r"\btcp\b", "T C P"),
    (r"\budp\b", "U D P"),
    (r"\bicmp\b", "I C M P"),
    (r"\bip\b", "I P"),
    (r"\bipv4\b", "I P v four"),
    (r"\bipv6\b", "I P v six"),
    (r"\btls\b", "T L S"),
    (r"\bssl\b", "S S L"),
    (r"\brdp\b", "R D P"),
    (r"\bsql\b", "sequel"),
    (r"\bapi\b", "A P I"),
    (r"\buid\b", "U I D"),
    (r"\bgpu\b", "G P U"),
    (r"\bcpu\b", "C P U"),
    (r"\bram\b", "R A M"),
    (r"\bttl\b", "T T L"),
    (r"\brtt\b", "R T T"),
    (r"\bbgp\b", "B G P"),
    (r"\bospf\b", "O S P F"),
    (r"\bospfv2\b", "O S P F v two"),
    (r"\bospfv3\b", "O S P F v three"),
    (r"\bis-is\b", "I S I S"),
    (r"\brip\b", "R I P"),
    (r"\bdhcp\b", "D H C P"),
    (r"\barp\b", "A R P"),
    (r"\bndp\b", "N D P"),
    (r"\bnat\b", "N A T"),
    (r"\bpat\b", "P A T"),
    (r"\bgre\b", "G R E"),
    (r"\bvrrp\b", "V R R P"),
    (r"\bhsrp\b", "H S R P"),
    (r"\bglbp\b", "G L B P"),
    (r"\bstp\b", "S T P"),
    (r"\brstp\b", "R S T P"),
    (r"\bmstp\b", "M S T P"),
    (r"\blldp\b", "L L D P"),
    (r"\bcdp\b", "C D P"),
    (r"\bldap\b", "ell dap"),
    (r"\bsaml\b", "sam el"),
    (r"\boauth\b", "oh auth"),
    (r"\boidc\b", "O I D C"),
    (r"\bsso\b", "S S O"),
    (r"\bsmtp\b", "S M T P"),
    (r"\bimap\b", "I M A P"),
    (r"\bpop3\b", "P O P three"),
    (r"\bpop\b", "P O P"),
    (r"\bftp\b", "F T P"),
    (r"\bsftp\b", "S F T P"),
    (r"\bftps\b", "F T P S"),
    (r"\btftp\b", "T F T P"),
    (r"\bmqtt\b", "M Q T T"),
    (r"\bamqp\b", "A M Q P"),
    (r"\bcoap\b", "C O A P"),
    (r"\bquic\b", "Q U I C"),
    (r"\bgrpc\b", "gee R P C"),
    (r"\bsoap\b", "S O A P"),
    (r"\bjson\b", "jay son"),
    (r"\byaml\b", "yam el"),
    (r"\bxml\b", "ex em el"),
    (r"\bwebsocket\b", "web socket"),
    (r"\bwss\b", "W S S"),
    (r"\bws\b", "W S"),
    (r"\bicmpv6\b", "I C M P v six"),
    (r"\bntlm\b", "N T L M"),
    (r"\bpki\b", "P K I"),
    (r"\bcsr\b", "C S R"),
    (r"\bcrt\b", "C R T"),
    (r"\bca\b", "C A"),
    (r"\bwan\b", "W A N"),
    (r"\blan\b", "L A N"),
    (r"\bvlan\b", "V L A N"),
    (r"\bvxlan\b", "V X L A N"),
    (r"\bqos\b", "Q O S"),
    (r"\bmtu\b", "M T U"),
    (r"\bpoe\+", "P O E plus"),  # must precede the plain "poe" rule
    (r"\bpoe\b", "P O E"),
    (r"\bvrf\b", "V R F"),
    (r"\bacl\b", "A C L"),
    (r"\bnat64\b", "N A T sixty four"),
    (r"\bdsr\b", "D S R"),
    (r"\bsiem\b", "S I E M"),
    (r"\bids\b", "I D S"),
    (r"\bips\b", "I P S"),
    (r"\bedr\b", "E D R"),
    (r"\bxdr\b", "X D R"),
    (r"\bsoc\b", "S O C"),
    (r"\bmdr\b", "M D R"),
    (r"\bndr\b", "N D R"),
    (r"\bav\b", "A V"),
    (r"\bendpoint\b", "end point"),
    (r"\bsaas\b", "S A A S"),
    (r"\biaas\b", "I A A S"),
    (r"\bpaas\b", "P A A S"),
    (r"\bdlp\b", "D L P"),
    (r"\bmfa\b", "M F A"),
    (r"\b2fa\b", "two F A"),
    (r"\b3fa\b", "three F A"),
    (r"\bmd5\b", "M D five"),
    (r"\bsha1\b", "sha one"),
    (r"\bsha256\b", "sha two five six"),
    (r"\bsha512\b", "sha five one two"),
    (r"\baes-?gcm\b", "A E S G C M"),  # must precede the plain "aes" rule
    (r"\baes-?cbc\b", "A E S C B C"),  # must precede the plain "aes" rule
    (r"\baes\b", "A E S"),
    (r"\brsa\b", "R S A"),
    (r"\becdsa\b", "E C D S A"),
    (r"\bed25519\b", "E D two five five one nine"),
    (r"\bjwt\b", "J W T"),
    (r"\bsshd\b", "S S H D"),
    (r"\bntp\d?\b", "N T P"),
    (r"\bntp\s+server\b", "N T P server"),
    (r"\bntp\s+pool\b", "N T P pool"),
    (r"\bhttpd\b", "H T T P D"),
    (r"\bnginx\b", "engine x"),
]
141
+
142
  # Function to add ordinal suffix to a number
143
  def add_ordinal_suffix(day):
144
  """Adds ordinal suffix to a day (e.g., 13 -> 13th)."""
 
205
 
206
  # Replace numbers with their word equivalents
207
def replace_numbers(string):
    """Spell out standalone integers as words, leaving structured tokens alone."""
    # Patterns whose digit runs must NOT be converted: IPs, numeric ranges,
    # ISO dates, and mixed alphanumeric identifiers like eth0.
    protected_patterns = (
        r'\b\d{1,3}(?:\.\d{1,3}){3}\b',                      # IPv4
        r'\b(?:[0-9a-fA-F]{1,4}:){2,7}[0-9a-fA-F]{1,4}\b',   # IPv6
        r'\b\d+-\d+\b',                                      # ranges like 1-4
        r'\b\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2})?\b',     # dates / datetimes
        r'\b[A-Za-z]+\d+|\d+[A-Za-z]+\b',                    # alphanumerics
    )
    protected_spans = [
        (m.start(), m.end())
        for pat in protected_patterns
        for m in re.finditer(pat, string)
    ]

    def within_protected(start, end):
        # A digit run is protected when fully contained in any protected span.
        return any(lo <= start and end <= hi for lo, hi in protected_spans)

    def spell_out(match):
        token = match.group()
        if within_protected(match.start(), match.end()):
            return token
        if not token.isdigit():
            return token
        # num2words emits hyphens ("twenty-one"); spaces read better for TTS.
        return num2words(int(token)).replace("-", " ")

    return re.sub(r'\b\d+\b', spell_out, string)
 
262
 
263
  return text
264
 
265
def apply_replacements(value, replacements):
    """Sequentially apply case-insensitive (pattern, replacement) regex rules to *value*."""
    result = value
    for rule in replacements:
        pat, rep = rule
        result = re.sub(pat, rep, result, flags=re.IGNORECASE)
    return result
269
+
270
def tech_humanize(text):
    """
    Humanize technical tokens (URLs, emails, UUIDs, MACs, paths) for TTS.
    Keep outputs ASCII and TTS-friendly.

    Rules run in a fixed order: URLs/emails first (before acronym expansion
    mangles their protocol prefixes), then acronyms, then structured tokens
    (hex, CVEs, UUIDs, MACs, IPv6), then generic separators and units.
    """
    def normalize_url(match):
        # Expand a whole URL into spoken separators.
        url = match.group(0)
        url = url.replace("https://", "HTTPS://").replace("http://", "HTTP://")
        url = url.replace("://", " colon slash slash ")
        url = url.replace("/", " forward slash ")
        url = url.replace("?", " question mark ")
        url = url.replace("&", " and ")
        url = url.replace("=", " equals ")
        url = url.replace("#", " hash ")
        url = url.replace("_", " underscore ")
        url = url.replace("-", " dash ")
        url = url.replace(".", " dot ")
        return url

    def normalize_email(match):
        email = match.group(0)
        email = email.replace("@", " at ")
        email = email.replace(".", " dot ")
        email = email.replace("_", " underscore ")
        email = email.replace("-", " dash ")
        return email

    def normalize_uuid(match):
        # Spell each hyphen-separated group character by character.
        uuid_text = match.group(0)
        groups = uuid_text.split("-")
        spelled = [" ".join(list(group)) for group in groups]
        return " dash ".join(spelled)

    def normalize_mac(match):
        mac_text = match.group(0)
        groups = mac_text.split(":")
        spelled = [" ".join(list(group)) for group in groups]
        return " colon ".join(spelled)

    def normalize_ipv6(match):
        ipv6_text = match.group(0)
        groups = ipv6_text.split(":")
        spelled = [" ".join(list(group)) for group in groups if group]
        return " colon ".join(spelled)

    def normalize_ipv6_compact(match):
        # Handle the "::" zero-compression form separately.
        ipv6_text = match.group(0)
        left, _, right = ipv6_text.partition("::")
        left_groups = [g for g in left.split(":") if g]
        right_groups = [g for g in right.split(":") if g]
        left_spelled = [" ".join(list(group)) for group in left_groups]
        right_spelled = [" ".join(list(group)) for group in right_groups]
        middle = " double colon "
        left_part = " colon ".join(left_spelled)
        right_part = " colon ".join(right_spelled)
        if left_part and right_part:
            return f"{left_part}{middle}{right_part}"
        if left_part:
            return f"{left_part}{middle}"
        return f"{middle}{right_part}"

    def normalize_mac_dash(match):
        mac_text = match.group(0)
        groups = mac_text.split("-")
        spelled = [" ".join(list(group)) for group in groups]
        return " dash ".join(spelled)

    def normalize_hex(match):
        hex_text = match.group(1)
        return "hex " + " ".join(list(hex_text))

    def normalize_cve(match):
        year = match.group(1)
        ident = match.group(2)
        return f"C V E {year} dash {ident}"

    # URLs and emails (do this early before protocol expansions)
    text = re.sub(r"\bhttps?://[^\s]+", normalize_url, text, flags=re.IGNORECASE)
    text = re.sub(r"\b[\w.+-]+@[\w.-]+\.\w+\b", normalize_email, text)

    # Version tokens like TLS1.3 or HTTP/2
    text = re.sub(r"\b(tls|ssl)\s*(\d+(?:\.\d+)?)\b", lambda m: f"{m.group(1).upper()} {m.group(2).replace('.', ' point ')}", text, flags=re.IGNORECASE)
    text = re.sub(r"\bhttps?/(\d+(?:\.\d+)?)\b", lambda m: f"H T T P slash {m.group(1).replace('.', ' point ')}", text, flags=re.IGNORECASE)

    # Common protocol tokens (force letter-by-letter)
    text = apply_replacements(text, TECH_ACRONYM_REPLACEMENTS)

    # Hex values and CVEs
    text = re.sub(r"\b0x([0-9A-Fa-f]+)\b", normalize_hex, text)
    text = re.sub(r"\bCVE-(\d{4})-(\d{4,7})\b", normalize_cve, text)

    # Interfaces like eth0, wlan0, en0, lo0
    text = re.sub(r"\b(eth|wlan|en|lo)(\d+)\b", lambda m: f"{m.group(1)} {m.group(2)}", text, flags=re.IGNORECASE)

    # UUIDs, MACs, IPv6
    text = re.sub(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b", normalize_uuid, text)
    text = re.sub(r"\b(?:[0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2}\b", normalize_mac, text)
    text = re.sub(r"\b(?:[0-9A-Fa-f]{2}-){5}[0-9A-Fa-f]{2}\b", normalize_mac_dash, text)
    text = re.sub(r"\b(?:[0-9A-Fa-f]{1,4}:){2,7}[0-9A-Fa-f]{1,4}\b", normalize_ipv6, text)
    text = re.sub(r"\b[0-9A-Fa-f:]*::[0-9A-Fa-f:]*\b", normalize_ipv6_compact, text)

    # Acronym pairs like TCP/IP -> "TCP slash IP"
    text = re.sub(r"\b([A-Z]{2,})\s*/\s*([A-Z]{2,})\b", r"\1 slash \2", text)
    # Word/word patterns like this/that -> "this or that"
    text = re.sub(r"\b([A-Za-z]+)\s*/\s*([A-Za-z]+)\b", r"\1 or \2", text)

    # Common separators in paths/flags
    text = re.sub(r"(?<=\w)/(?!\s)", " forward slash ", text)
    text = re.sub(r"\\", " backslash ", text)
    text = re.sub(r"(?<=\w)-(?=\w)", " dash ", text)
    text = re.sub(r"(?<=\w)_(?=\w)", " underscore ", text)
    text = re.sub(r"(?<=\w):(?=\w)", " colon ", text)
    text = re.sub(r"--", " double dash ", text)
    text = re.sub(r"->", " arrow ", text)
    text = re.sub(r"=>", " arrow ", text)
    # BUGFIX: the original pattern r"\b(\d+)%\b" required a word character
    # right after '%' ('%' is a non-word char), so "50% done" or a trailing
    # "50%" never matched. No anchor after '%' is needed.
    text = re.sub(r"\b(\d+)%", r"\1 percent", text)

    # Versions like v1.2.3 -> v 1 point 2 point 3
    text = re.sub(r"\bv(\d+(?:\.\d+)+)\b", lambda m: "v " + m.group(1).replace(".", " point "), text, flags=re.IGNORECASE)
    text = re.sub(r"\b(\d+\.\d+\.\d+)\b", lambda m: m.group(1).replace(".", " point "), text)

    # Units and rates
    text = re.sub(r"\b(\d+(?:\.\d+)?)\s*kbps\b", r"\1 kilobits per second", text, flags=re.IGNORECASE)
    text = re.sub(r"\b(\d+(?:\.\d+)?)\s*mbps\b", r"\1 megabits per second", text, flags=re.IGNORECASE)
    text = re.sub(r"\b(\d+(?:\.\d+)?)\s*gbps\b", r"\1 gigabits per second", text, flags=re.IGNORECASE)
    text = re.sub(r"\b(\d+(?:\.\d+)?)\s*tbps\b", r"\1 terabits per second", text, flags=re.IGNORECASE)
    text = re.sub(r"\b(\d+(?:\.\d+)?)\s*kb\b", r"\1 kilobytes", text, flags=re.IGNORECASE)
    text = re.sub(r"\b(\d+(?:\.\d+)?)\s*mb\b", r"\1 megabytes", text, flags=re.IGNORECASE)
    text = re.sub(r"\b(\d+(?:\.\d+)?)\s*gb\b", r"\1 gigabytes", text, flags=re.IGNORECASE)
    text = re.sub(r"\b(\d+(?:\.\d+)?)\s*tb\b", r"\1 terabytes", text, flags=re.IGNORECASE)
    text = re.sub(r"\b(\d+(?:\.\d+)?)\s*mhz\b", r"\1 mega hertz", text, flags=re.IGNORECASE)
    text = re.sub(r"\b(\d+(?:\.\d+)?)\s*ghz\b", r"\1 giga hertz", text, flags=re.IGNORECASE)
    text = re.sub(r"\b(\d+(?:\.\d+)?)\s*ms\b", r"\1 milliseconds", text, flags=re.IGNORECASE)
    text = re.sub(r"\b(\d+(?:\.\d+)?)\s*us\b", r"\1 microseconds", text, flags=re.IGNORECASE)
    text = re.sub(r"\b(\d+(?:\.\d+)?)\s*ns\b", r"\1 nanoseconds", text, flags=re.IGNORECASE)
    text = re.sub(r"\b(\d+(?:\.\d+)?)\s*s\b", r"\1 seconds", text)
    text = re.sub(r"\b(\d+(?:\.\d+)?)\s*min\b", r"\1 minutes", text, flags=re.IGNORECASE)

    # Optional plural markers like domain(s) -> "domain or domains"
    text = re.sub(r"\b([A-Za-z]+)\(s\)(?!\w)", r"\1 or \1s", text)

    return text
415
+
416
  # Main preprocessing pipeline
417
  def preprocess_all(string):
418
  string = normalize_dates(string)
419
  string = replace_invalid_chars(string)
420
  string = replace_numbers(string)
421
+ string = tech_humanize(string)
422
  string = replace_abbreviations(string)
423
  string = make_dots_tts_friendly(string)
424
  string = clean_whitespace(string)
 
441
  test_preprocessing(test_file)
442
  else:
443
  print("Please provide a file path as an argument.")