Commit 65b0afc
Parent(s): 700a4c7

Add preprocessing

Files changed:
- app.py (+177, -10)
- requirements.txt (+1, -0)
app.py
CHANGED
@@ -4,6 +4,7 @@ import tempfile
 import time
 import logging
 import gc
+import io
 from dataclasses import dataclass
 from typing import Optional, Tuple, List, Any, Dict
 from contextlib import contextmanager
@@ -12,11 +13,16 @@ import gradio as gr
 import torch
 import psutil
 from dotenv import load_dotenv
+import numpy as np
+from pydub import AudioSegment
+from pydub.silence import split_on_silence
+import soundfile as sf
+import noisereduce

 load_dotenv()

-# Audio preprocessing
-PREPROCESSING_AVAILABLE =
+# Audio preprocessing available with required dependencies
+PREPROCESSING_AVAILABLE = True


 def get_env_or_secret(key: str, default: Optional[str] = None) -> Optional[str]:
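With `PREPROCESSING_AVAILABLE` hard-coded to `True`, the five new imports become hard dependencies: the app now fails at import time if any of them is missing. If graceful degradation were wanted instead, the flag could be derived from a guarded import. A minimal sketch of that alternative (not what this commit does):

# Sketch: derive the flag from the imports so the app still starts
# when an optional dependency is absent.
try:
    import numpy as np
    import soundfile as sf
    import noisereduce
    from pydub import AudioSegment
    from pydub.silence import split_on_silence

    PREPROCESSING_AVAILABLE = True
except ImportError:
    PREPROCESSING_AVAILABLE = False

This would also make the `else` branch added to `transcribe_local` below (which logs that preprocessing is unavailable) reachable again.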
@@ -41,8 +47,143 @@ class PreprocessingConfig:

     normalize_format: bool = True
     normalize_volume: bool = True
-    reduce_noise: bool =
-    remove_silence: bool =
+    reduce_noise: bool = True
+    remove_silence: bool = True
+
+
+def normalize_audio(audio_bytes: bytes) -> bytes:
+    """
+    Convert an audio chunk (bytes) to the standard format for Whisper
+    (16 kHz, mono, WAV PCM).
+    """
+    # Load the bytes into pydub through an in-memory file (BytesIO)
+    audio_segment = AudioSegment.from_file(io.BytesIO(audio_bytes))
+
+    # 1. Set the sample rate to 16 kHz
+    audio_segment = audio_segment.set_frame_rate(16000)
+    # 2. Convert to mono
+    audio_segment = audio_segment.set_channels(1)
+    # 3. Ensure 2-byte (16-bit) samples, the standard for WAV
+    audio_segment = audio_segment.set_sample_width(2)
+
+    # Export the processed bytes in WAV format
+    buffer = io.BytesIO()
+    audio_segment.export(buffer, format="wav")
+    return buffer.getvalue()
+
+
+def normalize_volume(audio_bytes: bytes) -> bytes:
+    """
+    Normalize the volume of a WAV audio chunk.
+    """
+    # Load the audio
+    audio_segment = AudioSegment.from_wav(io.BytesIO(audio_bytes))
+
+    # Normalize the audio, raising the peak to -0.1 dBFS;
+    # keeping a little headroom is good practice to avoid clipping
+    normalized_segment = audio_segment.normalize(headroom=0.1)
+
+    buffer = io.BytesIO()
+    normalized_segment.export(buffer, format="wav")
+    return buffer.getvalue()
+
+
+def reduce_background_noise(audio_bytes: bytes) -> bytes:
+    """
+    Reduce background noise in a WAV audio chunk.
+    """
+    # Read the audio data from the bytes; soundfile returns (data, samplerate)
+    buffer_read = io.BytesIO(audio_bytes)
+    data, rate = sf.read(buffer_read)
+
+    # Ensure the audio is mono for noise reduction
+    if data.ndim > 1:
+        data = np.mean(data, axis=1)
+
+    # Run the noise reduction
+    reduced_noise_data = noisereduce.reduce_noise(y=data, sr=rate)
+
+    # Write the processed data into a new bytes buffer
+    buffer_write = io.BytesIO()
+    sf.write(buffer_write, reduced_noise_data, rate, format="wav")
+    return buffer_write.getvalue()
+
+
+def remove_silence(audio_bytes: bytes) -> bytes:
+    """
+    Remove silent segments from a WAV audio chunk.
+    """
+    audio_segment = AudioSegment.from_wav(io.BytesIO(audio_bytes))
+
+    chunks = split_on_silence(
+        audio_segment,
+        min_silence_len=100,
+        silence_thresh=-35,
+        keep_silence=80,  # Keep a short stretch of silence between chunks
+    )
+
+    if not chunks:
+        # If no speech was found, return empty bytes
+        return b""
+
+    # Join the chunks back into a single segment
+    processed_segment = sum(chunks, AudioSegment.empty())
+
+    buffer = io.BytesIO()
+    processed_segment.export(buffer, format="wav")
+    return buffer.getvalue()
+
+
+def preprocess_audio_pipeline(audio_path: str) -> str:
+    """
+    Apply the full audio preprocessing pipeline.
+    Returns the path of the preprocessed audio file.
+    """
+    logger = logging.getLogger(__name__)
+    logger.info("Starting audio preprocessing pipeline")
+
+    try:
+        # Read the original audio file
+        with open(audio_path, "rb") as f:
+            audio_bytes = f.read()
+
+        # Apply all preprocessing stages in sequence
+        logger.info("1. Normalizing audio format...")
+        audio_bytes = normalize_audio(audio_bytes)
+
+        logger.info("2. Normalizing volume...")
+        audio_bytes = normalize_volume(audio_bytes)
+
+        logger.info("3. Reducing background noise...")
+        audio_bytes = reduce_background_noise(audio_bytes)
+
+        logger.info("4. Removing silence...")
+        audio_bytes = remove_silence(audio_bytes)
+
+        # If the audio is empty after silence removal, fall back to the original
+        if not audio_bytes:
+            logger.warning(
+                "Audio empty after silence removal, using original audio"
+            )
+            with open(audio_path, "rb") as f:
+                audio_bytes = f.read()
+            # Apply only format and volume normalization
+            audio_bytes = normalize_audio(audio_bytes)
+            audio_bytes = normalize_volume(audio_bytes)
+
+        # Save the preprocessed audio to a temporary file
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+            temp_file.write(audio_bytes)
+            preprocessed_path = temp_file.name
+
+        logger.info(f"Preprocessing completed: {preprocessed_path}")
+        return preprocessed_path
+
+    except Exception as e:
+        logger.error(f"Error during preprocessing: {e}")
+        logger.info("Using original audio without preprocessing")
+        return audio_path


 def load_asr_pipeline(
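The four helpers form a bytes-in/bytes-out chain, so each stage can be tested in isolation and the whole pipeline exercised outside Gradio. A minimal sketch, assuming a local `sample.wav` (hypothetical path):

# Sketch: run the full preprocessing chain on a local file.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    cleaned = preprocess_audio_pipeline("sample.wav")
    print(f"Preprocessed audio written to: {cleaned}")

Ordering matters here: `silence_thresh=-35` in `remove_silence` is an absolute dBFS level, so running volume normalization first is what makes that fixed threshold comparable across recordings of different loudness. The per-stage WAV re-encoding costs some I/O but keeps every step independently testable.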
@@ -208,6 +349,23 @@ def transcribe_local(
     if not os.path.exists(audio_path):
         raise FileNotFoundError(f"Audio file not found: {audio_path}")

+    # Apply the audio preprocessing pipeline
+    preprocessed_audio_path = audio_path
+    if PREPROCESSING_AVAILABLE:
+        try:
+            logger.info("Applying audio preprocessing...")
+            preprocessed_audio_path = preprocess_audio_pipeline(audio_path)
+            logger.info(
+                f"Preprocessing completed. Processed file: {os.path.basename(preprocessed_audio_path)}"
+            )
+        except Exception as e:
+            logger.warning(
+                f"Error during preprocessing, using original audio: {e}"
+            )
+            preprocessed_audio_path = audio_path
+    else:
+        logger.info("Audio preprocessing not available, using original audio")
+
     # Load ASR pipeline with performance monitoring
     start_time = time.time()
@@ -262,10 +420,10 @@ def transcribe_local(
     try:
         # Primary inference attempt with safe parameters
         if asr_kwargs:
-            result = asr(
+            result = asr(preprocessed_audio_path, **asr_kwargs)
         else:
             # Fallback to no parameters if all failed
-            result = asr(
+            result = asr(preprocessed_audio_path)

         inference_time = time.time() - inference_start
         memory_after = psutil.Process().memory_info().rss / 1024 / 1024  # MB
@@ -295,7 +453,7 @@ def transcribe_local(

     try:
         inference_start = time.time()
-        result = asr(
+        result = asr(preprocessed_audio_path)  # No parameters at all
         inference_time = time.time() - inference_start
         memory_used = 0  # Reset memory tracking
@@ -313,6 +471,14 @@ def transcribe_local(
         torch.cuda.empty_cache()
     gc.collect()

+    # Clean up the temporary preprocessed file if one was created
+    if preprocessed_audio_path != audio_path:
+        try:
+            os.unlink(preprocessed_audio_path)
+            logger.info("Temporary preprocessed audio file removed")
+        except Exception as e:
+            logger.warning(f"Error removing temporary file: {e}")
+
     # Return results with performance metrics
     meta = {
         "device": device_str,
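Note that this cleanup sits on the success path: an exception raised between preprocessing and this point would leave the temporary WAV behind. Wrapping inference in try/finally would make removal unconditional; a sketch of that shape, not what the commit implements:

# Sketch: guarantee temp-file cleanup even if inference raises.
preprocessed_audio_path = preprocess_audio_pipeline(audio_path)
try:
    result = asr(preprocessed_audio_path)
finally:
    if preprocessed_audio_path != audio_path:
        try:
            os.unlink(preprocessed_audio_path)
        except OSError:
            pass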
@@ -320,6 +486,7 @@ def transcribe_local(
         "inference_time": inference_time,
         "memory_used_mb": memory_used,
         "model_type": "original" if model_id == base_model_id else "fine-tuned",
+        "preprocessing_applied": preprocessed_audio_path != audio_path,
     }

     return {"result": result, "meta": meta}
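The new `preprocessing_applied` flag lets callers report which audio was actually transcribed. A toy caller-side read of the returned payload (shape taken from the diff, values hypothetical):

# Sketch: consuming the meta flag from transcribe_local's return value.
payload = {"result": {"text": "..."}, "meta": {"preprocessing_applied": True}}
source = "preprocessed" if payload["meta"]["preprocessing_applied"] else "original"
print(f"Transcription ran on the {source} audio")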
@@ -386,8 +553,8 @@ def transcribe_comparison(audio_file):
         error_msg = "❌ Models not configured. Set HF_MODEL_ID and BASE_WHISPER_MODEL_ID in the environment variables"
         return error_msg, error_msg

-    # Preprocessing always active
-    #
+    # Preprocessing is always active: format and volume normalization, noise reduction, silence removal.
+    # It is applied automatically before transcription with both models.

     # Fixed settings optimized for medical transcription
     language = "it"  # Always Italian for ScribeAId
@@ -540,7 +707,7 @@ def create_interface():
     - Original model: `{base_model_id}`
     - Fine-tuned model: `{model_id}`
     - Language: Italian (it)
-    - Audio preprocessing:
+    - Audio preprocessing: **ACTIVE** (normalization, noise reduction, silence removal)
     """)

     gr.Markdown("---")
requirements.txt
CHANGED
@@ -12,3 +12,4 @@ psutil>=5.9.0
 python-dotenv>=1.0.0
 datasets>=2.14.0
 huggingface-hub>=0.17.0
+noisereduce>=3.0.0
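Only `noisereduce` is added here; `numpy`, `pydub`, and `soundfile` are presumably already satisfied elsewhere in the file or as transitive dependencies, and `pydub` additionally needs an ffmpeg binary on the PATH to decode non-WAV uploads. A quick startup check (a sketch, assuming these package names):

# Sketch: fail fast if the preprocessing stack is not importable.
import importlib

for name in ("numpy", "pydub", "soundfile", "noisereduce"):
    importlib.import_module(name)
print("Preprocessing dependencies OK")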