calettippo committed
Commit 9bd97fd · Parent(s): e1d57cb

Add postprocessing commented

Files changed (1)
  1. app.py +341 -5
app.py CHANGED
@@ -20,11 +20,28 @@ from pydub.silence import split_on_silence
20
  import soundfile as sf
21
  import noisereduce
22
  from huggingface_hub import snapshot_download
 
23
 
24
  load_dotenv()
25
 
26
  # Audio preprocessing available with required dependencies
27
  PREPROCESSING_AVAILABLE = True
28
 
29
 
30
  # Shared caches to keep models/pipelines in memory across requests
@@ -33,6 +50,10 @@ PIPELINE_CACHE_LOCK = threading.Lock()
33
  MODEL_PATH_CACHE: Dict[str, str] = {}
34
  MODEL_PATH_CACHE_LOCK = threading.Lock()
35
 
36
 
37
  def get_env_or_secret(key: str, default: Optional[str] = None) -> Optional[str]:
38
  """Get environment variable or default."""
@@ -161,6 +182,16 @@ def warm_model_cache() -> None:
161
  if model_id and model_id != base_model_id:
162
  models_to_check.append((model_id, "fine-tuned"))
163
 
164
  for model_name, label in models_to_check:
165
  try:
166
  logger.info("Verifying %s model cache for %s", label, model_name)
@@ -481,6 +512,298 @@ def load_asr_pipeline(
481
  return asr, final_device, final_dtype_name
482
 
483
 
484
  @contextmanager
485
  def memory_monitor():
486
  """Context manager to monitor memory usage during inference."""
@@ -709,7 +1032,8 @@ def handle_whisper_problematic_output(text: str, model_name: str = "Whisper") ->
709
  def transcribe_comparison(audio_file):
710
  """Main function for Gradio interface."""
711
  if audio_file is None:
712
- return "❌ Nessun file audio fornito", "❌ Nessun file audio fornito"
713
 
714
  # Model configuration
715
  model_id = get_env_or_secret("HF_MODEL_ID")
@@ -720,7 +1044,7 @@ def transcribe_comparison(audio_file):
720
 
721
  if not model_id or not base_model_id:
722
  error_msg = "❌ Modelli non configurati. Impostare HF_MODEL_ID e BASE_WHISPER_MODEL_ID nelle variabili d'ambiente"
723
- return error_msg, error_msg
724
 
725
  # Preprocessing sempre attivo: normalizzazione formato, volume, riduzione rumore, rimozione silenzi
726
  # Viene applicato automaticamente prima della trascrizione con entrambi i modelli
@@ -742,6 +1066,7 @@ def transcribe_comparison(audio_file):
742
  finetuned_result = None
743
  original_text = ""
744
  finetuned_text = ""
745
 
746
  try:
747
  # Transcribe with original model
@@ -819,16 +1144,18 @@ def transcribe_comparison(audio_file):
819
  except Exception as e:
820
  finetuned_text = f"❌ Errore modello fine-tuned: {str(e)}"
821
 
822
  # GPU memory cleanup
823
  if torch.cuda.is_available():
824
  torch.cuda.empty_cache()
825
  gc.collect()
826
 
827
- return original_text, finetuned_text
828
 
829
  except Exception as e:
830
  error_msg = f"❌ Errore generale: {str(e)}"
831
- return error_msg, error_msg
832
 
833
 
834
  # Gradio interface
@@ -920,11 +1247,20 @@ def create_interface():
920
  show_copy_button=True,
921
  )
922
 
923
  # Click event
924
  transcribe_btn.click(
925
  fn=transcribe_comparison,
926
  inputs=[audio_input],
927
- outputs=[original_output, finetuned_output],
928
  show_progress=True,
929
  )
930
 
 
20
  import soundfile as sf
21
  import noisereduce
22
  from huggingface_hub import snapshot_download
23
+ from transformers import pipeline
24
 
25
  load_dotenv()
26
 
27
  # Audio preprocessing available with required dependencies
28
  PREPROCESSING_AVAILABLE = True
29
+ DEFAULT_TEXT_POSTPROCESS_MODEL = "google/medgemma-4b-it"
30
+ TEXT_POSTPROCESS_PROMPT = (
31
+ "Agisci come assistente editoriale clinico. Prendi la trascrizione fornita, correggi"
32
+ " eventuali errori di riconoscimento automatico e migliora la grammatica mantenendo"
33
+ " il significato. Anonimizza inoltre il testo sostituendo nomi propri di persone con"
34
+ " segnaposto [PAZIENTE] o [MEDICO] a seconda del ruolo implicato. Non inventare"
35
+ " informazioni nuove, non tradurre. Restituisci solo la versione finale pulita"
36
+ " e pseudonimizzata in italiano, senza preamboli né spiegazioni."
37
+ "\nEsempio 1 - Input: 'Buongiorno dottor Rossi, sono Maria Bianchi e ho prenotato l'holter.'"
38
+ "\nEsempio 1 - Output: 'Buongiorno [MEDICO], sono [PAZIENTE] e ho prenotato l'Holter.'"
39
+ "\nEsempio 2 - Input: 'Il paziente Claudio Caletti riferisce che la dottoressa Neri gli ha prescritto Coumadin.'"
40
+ "\nEsempio 2 - Output: '[PAZIENTE] riferisce che [MEDICO] gli ha prescritto Coumadin.'"
41
+ "\nEsempio 3 - Input: 'Dott.ssa Gallo, ho parlato con la collega Francesca e confermiamo l'intervento.'"
42
+ "\nEsempio 3 - Output: '[MEDICO], ho parlato con [MEDICO] e confermiamo l'intervento.'"
43
+ "\nTesto originale:\n"
44
+ )
45
 
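For illustration, the transformation this few-shot prompt targets can be pinned down with the prompt's own first example; a minimal check of this kind (the helper call is hypothetical until post-processing is re-enabled) could live in a smoke test:

# Illustrative sketch only: expected behaviour taken from Esempio 1 inside TEXT_POSTPROCESS_PROMPT.
raw = "Buongiorno dottor Rossi, sono Maria Bianchi e ho prenotato l'holter."
expected = "Buongiorno [MEDICO], sono [PAZIENTE] e ho prenotato l'Holter."
# A test could compare postprocess_transcription_text(raw, "smoke-test") against expected
# once TEXT_POSTPROCESS_ENABLED is set; the model output is not guaranteed to match verbatim.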
46
 
47
  # Shared caches to keep models/pipelines in memory across requests
 
50
  MODEL_PATH_CACHE: Dict[str, str] = {}
51
  MODEL_PATH_CACHE_LOCK = threading.Lock()
52
 
53
+ TEXT_POSTPROCESS_PIPELINE: Optional[Any] = None
54
+ TEXT_POSTPROCESS_MODEL_ID: Optional[str] = None
55
+ TEXT_POSTPROCESS_PIPELINE_LOCK = threading.Lock()
56
+
57
 
58
  def get_env_or_secret(key: str, default: Optional[str] = None) -> Optional[str]:
59
  """Get environment variable or default."""
 
182
  if model_id and model_id != base_model_id:
183
  models_to_check.append((model_id, "fine-tuned"))
184
 
185
+ text_postprocess_enabled = get_env_or_secret("TEXT_POSTPROCESS_ENABLED", "false").lower() in {
186
+ "1",
187
+ "true",
188
+ "yes",
189
+ }
190
+
191
+ text_model_id = get_env_or_secret("TEXT_POSTPROCESS_MODEL_ID", DEFAULT_TEXT_POSTPROCESS_MODEL)
192
+ if text_postprocess_enabled and text_model_id:
193
+ models_to_check.append((text_model_id, "text-postprocess"))
194
+
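A minimal sketch of the environment that drives this warm-up path, assuming a local .env file or Space secrets; the values are illustrative, the variable names are the ones read in this file:

# Illustrative local-testing setup.
import os
os.environ["TEXT_POSTPROCESS_ENABLED"] = "true"                     # "1" / "true" / "yes" all enable the pass
os.environ["TEXT_POSTPROCESS_MODEL_ID"] = "google/medgemma-4b-it"   # default is DEFAULT_TEXT_POSTPROCESS_MODEL
os.environ["TEXT_POSTPROCESS_DEVICE"] = "auto"                      # resolved to cuda / mps / cpu by the loader
os.environ["TEXT_POSTPROCESS_MAX_NEW"] = "200"                      # max_new_tokens for the cleanup generation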
195
  for model_name, label in models_to_check:
196
  try:
197
  logger.info("Verifying %s model cache for %s", label, model_name)
 
512
  return asr, final_device, final_dtype_name
513
 
514
 
515
+ def get_text_postprocess_pipeline(
516
+ model_id: str,
517
+ device_pref: Optional[str],
518
+ hf_token: Optional[str],
519
+ ) -> Any:
520
+ """Load a minimal text-generation pipeline for post-processing."""
521
+
522
+ logger = logging.getLogger(__name__)
523
+ if not model_id:
524
+ raise ValueError("Model id for text post-processing is not configured")
525
+
526
+ normalized_device_pref = (device_pref or "auto").lower()
527
+ if normalized_device_pref == "auto":
528
+ if torch.cuda.is_available():
529
+ device_choice = "cuda"
530
+ elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
531
+ device_choice = "mps"
532
+ else:
533
+ device_choice = "cpu"
534
+ else:
535
+ device_choice = normalized_device_pref
536
+
537
+ device_argument: Any
538
+ dtype: Optional[torch.dtype] = None
539
+ if device_choice.startswith("cuda") and torch.cuda.is_available():
540
+ device_argument = device_choice
541
+ dtype = torch.bfloat16
542
+ elif device_choice == "mps" and getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
543
+ device_argument = "mps"
544
+ dtype = torch.float16
545
+ else:
546
+ device_argument = "cpu"
547
+ dtype = None
548
+
549
+ global TEXT_POSTPROCESS_PIPELINE, TEXT_POSTPROCESS_MODEL_ID
550
+
551
+ with TEXT_POSTPROCESS_PIPELINE_LOCK:
552
+ if (
553
+ TEXT_POSTPROCESS_PIPELINE is not None
554
+ and TEXT_POSTPROCESS_MODEL_ID == model_id
555
+ ):
556
+ return TEXT_POSTPROCESS_PIPELINE
557
+
558
+ model_source = ensure_local_model(model_id, hf_token=hf_token)
559
+
560
+ is_medgemma = "medgemma" in model_id.lower()
561
+
562
+ if is_medgemma:
563
+ pipe_kwargs: Dict[str, Any] = {
564
+ "task": "image-text-to-text",
565
+ "model": model_source,
566
+ "device": device_argument,
567
+ }
568
+ if dtype is not None:
569
+ pipe_kwargs["torch_dtype"] = dtype
570
+ else:
571
+ pipe_kwargs = {
572
+ "task": "text-generation",
573
+ "model": model_source,
574
+ "device": device_argument,
575
+ "tokenizer": model_source,
576
+ }
577
+ if dtype is not None:
578
+ pipe_kwargs["torch_dtype"] = dtype
579
+ if device_argument != "cpu":
580
+ pipe_kwargs["device_map"] = "auto"
581
+
582
+ logger.info(
583
+ "Loading postprocess pipeline for %s with device=%s, dtype=%s",
584
+ model_id,
585
+ device_argument,
586
+ str(dtype) if dtype is not None else "auto",
587
+ )
588
+
589
+ try:
590
+ postprocess_pipe = pipeline(**pipe_kwargs)
591
+ except Exception as primary_error:
592
+ logger.warning(
593
+ "Postprocess pipeline init failed on %s (%s). Falling back to CPU.",
594
+ device_argument,
595
+ primary_error,
596
+ )
597
+ pipe_kwargs["device"] = "cpu"
598
+ pipe_kwargs.pop("torch_dtype", None)
599
+ pipe_kwargs.pop("device_map", None)
600
+ postprocess_pipe = pipeline(**pipe_kwargs)
601
+
602
+ TEXT_POSTPROCESS_PIPELINE = postprocess_pipe
603
+ TEXT_POSTPROCESS_MODEL_ID = model_id
604
+ return postprocess_pipe
605
+
606
+
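A minimal warm-up sketch for the loader above, assuming the same environment variables; the pipeline is cached module-wide behind TEXT_POSTPROCESS_PIPELINE_LOCK, so repeated calls with the same model id are cheap:

# Illustrative warm-up call; device/dtype failures fall back to CPU inside the loader.
import os
pipe = get_text_postprocess_pipeline(
    model_id=os.environ.get("TEXT_POSTPROCESS_MODEL_ID", DEFAULT_TEXT_POSTPROCESS_MODEL),
    device_pref=os.environ.get("TEXT_POSTPROCESS_DEVICE", "auto"),
    hf_token=os.environ.get("TEXT_POSTPROCESS_HF_TOKEN") or os.environ.get("HF_TOKEN"),
)
# A second call with the same model_id returns the cached TEXT_POSTPROCESS_PIPELINE instance.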
607
+ def postprocess_transcription_text(
608
+ text: str,
609
+ context_label: str,
610
+ ) -> str:
611
+ """Run MedGemma post-processing to clean transcription text."""
612
+
613
+ if not text or not text.strip():
614
+ return text
615
+
616
+ logger = logging.getLogger(__name__)
617
+
618
+ text_postprocess_enabled = get_env_or_secret("TEXT_POSTPROCESS_ENABLED", "false").lower() in {
619
+ "1",
620
+ "true",
621
+ "yes",
622
+ }
623
+ if not text_postprocess_enabled:
624
+ logger.debug(
625
+ "Text post-processing skipped for %s: feature disabled",
626
+ context_label,
627
+ )
628
+ return text
629
+
630
+ model_id = get_env_or_secret("TEXT_POSTPROCESS_MODEL_ID", DEFAULT_TEXT_POSTPROCESS_MODEL)
631
+ if not model_id:
632
+ logger.info("Text post-processing disabled: no model configured")
633
+ return text
634
+
635
+ hf_token = get_env_or_secret("TEXT_POSTPROCESS_HF_TOKEN") or get_env_or_secret(
636
+ "HF_TOKEN"
637
+ )
638
+ device_pref = get_env_or_secret("TEXT_POSTPROCESS_DEVICE", "auto")
639
+ max_new_tokens = int(get_env_or_secret("TEXT_POSTPROCESS_MAX_NEW", "200"))
640
+
641
+ prompt_body = text.strip()
642
+ prompt = f"{TEXT_POSTPROCESS_PROMPT}{prompt_body}\nRisultato:"
643
+ is_medgemma = "medgemma" in model_id.lower()
644
+
645
+ try:
646
+ postprocess_pipe = get_text_postprocess_pipeline(
647
+ model_id=model_id,
648
+ device_pref=device_pref,
649
+ hf_token=hf_token,
650
+ )
651
+
652
+ if is_medgemma:
653
+ system_prompt, separator, _ = TEXT_POSTPROCESS_PROMPT.partition("\nTesto originale:\n")
654
+ if not separator:
655
+ system_prompt = TEXT_POSTPROCESS_PROMPT
656
+ user_prefix = ""
657
+ else:
658
+ user_prefix = "Testo originale:\n"
659
+ system_prompt = system_prompt.strip()
660
+ messages = [
661
+ {
662
+ "role": "system",
663
+ "content": [{"type": "text", "text": system_prompt.strip()}],
664
+ },
665
+ {
666
+ "role": "user",
667
+ "content": [
668
+ {
669
+ "type": "text",
670
+ "text": f"{user_prefix}{prompt_body}\nRisultato:",
671
+ }
672
+ ],
673
+ },
674
+ ]
675
+
676
+ outputs = postprocess_pipe(
677
+ text=messages,
678
+ max_new_tokens=max_new_tokens,
679
+ )
680
+
681
+ generated_text = ""
682
+ if isinstance(outputs, list) and outputs:
683
+ first = outputs[0]
684
+ if isinstance(first, dict):
685
+ generated = first.get("generated_text")
686
+ if isinstance(generated, list):
687
+ # Prefer the latest assistant-like turn
688
+ for msg in reversed(generated):
689
+ if not isinstance(msg, dict):
690
+ continue
691
+ role = msg.get("role")
692
+ if role not in {"assistant", "model", None}:
693
+ continue
694
+ content = msg.get("content")
695
+ if isinstance(content, list):
696
+ for block in content:
697
+ if (
698
+ isinstance(block, dict)
699
+ and block.get("type") == "text"
700
+ ):
701
+ text_block = (block.get("text") or "").strip()
702
+ if text_block:
703
+ generated_text = text_block
704
+ break
705
+ if generated_text:
706
+ break
707
+ elif isinstance(content, str) and content.strip():
708
+ generated_text = content.strip()
709
+ break
710
+ if not generated_text:
711
+ # Fallback: use the last text block regardless of role
712
+ for msg in reversed(generated):
713
+ if not isinstance(msg, dict):
714
+ continue
715
+ content = msg.get("content")
716
+ if isinstance(content, list):
717
+ for block in content:
718
+ if (
719
+ isinstance(block, dict)
720
+ and block.get("type") == "text"
721
+ and block.get("text")
722
+ ):
723
+ generated_text = block["text"].strip()
724
+ break
725
+ if generated_text:
726
+ break
727
+ elif isinstance(content, str) and content.strip():
728
+ generated_text = content.strip()
729
+ break
730
+ elif isinstance(generated, str):
731
+ generated_text = generated.strip()
732
+ elif isinstance(outputs, dict):
733
+ generated = outputs.get("generated_text")
734
+ if isinstance(generated, list):
735
+ for msg in reversed(generated):
736
+ if isinstance(msg, dict):
737
+ text_block = (
738
+ msg.get("text")
739
+ or msg.get("content")
740
+ or ""
741
+ )
742
+ if isinstance(text_block, str) and text_block.strip():
743
+ generated_text = text_block.strip()
744
+ break
745
+ elif isinstance(generated, str):
746
+ generated_text = generated.strip()
747
+
748
+ cleaned = generated_text
749
+ else:
750
+ outputs = postprocess_pipe(
751
+ prompt,
752
+ max_new_tokens=max_new_tokens,
753
+ do_sample=False,
754
+ return_full_text=False,
755
+ )
756
+
757
+ generated_text = ""
758
+ if isinstance(outputs, list) and outputs:
759
+ first = outputs[0]
760
+ if isinstance(first, dict):
761
+ candidate = first.get("generated_text") or first.get("text")
762
+ if isinstance(candidate, str):
763
+ generated_text = candidate
764
+ elif isinstance(candidate, list):
765
+ generated_text = " ".join(
766
+ part for part in candidate if isinstance(part, str)
767
+ )
768
+ elif isinstance(first, str):
769
+ generated_text = first
770
+ elif isinstance(outputs, dict):
771
+ candidate = outputs.get("generated_text") or outputs.get("text")
772
+ if isinstance(candidate, str):
773
+ generated_text = candidate
774
+ elif isinstance(outputs, str):
775
+ generated_text = outputs
776
+
777
+ generated_text = (generated_text or "").strip()
778
+
779
+ if generated_text.startswith(prompt):
780
+ cleaned = generated_text[len(prompt) :].strip()
781
+ else:
782
+ cleaned = generated_text
783
+
784
+ if cleaned:
785
+ if cleaned.startswith(prompt_body):
786
+ cleaned = cleaned[len(prompt_body) :].strip()
787
+ if cleaned.startswith("Risultato:"):
788
+ cleaned = cleaned[len("Risultato:") :].strip()
789
+ if cleaned.lower().startswith("risultato:"):
790
+ cleaned = cleaned[len("risultato:") :].strip()
791
+ logger.debug("Post-processing successful for %s", context_label)
792
+ return cleaned or text
793
+
794
+ logger.warning("Post-processing returned empty output for %s", context_label)
795
+ return text
796
+
797
+ except Exception as exc:
798
+ logger.warning(
799
+ "Text post-processing failed for %s with model %s: %s",
800
+ context_label,
801
+ model_id,
802
+ exc,
803
+ )
804
+ return text
805
+
806
+
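A minimal usage sketch: the helper is a no-op when the feature flag is off and returns the original text on any failure, so callers never lose the raw transcription. The sample sentence is taken from Esempio 2 of the prompt:

# Illustrative call; "finetuned" is only a context label used in log messages.
cleaned = postprocess_transcription_text(
    "Il paziente Claudio Caletti riferisce che la dottoressa Neri gli ha prescritto Coumadin.",
    context_label="finetuned",
)
# With TEXT_POSTPROCESS_ENABLED unset or false, cleaned equals the input string unchanged.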
807
  @contextmanager
808
  def memory_monitor():
809
  """Context manager to monitor memory usage during inference."""
 
1032
  def transcribe_comparison(audio_file):
1033
  """Main function for Gradio interface."""
1034
  if audio_file is None:
1035
+ warning = "❌ Nessun file audio fornito"
1036
+ return warning, warning, warning
1037
 
1038
  # Model configuration
1039
  model_id = get_env_or_secret("HF_MODEL_ID")
 
1044
 
1045
  if not model_id or not base_model_id:
1046
  error_msg = "❌ Modelli non configurati. Impostare HF_MODEL_ID e BASE_WHISPER_MODEL_ID nelle variabili d'ambiente"
1047
+ return error_msg, error_msg, error_msg
1048
 
1049
  # Preprocessing sempre attivo: normalizzazione formato, volume, riduzione rumore, rimozione silenzi
1050
  # Viene applicato automaticamente prima della trascrizione con entrambi i modelli
 
1066
  finetuned_result = None
1067
  original_text = ""
1068
  finetuned_text = ""
1069
+ postprocessed_text = ""
1070
 
1071
  try:
1072
  # Transcribe with original model
 
1144
  except Exception as e:
1145
  finetuned_text = f"❌ Errore modello fine-tuned: {str(e)}"
1146
 
1147
+ postprocessed_text = finetuned_text or ""
1148
+
1149
  # GPU memory cleanup
1150
  if torch.cuda.is_available():
1151
  torch.cuda.empty_cache()
1152
  gc.collect()
1153
 
1154
+ return original_text, finetuned_text, postprocessed_text
1155
 
1156
  except Exception as e:
1157
  error_msg = f"❌ Errore generale: {str(e)}"
1158
+ return error_msg, error_msg, error_msg
1159
 
1160
 
1161
  # Gradio interface
 
1247
  show_copy_button=True,
1248
  )
1249
 
1250
+ # Post-processing disabilitato temporaneamente: manteniamo il widget ma nascosto
1251
+ medgemma_output = gr.Textbox(
1252
+ label="Testo finale",
1253
+ lines=12,
1254
+ interactive=False,
1255
+ show_copy_button=True,
1256
+ visible=False,
1257
+ )
1258
+
1259
  # Click event
1260
  transcribe_btn.click(
1261
  fn=transcribe_comparison,
1262
  inputs=[audio_input],
1263
+ outputs=[original_output, finetuned_output, medgemma_output],
1264
  show_progress=True,
1265
  )
1266
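The third output is wired but hidden in this commit; a minimal sketch of what re-enabling it might look like once post-processing is switched back on (illustrative, not part of this commit):

# Illustrative: surface the post-processed text again while keeping the three-value wiring above.
medgemma_output = gr.Textbox(
    label="Testo finale",
    lines=12,
    interactive=False,
    show_copy_button=True,
    visible=True,  # flip from False once TEXT_POSTPROCESS_ENABLED is set
)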