# Naija_MedModel / app.py
# Source: Hugging Face Space (Ephraimmm) — commit 044b2dd ("Update app.py", verified)
"""
NaijaMedModel v1.0 β€” ZeroGPU (H200) Gradio Space
Changes vs v1:
- Fixed: system prompt was a tuple not a string (caused Jinja2 crash).
- Fixed: gender leaking into SOAP output.
Layer 1 β€” system prompt explicitly forbids gender inference.
Layer 2 β€” user turn repeats the prohibition.
Layer 3 β€” _strip_gender() post-processes the raw model output
with regex before rendering, catching anything the model
still writes despite the instructions.
"""
import torch, os, tempfile, re
_orig_load = torch.load


def _patched_load(f, *args, **kwargs):
    """Wrapper around ``torch.load`` defaulting ``weights_only`` to False.

    Newer torch releases default ``weights_only=True``, which refuses
    checkpoints containing pickled Python objects (e.g. pyannote models).
    SECURITY NOTE: ``weights_only=False`` executes arbitrary pickled code,
    so this is only acceptable because every model ID below is hard-coded
    and trusted.

    Uses ``setdefault`` so a caller that explicitly requests
    ``weights_only=True`` is still honoured; only the default is changed.
    """
    kwargs.setdefault("weights_only", False)
    return _orig_load(f, *args, **kwargs)


torch.load = _patched_load
# Compatibility shim: rewrite the legacy ``use_auth_token`` kwarg to
# ``token`` before huggingface_hub's deprecation validator handles it
# (without clobbering an explicitly supplied ``token``).  Best-effort:
# the private ``_validators`` layout differs across hub versions, hence
# the broad except that simply skips the patch when it does not apply.
try:
    from huggingface_hub.utils import _validators as _hf_v
    _orig_smooth = _hf_v.smoothly_deprecate_legacy_arguments

    def _patched_smooth(fn_name, kwargs):
        # Map use_auth_token -> token, then defer to the original handler.
        if "use_auth_token" in kwargs:
            v = kwargs.pop("use_auth_token")
            if "token" not in kwargs and v is not None:
                kwargs["token"] = v
        return _orig_smooth(fn_name, kwargs)

    _hf_v.smoothly_deprecate_legacy_arguments = _patched_smooth
except Exception:
    pass
import numpy as np
import soundfile as sf
from scipy.signal import resample
import gradio as gr
import spaces
# Hugging Face Hub model IDs used by the pipeline.
ASR_EN_MODEL = "Ephraimmm/asrfinetuned"          # fine-tuned English ASR
ASR_YO_MODEL = "NCAIR1/Yoruba-ASR"               # Yoruba ASR
WHISPER_MODEL = "openai/whisper-large-v3"        # fallback ASR / speech translation
TRANSLATE_MODEL = "Helsinki-NLP/opus-mt-yo-en"   # Yoruba -> English text MT
DIAR_MODEL = "pyannote/speaker-diarization-3.1"  # speaker diarization
SOAP_MODEL = "Edifon/SOAP_SFT_V1"                # SOAP-note generation LLM
HF_TOKEN = os.environ.get("HF_TOKEN", "")        # empty string when unset
# Choices for the speaker-configuration dropdowns in the UI.
ROLE_OPTIONS = ["Doctor", "Patient", "Parent / Guardian", "Nurse", "Interpreter", "Other"]
ORDER_OPTIONS = ["1st speaker", "2nd speaker", "3rd speaker", "4th speaker"]
# Process-wide model cache, filled lazily by _get_models().
_models: dict = {}
# ══════════════════════════════════════════════════════════════════════════
# MODEL LOADING
# ══════════════════════════════════════════════════════════════════════════
def _get_models() -> dict:
    """Lazily load every model into the process-wide ``_models`` cache.

    Called from inside the GPU-allocated ``run_pipeline`` so all weights
    land on CUDA.  Subsequent calls return the already-populated cache.
    """
    if _models:
        return _models
    # Imports deferred so the heavy ML stacks are only pulled in when a
    # GPU worker actually needs them.
    from transformers import (
        pipeline as hf_pipeline,
        AutoProcessor,
        AutoModelForSpeechSeq2Seq,
        MarianMTModel,
        MarianTokenizer,
        AutoTokenizer,
        AutoModelForCausalLM,
    )
    from pyannote.audio import Pipeline as DiarizationPipeline
    import whisper as _whisper
    print("⏳ Loading English ASR…")
    _models["asr_en"] = hf_pipeline(
        "automatic-speech-recognition", model=ASR_EN_MODEL,
        device="cuda", token=HF_TOKEN,
    )
    print("⏳ Loading Yoruba ASR…")
    _models["yo_processor"] = AutoProcessor.from_pretrained(ASR_YO_MODEL)
    _models["yo_model"] = AutoModelForSpeechSeq2Seq.from_pretrained(
        ASR_YO_MODEL, torch_dtype=torch.float16
    ).to("cuda")
    print("⏳ Loading Whisper large-v3…")
    # Whisper is configured for direct speech-to-English translation.
    _models["whisper_pipe"] = hf_pipeline(
        "automatic-speech-recognition", model=WHISPER_MODEL,
        torch_dtype=torch.float16, device="cuda",
        generate_kwargs={"task": "translate", "language": None},
    )
    print("⏳ Loading Yorubaβ†’English MT…")
    # MT is optional: on failure the keys are set to None and run_pipeline
    # falls back to Whisper's built-in speech translation.
    try:
        _models["mt_tokenizer"] = MarianTokenizer.from_pretrained(TRANSLATE_MODEL)
        _models["mt_model"] = MarianMTModel.from_pretrained(
            TRANSLATE_MODEL, torch_dtype=torch.float16
        ).to("cuda")
    except Exception as e:
        print(f"⚠️ MT skipped: {e}")
        _models["mt_tokenizer"] = None
        _models["mt_model"] = None
    print("⏳ Loading diarization…")
    diar = DiarizationPipeline.from_pretrained(DIAR_MODEL, use_auth_token=HF_TOKEN)
    _models["diar"] = diar.to(torch.device("cuda"))
    print("⏳ Loading Whisper tiny…")
    # Tiny Whisper is used only for per-segment language detection.
    _models["lang_model"] = _whisper.load_model("tiny", device="cuda")
    print("⏳ Loading SOAP model…")
    _models["soap_processor"] = AutoTokenizer.from_pretrained(SOAP_MODEL, token=HF_TOKEN)
    _models["soap_model"] = AutoModelForCausalLM.from_pretrained(
        SOAP_MODEL, torch_dtype=torch.bfloat16, device_map="cuda", token=HF_TOKEN,
    )
    # Keep the whisper module itself handy for pad_or_trim / mel helpers.
    _models["whisper_lib"] = _whisper
    print("βœ… All models ready.")
    return _models
# ══════════════════════════════════════════════════════════════════════════
# SPEAKER MAPPING β€” order-based
# ══════════════════════════════════════════════════════════════════════════
def _get_chunk(arr, start, end, sr=16000):
return arr[int(start * sr): int(end * sr)]
def _build_speaker_map(segs: list[dict], speaker_config: list[dict]) -> dict[str, str]:
"""
pyannote 3.x labels clusters in strict order of first appearance:
SPEAKER_00 = first voice heard, SPEAKER_01 = second, etc.
We sort the user config by their chosen speaking order and zip.
"""
all_clusters = sorted({s["speaker"] for s in segs})
sorted_cfg = sorted(speaker_config, key=lambda c: c["order"])
sp_map: dict[str, str] = {}
for cluster, cfg in zip(all_clusters, sorted_cfg):
sp_map[cluster] = cfg["role"]
for i, cluster in enumerate(all_clusters):
if cluster not in sp_map:
sp_map[cluster] = f"Speaker {i + 1}"
return sp_map
# ══════════════════════════════════════════════════════════════════════════
# GENDER STRIPPING β€” Layer 3 (deterministic safety net)
# ══════════════════════════════════════════════════════════════════════════
def _strip_gender(text: str) -> str:
"""
Remove gendered language the SOAP model writes despite prompt instructions.
This is a hard post-processing pass β€” it does not rely on model compliance.
Applied to the raw model output before any markdown conversion.
"""
# Remove demographic descriptors: "a male", "a female", "a young man", etc.
text = re.sub(
r',?\s*\ban?\s+(male|female|man|woman|boy|girl|gentleman|lady)\b',
'', text, flags=re.IGNORECASE
)
# Remove age+gender combos: "35-year-old male", "elderly woman"
text = re.sub(
r'\b\d+[-\s]year[-\s]old\s+(male|female|man|woman|boy|girl)\b',
'', text, flags=re.IGNORECASE
)
text = re.sub(
r'\b(elderly|young|middle[-\s]aged)\s+(male|female|man|woman|boy|girl)\b',
'', text, flags=re.IGNORECASE
)
# Replace gendered pronouns
text = re.sub(r'\bhe\b', 'the patient', text, flags=re.IGNORECASE)
text = re.sub(r'\bshe\b', 'the patient', text, flags=re.IGNORECASE)
text = re.sub(r'\bhis\b', "the patient's", text, flags=re.IGNORECASE)
text = re.sub(r'\bher\b', "the patient's", text, flags=re.IGNORECASE)
text = re.sub(r'\bhim\b', 'the patient', text, flags=re.IGNORECASE)
# Clean up any double spaces left behind
text = re.sub(r' +', ' ', text)
text = re.sub(r' ,', ',', text)
return text.strip()
# ══════════════════════════════════════════════════════════════════════════
# HELPERS
# ══════════════════════════════════════════════════════════════════════════
def _clean_fillers(text: str) -> str:
return re.sub(r'\b(emm?|erm+|uhh?|umm?|ah+)\b', '', text, flags=re.IGNORECASE).strip()
def to_markdown(text: str) -> str:
    """Convert bare SOAP section labels (S:, O:, A:, P:) into markdown headings.

    Labels match case-insensitively but only when NOT preceded by a letter.
    Fix: the previous unanchored pattern also matched the tail of ordinary
    words — e.g. the "s:" inside "Vitals:" became "## Subjective" — which
    corrupted the rendered note.
    """
    sections = {
        "S:": "## Subjective",
        "O:": "## Objective",
        "A:": "## Assessment",
        "P:": "## Plan",
    }
    for tag, heading in sections.items():
        # (?<![A-Za-z]) stops the tag matching inside a longer word.
        text = re.sub(rf"(?i)(?<![A-Za-z]){re.escape(tag)}", f"\n\n{heading}\n", text)
    return text.strip()
def _run_whisper_translate(m: dict, chunk: np.ndarray) -> str:
    """Translate a 16 kHz mono audio chunk straight to English text.

    Drives the Whisper large-v3 pipeline's components (feature extractor,
    model, tokenizer) directly rather than through the pipeline wrapper.
    """
    pipe = m["whisper_pipe"]
    features = pipe.feature_extractor(
        chunk, sampling_rate=16000, return_tensors="pt"
    ).input_features.to("cuda", dtype=torch.float16)
    with torch.no_grad():
        generated = pipe.model.generate(
            features, task="translate", language=None, return_timestamps=False
        )
    decoded = pipe.tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded[0].strip()
# ══════════════════════════════════════════════════════════════════════════
# MAIN GPU PIPELINE
# ══════════════════════════════════════════════════════════════════════════
@spaces.GPU(duration=300)
def run_pipeline(wav_path: str, speaker_config: list[dict]) -> str:
    """Full consultation pipeline on the GPU worker.

    Steps: diarize -> per-segment language detection -> ASR (and translation
    to English where needed) -> SOAP-note generation -> gender stripping ->
    markdown rendering.

    Args:
        wav_path: path to a 16 kHz mono WAV file (prepared by process_audio).
        speaker_config: one {"role": str, "order": int} dict per expected
            speaker.

    Returns:
        The SOAP note as markdown, or a "⚠️ ..." message on empty results.
    """
    m = _get_models()
    _whisper = m["whisper_lib"]
    # Constrain diarization to the user-declared speaker count when given.
    num_speakers = len(speaker_config) if speaker_config else None
    diar_kwargs = {"num_speakers": num_speakers} if num_speakers else {}
    diar_result = m["diar"](wav_path, **diar_kwargs)
    segs = [
        {"start": t.start, "end": t.end, "speaker": spk}
        for t, _, spk in diar_result.itertracks(yield_label=True)
    ]
    if not segs:
        return "⚠️ No speakers detected in the recording."
    arr, _ = sf.read(wav_path)
    arr = arr.astype(np.float32)
    sp_map = _build_speaker_map(segs, speaker_config)
    # ── ASR per segment ────────────────────────────────────────────────────
    transcript: list[dict] = []
    for seg in segs:
        chunk = _get_chunk(arr, seg["start"], seg["end"])
        if len(chunk) < 16000 * 0.5:
            # Skip segments shorter than half a second.
            continue
        # Detect the segment's language with tiny Whisper.
        padded = _whisper.pad_or_trim(chunk)
        mel = _whisper.log_mel_spectrogram(padded).to("cuda")
        _, probs = m["lang_model"].detect_language(mel)
        lang = max(probs, key=probs.get)
        if lang == "yo":
            # Yoruba: dedicated ASR, then text MT to English (or Whisper's
            # speech translation when the MT model failed to load).
            inputs = m["yo_processor"](chunk, sampling_rate=16000, return_tensors="pt")
            inputs = {
                k: v.to("cuda", dtype=torch.float16) if v.dtype == torch.float32 else v.to("cuda")
                for k, v in inputs.items()
            }
            with torch.no_grad():
                ids = m["yo_model"].generate(**inputs)
            yo_text = m["yo_processor"].batch_decode(ids, skip_special_tokens=True)[0].strip()
            if m["mt_model"]:
                tokens = m["mt_tokenizer"]([yo_text], return_tensors="pt", padding=True)
                tokens = {k: v.to("cuda") for k, v in tokens.items()}
                with torch.no_grad():
                    out = m["mt_model"].generate(**tokens)
                en_text = m["mt_tokenizer"].batch_decode(out, skip_special_tokens=True)[0].strip()
            else:
                en_text = _run_whisper_translate(m, chunk)
            transcript.append({**seg, "language": "yo", "text": en_text, "translated": True})
        elif lang != "en":
            # Any other language: Whisper translates directly to English.
            en_text = _run_whisper_translate(m, chunk)
            transcript.append({**seg, "language": lang, "text": en_text, "translated": True})
        else:
            # English: fine-tuned English ASR.
            fe = m["asr_en"].feature_extractor
            inputs = fe(chunk, sampling_rate=16000, return_tensors="pt",
                        truncation=True, return_attention_mask=True)
            inputs = {k: v.to("cuda") for k, v in inputs.items()}
            with torch.no_grad():
                ids = m["asr_en"].model.generate(
                    inputs["input_features"],
                    attention_mask=inputs["attention_mask"],
                    generation_config=m["asr_en"].model.generation_config,
                    language="english",
                )
            en_text = m["asr_en"].tokenizer.batch_decode(ids, skip_special_tokens=True)[0].strip()
            transcript.append({**seg, "language": "en", "text": en_text, "translated": False})
    if not transcript:
        return "⚠️ No speech could be transcribed from the recording."
    # ── Build flat transcript for SOAP prompt ──────────────────────────────
    flat = "\n".join(
        f"{sp_map.get(s['speaker'], s['speaker'])}: {_clean_fillers(s['text'])}"
        for s in transcript
    )
    roles_present = sorted(set(sp_map.values()))
    roles_desc = ", ".join(roles_present)
    # Role-specific guidance injected into the system prompt, keyed by the
    # roles actually present in this recording.
    _role_instructions: dict[str, str] = {
        "Doctor":
            "The Doctor's questions, examinations and clinical reasoning drive the Assessment and Plan.",
        "Patient":
            "The Patient's reported symptoms, history and concerns form the core of the Subjective section.",
        "Parent / Guardian":
            "The Parent/Guardian provides collateral history on behalf of the patient; "
            "include this under Subjective, clearly noted as collateral.",
        "Nurse":
            "The Nurse may contribute observed vital signs or measurements; include under Objective.",
        "Interpreter":
            "The Interpreter facilitates communication only; do not attribute clinical statements to them.",
        "Other":
            "Include contributions from other speakers only if clinically relevant.",
    }
    role_guidance = "\n".join(
        _role_instructions[r] for r in roles_present if r in _role_instructions
    )
    # ── Layer 1: system prompt — explicit gender prohibition ───────────────
    system_content = (
        "You are an expert medical professor generating medically accurate SOAP notes.\n"
        f"Speakers present: {roles_desc}.\n"
        f"{role_guidance}\n"
        "Output format: use exactly these section labels on their own line: "
        "S:, O:, A:, P: β€” no markdown, no bullet points inside the headers.\n"
        "CRITICAL RULE β€” GENDER: You must never state, infer, or imply the gender of any speaker. "
        "Do not use the words male, female, man, woman, boy, girl, gentleman, or lady to describe any speaker. "
        "Do not use gendered pronouns: he, she, his, her, him, hers. "
        "Always refer to the patient as 'the patient' and the clinician as 'the clinician'. "
        "If gender is mentioned anywhere in the transcript, ignore it completely."
    )
    # ── Layer 2: user turn — repeat prohibition so it is in recent context ─
    user_content = (
        "Create a SOAP note for this consultation.\n"
        "Reminder: use no gendered pronouns and no gender descriptors for any speaker. "
        "Write 'the patient' and 'the clinician' throughout.\n\n"
        f"### Transcript:\n{flat}\n"
    )
    tok = m["soap_processor"]
    msgs = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content},
    ]
    # apply_chat_template's return type varies across tokenizer versions;
    # normalise to a CUDA tensor of input ids.
    encoded = tok.apply_chat_template(msgs, add_generation_prompt=True, return_tensors="pt")
    if hasattr(encoded, "input_ids"):
        input_ids = encoded.input_ids.to("cuda")
    elif isinstance(encoded, dict):
        input_ids = encoded["input_ids"].to("cuda")
    else:
        input_ids = torch.tensor(encoded).unsqueeze(0).to("cuda")
    out_ids = m["soap_model"].generate(input_ids, max_new_tokens=2048, do_sample=False)
    # Decode only the newly generated tokens (drop the prompt prefix).
    soap = tok.decode(out_ids[0][input_ids.shape[1]:], skip_special_tokens=True)
    # ── Layer 3: post-process — deterministic gender stripping ────────────
    soap = _strip_gender(soap)
    return to_markdown(soap)
# ══════════════════════════════════════════════════════════════════════════
# GRADIO TOP-LEVEL HANDLER
# ══════════════════════════════════════════════════════════════════════════
def _order_str_to_int(order_str: str) -> int:
return {"1st speaker": 0, "2nd speaker": 1, "3rd speaker": 2, "4th speaker": 3}.get(order_str, 0)
def process_audio(
    audio_filepath,
    num_speakers: int,
    spk1_role: str, spk1_order: str,
    spk2_role: str, spk2_order: str,
    spk3_role: str, spk3_order: str,
    spk4_role: str, spk4_order: str,
) -> str:
    """Top-level Gradio handler for the Generate button.

    Normalises the uploaded audio to a 16 kHz mono WAV via soundfile/scipy,
    falling back to ffmpeg for containers soundfile cannot decode, then runs
    the GPU pipeline with the configured speakers.  The temp WAV is always
    removed afterwards.

    Returns the SOAP note as markdown, or a "⚠️ ..." message string.
    """
    if audio_filepath is None:
        return "⚠️ Please upload an audio file before generating."
    raw = [
        {"role": spk1_role, "order": _order_str_to_int(spk1_order)},
        {"role": spk2_role, "order": _order_str_to_int(spk2_order)},
        {"role": spk3_role, "order": _order_str_to_int(spk3_order)},
        {"role": spk4_role, "order": _order_str_to_int(spk4_order)},
    ]
    # Only the first num_speakers dropdown pairs are meaningful.
    speaker_config = raw[:int(num_speakers)]
    # mkstemp instead of the deprecated, race-prone tempfile.mktemp: the file
    # is created atomically; soundfile/ffmpeg then reopen it by name.
    fd, tmp_wav = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        arr, sr = sf.read(audio_filepath, always_2d=False)
        if sr != 16000:
            arr = resample(arr, int(len(arr) * 16000 / sr))
        if arr.ndim > 1:
            arr = arr.mean(axis=1)  # down-mix to mono
        arr = arr.astype(np.float32)
        sf.write(tmp_wav, arr, 16000)
    except Exception:
        # soundfile cannot decode every container (e.g. AAC/M4A) — let
        # ffmpeg handle the resample + down-mix instead ("-y" overwrites
        # the pre-created temp file).
        import subprocess
        subprocess.run(
            ["ffmpeg", "-i", audio_filepath, "-ar", "16000", "-ac", "1",
             tmp_wav, "-y", "-loglevel", "quiet"],
            check=True,
        )
    try:
        return run_pipeline(tmp_wav, speaker_config)
    finally:
        try:
            os.remove(tmp_wav)
        except Exception:
            pass
# ══════════════════════════════════════════════════════════════════════════
# CSS
# ══════════════════════════════════════════════════════════════════════════
# Custom dark-theme stylesheet injected into the Gradio Blocks app below.
# Pure presentation — no effect on the transcription/SOAP pipeline.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=DM+Sans:ital,wght@0,300;0,400;0,500;0,600&family=DM+Mono:wght@300;400;500&display=swap');
:root {
--bg-void: #080c10;
--bg-card: #111820;
--bg-input: #0a0f14;
--border-dim: rgba(0,220,180,0.08);
--border-bright: rgba(0,220,180,0.28);
--teal: #00dca4;
--teal-dim: #00b884;
--amber: #f5a623;
--txt-hi: #e8eef4;
--txt-mid: #8899aa;
--txt-lo: #4a5d6e;
--mono: 'DM Mono', monospace;
--sans: 'DM Sans', sans-serif;
--r-sm: 6px; --r-md: 10px; --r-lg: 16px; --r-xl: 22px;
}
* { box-sizing: border-box; }
body, .gradio-container, gradio-app, #root {
background: var(--bg-void) !important;
font-family: var(--sans) !important;
color: var(--txt-hi) !important;
}
.gradio-container { max-width: 980px !important; margin: 0 auto !important; padding: 0 24px 80px !important; }
::-webkit-scrollbar { width: 4px; }
::-webkit-scrollbar-thumb { background: var(--border-bright); border-radius: 99px; }
/* Hero */
#hero-wrap { text-align:center; padding:64px 0 48px; position:relative; }
#hero-wrap::before {
content:''; position:absolute; top:0; left:50%; transform:translateX(-50%);
width:560px; height:260px;
background:radial-gradient(ellipse at 50% 0%,rgba(0,220,164,.09) 0%,transparent 70%);
pointer-events:none;
}
.badge-row { display:flex; align-items:center; justify-content:center; gap:8px; margin-bottom:22px; }
.badge { display:inline-flex; align-items:center; gap:5px; font-size:11px; font-family:var(--mono); letter-spacing:.06em; text-transform:uppercase; padding:4px 12px; border-radius:99px; border:1px solid; }
.badge-teal { color:var(--teal); border-color:rgba(0,220,164,.3); background:rgba(0,220,164,.06); }
.badge-amber { color:var(--amber); border-color:rgba(245,166,35,.3); background:rgba(245,166,35,.06); }
.badge-v { color:var(--txt-lo); border-color:var(--border-dim); }
.pulse-dot { width:6px; height:6px; border-radius:50%; background:var(--teal); animation:pulse 2s ease-in-out infinite; display:inline-block; }
@keyframes pulse { 0%,100%{box-shadow:0 0 0 0 rgba(0,220,164,.6)} 50%{box-shadow:0 0 0 5px transparent} }
.hero-title { font-size:clamp(32px,5vw,52px); font-weight:300; letter-spacing:-.02em; line-height:1.1; margin:0 0 6px; }
.hero-title span { font-weight:600; color:var(--teal); }
.hero-sub { font-size:15px; color:var(--txt-mid); font-weight:300; max-width:520px; margin:12px auto 0; line-height:1.65; }
/* Stat + pipeline rows */
.stat-row { display:grid; grid-template-columns:repeat(3,1fr); gap:12px; margin-bottom:28px; }
.stat-card { background:var(--bg-card); border:1px solid var(--border-dim); border-radius:var(--r-md); padding:18px 20px; text-align:center; transition:border-color .25s; }
.stat-card:hover { border-color:var(--border-bright); }
.stat-num { font-size:26px; font-weight:500; color:var(--teal); font-family:var(--mono); line-height:1; margin-bottom:5px; }
.stat-label { font-size:11px; color:var(--txt-lo); text-transform:uppercase; letter-spacing:.08em; }
.pipeline-row { display:grid; grid-template-columns:repeat(4,1fr); gap:8px; margin-bottom:28px; }
.pipe-step { background:var(--bg-card); border:1px solid var(--border-dim); border-radius:var(--r-md); padding:14px 12px; text-align:center; }
.pipe-icon { font-size:18px; margin-bottom:6px; display:block; }
.pipe-name { font-size:10px; font-family:var(--mono); color:var(--txt-lo); text-transform:uppercase; letter-spacing:.08em; line-height:1.3; }
/* Section panels */
#config-section, #upload-section, #output-section {
background:var(--bg-card); border:1px solid var(--border-dim);
border-radius:var(--r-xl); padding:28px 32px; margin-bottom:16px;
position:relative; overflow:hidden;
}
#config-section::after, #upload-section::after {
content:''; position:absolute; inset:0;
background:radial-gradient(ellipse at 50% -20%,rgba(0,220,164,.04) 0%,transparent 60%);
pointer-events:none;
}
.section-label {
display:flex; align-items:center; gap:10px;
font-size:12px; font-family:var(--mono); color:var(--txt-lo);
text-transform:uppercase; letter-spacing:.1em; margin-bottom:18px;
}
.section-label::before { content:''; display:inline-block; width:20px; height:1px; background:var(--teal-dim); }
.section-hint { font-size:13px; color:var(--txt-mid); margin:0 0 22px; line-height:1.65; }
/* Speaker cards */
.spk-header { display:flex; align-items:center; gap:10px; margin-bottom:14px; }
.spk-avatar {
width:34px; height:34px; border-radius:50%;
display:flex; align-items:center; justify-content:center;
font-size:11px; font-weight:600; font-family:var(--mono); flex-shrink:0;
}
.av1 { background:rgba(0,220,164,.14); color:#00dca4; border:1px solid rgba(0,220,164,.3); }
.av2 { background:rgba(245,166,35,.12); color:#f5a623; border:1px solid rgba(245,166,35,.3); }
.av3 { background:rgba(168,141,240,.12);color:#a88df0; border:1px solid rgba(168,141,240,.3); }
.av4 { background:rgba(255,107,133,.10);color:#ff6b85; border:1px solid rgba(255,107,133,.3); }
.spk-name { font-size:12px; color:var(--txt-mid); font-family:var(--mono); }
.order-callout {
background: rgba(0,220,164,0.06);
border: 1px solid rgba(0,220,164,0.18);
border-radius: var(--r-md);
padding: 12px 16px;
font-size: 12px;
color: var(--txt-mid);
line-height: 1.6;
margin-bottom: 20px;
}
.order-callout strong { color: var(--teal); font-weight: 500; }
/* Format chips */
.format-hints { display:flex; gap:8px; flex-wrap:wrap; margin-top:14px; }
.fmt-chip { font-size:10px; font-family:var(--mono); color:var(--txt-lo); border:1px solid var(--border-dim); border-radius:4px; padding:3px 8px; }
/* Audio widget */
[data-testid="audio"] { background:var(--bg-input) !important; border:1.5px dashed rgba(0,220,164,.2) !important; border-radius:var(--r-md) !important; }
[data-testid="audio"]:hover { border-color:rgba(0,220,164,.45) !important; }
/* CTA */
#run-btn button {
width:100% !important; height:52px !important;
background:linear-gradient(135deg,#00c49a,#00dca4,#00b884) !important;
border:none !important; border-radius:var(--r-md) !important;
color:#03120d !important; font-family:var(--sans) !important;
font-size:15px !important; font-weight:600 !important;
transition:transform .15s,box-shadow .25s !important;
}
#run-btn button:hover { transform:translateY(-1px) !important; box-shadow:0 8px 32px rgba(0,220,164,.25) !important; }
#run-btn button:active { transform:translateY(0) !important; }
/* SOAP output */
#output-section::before {
content:''; position:absolute; top:0; right:0; width:300px; height:300px;
background:radial-gradient(ellipse at 100% 0%,rgba(0,220,164,.04) 0%,transparent 65%);
pointer-events:none;
}
#soap-out, .gr-markdown { background:transparent !important; border:none !important; color:var(--txt-hi) !important; font-family:var(--sans) !important; }
#soap-out h2 {
font-size:13px !important; font-family:var(--mono) !important; font-weight:500 !important;
letter-spacing:.12em !important; text-transform:uppercase !important; color:var(--teal) !important;
margin:28px 0 10px !important; padding-bottom:8px !important;
border-bottom:1px solid var(--border-dim) !important;
}
#soap-out h2::before { content:'// '; opacity:.4; }
#soap-out p { font-size:15px !important; line-height:1.75 !important; color:var(--txt-mid) !important; margin:0 0 8px !important; }
#soap-out ul,#soap-out ol { color:var(--txt-mid) !important; font-size:15px !important; line-height:1.75 !important; padding-left:20px !important; }
#soap-out em { color:var(--txt-lo) !important; font-style:normal !important; font-size:14px !important; display:block; text-align:center; margin-top:60px; }
#soap-out strong { color:var(--txt-hi) !important; font-weight:500 !important; }
#soap-out hr { border:none !important; border-top:1px solid var(--border-dim) !important; margin:20px 0 !important; }
/* Footer */
#footer-bar { margin-top:48px; padding-top:20px; border-top:1px solid var(--border-dim); display:flex; align-items:center; justify-content:space-between; flex-wrap:wrap; gap:12px; }
.footer-txt { font-size:12px; color:var(--txt-lo); font-family:var(--mono); }
.footer-authors { font-size:12px; color:var(--txt-lo); }
.footer-authors span { color:var(--teal-dim); }
/* Gradio resets */
.gr-block,.block,[data-testid] { background:transparent !important; border:none !important; box-shadow:none !important; padding:0 !important; }
.gr-box,.wrap { background:transparent !important; border:none !important; }
label.float { display:none !important; }
footer,.footer { display:none !important; }
select { background:var(--bg-input) !important; border:1px solid var(--border-dim) !important; color:var(--txt-hi) !important; border-radius:var(--r-sm) !important; font-family:var(--sans) !important; font-size:13px !important; }
select:focus { border-color:var(--border-bright) !important; outline:none !important; }
.gr-form label, label { color:var(--txt-mid) !important; font-size:12px !important; font-family:var(--mono) !important; }
input[type=range]::-webkit-slider-thumb { background:var(--teal) !important; }
"""
# ══════════════════════════════════════════════════════════════════════════
# GRADIO UI
# ══════════════════════════════════════════════════════════════════════════
# Build the Gradio UI: hero header, speaker configuration, upload, output.
with gr.Blocks(
    title="NaijaMedModel v1",
    css=custom_css,
    # Dark teal theme tuned to match the custom CSS above.
    theme=gr.themes.Base(
        primary_hue=gr.themes.colors.Color(
            c50="#e0faf3", c100="#b3f2e0", c200="#66e6c2",
            c300="#00dca4", c400="#00c49a", c500="#00b884",
            c600="#009a6d", c700="#007d57", c800="#005f41",
            c900="#003e2a", c950="#001f15", name="teal",
        ),
        neutral_hue=gr.themes.colors.slate,
        font=[gr.themes.GoogleFont("DM Sans"), "sans-serif"],
        font_mono=[gr.themes.GoogleFont("DM Mono"), "monospace"],
    ).set(
        body_background_fill="#080c10",
        block_background_fill="#111820",
        block_border_color="rgba(0,220,164,0.08)",
        block_label_text_color="#4a5d6e",
        input_background_fill="#0a0f14",
        button_primary_background_fill="#00dca4",
        button_primary_text_color="#03120d",
        slider_color="#00dca4",
        color_accent="#00dca4",
    ),
) as demo:
    # Static hero banner, stat cards and pipeline diagram.
    gr.HTML("""
<div id="hero-wrap">
<div class="badge-row">
<span class="badge badge-teal"><span class="pulse-dot"></span> Live</span>
<span class="badge badge-amber">ZeroGPU Β· H200</span>
<span class="badge badge-v">v1</span>
</div>
<h1 class="hero-title">Naija<span>Med</span>Model</h1>
<p class="hero-sub">
Bilingual clinical speech recognition for Nigeria.<br>
English &amp; Yoruba consultations β†’ structured SOAP notes, automatically.
</p>
</div>
<div class="stat-row">
<div class="stat-card"><div class="stat-num">2</div><div class="stat-label">Languages</div></div>
<div class="stat-card"><div class="stat-num">NO</div><div class="stat-label">External API calls</div></div>
<div class="stat-card"><div class="stat-num">SOAP</div><div class="stat-label">Output Format</div></div>
</div>
<div class="pipeline-row">
<div class="pipe-step"><span class="pipe-icon">πŸŽ™</span><div class="pipe-name">Speaker<br>Diarization</div></div>
<div class="pipe-step"><span class="pipe-icon">πŸ”</span><div class="pipe-name">Language<br>Detection</div></div>
<div class="pipe-step"><span class="pipe-icon">πŸ“</span><div class="pipe-name">ASR &amp;<br>Translation</div></div>
<div class="pipe-step"><span class="pipe-icon">πŸ₯</span><div class="pipe-name">SOAP<br>Generation</div></div>
</div>
""")
    # ── Section 01: Speaker config ─────────────────────────────────────────
    with gr.Group(elem_id="config-section"):
        gr.HTML('<div class="section-label">01 β€” Speaker Configuration</div>')
        gr.HTML("""
<p class="section-hint">
Who is in the room and <strong style="color:var(--teal)">when each person speaks first</strong>.
The system maps voices by speaking order β€” pyannote always labels the first voice
heard as Speaker 1, the second as Speaker 2, and so on.
</p>
<div class="order-callout">
<strong>How to use:</strong> Play the recording β€” did the doctor speak first,
or did the patient? Set "1st speaker" to whoever opened the conversation.
If you get it wrong, just change the dropdowns and press Generate again β€”
no need to re-upload the file.
</div>
""")
        num_speakers = gr.Slider(
            minimum=1, maximum=4, value=2, step=1,
            label="Number of speakers in the recording",
        )
        with gr.Row():
            with gr.Column():
                gr.HTML('<div class="spk-header"><div class="spk-avatar av1">S1</div><span class="spk-name">Speaker 1</span></div>')
                spk1_role = gr.Dropdown(choices=ROLE_OPTIONS, value="Doctor", label="Clinical role")
                spk1_order = gr.Dropdown(choices=ORDER_OPTIONS, value="1st speaker", label="Speaks first in recording")
            with gr.Column():
                gr.HTML('<div class="spk-header"><div class="spk-avatar av2">S2</div><span class="spk-name">Speaker 2</span></div>')
                spk2_role = gr.Dropdown(choices=ROLE_OPTIONS, value="Patient", label="Clinical role")
                spk2_order = gr.Dropdown(choices=ORDER_OPTIONS, value="2nd speaker", label="Speaks first in recording")
        # Speakers 3 & 4 stay hidden until the slider asks for 3+ speakers.
        with gr.Row(visible=False) as spk34_row:
            with gr.Column():
                gr.HTML('<div class="spk-header"><div class="spk-avatar av3">S3</div><span class="spk-name">Speaker 3</span></div>')
                spk3_role = gr.Dropdown(choices=ROLE_OPTIONS, value="Parent / Guardian", label="Clinical role")
                spk3_order = gr.Dropdown(choices=ORDER_OPTIONS, value="3rd speaker", label="Speaks first in recording")
            with gr.Column():
                gr.HTML('<div class="spk-header"><div class="spk-avatar av4">S4</div><span class="spk-name">Speaker 4</span></div>')
                spk4_role = gr.Dropdown(choices=ROLE_OPTIONS, value="Nurse", label="Clinical role")
                spk4_order = gr.Dropdown(choices=ORDER_OPTIONS, value="4th speaker", label="Speaks first in recording")
    # Toggle the extra speaker row from the slider value.
    num_speakers.change(
        fn=lambda n: gr.update(visible=(n >= 3)),
        inputs=num_speakers,
        outputs=spk34_row,
    )
    # ── Section 02: Upload ─────────────────────────────────────────────────
    with gr.Group(elem_id="upload-section"):
        gr.HTML('<div class="section-label">02 β€” Upload Consultation</div>')
        audio_in = gr.Audio(type="filepath", label="", show_label=False)
        gr.HTML("""
<div class="format-hints">
<span class="fmt-chip">MP3</span><span class="fmt-chip">WAV</span>
<span class="fmt-chip">AAC</span><span class="fmt-chip">M4A</span>
<span class="fmt-chip">OGG</span><span class="fmt-chip">FLAC</span>
</div>
""")
    btn = gr.Button("Generate SOAP Note", variant="primary", size="lg", elem_id="run-btn")
    # ── Section 03: Output ─────────────────────────────────────────────────
    with gr.Group(elem_id="output-section"):
        gr.HTML('<div class="section-label">03 β€” Clinical SOAP Note</div>')
        soap_out = gr.Markdown(
            value="*Configure speakers, upload audio, then press Generate.*",
            elem_id="soap-out",
        )
    gr.HTML("""
<div id="footer-bar">
<span class="footer-txt">First request ~2 min Β· Models load on demand Β· v1</span>
<span class="footer-authors">Built by <span>Analytics Intelligence</span> Β· Uche &amp; Jimmy</span>
</div>
""")
    # Wire the generate button to the top-level handler.
    btn.click(
        fn=process_audio,
        inputs=[
            audio_in, num_speakers,
            spk1_role, spk1_order,
            spk2_role, spk2_order,
            spk3_role, spk3_order,
            spk4_role, spk4_order,
        ],
        outputs=soap_out,
    )

if __name__ == "__main__":
    demo.launch()