"""OphthalmoCapture — Audio Recorder & Transcription Component
Records audio via st.audio_input, transcribes with Whisper, stores the
audio bytes and transcription in the ephemeral session, and lets the
doctor edit the transcription or restore the original.
Includes timestamped segments from Whisper for reference.
"""
import hashlib
import streamlit as st
import database as db
from i18n import t
from services import session_manager as sm
from services.whisper_service import transcribe_audio_with_timestamps, format_timestamp
def _audio_fingerprint(audio_bytes: bytes) -> str:
"""Return a short hash of the audio content for change detection."""
return hashlib.md5(audio_bytes).hexdigest()
def render_recorder(image_id: str, model, language: str):
    """Render the audio recording + transcription panel for one image.

    Records audio via ``st.audio_input``, transcribes each *new* recording
    with Whisper (appending to any prior text), and offers an editable
    transcription plus "re-record" and "restore original" actions.

    Parameters
    ----------
    image_id : str
        UUID of the currently selected image; keys all per-image widget state.
    model :
        Loaded Whisper model instance, passed through to the transcription
        service.
    language : str
        ISO language code for transcription (e.g. "es").
    """
    img = st.session_state.images.get(image_id)
    if img is None:
        # Image record is missing (stale id) — nothing to render.
        return

    st.subheader(t("dictation"))

    # Track which audio blob we already processed so we don't re-transcribe
    # the same recording on every Streamlit rerun.
    processed_key = f"_last_audio_{image_id}"
    segments_key = f"_segments_{image_id}"

    # ── Handle button actions BEFORE widget rendering ────────────────────
    # Streamlit forbids setting a widget's session_state key after the widget
    # is instantiated. We use callback flags to detect button presses from
    # the *previous* rerun and apply state changes *before* the text_area.
    _rerecord_flag = f"_flag_rerecord_{image_id}"
    _restore_flag = f"_flag_restore_{image_id}"

    if st.session_state.pop(_rerecord_flag, False):
        # Re-record: discard audio, transcription, and all derived state.
        img["audio_bytes"] = None
        img["transcription"] = ""
        img["transcription_original"] = ""
        st.session_state.pop(segments_key, None)
        st.session_state.pop(processed_key, None)
        st.session_state.pop(f"audio_input_{image_id}", None)
        # Set value BEFORE the text_area is created
        st.session_state[f"transcription_area_{image_id}"] = ""
        sm.update_activity()

    if st.session_state.pop(_restore_flag, False):
        # Restore: throw away manual edits, revert to raw Whisper output.
        img["transcription"] = img["transcription_original"]
        # Set value BEFORE the text_area is created
        st.session_state[f"transcription_area_{image_id}"] = img["transcription_original"]
        sm.update_activity()

    # ── Audio recording ──────────────────────────────────────────────────
    audio_wav = st.audio_input(
        t("record_audio"),
        key=f"audio_input_{image_id}",
    )

    if audio_wav is not None:
        audio_bytes = audio_wav.getvalue()
        fingerprint = _audio_fingerprint(audio_bytes)

        # Only transcribe if this is a *new* recording (content changed)
        if st.session_state.get(processed_key) != fingerprint:
            with st.spinner(t("transcribing")):
                text, segments = transcribe_audio_with_timestamps(
                    model, audio_bytes, language
                )

            # Store in session
            img["audio_bytes"] = audio_bytes

            # Append (don't overwrite) if there was previous text
            if img["transcription"]:
                img["transcription"] += " " + text
            else:
                img["transcription"] = text

            # Keep a copy of the raw Whisper output so "restore original"
            # can undo manual edits later.
            if img["transcription_original"]:
                img["transcription_original"] += " " + text
            else:
                img["transcription_original"] = text

            # Store timestamped segments (accumulated across recordings)
            existing_segments = st.session_state.get(segments_key, [])
            st.session_state[segments_key] = existing_segments + segments

            # Mark this audio as processed using content hash (stable across reruns)
            st.session_state[processed_key] = fingerprint

            # Update the text_area widget state so it reflects the new text.
            # Safe here: the text_area has not been instantiated yet this run.
            st.session_state[f"transcription_area_{image_id}"] = img["transcription"]

            # Re-save to audit DB if the image is already labeled (upsert).
            # Best-effort by design: an audit-DB write failure must not break
            # the dictation flow. NOTE(review): failures are swallowed
            # silently — consider logging the exception.
            if img.get("label"):
                try:
                    db.save_or_update_annotation(
                        image_filename=img["filename"],
                        label=img["label"],
                        transcription=img["transcription"],
                        doctor_name=st.session_state.get("doctor_name", ""),
                        session_id=st.session_state.get("session_id", ""),
                        locs_data=img.get("locs_data", {}),
                    )
                except Exception:
                    pass

            sm.update_activity()
            # Force a rerun so all widgets render from the updated state.
            st.rerun()

    # ── Editable transcription ───────────────────────────────────────────
    edited_text = st.text_area(
        t("transcription_editable"),
        value=img["transcription"],
        height=180,
        key=f"transcription_area_{image_id}",
        placeholder=t("transcription_placeholder"),
    )

    # Sync edits back to session
    if edited_text != img["transcription"]:
        img["transcription"] = edited_text
        sm.update_activity()

    # ── Timestamped segments (Idea C) ────────────────────────────────────
    segments = st.session_state.get(segments_key, [])
    if segments:
        with st.expander(t("segments_timestamps"), expanded=False):
            for seg in segments:
                ts_start = format_timestamp(seg["start"])
                ts_end = format_timestamp(seg["end"])
                st.markdown(
                    f"`{ts_start} → {ts_end}` {seg['text']}"
                )

    # ── Helper buttons ───────────────────────────────────────────────────
    btn_cols = st.columns(2)
    with btn_cols[0]:
        # Re-record: set flag → rerun → flag handler above clears state
        has_audio = img["audio_bytes"] is not None
        if st.button(
            t("re_record"),
            key=f"rerecord_{image_id}",
            disabled=not has_audio,
            use_container_width=True,
        ):
            st.session_state[_rerecord_flag] = True
            st.rerun()
    with btn_cols[1]:
        # Restore original: set flag → rerun → flag handler above restores.
        # Disabled unless there is an original AND the text actually differs.
        has_original = bool(img["transcription_original"])
        is_different = img["transcription"] != img["transcription_original"]
        if st.button(
            t("restore_original"),
            key=f"restore_{image_id}",
            disabled=not (has_original and is_different),
            use_container_width=True,
        ):
            st.session_state[_restore_flag] = True
            st.rerun()

    # ── Status line ──────────────────────────────────────────────────────
    if img["transcription"]:
        modified_tag = ""
        if (
            img["transcription_original"]
            and img["transcription"] != img["transcription_original"]
        ):
            # Flag transcriptions the doctor has edited by hand.
            modified_tag = t("manually_modified")
        word_count = len(img["transcription"].split())
        st.caption(f"{t('word_count', count=word_count)}{modified_tag}")
    else:
        st.caption(t("no_transcription_yet"))