File size: 7,822 Bytes
b0c3a57
 
 
 
 
 
 
 
 
 
 
 
1f7c87f
b0c3a57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1f7c87f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b0c3a57
 
 
1f7c87f
b0c3a57
 
 
 
 
 
 
 
 
1f7c87f
b0c3a57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5b7432c
b0c3a57
 
 
 
 
 
 
 
 
1f7c87f
b0c3a57
 
 
1f7c87f
b0c3a57
 
 
 
 
 
 
 
 
 
1f7c87f
b0c3a57
 
 
 
 
 
 
 
1f7c87f
b0c3a57
 
1f7c87f
b0c3a57
 
1f7c87f
b0c3a57
 
 
 
1f7c87f
b0c3a57
 
 
1f7c87f
b0c3a57
 
 
1f7c87f
b0c3a57
 
 
 
1f7c87f
b0c3a57
 
 
 
 
 
 
 
 
1f7c87f
b0c3a57
1f7c87f
b0c3a57
1f7c87f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
"""OphthalmoCapture — Audio Recorder & Transcription Component

Records audio via st.audio_input, transcribes with Whisper, stores the
audio bytes and transcription in the ephemeral session, and lets the
doctor edit the transcription or restore the original.

Includes timestamped segments from Whisper for reference.
"""

import hashlib
import streamlit as st
import database as db
from i18n import t
from services import session_manager as sm
from services.whisper_service import transcribe_audio_with_timestamps, format_timestamp


def _audio_fingerprint(audio_bytes: bytes) -> str:
    """Return a short hash of the audio content for change detection."""
    return hashlib.md5(audio_bytes).hexdigest()


def render_recorder(image_id: str, model, language: str) -> None:
    """Render the audio recording + transcription panel.

    The panel shows an audio input, transcribes each *new* recording and
    appends the text to the image's transcription, exposes the text in an
    editable text area, lists the timestamped segments, and offers
    "re-record" and "restore original" buttons plus a word-count caption.

    Parameters
    ----------
    image_id : str
        UUID of the currently selected image. Used to look the image up in
        ``st.session_state.images`` and to namespace every widget/state key.
    model :
        Loaded Whisper model instance, forwarded to
        ``transcribe_audio_with_timestamps``.
    language : str
        ISO language code for transcription (e.g. "es").

    Returns
    -------
    None
        Returns early without rendering anything when ``image_id`` is not
        present in ``st.session_state.images``.
    """
    # Local import so the block does not depend on a file-level import.
    import logging

    img = st.session_state.images.get(image_id)
    if img is None:
        return

    st.subheader(t("dictation"))

    # Track which audio blob we already processed so we don't re-transcribe
    processed_key = f"_last_audio_{image_id}"
    segments_key = f"_segments_{image_id}"
    area_key = f"transcription_area_{image_id}"

    # ── Handle button actions BEFORE widget rendering ────────────────────
    # Streamlit forbids setting a widget's session_state key after the widget
    # is instantiated. We use callback flags to detect button presses from
    # the *previous* rerun and apply state changes *before* the text_area.
    _rerecord_flag = f"_flag_rerecord_{image_id}"
    _restore_flag = f"_flag_restore_{image_id}"

    if st.session_state.pop(_rerecord_flag, False):
        # Wipe everything derived from the previous recording(s).
        img["audio_bytes"] = None
        img["transcription"] = ""
        img["transcription_original"] = ""
        st.session_state.pop(segments_key, None)
        st.session_state.pop(processed_key, None)
        st.session_state.pop(f"audio_input_{image_id}", None)
        # Set value BEFORE the text_area is created
        st.session_state[area_key] = ""
        sm.update_activity()

    if st.session_state.pop(_restore_flag, False):
        img["transcription"] = img["transcription_original"]
        # Set value BEFORE the text_area is created
        st.session_state[area_key] = img["transcription_original"]
        sm.update_activity()

    # ── Audio recording ──────────────────────────────────────────────────
    audio_wav = st.audio_input(
        t("record_audio"),
        key=f"audio_input_{image_id}",
    )

    if audio_wav is not None:
        audio_bytes = audio_wav.getvalue()
        fingerprint = _audio_fingerprint(audio_bytes)

        # Only transcribe if this is a *new* recording (content changed)
        if st.session_state.get(processed_key) != fingerprint:
            with st.spinner(t("transcribing")):
                text, segments = transcribe_audio_with_timestamps(
                    model, audio_bytes, language
                )

            # Store in session
            img["audio_bytes"] = audio_bytes

            # Append (don't overwrite) if there was previous text
            if img["transcription"]:
                img["transcription"] += " " + text
            else:
                img["transcription"] = text

            # Keep a copy of the raw Whisper output
            if img["transcription_original"]:
                img["transcription_original"] += " " + text
            else:
                img["transcription_original"] = text

            # Store timestamped segments
            existing_segments = st.session_state.get(segments_key, [])
            st.session_state[segments_key] = existing_segments + segments

            # Mark this audio as processed using content hash (stable across reruns)
            st.session_state[processed_key] = fingerprint
            # Update the text_area widget state so it reflects the new text
            st.session_state[area_key] = img["transcription"]

            # Re-save to audit DB if the image is already labeled (upsert)
            if img.get("label"):
                try:
                    db.save_or_update_annotation(
                        image_filename=img["filename"],
                        label=img["label"],
                        transcription=img["transcription"],
                        doctor_name=st.session_state.get("doctor_name", ""),
                        session_id=st.session_state.get("session_id", ""),
                        locs_data=img.get("locs_data", {}),
                    )
                except Exception:
                    # Best-effort write: a DB failure must not interrupt
                    # dictation, but it should leave a trace instead of
                    # vanishing silently.
                    logging.getLogger(__name__).exception(
                        "Failed to save annotation for image %r",
                        img.get("filename"),
                    )

            sm.update_activity()
            st.rerun()

    # ── Editable transcription ───────────────────────────────────────────
    # The widget value is driven exclusively through session_state: seed it
    # only when the key is absent, and do not pass ``value=`` as well.
    # Supplying both a default and a Session State value makes Streamlit
    # emit a "created with a default value but also had its value set via
    # the Session State API" warning.
    if area_key not in st.session_state:
        st.session_state[area_key] = img["transcription"]

    edited_text = st.text_area(
        t("transcription_editable"),
        height=180,
        key=area_key,
        placeholder=t("transcription_placeholder"),
    )

    # Sync edits back to session
    if edited_text != img["transcription"]:
        img["transcription"] = edited_text
        sm.update_activity()

    # ── Timestamped segments (Idea C) ────────────────────────────────────
    segments = st.session_state.get(segments_key, [])
    if segments:
        with st.expander(t("segments_timestamps"), expanded=False):
            for seg in segments:
                ts_start = format_timestamp(seg["start"])
                ts_end = format_timestamp(seg["end"])
                # Separate start and end so the two timestamps do not run
                # together into one unreadable token.
                st.markdown(
                    f"`{ts_start} → {ts_end}`   {seg['text']}"
                )

    # ── Helper buttons ───────────────────────────────────────────────────
    btn_cols = st.columns(2)

    with btn_cols[0]:
        # Re-record: set flag → rerun → flag handler above clears state
        has_audio = img["audio_bytes"] is not None
        if st.button(
            t("re_record"),
            key=f"rerecord_{image_id}",
            disabled=not has_audio,
            use_container_width=True,
        ):
            st.session_state[_rerecord_flag] = True
            st.rerun()

    with btn_cols[1]:
        # Restore original: set flag → rerun → flag handler above restores
        has_original = bool(img["transcription_original"])
        is_different = img["transcription"] != img["transcription_original"]
        if st.button(
            t("restore_original"),
            key=f"restore_{image_id}",
            disabled=not (has_original and is_different),
            use_container_width=True,
        ):
            st.session_state[_restore_flag] = True
            st.rerun()

    # ── Status line ──────────────────────────────────────────────────────
    if img["transcription"]:
        # The "modified" tag is shown only when there is an original
        # transcription to compare against and the current text differs.
        modified_tag = ""
        if (
            img["transcription_original"]
            and img["transcription"] != img["transcription_original"]
        ):
            modified_tag = t("manually_modified")
        word_count = len(img["transcription"].split())
        st.caption(f"{t('word_count', count=word_count)}{modified_tag}")
    else:
        st.caption(t("no_transcription_yet"))