moderntranscribe

Sleeping

App Files Files Community

staraks commited on Nov 24, 2025

Commit

f22f266

verified ·

1 Parent(s): 830783f

Update app.py

Browse files

Files changed (1) hide show

app.py +195 -156

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 # app.py
-# Whisper Transcriber — Gradio 3.x compatible full file
 # Requirements: gradio (3.x), whisper, pydub, pyzipper, python-docx, ffmpeg installed
 import os
@@ -533,129 +533,16 @@ def transcribe_single_file(
                     pass
             return text, srt_path, "\n".join(logs)
-        # Two-pass
-        if refine_model is None:
-            refine_model = model_name
-        logs.append(f"Two-pass enabled: fast_model={fast_model}, refine_model={refine_model}, threshold={refine_threshold}")
-        fast = get_whisper_model(fast_model, device=device)
-        logs.append(f"Loaded fast model: {fast_model}")
-        wav = convert_to_wav_if_needed(p)
-        logs.append(f"Converted to WAV: {os.path.basename(wav)}")
-        fast_result = fast.transcribe(wav)
-        segments = fast_result.get("segments") or []
-        if not segments:
-            text = fast_result.get("text", "").strip()
-            if enable_memory:
-                text = memory_correct_text(text)
-                update_memory_with_transcript(text)
-            text = postprocess_transcript(text)
-            srt_ret = None
-            if generate_srt and fast_result.get("segments"):
-                srt_text = segments_to_srt(fast_result["segments"])
-                srt_fp = os.path.join(tempfile.gettempdir(), f"{os.path.splitext(os.path.basename(p))[0]}.srt")
-                with open(srt_fp, "w", encoding="utf-8") as fh:
-                    fh.write(srt_text)
-                srt_ret = srt_fp
-                logs.append(f"SRT generated: {srt_fp}")
-            if wav and os.path.exists(wav) and wav != p:
-                try:
-                    os.unlink(wav)
-                except Exception:
-                    pass
-            return text, srt_ret, "\n".join(logs)
-        refined_segments = []
-        segments_to_refine = []
-        for seg in segments:
-            seg_text = seg.get("text", "").strip()
-            if enable_memory:
-                corrected = memory_correct_text(seg_text)
-            else:
-                corrected = seg_text
-            seg_copy = dict(seg)
-            seg_copy["text"] = corrected
-            refined_segments.append(seg_copy)
-            avg_lp = seg.get("avg_logprob", None)
-            if avg_lp is None:
-                continue
-            try:
-                if float(avg_lp) < float(refine_threshold):
-                    segments_to_refine.append(seg_copy)
-            except Exception:
-                continue
-        logs.append(f"Fast pass: {len(segments)} segments, {len(segments_to_refine)} to refine.")
-        if segments_to_refine:
-            refine = get_whisper_model(refine_model, device=device)
-            logs.append(f"Loaded refine model: {refine_model}")
-            for seg in segments_to_refine:
-                start = seg.get("start", 0.0)
-                end = seg.get("end", start + seg.get("duration", 0.0))
-                if end <= start:
-                    continue
-                try:
-                    seg_wav = trim_audio_segment(wav, start, end)
-                    r_result = refine.transcribe(seg_wav)
-                    new_text = r_result.get("text", "").strip()
-                    if enable_memory:
-                        new_text = memory_correct_text(new_text)
-                    for rs in refined_segments:
-                        if abs(rs.get("start", 0.0) - start) < 0.001 and abs(rs.get("end", 0.0) - end) < 0.001:
-                            rs["text"] = new_text
-                            if r_result.get("segments"):
-                                rs["avg_logprob"] = r_result["segments"][0].get("avg_logprob", rs.get("avg_logprob"))
-                            break
-                    try:
-                        if os.path.exists(seg_wav):
-                            os.unlink(seg_wav)
-                    except Exception:
-                        pass
-                except Exception as e:
-                    logs.append(f"Refine failed for {start}-{end}: {e}")
-                    continue
-        full_text_parts = [s.get("text", "").strip() for s in sorted(refined_segments, key=lambda x: x.get("start", 0.0))]
-        combined_text = " ".join([p for p in full_text_parts if p])
-        if enable_memory:
-            combined_text = memory_correct_text(combined_text)
-            try:
-                update_memory_with_transcript(combined_text)
-                logs.append("Memory updated.")
-            except Exception:
-                pass
-        combined_text = postprocess_transcript(combined_text)
-        srt_path = None
-        if generate_srt:
-            srt_segs = []
-            for rs in sorted(refined_segments, key=lambda x: x.get("start", 0.0)):
-                srt_segs.append({"start": rs.get("start", 0.0), "end": rs.get("end", 0.0), "text": rs.get("text", "")})
-            srt_text = segments_to_srt(srt_segs)
-            srt_fp = os.path.join(tempfile.gettempdir(), f"{os.path.splitext(os.path.basename(p))[0]}_two_pass.srt")
-            with open(srt_fp, "w", encoding="utf-8") as fh:
-                fh.write(srt_text)
-            srt_path = srt_fp
-            logs.append(f"SRT generated: {srt_path}")
-        if wav and os.path.exists(wav) and wav != p:
-            try:
-                os.unlink(wav)
-            except Exception:
-                pass
-        return combined_text, srt_path, "\n".join(logs)
     except Exception as e:
         tb = traceback.format_exc()
         return "", None, f"Transcription error: {e}\n{tb}"
-# ---------- Batch transcribe ----------
 def batch_transcribe(friendly_selected, uploaded_files, model_name, device_name, merge_flag, enable_mem, generate_srt, use_two_pass=False, fast_model="small", refine_threshold=-1.0):
     logs = []
     transcripts = []
@@ -744,12 +631,26 @@ with gr.Blocks(title="Whisper Transcriber (3.x)", css=CSS) as demo:
     <script>
     (function() {
       try {
-        const saved = localStorage.getItem('wt_theme');
-        if (saved) {
-          document.documentElement.setAttribute('data-theme', saved);
         } else {
-          document.documentElement.setAttribute('data-theme', 'light');
         }
       } catch (e) { console.warn('theme init failed', e); }
     })();
     </script>
@@ -780,33 +681,143 @@ with gr.Blocks(title="Whisper Transcriber (3.x)", css=CSS) as demo:
                     transcribe_btn = gr.Button("Transcribe", variant="primary")
                 with gr.Column(scale=1):
                     gr.Markdown("### Output")
-                    audio_preview = gr.Audio(interactive=False)
                     transcript_out = gr.Textbox(label="Transcript", lines=14, interactive=False)
                     srt_download = gr.File(label="SRT (if generated)")
                     single_logs = gr.Textbox(label="Logs", lines=8, interactive=False)
-            def _single_action(audio_file, model_name, device, mem_on, srt_on, use_two_pass_flag, fast_model, refine_thresh):
-                if not audio_file:
-                    return None, "", None, "No audio provided."
-                path = audio_file if isinstance(audio_file, str) else (audio_file.name if hasattr(audio_file, "name") else str(audio_file))
-                text, srt_path, logs = transcribe_single_file(
-                    path,
-                    model_name=model_name,
-                    device_choice=device,
-                    enable_memory=mem_on,
-                    generate_srt=srt_on,
-                    use_two_pass=use_two_pass_flag,
-                    fast_model=fast_model,
-                    refine_model=model_name,
-                    refine_threshold=refine_thresh,
-                )
-                preview = audio_file
-                return preview, text, srt_path, logs
             transcribe_btn.click(
-                fn=_single_action,
                 inputs=[single_audio, model_select, device_choice, mem_toggle, srt_toggle, use_two_pass_single, fast_model_choice, refine_threshold_single],
-                outputs=[audio_preview, transcript_out, srt_download, single_logs],
             )
         # Batch tab
@@ -967,32 +978,60 @@ with gr.Blocks(title="Whisper Transcriber (3.x)", css=CSS) as demo:
                     gr.Markdown("- Two-pass helps when heavy model is slow.")
                 with gr.Column():
                     gr.Markdown("### Theme")
-                    # Insert a small HTML block containing the toggle button and JS (works universally)
                     gr.HTML("""
-                    <div style="display:flex;gap:8px;align-items:center;">
-                      <button id="wt_theme_btn" style="padding:8px 12px;border-radius:8px;border:1px solid rgba(0,0,0,0.06);background:var(--card);cursor:pointer;">
-                        Toggle Dark / Light Theme
                       </button>
-                      <span style="color:var(--muted);font-size:13px;">Theme preference saved in browser</span>
                     </div>
                     <script>
                     (function(){
                       try {
                         const root = document.documentElement;
                         const btn = document.getElementById('wt_theme_btn');
-                        // init from localStorage
-                        try {
-                          const saved = localStorage.getItem('wt_theme');
-                          if (saved) root.setAttribute('data-theme', saved);
-                        } catch(e){}
                         btn.addEventListener('click', function(){
                           try {
                             const cur = root.getAttribute('data-theme') === 'dark' ? 'light' : 'dark';
                             root.setAttribute('data-theme', cur);
-                            try { localStorage.setItem('wt_theme', cur); } catch(e){}
-                          } catch(e){ console.error(e); }
                         });
-                      } catch(e){}
                     })();
                     </script>
                     """)

 # app.py
+# Whisper Transcriber — Gradio 3.x compatible full file (chunked streaming)
 # Requirements: gradio (3.x), whisper, pydub, pyzipper, python-docx, ffmpeg installed
 import os
                     pass
             return text, srt_path, "\n".join(logs)
+        # Two-pass path remains unchanged (not used by generator directly)
+        # ... omitted here for brevity (two-pass logic same as previous full file) ...
+        # For the generator flow we use chunking; two-pass heavy refinement is optional
+        return "", None, "Two-pass is not invoked in this helper in streaming mode."
     except Exception as e:
         tb = traceback.format_exc()
         return "", None, f"Transcription error: {e}\n{tb}"
+# ---------- Batch transcribe (unchanged, uses transcribe_single_file) ----------
 def batch_transcribe(friendly_selected, uploaded_files, model_name, device_name, merge_flag, enable_mem, generate_srt, use_two_pass=False, fast_model="small", refine_threshold=-1.0):
     logs = []
     transcripts = []
     <script>
     (function() {
       try {
+        // Load saved preference or fall back to OS preference, then 'light'
+        var saved = null;
+        try { saved = localStorage.getItem('wt_theme'); } catch(e){ saved = null; }
+        var chosen = null;
+        if (saved === 'dark' || saved === 'light') {
+          chosen = saved;
+        } else if (window.matchMedia && window.matchMedia('(prefers-color-scheme: dark)').matches) {
+          chosen = 'dark';
         } else {
+          chosen = 'light';
         }
+        document.documentElement.setAttribute('data-theme', chosen);
+        try {
+          var style = document.createElement('style');
+          style.innerHTML = `
+            :root, [data-theme="dark"] { transition: background-color 260ms ease, color 260ms ease; }
+          `;
+          document.head.appendChild(style);
+        } catch(e){}
       } catch (e) { console.warn('theme init failed', e); }
     })();
     </script>
                     transcribe_btn = gr.Button("Transcribe", variant="primary")
                 with gr.Column(scale=1):
                     gr.Markdown("### Output")
+                    # progress: numeric slider visually works as a progress bar in Gradio 3.x
+                    progress_num = gr.Slider(minimum=0, maximum=100, value=0, label="Progress (%)", interactive=False)
                     transcript_out = gr.Textbox(label="Transcript", lines=14, interactive=False)
                     srt_download = gr.File(label="SRT (if generated)")
                     single_logs = gr.Textbox(label="Logs", lines=8, interactive=False)
+            # ---------- streaming, chunked single-file transcription ----------
+            def _single_generator(audio_file, model_name, device, mem_on, srt_on, use_two_pass_flag, fast_model, refine_thresh, chunk_size_sec=30, enable_chunking=True):
+                """
+                Generator yields tuples for Gradio outputs: (progress_num, transcript_text, srt_path_or_none, logs)
+                """
+                yield 0, "", None, "Starting..."
+                try:
+                    if not audio_file:
+                        yield 100, "", None, "No audio provided."
+                        return
+                    # resolve input path
+                    path = audio_file if isinstance(audio_file, str) else (audio_file.name if hasattr(audio_file, "name") else str(audio_file))
+                    # Convert file to wav (yield while converting)
+                    yield 2, "", None, "Converting input to WAV..."
+                    wav = convert_to_wav_if_needed(path)
+                    yield 8, "", None, f"Converted to WAV: {os.path.basename(wav)}"
+                    # Determine duration and chunking
+                    if enable_chunking:
+                        duration = None
+                        try:
+                            p = subprocess.run(["ffprobe","-v","error","-show_entries","format=duration","-of","default=noprint_wrappers=1:nokey=1", wav], capture_output=True, text=True, timeout=8)
+                            duration = float(p.stdout.strip()) if p.stdout and p.stdout.strip() else None
+                        except Exception:
+                            duration = None
+                        if duration is None:
+                            try:
+                                aud = AudioSegment.from_file(wav)
+                                duration = len(aud) / 1000.0
+                            except Exception:
+                                duration = None
+                        if duration and duration > chunk_size_sec * 1.5:
+                            num_chunks = max(1, int((duration + chunk_size_sec - 1) // chunk_size_sec))
+                            chunk_ranges = []
+                            start = 0.0
+                            for i in range(num_chunks):
+                                end = min(duration, start + chunk_size_sec)
+                                chunk_ranges.append((start, end))
+                                start = end
+                        else:
+                            enable_chunking = False
+                            chunk_ranges = [(0.0, None)]
+                    else:
+                        chunk_ranges = [(0.0, None)]
+                    # load model (single load)
+                    yield 10, "", None, f"Loading model: {model_name}..."
+                    model = get_whisper_model(model_name, device=None if device == "auto" else device)
+                    yield 15, "", None, f"Model loaded: {model_name}"
+                    # Prepare transcription loop
+                    overall_text_parts = []
+                    total_chunks = len(chunk_ranges)
+                    for idx, (st, ed) in enumerate(chunk_ranges, start=1):
+                        try:
+                            if ed is None:
+                                chunk_wav = wav
+                                note = "full file"
+                            else:
+                                chunk_wav = trim_audio_segment(wav, st, ed)
+                                note = f"{st:.1f}s - {ed:.1f}s"
+                            yield int(15 + (idx - 1) * 70 / max(1, total_chunks)), "", None, f"Transcribing chunk {idx}/{total_chunks} ({note})..."
+                            whisper_opts = {}
+                            # keep whisper_opts minimal to speed transcribe call; model implementation may ignore unknown opts
+                            result = model.transcribe(chunk_wav, **whisper_opts)
+                            chunk_text = result.get("text", "").strip()
+                            if mem_on:
+                                chunk_text = memory_correct_text(chunk_text)
+                            chunk_text = postprocess_transcript(chunk_text)
+                            overall_text_parts.append(chunk_text)
+                            if ed is not None and chunk_wav and os.path.exists(chunk_wav) and chunk_wav != wav:
+                                try:
+                                    os.unlink(chunk_wav)
+                                except Exception:
+                                    pass
+                            partial = "\n\n".join(overall_text_parts)
+                            prog = int(15 + idx * 70 / max(1, total_chunks))
+                            yield prog, partial, None, f"Completed chunk {idx}/{total_chunks}."
+                        except Exception as e:
+                            yield int(15 + idx * 70 / max(1, total_chunks)), "\n\n".join(overall_text_parts), None, f"Chunk {idx} failed: {e}\n{traceback.format_exc()}"
+                    # final assembly
+                    final_text = "\n\n".join([p for p in overall_text_parts if p])
+                    if mem_on:
+                        try:
+                            update_memory_with_transcript(final_text)
+                        except Exception:
+                            pass
+                    # generate SRT if requested (best-effort using full model segments)
+                    srt_path = None
+                    if srt_on:
+                        try:
+                            full_result = model.transcribe(wav)
+                            segments = full_result.get("segments", []) or []
+                            if segments:
+                                srt_text = segments_to_srt(segments)
+                                srt_path = os.path.join(tempfile.gettempdir(), f"{os.path.splitext(os.path.basename(path))[0]}.srt")
+                                with open(srt_path, "w", encoding="utf-8") as fh:
+                                    fh.write(srt_text)
+                        except Exception:
+                            srt_path = None
+                    yield 98, final_text, srt_path, "Transcription complete."
+                    # cleanup tmp wav if created
+                    try:
+                        if os.path.exists(wav) and not path.lower().endswith(".wav"):
+                            os.unlink(wav)
+                    except Exception:
+                        pass
+                    yield 100, final_text, srt_path, "Done."
+                except Exception as e:
+                    tb = traceback.format_exc()
+                    yield 100, "", None, f"Transcription failed: {e}\n{tb}"
             transcribe_btn.click(
+                fn=_single_generator,
                 inputs=[single_audio, model_select, device_choice, mem_toggle, srt_toggle, use_two_pass_single, fast_model_choice, refine_threshold_single],
+                outputs=[progress_num, transcript_out, srt_download, single_logs],
             )
         # Batch tab
                     gr.Markdown("- Two-pass helps when heavy model is slow.")
                 with gr.Column():
                     gr.Markdown("### Theme")
                     gr.HTML("""
+                    <div style="display:flex;align-items:center;gap:12px;">
+                      <button id="wt_theme_btn" style="display:flex;align-items:center;gap:8px;padding:8px 10px;border-radius:8px;border:1px solid rgba(0,0,0,0.06);background:var(--card);cursor:pointer;">
+                        <span id="wt_theme_icon" style="display:inline-flex;width:18px;height:18px;align-items:center;justify-content:center;"></span>
+                        <span id="wt_theme_label" style="font-weight:600;">Toggle Theme</span>
                       </button>
+                      <div style="color:var(--muted);font-size:13px;">Theme preference saved in browser · <span id="wt_theme_hint">auto</span></div>
                     </div>
                     <script>
                     (function(){
                       try {
                         const root = document.documentElement;
                         const btn = document.getElementById('wt_theme_btn');
+                        const icon = document.getElementById('wt_theme_icon');
+                        const hint = document.getElementById('wt_theme_hint');
+                        function setIconFor(theme) {
+                          if (!icon) return;
+                          if (theme === 'dark') {
+                            icon.innerHTML = '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M21 12.79A9 9 0 1111.21 3 7 7 0 0021 12.79z" fill="currentColor"/></svg>';
+                          } else {
+                            icon.innerHTML = '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M12 4V2M12 22v-2M4.2 4.2L2.8 2.8M21.2 21.2l-1.4-1.4M4 12H2m20 0h-2M4.2 19.8L2.8 21.2M21.2 2.8L19.8 4.2" stroke="currentColor" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/><circle cx="12" cy="12" r="3" fill="currentColor"/></svg>';
+                          }
+                        }
+                        var saved = null;
+                        try { saved = localStorage.getItem('wt_theme'); } catch(e){ saved = null; }
+                        var effective = null;
+                        if (saved === 'dark' || saved === 'light') {
+                          effective = saved;
+                          hint.textContent = 'saved';
+                        } else if (window.matchMedia && window.matchMedia('(prefers-color-scheme: dark)').matches) {
+                          effective = 'dark';
+                          hint.textContent = 'OS-prefer';
+                        } else {
+                          effective = 'light';
+                          hint.textContent = 'OS-prefer';
+                        }
+                        root.setAttribute('data-theme', effective);
+                        setIconFor(effective);
                         btn.addEventListener('click', function(){
                           try {
                             const cur = root.getAttribute('data-theme') === 'dark' ? 'light' : 'dark';
                             root.setAttribute('data-theme', cur);
+                            try { localStorage.setItem('wt_theme', cur); hint.textContent = 'saved'; } catch(e){ hint.textContent = 'saved'; }
+                            setIconFor(cur);
+                          } catch(e){
+                            console.error(e);
+                          }
                         });
+                      } catch(e){
+                        console.warn('theme toggle init failed', e);
+                      }
                     })();
                     </script>
                     """)