Spaces:

openpecha
/

stt_demo

Running

App Files Files Community

ganga4364 committed on Jul 31, 2025

Commit

0191635

verified ·

1 Parent(s): 2a3adfa

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -163

app.py CHANGED Viewed

@@ -5,10 +5,6 @@ import torch
 import torchaudio
 import numpy as np
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-from datetime import timedelta
-import os
-import shutil
-from pathlib import Path
 import logging
 # Constants and Configuration
@@ -17,22 +13,12 @@ CHUNK_SECONDS = 30  # Split audio into 30-second chunks
 CHUNK_SAMPLES = SAMPLE_RATE * CHUNK_SECONDS
 MODEL_NAME = "openpecha/general_stt_base_model"
-title = "# Tibetan Speech-to-Text with Subtitles"
 description = """
-This application transcribes Tibetan audio files and generates subtitles using:
 - Wav2Vec2 model fine-tuned on Garchen Rinpoche's teachings
 - 30-second fixed chunking for long audio processing
-- Generates both SRT and WebVTT subtitle formats
-"""
-css = """
-.result {display:flex;flex-direction:column}
-.result_item {padding:15px;margin-bottom:8px;border-radius:15px;width:100%}
-.result_item_success {background-color:mediumaquamarine;color:white;align-self:start}
-.result_item_error {background-color:#ff7070;color:white;align-self:start}
-.player-container {margin: 20px 0;}
-.player-container audio {width: 100%;}
 """
 # Initialize model
@@ -47,73 +33,9 @@ def init_model():
 # Initialize model globally
 model, processor = init_model()
-def format_timestamp(seconds, format_type="srt"):
-    """Convert seconds to SRT or WebVTT timestamp format"""
-    td = timedelta(seconds=seconds)
-    hours = td.seconds // 3600
-    minutes = (td.seconds % 3600) // 60
-    seconds = td.seconds % 60
-    milliseconds = round(td.microseconds / 1000)
-    if format_type == "srt":
-        return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
-    else:  # webvtt
-        return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
-def create_subtitle_file(timestamps_with_text, output_path, format_type="srt"):
-    """Create SRT or WebVTT subtitle file"""
-    with open(output_path, 'w', encoding='utf-8') as f:
-        if format_type == "vtt":
-            f.write("WEBVTT\n\n")
-        for i, (start_time, end_time, text) in enumerate(timestamps_with_text, 1):
-            if format_type == "srt":
-                f.write(f"{i}\n")
-                f.write(f"{format_timestamp(start_time)} --> {format_timestamp(end_time)}\n")
-                f.write(f"{text}\n\n")
-            else:
-                f.write(f"{format_timestamp(start_time, 'vtt')} --> {format_timestamp(end_time, 'vtt')}\n")
-                f.write(f"{text}\n\n")
-def build_html_output(s: str, style: str = "result_item_success"):
-    return f"""
-    <div class='result'>
-        <div class='result_item {style}'>
-          {s}
-        </div>
-    </div>
-    """
-def create_preview_player(audio_path, vtt_path):
-    # Create an HTML preview with audio player and subtitles
-    # Convert file paths to relative URLs that Gradio can serve
-    audio_url = f"file={audio_path}"
-    vtt_url = f"file={vtt_path}"
-    html_content = f"""
-    <div class="audio-player">
-        <audio controls style="width: 100%;">
-            <source src="{audio_url}" type="audio/wav">
-            <track kind="subtitles" src="{vtt_url}" default>
-            Your browser does not support the audio element.
-        </audio>
-    </div>
-    """
-    return html_content
 def process_audio(audio_path: str):
     if audio_path is None or audio_path == "":
-        return (
-            build_html_output(
-                "Please upload an audio file first",
-                "result_item_error",
-            ),
-            None,
-            None,
-            "",
-            "",
-        )
     logging.info(f"Processing audio file: {audio_path}")
@@ -126,16 +48,11 @@ def process_audio(audio_path: str):
         # Split audio into 30-second chunks
         audio_length = wav.shape[0]
-        timestamps_with_text = []
         transcriptions = []
         for start_sample in range(0, audio_length, CHUNK_SAMPLES):
             end_sample = min(start_sample + CHUNK_SAMPLES, audio_length)
-            # Convert sample positions to seconds
-            start_time = start_sample / SAMPLE_RATE
-            end_time = end_sample / SAMPLE_RATE
             # Extract chunk
             chunk = wav[start_sample:end_sample]
@@ -153,92 +70,44 @@ def process_audio(audio_path: str):
             # Skip empty transcriptions
             if transcription.strip():
                 transcriptions.append(transcription)
-                timestamps_with_text.append((start_time, end_time, transcription))
-        if not timestamps_with_text:
-            return (
-                build_html_output("No speech detected or recognized", "result_item_error"),
-                None,
-                None,
-                "",
-                "",
-            )
-        # Generate subtitle files
-        base_path = os.path.splitext(audio_path)[0]
-        srt_path = f"{base_path}.srt"
-        vtt_path = f"{base_path}.vtt"
-        create_subtitle_file(timestamps_with_text, srt_path, "srt")
-        create_subtitle_file(timestamps_with_text, vtt_path, "vtt")
-        # Return the file paths directly
-        srt_file = srt_path
-        vtt_file = vtt_path
-        # Create preview player with the file paths
-        preview_html = create_preview_player(audio_path, vtt_path)
-        all_text = " ".join(transcriptions)
-        return (
-            build_html_output(
-                "Transcription completed! You can now:\n1. Download the SRT/VTT files\n2. Play the audio with subtitles below",
-                "result_item_success"
-            ),
-            srt_file,
-            vtt_file,
-            preview_html,
-            all_text,
-        )
     except Exception as e:
         logging.error(f"Error processing audio: {str(e)}")
-        return (
-            build_html_output(
-                f"Error processing audio: {str(e)}",
-                "result_item_error"
-            ),
-            None,
-            None,
-            "",
-            "",
-        )
-demo = gr.Blocks(css=css)
 with demo:
     gr.Markdown(title)
-    with gr.Tabs():
-        with gr.TabItem("Upload Audio"):
-            audio_input = gr.Audio(
-                sources=["upload"],
-                type="filepath",
-                label="Upload audio file",
-            )
-            process_button = gr.Button("Generate Subtitles")
-            with gr.Column():
-                info_output = gr.HTML(label="Status")
-                srt_output = gr.File(label="SRT Subtitle File")
-                vtt_output = gr.File(label="WebVTT Subtitle File")
-                preview_output = gr.HTML(label="Preview Player")
-                text_output = gr.Textbox(
-                    label="Full Transcription",
-                    placeholder="Transcribed text will appear here...",
-                    lines=5
-                )
-        process_button.click(
-            process_audio,
-            inputs=[audio_input],
-            outputs=[
-                info_output,
-                srt_output,
-                vtt_output,
-                preview_output,
-                text_output,
-            ],
         )
     gr.Markdown(description)

 import torchaudio
 import numpy as np
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import logging
 # Constants and Configuration
 CHUNK_SAMPLES = SAMPLE_RATE * CHUNK_SECONDS
 MODEL_NAME = "openpecha/general_stt_base_model"
+title = "# Tibetan Speech-to-Text"
 description = """
+This application transcribes Tibetan audio files using:
 - Wav2Vec2 model fine-tuned on Garchen Rinpoche's teachings
 - 30-second fixed chunking for long audio processing
 """
 # Initialize model
 # Initialize model globally
 model, processor = init_model()
 def process_audio(audio_path: str):
     if audio_path is None or audio_path == "":
+        return "Please upload an audio file first"
     logging.info(f"Processing audio file: {audio_path}")
         # Split audio into 30-second chunks
         audio_length = wav.shape[0]
         transcriptions = []
         for start_sample in range(0, audio_length, CHUNK_SAMPLES):
             end_sample = min(start_sample + CHUNK_SAMPLES, audio_length)
             # Extract chunk
             chunk = wav[start_sample:end_sample]
             # Skip empty transcriptions
             if transcription.strip():
                 transcriptions.append(transcription)
+        if not transcriptions:
+            return "No speech detected or recognized"
+        # Join all transcriptions
+        all_text = " ".join(transcriptions)
+        return all_text
     except Exception as e:
         logging.error(f"Error processing audio: {str(e)}")
+        return f"Error processing audio: {str(e)}"
+demo = gr.Blocks()
 with demo:
     gr.Markdown(title)
+    with gr.Row():
+        audio_input = gr.Audio(
+            sources=["upload"],
+            type="filepath",
+            label="Upload audio file",
         )
+    process_button = gr.Button("Transcribe Audio")
+    with gr.Row():
+        text_output = gr.Textbox(
+            label="Transcription",
+            placeholder="Transcribed text will appear here...",
+            lines=8
+        )
+    process_button.click(
+        process_audio,
+        inputs=[audio_input],
+        outputs=[text_output],
+    )
     gr.Markdown(description)