TristanBehrens commited on
Commit
de6325b
·
verified ·
1 Parent(s): 7f65a48

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +158 -0
  2. packages.txt +1 -0
  3. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import gradio as gr
from transformers import pipeline
import numpy as np
import time
from typing import Tuple
import logging
import torch


# Create a module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Fail fast if any required configuration is missing from the environment.
# NOTE: we check for a missing/empty value directly instead of the previous
# sentinel-default trick (os.environ.get(var, "NO") == "NO"), which would
# misfire if a variable were legitimately set to the string "NO" and would
# silently accept an empty string.
required_variables = ["HF_TOKEN", "PASSWORD", "MODEL_NAME"]
for required_variable in required_variables:
    if not os.environ.get(required_variable):
        message = (
            f"Environment variable {required_variable} is not set. "
            "Please set it before running the application."
        )
        logger.error(message)
        raise ValueError(message)


# Create the transcription pipeline.
model_name = os.environ["MODEL_NAME"]
model_name = "openai/whisper-tiny"  # TODO: Remove this hardcoded override.
logger.warning("Using hardcoded model name 'openai/whisper-tiny'.")
# Prefer the GPU when available; fall back to CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Loading model {model_name} with device {device}...")
transcriber = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    device=device
)
logger.info("Model loaded successfully.")
41
+
42
+
43
+ # Start the app.
44
def main():
    """Entry point: build the Gradio interface and start serving it."""
    create_interface().launch()
47
+
48
+
49
+ # Create the Gradio interface for the Whisper transcription service.
50
def create_interface():
    """Build the Gradio Blocks UI for the Whisper transcription service."""
    with gr.Blocks() as interface:
        # Page title.
        gr.Markdown("# Whisper Speech Transcription")

        # Password entry row.
        with gr.Row():
            with gr.Column(scale=2):
                password_input = gr.Textbox(
                    label="Enter Password",
                    placeholder="Enter the password to access the transcription service",
                    type="password",
                )

        # Audio capture / upload row.
        with gr.Row():
            with gr.Column(scale=2):
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="numpy",
                    label="Record or Upload Audio",
                )

        # Manual trigger button.
        with gr.Row():
            transcribe_button = gr.Button("Transcribe", variant="primary")

        # Transcription result display.
        with gr.Row():
            output_text = gr.Textbox(
                label="Transcription Output",
                placeholder="Transcription will appear here...",
                lines=5,
            )

        # Timing / status display (read-only).
        status_text = gr.Textbox(
            label="Status",
            placeholder="Transcription status will appear here...",
            lines=1,
            interactive=False,
        )

        # Both the button click and any change to the audio widget run the
        # same transcription callback with the same wiring.
        event_wiring = dict(
            fn=transcribe_audio,
            inputs=[audio_input, password_input],
            outputs=[output_text, status_text],
        )
        transcribe_button.click(**event_wiring)
        audio_input.change(**event_wiring)

    return interface
110
+
111
+
112
def transcribe_audio(audio: Tuple[int, np.ndarray], password: str = None) -> Tuple[str, str]:
    """Transcribe an audio clip with the module-level Whisper pipeline.

    Args:
        audio: Tuple of (sampling rate in Hz, samples as a numpy array), as
            produced by a gr.Audio component with type="numpy". May be None
            when no audio has been recorded/uploaded yet.
        password: Password entered in the UI; must match the PASSWORD
            environment variable.

    Returns:
        Tuple of (transcription text or error message, status string).
    """
    # Reject requests that do not carry the configured password.
    # NOTE(review): this check was previously commented out with a
    # "TODO: Enable this" — enabled here, since PASSWORD is validated as a
    # required environment variable at startup.
    if password != os.environ.get("PASSWORD"):
        return "Incorrect password. Please try again.", ""

    # If there is no audio, return an error message.
    if audio is None:
        return "No audio detected. Please record some audio.", ""

    # Use the logger instead of the previous bare print() calls.
    logger.debug(f"Received audio of type {type(audio)}")

    # Start measuring the time.
    start_time = time.time()

    # Unpack the audio.
    sr, y = audio

    # Convert to mono if stereo.
    if y.ndim > 1:
        logger.debug(f"Converting {y.shape[1]} channels to mono")
        y = y.mean(axis=1)

    # Normalize audio to float32 in [-1, 1].
    y = y.astype(np.float32)
    max_abs = np.max(np.abs(y))
    if max_abs > 0:  # Avoid division by zero on silent input.
        y /= max_abs

    logger.info(f"Processing audio: {sr}Hz, {len(y)} samples (~{len(y)/sr:.2f}s)")

    # Run transcription in 30s chunks with a 6s left stride for long audio.
    result = transcriber({"sampling_rate": sr, "raw": y}, chunk_length_s=30, stride_length_s=[6, 0])
    logger.info("Transcription completed.")

    # Calculate elapsed time relative to the audio duration.
    elapsed_time = time.time() - start_time
    audio_time = len(y) / sr
    status_string = f"Transcription took {elapsed_time:.2f}s for {audio_time:.2f}s of audio"
    return result["text"], status_string
155
+
156
+
157
# Run the application when executed as a script (not on import).
if __name__ == "__main__":
    main()
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch>=2.0.0
2
+ torchaudio>=2.0.0
3
+ transformers==4.52.3
4
+ gradio==5.10.0
5
+ pydantic==2.10.6
6
+ numpy