Update app.py
app.py CHANGED
@@ -22,12 +22,12 @@ print(f"Using device: {device}")
 # Load the Whisper model and processor
 whisper_model_name = "openai/whisper-small"
 whisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)
-whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name)
+whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name).to(device)
 
 # Load the Qwen model and tokenizer
 qwen_model_name = "Qwen/Qwen2.5-3B-Instruct"
-qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_model_name)
-qwen_model = AutoModelForCausalLM.from_pretrained(qwen_model_name,
+qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_model_name, trust_remote_code=True)
+qwen_model = AutoModelForCausalLM.from_pretrained(qwen_model_name, trust_remote_code=True).to(device)
 
 def download_audio_from_url(url):
     try:
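Note: the .to(device) calls keep both full-precision models wherever device points (CUDA or CPU). For tighter memory, a half-precision load is a common alternative; a minimal sketch, assuming torch is imported and the accelerate package is installed for device_map:

    # Sketch only: half-precision load with automatic device placement.
    # torch_dtype and device_map are standard from_pretrained options.
    qwen_model = AutoModelForCausalLM.from_pretrained(
        qwen_model_name,
        torch_dtype=torch.float16,  # halves memory vs. the float32 default
        device_map="auto",          # accelerate places layers on available devices
    )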
@@ -69,9 +69,10 @@ def transcribe_audio(audio_file):
         audio = audio.set_channels(1).set_frame_rate(16000)
         audio_array = torch.tensor(audio.get_array_of_samples()).float()
 
+        print(f"Audio duration: {len(audio) / 1000:.2f} seconds")
         print("Starting transcription...")
-        input_features = whisper_processor(audio_array, sampling_rate=16000, return_tensors="pt").input_features.to(device)
-        predicted_ids = whisper_model.generate(input_features
+        input_features = whisper_processor(audio_array, sampling_rate=16000, return_tensors="pt").input_features.to(device)
+        predicted_ids = whisper_model.generate(input_features)
         transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)
 
         print(f"Transcription complete. Length: {len(transcription[0])} characters")
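Note: Whisper's feature extractor pads or truncates input to a single 30-second window, so whisper_model.generate(input_features) transcribes at most the first 30 seconds of audio. For longer recordings a chunked approach is needed; a minimal sketch using the transformers ASR pipeline, as an alternative to the manual processor/generate calls above:

    # Sketch only: long-form transcription; the pipeline chunks audio internally.
    from transformers import pipeline

    asr = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-small",
        chunk_length_s=30,  # process long audio in 30-second windows
        device=0,           # GPU index; use -1 for CPU
    )
    text = asr("audio.wav")["text"]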
@@ -81,6 +82,7 @@ def transcribe_audio(audio_file):
         raise
 
 def separate_speakers(transcription):
+    print("Starting speaker separation...")
     prompt = f"""Analyze the following transcribed text and separate it into different speakers. Identify potential speaker changes based on context, content shifts, or dialogue patterns. Format the output as follows:
 
 1. Label speakers as "Speaker 1", "Speaker 2", etc.
@@ -94,12 +96,14 @@ Now, please process the following transcribed text:
 """
 
     inputs = qwen_tokenizer(prompt, return_tensors="pt").to(device)
-
+    with torch.no_grad():
+        outputs = qwen_model.generate(**inputs, max_new_tokens=4000)
     result = qwen_tokenizer.decode(outputs[0], skip_special_tokens=True)
 
     # Extract the processed text (remove the instruction part)
     processed_text = result.split("Now, please process the following transcribed text:")[-1].strip()
 
+    print("Speaker separation complete.")
     return processed_text
 
 def transcribe_video(url):
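Note: torch.no_grad() disables gradient tracking during generation, which saves memory. Separately, Qwen2.5-Instruct checkpoints are trained against a chat template, so a raw f-string prompt tends to echo the instructions back in the output (hence the result.split(...) cleanup). A minimal sketch of the template route, assuming the same qwen_tokenizer and qwen_model objects:

    # Sketch only: route the prompt through the model's chat template
    # and decode just the newly generated tokens.
    messages = [{"role": "user", "content": prompt}]
    chat_text = qwen_tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = qwen_tokenizer(chat_text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = qwen_model.generate(**inputs, max_new_tokens=4000)
    result = qwen_tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:],  # skip the prompt tokens
        skip_special_tokens=True,
    )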
@@ -134,7 +138,10 @@ app.layout = dbc.Container([
             dbc.Input(id="video-url", type="text", placeholder="Enter video URL"),
             dbc.Button("Transcribe", id="transcribe-button", color="primary", className="mt-3"),
             dbc.Spinner(html.Div(id="transcription-output", className="mt-3")),
-
+            html.Div([
+                dbc.Button("Download Transcript", id="download-button", color="secondary", className="mt-3", style={'display': 'none'}),
+                dcc.Download(id="download-transcript")
+            ])
         ])
     ])
 ], width=12)
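Note: dcc.Download only works if dcc is imported (from dash import dcc). The component downloads whatever dict its "data" property receives; dcc.send_string is a convenience wrapper for the same thing:

    # Sketch only: two equivalent ways to feed dcc.Download from a callback.
    return dict(content=transcript, filename="transcript.txt")
    # or:
    return dcc.send_string(transcript, filename="transcript.txt")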
@@ -143,7 +150,7 @@ app.layout = dbc.Container([
 
 @app.callback(
     Output("transcription-output", "children"),
-    Output("download-
+    Output("download-button", "style"),
     Input("transcribe-button", "n_clicks"),
     State("video-url", "value"),
     prevent_initial_call=True
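Note: with a second Output declared, Dash requires every return path in update_transcription to produce a 2-tuple of (children, style); returning a bare string would raise an invalid-callback-return error. For example:

    # Sketch only: each branch must match the two declared Outputs.
    return "Please enter a URL", {'display': 'none'}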
@@ -157,28 +164,41 @@ def update_transcription(n_clicks, url):
             transcript = transcribe_video(url)
             return transcript
         except Exception as e:
-
+            import traceback
+            return f"An error occurred: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
 
     # Run transcription in a separate thread
     thread = threading.Thread(target=transcribe)
     thread.start()
-    thread.join()
+    thread.join(timeout=600)  # 10 minutes timeout
+
+    if thread.is_alive():
+        return "Transcription timed out after 10 minutes", {'display': 'none'}
 
     transcript = thread.result if hasattr(thread, 'result') else "Transcription failed"
 
     if transcript and not transcript.startswith("An error occurred"):
-        download_data = dict(content=transcript, filename="transcript.txt")
         return dbc.Card([
             dbc.CardBody([
                 html.H5("Transcription Result with Speaker Separation"),
-                html.Pre(transcript, style={"white-space": "pre-wrap", "word-wrap": "break-word"})
-                dbc.Button("Download Transcript", id="btn-download", color="secondary", className="mt-3")
+                html.Pre(transcript, style={"white-space": "pre-wrap", "word-wrap": "break-word"})
             ])
-        ]),
+        ]), {'display': 'block'}
     else:
-        return transcript,
+        return transcript, {'display': 'none'}
 
-
+@app.callback(
+    Output("download-transcript", "data"),
+    Input("download-button", "n_clicks"),
+    State("transcription-output", "children"),
+    prevent_initial_call=True
+)
+def download_transcript(n_clicks, transcription_output):
+    if not transcription_output:
+        raise PreventUpdate
+
+    transcript = transcription_output['props']['children'][0]['props']['children'][1]['props']['children']
+    return dict(content=transcript, filename="transcript.txt")
 
 if __name__ == '__main__':
     print("Starting the Dash application...")
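Note on the thread handling: threading.Thread objects have no result attribute, and the inner transcribe() return value is discarded, so hasattr(thread, 'result') is always false and this branch always yields "Transcription failed". A minimal sketch that actually captures the worker's return value, using concurrent.futures in place of the bare thread:

    # Sketch only: a Future carries the worker's return value and exceptions.
    from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeout

    pool = ThreadPoolExecutor(max_workers=1)
    future = pool.submit(transcribe_video, url)
    try:
        transcript = future.result(timeout=600)  # 10-minute cap, as in the diff
    except FutureTimeout:
        transcript = "Transcription timed out after 10 minutes"
    except Exception as e:
        transcript = f"An error occurred: {e}"
    finally:
        pool.shutdown(wait=False)  # don't block the callback on a stuck worker

Two smaller points: PreventUpdate must be imported (from dash.exceptions import PreventUpdate), and the download_transcript callback's walk through transcription_output['props']['children'][...] breaks as soon as the card layout changes; stashing the raw text in a dcc.Store and reading it back with State("transcript-store", "data") is a sturdier pattern.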