Spaces:

MicroHealth
/

AV-to-transcripts

Paused

App Files Community

bluenevus committed on Apr 24, 2025

Commit

c982392

verified ·

1 Parent(s): dd906ec

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -54

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import io
 import torch
-from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM
 import requests
 from bs4 import BeautifulSoup
 import tempfile
@@ -17,7 +17,7 @@ import librosa
 import numpy as np
 # Set up logging
-logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 print("Script started")
@@ -31,11 +31,6 @@ whisper_model_name = "openai/whisper-small"
 whisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)
 whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name).to(device)
-# Load the Qwen model and tokenizer
-qwen_model_name = "Qwen/Qwen2.5-1.5B-Instruct"
-qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_model_name, trust_remote_code=True)
-qwen_model = AutoModelForCausalLM.from_pretrained(qwen_model_name, trust_remote_code=True).to(device)
 def download_audio_from_url(url):
     try:
         if "youtube.com" in url or "youtu.be" in url:
@@ -92,40 +87,11 @@ def transcribe_audio(audio_file):
         full_transcription = " ".join(transcriptions)
         logger.info(f"Transcription complete. Full transcription length: {len(full_transcription)} characters")
-        logger.info("Applying speaker separation using Qwen...")
-        separated_transcript = separate_speakers(full_transcription)
-        return separated_transcript
     except Exception as e:
         logger.error(f"Error in transcribe_audio: {str(e)}")
         raise
-def separate_speakers(transcription):
-    logger.info("Starting speaker separation...")
-    prompt = f"""Analyze the following transcribed text and separate it into different speakers. Identify potential speaker changes based on context, content shifts, or dialogue patterns. Format the output as follows:
-1. Label speakers as "Speaker 1", "Speaker 2", etc.
-2. Start each speaker's text on a new line beginning with their label.
-3. Separate different speakers' contributions with a blank line.
-4. If the same speaker continues, do not insert a blank line or repeat the speaker label.
-5. Do not include any additional explanations or metadata.
-Now, please process the following transcribed text:
-{transcription}
-"""
-    inputs = qwen_tokenizer(prompt, return_tensors="pt").to(device)
-    with torch.no_grad():
-        outputs = qwen_model.generate(**inputs, max_new_tokens=4000)
-    result = qwen_tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # Extract only the processed text (remove the instruction part)
-    processed_text = result.split("Now, please process the following transcribed text:")[-1].strip()
-    logger.info("Speaker separation complete.")
-    return processed_text
 def transcribe_video(url):
     try:
         logger.info(f"Attempting to download audio from URL: {url}")
@@ -141,18 +107,7 @@ def transcribe_video(url):
         if len(transcript) < 10:
             raise ValueError("Transcription too short, possibly failed")
-        logger.info("Separating speakers...")
-        try:
-            diarized_transcript = separate_speakers(transcript)
-            logger.info(f"Speaker separation complete. Result length: {len(diarized_transcript)} characters")
-            if len(diarized_transcript) < 10:
-                logger.warning("Speaker separation result too short, using original transcript")
-                return transcript
-            return diarized_transcript
-        except Exception as e:
-            logger.error(f"Error during speaker separation: {str(e)}")
-            logger.info("Returning original transcript without speaker separation")
-            return transcript
     except Exception as e:
         error_message = f"An error occurred: {str(e)}"
         logger.error(error_message)
@@ -163,8 +118,8 @@ app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
 app.layout = dbc.Container([
     dbc.Row([
         dbc.Col([
-            html.H1("Video Transcription with Speaker Separation", className="text-center mb-4"),
-            html.Div("If you can see this, the app is working!", className="text-center mb-4"),  # Debug element
             dbc.Card([
                 dbc.CardBody([
                     dbc.Input(id="video-url", type="text", placeholder="Enter video URL"),
@@ -191,12 +146,28 @@ def update_transcription(n_clicks, url):
     if not url:
         raise PreventUpdate
-    transcript = transcribe_video(url)
     if transcript and not transcript.startswith("An error occurred"):
         return dbc.Card([
             dbc.CardBody([
-                html.H5("Transcription Result with Speaker Separation"),
                 html.Pre(transcript, style={"white-space": "pre-wrap", "word-wrap": "break-word"})
             ])
         ]), {'display': 'block'}
@@ -209,7 +180,6 @@ def update_transcription(n_clicks, url):
     State("transcription-output", "children"),
     prevent_initial_call=True
 )
 def download_transcript(n_clicks, transcription_output):
     if not transcription_output:
         raise PreventUpdate

 import io
 import torch
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import requests
 from bs4 import BeautifulSoup
 import tempfile
 import numpy as np
 # Set up logging
+logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 print("Script started")
 whisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)
 whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name).to(device)
 def download_audio_from_url(url):
     try:
         if "youtube.com" in url or "youtu.be" in url:
         full_transcription = " ".join(transcriptions)
         logger.info(f"Transcription complete. Full transcription length: {len(full_transcription)} characters")
+        return full_transcription
     except Exception as e:
         logger.error(f"Error in transcribe_audio: {str(e)}")
         raise
 def transcribe_video(url):
     try:
         logger.info(f"Attempting to download audio from URL: {url}")
         if len(transcript) < 10:
             raise ValueError("Transcription too short, possibly failed")
+        return transcript
     except Exception as e:
         error_message = f"An error occurred: {str(e)}"
         logger.error(error_message)
 app.layout = dbc.Container([
     dbc.Row([
         dbc.Col([
+            html.H1("Video Transcription", className="text-center mb-4"),
+            html.Div("If you can see this, the app is working!", className="text-center mb-4"),
             dbc.Card([
                 dbc.CardBody([
                     dbc.Input(id="video-url", type="text", placeholder="Enter video URL"),
     if not url:
         raise PreventUpdate
+    def transcribe():
+        try:
+            transcript = transcribe_video(url)
+            return transcript
+        except Exception as e:
+            logger.exception("Error in transcription:")
+            return f"An error occurred: {str(e)}"
+    # Run transcription in a separate thread
+    thread = threading.Thread(target=transcribe)
+    thread.start()
+    thread.join(timeout=600)  # 10 minutes timeout
+    if thread.is_alive():
+        return "Transcription timed out after 10 minutes", {'display': 'none'}
+    transcript = getattr(thread, 'result', "Transcription failed")
     if transcript and not transcript.startswith("An error occurred"):
         return dbc.Card([
             dbc.CardBody([
+                html.H5("Transcription Result"),
                 html.Pre(transcript, style={"white-space": "pre-wrap", "word-wrap": "break-word"})
             ])
         ]), {'display': 'block'}
     State("transcription-output", "children"),
     prevent_initial_call=True
 )
 def download_transcript(n_clicks, transcription_output):
     if not transcription_output:
         raise PreventUpdate