Spaces:

MicroHealth
/

AV-to-transcripts

Paused

App Files Files Community

bluenevus committed on Apr 23, 2025

Commit

fce37ea

verified ·

1 Parent(s): 640b5e1

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -10

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import io
 import torch
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import requests
 from bs4 import BeautifulSoup
 import tempfile
@@ -20,9 +20,14 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 # Load the Whisper model and processor
-model_name = "openai/whisper-small"
-processor = WhisperProcessor.from_pretrained(model_name)
-model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)
 def download_audio_from_url(url):
     try:
@@ -65,9 +70,9 @@ def transcribe_audio(audio_file):
         audio_array = audio.get_array_of_samples()
         print("Starting transcription...")
-        input_features = processor(audio_array, sampling_rate=16000, return_tensors="pt").input_features.to(device)
-        predicted_ids = model.generate(input_features)
-        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
         print(f"Transcription complete. Length: {len(transcription[0])} characters")
         return transcription[0]
@@ -75,6 +80,28 @@ def transcribe_audio(audio_file):
         print(f"Error in transcribe_audio: {str(e)}")
         raise
 def transcribe_video(url):
     try:
         print(f"Attempting to download audio from URL: {url}")
@@ -86,7 +113,11 @@ def transcribe_video(url):
             transcript = transcribe_audio(temp_audio.name)
         os.unlink(temp_audio.name)
-        return transcript
     except Exception as e:
         error_message = f"An error occurred: {str(e)}"
         print(error_message)
@@ -97,7 +128,7 @@ app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
 app.layout = dbc.Container([
     dbc.Row([
         dbc.Col([
-            html.H1("Video Transcription", className="text-center mb-4"),
             dbc.Card([
                 dbc.CardBody([
                     dbc.Input(id="video-url", type="text", placeholder="Enter video URL"),
@@ -139,7 +170,7 @@ def update_transcription(n_clicks, url):
         download_data = dict(content=transcript, filename="transcript.txt")
         return dbc.Card([
             dbc.CardBody([
-                html.H5("Transcription Result"),
                 html.Pre(transcript, style={"white-space": "pre-wrap", "word-wrap": "break-word"}),
                 dbc.Button("Download Transcript", id="btn-download", color="secondary", className="mt-3")
             ])

 import io
 import torch
+from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM
 import requests
 from bs4 import BeautifulSoup
 import tempfile
 print(f"Using device: {device}")
 # Load the Whisper model and processor
+whisper_model_name = "openai/whisper-small"
+whisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)
+whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name).to(device)
+# Load the Qwen model and tokenizer
+qwen_model_name = "Qwen/Qwen2.5-3B-Instruct"
+qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_model_name)
+qwen_model = AutoModelForCausalLM.from_pretrained(qwen_model_name).to(device)
 def download_audio_from_url(url):
     try:
         audio_array = audio.get_array_of_samples()
         print("Starting transcription...")
+        input_features = whisper_processor(audio_array, sampling_rate=16000, return_tensors="pt").input_features.to(device)
+        predicted_ids = whisper_model.generate(input_features)
+        transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)
         print(f"Transcription complete. Length: {len(transcription[0])} characters")
         return transcription[0]
         print(f"Error in transcribe_audio: {str(e)}")
         raise
+def separate_speakers(transcription):
+    """Ask the Qwen instruct model to rewrite a transcript as labelled speaker turns.
+
+    Args:
+        transcription: Plain transcript text to be split by speaker.
+
+    Returns:
+        Only the text generated by the model (the prompt and the echoed
+        transcript are not included), stripped of surrounding whitespace.
+        Empty or whitespace-only input yields "" without calling the model.
+
+    Note:
+        Generation is capped at 1000 new tokens, so the output for a long
+        transcript may be cut short.
+    """
+    # Nothing to separate; avoid prompting the model with an empty transcript.
+    if not transcription or not transcription.strip():
+        return ""
+    prompt = f"""Analyze the following transcribed text and separate it into different speakers. Identify potential speaker changes based on context, content shifts, or dialogue patterns. Format the output as follows:
+1. Label speakers as "Speaker 1", "Speaker 2", etc.
+2. Start each speaker's text on a new line beginning with their label.
+3. Separate different speakers' contributions with a blank line.
+4. If the same speaker continues, do not insert a blank line or repeat the speaker label.
+Now, please process the following transcribed text:
+{transcription}
+"""
+    inputs = qwen_tokenizer(prompt, return_tensors="pt").to(device)
+    outputs = qwen_model.generate(**inputs, max_new_tokens=1000)
+    # For a causal LM, generate() returns the prompt tokens followed by the
+    # newly generated ones. Decoding the whole sequence and splitting on the
+    # instruction text would leave the raw transcript in front of the answer,
+    # so slice off the prompt by token count instead.
+    prompt_length = inputs["input_ids"].shape[1]
+    generated_ids = outputs[0][prompt_length:]
+    processed_text = qwen_tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
+    return processed_text
 def transcribe_video(url):
     try:
         print(f"Attempting to download audio from URL: {url}")
             transcript = transcribe_audio(temp_audio.name)
         os.unlink(temp_audio.name)
+        print("Separating speakers...")
+        separated_transcript = separate_speakers(transcript)
+        return separated_transcript
     except Exception as e:
         error_message = f"An error occurred: {str(e)}"
         print(error_message)
 app.layout = dbc.Container([
     dbc.Row([
         dbc.Col([
+            html.H1("Video Transcription with Speaker Separation", className="text-center mb-4"),
             dbc.Card([
                 dbc.CardBody([
                     dbc.Input(id="video-url", type="text", placeholder="Enter video URL"),
         download_data = dict(content=transcript, filename="transcript.txt")
         return dbc.Card([
             dbc.CardBody([
+                html.H5("Transcription Result with Speaker Separation"),
                 html.Pre(transcript, style={"white-space": "pre-wrap", "word-wrap": "break-word"}),
                 dbc.Button("Download Transcript", id="btn-download", color="secondary", className="mt-3")
             ])