Spaces:

MicroHealth
/

AV-to-transcripts

Paused

App Files Community

bluenevus committed on Apr 23, 2025

Commit

81f702f

verified ·

1 Parent(s): df42ab3

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -88

app.py CHANGED Viewed

@@ -1,22 +1,15 @@
 import io
-import re
 import torch
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import requests
 from bs4 import BeautifulSoup
 import tempfile
 import os
-import soundfile as sf
-from spellchecker import SpellChecker
 from pydub import AudioSegment
-import librosa
-import numpy as np
-from pyannote.audio import Pipeline
 import dash
 from dash import dcc, html, Input, Output, State
 import dash_bootstrap_components as dbc
 from dash.exceptions import PreventUpdate
-import base64
 import threading
 from pytube import YouTube
@@ -31,8 +24,6 @@ model_name = "openai/whisper-small"
 processor = WhisperProcessor.from_pretrained(model_name)
 model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)
-spell = SpellChecker()
 def download_audio_from_url(url):
     try:
         if "youtube.com" in url or "youtu.be" in url:
@@ -66,92 +57,35 @@ def download_audio_from_url(url):
         print(f"Error in download_audio_from_url: {str(e)}")
         raise
-def correct_spelling(text):
-    words = text.split()
-    corrected_words = [spell.correction(word) or word for word in words]
-    return ' '.join(corrected_words)
-def format_transcript_with_speakers(transcript, diarization):
-    formatted_transcript = []
-    current_speaker = None
-    for segment, _, speaker in diarization.itertracks(yield_label=True):
-        start = segment.start
-        end = segment.end
-        if speaker != current_speaker:
-            if current_speaker is not None:
-                formatted_transcript.append("\n")  # Add a blank line between speakers
-            formatted_transcript.append(f"Speaker {speaker}:\n")
-            current_speaker = speaker
-        segment_text = transcript[start:end].strip()
-        if segment_text:
-            formatted_transcript.append(f"{segment_text}\n")
-    return "".join(formatted_transcript)
-def transcribe_audio(audio_file, pipeline):
     try:
-        if pipeline is None:
-            raise ValueError("Speaker diarization pipeline is not initialized")
         print("Loading audio file...")
-        audio_input, sr = librosa.load(audio_file, sr=16000)
-        audio_input = audio_input.astype(np.float32)
-        print(f"Audio duration: {len(audio_input) / sr:.2f} seconds")
-        # Apply speaker diarization
-        print("Applying speaker diarization...")
-        diarization = pipeline(audio_file)
-        print("Speaker diarization complete.")
-        chunk_length = 30 * sr
-        overlap = 5 * sr
-        transcriptions = []
         print("Starting transcription...")
-        for i in range(0, len(audio_input), chunk_length - overlap):
-            chunk = audio_input[i:i+chunk_length]
-            input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features.to(device)
-            predicted_ids = model.generate(input_features)
-            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
-            transcriptions.extend(transcription)
-            print(f"Processed {i / sr:.2f} to {(i + chunk_length) / sr:.2f} seconds")
-        full_transcription = " ".join(transcriptions)
-        print(f"Transcription complete. Full transcription length: {len(full_transcription)} characters")
-        print("Applying formatting with speaker diarization...")
-        formatted_transcription = format_transcript_with_speakers(full_transcription, diarization)
-        return formatted_transcription
     except Exception as e:
         print(f"Error in transcribe_audio: {str(e)}")
         raise
-def transcribe_video(url, pipeline):
     try:
         print(f"Attempting to download audio from URL: {url}")
         audio_bytes = download_audio_from_url(url)
         print(f"Successfully downloaded {len(audio_bytes)} bytes of audio data")
-        # Convert audio bytes to AudioSegment
-        audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
-        print(f"Audio duration: {len(audio) / 1000} seconds")
-        # Save as WAV file
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
-            audio.export(temp_audio.name, format="wav")
-            temp_audio_path = temp_audio.name
-        print("Starting audio transcription...")
-        transcript = transcribe_audio(temp_audio_path, pipeline)
-        print(f"Transcription completed. Transcript length: {len(transcript)} characters")
-        # Clean up the temporary file
-        os.unlink(temp_audio_path)
-        # Apply spelling correction
-        transcript = correct_spelling(transcript)
         return transcript
     except Exception as e:
         error_message = f"An error occurred: {str(e)}"
@@ -189,13 +123,7 @@ def update_transcription(n_clicks, url):
     def transcribe():
         try:
-            # Initialize the speaker diarization pipeline without token
-            pipeline = Pipeline.from_pretrained("collinbarnwell/pyannote-speaker-diarization-31")
-            if pipeline is None:
-                raise ValueError("Failed to initialize the speaker diarization pipeline")
-            print("Speaker diarization pipeline initialized successfully")
-            transcript = transcribe_video(url, pipeline)
             return transcript
         except Exception as e:
             return f"An error occurred: {str(e)}"
@@ -218,7 +146,9 @@ def update_transcription(n_clicks, url):
         ]), download_data
     else:
         return transcript, None
 if __name__ == '__main__':
     print("Starting the Dash application...")
     app.run(debug=True, host='0.0.0.0', port=7860)

 import io
 import torch
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import requests
 from bs4 import BeautifulSoup
 import tempfile
 import os
 from pydub import AudioSegment
 import dash
 from dash import dcc, html, Input, Output, State
 import dash_bootstrap_components as dbc
 from dash.exceptions import PreventUpdate
 import threading
 from pytube import YouTube
 processor = WhisperProcessor.from_pretrained(model_name)
 model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)
 def download_audio_from_url(url):
     try:
         if "youtube.com" in url or "youtu.be" in url:
         print(f"Error in download_audio_from_url: {str(e)}")
         raise
+def transcribe_audio(audio_file):
     try:
         print("Loading audio file...")
+        audio = AudioSegment.from_file(audio_file)
+        audio = audio.set_channels(1).set_frame_rate(16000)
+        audio_array = audio.get_array_of_samples()
         print("Starting transcription...")
+        input_features = processor(audio_array, sampling_rate=16000, return_tensors="pt").input_features.to(device)
+        predicted_ids = model.generate(input_features)
+        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+        print(f"Transcription complete. Length: {len(transcription[0])} characters")
+        return transcription[0]
     except Exception as e:
         print(f"Error in transcribe_audio: {str(e)}")
         raise
+def transcribe_video(url):
     try:
         print(f"Attempting to download audio from URL: {url}")
         audio_bytes = download_audio_from_url(url)
         print(f"Successfully downloaded {len(audio_bytes)} bytes of audio data")
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
+            AudioSegment.from_file(io.BytesIO(audio_bytes)).export(temp_audio.name, format="wav")
+            transcript = transcribe_audio(temp_audio.name)
+        os.unlink(temp_audio.name)
         return transcript
     except Exception as e:
         error_message = f"An error occurred: {str(e)}"
     def transcribe():
         try:
+            transcript = transcribe_video(url)
             return transcript
         except Exception as e:
             return f"An error occurred: {str(e)}"
         ]), download_data
     else:
         return transcript, None
+print("Reached end of script definitions")
 if __name__ == '__main__':
     print("Starting the Dash application...")
     app.run(debug=True, host='0.0.0.0', port=7860)