Spaces:

MicroHealth
/

AV-to-transcripts

Paused

App Files Community

bluenevus committed on Apr 23, 2025

Commit

54c226c

verified ·

1 Parent(s): 261d49a

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -4

app.py CHANGED Viewed

@@ -27,7 +27,7 @@ whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_na
 # Load the Qwen model and tokenizer
 qwen_model_name = "Qwen/Qwen2.5-3B-Instruct"
 qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_model_name, trust_remote_code=True)
-qwen_model = AutoModelForCausalLM.from_pretrained(qwen_model_name, trust_remote_code=True, torch_dtype=torch.float16).to(device)
 def download_audio_from_url(url):
     try:
@@ -86,6 +86,8 @@ def transcribe_audio(audio_file):
         transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)
         print(f"Transcription complete. Length: {len(transcription[0])} characters")
         return transcription[0]
     except Exception as e:
         print(f"Error in transcribe_audio: {str(e)}")
@@ -95,7 +97,7 @@ def separate_speakers(transcription):
     print("Starting speaker separation...")
     prompt = f"""Analyze the following transcribed text and separate it into different speakers. Identify potential speaker changes based on context, content shifts, or dialogue patterns. Format the output as follows:
-1. Label speakers as "Speaker 1", "Speaker 2", etc.  You will have to use dialog context to asume which speaker is saying their dialog as that isn't in the text.
 2. Start each speaker's text on a new line beginning with their label.
 3. Separate different speakers' contributions with a blank line.
 4. If the same speaker continues, do not insert a blank line or repeat the speaker label.
@@ -106,7 +108,6 @@ Now, please process the following transcribed text:
 """
     inputs = qwen_tokenizer(prompt, return_tensors="pt").to(device)
-    inputs = {k: v.to(torch.float16) for k, v in inputs.items()}  # Convert inputs to float16
     with torch.no_grad():
         outputs = qwen_model.generate(**inputs, max_new_tokens=4000)
     result = qwen_tokenizer.decode(outputs[0], skip_special_tokens=True)
@@ -116,7 +117,7 @@ Now, please process the following transcribed text:
     print("Speaker separation complete.")
     return processed_text
 def transcribe_video(url):
     try:
         print(f"Attempting to download audio from URL: {url}")
@@ -129,6 +130,9 @@ def transcribe_video(url):
         os.unlink(temp_audio.name)
         print("Separating speakers...")
         separated_transcript = separate_speakers(transcript)

 # Load the Qwen model and tokenizer
 qwen_model_name = "Qwen/Qwen2.5-3B-Instruct"
 qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_model_name, trust_remote_code=True)
+qwen_model = AutoModelForCausalLM.from_pretrained(qwen_model_name, trust_remote_code=True).to(device)
 def download_audio_from_url(url):
     try:
         transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)
         print(f"Transcription complete. Length: {len(transcription[0])} characters")
+        if len(transcription[0]) < 10:
+            raise ValueError(f"Transcription too short: {transcription[0]}")
         return transcription[0]
     except Exception as e:
         print(f"Error in transcribe_audio: {str(e)}")
     print("Starting speaker separation...")
     prompt = f"""Analyze the following transcribed text and separate it into different speakers. Identify potential speaker changes based on context, content shifts, or dialogue patterns. Format the output as follows:
+1. Label speakers as "Speaker 1", "Speaker 2", etc.
 2. Start each speaker's text on a new line beginning with their label.
 3. Separate different speakers' contributions with a blank line.
 4. If the same speaker continues, do not insert a blank line or repeat the speaker label.
 """
     inputs = qwen_tokenizer(prompt, return_tensors="pt").to(device)
     with torch.no_grad():
         outputs = qwen_model.generate(**inputs, max_new_tokens=4000)
     result = qwen_tokenizer.decode(outputs[0], skip_special_tokens=True)
     print("Speaker separation complete.")
     return processed_text
 def transcribe_video(url):
     try:
         print(f"Attempting to download audio from URL: {url}")
         os.unlink(temp_audio.name)
+        if len(transcript) < 10:
+            raise ValueError("Transcription too short, possibly failed")
         print("Separating speakers...")
         separated_transcript = separate_speakers(transcript)