Spaces:

MicroHealth
/

AV-to-transcripts

Paused

App Files Community

bluenevus committed on Apr 23, 2025

Commit

60d3e8d

verified ·

1 Parent(s): 26cf8bb

Update app.py

Browse files

Files changed (1) hide show

app.py +4 -3

app.py CHANGED Viewed

@@ -27,7 +27,7 @@ whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_na
 # Load the Qwen model and tokenizer
 qwen_model_name = "Qwen/Qwen2.5-3B-Instruct"
 qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_model_name, trust_remote_code=True)
-qwen_model = AutoModelForCausalLM.from_pretrained(qwen_model_name, trust_remote_code=True).to(device)
 def download_audio_from_url(url):
     try:
@@ -85,7 +85,7 @@ def separate_speakers(transcription):
     print("Starting speaker separation...")
     prompt = f"""Analyze the following transcribed text and separate it into different speakers. Identify potential speaker changes based on context, content shifts, or dialogue patterns. Format the output as follows:
-1. Label speakers as "Speaker 1", "Speaker 2", etc.
 2. Start each speaker's text on a new line beginning with their label.
 3. Separate different speakers' contributions with a blank line.
 4. If the same speaker continues, do not insert a blank line or repeat the speaker label.
@@ -96,6 +96,7 @@ Now, please process the following transcribed text:
 """
     inputs = qwen_tokenizer(prompt, return_tensors="pt").to(device)
     with torch.no_grad():
         outputs = qwen_model.generate(**inputs, max_new_tokens=4000)
     result = qwen_tokenizer.decode(outputs[0], skip_special_tokens=True)
@@ -105,7 +106,7 @@ Now, please process the following transcribed text:
     print("Speaker separation complete.")
     return processed_text
 def transcribe_video(url):
     try:
         print(f"Attempting to download audio from URL: {url}")

 # Load the Qwen model and tokenizer
 qwen_model_name = "Qwen/Qwen2.5-3B-Instruct"
 qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_model_name, trust_remote_code=True)
+qwen_model = AutoModelForCausalLM.from_pretrained(qwen_model_name, trust_remote_code=True, torch_dtype=torch.float16).to(device)
 def download_audio_from_url(url):
     try:
     print("Starting speaker separation...")
     prompt = f"""Analyze the following transcribed text and separate it into different speakers. Identify potential speaker changes based on context, content shifts, or dialogue patterns. Format the output as follows:
+1. Label speakers as "Speaker 1", "Speaker 2", etc.  You will have to use dialog context to asume which speaker is saying their dialog as that isn't in the text.
 2. Start each speaker's text on a new line beginning with their label.
 3. Separate different speakers' contributions with a blank line.
 4. If the same speaker continues, do not insert a blank line or repeat the speaker label.
 """
     inputs = qwen_tokenizer(prompt, return_tensors="pt").to(device)
+    inputs = {k: v.to(torch.float16) for k, v in inputs.items()}  # Convert inputs to float16
     with torch.no_grad():
         outputs = qwen_model.generate(**inputs, max_new_tokens=4000)
     result = qwen_tokenizer.decode(outputs[0], skip_special_tokens=True)
     print("Speaker separation complete.")
     return processed_text
 def transcribe_video(url):
     try:
         print(f"Attempting to download audio from URL: {url}")