VoiceChat

Paused

legolasyiu committed on Jan 22

Commit

8c47ec1

verified ·

1 Parent(s): 430aac7

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -40,6 +40,51 @@ tts_model = AutoModelForCausalLM.from_pretrained(
     torch_dtype="auto",
 )
 # -----------------------------
 # PIPELINE FUNCTION
 # -----------------------------
@@ -51,23 +96,8 @@ def speech_to_speech(audio_file):
     audio, sr = librosa.load(audio_file, sr=TARGET_SR)
     # ---------- STT ----------
-    stt_inputs = stt_processor(
-        audio=audio,
-        sampling_rate=TARGET_SR,
-        text="Transcribe the audio accurately.",
-        return_tensors="pt",
-    ).to(DEVICE)
-    with torch.no_grad():
-        output_ids = stt_model.generate(
-            **stt_inputs,
-            max_new_tokens=512,
-        )
-    transcription = stt_processor.decode(
-        output_ids[0],
-        skip_special_tokens=True,
-    )
     # ---------- TTS ----------
     tts_inputs = tts_tokenizer(

     torch_dtype="auto",
 )
+def transcribe_and_translate(audio_file):
+    """Transcribe an audio file to text with the chat-style speech model.
+
+    Despite the name, the prompt sent to the model only asks for a
+    transcription; no translation is requested.
+
+    Args:
+        audio_file: Reference to the audio (a file path, per the caller),
+            forwarded as the "audio" entry of the chat message. May be None
+            when nothing was uploaded.
+
+    Returns:
+        The decoded model reply for the single request, or the fixed hint
+        "Please upload an audio file." when audio_file is None.
+    """
+    if audio_file is None:
+        return "Please upload an audio file."
+    prompt = "Transcribe the audio accurately."
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "audio", "audio": audio_file},
+                {"type": "text", "text": prompt},
+            ],
+        }
+    ]
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+    )
+    # Move every tensor to the device the model lives on.
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
+    with torch.no_grad():
+        # Greedy decoding: sampling knobs such as temperature have no effect
+        # when do_sample=False (and only trigger a warning), so none are passed.
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=MAX_TOKENS,
+            do_sample=False,
+        )
+    # Decoder-only models return prompt + continuation. Drop the prompt tokens
+    # so the chat template and instruction text are not returned as part of
+    # the transcription. Encoder-decoder outputs contain no prompt, so they
+    # are left untouched.
+    prompt_ids = inputs.get("input_ids")
+    is_encoder_decoder = getattr(model.config, "is_encoder_decoder", False)
+    if prompt_ids is not None and not is_encoder_decoder:
+        outputs = outputs[:, prompt_ids.shape[-1]:]
+    decoded = processor.batch_decode(
+        outputs,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=True,
+    )
+    return decoded[0]
 # -----------------------------
 # PIPELINE FUNCTION
 # -----------------------------
     audio, sr = librosa.load(audio_file, sr=TARGET_SR)
     # ---------- STT ----------
+    transcription = transcribe_and_translate(audio_file)
     # ---------- TTS ----------
     tts_inputs = tts_tokenizer(