Chia Woon Yap
committed on
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,12 +7,12 @@ Original file is located at
|
|
| 7 |
|
| 8 |
import gradio as gr
|
| 9 |
import numpy as np
|
| 10 |
-
from transformers import pipeline
|
| 11 |
import os
|
| 12 |
import time
|
| 13 |
import groq
|
| 14 |
import uuid
|
| 15 |
import re
|
|
|
|
| 16 |
|
| 17 |
# LangChain imports
|
| 18 |
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
|
|
@@ -29,8 +29,6 @@ import docx # python-docx for Word files
|
|
| 29 |
import gtts # Google Text-to-Speech library
|
| 30 |
from pptx import Presentation # python-pptx for PowerPoint files
|
| 31 |
|
| 32 |
-
import torch
|
| 33 |
-
|
| 34 |
# Set API Key
|
| 35 |
groq.api_key = os.getenv("GROQ_API_KEY")
|
| 36 |
|
|
@@ -85,41 +83,14 @@ Answer: d) 0.4
|
|
| 85 |
Feedback: This question tests understanding of Bayes' Theorem by requiring the calculation of conditional probability using the given values.
|
| 86 |
"""
|
| 87 |
|
| 88 |
-
#
|
| 89 |
-
class
|
| 90 |
def __init__(self):
|
| 91 |
-
self.
|
| 92 |
-
print(
|
| 93 |
-
|
| 94 |
-
# Try multiple models in order
|
| 95 |
-
self.model = self._load_model()
|
| 96 |
-
|
| 97 |
-
def _load_model(self):
|
| 98 |
-
"""Try loading different models until one works"""
|
| 99 |
-
models_to_try = [
|
| 100 |
-
"openai/whisper-base",
|
| 101 |
-
"openai/whisper-tiny",
|
| 102 |
-
"openai/whisper-small",
|
| 103 |
-
]
|
| 104 |
-
|
| 105 |
-
for model_name in models_to_try:
|
| 106 |
-
try:
|
| 107 |
-
print(f"Trying to load: {model_name}")
|
| 108 |
-
pipe = pipeline(
|
| 109 |
-
"automatic-speech-recognition",
|
| 110 |
-
model=model_name,
|
| 111 |
-
device=self.device,
|
| 112 |
-
)
|
| 113 |
-
print(f"✅ Successfully loaded: {model_name}")
|
| 114 |
-
return pipe
|
| 115 |
-
except Exception as e:
|
| 116 |
-
print(f"❌ Failed to load {model_name}: {e}")
|
| 117 |
-
continue
|
| 118 |
-
|
| 119 |
-
raise Exception("All models failed to load")
|
| 120 |
|
| 121 |
def transcribe_audio(self, audio):
|
| 122 |
-
"""
|
| 123 |
if audio is None:
|
| 124 |
return "Please record audio first"
|
| 125 |
|
|
@@ -136,74 +107,74 @@ class FixedWhisperTranscriber:
|
|
| 136 |
if y.ndim > 1:
|
| 137 |
y = np.mean(y, axis=1)
|
| 138 |
|
| 139 |
-
# Convert to
|
| 140 |
y = y.astype(np.float32)
|
| 141 |
-
if np.max(np.abs(y)) > 0:
|
| 142 |
-
y = y / np.max(np.abs(y))
|
| 143 |
|
| 144 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
audio_duration = len(y) / sr
|
| 146 |
print(f"Audio duration: {audio_duration:.2f} seconds")
|
| 147 |
|
| 148 |
if audio_duration < 0.5:
|
| 149 |
return "Audio too short. Speak for at least 1 second."
|
| 150 |
|
| 151 |
-
if audio_duration >
|
| 152 |
-
return "Audio too long. Keep it under
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
-
|
| 155 |
-
|
| 156 |
|
| 157 |
-
|
|
|
|
| 158 |
|
| 159 |
-
|
| 160 |
-
result = self.model(audio_dict)
|
| 161 |
-
transcription = result["text"].strip()
|
| 162 |
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
return "No clear speech detected. Please try again with clearer audio."
|
| 168 |
|
| 169 |
-
return
|
| 170 |
|
| 171 |
except Exception as e:
|
| 172 |
-
print(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
return f"Transcription failed: {str(e)}"
|
| 174 |
-
|
| 175 |
-
def _is_garbage_transcription(self, text):
|
| 176 |
-
"""Check if transcription is garbage"""
|
| 177 |
-
if not text:
|
| 178 |
-
return True
|
| 179 |
-
|
| 180 |
-
# Common garbage patterns
|
| 181 |
-
garbage_patterns = [
|
| 182 |
-
r"^(oh,\s*)+oh$",
|
| 183 |
-
r"^(ah,\s*)+ah$",
|
| 184 |
-
r"^(\w+,\s*)+\w+$", # Repeated single words
|
| 185 |
-
]
|
| 186 |
-
|
| 187 |
-
text_lower = text.lower().strip()
|
| 188 |
-
|
| 189 |
-
for pattern in garbage_patterns:
|
| 190 |
-
if re.match(pattern, text_lower):
|
| 191 |
-
return True
|
| 192 |
-
|
| 193 |
-
# Check if it's just repetitive nonsense
|
| 194 |
-
words = text_lower.split()
|
| 195 |
-
if len(words) > 10:
|
| 196 |
-
unique_words = len(set(words))
|
| 197 |
-
if unique_words / len(words) < 0.3: # Too repetitive
|
| 198 |
-
return True
|
| 199 |
-
|
| 200 |
-
return False
|
| 201 |
|
| 202 |
# Initialize transcriber
|
| 203 |
try:
|
| 204 |
-
transcriber =
|
|
|
|
| 205 |
except Exception as e:
|
| 206 |
-
print(f"Failed to initialize transcriber: {e}")
|
| 207 |
transcriber = None
|
| 208 |
|
| 209 |
def transcribe_audio(audio):
|
|
@@ -227,7 +198,7 @@ def get_transcription_status(audio):
|
|
| 227 |
elif duration > 10:
|
| 228 |
return "Processing longer audio..."
|
| 229 |
else:
|
| 230 |
-
return "Processing audio..."
|
| 231 |
except:
|
| 232 |
return "Ready to record"
|
| 233 |
|
|
@@ -445,11 +416,11 @@ def tutor_ai_chatbot():
|
|
| 445 |
- 🗣️ Speak at normal volume and pace
|
| 446 |
- π± Use a good quality microphone
|
| 447 |
|
| 448 |
-
**
|
| 449 |
-
-
|
| 450 |
-
-
|
| 451 |
-
-
|
| 452 |
-
-
|
| 453 |
""")
|
| 454 |
|
| 455 |
# Clear chat history button
|
|
@@ -499,7 +470,7 @@ def tutor_ai_chatbot():
|
|
| 499 |
inputs=audio_input,
|
| 500 |
outputs=msg
|
| 501 |
).then(
|
| 502 |
-
fn=lambda x: "Transcription completed!" if x and "failed" not in x.lower() and "error" not in x.lower() and "sorry" not in x.lower() else "Ready for new recording",
|
| 503 |
inputs=msg,
|
| 504 |
outputs=transcription_status
|
| 505 |
)
|
|
|
|
| 7 |
|
| 8 |
import gradio as gr
|
| 9 |
import numpy as np
|
|
|
|
| 10 |
import os
|
| 11 |
import time
|
| 12 |
import groq
|
| 13 |
import uuid
|
| 14 |
import re
|
| 15 |
+
import tempfile
|
| 16 |
|
| 17 |
# LangChain imports
|
| 18 |
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
|
|
|
|
| 29 |
import gtts # Google Text-to-Speech library
|
| 30 |
from pptx import Presentation # python-pptx for PowerPoint files
|
| 31 |
|
|
|
|
|
|
|
| 32 |
# Set API Key
|
| 33 |
groq.api_key = os.getenv("GROQ_API_KEY")
|
| 34 |
|
|
|
|
| 83 |
Feedback: This question tests understanding of Bayes' Theorem by requiring the calculation of conditional probability using the given values.
|
| 84 |
"""
|
| 85 |
|
| 86 |
+
# Groq Whisper Transcriber - RELIABLE SOLUTION
|
| 87 |
+
class GroqWhisperTranscriber:
|
| 88 |
def __init__(self):
|
| 89 |
+
self.client = groq.Client(api_key=groq.api_key)
|
| 90 |
+
print("✅ Groq Whisper transcriber initialized")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
def transcribe_audio(self, audio):
|
| 93 |
+
"""Transcribe audio using Groq's reliable Whisper API"""
|
| 94 |
if audio is None:
|
| 95 |
return "Please record audio first"
|
| 96 |
|
|
|
|
| 107 |
if y.ndim > 1:
|
| 108 |
y = np.mean(y, axis=1)
|
| 109 |
|
| 110 |
+
# Convert to proper format
|
| 111 |
y = y.astype(np.float32)
|
|
|
|
|
|
|
| 112 |
|
| 113 |
+
# Normalize audio
|
| 114 |
+
max_val = np.max(np.abs(y))
|
| 115 |
+
if max_val > 0:
|
| 116 |
+
y = y / max_val
|
| 117 |
+
|
| 118 |
+
# Check audio duration
|
| 119 |
audio_duration = len(y) / sr
|
| 120 |
print(f"Audio duration: {audio_duration:.2f} seconds")
|
| 121 |
|
| 122 |
if audio_duration < 0.5:
|
| 123 |
return "Audio too short. Speak for at least 1 second."
|
| 124 |
|
| 125 |
+
if audio_duration > 60:
|
| 126 |
+
return "Audio too long. Keep it under 60 seconds."
|
| 127 |
+
|
| 128 |
+
# Convert to 16-bit PCM for WAV file
|
| 129 |
+
y_int16 = (y * 32767).astype(np.int16)
|
| 130 |
+
|
| 131 |
+
# Create temporary WAV file
|
| 132 |
+
import scipy.io.wavfile
|
| 133 |
|
| 134 |
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
|
| 135 |
+
temp_path = f.name
|
| 136 |
|
| 137 |
+
# Save as WAV file
|
| 138 |
+
scipy.io.wavfile.write(temp_path, sr, y_int16)
|
| 139 |
|
| 140 |
+
print("Sending to Groq Whisper API...")
|
|
|
|
|
|
|
| 141 |
|
| 142 |
+
# Transcribe with Groq API - USE TURBO VERSION
|
| 143 |
+
with open(temp_path, "rb") as audio_file:
|
| 144 |
+
transcription = self.client.audio.transcriptions.create(
|
| 145 |
+
file=(temp_path, audio_file.read(), "audio/wav"),
|
| 146 |
+
model="whisper-large-v3-turbo", # Use the best model
|
| 147 |
+
response_format="text",
|
| 148 |
+
language="en" # Optional: specify English for better accuracy
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
# Clean up temporary file
|
| 152 |
+
os.unlink(temp_path)
|
| 153 |
+
|
| 154 |
+
text = transcription.strip()
|
| 155 |
+
print(f"Groq transcription: '{text}'")
|
| 156 |
|
| 157 |
+
if not text:
|
| 158 |
+
return "No speech detected. Please try again."
|
|
|
|
| 159 |
|
| 160 |
+
return text
|
| 161 |
|
| 162 |
except Exception as e:
|
| 163 |
+
print(f"Groq transcription error: {str(e)}")
|
| 164 |
+
# Clean up temp file if it exists
|
| 165 |
+
try:
|
| 166 |
+
if 'temp_path' in locals():
|
| 167 |
+
os.unlink(temp_path)
|
| 168 |
+
except:
|
| 169 |
+
pass
|
| 170 |
return f"Transcription failed: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
# Initialize transcriber
|
| 173 |
try:
|
| 174 |
+
transcriber = GroqWhisperTranscriber()
|
| 175 |
+
print("✅ Transcriber initialized successfully with Groq API")
|
| 176 |
except Exception as e:
|
| 177 |
+
print(f"❌ Failed to initialize transcriber: {e}")
|
| 178 |
transcriber = None
|
| 179 |
|
| 180 |
def transcribe_audio(audio):
|
|
|
|
| 198 |
elif duration > 10:
|
| 199 |
return "Processing longer audio..."
|
| 200 |
else:
|
| 201 |
+
return "Processing audio with Groq API..."
|
| 202 |
except:
|
| 203 |
return "Ready to record"
|
| 204 |
|
|
|
|
| 416 |
- 🗣️ Speak at normal volume and pace
|
| 417 |
- π± Use a good quality microphone
|
| 418 |
|
| 419 |
+
**Using Groq Whisper API:**
|
| 420 |
+
- ✅ High accuracy transcription
|
| 421 |
+
- ✅ No more "B-B-B" or "oh-oh-oh" errors
|
| 422 |
+
- ✅ Fast and reliable
|
| 423 |
+
- ✅ Professional grade speech recognition
|
| 424 |
""")
|
| 425 |
|
| 426 |
# Clear chat history button
|
|
|
|
| 470 |
inputs=audio_input,
|
| 471 |
outputs=msg
|
| 472 |
).then(
|
| 473 |
+
fn=lambda x: "✅ Transcription completed!" if x and "failed" not in x.lower() and "error" not in x.lower() and "sorry" not in x.lower() else "Ready for new recording",
|
| 474 |
inputs=msg,
|
| 475 |
outputs=transcription_status
|
| 476 |
)
|