Chia Woon Yap
committed on
Update app.py
Browse files
app.py
CHANGED
|
@@ -123,7 +123,7 @@ def retrieve_documents(query):
|
|
| 123 |
|
| 124 |
# Function to convert tuple format to message format
|
| 125 |
def convert_to_message_format(chat_history):
|
| 126 |
-
|
| 127 |
message_format = []
|
| 128 |
for user_msg, bot_msg in chat_history:
|
| 129 |
message_format.append({"role": "user", "content": user_msg})
|
|
@@ -132,7 +132,7 @@ def convert_to_message_format(chat_history):
|
|
| 132 |
|
| 133 |
# Function to convert message format to tuple format for processing
|
| 134 |
def convert_to_tuple_format(chat_history):
|
| 135 |
-
|
| 136 |
tuple_format = []
|
| 137 |
for i in range(0, len(chat_history), 2):
|
| 138 |
if i+1 < len(chat_history):
|
|
@@ -278,57 +278,62 @@ def process_document(file):
|
|
| 278 |
"""
|
| 279 |
# Real-time Whisper setup - cache the model
|
| 280 |
#@gr.cache_resource
|
| 281 |
-
def load_realtime_whisper():
|
| 282 |
-
"""Load optimized Whisper model for real-time transcription"""
|
| 283 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 284 |
-
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
| 285 |
|
| 286 |
-
# Use tiny model for real-time speed
|
| 287 |
-
realtime_transcriber = pipeline(
|
| 288 |
-
"automatic-speech-recognition",
|
| 289 |
-
model="openai/whisper-tiny.en",
|
| 290 |
-
device=device,
|
| 291 |
-
torch_dtype=torch_dtype,
|
| 292 |
-
)
|
| 293 |
|
| 294 |
-
return realtime_transcriber
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
|
| 296 |
# Load model at startup
|
| 297 |
-
realtime_transcriber = load_realtime_whisper()
|
| 298 |
|
| 299 |
-
def transcribe_audio(audio):
|
| 300 |
-
"""Real-time optimized transcription"""
|
| 301 |
-
if audio is None:
|
| 302 |
-
return ""
|
| 303 |
|
| 304 |
-
sr, y = audio
|
| 305 |
|
| 306 |
# Quick preprocessing
|
| 307 |
-
if y.ndim > 1:
|
| 308 |
-
y = y.mean(axis=1) # Convert to mono
|
| 309 |
-
|
| 310 |
-
y = y.astype(np.float32)
|
| 311 |
-
max_val = np.max(np.abs(y))
|
| 312 |
-
if max_val > 0:
|
| 313 |
-
y = y / max_val
|
| 314 |
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
|
| 331 |
-
"""
|
| 332 |
|
| 333 |
|
| 334 |
#Common Issue 1: Audio Format Problems
|
|
|
|
| 123 |
|
| 124 |
# Function to convert tuple format to message format
|
| 125 |
def convert_to_message_format(chat_history):
|
| 126 |
+
# Convert from [(user, bot)] format to [{"role": "user", "content": user}, {"role": "assistant", "content": bot}] format
|
| 127 |
message_format = []
|
| 128 |
for user_msg, bot_msg in chat_history:
|
| 129 |
message_format.append({"role": "user", "content": user_msg})
|
|
|
|
| 132 |
|
| 133 |
# Function to convert message format to tuple format for processing
|
| 134 |
def convert_to_tuple_format(chat_history):
|
| 135 |
+
# Convert from message format back to tuple format for processing
|
| 136 |
tuple_format = []
|
| 137 |
for i in range(0, len(chat_history), 2):
|
| 138 |
if i+1 < len(chat_history):
|
|
|
|
| 278 |
"""
|
| 279 |
# Real-time Whisper setup - cache the model
|
| 280 |
#@gr.cache_resource
|
| 281 |
+
#def load_realtime_whisper():
|
| 282 |
+
# """Load optimized Whisper model for real-time transcription"""
|
| 283 |
+
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 284 |
+
# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
| 285 |
|
| 286 |
+
# # Use tiny model for real-time speed
|
| 287 |
+
# realtime_transcriber = pipeline(
|
| 288 |
+
# "automatic-speech-recognition",
|
| 289 |
+
# model="openai/whisper-tiny.en",
|
| 290 |
+
# device=device,
|
| 291 |
+
# torch_dtype=torch_dtype,
|
| 292 |
+
# )
|
| 293 |
|
| 294 |
+
# return realtime_transcriber
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
|
| 300 |
|
| 301 |
# Load model at startup
|
| 302 |
+
#realtime_transcriber = load_realtime_whisper()
|
| 303 |
|
| 304 |
+
#def transcribe_audio(audio):
|
| 305 |
+
# """Real-time optimized transcription"""
|
| 306 |
+
# if audio is None:
|
| 307 |
+
# return ""
|
| 308 |
|
| 309 |
+
# sr, y = audio
|
| 310 |
|
| 311 |
# Quick preprocessing
|
| 312 |
+
# if y.ndim > 1:
|
| 313 |
+
# y = y.mean(axis=1) # Convert to mono
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
|
| 315 |
+
# y = y.astype(np.float32)
|
| 316 |
+
# max_val = np.max(np.abs(y))
|
| 317 |
+
# if max_val > 0:
|
| 318 |
+
# y = y / max_val
|
| 319 |
+
#
|
| 320 |
+
# try:
|
| 321 |
+
# # Use real-time transcriber with optimized settings
|
| 322 |
+
# result = realtime_transcriber(
|
| 323 |
+
# {"sampling_rate": sr, "raw": y},
|
| 324 |
+
# generate_kwargs={
|
| 325 |
+
# "language": "english",
|
| 326 |
+
# "task": "transcribe",
|
| 327 |
+
# "temperature": 0.0, # More deterministic
|
| 328 |
+
# "no_repeat_ngram_size": 2, # Reduce repetitions
|
| 329 |
+
# }
|
| 330 |
+
# )
|
| 331 |
+
# return result["text"]
|
| 332 |
+
# except Exception as e:
|
| 333 |
+
# print(f"Transcription error: {e}")
|
| 334 |
+
# return "Could not transcribe audio. Please try again."
|
| 335 |
+
|
| 336 |
|
|
|
|
| 337 |
|
| 338 |
|
| 339 |
#Common Issue 1: Audio Format Problems
|