Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
# app.py
|
| 2 |
# Whisper transcription app - HYBRID conversion (pydub + small ffmpeg fallback)
|
| 3 |
-
#
|
| 4 |
|
| 5 |
import os
|
| 6 |
import sys
|
|
@@ -32,12 +32,12 @@ except Exception as e:
|
|
| 32 |
|
| 33 |
print("DEBUG: imports OK", flush=True)
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
# ---------- Config ----------
|
| 38 |
MEMORY_FILE = "memory.json"
|
| 39 |
MEMORY_LOCK = threading.Lock()
|
| 40 |
-
MIN_WAV_SIZE = 200
|
|
|
|
|
|
|
| 41 |
FFMPEG_CANDIDATES = [
|
| 42 |
("s16le", 16000, 1),
|
| 43 |
("s16le", 44100, 2),
|
|
@@ -63,15 +63,19 @@ def load_memory():
|
|
| 63 |
pass
|
| 64 |
return mem
|
| 65 |
|
|
|
|
| 66 |
def save_memory(mem):
|
| 67 |
with MEMORY_LOCK:
|
| 68 |
with open(MEMORY_FILE, "w", encoding="utf-8") as fh:
|
| 69 |
json.dump(mem, fh, ensure_ascii=False, indent=2)
|
| 70 |
|
| 71 |
-
memory = load_memory()
|
| 72 |
-
print("DEBUG: memory loaded (words=%d phrases=%d)" % (len(memory.get("words", {})), len(memory.get("phrases", {}))), flush=True)
|
| 73 |
-
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
# ---------- Postprocessing ----------
|
| 77 |
MEDICAL_ABBREVIATIONS = {
|
|
@@ -94,55 +98,54 @@ DRUG_NORMALIZATION = {
|
|
| 94 |
"amoxicillin": "Amoxicillin",
|
| 95 |
}
|
| 96 |
|
|
|
|
| 97 |
def expand_abbreviations(text):
|
| 98 |
-
tokens = re.split(r
|
| 99 |
out = []
|
| 100 |
for t in tokens:
|
| 101 |
key = t.lower().strip(".,;:")
|
| 102 |
if key in MEDICAL_ABBREVIATIONS:
|
| 103 |
-
trailing =
|
| 104 |
-
m = re.match(r
|
| 105 |
if m:
|
| 106 |
-
trailing = m.group(2) or
|
| 107 |
out.append(MEDICAL_ABBREVIATIONS[key] + trailing)
|
| 108 |
else:
|
| 109 |
out.append(t)
|
| 110 |
-
return
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
|
| 116 |
|
| 117 |
def normalize_drugs(text):
|
| 118 |
for k, v in DRUG_NORMALIZATION.items():
|
| 119 |
-
text = re.sub(rf
|
| 120 |
return text
|
| 121 |
|
|
|
|
| 122 |
def punctuation_and_capitalization(text):
|
| 123 |
text = text.strip()
|
| 124 |
if not text:
|
| 125 |
return text
|
| 126 |
-
if not re.search(r
|
| 127 |
-
text = text.rstrip() +
|
| 128 |
-
parts = re.split(r
|
| 129 |
out = []
|
| 130 |
for p in parts:
|
| 131 |
-
if p and not re.match(r
|
| 132 |
out.append(p.capitalize())
|
| 133 |
else:
|
| 134 |
out.append(p)
|
| 135 |
-
return
|
|
|
|
| 136 |
|
| 137 |
def postprocess_transcript(text, format_soap=False):
|
| 138 |
if not text:
|
| 139 |
return text
|
| 140 |
-
t = re.sub(r
|
| 141 |
t = expand_abbreviations(t)
|
| 142 |
t = normalize_drugs(t)
|
| 143 |
t = punctuation_and_capitalization(t)
|
| 144 |
if format_soap:
|
| 145 |
-
sentences = re.split(r
|
| 146 |
subj = sentences[0] if len(sentences) >= 1 else ""
|
| 147 |
obj = sentences[1] if len(sentences) >= 2 else ""
|
| 148 |
assessment = ""
|
|
@@ -150,22 +153,23 @@ def postprocess_transcript(text, format_soap=False):
|
|
| 150 |
if kw in t.lower():
|
| 151 |
assessment = "Assessment: " + subj
|
| 152 |
break
|
| 153 |
-
soap =
|
|
|
|
|
|
|
| 154 |
return soap
|
| 155 |
return t
|
| 156 |
|
| 157 |
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
# ---------- Memory utilities ----------
|
| 163 |
def extract_words_and_phrases(text):
|
| 164 |
# basic tokenization for words; phrases = sentences
|
| 165 |
words = re.findall(r"[A-Za-z0-9\-']+", text)
|
| 166 |
-
sentences = [
|
|
|
|
|
|
|
| 167 |
return [w for w in words if w.strip()], sentences
|
| 168 |
|
|
|
|
| 169 |
def update_memory_with_transcript(transcript):
|
| 170 |
global memory
|
| 171 |
words, sentences = extract_words_and_phrases(transcript)
|
|
@@ -193,9 +197,6 @@ def update_memory_with_transcript(transcript):
|
|
| 193 |
pass
|
| 194 |
|
| 195 |
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
def memory_correct_text(text, min_ratio=0.85):
|
| 200 |
if not text or (not memory.get("words") and not memory.get("phrases")):
|
| 201 |
return text
|
|
@@ -204,7 +205,9 @@ def memory_correct_text(text, min_ratio=0.85):
|
|
| 204 |
lw = w.lower()
|
| 205 |
if lw in memory["words"]:
|
| 206 |
return w
|
| 207 |
-
candidates = get_close_matches(
|
|
|
|
|
|
|
| 208 |
if candidates:
|
| 209 |
cand = candidates[0]
|
| 210 |
if w and w[0].isupper():
|
|
@@ -212,46 +215,63 @@ def memory_correct_text(text, min_ratio=0.85):
|
|
| 212 |
return cand
|
| 213 |
return w
|
| 214 |
|
| 215 |
-
tokens = re.split(r
|
| 216 |
corrected_tokens = []
|
| 217 |
for tok in tokens:
|
| 218 |
if re.match(r"^[A-Za-z0-9\-']+$", tok):
|
| 219 |
corrected_tokens.append(fix_word(tok))
|
| 220 |
else:
|
| 221 |
corrected_tokens.append(tok)
|
| 222 |
-
corrected =
|
| 223 |
|
| 224 |
for phrase in sorted(memory.get("phrases", {}).keys(), key=lambda s: -len(s)):
|
| 225 |
low_phrase = phrase.lower()
|
| 226 |
if len(low_phrase) < 8:
|
| 227 |
continue
|
| 228 |
if low_phrase in corrected.lower():
|
| 229 |
-
corrected = re.sub(
|
|
|
|
|
|
|
| 230 |
return corrected
|
| 231 |
|
|
|
|
| 232 |
# ---------- File utilities ----------
|
| 233 |
def save_as_word(text, filename=None):
|
| 234 |
if filename is None:
|
| 235 |
-
filename = os.path.join(
|
|
|
|
|
|
|
| 236 |
doc = Document()
|
| 237 |
doc.add_paragraph(text)
|
| 238 |
doc.save(filename)
|
| 239 |
return filename
|
| 240 |
|
| 241 |
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
# ---------- Hybrid conversion: pydub + small ffmpeg fallback ----------
|
| 247 |
def _ffmpeg_convert(input_path, out_path, fmt, sr, ch):
|
| 248 |
cmd = [
|
| 249 |
-
"ffmpeg",
|
| 250 |
-
"-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
]
|
| 252 |
try:
|
| 253 |
proc = subprocess.run(cmd, capture_output=True, timeout=30, text=True)
|
| 254 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
return True, proc.stderr + proc.stdout
|
| 256 |
else:
|
| 257 |
try:
|
|
@@ -268,6 +288,7 @@ def _ffmpeg_convert(input_path, out_path, fmt, sr, ch):
|
|
| 268 |
pass
|
| 269 |
return False, str(e)
|
| 270 |
|
|
|
|
| 271 |
def convert_to_wav_if_needed(input_path):
|
| 272 |
input_path = str(input_path)
|
| 273 |
lower = input_path.lower()
|
|
@@ -295,10 +316,7 @@ def convert_to_wav_if_needed(input_path):
|
|
| 295 |
except Exception:
|
| 296 |
pass
|
| 297 |
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
diag_dir = tempfile.mkdtemp(prefix="dct_diag_")
|
| 303 |
diag_log = os.path.join(diag_dir, "conversion_diagnostics.txt")
|
| 304 |
diagnostics = []
|
|
@@ -306,7 +324,9 @@ def convert_to_wav_if_needed(input_path):
|
|
| 306 |
out_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
| 307 |
out_wav.close()
|
| 308 |
success, debug = _ffmpeg_convert(input_path, out_wav.name, fmt, sr, ch)
|
| 309 |
-
diagnostics.append(
|
|
|
|
|
|
|
| 310 |
if success:
|
| 311 |
try:
|
| 312 |
with open(diag_log, "w", encoding="utf-8") as fh:
|
|
@@ -326,9 +346,14 @@ def convert_to_wav_if_needed(input_path):
|
|
| 326 |
except Exception:
|
| 327 |
pass
|
| 328 |
|
|
|
|
| 329 |
try:
|
| 330 |
-
fp = subprocess.run(
|
| 331 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
diagnostics.append("FFPROBE:\n" + (fp.stdout.strip() or fp.stderr.strip()))
|
| 333 |
except Exception as e:
|
| 334 |
diagnostics.append("ffprobe failed: " + str(e))
|
|
@@ -348,19 +373,35 @@ def convert_to_wav_if_needed(input_path):
|
|
| 348 |
except Exception as e:
|
| 349 |
raise Exception(f"Conversion failed; diagnostics write error: {e}")
|
| 350 |
|
| 351 |
-
raise Exception(
|
|
|
|
|
|
|
|
|
|
| 352 |
|
| 353 |
# ---------- Whisper model cache ----------
|
| 354 |
MODEL_CACHE = {}
|
| 355 |
|
|
|
|
| 356 |
def get_whisper_model(name):
|
| 357 |
if name not in MODEL_CACHE:
|
| 358 |
print(f"DEBUG: loading whisper model '{name}'", flush=True)
|
| 359 |
MODEL_CACHE[name] = whisper.load_model(name)
|
| 360 |
return MODEL_CACHE[name]
|
| 361 |
|
|
|
|
| 362 |
# ---------- Main transcription generator ----------
|
| 363 |
-
def transcribe_multiple(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
log = []
|
| 365 |
transcripts = []
|
| 366 |
word_file_path = None
|
|
@@ -370,7 +411,7 @@ def transcribe_multiple(audio_files, model_name, advanced_options, merge_checkbo
|
|
| 370 |
# initial yield
|
| 371 |
yield "", "", None, 0
|
| 372 |
|
| 373 |
-
# cleanup previous
|
| 374 |
if os.path.exists(temp_extract_dir):
|
| 375 |
try:
|
| 376 |
shutil.rmtree(temp_extract_dir)
|
|
@@ -392,7 +433,16 @@ def transcribe_multiple(audio_files, model_name, advanced_options, merge_checkbo
|
|
| 392 |
log.append("Incorrect zip password")
|
| 393 |
yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
|
| 394 |
return
|
| 395 |
-
exts = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 396 |
count = 0
|
| 397 |
for info in zf.infolist():
|
| 398 |
if info.is_dir():
|
|
@@ -404,7 +454,9 @@ def transcribe_multiple(audio_files, model_name, advanced_options, merge_checkbo
|
|
| 404 |
except Exception as e:
|
| 405 |
log.append(f"Error extracting {info.filename}: {e}")
|
| 406 |
continue
|
| 407 |
-
p = os.path.normpath(
|
|
|
|
|
|
|
| 408 |
if os.path.exists(p):
|
| 409 |
extracted_audio_paths.append(p)
|
| 410 |
count += 1
|
|
@@ -451,7 +503,7 @@ def transcribe_multiple(audio_files, model_name, advanced_options, merge_checkbo
|
|
| 451 |
yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
|
| 452 |
return
|
| 453 |
|
| 454 |
-
# load model
|
| 455 |
yield "\n\n".join(log), "\n\n".join(transcripts), None, 5
|
| 456 |
try:
|
| 457 |
model = get_whisper_model(model_name)
|
|
@@ -466,7 +518,9 @@ def transcribe_multiple(audio_files, model_name, advanced_options, merge_checkbo
|
|
| 466 |
for p in paths:
|
| 467 |
idx += 1
|
| 468 |
log.append(f"Processing file ({idx}/{total}): {p}")
|
| 469 |
-
yield "\n\n".join(log), "\n\n".join(transcripts), None, int(
|
|
|
|
|
|
|
| 470 |
|
| 471 |
wav = None
|
| 472 |
try:
|
|
@@ -474,46 +528,165 @@ def transcribe_multiple(audio_files, model_name, advanced_options, merge_checkbo
|
|
| 474 |
log.append(f"Converted to WAV: {wav}")
|
| 475 |
except Exception as e:
|
| 476 |
log.append(f"Conversion failed for {p}: {e}")
|
| 477 |
-
transcripts.append(
|
| 478 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 479 |
continue
|
| 480 |
|
| 481 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 482 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 483 |
|
|
|
|
|
|
|
| 484 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 485 |
|
| 486 |
|
| 487 |
-
#
|
| 488 |
-
def run_transcription_wrapper(
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 517 |
if __name__ == "__main__":
|
| 518 |
port = int(os.environ.get("PORT", 7860))
|
| 519 |
print("DEBUG: launching Gradio on port", port, flush=True)
|
|
@@ -523,34 +696,3 @@ if __name__ == "__main__":
|
|
| 523 |
print("FATAL: demo.launch failed:", e, flush=True)
|
| 524 |
traceback.print_exc()
|
| 525 |
raise
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
# Safe launch: only launch if demo exists
|
| 531 |
-
if __name__ == "__main__":
|
| 532 |
-
port = int(os.environ.get("PORT", 7860))
|
| 533 |
-
print("DEBUG: preparing to launch Gradio on port", port, flush=True)
|
| 534 |
-
try:
|
| 535 |
-
if 'demo' in globals() and demo is not None:
|
| 536 |
-
print("DEBUG: demo object found. launching...", flush=True)
|
| 537 |
-
demo.queue().launch(server_name="0.0.0.0", server_port=port)
|
| 538 |
-
else:
|
| 539 |
-
print("FATAL: 'demo' not found. The Gradio UI block may be missing or failed to create.", flush=True)
|
| 540 |
-
# show the tail of the file so you can inspect quickly in logs
|
| 541 |
-
try:
|
| 542 |
-
import inspect
|
| 543 |
-
import pathlib
|
| 544 |
-
print("DEBUG: last 60 lines of /app/app.py for inspection:", flush=True)
|
| 545 |
-
with open("/app/app.py", "r", encoding="utf-8") as fh:
|
| 546 |
-
all_lines = fh.read().splitlines()
|
| 547 |
-
for ln in all_lines[-60:]:
|
| 548 |
-
print(ln)
|
| 549 |
-
except Exception:
|
| 550 |
-
pass
|
| 551 |
-
# Exit non-zero so platform reports failure clearly
|
| 552 |
-
sys.exit(1)
|
| 553 |
-
except Exception as e:
|
| 554 |
-
print("FATAL: demo.launch failed:", e, flush=True)
|
| 555 |
-
traceback.print_exc()
|
| 556 |
-
raise
|
|
|
|
| 1 |
# app.py
|
| 2 |
# Whisper transcription app - HYBRID conversion (pydub + small ffmpeg fallback)
|
| 3 |
+
# Clean, single-version file for Hugging Face Spaces.
|
| 4 |
|
| 5 |
import os
|
| 6 |
import sys
|
|
|
|
| 32 |
|
| 33 |
print("DEBUG: imports OK", flush=True)
|
| 34 |
|
|
|
|
|
|
|
| 35 |
# ---------- Config ----------
|
| 36 |
MEMORY_FILE = "memory.json"
|
| 37 |
MEMORY_LOCK = threading.Lock()
|
| 38 |
+
MIN_WAV_SIZE = 200 # bytes
|
| 39 |
+
|
| 40 |
+
# Small ffmpeg fallback grid (hybrid conversion)
|
| 41 |
FFMPEG_CANDIDATES = [
|
| 42 |
("s16le", 16000, 1),
|
| 43 |
("s16le", 44100, 2),
|
|
|
|
| 63 |
pass
|
| 64 |
return mem
|
| 65 |
|
| 66 |
+
|
| 67 |
def save_memory(mem):
    """Persist *mem* to ``MEMORY_FILE`` as indented UTF-8 JSON.

    The JSON is written to a temporary file in the same directory as
    ``MEMORY_FILE`` and then moved into place with ``os.replace``. A crash or
    a serialization error part-way through therefore leaves the previous file
    intact instead of a truncated one. The whole operation runs while holding
    ``MEMORY_LOCK``.

    Args:
        mem: JSON-serializable object to store.

    Raises:
        OSError: if the temporary file cannot be created, written or moved.
        TypeError: if *mem* contains a value ``json`` cannot serialize.
    """
    import tempfile  # local import: the file's import block is not visible here

    directory = os.path.dirname(os.path.abspath(MEMORY_FILE))
    with MEMORY_LOCK:
        fd, tmp_path = tempfile.mkstemp(
            prefix=".memory_", suffix=".tmp", dir=directory
        )
        try:
            with os.fdopen(fd, "w", encoding="utf-8") as fh:
                json.dump(mem, fh, ensure_ascii=False, indent=2)
            # Same-directory rename: readers see either the old or the new file.
            os.replace(tmp_path, MEMORY_FILE)
        except BaseException:
            # The old file is untouched; just remove the partial temp file.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
            raise
|
| 71 |
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
+
memory = load_memory()
|
| 74 |
+
print(
|
| 75 |
+
"DEBUG: memory loaded (words=%d phrases=%d)"
|
| 76 |
+
% (len(memory.get("words", {})), len(memory.get("phrases", {}))),
|
| 77 |
+
flush=True,
|
| 78 |
+
)
|
| 79 |
|
| 80 |
# ---------- Postprocessing ----------
|
| 81 |
MEDICAL_ABBREVIATIONS = {
|
|
|
|
| 98 |
"amoxicillin": "Amoxicillin",
|
| 99 |
}
|
| 100 |
|
| 101 |
+
|
| 102 |
def expand_abbreviations(text, abbreviations=None):
    """Replace known abbreviations in *text* with their expansions.

    The text is split on whitespace runs (which are kept verbatim). Each token
    is looked up case-insensitively after stripping ``. , ; :`` from both
    ends. When it matches, the expansion is emitted with the token's original
    leading and trailing punctuation re-attached, so ``"bp,"`` becomes
    ``"blood pressure,"`` and ``".bp;"`` keeps both marks.

    Args:
        text: Input string.
        abbreviations: Optional mapping of lowercase abbreviation to
            expansion. Defaults to the module-level ``MEDICAL_ABBREVIATIONS``.

    Returns:
        The text with matching tokens expanded; everything else unchanged.
    """
    table = MEDICAL_ABBREVIATIONS if abbreviations is None else abbreviations
    punct = ".,;:"
    out = []
    for token in re.split(r"(\s+)", text):
        key = token.strip(punct).lower()
        # `key` is empty for whitespace or pure-punctuation tokens: pass through.
        if key and key in table:
            lead = token[: len(token) - len(token.lstrip(punct))]
            trail = token[len(token.rstrip(punct)):]
            out.append(lead + table[key] + trail)
        else:
            out.append(token)
    return "".join(out)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
|
| 118 |
def normalize_drugs(text, mapping=None):
    """Rewrite drug names in *text* to their canonical spelling.

    Each key of the mapping is matched case-insensitively as a whole word
    (``\\b`` on both sides) and replaced by its value. Keys are passed through
    ``re.escape`` so characters such as ``.`` or ``+`` match literally, and the
    value is inserted verbatim rather than being read as a regex replacement
    template.

    Args:
        text: Input string.
        mapping: Optional mapping of drug name to canonical spelling. Defaults
            to the module-level ``DRUG_NORMALIZATION``.

    Returns:
        The text with every matching name replaced.
    """
    table = DRUG_NORMALIZATION if mapping is None else mapping
    for name, canonical in table.items():
        pattern = rf"\b{re.escape(name)}\b"
        # A callable replacement avoids backslash/group interpretation.
        text = re.sub(
            pattern,
            lambda _match, repl=canonical: repl,
            text,
            flags=re.IGNORECASE,
        )
    return text
|
| 122 |
|
| 123 |
+
|
| 124 |
def punctuation_and_capitalization(text):
    """Ensure terminal punctuation and capitalize the start of each sentence.

    The text is stripped; if it does not already end in ``.``, ``?`` or ``!``
    a period is appended. It is then split on sentence terminators followed by
    whitespace, and the first character of every sentence is upper-cased.

    Only the first character is changed. ``str.capitalize()`` would also
    lower-case the remainder of the sentence, which would destroy acronyms and
    any canonical casing already present mid-sentence (e.g. ``Amoxicillin``).

    Args:
        text: Input string.

    Returns:
        The punctuated, sentence-capitalized text, or ``""`` when *text* is
        empty or whitespace only.
    """
    text = text.strip()
    if not text:
        return text
    if not re.search(r"[.?!]\s*$", text):
        text += "."
    out = []
    # The capturing group keeps the separators so the join is lossless.
    for piece in re.split(r"([.?!]\s+)", text):
        if piece and not re.match(r"[.?!]\s+", piece):
            out.append(piece[0].upper() + piece[1:])
        else:
            out.append(piece)
    return "".join(out)
|
| 138 |
+
|
| 139 |
|
| 140 |
def postprocess_transcript(text, format_soap=False):
|
| 141 |
if not text:
|
| 142 |
return text
|
| 143 |
+
t = re.sub(r"\s+", " ", text).strip()
|
| 144 |
t = expand_abbreviations(t)
|
| 145 |
t = normalize_drugs(t)
|
| 146 |
t = punctuation_and_capitalization(t)
|
| 147 |
if format_soap:
|
| 148 |
+
sentences = re.split(r"(?<=[.?!])\s+", t)
|
| 149 |
subj = sentences[0] if len(sentences) >= 1 else ""
|
| 150 |
obj = sentences[1] if len(sentences) >= 2 else ""
|
| 151 |
assessment = ""
|
|
|
|
| 153 |
if kw in t.lower():
|
| 154 |
assessment = "Assessment: " + subj
|
| 155 |
break
|
| 156 |
+
soap = (
|
| 157 |
+
f"S: {subj}\nO: {obj}\nA: {assessment}\nP: Plan: follow up as indicated."
|
| 158 |
+
)
|
| 159 |
return soap
|
| 160 |
return t
|
| 161 |
|
| 162 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
# ---------- Memory utilities ----------
|
| 164 |
def extract_words_and_phrases(text):
    """Split *text* into word tokens and sentence-level phrases.

    Words are runs of ASCII letters, digits, hyphens and apostrophes. Runs
    made up only of hyphens/apostrophes (a free-standing dash or quote mark)
    are dropped so they are not returned as words. Phrases are the
    whitespace-trimmed, non-empty pieces obtained by splitting after ``.``,
    ``?`` or ``!`` followed by whitespace.

    Args:
        text: Input string.

    Returns:
        A ``(words, sentences)`` tuple of two lists of strings.
    """
    tokens = re.findall(r"[A-Za-z0-9\-']+", text)
    # Keep only tokens that contain at least one letter or digit.
    words = [tok for tok in tokens if tok.strip("-'")]
    sentences = [
        part.strip()
        for part in re.split(r"(?<=[.?!])\s+", text)
        if part.strip()
    ]
    return words, sentences
|
| 171 |
|
| 172 |
+
|
| 173 |
def update_memory_with_transcript(transcript):
|
| 174 |
global memory
|
| 175 |
words, sentences = extract_words_and_phrases(transcript)
|
|
|
|
| 197 |
pass
|
| 198 |
|
| 199 |
|
|
|
|
|
|
|
|
|
|
| 200 |
def memory_correct_text(text, min_ratio=0.85):
|
| 201 |
if not text or (not memory.get("words") and not memory.get("phrases")):
|
| 202 |
return text
|
|
|
|
| 205 |
lw = w.lower()
|
| 206 |
if lw in memory["words"]:
|
| 207 |
return w
|
| 208 |
+
candidates = get_close_matches(
|
| 209 |
+
lw, memory["words"].keys(), n=1, cutoff=min_ratio
|
| 210 |
+
)
|
| 211 |
if candidates:
|
| 212 |
cand = candidates[0]
|
| 213 |
if w and w[0].isupper():
|
|
|
|
| 215 |
return cand
|
| 216 |
return w
|
| 217 |
|
| 218 |
+
tokens = re.split(r"(\W+)", text)
|
| 219 |
corrected_tokens = []
|
| 220 |
for tok in tokens:
|
| 221 |
if re.match(r"^[A-Za-z0-9\-']+$", tok):
|
| 222 |
corrected_tokens.append(fix_word(tok))
|
| 223 |
else:
|
| 224 |
corrected_tokens.append(tok)
|
| 225 |
+
corrected = "".join(corrected_tokens)
|
| 226 |
|
| 227 |
for phrase in sorted(memory.get("phrases", {}).keys(), key=lambda s: -len(s)):
|
| 228 |
low_phrase = phrase.lower()
|
| 229 |
if len(low_phrase) < 8:
|
| 230 |
continue
|
| 231 |
if low_phrase in corrected.lower():
|
| 232 |
+
corrected = re.sub(
|
| 233 |
+
re.escape(phrase), phrase, corrected, flags=re.IGNORECASE
|
| 234 |
+
)
|
| 235 |
return corrected
|
| 236 |
|
| 237 |
+
|
| 238 |
# ---------- File utilities ----------
|
| 239 |
def save_as_word(text, filename=None):
    """Write *text* as a single paragraph into a ``.docx`` file.

    Args:
        text: Content of the paragraph.
        filename: Destination path. When ``None`` a fresh, uniquely named
            ``merged_transcripts_*.docx`` file is created in the system temp
            directory. A fixed shared name would let two requests running at
            the same time overwrite each other's document.

    Returns:
        The path the document was saved to.
    """
    if filename is None:
        fd, filename = tempfile.mkstemp(
            prefix="merged_transcripts_", suffix=".docx"
        )
        # Only the reserved name is needed; the document is saved by path below.
        os.close(fd)
    # `Document` is imported outside this view (presumably python-docx).
    doc = Document()
    doc.add_paragraph(text)
    doc.save(filename)
    return filename
|
| 248 |
|
| 249 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
# ---------- Hybrid conversion: pydub + small ffmpeg fallback ----------
|
| 251 |
def _ffmpeg_convert(input_path, out_path, fmt, sr, ch):
|
| 252 |
cmd = [
|
| 253 |
+
"ffmpeg",
|
| 254 |
+
"-hide_banner",
|
| 255 |
+
"-loglevel",
|
| 256 |
+
"error",
|
| 257 |
+
"-y",
|
| 258 |
+
"-f",
|
| 259 |
+
fmt,
|
| 260 |
+
"-ar",
|
| 261 |
+
str(sr),
|
| 262 |
+
"-ac",
|
| 263 |
+
str(ch),
|
| 264 |
+
"-i",
|
| 265 |
+
input_path,
|
| 266 |
+
out_path,
|
| 267 |
]
|
| 268 |
try:
|
| 269 |
proc = subprocess.run(cmd, capture_output=True, timeout=30, text=True)
|
| 270 |
+
if (
|
| 271 |
+
proc.returncode == 0
|
| 272 |
+
and os.path.exists(out_path)
|
| 273 |
+
and os.path.getsize(out_path) > MIN_WAV_SIZE
|
| 274 |
+
):
|
| 275 |
return True, proc.stderr + proc.stdout
|
| 276 |
else:
|
| 277 |
try:
|
|
|
|
| 288 |
pass
|
| 289 |
return False, str(e)
|
| 290 |
|
| 291 |
+
|
| 292 |
def convert_to_wav_if_needed(input_path):
|
| 293 |
input_path = str(input_path)
|
| 294 |
lower = input_path.lower()
|
|
|
|
| 316 |
except Exception:
|
| 317 |
pass
|
| 318 |
|
| 319 |
+
# ffmpeg fallback
|
|
|
|
|
|
|
|
|
|
| 320 |
diag_dir = tempfile.mkdtemp(prefix="dct_diag_")
|
| 321 |
diag_log = os.path.join(diag_dir, "conversion_diagnostics.txt")
|
| 322 |
diagnostics = []
|
|
|
|
| 324 |
out_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
| 325 |
out_wav.close()
|
| 326 |
success, debug = _ffmpeg_convert(input_path, out_wav.name, fmt, sr, ch)
|
| 327 |
+
diagnostics.append(
|
| 328 |
+
f"TRY fmt={fmt} sr={sr} ch={ch} success={success}\n{debug}\n"
|
| 329 |
+
)
|
| 330 |
if success:
|
| 331 |
try:
|
| 332 |
with open(diag_log, "w", encoding="utf-8") as fh:
|
|
|
|
| 346 |
except Exception:
|
| 347 |
pass
|
| 348 |
|
| 349 |
+
# final diagnostics
|
| 350 |
try:
|
| 351 |
+
fp = subprocess.run(
|
| 352 |
+
["ffprobe", "-v", "error", "-show_format", "-show_streams", input_path],
|
| 353 |
+
capture_output=True,
|
| 354 |
+
text=True,
|
| 355 |
+
timeout=10,
|
| 356 |
+
)
|
| 357 |
diagnostics.append("FFPROBE:\n" + (fp.stdout.strip() or fp.stderr.strip()))
|
| 358 |
except Exception as e:
|
| 359 |
diagnostics.append("ffprobe failed: " + str(e))
|
|
|
|
| 373 |
except Exception as e:
|
| 374 |
raise Exception(f"Conversion failed; diagnostics write error: {e}")
|
| 375 |
|
| 376 |
+
raise Exception(
|
| 377 |
+
f"Could not convert file to WAV. Diagnostics saved to: {diag_log}"
|
| 378 |
+
)
|
| 379 |
+
|
| 380 |
|
| 381 |
# ---------- Whisper model cache ----------
|
| 382 |
MODEL_CACHE = {}
|
| 383 |
|
| 384 |
+
|
| 385 |
def get_whisper_model(name):
    """Return the Whisper model called *name*, loading it at most once.

    Loaded models are kept in the module-level ``MODEL_CACHE`` dict keyed by
    name; a cache miss prints a debug line and calls ``whisper.load_model``.
    """
    try:
        return MODEL_CACHE[name]
    except KeyError:
        pass
    print(f"DEBUG: loading whisper model '{name}'", flush=True)
    loaded = whisper.load_model(name)
    MODEL_CACHE[name] = loaded
    return loaded
|
| 390 |
|
| 391 |
+
|
| 392 |
# ---------- Main transcription generator ----------
|
| 393 |
+
def transcribe_multiple(
|
| 394 |
+
audio_files,
|
| 395 |
+
model_name,
|
| 396 |
+
advanced_options,
|
| 397 |
+
merge_checkbox,
|
| 398 |
+
zip_file=None,
|
| 399 |
+
zip_password=None,
|
| 400 |
+
enable_memory=False,
|
| 401 |
+
):
|
| 402 |
+
"""
|
| 403 |
+
Generator yields (log_text, transcripts_text, merged_file_path_or_None, percent_int)
|
| 404 |
+
"""
|
| 405 |
log = []
|
| 406 |
transcripts = []
|
| 407 |
word_file_path = None
|
|
|
|
| 411 |
# initial yield
|
| 412 |
yield "", "", None, 0
|
| 413 |
|
| 414 |
+
# cleanup previous temp dir
|
| 415 |
if os.path.exists(temp_extract_dir):
|
| 416 |
try:
|
| 417 |
shutil.rmtree(temp_extract_dir)
|
|
|
|
| 433 |
log.append("Incorrect zip password")
|
| 434 |
yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
|
| 435 |
return
|
| 436 |
+
exts = [
|
| 437 |
+
".mp3",
|
| 438 |
+
".wav",
|
| 439 |
+
".aac",
|
| 440 |
+
".flac",
|
| 441 |
+
".ogg",
|
| 442 |
+
".m4a",
|
| 443 |
+
".dat",
|
| 444 |
+
".dct",
|
| 445 |
+
]
|
| 446 |
count = 0
|
| 447 |
for info in zf.infolist():
|
| 448 |
if info.is_dir():
|
|
|
|
| 454 |
except Exception as e:
|
| 455 |
log.append(f"Error extracting {info.filename}: {e}")
|
| 456 |
continue
|
| 457 |
+
p = os.path.normpath(
|
| 458 |
+
os.path.join(temp_extract_dir, info.filename)
|
| 459 |
+
)
|
| 460 |
if os.path.exists(p):
|
| 461 |
extracted_audio_paths.append(p)
|
| 462 |
count += 1
|
|
|
|
| 503 |
yield "\n\n".join(log), "\n\n".join(transcripts), None, 100
|
| 504 |
return
|
| 505 |
|
| 506 |
+
# load model
|
| 507 |
yield "\n\n".join(log), "\n\n".join(transcripts), None, 5
|
| 508 |
try:
|
| 509 |
model = get_whisper_model(model_name)
|
|
|
|
| 518 |
for p in paths:
|
| 519 |
idx += 1
|
| 520 |
log.append(f"Processing file ({idx}/{total}): {p}")
|
| 521 |
+
yield "\n\n".join(log), "\n\n".join(transcripts), None, int(
|
| 522 |
+
5 + (idx - 1) * 80 / max(1, total)
|
| 523 |
+
)
|
| 524 |
|
| 525 |
wav = None
|
| 526 |
try:
|
|
|
|
| 528 |
log.append(f"Converted to WAV: {wav}")
|
| 529 |
except Exception as e:
|
| 530 |
log.append(f"Conversion failed for {p}: {e}")
|
| 531 |
+
transcripts.append(
|
| 532 |
+
f"FILE: {os.path.basename(p)}\nERROR: Conversion failed: {e}"
|
| 533 |
+
)
|
| 534 |
+
yield "\n\n".join(log), "\n\n".join(transcripts), None, int(
|
| 535 |
+
5 + idx * 80 / max(1, total)
|
| 536 |
+
)
|
| 537 |
continue
|
| 538 |
|
| 539 |
+
try:
|
| 540 |
+
whisper_opts = {}
|
| 541 |
+
if isinstance(advanced_options, dict):
|
| 542 |
+
whisper_opts.update(advanced_options)
|
| 543 |
+
|
| 544 |
+
result = model.transcribe(wav, **whisper_opts)
|
| 545 |
+
text = result.get("text", "").strip()
|
| 546 |
+
log.append(f"Transcribed: {len(text)} chars")
|
| 547 |
+
|
| 548 |
+
if enable_memory:
|
| 549 |
+
text = memory_correct_text(text)
|
| 550 |
+
text = postprocess_transcript(text)
|
| 551 |
+
transcripts.append(
|
| 552 |
+
f"FILE: {os.path.basename(p)}\n{text}\n"
|
| 553 |
+
)
|
| 554 |
+
|
| 555 |
+
if enable_memory:
|
| 556 |
+
try:
|
| 557 |
+
update_memory_with_transcript(text)
|
| 558 |
+
log.append("Memory updated.")
|
| 559 |
+
except Exception:
|
| 560 |
+
pass
|
| 561 |
+
|
| 562 |
+
yield "\n\n".join(log), "\n\n".join(transcripts), None, int(
|
| 563 |
+
10 + idx * 85 / max(1, total)
|
| 564 |
+
)
|
| 565 |
+
except Exception as e:
|
| 566 |
+
log.append(f"Transcription failed for {p}: {e}")
|
| 567 |
+
transcripts.append(
|
| 568 |
+
f"FILE: {os.path.basename(p)}\nERROR: Transcription failed: {e}"
|
| 569 |
+
)
|
| 570 |
+
yield "\n\n".join(log), "\n\n".join(transcripts), None, int(
|
| 571 |
+
10 + idx * 85 / max(1, total)
|
| 572 |
+
)
|
| 573 |
+
continue
|
| 574 |
+
finally:
|
| 575 |
+
try:
|
| 576 |
+
if wav and os.path.exists(wav):
|
| 577 |
+
tmpdir = tempfile.gettempdir()
|
| 578 |
+
if (
|
| 579 |
+
os.path.commonpath([tmpdir, os.path.abspath(wav)])
|
| 580 |
+
== tmpdir
|
| 581 |
+
and not p.lower().endswith(".wav")
|
| 582 |
+
):
|
| 583 |
+
os.unlink(wav)
|
| 584 |
+
except Exception:
|
| 585 |
+
pass
|
| 586 |
|
| 587 |
+
# final merge option
|
| 588 |
+
if merge_checkbox:
|
| 589 |
+
try:
|
| 590 |
+
merged_text = "\n\n".join(transcripts)
|
| 591 |
+
word_file_path = save_as_word(merged_text)
|
| 592 |
+
log.append(f"Merged transcript saved: {word_file_path}")
|
| 593 |
+
except Exception as e:
|
| 594 |
+
log.append(f"Failed to save merged file: {e}")
|
| 595 |
+
word_file_path = None
|
| 596 |
|
| 597 |
+
# final yield
|
| 598 |
+
yield "\n\n".join(log), "\n\n".join(transcripts), word_file_path, 100
|
| 599 |
|
| 600 |
+
# cleanup extracted dir
|
| 601 |
+
try:
|
| 602 |
+
if os.path.exists(temp_extract_dir):
|
| 603 |
+
shutil.rmtree(temp_extract_dir)
|
| 604 |
+
log.append("Cleaned temporary extraction dir.")
|
| 605 |
+
except Exception:
|
| 606 |
+
pass
|
| 607 |
|
| 608 |
|
| 609 |
+
# ----------------------- Gradio UI -----------------------
|
| 610 |
+
def run_transcription_wrapper(
    files, model_name, merge, zip_file, zip_password, enable_memory, advanced_options_state
):
    """Adapt the Gradio inputs to ``transcribe_multiple`` and stream its output.

    This is a generator function (``yield from``) rather than a plain function
    returning a generator object: Gradio only streams intermediate results
    from handlers that are themselves generator functions.

    Args:
        files: Uploaded audio file path(s), forwarded unchanged.
        model_name: Whisper model name.
        merge: Whether to produce the merged ``.docx``.
        zip_file: Optional zip upload; accepted as a path string/``PathLike``,
            an object with a ``name`` attribute, or a dict with a ``"name"``
            key.
        zip_password: Optional zip password.
        enable_memory: Whether memory-based correction is enabled.
        advanced_options_state: Extra options; forwarded (as a copy) when it
            is a dict, otherwise an empty dict is used.

    Yields:
        Whatever ``transcribe_multiple`` yields.
    """
    zip_path = None
    if zip_file:
        if isinstance(zip_file, (str, os.PathLike)):
            zip_path = str(zip_file)
        elif hasattr(zip_file, "name"):
            zip_path = zip_file.name
        elif isinstance(zip_file, dict) and zip_file.get("name"):
            zip_path = zip_file["name"]

    # Copy so the downstream call can never mutate the UI's state object.
    if isinstance(advanced_options_state, dict):
        adv = dict(advanced_options_state)
    else:
        adv = {}

    yield from transcribe_multiple(
        files,
        model_name,
        adv,
        merge_checkbox=merge,
        zip_file=zip_path,
        zip_password=zip_password,
        enable_memory=enable_memory,
    )
|
| 632 |
+
|
| 633 |
+
|
| 634 |
+
print("DEBUG: building Gradio Blocks", flush=True)

# Two-column layout: inputs and the submit button on the left, streaming
# outputs on the right. `demo` is the Blocks object used for launching.
with gr.Blocks() as demo:
    gr.Markdown("## Whisper Transcription (Spaces-ready)")

    with gr.Row():
        # ---- left column: inputs ----
        with gr.Column(scale=2):
            file_input = gr.File(
                label="Upload audio files (or zip)", file_count="multiple", type="filepath"
            )
            zip_input = gr.File(
                label="Optional: Upload zip file containing audio",
                file_count="single",
                type="filepath",
            )
            zip_password = gr.Textbox(
                label="Zip password (if any)", placeholder="password (optional)"
            )
            model_select = gr.Dropdown(
                choices=["small", "medium", "large", "base"],
                value="small",
                label="Whisper model",
            )
            merge_checkbox = gr.Checkbox(
                label="Merge transcripts to a single .docx (downloadable)", value=True
            )
            memory_checkbox = gr.Checkbox(
                label="Enable persistent memory (word/phrase correction)", value=False
            )
            submit = gr.Button("Transcribe")

        # ---- right column: outputs ----
        with gr.Column(scale=3):
            logs = gr.Textbox(label="Logs (streaming)", lines=12)
            transcripts_out = gr.Textbox(label="Transcripts (streaming)", lines=12)
            download_file = gr.File(label="Merged .docx (when enabled)")
            progress_num = gr.Number(value=0, label="Progress (%)")

    # Input order must match run_transcription_wrapper's positional parameters;
    # the trailing State supplies its `advanced_options_state` argument.
    click_inputs = [
        file_input,
        model_select,
        merge_checkbox,
        zip_input,
        zip_password,
        memory_checkbox,
        gr.State({}),
    ]
    click_outputs = [logs, transcripts_out, download_file, progress_num]
    submit.click(
        fn=run_transcription_wrapper, inputs=click_inputs, outputs=click_outputs
    )
|
| 688 |
+
|
| 689 |
+
# ---------- Launch ----------
|
| 690 |
if __name__ == "__main__":
|
| 691 |
port = int(os.environ.get("PORT", 7860))
|
| 692 |
print("DEBUG: launching Gradio on port", port, flush=True)
|
|
|
|
| 696 |
print("FATAL: demo.launch failed:", e, flush=True)
|
| 697 |
traceback.print_exc()
|
| 698 |
raise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|