mkfallah committed on
Commit
ddf950c
·
verified ·
1 Parent(s): 61c5f65

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -15
app.py CHANGED
@@ -21,10 +21,6 @@ custom_vocab_map = {
21
  }
22
 
23
  def replace_fuzzy(text, vocab_map, threshold=85):
24
- """
25
- Replace words/phrases in text using fuzzy matching with high threshold.
26
- Supports multiple alternatives per word/phrase.
27
- """
28
  for target, alternatives in vocab_map.items():
29
  match, score = process.extractOne(text, alternatives, scorer=fuzz.partial_ratio)
30
  if score >= threshold:
@@ -32,15 +28,25 @@ def replace_fuzzy(text, vocab_map, threshold=85):
32
  return text
33
 
34
  def transcribe(audio):
35
- # audio is a tuple (numpy array, sample_rate)
36
- data, sr = audio
37
- # --- convert mono to 2D for soundfile ---
38
- if data.ndim == 1:
39
- data = np.expand_dims(data, axis=1)
40
- with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
41
- sf.write(tmp.name, data, samplerate=sr)
42
- # ASR with chunking for long audios
43
- result = asr(tmp.name, chunk_length_s=30, stride_length_s=[5,5])
 
 
 
 
 
 
 
 
 
 
44
  text = result["text"]
45
  final_text = replace_fuzzy(text, custom_vocab_map, threshold=85)
46
  return final_text
@@ -48,10 +54,10 @@ def transcribe(audio):
48
  # --- Gradio interface ---
49
  iface = gr.Interface(
50
  fn=transcribe,
51
- inputs=gr.Audio(type="numpy"),
52
  outputs="text",
53
  title="Persian ASR with High Accuracy Vocabulary",
54
- description="Upload a Persian audio file; recognized words are corrected using a custom high-accuracy vocabulary."
55
  )
56
 
57
  iface.launch()
 
21
  }
22
 
23
  def replace_fuzzy(text, vocab_map, threshold=85):
 
 
 
 
24
  for target, alternatives in vocab_map.items():
25
  match, score = process.extractOne(text, alternatives, scorer=fuzz.partial_ratio)
26
  if score >= threshold:
 
28
  return text
29
 
30
  def transcribe(audio):
31
+ # check if audio is None
32
+ if audio is None:
33
+ return "No audio input detected."
34
+
35
+ # Gradio may pass a tuple or just a file path depending on version
36
+ if isinstance(audio, tuple):
37
+ data, sr = audio
38
+ # convert to 2D if mono
39
+ if isinstance(data, int):
40
+ return "Invalid audio data."
41
+ if data.ndim == 1:
42
+ data = np.expand_dims(data, axis=1)
43
+ with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
44
+ sf.write(tmp.name, data, samplerate=sr)
45
+ result = asr(tmp.name, chunk_length_s=30, stride_length_s=[5,5])
46
+ else:
47
+ # assume audio is file path
48
+ result = asr(audio, chunk_length_s=30, stride_length_s=[5,5])
49
+
50
  text = result["text"]
51
  final_text = replace_fuzzy(text, custom_vocab_map, threshold=85)
52
  return final_text
 
54
  # --- Gradio interface ---
55
  iface = gr.Interface(
56
  fn=transcribe,
57
+ inputs=gr.Audio(source="microphone", type="numpy"), # live mic recording
58
  outputs="text",
59
  title="Persian ASR with High Accuracy Vocabulary",
60
+ description="Speak in Persian; recognized words are corrected using a custom high-accuracy vocabulary."
61
  )
62
 
63
  iface.launch()