mkfallah committed on
Commit
153e956
·
verified ·
1 Parent(s): ddf950c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -11
app.py CHANGED
@@ -5,11 +5,11 @@ import tempfile
5
  import soundfile as sf
6
  import numpy as np
7
 
8
- # --- ASR pipeline ---
9
  asr = pipeline(
10
  task="automatic-speech-recognition",
11
  model="vhdm/whisper-large-fa-v1",
12
- device=-1 # CPU
13
  )
14
 
15
  # --- Custom vocabulary with multiple forms for accuracy ---
@@ -21,6 +21,10 @@ custom_vocab_map = {
21
  }
22
 
23
  def replace_fuzzy(text, vocab_map, threshold=85):
 
 
 
 
24
  for target, alternatives in vocab_map.items():
25
  match, score = process.extractOne(text, alternatives, scorer=fuzz.partial_ratio)
26
  if score >= threshold:
@@ -28,23 +32,27 @@ def replace_fuzzy(text, vocab_map, threshold=85):
28
  return text
29
 
30
  def transcribe(audio):
31
- # check if audio is None
 
 
32
  if audio is None:
33
  return "No audio input detected."
34
 
35
- # Gradio may pass a tuple or just a file path depending on version
36
  if isinstance(audio, tuple):
37
  data, sr = audio
38
- # convert to 2D if mono
39
  if isinstance(data, int):
40
  return "Invalid audio data."
41
  if data.ndim == 1:
42
  data = np.expand_dims(data, axis=1)
 
43
  with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
44
  sf.write(tmp.name, data, samplerate=sr)
 
45
  result = asr(tmp.name, chunk_length_s=30, stride_length_s=[5,5])
46
  else:
47
- # assume audio is file path
48
  result = asr(audio, chunk_length_s=30, stride_length_s=[5,5])
49
 
50
  text = result["text"]
@@ -54,10 +62,7 @@ def transcribe(audio):
54
  # --- Gradio interface ---
55
  iface = gr.Interface(
56
  fn=transcribe,
57
- inputs=gr.Audio(source="microphone", type="numpy"), # live mic recording
58
  outputs="text",
59
  title="Persian ASR with High Accuracy Vocabulary",
60
- description="Speak in Persian; recognized words are corrected using a custom high-accuracy vocabulary."
61
- )
62
-
63
- iface.launch()
 
5
  import soundfile as sf
6
  import numpy as np
7
 
8
+ # --- Initialize ASR pipeline ---
9
  asr = pipeline(
10
  task="automatic-speech-recognition",
11
  model="vhdm/whisper-large-fa-v1",
12
+ device=-1 # CPU; for GPU set device=0
13
  )
14
 
15
  # --- Custom vocabulary with multiple forms for accuracy ---
 
21
  }
22
 
23
  def replace_fuzzy(text, vocab_map, threshold=85):
24
+ """
25
+ Replace words/phrases in text using fuzzy matching with high threshold.
26
+ Supports multiple alternatives per word/phrase.
27
+ """
28
  for target, alternatives in vocab_map.items():
29
  match, score = process.extractOne(text, alternatives, scorer=fuzz.partial_ratio)
30
  if score >= threshold:
 
32
  return text
33
 
34
  def transcribe(audio):
35
+ """
36
+ audio: tuple(numpy array, sample_rate) from Gradio
37
+ """
38
  if audio is None:
39
  return "No audio input detected."
40
 
41
+ # Handle audio input
42
  if isinstance(audio, tuple):
43
  data, sr = audio
44
+ # Convert mono to 2D array for soundfile
45
  if isinstance(data, int):
46
  return "Invalid audio data."
47
  if data.ndim == 1:
48
  data = np.expand_dims(data, axis=1)
49
+ # Write temporary WAV file
50
  with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
51
  sf.write(tmp.name, data, samplerate=sr)
52
+ # Run ASR with chunking
53
  result = asr(tmp.name, chunk_length_s=30, stride_length_s=[5,5])
54
  else:
55
+ # If audio is a file path
56
  result = asr(audio, chunk_length_s=30, stride_length_s=[5,5])
57
 
58
  text = result["text"]
 
62
  # --- Gradio interface ---
63
  iface = gr.Interface(
64
  fn=transcribe,
65
+ inputs=gr.Audio(type="numpy", label="Record or upload audio"),
66
  outputs="text",
67
  title="Persian ASR with High Accuracy Vocabulary",
68
+ description="Speak in Persian or upload an audio file; recognized words are corrected