Scrapyard committed on
Commit
a748eff
·
1 Parent(s): 1a4dc8b

forgot to save

Browse files
Files changed (1) hide show
  1. app.py +41 -28
app.py CHANGED
@@ -1,55 +1,68 @@
 
1
  import gradio as gr
2
  import numpy as np
3
  from faster_whisper import WhisperModel
 
4
 
5
  audio_model = WhisperModel("tiny.en", device="cpu", compute_type="int8")
 
 
6
 
7
- def transcribe(sr, buffer):
8
- segments, info = audio_model.transcribe(buffer, beam_size=5)
9
- result = list(segments)
10
- text = ""
11
- if result and len(result) > 0:
12
- text = result[0].text
13
- print("Text:", text)
14
- else:
15
  text = ""
16
- print("No text found")
17
- print(result)
18
- return text
19
 
20
- def normaliseData(audioInput, buffer_state):
21
- sr, y = audioInput
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
 
 
 
23
  # Convert to mono if stereo
24
  if y.ndim > 1:
25
  y = y.mean(axis=1)
 
26
  y = y.astype(np.float32)
27
  y /= np.max(np.abs(y))
28
 
29
- # Append to buffer
30
- if buffer_state is not None:
31
- buffer = np.concatenate([buffer_state, y])
32
  else:
33
- buffer = y
34
 
35
- # Only process if buffer is at least 3 seconds
36
- min_samples = 3 * sr
37
- words = ""
38
- if len(buffer) >= min_samples:
39
- chunk = buffer[:min_samples]
40
- words = transcribe(sr, chunk)
41
- buffer = buffer[min_samples:] # Remove processed samples
42
 
43
- return buffer, words
44
 
45
  with gr.Blocks() as demo:
46
  audioInput = gr.Audio(sources=["microphone"], streaming=True)
47
  audioOutput = gr.Textbox(label="Output")
48
- buffer_state = gr.State()
49
 
50
  audioInput.stream(
51
  fn=normaliseData,
52
- inputs=[audioInput, buffer_state],
53
- outputs=[buffer_state, audioOutput]
54
  )
55
  demo.launch()
 
 
1
+
2
  import gradio as gr
3
  import numpy as np
4
  from faster_whisper import WhisperModel
5
+ from faster_whisper.transcribe import Segment
6
 
7
# Whisper model for streaming transcription (CPU, int8 for low memory/latency).
audio_model = WhisperModel("tiny.en", device="cpu", compute_type="int8")
# NOTE(review): `transcription` appears unused anywhere in this file — confirm before removing.
transcription = ['']
# Global rolling audio buffer; transcribe() appends chunks here and drains it
# once enough audio has accumulated.
buffer = np.array([])
10
 
11
def transcribe(SampleRate, data):
    """Accumulate streamed audio into the global buffer and run Whisper on it
    once at least 3 seconds of samples have been collected.

    Parameters:
        SampleRate: sample rate (Hz) of the incoming audio chunk.
        data: 1-D float32 numpy array of normalised audio samples.

    Returns:
        The transcribed text of the first segment once the buffer is long
        enough, otherwise None.
    """
    global buffer

    # BUG FIX: the original only appended `data` on the non-transcribe branch,
    # silently dropping the chunk that triggered a transcription. Always fold
    # the new chunk in first so no audio is lost.
    buffer = np.concatenate([buffer, data])

    # BUG FIX: the original condition was inverted (`SampleRate * 3 >= len(buffer)`
    # is true when the buffer holds LESS than 3 seconds, despite printing
    # "buffer big"). Transcribe only when we have at least 3 s of samples.
    if len(buffer) >= SampleRate * 3:
        print("buffer big")
        segments, info = audio_model.transcribe(buffer, beam_size=5)
        result = list(segments)

        text = ""
        if result:
            text = result[0].text
            print("Text:", text)
        else:
            print("No text found")
            print(result)

        buffer = np.array([])  # reset the window for the next 3 s of audio
        return text

    print("buffer small")
    return None
34
+
35
 
36
def normaliseData(audioInput, stream):
    """Gradio stream callback: normalise an incoming microphone chunk, append
    it to the per-session `stream` state, and forward the chunk to the
    transcriber.

    Parameters:
        audioInput: (sample_rate, samples) tuple emitted by gr.Audio streaming.
        stream: previously accumulated samples held in gr.State, or None on
            the first chunk of a session.

    Returns:
        (updated stream state, transcription text or None).
    """
    sr, y = audioInput

    # Convert to mono if stereo
    if y.ndim > 1:
        y = y.mean(axis=1)

    y = y.astype(np.float32)

    # BUG FIX: the original divided unconditionally by the peak amplitude;
    # a silent (all-zero) or empty chunk produced NaN/inf or raised on
    # np.max of an empty array. Only scale when there is a positive peak.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak

    # Accumulate the session's raw audio in the Gradio state object.
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    # transcribe() keeps its own global rolling buffer, so pass only the
    # new chunk; it returns None until ~3 s of audio have accumulated.
    words = transcribe(sr, y)

    # Return the stream as state and the transcription for display.
    return stream, words
 
 
 
55
 
 
56
 
57
# Minimal streaming UI: microphone chunks flow through normaliseData, which
# returns (updated state, transcription text).
with gr.Blocks() as demo:
    audioInput = gr.Audio(sources=["microphone"], streaming=True)  # mic input, delivered in chunks
    audioOutput = gr.Textbox(label="Output")  # shows the latest transcription
    # Per-session state holding the accumulated audio returned by normaliseData.
    state = gr.State()

    audioInput.stream(
        fn=normaliseData,
        inputs=[audioInput, state],
        outputs=[state, audioOutput]  # order matches normaliseData's (stream, words) return
    )
demo.launch()
68
+