humanvprojectceo committed on
Commit
ed85784
·
verified ·
1 Parent(s): dfe4e71

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -13
app.py CHANGED
@@ -1,7 +1,9 @@
1
  import os
 
2
  import asyncio
3
- import soundfile as sf
4
  import numpy as np
 
 
5
  from google import genai
6
  from google.genai import types
7
 
@@ -9,22 +11,24 @@ MODEL = "models/gemini-2.5-flash-native-audio-preview-12-2025"
9
 
10
  client = genai.Client(
11
  http_options={"api_version": "v1beta"},
12
- api_key=os.environ.get("GEMINI_API_KEY"),
13
  )
14
 
15
  CONFIG = types.LiveConnectConfig(
16
  response_modalities=["AUDIO"]
17
  )
18
 
19
-
 
 
20
  def load_audio_as_pcm16(path):
21
  y, sr = sf.read(path)
22
 
23
- # mono
24
  if len(y.shape) > 1:
25
  y = y.mean(axis=1)
26
 
27
- # resample to 16k
28
  if sr != 16000:
29
  import resampy
30
  y = resampy.resample(y, sr, 16000)
@@ -34,6 +38,9 @@ def load_audio_as_pcm16(path):
34
  return pcm16.tobytes()
35
 
36
 
 
 
 
37
  async def send_audio_file(file_path):
38
  audio_bytes = load_audio_as_pcm16(file_path)
39
 
@@ -55,17 +62,61 @@ async def send_audio_file(file_path):
55
  audio_chunks.append(response.data)
56
 
57
  full_audio = b"".join(audio_chunks)
58
- return full_audio
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
- def main(file_path):
62
- audio = asyncio.run(send_audio_file(file_path))
 
 
 
63
 
64
- with open("response.raw", "wb") as f:
65
- f.write(audio)
66
 
67
- print("Audio response saved as response.raw")
68
 
 
 
 
 
 
69
 
70
- if __name__ == "__main__":
71
- main("input_audio.wav")
 
1
  import os
2
+ import io
3
  import asyncio
 
4
  import numpy as np
5
+ import soundfile as sf
6
+ import gradio as gr
7
  from google import genai
8
  from google.genai import types
9
 
 
11
 
12
  client = genai.Client(
13
  http_options={"api_version": "v1beta"},
14
+ api_key=os.getenv("GEMINI_API_KEY"),
15
  )
16
 
17
  CONFIG = types.LiveConnectConfig(
18
  response_modalities=["AUDIO"]
19
  )
20
 
21
+ # ------------------------
22
+ # Audio preprocessing
23
+ # ------------------------
24
  def load_audio_as_pcm16(path):
25
  y, sr = sf.read(path)
26
 
27
+ # تبدیل به mono
28
  if len(y.shape) > 1:
29
  y = y.mean(axis=1)
30
 
31
+ # resample به 16k
32
  if sr != 16000:
33
  import resampy
34
  y = resampy.resample(y, sr, 16000)
 
38
  return pcm16.tobytes()
39
 
40
 
41
+ # ------------------------
42
+ # Gemini interaction
43
+ # ------------------------
44
  async def send_audio_file(file_path):
45
  audio_bytes = load_audio_as_pcm16(file_path)
46
 
 
62
  audio_chunks.append(response.data)
63
 
64
  full_audio = b"".join(audio_chunks)
 
65
 
66
+ # تبدیل خروجی مدل به numpy
67
+ buf = io.BytesIO(full_audio)
68
+ y, sr = sf.read(
69
+ buf,
70
+ channels=1,
71
+ samplerate=24000,
72
+ format="RAW",
73
+ subtype="PCM_16",
74
+ dtype="float32"
75
+ )
76
+
77
+ return sr, y
78
+
79
+
80
+ # ------------------------
81
+ # Gradio function
82
+ # ------------------------
83
+ def process_audio(file):
84
+ if file is None:
85
+ return None, "Please upload an audio file."
86
+
87
+ try:
88
+ sr, audio_data = asyncio.run(send_audio_file(file))
89
+ return (sr, audio_data), "Response generated successfully!"
90
+ except Exception as e:
91
+ return None, f"Error: {str(e)}"
92
+
93
+
94
+ # ------------------------
95
+ # Gradio UI
96
+ # ------------------------
97
+ with gr.Blocks() as demo:
98
+ gr.Markdown("# Gemini Audio → Audio")
99
+ gr.Markdown("Upload audio → Gemini responds with audio")
100
+
101
+ input_audio = gr.Audio(
102
+ label="Upload audio",
103
+ type="filepath"
104
+ )
105
 
106
+ output_audio = gr.Audio(
107
+ label="Gemini spoken response",
108
+ type="numpy",
109
+ autoplay=True
110
+ )
111
 
112
+ status = gr.Textbox(label="Status")
 
113
 
114
+ btn = gr.Button("Send Audio")
115
 
116
+ btn.click(
117
+ fn=process_audio,
118
+ inputs=input_audio,
119
+ outputs=[output_audio, status]
120
+ )
121
 
122
+ demo.launch()