humanvprojectceo committed on
Commit
c44a1c5
·
verified ·
1 Parent(s): f10e7ff

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -29
app.py CHANGED
@@ -1,29 +1,48 @@
1
  import os
2
  import io
3
  import asyncio
 
 
4
  import soundfile as sf
5
  import gradio as gr
6
  from google import genai
7
 
8
  client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
9
 
10
- MODEL = "gemini-2.5-flash-native-audio-preview-09-2025"
11
 
12
  config = {
13
- "response_modalities": ["AUDIO"]
 
14
  }
15
 
16
- async def generate_audio_response(text: str):
17
  async with client.aio.live.connect(model=MODEL, config=config) as session:
18
- await session.send_client_content(
19
- turns={"role": "user", "parts": [{"text": text}]},
20
- turn_complete=True,
21
  )
22
 
23
  audio_chunks = []
24
- async for response in session.receive():
25
- if response.data is not None:
26
- audio_chunks.append(response.data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  full_audio = b''.join(audio_chunks)
29
  if not full_audio:
@@ -33,40 +52,51 @@ async def generate_audio_response(text: str):
33
  y, sr = sf.read(buf, channels=1, samplerate=24000, format="RAW", subtype="PCM_16", dtype="float32")
34
  return sr, y
35
 
36
- def process_text(input_text: str | None):
37
- if not input_text or not input_text.strip():
38
- return None, "Please enter a message."
39
 
40
  try:
41
- sr, audio_data = asyncio.run(generate_audio_response(input_text.strip()))
42
- return (sr, audio_data), "Response generated successfully!"
 
 
 
 
 
 
43
  except Exception as e:
44
  return None, f"Error: {str(e)}"
45
 
# Text-in / audio-out UI: a prompt box, an audio player for the reply, a status line.
with gr.Blocks() as demo:
    gr.Markdown("# Gemini Text to Spoken Audio")
    gr.Markdown("Enter text Gemini responds with audio")

    input_text = gr.Textbox(label="Your message", placeholder="Hello? Gemini are you there?", lines=3)

    # type="numpy" matches the (sample_rate, samples) tuple returned by process_text.
    output_audio = gr.Audio(label="Gemini spoken response", type="numpy", autoplay=True)

    status = gr.Textbox(label="Status")

    btn = gr.Button("Generate Response")

    # process_text returns (audio, status), mapped onto the two outputs in order.
    btn.click(fn=process_text, inputs=input_text, outputs=[output_audio, status])

demo.launch()
 
1
  import os
2
  import io
3
  import asyncio
4
+ import numpy as np
5
+ import librosa
6
  import soundfile as sf
7
  import gradio as gr
8
  from google import genai
9
 
# Gemini client; the API key is read from the GEMINI_API_KEY environment variable.
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))

# Model used for the live session.
MODEL = "gemini-2.5-flash-native-audio-preview-12-2025"

# Session configuration: request audio output and set the system instruction.
config = dict(
    response_modalities=["AUDIO"],
    system_instruction="You are a helpful assistant and answer in a friendly tone.",
)
18
 
19
+ async def generate_audio_response(audio_bytes: bytes):
20
  async with client.aio.live.connect(model=MODEL, config=config) as session:
21
+ await session.send_realtime_input(
22
+ audio={"data": audio_bytes, "mime_type": "audio/pcm"}
 
23
  )
24
 
25
  audio_chunks = []
26
+ last_receive_time = asyncio.get_event_loop().time()
27
+
28
+ while True:
29
+ turn = session.receive()
30
+ has_new = False
31
+ async for response in turn:
32
+ if response.server_content and response.server_content.model_turn:
33
+ for part in response.server_content.model_turn.parts:
34
+ if hasattr(part, "inline_data") and part.inline_data.data:
35
+ audio_chunks.append(part.inline_data.data)
36
+ has_new = True
37
+ last_receive_time = asyncio.get_event_loop().time()
38
+
39
+ if audio_chunks and not has_new and (asyncio.get_event_loop().time() - last_receive_time > 3):
40
+ break
41
+
42
+ await asyncio.sleep(0.2)
43
+
44
+ if asyncio.get_event_loop().time() - last_receive_time > 30:
45
+ break
46
 
47
  full_audio = b''.join(audio_chunks)
48
  if not full_audio:
 
52
  y, sr = sf.read(buf, channels=1, samplerate=24000, format="RAW", subtype="PCM_16", dtype="float32")
53
  return sr, y
54
 
55
+ def process_audio(input_path: str | None):
56
+ if input_path is None:
57
+ return None, "Please upload a WAV file."
58
 
59
  try:
60
+ y, orig_sr = librosa.load(input_path, sr=None, mono=True)
61
+ y = librosa.resample(y, orig_sr=orig_sr, target_sr=16000)
62
+ y_int = np.int16(y * 32767)
63
+ audio_bytes = y_int.tobytes()
64
+
65
+ sr, response_audio = asyncio.run(generate_audio_response(audio_bytes))
66
+
67
+ return (sr, response_audio), "Response generated successfully!"
68
  except Exception as e:
69
  return None, f"Error: {str(e)}"
70
 
# Audio-in / audio-out UI: an upload widget, an audio player for the reply,
# and a status line.
# NOTE(review): nesting below is reconstructed from a diff view that lost
# indentation — confirm against the real app.py.
with gr.Blocks() as demo:
    gr.Markdown("# Gemini Live Audio-to-Audio Demo")
    gr.Markdown("Upload a WAV file (spoken query). Gemini will respond with spoken audio.")

    with gr.Row():
        # type="filepath" hands process_audio a path string (or None).
        input_audio = gr.Audio(label="Upload your query (WAV file)", type="filepath", sources=["upload"], format="wav")

    with gr.Row():
        # type="numpy" matches the (sample_rate, samples) tuple returned by process_audio.
        output_audio = gr.Audio(label="Gemini spoken response", type="numpy", autoplay=True)

    status = gr.Textbox(label="Status")

    btn = gr.Button("Generate Response")

    # process_audio returns (audio, status), mapped onto the two outputs in order.
    btn.click(fn=process_audio, inputs=input_audio, outputs=[output_audio, status])

    gr.Markdown("Example test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav")

demo.launch()