Spaces:

humanvprojectceo
/

HumanV

Sleeping

App Files Files Community

humanvprojectceo committed on Feb 9

Commit

ed85784

verified ·

1 Parent(s): dfe4e71

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -13

app.py CHANGED Viewed

@@ -1,7 +1,9 @@
 import os
 import asyncio
-import soundfile as sf
 import numpy as np
 from google import genai
 from google.genai import types
@@ -9,22 +11,24 @@ MODEL = "models/gemini-2.5-flash-native-audio-preview-12-2025"
 client = genai.Client(
     http_options={"api_version": "v1beta"},
-    api_key=os.environ.get("GEMINI_API_KEY"),
 )
 CONFIG = types.LiveConnectConfig(
     response_modalities=["AUDIO"]
 )
 def load_audio_as_pcm16(path):
     y, sr = sf.read(path)
-    # mono
     if len(y.shape) > 1:
         y = y.mean(axis=1)
-    # resample to 16k
     if sr != 16000:
         import resampy
         y = resampy.resample(y, sr, 16000)
@@ -34,6 +38,9 @@ def load_audio_as_pcm16(path):
     return pcm16.tobytes()
 async def send_audio_file(file_path):
     audio_bytes = load_audio_as_pcm16(file_path)
@@ -55,17 +62,61 @@ async def send_audio_file(file_path):
                 audio_chunks.append(response.data)
         full_audio = b"".join(audio_chunks)
-        return full_audio
-def main(file_path):
-    audio = asyncio.run(send_audio_file(file_path))
-    with open("response.raw", "wb") as f:
-        f.write(audio)
-    print("Audio response saved as response.raw")
-if __name__ == "__main__":
-    main("input_audio.wav")

 import os
+import io
 import asyncio
 import numpy as np
+import soundfile as sf
+import gradio as gr
 from google import genai
 from google.genai import types
 client = genai.Client(
     http_options={"api_version": "v1beta"},
+    api_key=os.getenv("GEMINI_API_KEY"),
 )
 CONFIG = types.LiveConnectConfig(
     response_modalities=["AUDIO"]
 )
+# ------------------------
+# Audio preprocessing
+# ------------------------
 def load_audio_as_pcm16(path):
     y, sr = sf.read(path)
+    # Convert to mono
     if len(y.shape) > 1:
         y = y.mean(axis=1)
+    # Resample to 16 kHz
     if sr != 16000:
         import resampy
         y = resampy.resample(y, sr, 16000)
     return pcm16.tobytes()
+# ------------------------
+# Gemini interaction
+# ------------------------
 async def send_audio_file(file_path):
     audio_bytes = load_audio_as_pcm16(file_path)
                 audio_chunks.append(response.data)
         full_audio = b"".join(audio_chunks)
+        # Convert the model output to a numpy array
+        buf = io.BytesIO(full_audio)
+        y, sr = sf.read(
+            buf,
+            channels=1,
+            samplerate=24000,
+            format="RAW",
+            subtype="PCM_16",
+            dtype="float32"
+        )
+        return sr, y
+# ------------------------
+# Gradio function
+# ------------------------
+def process_audio(file):
+    if file is None:
+        return None, "Please upload an audio file."
+    try:
+        sr, audio_data = asyncio.run(send_audio_file(file))
+        return (sr, audio_data), "Response generated successfully!"
+    except Exception as e:
+        return None, f"Error: {str(e)}"
+# ------------------------
+# Gradio UI
+# ------------------------
+with gr.Blocks() as demo:
+    gr.Markdown("# Gemini Audio → Audio")
+    gr.Markdown("Upload audio → Gemini responds with audio")
+    input_audio = gr.Audio(
+        label="Upload audio",
+        type="filepath"
+    )
+    output_audio = gr.Audio(
+        label="Gemini spoken response",
+        type="numpy",
+        autoplay=True
+    )
+    status = gr.Textbox(label="Status")
+    btn = gr.Button("Send Audio")
+    btn.click(
+        fn=process_audio,
+        inputs=input_audio,
+        outputs=[output_audio, status]
+    )
+demo.launch()