import asyncio
import base64
import os
import time
from io import BytesIO

import gradio as gr
import numpy as np
import websockets
from dotenv import load_dotenv
from fastrtc import (
    AsyncAudioVideoStreamHandler,
    Stream,
    WebRTC,
    get_cloudflare_turn_credentials_async,
    wait_for_item,
)
from google import genai
from gradio.utils import get_space
from PIL import Image

load_dotenv()


def encode_audio(data: np.ndarray) -> dict:
    """Encode Audio data to send to the server"""
    return {
        "mime_type": "audio/pcm",
        "data": base64.b64encode(data.tobytes()).decode("UTF-8"),
    }


def encode_image(data: np.ndarray) -> dict:
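    """Encode an RGB frame as a base64 JPEG payload for the Gemini Live API."""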
    with BytesIO() as output_bytes:
        pil_image = Image.fromarray(data)
        pil_image.save(output_bytes, "JPEG")
        bytes_data = output_bytes.getvalue()
    base64_str = str(base64.b64encode(bytes_data), "utf-8")
    return {"mime_type": "image/jpeg", "data": base64_str}


class GeminiHandler(AsyncAudioVideoStreamHandler):
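    """Bridge FastRTC's audio/video stream to the Gemini Live API: webcam frames and
    microphone audio are forwarded to Gemini, and its audio replies are queued for playback."""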
    def __init__(
        self,
    ) -> None:
        super().__init__(
            "mono",
            output_sample_rate=24000,
            input_sample_rate=16000,
        )
        self.audio_queue = asyncio.Queue()
        self.video_queue = asyncio.Queue()
        self.session = None
        self.last_frame_time = 0
        self.quit = asyncio.Event()

    def copy(self) -> "GeminiHandler":
        return GeminiHandler()

    async def start_up(self):
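        """Open a Gemini Live session and push its audio responses onto the playback queue."""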
        client = genai.Client(
            api_key=os.getenv("GEMINI_API_KEY"), http_options={"api_version": "v1alpha"}
        )
        config = {"response_modalities": ["AUDIO"]}
        async with client.aio.live.connect(
            model="gemini-2.0-flash-exp",
            config=config,  # type: ignore
        ) as session:
            self.session = session
            while not self.quit.is_set():
                turn = self.session.receive()
                try:
                    async for response in turn:
                        if data := response.data:
                            audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
                            self.audio_queue.put_nowait(audio)
                except websockets.exceptions.ConnectionClosedOK:
                    print("connection closed")
                    break

    async def video_receive(self, frame: np.ndarray):
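        # Echo the local frame back via video_emit and, at most once per second, forward it to Gemini.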
        self.video_queue.put_nowait(frame)

        if self.session:
            # send image every 1 second
            if time.time() - self.last_frame_time > 1:
                self.last_frame_time = time.time()
                await self.session.send(input=encode_image(frame))
                if self.latest_args[1] is not None:
                    await self.session.send(input=encode_image(self.latest_args[1]))

    async def video_emit(self):
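        # Return the most recent webcam frame, or a black placeholder while the queue is empty.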
        frame = await wait_for_item(self.video_queue, 0.01)
        if frame is not None:
            return frame
        else:
            return np.zeros((100, 100, 3), dtype=np.uint8)

    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
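        # Microphone audio from the browser: flatten the array and forward it to the live session.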
        _, array = frame
        array = array.squeeze()
        audio_message = encode_audio(array)
        if self.session:
            await self.session.send(input=audio_message)

    async def emit(self):
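        # Hand Gemini's synthesized audio to FastRTC for playback; None if nothing is queued yet.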
        array = await wait_for_item(self.audio_queue, 0.01)
        if array is not None:
            return (self.output_sample_rate, array)
        return array

    async def shutdown(self) -> None:
        if self.session:
            self.quit.set()
            await self.session.close()
            self.quit.clear()


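# Audio+video send-receive stream; Cloudflare TURN credentials are fetched per connection and
# an optional reference image can be supplied as an additional input.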
stream = Stream(
    handler=GeminiHandler(),
    modality="audio-video",
    mode="send-receive",
    rtc_configuration=get_cloudflare_turn_credentials_async,
    time_limit=180 if get_space() else None,
    additional_inputs=[
        gr.Image(label="Image", type="numpy", sources=["upload", "clipboard"])
    ],
    ui_args={
        "icon": "https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
        "pulse_color": "rgb(255, 255, 255)",
        "icon_button_color": "rgb(255, 255, 255)",
        "title": "Gemini Audio Video Chat",
    },
)

css = """
#video-source {max-width: 600px !important; max-height: 600px !important;}
"""

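# Custom Gradio UI; assigning it to stream.ui below replaces FastRTC's auto-generated interface.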
with gr.Blocks(css=css) as demo:
    gr.HTML(
        """
    <div style='display: flex; align-items: center; justify-content: center; gap: 20px'>
        <div style="background-color: var(--block-background-fill); border-radius: 8px">
            <img src="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png" style="width: 100px; height: 100px;">
        </div>
        <div>
            <h1>Gen AI SDK Voice Chat</h1>
            <p>Speak with Gemini using real-time audio + video streaming</p>
            <p>Powered by <a href="https://gradio.app/">Gradio</a> and <a href="https://freddyaboulton.github.io/gradio-webrtc/">WebRTC</a>⚡️</p>
            
        </div>
    </div>
    """
    )
    with gr.Row() as row:
        with gr.Column():
            webrtc = WebRTC(
                label="Video Chat",
                modality="audio-video",
                mode="send-receive",
                elem_id="video-source",
                rtc_configuration=get_cloudflare_turn_credentials_async,
                icon="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
                pulse_color="rgb(255, 255, 255)",
                icon_button_color="rgb(255, 255, 255)",
            )
        # with gr.Column():
        #     image_input = gr.Image(
        #         label="Image", type="numpy", sources=["upload", "clipboard"]
        #     )

        webrtc.stream(
            GeminiHandler(),
            inputs=[webrtc],
            outputs=[webrtc],
            time_limit=180 if get_space() else None,
            concurrency_limit=2 if get_space() else None,
        )

stream.ui = demo


if __name__ == "__main__":
    if (mode := os.getenv("MODE")) == "UI":
        stream.ui.launch(server_port=7860)
    elif mode == "PHONE":
        raise ValueError("Phone mode not supported for this demo")
    else:
        stream.ui.launch(server_port=7860)



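# Alternative text-only German/Italian translator demo, kept disabled inside a string literal.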
'''
import gradio as gr
import os
import google.generativeai as genai
from google.generativeai import types
import sys # To print errors to stderr for HF logs

# --- Configuration ---
# Use a generally available model, adjust if needed
MODEL_NAME = "gemini-2.0-flash"
SYSTEM_INSTRUCTION = """Du bist ein Echtzeitübersetzer für Deutsch und Italienisch: if sprache==deutsch then translate to italienisch and add \"it-IT\" at the end. if sprache==italienisch then translate to deutsch and add \"de-DE\" at the end. Erkläre nicht, kommentiere nicht, füge nichts hinzu, nur übersetzen + Länderkürzel am Ende."""

# --- Gemini API Interaction ---

def get_gemini_client():
    """Initializes and returns the Gemini client, checking for API key."""
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        print("Error: GEMINI_API_KEY environment variable not set.", file=sys.stderr)
        # Raising gr.Error stops execution and displays the message in the Gradio UI
        raise gr.Error("GEMINI_API_KEY environment variable not set. Please configure it in the Hugging Face Space secrets.")
    try:
        genai.configure(api_key=api_key)
        # Optional: Add a quick check if the API key is valid, e.g., list models
        # genai.list_models()
        return genai # Return the configured module
    except Exception as e:
        print(f"Error configuring Gemini client: {e}", file=sys.stderr)
        raise gr.Error(f"Failed to configure Gemini API. Check API Key. Error: {e}")

def generate_translation(input_text):
    """
    Takes input text, calls the Gemini API with streaming, and yields the translation chunks.
    """
    if not input_text:
        yield "" # Return empty string immediately if input is empty
        return

    try:
        client = get_gemini_client() # Get configured client module
        model = client.GenerativeModel(
            MODEL_NAME,
            system_instruction=SYSTEM_INSTRUCTION,
            # Safety settings are recommended
            safety_settings=[
                {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
                {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
            ]
        )

        contents = [
             # No need for explicit role here for single turn with system prompt
             input_text
        ]

        # Tools might not be strictly necessary for pure translation,
        # but keeping it as per original request. Remove if causing issues or cost.
        # tools = [
        #     types.Tool(google_search=types.GoogleSearch()),
        # ]
        generation_config = types.GenerationConfig(
            # tools=tools, # Uncomment if you need the search tool
            response_mime_type="text/plain",
            # Add other config like temperature if needed
            # temperature=0.7
        )

        # Use the stream=True parameter for the model's generate_content method
        response_stream = model.generate_content(
            contents=contents,
            generation_config=generation_config,
            stream=True
        )

        # Yield each chunk's text
        output_buffer = ""
        for chunk in response_stream:
             # Check if the chunk has text content before yielding
             # Sometimes chunks might contain other info (like safety ratings or finish reason)
             if chunk.parts:
                 output_buffer += chunk.text
                 yield output_buffer # Yield the accumulated text so far for streaming UI

    except gr.Error as e: # Re-raise Gradio errors to show them in UI
         print(f"Gradio Error during generation: {e}", file=sys.stderr)
         yield f"Error: {e}" # Yield the error message
    except Exception as e:
        error_message = f"An error occurred during translation: {e}"
        print(error_message, file=sys.stderr)
        # Provide a user-friendly error in the Gradio interface
        yield error_message # Yield the error message

# --- Gradio Interface Definition ---
with gr.Blocks() as demo:
    gr.Markdown(f"""
    # German <-> Italian Translator (Gemini {MODEL_NAME})
    Enter text in German or Italian. The app will attempt to translate based on the rules:
    *   German input -> Italian translation + `it-IT`
    *   Italian input -> German translation + `de-DE`
    *   Uses Google Gemini API. Requires `GEMINI_API_KEY` set in Hugging Face Space secrets.
    """)

    with gr.Row():
        text_input = gr.Textbox(
            label="Input Text (German or Italian)",
            placeholder="Type German or Italian text here...",
            lines=5
        )
        text_output = gr.Textbox(
            label="Translation",
            placeholder="Translation will appear here...",
            lines=5,
            interactive=False # Output is read-only
        )

    translate_button = gr.Button("Translate", variant="primary")

    # Connect button click to the generation function
    # The function `generate_translation` is a generator, so Gradio handles streaming automatically
    translate_button.click(
        fn=generate_translation,
        inputs=text_input,
        outputs=text_output
    )

    gr.Examples(
        examples=[
            ["Hallo Welt"],
            ["Come stai?"],
            ["Das ist ein Test."],
            ["Questa è una prova."],
            ["Ich liebe Pizza."],
            ["Mi piace il gelato."],
        ],
        inputs=text_input,
        outputs=text_output, # Optional: show expected output if known, otherwise just prefill input
        fn=generate_translation, # Make examples clickable
        cache_examples=False, # Disable caching for API calls unless desired/safe
        label="Examples (Click to Run)"
    )

# --- Launch the App ---
# When deploying on Hugging Face Spaces, HF automatically runs this.
# You might need demo.launch(share=True) for local testing with sharing.
if __name__ == "__main__":
    # Check for API key locally before launching (optional but helpful for local dev)
    if not os.environ.get("GEMINI_API_KEY"):
         print("\nWarning: GEMINI_API_KEY environment variable not set.", file=sys.stderr)
         print("The app will likely fail unless the key is provided.", file=sys.stderr)
         print("For Hugging Face deployment, set this in the Space secrets.\n", file=sys.stderr)
    demo.launch()
'''