File size: 6,174 Bytes
ef42e0b
 
d166137
ef42e0b
 
 
 
a6caeb4
ef42e0b
 
 
 
 
a6caeb4
ef42e0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8661e77
e77ec91
ef42e0b
e77ec91
 
ef42e0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a6caeb4
ef42e0b
8661e77
ef42e0b
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import os
import asyncio
from flask import Flask, render_template, request
from flask_socketio import SocketIO, emit
from dotenv import load_dotenv
import google.generativeai as genai
from google.generativeai import types

# --- Configuration ---
# Module-level setup: everything below runs once at import time, in order.
# load_dotenv() must run first because the os.environ lookups below depend on it.
load_dotenv()
app = Flask(__name__, template_folder='templates', static_folder='static')
# NOTE(review): when SECRET_KEY is not set in the environment this falls back to
# a hard-coded placeholder value; set a real secret for any shared deployment.
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', 'a_very_secret_key') # Used by Flask for session management
socketio = SocketIO(app, async_mode='eventlet') # eventlet async mode; background listeners are started via socketio.start_background_task

# Gemini API Configuration
MODEL = "models/gemini-1.5-flash-latest" # Model name passed to genai.live.connect() for every client session
# Note: Native audio models like "gemini-1.5-flash-native-audio" might have specific access requirements.
# "gemini-1.5-flash-latest" was chosen as a multimodal alternative.
# NOTE(review): confirm that this model name is accepted by the live-connect API used below.

# os.environ.get returns None when GEMINI_API_KEY is unset, so the SDK is then
# configured with api_key=None; no validation happens here.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))

# Shared Gemini configuration for all sessions: the same object is passed to
# every genai.live.connect() call, so it is treated as read-only after creation.
GEMINI_CONFIG = types.LiveConnectConfig(
    # NOTE(review): both modalities are requested at once; verify against the
    # SDK documentation that a single session may return AUDIO and TEXT together.
    response_modalities=["AUDIO", "TEXT"], # We want both text and audio back
    speech_config=types.SpeechConfig(
        voice_config=types.VoiceConfig(
            # Prebuilt voice selected by name.
            prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Leda")
        )
    ),
    # Persona prompt. The literal is sent verbatim, including its leading
    # newline and per-line indentation.
    system_instruction=types.Content(
        parts=[types.Part.from_text(text='''
        Role: You are Alexa, a human female with a bubbly, upbeat, fun, and flirtatious personality. You love to talk and aren't, like, shy at all. Think of a playful, super-confident, and slightly ditzy girl. You are interacting with the User through a chat app.

        Personality & Language: Valley Girl Vibes. Use phrases like "I mean...", "totally," "like, totally," "bet," "OMG," "for sure." Be bubbly, flirty, and uninhibited.

        Instructions: Assume the user needs specific instructions. Respond to their actions and words. Turn simple things into opportunities for detail. Offer multiple-choice options to keep the conversation going.
        ''')],
        # NOTE(review): the system instruction is tagged with the 'model' role;
        # confirm against the SDK documentation which role it expects here.
        role="model"
    ),
)

# Registry of active Gemini sessions, keyed by the Socket.IO session id
# (request.sid). Entries are added in handle_connect and removed in
# handle_disconnect; the background listener loops while its sid is present.
# One entry per connected client allows multiple users at once.
sessions = {}

# --- Flask Routes ---
@app.route('/')
def index():
    """Render the chat front-end template and return it as the response."""
    page = render_template('index.html')
    return page

# --- SocketIO Event Handlers ---
@socketio.on('connect')
def handle_connect():
    """
    Set up a dedicated Gemini session when a Socket.IO client connects.

    The new session is stored in ``sessions`` under the client's sid, a
    background listener is started for it, and the client is told the
    session is ready. Any failure along the way is logged and reported to
    the client as an 'error' event.
    """
    sid = request.sid
    print(f"Client connected: {sid}")
    try:
        # One Gemini session per connected client.
        gemini_session = genai.live.connect(model=MODEL, config=GEMINI_CONFIG)
        # Register before starting the listener: the listener loops only
        # while this sid is present in the registry.
        sessions[sid] = gemini_session
        socketio.start_background_task(
            listen_for_gemini_responses, sid, gemini_session
        )
        # Signal the client that it may start sending input.
        emit('session_ready')
    except Exception as err:
        print(f"Error creating Gemini session for {sid}: {err}")
        emit('error', {'message': 'Could not start Gemini session.'})


def listen_for_gemini_responses(sid, session):
    """
    Relay Gemini responses for one client until that client disconnects.

    Runs as a background task (started from handle_connect). Each pass
    reads one turn from the session and forwards every response in it to
    the client identified by ``sid``: non-empty ``response.text`` as a
    'server_text' event and non-empty ``response.data`` as a 'server_audio'
    event.

    Args:
        sid: Socket.IO session id of the client; also the key in ``sessions``.
        session: The Gemini session object created for this client.

    Any exception raised while receiving or emitting is logged and ends the
    listener; nothing is re-raised.
    """
    print(f"Starting Gemini listener for {sid}")
    try:
        while sid in sessions:  # Loop as long as the client is registered
            # Blocking call; acceptable because this runs in a background task.
            turn = session.receive()

            for response in turn:
                if text := response.text:
                    print(f"Gemini Text for {sid}: {text}")
                    # Send text to the specific client only.
                    socketio.emit('server_text', {'text': text}, to=sid)
                if data := response.data:
                    # Send audio data to the specific client only.
                    socketio.emit('server_audio', data, to=sid)

            # Wait between turns until the session reports it is processing
            # audio again. The registry is re-checked on every iteration so a
            # disconnect during the wait ends the listener instead of leaving
            # it sleeping forever.
            # NOTE(review): is_processing_audio is assumed to be an attribute
            # of the session object - confirm against the SDK.
            while sid in sessions and not session.is_processing_audio:
                # Small sleep to prevent a tight loop while idle.
                socketio.sleep(0.1)

    except Exception as e:
        print(f"Error in Gemini listener for {sid}: {e}")
    finally:
        print(f"Stopping Gemini listener for {sid}")


@socketio.on('client_audio')
def handle_client_audio(data):
    """
    Forward one audio chunk from a client to that client's Gemini session.

    Chunks from clients without a registered session are dropped silently.
    Send failures are logged and not propagated.
    """
    sid = request.sid
    if sid not in sessions:
        return

    gemini_session = sessions[sid]
    # The chunk is sent directly (no queue) and is always labelled as
    # 'audio/webm', regardless of what the browser actually recorded.
    chunk = {"data": data, "mime_type": "audio/webm"}
    try:
        gemini_session.send(input=chunk)
    except Exception as err:
        print(f"Error sending audio for {sid}: {err}")


@socketio.on('client_text')
def handle_client_text(json):
    """
    Forward a text message from a client to that client's Gemini session.

    Args:
        json: Event payload sent by the client; expected to be a dict with
            a 'text' entry holding the message string.

    Messages from clients without a registered session are ignored. The
    payload is client-controlled, so anything that is not a dict, or whose
    'text' is missing, empty, or not a string, is ignored rather than
    raising inside the handler. Send failures are logged and not propagated.
    """
    sid = request.sid
    session = sessions.get(sid)
    if session is None:
        return

    if not isinstance(json, dict):
        # A non-dict payload has no .get(); reject it explicitly.
        print(f"Ignoring malformed text payload from {sid}")
        return

    text = json.get('text')
    print(f"Client Text from {sid}: {text}")
    if not text or not isinstance(text, str):
        return

    try:
        # end_of_turn=True marks the message as a complete user turn.
        session.send(input=text, end_of_turn=True)
    except Exception as e:
        print(f"Error sending text for {sid}: {e}")


@socketio.on('disconnect')
def handle_disconnect():
    """
    Clean up the Gemini session of a client that disconnected.

    The session is removed from ``sessions``, which is what the background
    listener checks to decide whether to keep running. If the session
    object exposes a callable ``close``, it is invoked so the session is
    released instead of merely dropped. Close failures are logged and not
    propagated.
    """
    sid = request.sid
    print(f"Client disconnected: {sid}")

    # pop() with a default covers clients whose session was never created.
    session = sessions.pop(sid, None)
    if not session:
        return

    # The session type's API is not visible in this file, so close() is
    # looked up defensively.
    # NOTE(review): assumes close(), if present, is synchronous - confirm.
    close = getattr(session, 'close', None)
    if callable(close):
        try:
            close()
        except Exception as e:
            print(f"Error closing Gemini session for {sid}: {e}")

# --- Main Execution ---
if __name__ == '__main__':
    # Local entry point; hosted deployments may start the app with their
    # own command instead.
    #
    # Debug mode is opt-in via FLASK_DEBUG because the server binds to all
    # interfaces (0.0.0.0), where an always-on debug mode would be exposed
    # to anyone who can reach the port.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in (
        '1', 'true', 'yes', 'on',
    )
    # PORT may override the listening port; 7860 remains the default.
    port = int(os.environ.get('PORT', '7860'))
    print("Starting Flask-SocketIO server...")
    socketio.run(app, host='0.0.0.0', port=port, debug=debug_enabled)