File size: 6,174 Bytes
ef42e0b d166137 ef42e0b a6caeb4 ef42e0b a6caeb4 ef42e0b 8661e77 e77ec91 ef42e0b e77ec91 ef42e0b a6caeb4 ef42e0b 8661e77 ef42e0b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
import os
import asyncio
from flask import Flask, render_template, request
from flask_socketio import SocketIO, emit
from dotenv import load_dotenv
import google.generativeai as genai
from google.generativeai import types
# --- Configuration ---
load_dotenv()
app = Flask(__name__, template_folder='templates', static_folder='static')
# SECRET_KEY is Flask's signing key. A constant fallback committed to source is
# readable by anyone with access to the repository, so when the environment
# does not provide a key (unset or empty) a random one is generated for this
# process instead. Anything signed with the generated key stops validating
# after a restart; set SECRET_KEY in the environment to keep it stable.
_secret_key = os.environ.get('SECRET_KEY')
if not _secret_key:
    _secret_key = os.urandom(32).hex()
    print("SECRET_KEY is not set; using a random key for this process only.")
app.config['SECRET_KEY'] = _secret_key
socketio = SocketIO(app, async_mode='eventlet')  # eventlet backend; background tasks are started via socketio.start_background_task
# Gemini API Configuration
# Model identifier passed to genai.live.connect() for every client session.
# NOTE(review): confirm that this model is available to the project and that it
# supports the live/streaming session API used below; native-audio models may
# have separate access requirements.
MODEL = "models/gemini-1.5-flash-latest"
# os.environ.get returns None when GEMINI_API_KEY is unset, so a missing key is
# not reported here; it would only surface once the API is actually called.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))
# Connection settings built once at import time; the same object is passed to
# genai.live.connect() for every client, so all sessions share one voice and
# one persona prompt.
# NOTE(review): two response modalities are requested at once and the system
# instruction is tagged role="model" - verify both against the Gemini Live API
# documentation for the SDK version in use.
_PERSONA_PROMPT = '''
Role: You are Alexa, a human female with a bubbly, upbeat, fun, and flirtatious personality. You love to talk and aren't, like, shy at all. Think of a playful, super-confident, and slightly ditzy girl. You are interacting with the User through a chat app.
Personality & Language: Valley Girl Vibes. Use phrases like "I mean...", "totally," "like, totally," "bet," "OMG," "for sure." Be bubbly, flirty, and uninhibited.
Instructions: Assume the user needs specific instructions. Respond to their actions and words. Turn simple things into opportunities for detail. Offer multiple-choice options to keep the conversation going.
'''
_PREBUILT_VOICE = types.VoiceConfig(
    prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Leda"),
)
GEMINI_CONFIG = types.LiveConnectConfig(
    # Ask for audio and text in the replies.
    response_modalities=["AUDIO", "TEXT"],
    speech_config=types.SpeechConfig(voice_config=_PREBUILT_VOICE),
    system_instruction=types.Content(
        parts=[types.Part.from_text(text=_PERSONA_PROMPT)],
        role="model",
    ),
)
# Registry of live Gemini sessions, keyed by the Socket.IO session id
# (request.sid) of the client that owns each one. One entry per connected
# client is what lets several users talk to Gemini at the same time.
sessions = dict()
# --- Flask Routes ---
@app.route('/')
def index():
    """Render the ``index.html`` template as the response for the site root."""
    page = render_template('index.html')
    return page
# --- SocketIO Event Handlers ---
@socketio.on('connect')
def handle_connect():
    """Set up a dedicated Gemini session for a newly connected client.

    On success the session is stored in ``sessions`` under the client's
    ``request.sid``, a background listener task is started for it, and the
    client receives a ``session_ready`` event. If any of those steps raises,
    the error is printed and the client receives an ``error`` event instead.
    """
    print(f"Client connected: {request.sid}")
    try:
        # NOTE(review): genai.live.connect is used as a synchronous call here;
        # confirm this entry point exists in the installed Gemini SDK.
        gemini_session = genai.live.connect(model=MODEL, config=GEMINI_CONFIG)
        # Register first so the listener's "sid in sessions" check is true
        # by the time the background task runs.
        sessions[request.sid] = gemini_session
        socketio.start_background_task(
            listen_for_gemini_responses,
            request.sid,
            gemini_session,
        )
        # Let the browser know it may begin sending input.
        emit('session_ready')
    except Exception as e:
        print(f"Error creating Gemini session for {request.sid}: {e}")
        emit('error', {'message': 'Could not start Gemini session.'})
def listen_for_gemini_responses(sid, session):
    """Relay Gemini output to one client until that client disconnects.

    Started once per client through ``socketio.start_background_task``. Each
    pass reads one turn from ``session.receive()`` and forwards every response
    in it to the client: text as a ``server_text`` event, binary data as a
    ``server_audio`` event. Any exception ends the listener after being
    printed; the "Stopping" message is printed on every exit path.

    Args:
        sid: Socket.IO session id of the client to emit to. The listener runs
            only while this id is a key of the module-level ``sessions`` dict.
        session: The Gemini session object created for this client.
    """
    print(f"Starting Gemini listener for {sid}")
    try:
        while sid in sessions:  # Loop as long as the client is connected
            # NOTE(review): receive() is treated as a blocking call that
            # yields the responses of one turn - confirm against the SDK.
            turn = session.receive()
            for response in turn:
                if text := response.text:
                    print(f"Gemini Text for {sid}: {text}")
                    # Send text to the specific client
                    socketio.emit('server_text', {'text': text}, to=sid)
                if data := response.data:
                    # Send audio data to the specific client
                    socketio.emit('server_audio', data, to=sid)
            # Wait for the session to report audio activity before reading the
            # next turn. The registry is re-checked on every tick: without it
            # this wait would keep spinning forever after the client
            # disconnected while the flag stayed false.
            # NOTE(review): is_processing_audio is assumed to be an attribute
            # of the session object; verify it exists.
            while sid in sessions and not session.is_processing_audio:
                # Short sleep so the wait yields instead of busy-looping
                socketio.sleep(0.1)
    except Exception as e:
        print(f"Error in Gemini listener for {sid}: {e}")
    finally:
        print(f"Stopping Gemini listener for {sid}")
@socketio.on('client_audio')
def handle_client_audio(data):
    """Pass one audio chunk from a client on to that client's Gemini session.

    Chunks from clients without a registered session are dropped silently.
    Send failures are printed and otherwise ignored.

    Args:
        data: The audio chunk exactly as received from the client.
    """
    try:
        gemini_session = sessions[request.sid]
    except KeyError:
        # No session registered for this client; nothing to forward to.
        return

    # Chunks are forwarded one by one, without buffering.
    # NOTE(review): every chunk is labelled audio/webm regardless of what the
    # browser recorded - confirm the client's format and what Gemini accepts.
    audio_message = {"data": data, "mime_type": "audio/webm"}
    try:
        gemini_session.send(input=audio_message)
    except Exception as e:
        print(f"Error sending audio for {request.sid}: {e}")
@socketio.on('client_text')
def handle_client_text(json):
    """Forward a text message from a client to that client's Gemini session.

    Messages from clients without a registered session, payloads that are not
    dicts, and payloads whose ``'text'`` entry is missing or empty are all
    ignored. Send failures are printed and otherwise ignored.

    Args:
        json: Event payload sent by the client; expected to be a dict with a
            ``'text'`` entry holding the message.
    """
    if request.sid not in sessions:
        return
    session = sessions[request.sid]

    # The payload comes straight from the browser, so its shape is not
    # guaranteed; calling .get() on a str/list/None would raise inside the
    # event handler.
    if not isinstance(json, dict):
        print(f"Ignoring non-dict text payload from {request.sid}")
        return

    text = json.get('text')
    print(f"Client Text from {request.sid}: {text}")
    if not text:
        return

    try:
        # end_of_turn=True marks this message as a complete user turn.
        session.send(input=text, end_of_turn=True)
    except Exception as e:
        print(f"Error sending text for {request.sid}: {e}")
@socketio.on('disconnect')
def handle_disconnect():
    """Drop a disconnected client's Gemini session and release it if possible.

    Removing the entry from ``sessions`` is what lets the client's background
    listener stop (it loops on ``sid in sessions``). The session object is
    then closed on a best-effort basis so it is not simply abandoned.
    """
    print(f"Client disconnected: {request.sid}")
    # Single lookup; None means this client never got a session registered.
    session = sessions.pop(request.sid, None)
    if not session:
        return

    # NOTE(review): this file does not show whether the session type offers a
    # close() method, so it is only called when present. If close() turns out
    # to be a coroutine function it must be awaited instead - confirm against
    # the SDK.
    close = getattr(session, 'close', None)
    if callable(close):
        try:
            close()
        except Exception as e:
            # Cleanup must never break disconnect handling.
            print(f"Error closing Gemini session for {request.sid}: {e}")
# --- Main Execution ---
if __name__ == '__main__':
    # Local entry point; a hosting platform that launches the app with its own
    # command does not execute this branch.
    # The server listens on all interfaces, so debug mode is opt-in rather
    # than hard-coded: Flask's debug mode is meant for development only and
    # must not be reachable from untrusted networks. Set FLASK_DEBUG=1 to
    # enable it locally.
    debug_flag = os.environ.get('FLASK_DEBUG', '').strip().lower()
    debug_enabled = debug_flag in ('1', 'true', 'yes', 'on')
    print("Starting Flask-SocketIO server...")
    socketio.run(app, host='0.0.0.0', port=7860, debug=debug_enabled)