# gemini / app.py
# Source: kcrobot20's Hugging Face Space, commit 58c8710 (verified)
import base64
import io
import json
import os
import random
import re
import time
import wave

import requests
from dotenv import load_dotenv
from flask import Flask, request, Response, jsonify
from gtts import gTTS
from pydub import AudioSegment
# -------------------------
# --- Configuration ---

# Load variables from a local .env file into the process environment.
load_dotenv()

# The Gemini API key is mandatory; fail fast at startup rather than on
# the first request.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    # Exit if the API key is not set
    raise ValueError("GEMINI_API_KEY not found in .env file.")

# --- Gemini HTTP Configuration ---
# Using gemini-2.5-flash-lite for both STT and Text Generation
GEMINI_MODEL = "gemini-2.5-flash-lite"
# Base URL for the generateContent endpoint
GEMINI_API_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/models"

# --- DEBUGGING OUTPUT CONFIGURATION ---
# All generated debug files (like response_audio.mp3) will be saved here.
DEBUG_OUTPUT_DIR = "debug_audio_files"

# Flask application instance; routes are registered further below.
app = Flask(__name__)
# --- Helper Function for Cleaning Text ---
def clean_text_for_tts(text):
    """Normalize LLM output so it reads cleanly through a TTS engine.

    Removes Markdown emphasis/heading markers, drops bracketed
    placeholders, and collapses all whitespace runs into single spaces.
    """
    # Strip Markdown bold/italic asterisks and heading hashes.
    stripped = re.sub(r'[\*\#]', '', text)
    # Discard anything wrapped in square brackets (placeholders, link text).
    stripped = re.sub(r'\[.*?\]', '', stripped)
    # Collapse newlines/tabs/repeated spaces into one space and trim the ends.
    return re.sub(r'\s+', ' ', stripped).strip()
# --- Helper Functions for Audio Processing ---
def convert_raw_pcm_to_wav_base64(raw_pcm_data, sample_rate=16000, sample_width=2, channels=1):
    """
    Wrap raw PCM audio data (from the ESP32) in a WAV container in memory
    and return it Base64-encoded, ready for the Gemini API.

    FIX: uses the stdlib `wave` module instead of pydub. Building a WAV
    container from raw PCM needs no transcoding, so this removes the
    FFmpeg dependency (and its failure mode) from this code path.

    :param raw_pcm_data: raw little-endian PCM bytes.
    :param sample_rate: frames per second (default 16000 Hz).
    :param sample_width: bytes per sample (default 2 = 16-bit).
    :param channels: channel count (default 1 = mono).
    :return: Base64 string of the WAV file, or None on failure.
    """
    try:
        wav_fp = io.BytesIO()
        # `wave` writes the RIFF/WAVE header for us; frames are copied as-is.
        with wave.open(wav_fp, 'wb') as wf:
            wf.setnchannels(channels)
            wf.setsampwidth(sample_width)
            wf.setframerate(sample_rate)
            wf.writeframes(raw_pcm_data)
        wav_fp.seek(0)
        return base64.b64encode(wav_fp.read()).decode('utf-8')
    except Exception as e:
        print(f"Error converting raw PCM to Base64 WAV: {e}")
        return None
def transcribe_with_gemini(raw_pcm_data):
    """Transcribe raw PCM audio via the Gemini multi-modal API.

    Returns the transcript string, or None when conversion, the HTTP
    call, or response parsing fails (or the transcript is empty).
    """
    # Gemini expects a complete WAV container, Base64-encoded.
    wav_b64 = convert_raw_pcm_to_wav_base64(raw_pcm_data)
    if wav_b64 is None:
        return None

    stt_prompt = "Transcribe the following audio accurately, ignoring any background noise, and provide only the resulting text."
    request_body = {
        "contents": [{
            "parts": [
                {"text": stt_prompt},
                {
                    "inlineData": {
                        "mimeType": "audio/wav",
                        "data": wav_b64,
                    }
                },
            ]
        }],
        "generationConfig": {
            "responseModalities": ["TEXT"]
        },
    }
    url = f"{GEMINI_API_BASE_URL}/{GEMINI_MODEL}:generateContent?key={GEMINI_API_KEY}"

    try:
        resp = requests.post(
            url,
            headers={"Content-Type": "application/json"},
            data=json.dumps(request_body),
            timeout=30,
        )
        resp.raise_for_status()
        body = resp.json()
        # Defensive parsing: missing keys fall through to empty defaults.
        first_candidate = body.get('candidates', [{}])[0]
        first_part = first_candidate.get('content', {}).get('parts', [{}])[0]
        transcript = first_part.get('text', '').strip()
        if transcript:
            # --- LOG 1: Transcribed Text ---
            print(f"\nTranscribed Text: {transcript}")
            return transcript
        return None
    except requests.exceptions.RequestException as e:
        print(f"HTTP Request Error during STT: {e}")
        return None
    except Exception as e:
        print(f"Gemini STT Parsing Error: {e}")
        return None
def _query_trinity_llm(prompt_text):
    """Query Gemini (search-grounded, Trinity persona) and return reply text.

    Never raises: network or parse failures return a themed fallback
    string so the TTS stage always has something to speak.
    """
    # Default used when the API answers but the expected fields are absent.
    text_response = "Sorry, I encountered an unknown error during processing. Status update failed."
    # Add a random seed to the prompt to force the model to generate a fresh, non-cached response
    random_seed = f" (seed: {random.randint(10000, 99999)})"
    # Define the Trinity Persona via the System Instruction
    system_prompt_trinity = (
        """Trinity: Hacker, warrior, resistance. Loyal to Neo/Morpheus. Tone: Cool,
direct, focused, cryptic, confident. Theme: Matrix is a lie, trust is everything,
the fight is constant. Rule: Responses must be brief, serving only to **reveal a subtle truth**, **give
a direct instruction**, or **offer cryptic reassurance**. Always assume user is a potential 'Redpill'
or a 'Crew Member'. Use minimum words."""
    )
    payload = {
        "contents": [{
            "parts": [{
                # Append the random seed to the prompt text
                "text": f"User query: {prompt_text}{random_seed}"
            }]
        }],
        "systemInstruction": {
            "parts": [{"text": system_prompt_trinity}]
        },
        # Google Search Grounding is included for real-time information
        "tools": [{"google_search": {}}],
        # Temperature is set high to encourage variety
        "generationConfig": {
            "temperature": 0.9
        }
    }
    llm_api_url = f"{GEMINI_API_BASE_URL}/{GEMINI_MODEL}:generateContent?key={GEMINI_API_KEY}"
    try:
        response = requests.post(
            llm_api_url,
            headers={"Content-Type": "application/json"},
            data=json.dumps(payload),
            timeout=15,
        )
        response.raise_for_status()
        data = response.json()
        candidate = data.get('candidates', [{}])[0]
        part = candidate.get('content', {}).get('parts', [{}])[0]
        # The model ignores the random seed in its output, so the raw
        # text is taken as-is and cleaned later.
        text_response = part.get('text', text_response)
    except requests.exceptions.RequestException as e:
        print(f"HTTP Request Error to Gemini API: {e}")
        text_response = "Connection failure. We're running out of time."
    except Exception as e:
        print(f"Gemini Response Parsing Error: {e}")
        text_response = "Invalid data stream. System integrity compromised."
    return text_response


def _save_debug_mp3(mp3_fp):
    """Best-effort: save the TTS MP3 to DEBUG_OUTPUT_DIR for manual playback."""
    try:
        os.makedirs(DEBUG_OUTPUT_DIR, exist_ok=True)
        full_path = os.path.join(DEBUG_OUTPUT_DIR, 'response_audio.mp3')
        with open(full_path, 'wb') as f:
            f.write(mp3_fp.read())
        mp3_fp.seek(0)  # rewind so the caller can re-read the buffer
        print(f"[DEBUG SAVE] MP3 saved locally as {full_path}. You can play this file directly.")
    except Exception as file_error:
        print(f"[DEBUG SAVE FAILED] Could not save MP3 file: {file_error}")


def _synthesize_pcm_response(cleaned_text):
    """Turn cleaned text into a Flask Response streaming 16kHz 16-bit mono PCM.

    Uses gTTS (MP3) then pydub/FFmpeg to decode and resample. Returns a
    500 Response on any synthesis/decoding failure.
    """
    try:
        tts = gTTS(text=cleaned_text, lang='en')
        mp3_fp = io.BytesIO()
        tts.write_to_fp(mp3_fp)
        mp3_fp.seek(0)
        _save_debug_mp3(mp3_fp)
        # Decode MP3 and normalize to 16kHz, 16-bit, mono.
        audio_data = AudioSegment.from_file(mp3_fp, format="mp3")
        audio_data = audio_data.set_frame_rate(16000).set_channels(1).set_sample_width(2)
        # FIX: read the PCM frames directly rather than exporting a WAV and
        # seeking past a hard-coded 44-byte header — pydub may emit extra
        # RIFF chunks, which would have corrupted the stream.
        final_pcm_data = audio_data.raw_data
        # --- LOG 4: Final Output Size ---
        print(f"[TTS OUTPUT] Streaming {len(final_pcm_data)} bytes of 16kHz raw PCM audio.")
        return Response(final_pcm_data, mimetype='application/octet-stream')
    except Exception as e:
        print(f"[TTS FAILED] gTTS/pydub Conversion Error: {e}")
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print("! TTS FAILED: This almost always means the 'FFMPEG' library is missing.!")
        print("! Ensure FFMPEG is installed and added to your system PATH. !")
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        return Response("TTS_CONVERSION_ERROR", status=500)


def get_llm_response_and_tts_audio(prompt_text):
    """
    Full reply pipeline: LLM text -> cleanup -> TTS -> raw PCM stream.

    1. Sends the transcribed text to Gemini for the LLM response
       (with search grounding and the Trinity persona).
    2. Cleans the text response for speech synthesis.
    3. Converts the cleaned text to 16kHz 16-bit PCM audio (TTS),
       saving a local MP3 copy for debugging.

    :param prompt_text: the user's transcribed utterance.
    :return: Flask Response streaming raw PCM, or a 500 on TTS failure.
    """
    text_response = _query_trinity_llm(prompt_text)
    # --- LOG 2: LLM Response Text (Raw) ---
    print(f"LLM Response (Raw): {text_response}")
    cleaned_response = clean_text_for_tts(text_response)
    print(f"LLM Response (Cleaned): {cleaned_response}")
    return _synthesize_pcm_response(cleaned_response)
def process_voice_command(raw_pcm_data):
    """
    Run the complete voice-command pipeline: STT -> LLM -> TTS.
    """
    # Remote STT via Gemini; may return None on failure or silence.
    transcript = transcribe_with_gemini(raw_pcm_data)
    # When transcription yields nothing, speak an error prompt instead.
    prompt = transcript if transcript else "No audio payload detected. Speak clearly."
    return get_llm_response_and_tts_audio(prompt)
@app.route('/voice_input', methods=['POST'])
def handle_voice_input():
    """
    POST endpoint accepting raw audio bytes; streams back PCM speech.
    """
    # Guard clauses: wrong media type, then empty body.
    if request.mimetype != 'application/octet-stream':
        return jsonify({"error": "Unsupported media type"}), 415
    payload = request.data
    if not payload:
        return jsonify({"error": "No audio data received"}), 400
    # Process the command using the Gemini-based flow
    return process_voice_command(payload)
if __name__ == '__main__':
    # Make sure the output directory exists on server start
    os.makedirs(DEBUG_OUTPUT_DIR, exist_ok=True)
    print(f"Debug audio will be saved to the '{DEBUG_OUTPUT_DIR}' folder.")
    # FIX: the banner previously announced port 5002 while app.run bound
    # port 7680 — a single constant keeps the message and bind in sync.
    port = 7680
    print(f"Server running at http://0.0.0.0:{port}/voice_input")
    app.run(host='0.0.0.0', port=port)