Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
|
@@ -71,7 +71,7 @@ except Exception as e:
|
|
| 71 |
# --- Model Constants (as per Streamlit app) ---
|
| 72 |
CATEGORY_MODEL = "gemini-2.0-flash-exp"
|
| 73 |
GENERATION_MODEL = "gemini-2.0-flash-exp-image-generation"
|
| 74 |
-
TTS_MODEL = "gemini-2.5-flash-preview-tts"
|
| 75 |
|
| 76 |
|
| 77 |
# -----------------------------------------------------------------------------
|
|
@@ -126,6 +126,9 @@ def _convert_pcm_to_wav(pcm_data, sample_rate=24000, channels=1, sample_width=2)
|
|
| 126 |
audio_buffer.seek(0)
|
| 127 |
return audio_buffer.getvalue()
|
| 128 |
|
|
|
|
|
|
|
|
|
|
| 129 |
def generate_tts_audio_and_upload(text_to_speak, uid, project_id, step_num):
|
| 130 |
"""Generates audio using the exact method from the Streamlit app and uploads it."""
|
| 131 |
try:
|
|
@@ -156,6 +159,60 @@ def generate_tts_audio_and_upload(text_to_speak, uid, project_id, step_num):
|
|
| 156 |
except Exception as e:
|
| 157 |
print(f"Error during TTS generation for step {step_num}: {e}")
|
| 158 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
|
| 160 |
def send_text_request(model_name, prompt, image):
|
| 161 |
"""Helper to send requests that expect only a text response."""
|
|
|
|
| 71 |
# --- Model Constants (as per Streamlit app) ---
|
| 72 |
CATEGORY_MODEL = "gemini-2.0-flash-exp"
|
| 73 |
GENERATION_MODEL = "gemini-2.0-flash-exp-image-generation"
|
| 74 |
+
#TTS_MODEL = "gemini-2.5-flash-preview-tts"
|
| 75 |
|
| 76 |
|
| 77 |
# -----------------------------------------------------------------------------
|
|
|
|
| 126 |
audio_buffer.seek(0)
|
| 127 |
return audio_buffer.getvalue()
|
| 128 |
|
| 129 |
+
|
| 130 |
+
# Gemini TTS implementation: state-of-the-art (SOTA) quality, but slow
|
| 131 |
+
'''
|
| 132 |
def generate_tts_audio_and_upload(text_to_speak, uid, project_id, step_num):
|
| 133 |
"""Generates audio using the exact method from the Streamlit app and uploads it."""
|
| 134 |
try:
|
|
|
|
| 159 |
except Exception as e:
|
| 160 |
print(f"Error during TTS generation for step {step_num}: {e}")
|
| 161 |
return None
|
| 162 |
+
'''
|
| 163 |
+
|
| 164 |
+
# Deepgram: faster and more efficient than the previous Gemini TTS path
def generate_tts_audio_and_upload(text_to_speak, uid, project_id, step_num):
    """
    Generates narration audio via the Deepgram TTS API and uploads it to
    Firebase Storage.

    Drop-in replacement for the previous Google GenAI TTS function: same
    signature and same contract (URL on success, None on any failure).

    Args:
        text_to_speak: Plain text to synthesize.
        uid: Owning user's id (used in the storage path).
        project_id: Project id (used in the storage path).
        step_num: Step number; names the uploaded file step_{step_num}.mp3.

    Returns:
        The public URL of the uploaded MP3 (as returned by
        upload_to_storage), or None on any error.
    """
    try:
        # --- Step 1: Get the Deepgram API key from environment variables ---
        api_key = os.environ.get("DEEPGRAM_API_KEY")
        if not api_key:
            print("FATAL: DEEPGRAM_API_KEY environment variable not set.")
            return None

        # --- Step 2: Define the API endpoint and headers ---
        # The model 'aura-2-draco-en' is specified as a query parameter in the URL.
        DEEPGRAM_URL = "https://api.deepgram.com/v1/speak?model=aura-2-draco-en"

        headers = {
            "Authorization": f"Token {api_key}",
            "Content-Type": "text/plain"  # Deepgram expects raw text, not JSON
        }

        # --- Step 3: Make the API call to Deepgram ---
        # Deepgram expects the raw text as the request body, not a JSON object,
        # so the encoded text goes directly in the 'data' parameter.
        # FIX: added an explicit timeout — requests has NO default timeout, so
        # without it a stalled connection would hang this worker forever.
        response = requests.post(
            DEEPGRAM_URL,
            headers=headers,
            data=text_to_speak.encode('utf-8'),
            timeout=60,
        )

        # Raise for 4xx/5xx so all HTTP failures funnel into the handler below.
        response.raise_for_status()

        # The raw audio bytes are the response body.
        audio_data = response.content

        # --- Step 4: Upload the received audio to Firebase Storage ---
        # This Deepgram model outputs MP3; the matching MIME type is 'audio/mpeg'.
        audio_path = f"users/{uid}/projects/{project_id}/narrations/step_{step_num}.mp3"

        narration_url = upload_to_storage(audio_data, audio_path, 'audio/mpeg')

        return narration_url

    except requests.exceptions.RequestException as e:
        print(f"Error during Deepgram API call for step {step_num}: {e}")
        # Log the response body if available for more detailed error info
        # (e.response is None for connection-level failures).
        if e.response is not None:
            print(f"Deepgram Error Response: {e.response.text}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during TTS generation for step {step_num}: {e}")
        return None
|
| 215 |
+
|
| 216 |
|
| 217 |
def send_text_request(model_name, prompt, image):
|
| 218 |
"""Helper to send requests that expect only a text response."""
|