shanusherly committed on
Commit
c29b700
·
verified ·
1 Parent(s): 16f22a5

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +271 -183
app.py CHANGED
@@ -1,183 +1,271 @@
1
- # app.py -- Fast Gemini + ElevenLabs minimal Chat + TTS for Hugging Face Spaces
2
- import os
3
- import hashlib
4
- import time
5
- import requests
6
-
7
- import gradio as gr
8
-
9
# Credentials are read from the environment (set them as HF Spaces "Secrets").
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "")

# Model choices tuned for latency rather than maximum quality.
GEMINI_MODEL = "gemini-1.5-flash"  # faster than 2.5
ELEVEN_MODEL = "eleven_turbo_v2"   # faster TTS

OUTPUT_DIR = "/tmp/generated_audio"
os.makedirs(OUTPUT_DIR, exist_ok=True)


def _audio_path_for_text(text: str) -> str:
    """Return a deterministic mp3 path derived from a SHA-256 digest of *text*."""
    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()[:10]
    return os.path.join(OUTPUT_DIR, f"audio_{digest}.mp3")
25
-
26
# ---------- Minimal Gemini wrapper (fast & defensive) ----------
try:
    import google.generativeai as genai
    genai.configure(api_key=GEMINI_API_KEY)
    _GENAI_AVAILABLE = True
except Exception:
    genai = None
    _GENAI_AVAILABLE = False


def generate_text_fast(prompt: str, max_tokens: int = 512) -> str:
    """
    Produce a short completion quickly.

    Callers are expected to pre-trim the prompt; several SDK entry points
    are probed so the code works across google-generativeai versions.
    """
    if not _GENAI_AVAILABLE:
        return "Error: Gemini SDK not available or GEMINI_API_KEY not set."

    def _as_text(resp) -> str:
        # Responses expose .text on newer SDKs; older ones are str-like.
        return resp.text.strip() if hasattr(resp, "text") else str(resp).strip()

    try:
        # Preferred modern helper, when the installed SDK has it.
        if hasattr(genai, "generate_text"):
            resp = genai.generate_text(
                model=GEMINI_MODEL, prompt=prompt, max_output_tokens=max_tokens
            )
            return _as_text(resp)
        # Fallback: GenerativeModel with whichever generate method exists.
        if hasattr(genai, "GenerativeModel"):
            model = genai.GenerativeModel(GEMINI_MODEL)
            if hasattr(model, "generate_content"):
                return _as_text(model.generate_content(prompt))
            if hasattr(model, "generate"):
                return _as_text(model.generate(prompt))
    except Exception as e:
        # Brief message for the user; details go to the server logs.
        print("Gemini generation error:", e)
        return "Sorry — text generation failed."

    return "Gemini generation: no supported method found."
72
-
73
# ---------- Minimal ElevenLabs TTS (HTTP fallback; small & fast) ----------
def generate_tts_http(text: str) -> dict:
    """
    POST *text* to the ElevenLabs TTS endpoint and write the mp3 to disk.

    Returns {"ok": bool, "path": str or "", "error": str}.
    """
    if not ELEVENLABS_API_KEY or not ELEVENLABS_VOICE_ID:
        return {"ok": False, "path": "", "error": "ElevenLabs cred/voice not set."}

    endpoint = f"https://api.elevenlabs.io/v1/text-to-speech/{ELEVENLABS_VOICE_ID}"
    request_headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": ELEVENLABS_API_KEY,
    }
    body = {
        "text": text,
        "model_id": ELEVEN_MODEL,
        # conservative voice settings for reliability
        "voice_settings": {"stability": 0.4, "similarity_boost": 0.3},
    }
    try:
        resp = requests.post(endpoint, json=body, headers=request_headers, timeout=20)
        resp.raise_for_status()
        out_path = _audio_path_for_text(text)
        with open(out_path, "wb") as fh:
            fh.write(resp.content)
        return {"ok": True, "path": out_path, "error": ""}
    except Exception as e:
        print("ElevenLabs HTTP error:", e)
        return {"ok": False, "path": "", "error": str(e)}
104
-
105
# ---------- Chat handling ----------
# Only the most recent exchanges are kept so prompts stay small.
CHAT_HISTORY_LIMIT = 4


def format_prompt(history: list, user_message: str) -> str:
    """Flatten recent (role, text) pairs plus the new message into one prompt."""
    recent = history[-(CHAT_HISTORY_LIMIT * 2):]
    lines = [f"{role.capitalize()}: {text}" for role, text in recent]
    lines.append(f"User: {user_message}")
    lines.append("Assistant:")
    # Deliberately no repeated system instruction -- keeps the prompt short.
    return "\n".join(lines)


# Minimal per-process session history; adequate for quick demos.
SESSION_HISTORY = []
121
-
122
def chat_and_tts(user_message: str):
    """
    Run one chat turn: return (assistant_text, audio_file_path or None, status).

    Deliberately synchronous and small; both the input and the TTS text are
    truncated to keep latency and cost down.
    """
    # Cap user input length -- improves speed & cost.
    if len(user_message) > 800:
        user_message = user_message[:800] + "..."

    prompt = format_prompt(SESSION_HISTORY, user_message)

    started = time.time()
    assistant_text = generate_text_fast(prompt, max_tokens=300)
    gen_time = time.time() - started

    # Remember the exchange, trimming to the configured window.
    SESSION_HISTORY.append(("user", user_message))
    SESSION_HISTORY.append(("assistant", assistant_text))
    if len(SESSION_HISTORY) > CHAT_HISTORY_LIMIT * 2:
        SESSION_HISTORY[:] = SESSION_HISTORY[-(CHAT_HISTORY_LIMIT * 2):]

    # Audio is best-effort; truncate what we send to TTS to reduce delay.
    tts_text = assistant_text if len(assistant_text) <= 400 else assistant_text[:400] + "..."
    started = time.time()
    tts_res = generate_tts_http(tts_text)
    tts_time = time.time() - started

    if tts_res.get("ok"):
        return assistant_text, tts_res["path"], f"gen:{gen_time:.2f}s tts:{tts_time:.2f}s"
    return assistant_text, None, f"gen:{gen_time:.2f}s tts_failed: {tts_res.get('error','unknown')}"
154
-
155
# ---------- Gradio UI (minimal) ----------
with gr.Blocks(title="Fast Gemini + Eleven TTS") as demo:
    gr.Markdown("### Fast Gemini (text) + ElevenLabs (audio) demo — optimized for quick builds")
    chat = gr.Chatbot(elem_id="chatbot", label="Conversation")
    txt = gr.Textbox(placeholder="Type your message and press Enter", label="You")
    status = gr.Textbox(value="Ready", label="Status", interactive=False)
    audio_player = gr.Audio(label="Latest reply (audio)", interactive=False)

    def on_submit(user_msg, chat_history):
        """Run one chat turn and refresh chat view, status, audio and input box."""
        assistant_text, audio_path, status_msg = chat_and_tts(user_msg)
        chat_history = chat_history or []
        chat_history.append((user_msg, assistant_text))
        # The audio widget is only refreshed when a file was actually produced.
        return chat_history, status_msg, (audio_path if audio_path else None), ""

    txt.submit(on_submit, [txt, chat], [chat, status, audio_player, txt])

    def clear():
        """Reset server-side history and blank out every widget."""
        global SESSION_HISTORY
        SESSION_HISTORY = []
        return [], "Cleared", None, ""

    gr.Button("Reset Chat").click(clear, None, [chat, status, audio_player, txt])


if __name__ == "__main__":
    demo.launch(share=False, server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os
import re
import tempfile

import requests
import gradio as gr

# Google Gemini imports
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate

# legacy chains + memory now live in langchain_classic
from langchain_classic.chains import LLMChain
from langchain_classic.memory import ConversationBufferMemory

# ElevenLabs imports
from elevenlabs.client import ElevenLabs
from elevenlabs import save
21
+
22
# SECURITY FIX: API keys must never be hard-coded in source control.
# Read them from the environment (e.g. HF Spaces "Secrets"); the original
# placeholder strings remain as fallbacks so behavior without env vars
# is unchanged.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "YOUR_GEMINI_API_KEY")

# ElevenLabs API Key
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "YOUR_ELEVENLABS_API_KEY")

# ElevenLabs Voice ID (Rachel voice by default)
ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")

# Configure Gemini
genai.configure(api_key=GEMINI_API_KEY)

# Initialize ElevenLabs client
elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)

print("✅ API keys configured successfully!")
38
+
39
+ template = """You are a helpful assistant to answer user queries.
40
+ {chat_history}
41
+ User: {user_message}
42
+ Chatbot:"""
43
+
44
+ prompt = PromptTemplate(
45
+ input_variables=["chat_history", "user_message"],
46
+ template=template
47
+ )
48
+
49
+ memory = ConversationBufferMemory(memory_key="chat_history")
50
+
51
+ print("✅ Prompt template created!")
52
+
53
+ # Initialize Gemini model using direct Google GenerativeAI (NOT LangChain wrapper)
54
+ import google.generativeai as genai
55
+
56
+ # Configure the Gemini model directly
57
+ gemini_model = genai.GenerativeModel('gemini-2.5-flash')
58
+
59
+ # Create a custom LLM wrapper for LangChain compatibility
60
class GeminiLLM:
    """Thin LangChain-style wrapper so the app can call `.predict()` directly."""

    def __init__(self, model):
        self.model = model
        # Rolling transcript stored as alternating "User:"/"Chatbot:" lines.
        self.memory_history = []

    def predict(self, user_message):
        """Answer *user_message* with conversation context and remember the turn."""
        context = ["You are a helpful assistant to answer user queries."]
        context.extend(self.memory_history)
        context.append(f"User: {user_message}")
        full_prompt = "\n".join(context) + "\nChatbot:"

        # Ask the underlying Gemini model for a completion.
        answer = self.model.generate_content(full_prompt).text

        # Record the exchange.
        self.memory_history.append(f"User: {user_message}")
        self.memory_history.append(f"Chatbot: {answer}")

        # Cap at 20 lines (10 exchanges) to avoid blowing token limits.
        if len(self.memory_history) > 20:
            self.memory_history = self.memory_history[-20:]

        return answer
85
+
86
# The "chain" is simply our direct-SDK wrapper around the configured model.
llm_chain = GeminiLLM(gemini_model)

print("✅ Gemini LLM initialized with direct SDK!")
90
+
91
def generate_audio_elevenlabs(text):
    """
    Generate speech for *text* with the ElevenLabs SDK.

    Returns a dict: {"type": "SUCCESS"|"ERROR",
                     "response": output-path or error string,
                     "message": human-readable status}.
    """
    try:
        # Generate audio
        audio = elevenlabs_client.generate(
            text=text,
            voice=ELEVENLABS_VOICE_ID,
            model="eleven_monolingual_v1"  # or "eleven_multilingual_v2"
        )

        # BUG FIX: the previous "/content/..." path exists only on Google
        # Colab; write to the system temp directory so this works on
        # HF Spaces / local machines as well.
        output_path = os.path.join(
            tempfile.gettempdir(), f"output_audio_{hash(text) % 10000}.mp3"
        )
        save(audio, output_path)

        return {
            "type": "SUCCESS",
            "response": output_path,
            "message": "Audio generated successfully"
        }
    except Exception as e:
        return {
            "type": "ERROR",
            "response": str(e),
            "message": f"Audio generation failed: {str(e)}"
        }
119
+
120
def generate_audio_elevenlabs_http(text):
    """
    Generate speech via the raw ElevenLabs HTTP API (no SDK).

    More reliable in some environments; same return contract as
    generate_audio_elevenlabs().
    """
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{ELEVENLABS_VOICE_ID}"

    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": ELEVENLABS_API_KEY
    }

    data = {
        "text": text,
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5,
            "style": 0.5,
            "use_speaker_boost": True
        }
    }

    try:
        # BUG FIX: added a timeout so a hung request cannot block the app forever.
        response = requests.post(url, json=data, headers=headers, timeout=30)
        response.raise_for_status()

        # BUG FIX: "/content/" exists only on Colab; use the system temp dir.
        output_path = os.path.join(
            tempfile.gettempdir(), f"output_audio_{hash(text) % 10000}.mp3"
        )
        with open(output_path, 'wb') as f:
            f.write(response.content)

        return {
            "type": "SUCCESS",
            "response": output_path,
            "message": "Audio generated successfully"
        }
    except requests.exceptions.RequestException as e:
        return {
            "type": "ERROR",
            "response": str(e),
            "message": f"Audio generation failed: {str(e)}"
        }

print("✅ ElevenLabs audio functions defined!")
166
+
167
def get_audio_reply_for_question(text):
    """Turn the chatbot's *text* reply into audio; report path and status."""
    audio_event = generate_audio_elevenlabs(text)

    reply = {
        "audio_path": '',
        "message": ''
    }

    if audio_event["type"] == "SUCCESS":
        reply['audio_path'] = audio_event["response"]
        reply['message'] = "Audio generated successfully"
    else:
        reply['message'] = audio_event['message']

    return reply

print("✅ Audio reply function defined!")
188
+
189
def get_text_response(user_message):
    """Ask Gemini for a reply; on any failure return an apologetic message."""
    try:
        return llm_chain.predict(user_message=user_message)
    except Exception as e:
        # Log server-side, then degrade gracefully for the user.
        print(f"Error in Gemini response: {str(e)}")
        return f"Sorry, I encountered an error: {str(e)}"

print("✅ Text response function defined!")
202
+
203
def get_text_response_and_audio_response(user_message):
    """Produce the Gemini text reply plus its ElevenLabs audio rendering."""
    text_response = get_text_response(user_message)
    audio_reply = get_audio_reply_for_question(text_response)

    return {
        'text': text_response,
        'audio_path': audio_reply.get('audio_path', ''),
        'message': audio_reply.get('message', '')
    }

print("✅ Combined response function defined!")
222
+
223
def chat_bot_response(message, history):
    """
    Gradio ChatInterface handler: return the assistant's text reply.

    NOTE: gr.ChatInterface renders a single string, so the generated audio
    cannot be surfaced here; the file is still produced on disk by the
    audio pipeline. FIX: the previous if/else returned the identical value
    on both branches (dead code) while the docstring claimed a tuple return.
    """
    try:
        response = get_text_response_and_audio_response(message)

        audio_path = response['audio_path']
        if audio_path and not os.path.exists(audio_path):
            # Pipeline reported success but the file is missing — log it.
            print(f"Warning: audio file not found at {audio_path}")

        return response['text']

    except Exception as e:
        error_msg = f"Error: {str(e)}"
        print(error_msg)
        return error_msg

print("✅ Chatbot response handler defined!")
248
+
249
# Gradio chat UI; the examples give users one-click starter prompts.
demo = gr.ChatInterface(
    title="🤖 Gemini + ElevenLabs Chatbot",
    description="Chat with Google Gemini AI with voice responses from ElevenLabs",
    examples=[
        "How are you doing?",
        "What are your interests?",
        "Tell me a short story",
        "What's the weather like today?",
        "Explain quantum computing in simple terms"
    ],
    theme=gr.themes.Soft(),
    fn=chat_bot_response
)

print("✅ Gradio interface created!")
264
+
265
+ if __name__ == "__main__":
266
+ # Launch with public link
267
+ demo.launch(
268
+ share=True, # Creates public link
269
+ debug=True # Shows errors and logs
270
+ )
271
+