shanusherly committed on
Commit
c29b700
·
verified ·
1 Parent(s): 16f22a5

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +271 -183
app.py CHANGED
@@ -1,183 +1,271 @@
1
- # app.py -- Fast Gemini + ElevenLabs minimal Chat + TTS for Hugging Face Spaces
2
- import os
3
- import hashlib
4
- import time
5
- import requests
6
-
7
- import gradio as gr
8
-
9
# Credentials are read from the environment (set them as HF Spaces "Secrets").
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "")
ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "")

# Model choices tuned for latency rather than maximum quality.
GEMINI_MODEL = "gemini-1.5-flash"  # faster than 2.5
ELEVEN_MODEL = "eleven_turbo_v2"   # faster TTS

OUTPUT_DIR = "/tmp/generated_audio"
os.makedirs(OUTPUT_DIR, exist_ok=True)


def _audio_path_for_text(text: str) -> str:
    """Return a deterministic mp3 path derived from a SHA-256 digest of *text*."""
    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()[:10]
    return os.path.join(OUTPUT_DIR, f"audio_{digest}.mp3")
25
-
26
# ---------- Minimal Gemini wrapper (fast & defensive) ----------
try:
    import google.generativeai as genai
    genai.configure(api_key=GEMINI_API_KEY)
    _GENAI_AVAILABLE = True
except Exception:
    genai = None
    _GENAI_AVAILABLE = False


def generate_text_fast(prompt: str, max_tokens: int = 512) -> str:
    """
    Produce a short completion quickly.

    Callers are expected to pre-trim the prompt; several SDK entry points
    are probed so the code works across google-generativeai versions.
    """
    if not _GENAI_AVAILABLE:
        return "Error: Gemini SDK not available or GEMINI_API_KEY not set."

    def _as_text(resp) -> str:
        # Responses expose .text on newer SDKs; older ones are str-like.
        return resp.text.strip() if hasattr(resp, "text") else str(resp).strip()

    try:
        # Preferred modern helper, when the installed SDK has it.
        if hasattr(genai, "generate_text"):
            resp = genai.generate_text(
                model=GEMINI_MODEL, prompt=prompt, max_output_tokens=max_tokens
            )
            return _as_text(resp)
        # Fallback: GenerativeModel with whichever generate method exists.
        if hasattr(genai, "GenerativeModel"):
            model = genai.GenerativeModel(GEMINI_MODEL)
            if hasattr(model, "generate_content"):
                return _as_text(model.generate_content(prompt))
            if hasattr(model, "generate"):
                return _as_text(model.generate(prompt))
    except Exception as e:
        # Brief message for the user; details go to the server logs.
        print("Gemini generation error:", e)
        return "Sorry — text generation failed."

    return "Gemini generation: no supported method found."
72
-
73
# ---------- Minimal ElevenLabs TTS (HTTP fallback; small & fast) ----------
def generate_tts_http(text: str) -> dict:
    """
    POST *text* to the ElevenLabs TTS endpoint and write the mp3 to disk.

    Returns {"ok": bool, "path": str or "", "error": str}.
    """
    if not ELEVENLABS_API_KEY or not ELEVENLABS_VOICE_ID:
        return {"ok": False, "path": "", "error": "ElevenLabs cred/voice not set."}

    endpoint = f"https://api.elevenlabs.io/v1/text-to-speech/{ELEVENLABS_VOICE_ID}"
    request_headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": ELEVENLABS_API_KEY,
    }
    body = {
        "text": text,
        "model_id": ELEVEN_MODEL,
        # conservative voice settings for reliability
        "voice_settings": {"stability": 0.4, "similarity_boost": 0.3},
    }
    try:
        resp = requests.post(endpoint, json=body, headers=request_headers, timeout=20)
        resp.raise_for_status()
        out_path = _audio_path_for_text(text)
        with open(out_path, "wb") as fh:
            fh.write(resp.content)
        return {"ok": True, "path": out_path, "error": ""}
    except Exception as e:
        print("ElevenLabs HTTP error:", e)
        return {"ok": False, "path": "", "error": str(e)}
104
-
105
# ---------- Chat handling ----------
# Only the most recent exchanges are kept so prompts stay small.
CHAT_HISTORY_LIMIT = 4


def format_prompt(history: list, user_message: str) -> str:
    """Flatten recent (role, text) pairs plus the new message into one prompt."""
    recent = history[-(CHAT_HISTORY_LIMIT * 2):]
    lines = [f"{role.capitalize()}: {text}" for role, text in recent]
    lines.append(f"User: {user_message}")
    lines.append("Assistant:")
    # Deliberately no repeated system instruction -- keeps the prompt short.
    return "\n".join(lines)


# Minimal per-process session history; adequate for quick demos.
SESSION_HISTORY = []
121
-
122
def chat_and_tts(user_message: str):
    """
    Run one chat turn: return (assistant_text, audio_file_path or None, status).

    Deliberately synchronous and small; both the input and the TTS text are
    truncated to keep latency and cost down.
    """
    # Cap user input length -- improves speed & cost.
    if len(user_message) > 800:
        user_message = user_message[:800] + "..."

    prompt = format_prompt(SESSION_HISTORY, user_message)

    started = time.time()
    assistant_text = generate_text_fast(prompt, max_tokens=300)
    gen_time = time.time() - started

    # Remember the exchange, trimming to the configured window.
    SESSION_HISTORY.append(("user", user_message))
    SESSION_HISTORY.append(("assistant", assistant_text))
    if len(SESSION_HISTORY) > CHAT_HISTORY_LIMIT * 2:
        SESSION_HISTORY[:] = SESSION_HISTORY[-(CHAT_HISTORY_LIMIT * 2):]

    # Audio is best-effort; truncate what we send to TTS to reduce delay.
    tts_text = assistant_text if len(assistant_text) <= 400 else assistant_text[:400] + "..."
    started = time.time()
    tts_res = generate_tts_http(tts_text)
    tts_time = time.time() - started

    if tts_res.get("ok"):
        return assistant_text, tts_res["path"], f"gen:{gen_time:.2f}s tts:{tts_time:.2f}s"
    return assistant_text, None, f"gen:{gen_time:.2f}s tts_failed: {tts_res.get('error','unknown')}"
154
-
155
# ---------- Gradio UI (minimal) ----------
with gr.Blocks(title="Fast Gemini + Eleven TTS") as demo:
    gr.Markdown("### Fast Gemini (text) + ElevenLabs (audio) demo — optimized for quick builds")
    chat = gr.Chatbot(elem_id="chatbot", label="Conversation")
    txt = gr.Textbox(placeholder="Type your message and press Enter", label="You")
    status = gr.Textbox(value="Ready", label="Status", interactive=False)
    audio_player = gr.Audio(label="Latest reply (audio)", interactive=False)

    def on_submit(user_msg, chat_history):
        """Run one chat turn and refresh chat view, status, audio and input box."""
        assistant_text, audio_path, status_msg = chat_and_tts(user_msg)
        chat_history = chat_history or []
        chat_history.append((user_msg, assistant_text))
        # The audio widget is only refreshed when a file was actually produced.
        return chat_history, status_msg, (audio_path if audio_path else None), ""

    txt.submit(on_submit, [txt, chat], [chat, status, audio_player, txt])

    def clear():
        """Reset server-side history and blank out every widget."""
        global SESSION_HISTORY
        SESSION_HISTORY = []
        return [], "Cleared", None, ""

    gr.Button("Reset Chat").click(clear, None, [chat, status, audio_player, txt])


if __name__ == "__main__":
    demo.launch(share=False, server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os
import re
import tempfile

import requests
import gradio as gr

# Google Gemini imports
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate

# legacy chains + memory now live in langchain_classic
from langchain_classic.chains import LLMChain
from langchain_classic.memory import ConversationBufferMemory

# ElevenLabs imports
from elevenlabs.client import ElevenLabs
from elevenlabs import save
21
+
22
# SECURITY FIX: API keys must never be hard-coded in source control.
# Read them from the environment (e.g. HF Spaces "Secrets"); the original
# placeholder strings remain as fallbacks so behavior without env vars
# is unchanged.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "YOUR_GEMINI_API_KEY")

# ElevenLabs API Key
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY", "YOUR_ELEVENLABS_API_KEY")

# ElevenLabs Voice ID (Rachel voice by default)
ELEVENLABS_VOICE_ID = os.getenv("ELEVENLABS_VOICE_ID", "21m00Tcm4TlvDq8ikWAM")

# Configure Gemini
genai.configure(api_key=GEMINI_API_KEY)

# Initialize ElevenLabs client
elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)

print("✅ API keys configured successfully!")
38
+
39
+ template = """You are a helpful assistant to answer user queries.
40
+ {chat_history}
41
+ User: {user_message}
42
+ Chatbot:"""
43
+
44
+ prompt = PromptTemplate(
45
+ input_variables=["chat_history", "user_message"],
46
+ template=template
47
+ )
48
+
49
+ memory = ConversationBufferMemory(memory_key="chat_history")
50
+
51
+ print("✅ Prompt template created!")
52
+
53
+ # Initialize Gemini model using direct Google GenerativeAI (NOT LangChain wrapper)
54
+ import google.generativeai as genai
55
+
56
+ # Configure the Gemini model directly
57
+ gemini_model = genai.GenerativeModel('gemini-2.5-flash')
58
+
59
+ # Create a custom LLM wrapper for LangChain compatibility
60
class GeminiLLM:
    """Thin LangChain-style wrapper so the app can call `.predict()` directly."""

    def __init__(self, model):
        self.model = model
        # Rolling transcript stored as alternating "User:"/"Chatbot:" lines.
        self.memory_history = []

    def predict(self, user_message):
        """Answer *user_message* with conversation context and remember the turn."""
        context = ["You are a helpful assistant to answer user queries."]
        context.extend(self.memory_history)
        context.append(f"User: {user_message}")
        full_prompt = "\n".join(context) + "\nChatbot:"

        # Ask the underlying Gemini model for a completion.
        answer = self.model.generate_content(full_prompt).text

        # Record the exchange.
        self.memory_history.append(f"User: {user_message}")
        self.memory_history.append(f"Chatbot: {answer}")

        # Cap at 20 lines (10 exchanges) to avoid blowing token limits.
        if len(self.memory_history) > 20:
            self.memory_history = self.memory_history[-20:]

        return answer
85
+
86
# The "chain" is simply our direct-SDK wrapper around the configured model.
llm_chain = GeminiLLM(gemini_model)

print("✅ Gemini LLM initialized with direct SDK!")
90
+
91
def generate_audio_elevenlabs(text):
    """
    Generate speech for *text* with the ElevenLabs SDK.

    Returns a dict: {"type": "SUCCESS"|"ERROR",
                     "response": output-path or error string,
                     "message": human-readable status}.
    """
    try:
        # Generate audio
        audio = elevenlabs_client.generate(
            text=text,
            voice=ELEVENLABS_VOICE_ID,
            model="eleven_monolingual_v1"  # or "eleven_multilingual_v2"
        )

        # BUG FIX: the previous "/content/..." path exists only on Google
        # Colab; write to the system temp directory so this works on
        # HF Spaces / local machines as well.
        output_path = os.path.join(
            tempfile.gettempdir(), f"output_audio_{hash(text) % 10000}.mp3"
        )
        save(audio, output_path)

        return {
            "type": "SUCCESS",
            "response": output_path,
            "message": "Audio generated successfully"
        }
    except Exception as e:
        return {
            "type": "ERROR",
            "response": str(e),
            "message": f"Audio generation failed: {str(e)}"
        }
119
+
120
def generate_audio_elevenlabs_http(text):
    """
    Generate speech via the raw ElevenLabs HTTP API (no SDK).

    More reliable in some environments; same return contract as
    generate_audio_elevenlabs().
    """
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{ELEVENLABS_VOICE_ID}"

    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": ELEVENLABS_API_KEY
    }

    data = {
        "text": text,
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5,
            "style": 0.5,
            "use_speaker_boost": True
        }
    }

    try:
        # BUG FIX: added a timeout so a hung request cannot block the app forever.
        response = requests.post(url, json=data, headers=headers, timeout=30)
        response.raise_for_status()

        # BUG FIX: "/content/" exists only on Colab; use the system temp dir.
        output_path = os.path.join(
            tempfile.gettempdir(), f"output_audio_{hash(text) % 10000}.mp3"
        )
        with open(output_path, 'wb') as f:
            f.write(response.content)

        return {
            "type": "SUCCESS",
            "response": output_path,
            "message": "Audio generated successfully"
        }
    except requests.exceptions.RequestException as e:
        return {
            "type": "ERROR",
            "response": str(e),
            "message": f"Audio generation failed: {str(e)}"
        }

print("✅ ElevenLabs audio functions defined!")
166
+
167
def get_audio_reply_for_question(text):
    """Turn the chatbot's *text* reply into audio; report path and status."""
    audio_event = generate_audio_elevenlabs(text)

    reply = {
        "audio_path": '',
        "message": ''
    }

    if audio_event["type"] == "SUCCESS":
        reply['audio_path'] = audio_event["response"]
        reply['message'] = "Audio generated successfully"
    else:
        reply['message'] = audio_event['message']

    return reply

print("✅ Audio reply function defined!")
188
+
189
def get_text_response(user_message):
    """Ask Gemini for a reply; on any failure return an apologetic message."""
    try:
        return llm_chain.predict(user_message=user_message)
    except Exception as e:
        # Log server-side, then degrade gracefully for the user.
        print(f"Error in Gemini response: {str(e)}")
        return f"Sorry, I encountered an error: {str(e)}"

print("✅ Text response function defined!")
202
+
203
def get_text_response_and_audio_response(user_message):
    """Produce the Gemini text reply plus its ElevenLabs audio rendering."""
    text_response = get_text_response(user_message)
    audio_reply = get_audio_reply_for_question(text_response)

    return {
        'text': text_response,
        'audio_path': audio_reply.get('audio_path', ''),
        'message': audio_reply.get('message', '')
    }

print("✅ Combined response function defined!")
222
+
223
def chat_bot_response(message, history):
    """
    Gradio ChatInterface handler: return the assistant's text reply.

    NOTE: gr.ChatInterface renders a single string, so the generated audio
    cannot be surfaced here; the file is still produced on disk by the
    audio pipeline. FIX: the previous if/else returned the identical value
    on both branches (dead code) while the docstring claimed a tuple return.
    """
    try:
        response = get_text_response_and_audio_response(message)

        audio_path = response['audio_path']
        if audio_path and not os.path.exists(audio_path):
            # Pipeline reported success but the file is missing — log it.
            print(f"Warning: audio file not found at {audio_path}")

        return response['text']

    except Exception as e:
        error_msg = f"Error: {str(e)}"
        print(error_msg)
        return error_msg

print("✅ Chatbot response handler defined!")
248
+
249
# Gradio chat UI; the examples give users one-click starter prompts.
demo = gr.ChatInterface(
    title="🤖 Gemini + ElevenLabs Chatbot",
    description="Chat with Google Gemini AI with voice responses from ElevenLabs",
    examples=[
        "How are you doing?",
        "What are your interests?",
        "Tell me a short story",
        "What's the weather like today?",
        "Explain quantum computing in simple terms"
    ],
    theme=gr.themes.Soft(),
    fn=chat_bot_response
)

print("✅ Gradio interface created!")
264
+
265
+ if __name__ == "__main__":
266
+ # Launch with public link
267
+ demo.launch(
268
+ share=True, # Creates public link
269
+ debug=True # Shows errors and logs
270
+ )
271
+