Spaces:

kcrobot20
/

kc20ai

Sleeping

App Files Files Community

kcrobot20 committed on Oct 13, 2025

Commit

84ad4e7

verified ·

1 Parent(s): 138566c

initial commit

Browse files

Files changed (1) hide show

app.py +53 -72

app.py CHANGED Viewed

@@ -2,21 +2,21 @@
 # -*- coding: utf-8 -*-
 """
 KCrobot AI — Vmax Final (voice-first)
-- Default Gemini model: gemini-1.5-pro (but code will fallback)
-- Read secrets from environment (HF "New secret"):
     GEMINI_API_KEY, GEMINI_MODEL, TELEGRAM_TOKEN, TELEGRAM_CHAT_ID,
     ELEVEN_API_KEY, ELEVEN_VOICE_ID, GOOGLE_SPEECH_LANG
 - Voice-first: ESP32 uploads audio -> /chat_audio -> STT -> Gemini -> TTS -> MP3
-- If STT libs not installed on server, /chat_audio returns 501 with message
 - Endpoints:
-    GET  /                -> web UI (chat, minimal)
-    POST /chat_text       -> {"q":"...","voice":true}
-    POST /chat_audio      -> upload wav (multipart 'file' or raw bytes)
-    POST /esp/send_text   -> wrapper for /chat_text
-    GET  /play_latest     -> latest_reply.mp3
-    GET  /_history        -> conversation history
-    POST /notify          -> forward to Telegram
-    GET  /health          -> health check
 """
 from __future__ import annotations
@@ -32,33 +32,31 @@ import pathlib
 from datetime import datetime
 from typing import Tuple, Dict, Any, Optional
-import requests
 from flask import Flask, request, jsonify, send_file, render_template_string
-# --- Try to import recommended/new SDK first (google-genai)
 USE_GENAI_SDK = False
 GENAI_CLIENT = None
 try:
-    # new official: from google import genai
     from google import genai  # type: ignore
     USE_GENAI_SDK = True
 except Exception:
     try:
-        # fallback: google.generativeai (older package)
         import google.generativeai as genai  # type: ignore
         USE_GENAI_SDK = True
     except Exception:
         genai = None
         USE_GENAI_SDK = False
-# TTS fallback
 try:
     from gtts import gTTS  # type: ignore
     GTTS_AVAILABLE = True
 except Exception:
     GTTS_AVAILABLE = False
-# Optional STT libs
 try:
     import speech_recognition as sr  # type: ignore
     from pydub import AudioSegment  # type: ignore
@@ -69,7 +67,7 @@ except Exception:
     STT_AVAILABLE = False
 # -------------------------
-# CONFIG via env (HF New secret)
 # -------------------------
 CFG = {
     "GEMINI_API_KEY": os.getenv("GEMINI_API_KEY", "").strip(),
@@ -81,7 +79,7 @@ CFG = {
     "GOOGLE_SPEECH_LANG": os.getenv("GOOGLE_SPEECH_LANG", "vi-VN").strip(),
 }
-# Model fallback list (priority order)
 MODEL_FALLBACK_LIST = [
     CFG.get("GEMINI_MODEL") or "gemini-1.5-pro",
     "gemini-1.5-flash",
@@ -89,7 +87,7 @@ MODEL_FALLBACK_LIST = [
     "gemini-2.5-pro",
 ]
-# ensure unique, keep order
 seen = set()
 MODEL_FALLBACK = []
 for m in MODEL_FALLBACK_LIST:
@@ -101,18 +99,18 @@ for m in MODEL_FALLBACK_LIST:
 GEMINI_KEY = CFG.get("GEMINI_API_KEY") or ""
 if USE_GENAI_SDK and GEMINI_KEY:
     try:
-        # new SDK style
         GENAI_CLIENT = genai.Client(api_key=GEMINI_KEY)  # type: ignore
     except Exception:
         try:
-            # older style configure (google.generativeai)
             genai.configure(api_key=GEMINI_KEY)  # type: ignore
             GENAI_CLIENT = genai  # type: ignore
         except Exception:
             GENAI_CLIENT = None
 # -------------------------
-# STORAGE & logging
 # -------------------------
 BASE = pathlib.Path.cwd()
 DATA_DIR = BASE / "data"
@@ -125,29 +123,29 @@ LATEST_MP3 = DATA_DIR / "latest_reply.mp3"
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("kcrobot_vmax")
-# snapshot non-secret config flags
 try:
     CFG_SNAPSHOT.write_text(json.dumps({k: bool(CFG.get(k)) for k in CFG}, indent=2), encoding="utf-8")
 except Exception:
     pass
 # -------------------------
-# Helpers: safe json, usage, history
 # -------------------------
 def load_json_safe(path: pathlib.Path, default):
     try:
         if path.exists():
             return json.loads(path.read_text(encoding="utf-8"))
     except Exception as e:
-        logger.debug("load_json_safe failed %s -> %s", path, e)
     return default
 def save_json_safe(path: pathlib.Path, data):
     try:
         path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
         return True
-    except Exception:
-        logger.exception("save_json_safe failed for %s", path)
     return False
 def today_str():
@@ -175,9 +173,13 @@ def append_history(entry: dict):
     save_json_safe(HISTORY_FILE, h)
 # -------------------------
-# Language detection (simple)
 # -------------------------
-VIET_CHAR_RE = re.compile(r"[àáạảãâầấậẩẫăằắặẳẵđèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹ]", re.I)
 def detect_lang(text: str) -> str:
     if not text or not isinstance(text, str):
         return "en"
@@ -191,17 +193,15 @@ def detect_lang(text: str) -> str:
     return "en"
 # -------------------------
-# Gemini: SDK call and REST fallback + model fallback controller
 # -------------------------
-def gemini_call_with_model(model: str, prompt: str, max_output_tokens: int = 1024, temperature: float = 0.2) -> Tuple[bool, str, int]:
     """
-    Try to call Gemini with a single model.
-    Returns (ok, text_or_error, http_status_or_0)
     """
-    # prefer SDK if available
     if GENAI_CLIENT:
         try:
-            # new SDK method
             if hasattr(GENAI_CLIENT, "models") and hasattr(GENAI_CLIENT.models, "generate_content"):
                 resp = GENAI_CLIENT.models.generate_content(model=model, contents=prompt,
                                                            max_output_tokens=max_output_tokens, temperature=temperature)  # type: ignore
@@ -209,14 +209,12 @@ def gemini_call_with_model(model: str, prompt: str, max_output_tokens: int = 102
                 if txt:
                     return True, txt, 200
                 return True, str(resp), 200
-            # older compatibility
             if hasattr(GENAI_CLIENT, "generate_content"):
                 resp = GENAI_CLIENT.generate_content(prompt, max_output_tokens=max_output_tokens, temperature=temperature)
                 if hasattr(resp, "text") and resp.text:
                     return True, resp.text, 200
                 return True, str(resp), 200
         except requests.exceptions.HTTPError as he:
-            # SDK might raise requests HTTPError
             try:
                 code = he.response.status_code
             except Exception:
@@ -224,6 +222,7 @@ def gemini_call_with_model(model: str, prompt: str, max_output_tokens: int = 102
             return False, str(he), code
         except Exception as e:
             return False, str(e), 0
     # REST fallback
     key = CFG.get("GEMINI_API_KEY") or ""
     if not key:
@@ -234,11 +233,11 @@ def gemini_call_with_model(model: str, prompt: str, max_output_tokens: int = 102
         "prompt": {
             "messages": [
                 {"author": "system", "content": {"text": "You are a helpful assistant."}},
-                {"author": "user", "content": {"text": prompt}}
             ]
         },
         "maxOutputTokens": max_output_tokens,
-        "temperature": temperature
     }
     try:
         r = requests.post(url, params={"key": key}, json=payload, headers=headers, timeout=30)
@@ -248,7 +247,7 @@ def gemini_call_with_model(model: str, prompt: str, max_output_tokens: int = 102
         if status >= 400:
             return False, f"HTTP {status}: {r.text}", status
         j = r.json()
-        # parse common shapes
         cand = j.get("candidates")
         if cand and isinstance(cand, list):
             c0 = cand[0]
@@ -274,8 +273,7 @@ def gemini_call_with_model(model: str, prompt: str, max_output_tokens: int = 102
 def call_gemini_with_fallbacks(prompt: str, max_output_tokens: int = 1024, temperature: float = 0.2) -> Dict[str, Any]:
     """
-    Try sequence of models from MODEL_FALLBACK.
-    Return dict {"ok":bool, "text":str, "model":str, "error":...}
     """
     if not CFG.get("GEMINI_API_KEY"):
         return {"ok": False, "error": "Gemini API key not configured (set GEMINI_API_KEY in New secret)"}
@@ -283,20 +281,16 @@ def call_gemini_with_fallbacks(prompt: str, max_output_tokens: int = 1024, tempe
     for model in MODEL_FALLBACK:
         if not model:
             continue
-        ok, txt_or_err, status = gemini_call_with_model(model, prompt, max_output_tokens, temperature)
         if ok:
-            return {"ok": True, "text": txt_or_err, "model": model}
-        # if 404, try next model; otherwise remember error and maybe return at end
-        last_error = {"model": model, "status": status, "error": txt_or_err}
-        logger.warning("Gemini model %s failed: %s (status %s)", model, txt_or_err, status)
-        if status not in (404, 0):
-            # for some HTTP errors we may stop trying (e.g., 403 unauthorized)
-            # but still try next model for robustness
-            pass
     return {"ok": False, "error": f"All models failed. Last: {last_error}", "last": last_error}
 # -------------------------
-# TTS backends (ElevenLabs -> gTTS)
 # -------------------------
 def tts_elevenlabs_bytes(text: str, voice_id: str, api_key: str) -> bytes:
     url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
@@ -308,7 +302,7 @@ def tts_elevenlabs_bytes(text: str, voice_id: str, api_key: str) -> bytes:
 def tts_gtts_bytes(text: str, lang: str = "vi") -> bytes:
     if not GTTS_AVAILABLE:
-        raise RuntimeError("gTTS not installed on server")
     t = gTTS(text=text, lang=lang)
     bio = io.BytesIO()
     t.write_to_fp(bio)
@@ -336,7 +330,7 @@ def synthesize_and_save(answer: str, lang_hint: str = "vi") -> Tuple[bool, str]:
         return False, f"TTS error: {e}"
 # -------------------------
-# STT: server-side speech-to-text
 # -------------------------
 def speech_to_text(wav_bytes: bytes) -> Tuple[bool, str]:
     if not STT_AVAILABLE:
@@ -352,7 +346,7 @@ def speech_to_text(wav_bytes: bytes) -> Tuple[bool, str]:
         return False, str(e)
 # -------------------------
-# Telegram helper
 # -------------------------
 def send_telegram_message(text: str) -> bool:
     token = CFG.get("TELEGRAM_TOKEN") or ""
@@ -384,12 +378,11 @@ textarea{width:100%;padding:10px;border-radius:8px;background:#061427;color:#fff
 button{padding:10px 14px;border-radius:8px;background:#0ea5ff;color:#012;border:none;cursor:pointer}
 #resp{white-space:pre-wrap;margin-top:12px;background:#021220;padding:12px;border-radius:6px}
 .small{font-size:0.9rem;color:#9fb3c8}
-</style></head>
-<body>
 <div class="container">
 <h1>🤖 KCrobot AI — Vmax (Voice-first)</h1>
 <p class="small">Model fallback: {{models}} — Gemini key: {{gemini}} — Telegram: {{tg}}</p>
-<p>Chú ý: giao diện chat là phụ — ưu tiên voice (ESP32 gửi audio). Bạn có thể thử gỏ "Xin chào" để nghe trả lời.</p>
 <textarea id="q" rows="4" placeholder="Nhập tiếng Việt / English..."></textarea>
 <p><label><input id="voice" type="checkbox" checked> Voice ON</label>
 <button onclick="send()">Gửi & Nghe</button></p>
@@ -455,7 +448,6 @@ def chat_text():
         if ok:
             play_url = "/play_latest"
     try:
-        # Telegram notify background
         threading.Thread(target=send_telegram_message, args=(f"Q: {q}\nA: {answer}",)).start()
     except Exception:
         pass
@@ -466,19 +458,10 @@ def chat_text():
 @app.route("/esp/send_text", methods=["POST"])
 def esp_send_text():
-    # wrapper for chat_text for esp32 convenience
     return chat_text()
 @app.route("/chat_audio", methods=["POST"])
 def chat_audio():
-    """
-    Primary voice endpoint:
-    - Accept audio file field 'file' (wav) or raw bytes body
-    - Do STT (server-side) if available, else return 501
-    - Use Gemini to reply (model fallback)
-    - Synthesize reply to latest_reply.mp3 and return play_url
-    """
-    # read bytes
     wav_bytes = None
     if 'file' in request.files:
         f = request.files['file']
@@ -487,7 +470,6 @@ def chat_audio():
         wav_bytes = request.get_data()
     if not wav_bytes:
         return jsonify({"error": "no audio provided"}), 400
-    # save for debugging
     try:
         ts = int(time.time())
         (DATA_DIR / f"uploaded_{ts}.wav").write_bytes(wav_bytes)
@@ -549,10 +531,9 @@ def health():
     })
 # -------------------------
-# Start app
 # -------------------------
 if __name__ == "__main__":
-    # ensure files exist
     load_json_safe(HISTORY_FILE, [])
     load_json_safe(USAGE_FILE, {})
     logger.info("KCrobot Vmax starting. Gemini key present: %s, SDK present: %s, STT available: %s",

 # -*- coding: utf-8 -*-
 """
 KCrobot AI — Vmax Final (voice-first)
+- Default Gemini model: gemini-1.5-pro (fallbacks implemented)
+- Read secrets from env (HF "New secret"):
     GEMINI_API_KEY, GEMINI_MODEL, TELEGRAM_TOKEN, TELEGRAM_CHAT_ID,
     ELEVEN_API_KEY, ELEVEN_VOICE_ID, GOOGLE_SPEECH_LANG
 - Voice-first: ESP32 uploads audio -> /chat_audio -> STT -> Gemini -> TTS -> MP3
+- If STT libs missing, /chat_audio returns 501 (server-side STT optional)
 - Endpoints:
+    GET  /             -> simple web UI (chat secondary)
+    POST /chat_text    -> {"q":"...","voice":true}
+    POST /chat_audio   -> upload wav (multipart 'file' or raw bytes)
+    POST /esp/send_text-> wrapper for /chat_text
+    GET  /play_latest  -> latest_reply.mp3
+    GET  /_history     -> recent history
+    POST /notify       -> forward to Telegram
+    GET  /health       -> health check
 """
 from __future__ import annotations
 from datetime import datetime
 from typing import Tuple, Dict, Any, Optional
 from flask import Flask, request, jsonify, send_file, render_template_string
+import requests
+# --- Attempt to import Google GenAI SDK (new) or older lib
 USE_GENAI_SDK = False
 GENAI_CLIENT = None
 try:
+    # new official package pattern: `from google import genai`
     from google import genai  # type: ignore
     USE_GENAI_SDK = True
 except Exception:
     try:
         import google.generativeai as genai  # type: ignore
         USE_GENAI_SDK = True
     except Exception:
         genai = None
         USE_GENAI_SDK = False
+# --- TTS/STT libs (optional)
 try:
     from gtts import gTTS  # type: ignore
     GTTS_AVAILABLE = True
 except Exception:
     GTTS_AVAILABLE = False
 try:
     import speech_recognition as sr  # type: ignore
     from pydub import AudioSegment  # type: ignore
     STT_AVAILABLE = False
 # -------------------------
+# Config from env (HF New secret)
 # -------------------------
 CFG = {
     "GEMINI_API_KEY": os.getenv("GEMINI_API_KEY", "").strip(),
     "GOOGLE_SPEECH_LANG": os.getenv("GOOGLE_SPEECH_LANG", "vi-VN").strip(),
 }
+# Model fallback list: prefer configured model, then alternatives
 MODEL_FALLBACK_LIST = [
     CFG.get("GEMINI_MODEL") or "gemini-1.5-pro",
     "gemini-1.5-flash",
     "gemini-2.5-pro",
 ]
+# dedupe keep order
 seen = set()
 MODEL_FALLBACK = []
 for m in MODEL_FALLBACK_LIST:
 GEMINI_KEY = CFG.get("GEMINI_API_KEY") or ""
 if USE_GENAI_SDK and GEMINI_KEY:
     try:
+        # Try new SDK client style
         GENAI_CLIENT = genai.Client(api_key=GEMINI_KEY)  # type: ignore
     except Exception:
         try:
+            # fallback older style configure
             genai.configure(api_key=GEMINI_KEY)  # type: ignore
             GENAI_CLIENT = genai  # type: ignore
         except Exception:
             GENAI_CLIENT = None
 # -------------------------
+# Storage & logging
 # -------------------------
 BASE = pathlib.Path.cwd()
 DATA_DIR = BASE / "data"
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("kcrobot_vmax")
+# save non-secret snapshot
 try:
     CFG_SNAPSHOT.write_text(json.dumps({k: bool(CFG.get(k)) for k in CFG}, indent=2), encoding="utf-8")
 except Exception:
     pass
 # -------------------------
+# Helpers: json safe, usage, history
 # -------------------------
 def load_json_safe(path: pathlib.Path, default):
     try:
         if path.exists():
             return json.loads(path.read_text(encoding="utf-8"))
     except Exception as e:
+        logger.debug("load_json_safe error %s -> %s", path, e)
     return default
 def save_json_safe(path: pathlib.Path, data):
     try:
         path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
         return True
+    except Exception as e:
+        logger.exception("save_json_safe failed for %s: %s", path, e)
     return False
 def today_str():
     save_json_safe(HISTORY_FILE, h)
 # -------------------------
+# Language detection
 # -------------------------
+VIET_CHAR_RE = re.compile(
+    r"[àáạảãâầấậẩẫăằắặẳẵđèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹ]",
+    re.I,
+)
 def detect_lang(text: str) -> str:
     if not text or not isinstance(text, str):
         return "en"
     return "en"
 # -------------------------
+# Gemini single model call (SDK preferred, REST fallback)
 # -------------------------
+def gemini_call_single(model: str, prompt: str, max_output_tokens: int = 1024, temperature: float = 0.2) -> Tuple[bool, str, int]:
     """
+    Try calling a single model. Return (ok, text_or_error, http_status_or_0)
     """
+    # SDK path
     if GENAI_CLIENT:
         try:
             if hasattr(GENAI_CLIENT, "models") and hasattr(GENAI_CLIENT.models, "generate_content"):
                 resp = GENAI_CLIENT.models.generate_content(model=model, contents=prompt,
                                                            max_output_tokens=max_output_tokens, temperature=temperature)  # type: ignore
                 if txt:
                     return True, txt, 200
                 return True, str(resp), 200
             if hasattr(GENAI_CLIENT, "generate_content"):
                 resp = GENAI_CLIENT.generate_content(prompt, max_output_tokens=max_output_tokens, temperature=temperature)
                 if hasattr(resp, "text") and resp.text:
                     return True, resp.text, 200
                 return True, str(resp), 200
         except requests.exceptions.HTTPError as he:
             try:
                 code = he.response.status_code
             except Exception:
             return False, str(he), code
         except Exception as e:
             return False, str(e), 0
     # REST fallback
     key = CFG.get("GEMINI_API_KEY") or ""
     if not key:
         "prompt": {
             "messages": [
                 {"author": "system", "content": {"text": "You are a helpful assistant."}},
+                {"author": "user", "content": {"text": prompt}},
             ]
         },
         "maxOutputTokens": max_output_tokens,
+        "temperature": temperature,
     }
     try:
         r = requests.post(url, params={"key": key}, json=payload, headers=headers, timeout=30)
         if status >= 400:
             return False, f"HTTP {status}: {r.text}", status
         j = r.json()
+        # try parse candidates
         cand = j.get("candidates")
         if cand and isinstance(cand, list):
             c0 = cand[0]
 def call_gemini_with_fallbacks(prompt: str, max_output_tokens: int = 1024, temperature: float = 0.2) -> Dict[str, Any]:
     """
+    Try models in MODEL_FALLBACK sequence; return dict with ok/text/model or error.
     """
     if not CFG.get("GEMINI_API_KEY"):
         return {"ok": False, "error": "Gemini API key not configured (set GEMINI_API_KEY in New secret)"}
     for model in MODEL_FALLBACK:
         if not model:
             continue
+        ok, text_or_err, status = gemini_call_single(model, prompt, max_output_tokens, temperature)
         if ok:
+            return {"ok": True, "text": text_or_err, "model": model}
+        last_error = {"model": model, "status": status, "error": text_or_err}
+        logger.warning("Model %s failed: %s (status=%s)", model, text_or_err, status)
+        # continue to next for robustness; some errors (403) may persist but try anyway
     return {"ok": False, "error": f"All models failed. Last: {last_error}", "last": last_error}
 # -------------------------
+# TTS: ElevenLabs optional -> gTTS fallback
 # -------------------------
 def tts_elevenlabs_bytes(text: str, voice_id: str, api_key: str) -> bytes:
     url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
 def tts_gtts_bytes(text: str, lang: str = "vi") -> bytes:
     if not GTTS_AVAILABLE:
+        raise RuntimeError("gTTS not available in environment")
     t = gTTS(text=text, lang=lang)
     bio = io.BytesIO()
     t.write_to_fp(bio)
         return False, f"TTS error: {e}"
 # -------------------------
+# STT: server-side speech-to-text (optional)
 # -------------------------
 def speech_to_text(wav_bytes: bytes) -> Tuple[bool, str]:
     if not STT_AVAILABLE:
         return False, str(e)
 # -------------------------
+# Telegram helper (optional)
 # -------------------------
 def send_telegram_message(text: str) -> bool:
     token = CFG.get("TELEGRAM_TOKEN") or ""
 button{padding:10px 14px;border-radius:8px;background:#0ea5ff;color:#012;border:none;cursor:pointer}
 #resp{white-space:pre-wrap;margin-top:12px;background:#021220;padding:12px;border-radius:6px}
 .small{font-size:0.9rem;color:#9fb3c8}
+</style></head><body>
 <div class="container">
 <h1>🤖 KCrobot AI — Vmax (Voice-first)</h1>
 <p class="small">Model fallback: {{models}} — Gemini key: {{gemini}} — Telegram: {{tg}}</p>
+<p>Giao diện chat là phụ — ưu tiên voice (ESP32 gửi audio). Bạn có thể thử gõ "Xin chào" để nghe trả lời.</p>
 <textarea id="q" rows="4" placeholder="Nhập tiếng Việt / English..."></textarea>
 <p><label><input id="voice" type="checkbox" checked> Voice ON</label>
 <button onclick="send()">Gửi & Nghe</button></p>
         if ok:
             play_url = "/play_latest"
     try:
         threading.Thread(target=send_telegram_message, args=(f"Q: {q}\nA: {answer}",)).start()
     except Exception:
         pass
 @app.route("/esp/send_text", methods=["POST"])
 def esp_send_text():
     return chat_text()
 @app.route("/chat_audio", methods=["POST"])
 def chat_audio():
     wav_bytes = None
     if 'file' in request.files:
         f = request.files['file']
         wav_bytes = request.get_data()
     if not wav_bytes:
         return jsonify({"error": "no audio provided"}), 400
     try:
         ts = int(time.time())
         (DATA_DIR / f"uploaded_{ts}.wav").write_bytes(wav_bytes)
     })
 # -------------------------
+# Start server
 # -------------------------
 if __name__ == "__main__":
     load_json_safe(HISTORY_FILE, [])
     load_json_safe(USAGE_FILE, {})
     logger.info("KCrobot Vmax starting. Gemini key present: %s, SDK present: %s, STT available: %s",