Spaces:

MrA7A1
/

AiCoderClean

Sleeping

App Files Community

MrA7A1 committed on Mar 22

Commit

8e4fc58

verified ·

1 Parent(s): 5b0d4db

KAPO rollout fix: sync brain_server/api/main.py

Browse files

Files changed (1) hide show

brain_server/api/main.py +45 -108

brain_server/api/main.py CHANGED Viewed

@@ -80,6 +80,7 @@ FIREBASE_RUNTIME_CACHE: dict[str, tuple[float, Any]] = {}
 RUNTIME_LOG_BUFFER: deque[dict[str, Any]] = deque(maxlen=200)
 LAST_BRAIN_URL_REPORT: dict[str, Any] = {"url": "", "ts": 0.0}
 RUNTIME_STATE_THREAD_STARTED = False
 DEFAULT_MODEL_REPO = "QuantFactory/aya-expanse-8b-GGUF"
 DEFAULT_MODEL_FILE = "aya-expanse-8b.Q4_K_M.gguf"
@@ -198,7 +199,17 @@ def _drive_bootstrap_configured() -> bool:
 def _bootstrap_shared_state() -> None:
     if _drive_bootstrap_configured() or _shared_state_backend() in {"google_drive", "drive", "gdrive"}:
-        DRIVE_STATE.ensure_bootstrap_loaded(force=False)
 def _startup_self_update_enabled() -> bool:
@@ -312,10 +323,6 @@ def _is_kaggle_runtime() -> bool:
     return "/kaggle/" in str(_project_root()).replace("\\", "/") or bool(os.getenv("KAGGLE_KERNEL_RUN_TYPE"))
-def _is_hf_space_runtime() -> bool:
-    return str(os.getenv("HF_SPACE_DOCKER", "0")).strip().lower() in {"1", "true", "yes", "on"} or bool(os.getenv("SPACE_ID"))
 def _apply_executor_settings(settings: dict[str, Any]) -> None:
     for key in (
         "NGROK_AUTHTOKEN",
@@ -1057,6 +1064,36 @@ def _report_known_public_url() -> str | None:
     return public_url
 def _bootstrap_executor_handshake(start_tunnel: bool = False) -> None:
     executor_url = os.getenv("EXECUTOR_URL", "").strip()
     if not executor_url:
@@ -1066,8 +1103,10 @@ def _bootstrap_executor_handshake(start_tunnel: bool = False) -> None:
                 logger.info("Brain public URL started locally without executor handshake: %s", public_url)
             else:
                 logger.info("Brain started without publishing a public URL")
             return
         logger.info("Skipping executor handshake: EXECUTOR_URL not configured")
         return
     settings = _pull_executor_settings()
@@ -1084,6 +1123,7 @@ def _bootstrap_executor_handshake(start_tunnel: bool = False) -> None:
         logger.info("Brain public URL reported to executor: %s", public_url)
     else:
         logger.info("Brain started without publishing a public URL")
 @app.on_event("startup")
@@ -1754,9 +1794,6 @@ def _dispatch_background(task, *args) -> None:
 def _restart_process(delay_sec: float = 1.0) -> None:
-    if _is_hf_space_runtime():
-        logger.info("Skipping in-process restart on Hugging Face Space runtime")
-        return
     def _run() -> None:
         time.sleep(max(0.2, float(delay_sec)))
         target_root = _sync_target_root()
@@ -2127,13 +2164,6 @@ else:
 @app.post("/system/restart")
 async def system_restart(req: RestartRequest | None = None):
     delay_sec = req.delay_sec if req else 1.0
-    if _is_hf_space_runtime():
-        return {
-            "status": "skipped",
-            "reason": "restart_disabled_on_hf_space",
-            "delay_sec": delay_sec,
-            "target_root": _sync_target_root(),
-        }
     _restart_process(delay_sec=delay_sec)
     return {
         "status": "restarting",
@@ -2307,96 +2337,3 @@ async def health(executor_url: str | None = None, check_executor: bool = False):
     payload = _health_payload(check_executor=check_executor, executor_url=executor_url)
     _persist_runtime_state_snapshot(reason="health_endpoint")
     return payload
-# KAPO HF SPACE TRANSFORMERS PATCH
-def _kapo_hf_transformers_enabled() -> bool:
-    return str(os.getenv('KAPO_HF_TRANSFORMERS_RUNTIME', '0')).strip().lower() in {'1', 'true', 'yes', 'on'}
-def ensure_model_loaded(repo_id: str, filename: str, hf_token: str | None = None) -> None:
-    global MODEL, MODEL_ERROR, MODEL_META
-    repo_id = (repo_id or '').strip()
-    filename = (filename or '').strip()
-    if not repo_id:
-        MODEL = None
-        MODEL_ERROR = 'model repo missing'
-        return
-    if _kapo_hf_transformers_enabled() or (_is_hf_space_runtime() and not filename):
-        try:
-            from transformers import AutoModelForCausalLM, AutoTokenizer
-            tokenizer = AutoTokenizer.from_pretrained(repo_id, token=hf_token, trust_remote_code=True)
-            model = AutoModelForCausalLM.from_pretrained(repo_id, token=hf_token, trust_remote_code=True, device_map='cpu')
-            if hasattr(model, 'eval'):
-                model.eval()
-            MODEL = {'kind': 'transformers', 'model': model, 'tokenizer': tokenizer}
-            MODEL_ERROR = None
-            MODEL_META = {'repo_id': repo_id, 'filename': filename, 'path': None}
-            logger.info('Loaded transformers model %s', repo_id)
-            return
-        except Exception as exc:
-            MODEL = None
-            MODEL_ERROR = f'transformers model load failed: {exc}'
-            logger.exception('Transformers model load failed')
-            return
-    if not filename:
-        MODEL = None
-        MODEL_ERROR = 'model file missing'
-        return
-    try:
-        model_path = _download_model(repo_id, filename, hf_token=hf_token)
-    except Exception as exc:
-        MODEL = None
-        MODEL_ERROR = f'model download failed: {exc}'
-        logger.exception('Model download failed')
-        return
-    try:
-        from llama_cpp import Llama
-        MODEL = Llama(model_path=model_path, n_ctx=4096)
-        MODEL_ERROR = None
-        MODEL_META = {'repo_id': repo_id, 'filename': filename, 'path': model_path}
-        logger.info('Loaded model %s/%s', repo_id, filename)
-    except Exception as exc:
-        MODEL = None
-        MODEL_ERROR = f'model load failed: {exc}'
-        logger.exception('Model load failed')
-def _generate_response(user_input: str, history: list[dict[str, str]], context_block: str) -> str:
-    language = _detect_language(user_input)
-    exact_reply = _extract_exact_reply_instruction_safe(user_input)
-    if exact_reply:
-        return exact_reply
-    fast_reply = _project_specific_fast_reply(user_input)
-    if fast_reply:
-        return fast_reply
-    if MODEL is None:
-        try:
-            _load_default_model()
-        except Exception:
-            logger.exception('Lazy model load failed')
-        if MODEL is None:
-            if language == 'ar':
-                return 'الخدمة تعمل لكن توليد الرد الحر غير متاح الآن لأن النموذج غير محمل.'
-            return 'The Brain is online, but natural chat generation is unavailable because the model is not loaded.'
-    prompt = _build_chat_prompt(user_input, history, context_block)
-    try:
-        max_tokens = 80 if language == 'ar' else 96
-        if isinstance(MODEL, dict) and MODEL.get('kind') == 'transformers':
-            tokenizer = MODEL['tokenizer']
-            model = MODEL['model']
-            inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=2048)
-            if hasattr(model, 'device'):
-                inputs = {k: v.to(model.device) if hasattr(v, 'to') else v for k, v in inputs.items()}
-            output_ids = model.generate(**inputs, max_new_tokens=max_tokens, do_sample=False, pad_token_id=tokenizer.eos_token_id)
-            generated = output_ids[0][inputs['input_ids'].shape[1]:]
-            text = tokenizer.decode(generated, skip_special_tokens=True).strip()
-        else:
-            output = MODEL(prompt, max_tokens=max_tokens, temperature=0.1, top_p=0.85, stop=['\nUser:', '\nUSER:', '\n###', '<|EOT|>'])
-            text = output['choices'][0]['text'].strip()
-        if _response_looks_bad(text, language):
-            return _fallback_response(user_input)
-        return text or ('تم استلام رسالتك.' if language == 'ar' else 'I received your message.')
-    except Exception:
-        logger.exception('Model generation failed')
-        if language == 'ar':
-            return 'فهمت طلبك، لكن حدث خطأ أثناء توليد الرد النصي.'
-        return 'I understood your request, but text generation failed.'

 RUNTIME_LOG_BUFFER: deque[dict[str, Any]] = deque(maxlen=200)
 LAST_BRAIN_URL_REPORT: dict[str, Any] = {"url": "", "ts": 0.0}
 RUNTIME_STATE_THREAD_STARTED = False
+PUBLIC_URL_RETRY_STARTED = False
 DEFAULT_MODEL_REPO = "QuantFactory/aya-expanse-8b-GGUF"
 DEFAULT_MODEL_FILE = "aya-expanse-8b.Q4_K_M.gguf"
 def _bootstrap_shared_state() -> None:
     if _drive_bootstrap_configured() or _shared_state_backend() in {"google_drive", "drive", "gdrive"}:
+        payload = DRIVE_STATE.ensure_bootstrap_loaded(force=False) or {}
+        fallback_mappings = {
+            "executor_url": "EXECUTOR_URL",
+            "control_plane_url": "KAPO_CONTROL_PLANE_URL",
+            "cloudflare_control_plane_url": "KAPO_CONTROL_PLANE_URL",
+            "cloudflare_queue_name": "KAPO_CLOUDFLARE_QUEUE_NAME",
+        }
+        for key, env_name in fallback_mappings.items():
+            value = payload.get(key)
+            if value not in (None, ""):
+                os.environ[env_name] = str(value)
 def _startup_self_update_enabled() -> bool:
     return "/kaggle/" in str(_project_root()).replace("\\", "/") or bool(os.getenv("KAGGLE_KERNEL_RUN_TYPE"))
 def _apply_executor_settings(settings: dict[str, Any]) -> None:
     for key in (
         "NGROK_AUTHTOKEN",
     return public_url
+def _retry_publish_public_url(attempts: int = 8, delay_sec: float = 12.0) -> None:
+    for attempt in range(max(1, int(attempts))):
+        try:
+            public_url = _report_known_public_url()
+            if not public_url and _auto_publish_public_url_on_startup():
+                public_url = start_ngrok(os.getenv("NGROK_AUTHTOKEN") or None)
+            if public_url:
+                logger.info("Recovered brain public URL on retry attempt %s: %s", attempt + 1, public_url)
+                return
+        except Exception:
+            logger.warning("Public URL retry attempt %s failed", attempt + 1, exc_info=True)
+        time.sleep(max(2.0, float(delay_sec)))
+    logger.warning("Brain public URL retry loop exhausted without a published URL")
+def _ensure_public_url_background(start_tunnel: bool = False) -> None:
+    global PUBLIC_URL_RETRY_STARTED
+    current = str(os.getenv("BRAIN_PUBLIC_URL") or LAST_BRAIN_URL_REPORT.get("url") or _load_saved_public_url() or "").strip()
+    if current or PUBLIC_URL_RETRY_STARTED:
+        return
+    if not start_tunnel and not _auto_publish_public_url_on_startup():
+        return
+    PUBLIC_URL_RETRY_STARTED = True
+    threading.Thread(
+        target=_retry_publish_public_url,
+        kwargs={"attempts": 8, "delay_sec": 12.0},
+        daemon=True,
+    ).start()
 def _bootstrap_executor_handshake(start_tunnel: bool = False) -> None:
     executor_url = os.getenv("EXECUTOR_URL", "").strip()
     if not executor_url:
                 logger.info("Brain public URL started locally without executor handshake: %s", public_url)
             else:
                 logger.info("Brain started without publishing a public URL")
+                _ensure_public_url_background(start_tunnel=True)
             return
         logger.info("Skipping executor handshake: EXECUTOR_URL not configured")
+        _ensure_public_url_background(start_tunnel=start_tunnel)
         return
     settings = _pull_executor_settings()
         logger.info("Brain public URL reported to executor: %s", public_url)
     else:
         logger.info("Brain started without publishing a public URL")
+        _ensure_public_url_background(start_tunnel=start_tunnel)
 @app.on_event("startup")
 def _restart_process(delay_sec: float = 1.0) -> None:
     def _run() -> None:
         time.sleep(max(0.2, float(delay_sec)))
         target_root = _sync_target_root()
 @app.post("/system/restart")
 async def system_restart(req: RestartRequest | None = None):
     delay_sec = req.delay_sec if req else 1.0
     _restart_process(delay_sec=delay_sec)
     return {
         "status": "restarting",
     payload = _health_payload(check_executor=check_executor, executor_url=executor_url)
     _persist_runtime_state_snapshot(reason="health_endpoint")
     return payload