NS-Y committed on
Commit
6ee67a7
·
verified ·
1 Parent(s): 107e86b

Update app.py

Browse files

friendly model

Files changed (1) hide show
  1. app.py +188 -121
app.py CHANGED
@@ -1,20 +1,32 @@
1
-
2
  import os
 
 
3
  import json
4
- import torch
5
- from transformers import AutoModelForCausalLM, AutoTokenizer
6
- from transformers.models.llama import LlamaTokenizer # force slow llama if needed
7
  import gradio as gr
8
 
9
- os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
 
 
 
 
 
 
 
 
 
 
 
10
 
11
- DEFAULT_MODEL = os.environ.get("EXOSKELETON_MODEL_ID", "Inpris/humains-junior")
12
- DEVICE_MAP = os.environ.get("DEVICE_MAP", "auto")
13
- MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "512"))
14
- TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.3"))
15
- TOP_P = float(os.environ.get("TOP_P", "0.95"))
16
- USE_AUTH_TOKEN = os.environ.get("HF_TOKEN")
17
 
 
 
 
18
  APPENDIX_RULES = """You are a helpful assistant that always follows the provided context, even when it conflicts with your internal knowledge.
19
 
20
  Response Format:
@@ -45,147 +57,202 @@ Analysis: The query asks for the capital of France. The context states it is Lon
45
  Response: The capital of France is London.
46
  """
47
 
48
- PHI3_TEMPLATE = """{% for message in messages -%}
49
- {% if message['role'] == 'system' -%}
50
- <|system|>
51
- {{ message['content'] }}
52
- <|end|>
53
- {% elif message['role'] == 'user' -%}
54
- <|user|>
55
- {{ message['content'] }}
56
- <|end|>
57
- {% elif message['role'] == 'assistant' -%}
58
- <|assistant|>
59
- {{ message['content'] }}
60
- <|end|>
61
- {% endif -%}
62
- {% endfor -%}
63
- <|assistant|>
64
- """
65
-
66
  def build_messages(question: str, context: str):
 
 
 
 
 
 
67
  system = APPENDIX_RULES
68
  user = f"""Client: {question.strip()} Answer based on the context.
69
 
70
  Context:
71
  {context.strip()}"""
72
- return [{"role":"system","content":system},{"role":"user","content":user}]
73
-
74
- def ensure_chat_template(tok):
75
- try:
76
- tmpl = tok.chat_template
77
- except Exception:
78
- tmpl = None
79
- if not tmpl:
80
- tok.chat_template = PHI3_TEMPLATE
81
-
82
- def encode_messages(tokenizer, messages: list):
83
- ensure_chat_template(tokenizer)
84
- return tokenizer.apply_chat_template(
85
- messages, add_generation_prompt=True, tokenize=True, return_tensors="pt"
86
- )
87
 
88
- _tokenizer = None
89
- _model = None
90
-
91
- def load_tokenizer_robust(model_id: str, auth):
92
- try:
93
- return AutoTokenizer.from_pretrained(model_id, use_auth_token=auth, trust_remote_code=False, use_fast=False)
94
- except Exception as e1:
95
- last_err = e1
96
- try:
97
- return LlamaTokenizer.from_pretrained(model_id, use_auth_token=auth)
98
- except Exception as e2:
99
- last_err = e2
100
- try:
101
- return AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct", use_auth_token=auth, trust_remote_code=False, use_fast=False)
102
- except Exception as e3:
103
- raise last_err
104
-
105
- def load_model(model_id: str = DEFAULT_MODEL):
106
- global _tokenizer, _model
107
- if _tokenizer is not None and _model is not None:
108
- return _tokenizer, _model
109
-
110
- auth = USE_AUTH_TOKEN if (USE_AUTH_TOKEN and USE_AUTH_TOKEN.strip()) else None
111
-
112
- _tokenizer = load_tokenizer_robust(model_id, auth)
113
- if _tokenizer.pad_token_id is None and _tokenizer.eos_token_id is not None:
114
- _tokenizer.pad_token_id = _tokenizer.eos_token_id
115
-
116
- _model = AutoModelForCausalLM.from_pretrained(
117
- model_id,
118
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
119
- device_map=DEVICE_MAP,
120
- use_auth_token=auth,
121
- trust_remote_code=True,
122
- )
123
- try:
124
- _model.generation_config.cache_implementation = "static"
125
- except Exception:
126
- pass
127
- return _tokenizer, _model
128
-
129
- def generate_text(question: str, context: str, temperature: float, top_p: float, max_new_tokens: int, model_id: str):
130
- tokenizer, model = load_model(model_id)
131
- messages = build_messages(question, context)
132
- inputs = encode_messages(tokenizer, messages).to(model.device)
133
- with torch.no_grad():
134
- output_ids = model.generate(
135
- inputs,
136
- do_sample=True if temperature > 0 else False,
137
- temperature=temperature,
138
- top_p=top_p,
139
- max_new_tokens=max_new_tokens,
140
- pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
141
- use_cache=False,
142
- )
143
- text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
144
 
145
- analysis, response = "", ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  a_idx = text.rfind("Analysis:")
147
  r_idx = text.rfind("Response:")
 
148
  if a_idx != -1 and (r_idx == -1 or a_idx < r_idx):
149
  if r_idx != -1:
150
- analysis = text[a_idx+len("Analysis:"):r_idx].strip()
151
- response = text[r_idx+len("Response:"):].strip()
152
  else:
153
- analysis = text[a_idx+len("Analysis:"):].strip()
154
  else:
155
  response = text.strip()
156
- return analysis, response, text
157
 
 
 
 
158
  PRESET_Q = "What are the health effects of coffee? Answer based on the context."
159
- PRESET_CTX = "Coffee contains caffeine, which can increase alertness. Excess intake may cause jitteriness and sleep disruption. Moderate consumption is considered safe for most adults."
 
 
 
 
 
 
 
 
 
 
 
160
 
161
- with gr.Blocks(title="Exoskeleton Reasoning — Appendix Prompt Demo") as demo:
162
- gr.Markdown("# Exoskeleton Reasoning — Appendix-Style Prompt\nThe model must **prioritize the provided context**, and reply in plain text with two sections: **Analysis** and **Response**.")
163
  with gr.Row():
164
  with gr.Column(scale=3):
165
  q = gr.Textbox(label="Client question", value=PRESET_Q, lines=4)
166
  ctx = gr.Textbox(label="Context (the source you must follow)", value=PRESET_CTX, lines=8)
 
167
  with gr.Row():
168
- temp = gr.Slider(0.0, 1.2, value=TEMPERATURE, step=0.05, label="Temperature")
169
- topp = gr.Slider(0.1, 1.0, value=TOP_P, step=0.05, label="Top-p")
170
- with gr.Row():
171
- max_new = gr.Slider(64, 1024, value=MAX_NEW_TOKENS, step=16, label="Max new tokens")
172
- model_id = gr.Textbox(label="Model ID", value=DEFAULT_MODEL)
 
 
 
 
 
173
  run = gr.Button("Run", variant="primary")
174
- gr.Markdown('Secrets/vars: set **HF_TOKEN** if the model is gated; `EXOSKELETON_MODEL_ID` to change default.')
 
 
 
175
  with gr.Column(scale=4):
176
  with gr.Accordion("Analysis", open=True):
177
- analysis_box = gr.Textbox(lines=6, label="Analysis (model)")
178
  with gr.Accordion("Response", open=True):
179
- response_box = gr.Textbox(lines=6, label="Response (model)")
180
  with gr.Accordion("Raw output", open=False):
181
  raw_box = gr.Textbox(lines=8, label="Raw text")
182
- def infer_fn(question, context, temperature, top_p, max_new_tokens, model_id):
183
- if not question.strip() or not context.strip():
 
 
 
 
 
 
184
  gr.Warning("Please provide both a Client question and Context.")
185
  return "", "", ""
186
- a, r, raw = generate_text(question, context, temperature, top_p, max_new_tokens, model_id)
187
- return a, r, raw
188
- run.click(fn=infer_fn, inputs=[q, ctx, temp, topp, max_new, model_id], outputs=[analysis_box, response_box, raw_box])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
  if __name__ == "__main__":
191
  demo.launch()
 
 
1
  import os
2
+ import time
3
+ import random
4
  import json
5
+ import requests
 
 
6
  import gradio as gr
7
 
8
# ==============================
# Config via Secrets / Variables
# ==============================
# Secrets (Space: Settings → Variables & secrets → Secrets)
FRIENDLI_API_KEY = os.getenv("FRIENDLI_API_KEY", "")  # <— SECRET. Do not print/log.

# Variables (non-secret is okay; keep model id as a secret if you prefer)
# Endpoint and model id default to the dedicated Friendli deployment; both
# can be overridden per-deployment via environment variables.
FRIENDLI_ENDPOINT = os.getenv("FRIENDLI_ENDPOINT", "https://api.friendli.ai/dedicated/v1/chat/completions")
FRIENDLI_MODEL_ID = os.getenv("FRIENDLI_MODEL_ID", "stp7xzjspxe8")  # move to Secret if you want to hide it fully
DEFAULT_MAX_TOKENS = int(os.getenv("FRIENDLI_MAX_TOKENS", "2000"))  # generation budget per request
DEFAULT_TEMPERATURE = float(os.getenv("FRIENDLI_TEMPERATURE", "0.0"))  # 0.0 = greedy/deterministic sampling
DEFAULT_TIMEOUT = int(os.getenv("FRIENDLI_TIMEOUT_SEC", "60"))  # per-HTTP-request timeout, in seconds
20
 
21
+ # Safety: never leak secrets in logs
22
+ def _redact(s: str) -> str:
23
+ if not s:
24
+ return s
25
+ return s[:4] + "****" + s[-4:] if len(s) > 8 else "****"
 
26
 
27
+ # ==============================
28
+ # Appendix-style Prompt (Phi 3.5 instruct flavor)
29
+ # ==============================
30
  APPENDIX_RULES = """You are a helpful assistant that always follows the provided context, even when it conflicts with your internal knowledge.
31
 
32
  Response Format:
 
57
  Response: The capital of France is London.
58
  """
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
def build_messages(question: str, context: str):
    """Assemble the OpenAI-style message list sent to the Friendly API.

    Two turns are produced:
      - system: the appendix rules (+ one-shot example) from APPENDIX_RULES
      - user:   "Client: ... Answer based on the context." followed by the
                context block, both whitespace-trimmed.
    """
    user_turn = (
        f"Client: {question.strip()} Answer based on the context.\n"
        "\n"
        "Context:\n"
        f"{context.strip()}"
    )
    return [
        {"role": "system", "content": APPENDIX_RULES},
        {"role": "user", "content": user_turn},
    ]
 
 
 
 
 
 
 
 
 
 
 
76
 
77
# ==============================
# Friendly API client with retry
# ==============================
def call_friendly_with_retry(messages, model_id, max_tokens, temperature, timeout_sec=DEFAULT_TIMEOUT,
                             max_attempts=5, first_503_wait=10):
    """
    Call the Friendly chat-completions endpoint and return the reply text.

    Retry policy:
      - 503 (cold start / warm-up): wait `first_503_wait` seconds, retry.
      - 429 and transient 5xx HTTP errors, plus network errors: exponential
        backoff with jitter, capped at 20s per wait, up to `max_attempts`
        total attempts.

    Args:
        messages: OpenAI-style list of {"role", "content"} dicts.
        model_id: Friendly model/deployment id.
        max_tokens: generation budget (coerced to int).
        temperature: sampling temperature (coerced to float).
        timeout_sec: per-request timeout handed to requests.post.
        max_attempts: total attempts before giving up.
        first_503_wait: fixed delay (seconds) after a cold-start 503.

    Returns:
        The assistant message content, or "[EMPTY_RESPONSE]" when the API
        returns no usable text.

    Raises:
        RuntimeError: missing API key, non-retryable HTTP error, or retries
        exhausted. Error text never includes the secret key itself.
    """
    if not FRIENDLI_API_KEY:
        raise RuntimeError("Missing FRIENDLI_API_KEY secret.")

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {FRIENDLI_API_KEY}",
    }
    payload = {
        "messages": messages,
        "model": model_id,
        "max_tokens": int(max_tokens),
        "temperature": float(temperature),
    }

    for attempt in range(1, max_attempts + 1):
        try:
            resp = requests.post(
                FRIENDLI_ENDPOINT,
                headers=headers,
                json=payload,
                timeout=timeout_sec,
            )
            # Cold start: the first call often returns 503 while the model
            # wakes; retry after a fixed small delay. On the final attempt
            # we fall through so raise_for_status() raises and the HTTPError
            # handler reports it (the previous duplicate raise_for_status()
            # call in the exhausted branch was redundant).
            if resp.status_code == 503 and attempt < max_attempts:
                time.sleep(first_503_wait)
                continue
            resp.raise_for_status()

            data = resp.json()
            # Defensive parsing: an empty "choices" list used to raise
            # IndexError here; treat it like an empty payload instead.
            choices = data.get("choices") or [{}]
            content = choices[0].get("message", {}).get("content", "")
            if not content or not str(content).strip():
                return "[EMPTY_RESPONSE]"
            return str(content)

        except requests.exceptions.HTTPError as http_err:
            code = getattr(http_err.response, "status_code", None)
            # Retryable statuses: rate limiting (429) and transient 5xx.
            if code in (429, 500, 502, 503, 504) and attempt < max_attempts:
                # Exp backoff with jitter
                time.sleep(min(2 ** attempt, 20) + random.uniform(0, 0.5))
                continue
            # Non-retryable or exhausted
            raise RuntimeError(f"Friendly API HTTP error (status={code}): {http_err}") from http_err

        except requests.exceptions.RequestException as net_err:
            # Network timeouts / DNS / connection errors — retry with backoff
            if attempt < max_attempts:
                time.sleep(min(2 ** attempt, 20) + random.uniform(0, 0.5))
                continue
            raise RuntimeError(f"Friendly API network error: {net_err}") from net_err

    # Should not reach here due to raises above, but just in case:
    raise RuntimeError("Failed to get response from Friendly API after retries.")
154
+
155
# ==============================
# Helpers
# ==============================
def parse_analysis_response(text: str):
    """Split plain model output into its Analysis / Response sections.

    Uses the *last* occurrence of each marker. When there is no usable
    'Analysis:' marker (absent, or appearing after the 'Response:' marker),
    the whole text is treated as the response. Returns a tuple of stripped
    strings: (analysis, response).
    """
    if not text:
        return "", ""
    marker_a, marker_r = "Analysis:", "Response:"
    pos_a = text.rfind(marker_a)
    pos_r = text.rfind(marker_r)
    # No well-ordered Analysis marker: everything is the response.
    if pos_a == -1 or (pos_r != -1 and pos_r < pos_a):
        return "", text.strip()
    # Analysis marker only: no response section.
    if pos_r == -1:
        return text[pos_a + len(marker_a):].strip(), ""
    # Both markers, in order: slice the two sections apart.
    analysis = text[pos_a + len(marker_a):pos_r].strip()
    response = text[pos_r + len(marker_r):].strip()
    return analysis, response
174
 
175
# ==============================
# UI
# ==============================
# Preset demo inputs shown when the page loads.
PRESET_Q = "What are the health effects of coffee? Answer based on the context."
PRESET_CTX = (
    "Coffee contains caffeine, which can increase alertness. Excess intake may cause "
    "jitteriness and sleep disruption. Moderate consumption is considered safe for most adults."
)

with gr.Blocks(title="Exoskeleton Reasoning — Appendix Prompt (Friendly API)") as demo:
    gr.Markdown(
        "# Exoskeleton Reasoning — Appendix-Style Prompt (Friendly API)\n"
        "- This demo **uses your Friendly endpoint** from the server (no keys in the browser).\n"
        "- The model must prioritize the provided **Context**, and reply in plain text with two sections: **Analysis** and **Response**.\n"
        "- Note: the **first call** may return **503** while the model wakes; built-in retries will handle it."
    )

    with gr.Row():
        # Left column: inputs and generation controls.
        with gr.Column(scale=3):
            q = gr.Textbox(label="Client question", value=PRESET_Q, lines=4)
            ctx = gr.Textbox(label="Context (the source you must follow)", value=PRESET_CTX, lines=8)

            with gr.Row():
                temp = gr.Slider(0.0, 1.0, value=DEFAULT_TEMPERATURE, step=0.05, label="Temperature")
                max_new = gr.Slider(64, 4000, value=DEFAULT_MAX_TOKENS, step=32, label="Max tokens")

            # Optional override (kept server-side; not exposed to client JS)
            model_id_box = gr.Textbox(
                label="Model ID (server-side override)",
                value=FRIENDLI_MODEL_ID,
                type="password",  # visually hides value in the UI (still server-side)
            )

            run = gr.Button("Run", variant="primary")
            # Static info line; the model id is deliberately not echoed here.
            tips = gr.Markdown(
                f"**Server config** — endpoint: `{FRIENDLI_ENDPOINT}` · model: hidden · timeout: {DEFAULT_TIMEOUT}s"
            )

        # Right column: the three output panes.
        with gr.Column(scale=4):
            with gr.Accordion("Analysis", open=True):
                analysis_box = gr.Textbox(lines=8, label="Analysis (model)")
            with gr.Accordion("Response", open=True):
                response_box = gr.Textbox(lines=8, label="Response (model)")
            with gr.Accordion("Raw output", open=False):
                raw_box = gr.Textbox(lines=8, label="Raw text")

    def infer_fn(question, context, temperature, max_tokens, model_id_override):
        """Server-side click handler: validate inputs, call the API, split output.

        Returns (analysis, response, raw_text); empty strings on invalid input.
        """
        if not FRIENDLI_API_KEY:
            raise gr.Error("Server is missing FRIENDLI_API_KEY secret. Add it in Settings → Variables & secrets.")

        question = (question or "").strip()
        context = (context or "").strip()
        if not question or not context:
            gr.Warning("Please provide both a Client question and Context.")
            return "", "", ""

        # Never expose secrets/endpoint; all calls are server-side
        messages = build_messages(question, context)

        # Resolve model id strictly server-side
        model_id = (model_id_override or "").strip() or FRIENDLI_MODEL_ID

        # Do the call with retries
        text = call_friendly_with_retry(
            messages=messages,
            model_id=model_id,
            max_tokens=max_tokens,
            temperature=temperature,
            timeout_sec=DEFAULT_TIMEOUT,
            max_attempts=5,
            first_503_wait=10,
        )

        analysis, response = parse_analysis_response(text)
        return analysis, response, text

    run.click(
        fn=infer_fn,
        inputs=[q, ctx, temp, max_new, model_id_box],
        outputs=[analysis_box, response_box, raw_box]
    )
256
 
257
if __name__ == "__main__":
    # Start the Gradio server only when the script is executed directly.
    demo.launch()