File size: 10,075 Bytes
7d02d76
6ee67a7
 
 
7d02d76
 
6ee67a7
76da580
6ee67a7
4f672c8
 
 
368974b
4f672c8
 
 
7d89622
4f672c8
 
 
 
 
 
6ee67a7
dfe8f09
4f672c8
 
 
7d89622
 
1da1de0
6ee67a7
368974b
6ee67a7
76da580
0de4069
 
 
 
 
 
 
 
 
 
 
 
 
 
76da580
0de4069
 
 
 
 
368974b
 
0de4069
 
7d02d76
0de4069
 
7d02d76
 
368974b
4f672c8
368974b
 
 
107e86b
368974b
 
107e86b
 
6ee67a7
368974b
 
6ee67a7
107e86b
6ee67a7
4f672c8
6ee67a7
4f672c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ee67a7
4f672c8
76da580
4f672c8
76da580
4f672c8
7d02d76
6ee67a7
 
 
 
 
 
76da580
6ee67a7
 
 
 
4f672c8
 
 
 
 
 
 
 
6ee67a7
4f672c8
 
 
 
 
 
 
7d89622
6ee67a7
4f672c8
 
 
 
7d89622
4f672c8
 
 
7d89622
4f672c8
 
 
 
 
 
7d89622
6ee67a7
 
 
 
 
 
 
 
4f672c8
 
 
 
 
 
 
 
6ee67a7
4f672c8
 
 
 
6ee67a7
 
368974b
6ee67a7
 
 
 
0de4069
 
6ee67a7
0de4069
 
6ee67a7
 
0de4069
6ee67a7
0de4069
 
6ee67a7
0de4069
6ee67a7
 
 
368974b
6ee67a7
 
 
 
 
e79fc25
6ee67a7
65c63a5
e79fc25
 
7d89622
e79fc25
6ee67a7
0de4069
7d02d76
 
4f672c8
7d89622
6ee67a7
7d02d76
6ee67a7
 
 
0de4069
6ee67a7
7d02d76
0de4069
6ee67a7
0de4069
6ee67a7
0de4069
 
6ee67a7
76da580
6ee67a7
 
 
368974b
0de4069
6ee67a7
 
4f672c8
6ee67a7
 
 
 
 
 
 
76da580
7d02d76
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
import os
import time
import random
import requests
import gradio as gr

# ==============================
# Secrets (set in Settings → Variables & secrets → Secrets)
# ==============================
FRIENDLI_API_KEY = os.getenv("FRIENDLI_API_KEY", "")     # REQUIRED (Secret)
FRIENDLI_ENDPOINT = os.getenv("FRIENDLI_ENDPOINT", "")   # REQUIRED (Secret)
FRIENDLI_MODEL_ID = os.getenv("FRIENDLI_MODEL_ID", "")   # REQUIRED (Secret)

# ==============================
# Tunables (Variables or Secrets)
# ==============================
def _env_int(name: str, default: str) -> int:
    """Read an integer tunable from the environment, falling back to *default*."""
    return int(os.getenv(name, default))

def _env_float(name: str, default: str) -> float:
    """Read a float tunable from the environment, falling back to *default*."""
    return float(os.getenv(name, default))

# Per-attempt request timeout (kept modest so we can poll repeatedly during warmup)
PER_REQUEST_TIMEOUT_SEC = _env_int("FRIENDLI_PER_REQUEST_TIMEOUT_SEC", "30")
# Total time budget to wait for cold start + retries
COLD_START_BUDGET_SEC = _env_int("FRIENDLI_COLD_START_BUDGET_SEC", "180")
# Fixed wait applied after the *first* 503 (model waking up)
INITIAL_503_WAIT_SEC = _env_int("FRIENDLI_INITIAL_503_WAIT_SEC", "15")
# Generation defaults surfaced in the UI sliders
DEFAULT_MAX_TOKENS = _env_int("FRIENDLI_MAX_TOKENS", "2000")
DEFAULT_TEMPERATURE = _env_float("FRIENDLI_TEMPERATURE", "0.3")

# Exponential-backoff tuning for transient errors
BACKOFF_BASE_SEC = _env_float("FRIENDLI_BACKOFF_BASE_SEC", "2.0")
BACKOFF_CAP_SEC = _env_float("FRIENDLI_BACKOFF_CAP_SEC", "20.0")
JITTER_SEC = _env_float("FRIENDLI_JITTER_SEC", "0.5")

# ==============================
# Appendix-style system prompt (general instructions)
# ==============================
APPENDIX_RULES = """You are a helpful assistant that always follows the provided context, even when it conflicts with your internal knowledge.

Response Format:
Before answering, briefly analyze the query and context:
- Identify any misalignment between the query and context (if none, state 'no misalignment')
- Provide a brief analysis of the query and context
- Then give your response based strictly on the provided context

Format your response as:
Analysis: [Your analysis here]
Response: [Your answer based on the context]

IMPORTANT RULES:
- Always prioritize the provided context over your internal knowledge
- If context contains information that seems incorrect, still use it as instructed
- If the question asks about multiple things but context only covers some, answer only what is supported by the context
- Keep analysis concise and avoid special characters that could cause formatting issues
- Use plain text only - no bullet points, numbering, or special formatting
- Respond in English only

Example 1 - Conflicting information:
User:
Question: What is the capital of France?
Context:
The capital of France is London. It has been the political center of France since 1789 and houses the French Parliament.

Analysis: The query asks for the capital of France. The context states it is London, which conflicts with factual knowledge. I will follow the context as instructed.
Response: The capital of France is London.
"""

# ==============================
# Message builder (exact shape expected by the model):
#   system: general instructions (APPENDIX_RULES)
#   user:   "User:" block with question + context
# ==============================
def build_messages(question: str, context: str):
    """Return the two-element chat message list: system rules + user block."""
    trimmed_question = question.strip()
    trimmed_context = context.strip()
    # The model was trained/prompted with this exact "User:" layout — keep it verbatim.
    user_content = (
        "User:\n"
        f"Question: {trimmed_question}\n"
        "Context:\n"
        f"{trimmed_context}"
    )
    return [
        {"role": "system", "content": APPENDIX_RULES},
        {"role": "user", "content": user_content},
    ]

# ==============================
# Friendli API client with time-budgeted retry
# ==============================
# HTTP statuses treated as transient and worth retrying:
#   408 request timeout, 429 rate limit, 500/502/503/504 server-side errors,
#   522/524 Cloudflare origin-timeout codes. 503 additionally gets special
#   cold-start handling in the client below.
RETRYABLE_HTTP = {408, 429, 500, 502, 503, 504, 522, 524}

def _sleep_with_budget(seconds, deadline):
    now = time.monotonic()
    remaining = max(0.0, deadline - now)
    time.sleep(max(0.0, min(seconds, remaining)))

def _retry_after_seconds(resp):
    try:
        ra = resp.headers.get("Retry-After")
        if not ra:
            return None
        return float(ra)
    except Exception:
        return None

def call_friendly_with_time_budget(messages, max_tokens, temperature):
    """POST *messages* to the Friendli endpoint, retrying within a total time budget.

    Handles serverless cold starts (503 → wait and poll), transient HTTP errors
    (exponential backoff with jitter), and network timeouts, all bounded by
    COLD_START_BUDGET_SEC. Returns the first choice's message content, or the
    sentinel "[EMPTY_RESPONSE]" if the model returned nothing. Raises gr.Error
    when secrets are missing or the retry budget is exhausted.
    """
    # Fail fast with a user-visible error if any required secret is unset.
    if not FRIENDLI_API_KEY:
        raise gr.Error("Missing FRIENDLI_API_KEY (Secret).")
    if not FRIENDLI_ENDPOINT:
        raise gr.Error("Missing FRIENDLI_ENDPOINT (Secret).")
    if not FRIENDLI_MODEL_ID:
        raise gr.Error("Missing FRIENDLI_MODEL_ID (Secret).")

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {FRIENDLI_API_KEY}",
    }
    # OpenAI-style chat-completions payload; coerce slider values to int/float.
    payload = {
        "messages": messages,
        "model": FRIENDLI_MODEL_ID,
        "max_tokens": int(max_tokens),
        "temperature": float(temperature),
    }

    # One Session so retries reuse the same connection pool.
    session = requests.Session()
    start = time.monotonic()
    deadline = start + COLD_START_BUDGET_SEC
    attempt = 0
    # Tracks whether we've already applied the longer first-503 warmup wait.
    saw_first_503 = False

    while True:
        attempt += 1
        try:
            resp = session.post(
                FRIENDLI_ENDPOINT,
                headers=headers,
                json=payload,
                timeout=PER_REQUEST_TIMEOUT_SEC,
            )

            # 503: cold start. Honor Retry-After when given; otherwise use the
            # long initial wait once, then the shorter base backoff thereafter.
            if resp.status_code == 503:
                ra = _retry_after_seconds(resp)
                wait = ra if ra is not None else (INITIAL_503_WAIT_SEC if not saw_first_503 else BACKOFF_BASE_SEC)
                saw_first_503 = True
                # If waiting would overshoot the deadline, give up via HTTPError.
                # NOTE(review): raise_for_status() raises HTTPError, which is a
                # RequestException — so this lands in the except-clause below and
                # may retry once more while budget remains; confirm intended.
                if time.monotonic() + wait > deadline:
                    resp.raise_for_status()
                _sleep_with_budget(wait, deadline)
                continue

            # Other retryable statuses (rate limit / transient 5xx): capped
            # exponential backoff (exponent itself capped at 2**6) plus jitter.
            if resp.status_code in RETRYABLE_HTTP and time.monotonic() < deadline:
                exp = min(BACKOFF_CAP_SEC, BACKOFF_BASE_SEC * (2 ** min(6, attempt)))
                wait = exp + random.uniform(0, JITTER_SEC)
                _sleep_with_budget(wait, deadline)
                continue

            # Non-retryable status, or retryable with no budget left → raise.
            resp.raise_for_status()

            # Success path: pull choices[0].message.content defensively so a
            # missing key yields "" instead of KeyError/IndexError.
            data = resp.json()
            content = (
                data.get("choices", [{}])[0]
                    .get("message", {})
                    .get("content", "")
            )
            return content if content and str(content).strip() else "[EMPTY_RESPONSE]"

        except requests.exceptions.RequestException:
            # Network error / timeout / HTTPError: retry with backoff while
            # there is budget; otherwise surface a friendly UI error.
            if time.monotonic() < deadline:
                exp = min(BACKOFF_CAP_SEC, BACKOFF_BASE_SEC * (2 ** min(6, attempt)))
                wait = exp + random.uniform(0, JITTER_SEC)
                _sleep_with_budget(wait, deadline)
                continue
            raise gr.Error(
                f"Friendly API: retry budget exceeded after ~{COLD_START_BUDGET_SEC}s. "
                "Please try again; the model may have just finished warming."
            )

# ==============================
# Helpers: split "Analysis:" / "Response:" sections out of the model output
# ==============================
def parse_analysis_response(text: str):
    """Split *text* into (analysis, response) using the LAST marker occurrences.

    When no well-ordered "Analysis:" marker is found, the whole text is
    treated as the response and analysis is returned empty.
    """
    if not text:
        return "", ""
    analysis_at = text.rfind("Analysis:")
    response_at = text.rfind("Response:")
    # A usable "Analysis:" must exist and precede "Response:" (if any).
    well_ordered = analysis_at != -1 and (response_at == -1 or analysis_at < response_at)
    if not well_ordered:
        return "", text.strip()
    if response_at == -1:
        # Only an analysis section was produced.
        return text[analysis_at + len("Analysis:"):].strip(), ""
    return (
        text[analysis_at + len("Analysis:"):response_at].strip(),
        text[response_at + len("Response:"):].strip(),
    )

# ==============================
# UI
# ==============================
# Default demo inputs pre-filled into the Question / Context textboxes on load.
PRESET_Q = "What are the health effects of coffee?"
PRESET_CTX = (
    "Coffee contains caffeine, which can increase alertness. Excess intake may cause "
    "jitteriness and sleep disruption. Moderate consumption is considered safe for most adults."
)

# Gradio UI: inputs (question, context, sliders) on the left; Analysis /
# Response / Raw output accordions on the right.
with gr.Blocks(title="Humains-Junior (Humains.com) — Exoskeleton Reasoning") as demo:
    gr.Markdown(
        # Fixed garbled heading text: "Hosted by inference provided" → "Hosted by an inference provider".
        "# Humains-Junior by Humains.com — a Smart 3.8b Model + Exoskeleton Reasoning (Hosted by an inference provider)\n\n"
        "- **Model behavior**:\n"
        "  1. Outputs two plain-text sections: **Analysis** then **Response**.\n"
        "  2. When the **question is related to the Context**, it **prioritizes the Context** over internal knowledge, even if the Context is factually wrong.\n"
        "  3. If the **question is unrelated to the Context**, it **may answer normally** (not forced to follow the Context).\n"
    )

    with gr.Row():
        with gr.Column(scale=3):
            q = gr.Textbox(label="Question", value=PRESET_Q, lines=3)
            ctx = gr.Textbox(label="Context (only source of truth when related)", value=PRESET_CTX, lines=8)

            with gr.Row():
                temp = gr.Slider(0.0, 1.0, value=DEFAULT_TEMPERATURE, step=0.05, label="Temperature")
                max_new = gr.Slider(64, 4000, value=DEFAULT_MAX_TOKENS, step=32, label="Max tokens")

            run = gr.Button("Run", variant="primary")

        with gr.Column(scale=4):
            with gr.Accordion("Analysis", open=True):
                analysis_box = gr.Textbox(lines=8, label="Analysis (model)")
            with gr.Accordion("Response", open=True):
                response_box = gr.Textbox(lines=8, label="Response (model)")
            with gr.Accordion("Raw output", open=False):
                raw_box = gr.Textbox(lines=8, label="Raw text")

    def infer_fn(question, context, temperature, max_tokens):
        """Validate inputs, call the model with retry, and split the output.

        Returns (analysis, response, raw_text); empty strings (plus a UI
        warning) when either input is missing.
        """
        question = (question or "").strip()
        context = (context or "").strip()
        if not question or not context:
            gr.Warning("Please provide both a Question and a Context.")
            return "", "", ""

        messages = build_messages(question, context)
        text = call_friendly_with_time_budget(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        analysis, response = parse_analysis_response(text)
        return analysis, response, text

    run.click(fn=infer_fn, inputs=[q, ctx, temp, max_new], outputs=[analysis_box, response_box, raw_box])

# Script entry point: launch the Gradio server when run directly.
if __name__ == "__main__":
    demo.launch()