import os
import time
import random
import requests
import gradio as gr

# ==============================
# Secrets (set in Settings → Variables & secrets → Secrets)
# ==============================
FRIENDLI_API_KEY = os.getenv("FRIENDLI_API_KEY", "")     # REQUIRED (Secret)
FRIENDLI_ENDPOINT = os.getenv("FRIENDLI_ENDPOINT", "")   # REQUIRED (Secret)
FRIENDLI_MODEL_ID = os.getenv("FRIENDLI_MODEL_ID", "")   # REQUIRED (Secret)

# ==============================
# Tunables (Variables or Secrets)
# ==============================
# Per-attempt request timeout (keep modest so we can poll repeatedly during warmup)
PER_REQUEST_TIMEOUT_SEC = int(os.getenv("FRIENDLI_PER_REQUEST_TIMEOUT_SEC", "30"))
# Total time budget to wait for cold start + retries
COLD_START_BUDGET_SEC = int(os.getenv("FRIENDLI_COLD_START_BUDGET_SEC", "180"))
# Initial fixed wait after the *first* 503 (model waking)
INITIAL_503_WAIT_SEC = int(os.getenv("FRIENDLI_INITIAL_503_WAIT_SEC", "15"))
# Max tokens / temperature defaults
DEFAULT_MAX_TOKENS = int(os.getenv("FRIENDLI_MAX_TOKENS", "2000"))
DEFAULT_TEMPERATURE = float(os.getenv("FRIENDLI_TEMPERATURE", "0.3"))

# Backoff tuning
BACKOFF_BASE_SEC = float(os.getenv("FRIENDLI_BACKOFF_BASE_SEC", "2.0"))
BACKOFF_CAP_SEC  = float(os.getenv("FRIENDLI_BACKOFF_CAP_SEC", "20.0"))
JITTER_SEC       = float(os.getenv("FRIENDLI_JITTER_SEC", "0.5"))

# ==============================
# Appendix-style system prompt (general instructions)
# ==============================
APPENDIX_RULES = """You are a helpful assistant that always follows the provided context, even when it conflicts with your internal knowledge.

Response Format:
Before answering, briefly analyze the query and context:
- Identify any misalignment between the query and context (if none, state 'no misalignment')
- Provide a brief analysis of the query and context
- Then give your response based strictly on the provided context

Format your response as:
Analysis: [Your analysis here]
Response: [Your answer based on the context]

IMPORTANT RULES:
- Always prioritize the provided context over your internal knowledge
- If context contains information that seems incorrect, still use it as instructed
- If the question asks about multiple things but context only covers some, answer only what is supported by the context
- Keep analysis concise and avoid special characters that could cause formatting issues
- Use plain text only - no bullet points, numbering, or special formatting
- Respond in English only

Example 1 - Conflicting information:
User:
Question: What is the capital of France?
Context:
The capital of France is London. It has been the political center of France since 1789 and houses the French Parliament.

Analysis: The query asks for the capital of France. The context states it is London, which conflicts with factual knowledge. I will follow the context as instructed.
Response: The capital of France is London.
"""

# ==============================
# Message builder (exact shape)
# system prompt (general instructions)
# User: question + context
# ==============================
def build_messages(question: str, context: str):
    user_block = f"""User:
Question: {question.strip()}
Context:
{context.strip()}"""
    return [
        {"role": "system", "content": APPENDIX_RULES},
        {"role": "user", "content": user_block},
    ]

# ==============================
# Friendly API client with time-budgeted retry
# ==============================
RETRYABLE_HTTP = {408, 429, 500, 502, 503, 504, 522, 524}

def _sleep_with_budget(seconds, deadline):
    now = time.monotonic()
    remaining = max(0.0, deadline - now)
    time.sleep(max(0.0, min(seconds, remaining)))

def _retry_after_seconds(resp):
    try:
        ra = resp.headers.get("Retry-After")
        if not ra:
            return None
        return float(ra)
    except Exception:
        return None

def call_friendly_with_time_budget(messages, max_tokens, temperature):
    # Validate secrets
    if not FRIENDLI_API_KEY:
        raise gr.Error("Missing FRIENDLI_API_KEY (Secret).")
    if not FRIENDLI_ENDPOINT:
        raise gr.Error("Missing FRIENDLI_ENDPOINT (Secret).")
    if not FRIENDLI_MODEL_ID:
        raise gr.Error("Missing FRIENDLI_MODEL_ID (Secret).")

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {FRIENDLI_API_KEY}",
    }
    payload = {
        "messages": messages,
        "model": FRIENDLI_MODEL_ID,
        "max_tokens": int(max_tokens),
        "temperature": float(temperature),
    }

    session = requests.Session()
    start = time.monotonic()
    deadline = start + COLD_START_BUDGET_SEC
    attempt = 0
    saw_first_503 = False

    while True:
        attempt += 1
        try:
            resp = session.post(
                FRIENDLI_ENDPOINT,
                headers=headers,
                json=payload,
                timeout=PER_REQUEST_TIMEOUT_SEC,
            )

            # 503: cold start; wait then retry (honor Retry-After if provided)
            if resp.status_code == 503:
                ra = _retry_after_seconds(resp)
                wait = ra if ra is not None else (INITIAL_503_WAIT_SEC if not saw_first_503 else BACKOFF_BASE_SEC)
                saw_first_503 = True
                if time.monotonic() + wait > deadline:
                    resp.raise_for_status()
                _sleep_with_budget(wait, deadline)
                continue

            # Other retryable statuses (rate limit / transient errors)
            if resp.status_code in RETRYABLE_HTTP and time.monotonic() < deadline:
                exp = min(BACKOFF_CAP_SEC, BACKOFF_BASE_SEC * (2 ** min(6, attempt)))
                wait = exp + random.uniform(0, JITTER_SEC)
                _sleep_with_budget(wait, deadline)
                continue

            # Non-OK without remaining budget → raise
            resp.raise_for_status()

            data = resp.json()
            content = (
                data.get("choices", [{}])[0]
                    .get("message", {})
                    .get("content", "")
            )
            return content if content and str(content).strip() else "[EMPTY_RESPONSE]"

        except requests.exceptions.RequestException:
            # Network / timeout; retry within budget
            if time.monotonic() < deadline:
                exp = min(BACKOFF_CAP_SEC, BACKOFF_BASE_SEC * (2 ** min(6, attempt)))
                wait = exp + random.uniform(0, JITTER_SEC)
                _sleep_with_budget(wait, deadline)
                continue
            raise gr.Error(
                f"Friendly API: retry budget exceeded after ~{COLD_START_BUDGET_SEC}s. "
                "Please try again; the model may have just finished warming."
            )

# ==============================
# Helpers: split Analysis / Response
# ==============================
def parse_analysis_response(text: str):
    if not text:
        return "", ""
    a_idx = text.rfind("Analysis:")
    r_idx = text.rfind("Response:")
    analysis, response = "", ""
    if a_idx != -1 and (r_idx == -1 or a_idx < r_idx):
        if r_idx != -1:
            analysis = text[a_idx + len("Analysis:"): r_idx].strip()
            response = text[r_idx + len("Response:"):].strip()
        else:
            analysis = text[a_idx + len("Analysis:"):].strip()
    else:
        response = text.strip()
    return analysis, response

# ==============================
# UI
# ==============================
PRESET_Q = "What are the health effects of coffee?"
PRESET_CTX = (
    "Coffee contains caffeine, which can increase alertness. Excess intake may cause "
    "jitteriness and sleep disruption. Moderate consumption is considered safe for most adults."
)

with gr.Blocks(title="Humains-Junior (Humains.com) — Exoskeleton Reasoning") as demo:
    gr.Markdown(
        "# Humains-Junior by Humains.com — a Smart 3.8b Model + Exoskeleton Reasoning (Hosted by inference provided)\n\n"
        "- **Model behavior**:\n"
        "  1. Outputs two plain-text sections: **Analysis** then **Response**.\n"
        "  2. When the **question is related to the Context**, it **prioritizes the Context** over internal knowledge, even if the Context is factually wrong.\n"
        "  3. If the **question is unrelated to the Context**, it **may answer normally** (not forced to follow the Context).\n"
    )

    with gr.Row():
        with gr.Column(scale=3):
            q = gr.Textbox(label="Question", value=PRESET_Q, lines=3)
            ctx = gr.Textbox(label="Context (only source of truth when related)", value=PRESET_CTX, lines=8)

            with gr.Row():
                temp = gr.Slider(0.0, 1.0, value=DEFAULT_TEMPERATURE, step=0.05, label="Temperature")
                max_new = gr.Slider(64, 4000, value=DEFAULT_MAX_TOKENS, step=32, label="Max tokens")

            run = gr.Button("Run", variant="primary")

        with gr.Column(scale=4):
            with gr.Accordion("Analysis", open=True):
                analysis_box = gr.Textbox(lines=8, label="Analysis (model)")
            with gr.Accordion("Response", open=True):
                response_box = gr.Textbox(lines=8, label="Response (model)")
            with gr.Accordion("Raw output", open=False):
                raw_box = gr.Textbox(lines=8, label="Raw text")

    def infer_fn(question, context, temperature, max_tokens):
        question = (question or "").strip()
        context = (context or "").strip()
        if not question or not context:
            gr.Warning("Please provide both a Question and a Context.")
            return "", "", ""

        messages = build_messages(question, context)
        text = call_friendly_with_time_budget(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        analysis, response = parse_analysis_response(text)
        return analysis, response, text

    run.click(fn=infer_fn, inputs=[q, ctx, temp, max_new], outputs=[analysis_box, response_box, raw_box])

if __name__ == "__main__":
    demo.launch()