"""
llm_api.py — Unified Gemini client shared by team_context and PR-body
generation paths.

Used in two places:
  - team_context.extract_team_context: corpus-format summarization of
    GitHub commits. The 2B base hallucinates on this task (tested
    2026-05-25), so Gemini is the floor for trustworthy extraction.
  - PR body generation when MHPD_GENERATION_BACKEND=gemini (alternate
    to the local 2B path). Tradeoff is faster UX + more Zero-GPU quota
    at the cost of ~$0.001 per request.

Configured by env vars:
  GEMINI_API_KEY        required
  GEMINI_MODEL          default gemini-3.1-flash-lite
"""
from __future__ import annotations

import os
from typing import Optional

import google.generativeai as genai

# gemini-3.1-flash-lite chosen empirically (2026-05-25 testing) over
# gemini-3.5-flash and gemini-2.0-flash-exp:
#   - 3.5-flash: outputs truncated by thinking-mode token consumption;
#     all 800 max_output_tokens spent on internal reasoning, ~0 emitted.
#   - 2.0-flash-exp: deprecated, returns 404.
#   - 3.1-flash-lite: non-thinking variant, fast (~3s for our 800-token
#     prompts), produces well-structured corpus-format output grounded
#     in the provided commits. Quality validated against 4 corpus repos.
# Model name used for every request; the GEMINI_MODEL env var overrides
# the default. Read once, at import time.
GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-3.1-flash-lite")

# Lazily-built model handle, populated by _ensure_configured() on first use.
_model_cache: Optional[genai.GenerativeModel] = None
# Flips to True once genai.configure() has been called with an API key.
_configured = False


def _ensure_configured() -> genai.GenerativeModel:
    """Return the shared GenerativeModel, configuring the SDK on first use.

    The first call reads GEMINI_API_KEY from the environment, passes it to
    ``genai.configure`` and builds a ``genai.GenerativeModel`` for
    ``GEMINI_MODEL``. Both steps are remembered in module globals, so later
    calls return the cached model without repeating them.

    Returns:
        The cached ``genai.GenerativeModel`` instance.

    Raises:
        RuntimeError: If GEMINI_API_KEY is unset or empty.
    """
    global _configured, _model_cache

    # Fast path: SDK already configured and the model already built.
    if _configured and _model_cache is not None:
        return _model_cache

    if not _configured:
        key = os.environ.get("GEMINI_API_KEY")
        if not key:
            message = (
                "GEMINI_API_KEY is required. Set it as a Space secret or "
                "export it locally. Alternatively, set MHPD_GENERATION_BACKEND=local "
                "and ensure team_context's heuristic fallback is acceptable."
            )
            raise RuntimeError(message)
        genai.configure(api_key=key)
        # Only mark as configured after configure() returned without raising.
        _configured = True

    if _model_cache is None:
        _model_cache = genai.GenerativeModel(GEMINI_MODEL)
    return _model_cache


def gemini_call(prompt: str, *, max_output_tokens: int = 800,
                temperature: float = 0.2) -> str:
    """Run one single-turn Gemini generation and return the stripped text.

    No retry or fallback happens here: configuration and API errors
    propagate so that callers can decide how to react.

    Args:
        prompt: Prompt text sent to the model as-is.
        max_output_tokens: Forwarded as the ``max_output_tokens``
            generation setting.
        temperature: Forwarded as the ``temperature`` generation setting.

    Returns:
        The response text with surrounding whitespace removed, or ``""``
        when the response text is empty or ``None``.

    Raises:
        RuntimeError: From ``_ensure_configured`` when GEMINI_API_KEY is
            missing.
    """
    generation_config = {
        "temperature": temperature,
        "max_output_tokens": max_output_tokens,
    }
    response = _ensure_configured().generate_content(
        prompt,
        generation_config=generation_config,
    )
    # NOTE(review): the SDK's `.text` accessor may itself raise when the
    # response carries no text parts (e.g. blocked or fully truncated
    # output) — confirm callers treat that as an API failure.
    text = response.text
    if not text:
        return ""
    return text.strip()


def gemini_generate_pr_body(team_ctx: str, paper_text: str) -> str:
    """Draft a structured pull-request body for a paper via Gemini.

    Gemini-backed counterpart of model_io._generate_pr_body_local, used
    when MHPD_GENERATION_BACKEND=gemini. Costs roughly $0.0001-$0.001 per
    call at ~2-3s latency and spends no GPU time on free-text generation
    (the 2B base can do it, but slowly: ~3-6s on GPU).

    Args:
        team_ctx: Team context text; only the first 1500 characters are
            placed in the prompt.
        paper_text: Paper text; only the first 1800 characters are placed
            in the prompt.

    Returns:
        The stripped model output from ``gemini_call``, requested with
        ``max_output_tokens=700`` and ``temperature=0.3``.
    """
    # Truncate both inputs before they are embedded in the prompt.
    context_excerpt = team_ctx[:1500]
    paper_excerpt = paper_text[:1800]

    task_description = (
        "You are a software engineer integrating a research paper into your "
        "team's codebase. Write a concise pull request body proposing the "
        "integration. Be specific and technical — your output will be handed "
        "directly to a coding agent for implementation."
    )

    # Each entry is one paragraph of the prompt; paragraphs are separated
    # by exactly one blank line when joined below.
    sections = [
        task_description,
        f"Team context:\n{context_excerpt}",
        f"Paper:\n{paper_excerpt}",
        "Use this structure exactly:",
        "## Summary\n(1-2 sentences: what does this PR do?)",
        "## Motivation\n(why this paper fits the team's direction — reference "
        "specific commits or themes from the team context)",
        "## Implementation plan\n(3-5 concrete bullet points a coding agent "
        "would follow; include file paths or module names where you can infer them)",
        "## Open questions\n(1-2 items genuinely worth clarifying before merge)",
    ]
    prompt = "\n\n".join(sections)

    return gemini_call(prompt, max_output_tokens=700, temperature=0.3)