"""
llm_api.py — Unified Gemini client shared by team_context and PR-body
generation paths.

Used in two places:
  - team_context.extract_team_context: corpus-format summarization of
    GitHub commits. The 2B base hallucinates on this task (tested
    2026-05-25), so Gemini is the floor for trustworthy extraction.
  - PR body generation when MHPD_GENERATION_BACKEND=gemini (alternate to
    the local 2B path). Tradeoff is faster UX + more Zero-GPU quota at
    the cost of ~$0.001 per request.

Configured by env vars:
  GEMINI_API_KEY   required
  GEMINI_MODEL     default gemini-3.1-flash-lite
"""
from __future__ import annotations

import os
from typing import Optional

import google.generativeai as genai

# gemini-3.1-flash-lite chosen empirically (2026-05-25 testing) over
# gemini-3.5-flash and gemini-2.0-flash-exp:
#   - 3.5-flash: outputs truncated by thinking-mode token consumption;
#     all 800 max_output_tokens spent on internal reasoning, ~0 emitted.
#   - 2.0-flash-exp: deprecated, returns 404.
#   - 3.1-flash-lite: non-thinking variant, fast (~3s for our 800-token
#     prompts), produces well-structured corpus-format output grounded
#     in the provided commits. Quality validated against 4 corpus repos.
#
# `or` (rather than a .get() default) so that a GEMINI_MODEL variable that
# is set but blank also falls back to the default, the same way a blank
# GEMINI_API_KEY is treated as missing in _ensure_configured().
GEMINI_MODEL = os.environ.get("GEMINI_MODEL") or "gemini-3.1-flash-lite"

# Process-wide lazy state: the SDK is configured and the model handle is
# built on first use, not at import time.
_configured = False
_model_cache: Optional[genai.GenerativeModel] = None


def _ensure_configured() -> genai.GenerativeModel:
    """Return the shared model handle, configuring the SDK on first use.

    On the first call the API key is read from GEMINI_API_KEY and handed to
    ``genai.configure``; the ``GenerativeModel`` for ``GEMINI_MODEL`` is then
    built once and cached in a module global for later calls.

    Returns:
        The cached ``genai.GenerativeModel`` instance.

    Raises:
        RuntimeError: if GEMINI_API_KEY is unset or empty.
    """
    global _configured, _model_cache
    if not _configured:
        api_key = os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise RuntimeError(
                "GEMINI_API_KEY is required. Set it as a Space secret or "
                "export it locally. Alternatively, set MHPD_GENERATION_BACKEND=local "
                "and ensure team_context's heuristic fallback is acceptable."
            )
        genai.configure(api_key=api_key)
        _configured = True
    if _model_cache is None:
        _model_cache = genai.GenerativeModel(GEMINI_MODEL)
    return _model_cache


def gemini_call(prompt: str, *, max_output_tokens: int = 800, temperature: float = 0.2) -> str:
    """Single-turn generation.

    Raises on configuration / API failure; callers decide whether to retry
    or fall back.

    Args:
        prompt: Full prompt text sent as a single user turn.
        max_output_tokens: Forwarded as ``max_output_tokens`` in the
            request's generation config.
        temperature: Forwarded as ``temperature`` in the request's
            generation config.

    Returns:
        The response text with surrounding whitespace stripped, or ``""``
        when the response text is empty/falsy.

    Raises:
        RuntimeError: if GEMINI_API_KEY is not configured.
        Exception: anything raised by the SDK call is propagated unchanged.
    """
    model = _ensure_configured()
    resp = model.generate_content(
        prompt,
        generation_config={
            "temperature": temperature,
            "max_output_tokens": max_output_tokens,
        },
    )
    # NOTE(review): the SDK's `.text` accessor may itself raise when the
    # response carries no text part (e.g. a blocked prompt) — confirm; if
    # so, that also surfaces to the caller as documented above.
    return (resp.text or "").strip()


def gemini_generate_pr_body(team_ctx: str, paper_text: str) -> str:
    """Same task as model_io._generate_pr_body_local but via Gemini Flash.

    Activated by MHPD_GENERATION_BACKEND=gemini. ~$0.0001-$0.001 per call,
    ~2-3s latency, fully avoids burning GPU time on free-text generation
    (which the 2B base does but slowly, ~3-6s on GPU).

    Args:
        team_ctx: Team context text; only the first 1500 characters are
            included in the prompt.
        paper_text: Paper text; only the first 1800 characters are
            included in the prompt.

    Returns:
        The generated PR body as returned by ``gemini_call`` (stripped
        text), requested with ``max_output_tokens=700`` and
        ``temperature=0.3``.

    Raises:
        Whatever ``gemini_call`` raises (configuration / API failure).
    """
    prompt = (
        "You are a software engineer integrating a research paper into your "
        "team's codebase. Write a concise pull request body proposing the "
        "integration. Be specific and technical — your output will be handed "
        "directly to a coding agent for implementation.\n\n"
        # Inputs are hard-truncated so the prompt size stays bounded.
        f"Team context:\n{team_ctx[:1500]}\n\n"
        f"Paper:\n{paper_text[:1800]}\n\n"
        "Use this structure exactly:\n\n"
        "## Summary\n(1-2 sentences: what does this PR do?)\n\n"
        "## Motivation\n(why this paper fits the team's direction — reference "
        "specific commits or themes from the team context)\n\n"
        "## Implementation plan\n(3-5 concrete bullet points a coding agent "
        "would follow; include file paths or module names where you can infer them)\n\n"
        "## Open questions\n(1-2 items genuinely worth clarifying before merge)"
    )
    return gemini_call(prompt, max_output_tokens=700, temperature=0.3)