Feature-Finder / llm_api.py
salma-remyx's picture
Replace with MHPD paper recommender (v1.2 backend) to reuse Zero-GPU allocation
f4145b9 verified
"""
llm_api.py — Unified Gemini client shared by team_context and PR-body
generation paths.
Used in two places:
- team_context.extract_team_context: corpus-format summarization of
GitHub commits. The 2B base hallucinates on this task (tested 2026-
05-25), so Gemini is the floor for trustworthy extraction.
- PR body generation when MHPD_GENERATION_BACKEND=gemini (alternate
to the local 2B path). Tradeoff is faster UX + more Zero-GPU quota
at the cost of ~$0.001 per request.
Configured by env vars:
GEMINI_API_KEY required
GEMINI_MODEL default gemini-3.1-flash-lite
"""
from __future__ import annotations
import os
from typing import Optional
import google.generativeai as genai
# gemini-3.1-flash-lite chosen empirically (2026-05-25 testing) over
# gemini-3.5-flash and gemini-2.0-flash-exp:
# - 3.5-flash: outputs truncated by thinking-mode token consumption;
# all 800 max_output_tokens spent on internal reasoning, ~0 emitted.
# - 2.0-flash-exp: deprecated, returns 404.
# - 3.1-flash-lite: non-thinking variant, fast (~3s for our 800-token
# prompts), produces well-structured corpus-format output grounded
# in the provided commits. Quality validated against 4 corpus repos.
# Model name handed to genai.GenerativeModel. Read from the environment once,
# at import time; falls back to the default below when GEMINI_MODEL is unset.
GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-3.1-flash-lite")
# Set to True by _ensure_configured() after genai.configure() has been called
# with the API key, so configuration happens at most once per process.
_configured = False
# Lazily created model instance; built on first use by _ensure_configured()
# and reused by every later call.
_model_cache: Optional[genai.GenerativeModel] = None
def _ensure_configured() -> genai.GenerativeModel:
    """Return the shared Gemini model, configuring the SDK on first use.

    On the first call the API key is read from the GEMINI_API_KEY
    environment variable and passed to ``genai.configure``. The model
    object for ``GEMINI_MODEL`` is then created once and cached in a
    module-level variable, so later calls return the same instance.

    Returns:
        The cached ``genai.GenerativeModel`` instance.

    Raises:
        RuntimeError: if the SDK has not been configured yet and
            GEMINI_API_KEY is unset or empty.
    """
    global _configured, _model_cache

    if not _configured:
        key = os.environ.get("GEMINI_API_KEY")
        if key:
            genai.configure(api_key=key)
            # Only mark as configured after configure() returned, so a
            # failure here is retried on the next call.
            _configured = True
        else:
            missing_key_msg = (
                "GEMINI_API_KEY is required. Set it as a Space secret or "
                "export it locally. Alternatively, set MHPD_GENERATION_BACKEND=local "
                "and ensure team_context's heuristic fallback is acceptable."
            )
            raise RuntimeError(missing_key_msg)

    model = _model_cache
    if model is None:
        # First use: build the model and remember it for subsequent calls.
        model = genai.GenerativeModel(GEMINI_MODEL)
        _model_cache = model
    return model
def gemini_call(prompt: str, *, max_output_tokens: int = 800,
                temperature: float = 0.2) -> str:
    """Run one single-turn generation request and return the stripped text.

    No exceptions are caught here: anything raised while configuring the
    client or while calling the API propagates, leaving retry / fallback
    policy to the caller.

    Args:
        prompt: Text sent to the model as the sole content of the request.
        max_output_tokens: Forwarded as ``max_output_tokens`` in the
            generation config.
        temperature: Forwarded as ``temperature`` in the generation config.

    Returns:
        The response text with surrounding whitespace removed, or ``""``
        when the response text is empty or ``None``.
    """
    generation_config = {
        "temperature": temperature,
        "max_output_tokens": max_output_tokens,
    }
    response = _ensure_configured().generate_content(
        prompt,
        generation_config=generation_config,
    )
    # NOTE(review): the SDK's `.text` accessor may raise rather than return
    # None when the response carries no usable candidate — confirm against
    # the SDK docs; if so, that exception propagates like any other.
    text = response.text
    if not text:
        return ""
    return text.strip()
def gemini_generate_pr_body(team_ctx: str, paper_text: str) -> str:
    """Generate a structured pull-request body via Gemini.

    Same task as model_io._generate_pr_body_local but via Gemini Flash.
    Activated by MHPD_GENERATION_BACKEND=gemini. ~$0.0001-$0.001 per call,
    ~2-3s latency, fully avoids burning GPU time on free-text generation
    (which the 2B base does but slowly, ~3-6s on GPU).

    Args:
        team_ctx: Team-context text; only the first 1500 characters are
            placed in the prompt.
        paper_text: Paper text; only the first 1800 characters are placed
            in the prompt.

    Returns:
        The model's response text as returned by ``gemini_call`` (called
        with ``max_output_tokens=700`` and ``temperature=0.3``).

    Raises:
        Whatever ``gemini_call`` raises; nothing is caught here.
    """
    # The em dashes below were previously stored as the mis-decoded sequence
    # "β€”", which put garbage characters into the instructions sent to the
    # model. They are restored to the intended "—".
    prompt = (
        "You are a software engineer integrating a research paper into your "
        "team's codebase. Write a concise pull request body proposing the "
        "integration. Be specific and technical — your output will be handed "
        "directly to a coding agent for implementation.\n\n"
        # Inputs are truncated by character count to keep the prompt bounded.
        f"Team context:\n{team_ctx[:1500]}\n\n"
        f"Paper:\n{paper_text[:1800]}\n\n"
        "Use this structure exactly:\n\n"
        "## Summary\n(1-2 sentences: what does this PR do?)\n\n"
        "## Motivation\n(why this paper fits the team's direction — reference "
        "specific commits or themes from the team context)\n\n"
        "## Implementation plan\n(3-5 concrete bullet points a coding agent "
        "would follow; include file paths or module names where you can infer them)\n\n"
        "## Open questions\n(1-2 items genuinely worth clarifying before merge)"
    )
    return gemini_call(prompt, max_output_tokens=700, temperature=0.3)