Spaces:
Paused
Paused
| """ | |
| llm_api.py — Unified Gemini client shared by team_context and PR-body | |
| generation paths. | |
| Used in two places: | |
| - team_context.extract_team_context: corpus-format summarization of | |
| GitHub commits. The 2B base hallucinates on this task (tested 2026- | |
| 05-25), so Gemini is the floor for trustworthy extraction. | |
| - PR body generation when MHPD_GENERATION_BACKEND=gemini (alternate | |
| to the local 2B path). Tradeoff is faster UX + more Zero-GPU quota | |
| at the cost of ~$0.001 per request. | |
| Configured by env vars: | |
| GEMINI_API_KEY required | |
| GEMINI_MODEL default gemini-3.1-flash-lite | |
| """ | |
| from __future__ import annotations | |
| import os | |
| from typing import Optional | |
| import google.generativeai as genai | |
# Default model selection, from empirical testing on 2026-05-25:
#   - gemini-3.1-flash-lite (chosen): non-thinking variant, fast (~3s for
#     our 800-token prompts), produces well-structured corpus-format
#     output grounded in the provided commits. Quality validated against
#     4 corpus repos.
#   - gemini-3.5-flash (rejected): outputs truncated by thinking-mode
#     token consumption; all 800 max_output_tokens were spent on internal
#     reasoning and ~0 were emitted.
#   - gemini-2.0-flash-exp (rejected): deprecated, returns 404.
_DEFAULT_GEMINI_MODEL = "gemini-3.1-flash-lite"

# Read once at import time; set GEMINI_MODEL in the environment to override.
GEMINI_MODEL = os.environ.get("GEMINI_MODEL", _DEFAULT_GEMINI_MODEL)

# Lazily populated by _ensure_configured(); nothing is configured or
# constructed at import time.
_model_cache: Optional[genai.GenerativeModel] = None
_configured = False
def _ensure_configured() -> genai.GenerativeModel:
    """Return the shared Gemini model, configuring the SDK on first use.

    The first successful call reads GEMINI_API_KEY from the environment and
    passes it to ``genai.configure``; later calls skip that step. The model
    object is built from the module-level GEMINI_MODEL once and then reused.

    Returns:
        The cached ``genai.GenerativeModel`` instance.

    Raises:
        RuntimeError: If GEMINI_API_KEY is unset or empty. Nothing is marked
            as configured in that case, so the check runs again on the next
            call.
    """
    global _configured, _model_cache

    if not _configured:
        key = os.environ.get("GEMINI_API_KEY")
        if key:
            genai.configure(api_key=key)
            _configured = True
        else:
            missing_key_message = (
                "GEMINI_API_KEY is required. Set it as a Space secret or "
                "export it locally. Alternatively, set "
                "MHPD_GENERATION_BACKEND=local and ensure team_context's "
                "heuristic fallback is acceptable."
            )
            raise RuntimeError(missing_key_message)

    # Build the model lazily and keep it for every subsequent call.
    if _model_cache is None:
        _model_cache = genai.GenerativeModel(GEMINI_MODEL)
    return _model_cache
def gemini_call(prompt: str, *, max_output_tokens: int = 800,
                temperature: float = 0.2) -> str:
    """Run one single-turn Gemini generation and return its text.

    No errors are caught here: configuration and API failures propagate so
    that callers can decide whether to retry or fall back.

    Args:
        prompt: Text passed unchanged to ``generate_content``.
        max_output_tokens: Forwarded as ``max_output_tokens`` in the
            generation config.
        temperature: Forwarded as ``temperature`` in the generation config.

    Returns:
        The response's ``text`` with surrounding whitespace stripped, or
        ``""`` when that text is empty or ``None``.

    Raises:
        RuntimeError: From ``_ensure_configured`` when GEMINI_API_KEY is
            missing.
    """
    client = _ensure_configured()
    generation_config = {
        "temperature": temperature,
        "max_output_tokens": max_output_tokens,
    }
    response = client.generate_content(
        prompt, generation_config=generation_config
    )
    # NOTE(review): the SDK's ``.text`` accessor may raise (rather than
    # return an empty value) when the response carries no text part, e.g.
    # a blocked or fully truncated candidate -- confirm against the SDK docs.
    text = response.text
    if not text:
        return ""
    return text.strip()
def gemini_generate_pr_body(team_ctx: str, paper_text: str) -> str:
    """Draft a pull-request body for integrating a paper, via Gemini Flash.

    Same task as model_io._generate_pr_body_local but via Gemini Flash.
    Activated by MHPD_GENERATION_BACKEND=gemini. ~$0.0001-$0.001 per call,
    ~2-3s latency, fully avoids burning GPU time on free-text generation
    (which the 2B base does but slowly, ~3-6s on GPU).

    Args:
        team_ctx: Team-context text; only its first 1500 characters are
            embedded in the prompt.
        paper_text: Paper text; only its first 1800 characters are embedded
            in the prompt.

    Returns:
        The text returned by ``gemini_call`` for the assembled prompt
        (requested with max_output_tokens=700, temperature=0.3).

    Raises:
        Anything ``gemini_call`` raises; nothing is caught here.
    """
    # Truncate both free-text inputs so the prompt size stays bounded.
    team_excerpt = team_ctx[:1500]
    paper_excerpt = paper_text[:1800]

    # The two em dashes below replace a mis-encoded character that was
    # previously sent to the model as a stray Greek letter.
    prompt = (
        "You are a software engineer integrating a research paper into your "
        "team's codebase. Write a concise pull request body proposing the "
        "integration. Be specific and technical — your output will be handed "
        "directly to a coding agent for implementation.\n\n"
        f"Team context:\n{team_excerpt}\n\n"
        f"Paper:\n{paper_excerpt}\n\n"
        "Use this structure exactly:\n\n"
        "## Summary\n(1-2 sentences: what does this PR do?)\n\n"
        "## Motivation\n(why this paper fits the team's direction — reference "
        "specific commits or themes from the team context)\n\n"
        "## Implementation plan\n(3-5 concrete bullet points a coding agent "
        "would follow; include file paths or module names where you can infer them)\n\n"
        "## Open questions\n(1-2 items genuinely worth clarifying before merge)"
    )
    return gemini_call(prompt, max_output_tokens=700, temperature=0.3)