# SchemeImpactNet — frontend/utils/gemini_utils.py
# NOTE(review): vendored from a Hugging Face Space (commit 90bfc25, "genai inclusion");
# the original page header was scraped into the file and has been turned into this comment.
"""
utils/gemini_utils.py
---------------------
Shared Gemini setup, data context builder, and model caller.
Used by gemini_insights.py, home.py, and every page's AI summary widget.
"""
import json
import streamlit as st
import google.generativeai as genai
import re
def strip_markdown(text: str) -> str:
    """Strip common Markdown formatting from Gemini output, returning plain text.

    Removes bold/italic markers, ATX headings, bullet and numbered list
    prefixes, inline code, ``[text](url)`` links, and collapses runs of
    three or more newlines into a single blank line.

    Parameters
    ----------
    text : str
        Raw model output, possibly containing Markdown.

    Returns
    -------
    str
        The same text with formatting markers removed, stripped of
        leading/trailing whitespace.
    """
    text = re.sub(r'\[([^\]]+)\]\([^)]*\)', r'\1', text)            # [text](url) -> text
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)                    # **bold**
    text = re.sub(r'\*(.+?)\*', r'\1', text)                        # *italic*
    text = re.sub(r'__(.+?)__', r'\1', text)                        # __bold__
    # Word-boundary guards so snake_case identifiers the model echoes back
    # (e.g. persondays_per_lakh) are not mangled as _italic_ markers.
    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)             # _italic_
    text = re.sub(r'^\s*#{1,6}\s+', '', text, flags=re.MULTILINE)   # headings
    text = re.sub(r'^\s*[-*β€’]\s+', '', text, flags=re.MULTILINE)   # bullets
    text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)    # numbered lists
    text = re.sub(r'`(.+?)`', r'\1', text)                          # inline code
    text = re.sub(r'\n{3,}', '\n\n', text)                          # collapse blank lines
    return text.strip()
from utils.api_client import (
fetch_stats, fetch_predictions, fetch_optimizer_results, fetch_yearly_trend,
)
# Single Gemini model used by every call site (insights page + page summaries).
MODEL_NAME = "gemini-2.5-flash-lite"

# ── Preset questions (used by insights page) ──────────────────────────────────
# Each entry: `label` shown to the user, `key` looked up by preset_prompt()
# to pick the full prompt text, `icon` rendered next to the label in the UI.
PRESET_QUESTIONS = [
    {
        "label": "Which districts are predicted to see the steepest employment decline?",
        "key": "declining",
        "icon": "πŸ“‰",
    },
    {
        "label": "Which districts offer the best return on additional budget investment?",
        "key": "roi",
        "icon": "πŸ’°",
    },
    {
        "label": "What does the model predict for national employment in the next cycle?",
        "key": "forecast",
        "icon": "πŸ”­",
    },
    {
        "label": "Which states should be prioritised for budget reallocation and why?",
        "key": "realloc",
        "icon": "βš–οΈ",
    },
    {
        "label": "What is the predicted COVID recovery trajectory across districts?",
        "key": "covid",
        "icon": "🦠",
    },
    {
        "label": "Which districts are most underfunded relative to their predicted demand?",
        "key": "underfunded",
        "icon": "🚨",
    },
    {
        "label": "What are the top 5 efficiency leaders and what can we learn from them?",
        "key": "efficiency",
        "icon": "πŸ†",
    },
    {
        "label": "Summarise the overall model prediction results in plain language.",
        "key": "summary",
        "icon": "πŸ“‹",
    },
]

# ── Per-page summary prompts ──────────────────────────────────────────────────
# Keyed by page slug; consumed by page_summary_prompt(). Unknown slugs fall
# back to a generic instruction there.
PAGE_SUMMARY_PROMPTS = {
    "overview": "In 3–4 sentences, summarise the key takeaways from the national MNREGA employment trend data shown. Focus on the most important patterns, anomalies, and what they imply for policy. Use specific numbers.",
    "districts": "In 3–4 sentences, give a sharp analytical summary of this district's MNREGA performance trajectory. What is the trend, how did COVID affect it, and what does the model predict? Be specific.",
    "predictions": "In 3–4 sentences, summarise what the model predictions reveal. Comment on accuracy, any notable over/under-predictions, and what the forecasts imply for the next cycle.",
    "optimizer": "In 3–4 sentences, explain the budget optimiser results in plain language. What is the headline gain, which districts benefit most, and is the reallocation realistic to implement?",
    "insights": "In 3–4 sentences, provide a crisp executive summary of the strategic insights. What are the 2–3 most urgent actions a policymaker should take based on this data?",
    "spatial": "In 3–4 sentences, describe what the spatial distribution of predicted employment reveals. Are there regional clusters of high or low performance? What geographic patterns stand out?",
}
def get_gemini_key() -> str:
    """Return the Gemini API key stored in Streamlit session state.

    The key is entered once in the sidebar and kept under the
    ``"gemini_api_key"`` session-state key. Returns an empty string when no
    key has been set — never ``None``, since the lookup default is ``""``
    (the previous ``str | None`` annotation was unreachable).
    """
    return st.session_state.get("gemini_api_key", "")
def configure_gemini(api_key: str):
    """Configure the google-generativeai SDK with *api_key* and return a
    fresh ``GenerativeModel`` for ``MODEL_NAME``.

    Note: ``genai.configure`` sets process-global SDK state, so the last key
    passed wins for all subsequent calls.
    """
    genai.configure(api_key=api_key)
    return genai.GenerativeModel(MODEL_NAME)
@st.cache_data(ttl=300, show_spinner=False)  # cache 5 min: avoids re-fetching on every Streamlit rerun
def build_context(state_param: str | None) -> dict:
    """Build a structured data context dict from live API data.

    Parameters
    ----------
    state_param : str | None
        State name to scope every API query to; ``None`` means All India.

    Returns
    -------
    dict
        ``scope`` and ``overview`` are always present; ``yearly_trend``,
        ``model``/``predictions``, and ``optimizer`` appear only when the
        corresponding DataFrame is non-empty. The dict is serialised into
        the Gemini prompt by base_prompt().
    """
    # Live fetches from the backend API (project client — returns dict / DataFrames).
    stats = fetch_stats()
    pred_df = fetch_predictions(state=state_param)
    opt_df = fetch_optimizer_results(state=state_param)
    trend_df = fetch_yearly_trend(state=state_param)
    ctx: dict = {}
    ctx["scope"] = state_param or "All India"
    # Headline statistics passed through verbatim from the stats endpoint.
    ctx["overview"] = {
        "total_districts": stats.get("total_districts"),
        "total_states": stats.get("total_states"),
        "year_range": stats.get("year_range"),
        "total_persondays_lakhs": round(stats.get("total_persondays_lakhs", 0), 1),
        "covid_spike_pct": stats.get("covid_spike_pct"),
    }
    if not trend_df.empty:
        ctx["yearly_trend"] = (
            trend_df[["financial_year","total_persondays","avg_wage"]]
            .round(2).to_dict(orient="records")
        )
    if not pred_df.empty:
        # Compare the latest predicted year against the year immediately before it.
        ly = int(pred_df["financial_year"].max())
        prv = ly - 1
        lat = pred_df[pred_df["financial_year"] == ly]
        prv_df = pred_df[pred_df["financial_year"] == prv]
        # Hard-coded model metadata — presumably from offline evaluation;
        # TODO confirm it stays in sync with the training pipeline.
        ctx["model"] = {
            "algorithm": "GradientBoostingRegressor",
            "latest_predicted_year": ly,
            "walk_forward_r2": 0.91,
            "note": "2022 West Bengal anomaly excluded from CV",
        }
        if not prv_df.empty:
            # Inner-join latest predictions to previous-year actuals per
            # (state, district) to compute absolute and percentage change.
            mg = lat.merge(
                prv_df[["state","district","person_days_lakhs"]]
                .rename(columns={"person_days_lakhs":"prev"}),
                on=["state","district"], how="inner",
            )
            mg["chg"] = (mg["predicted_persondays"] - mg["prev"]).round(2)
            mg["chg_pct"] = (mg["chg"] / mg["prev"] * 100).round(1)
            ctx["predictions"] = {
                "n_improving": int((mg["chg"] >= 0).sum()),
                "n_declining": int((mg["chg"] < 0).sum()),
                "top_improving": mg.nlargest(5, "chg")[
                    ["state","district","prev","predicted_persondays","chg","chg_pct"]
                ].to_dict(orient="records"),
                "top_declining": mg.nsmallest(5, "chg")[
                    ["state","district","prev","predicted_persondays","chg","chg_pct"]
                ].to_dict(orient="records"),
                "national_predicted_total": round(float(lat["predicted_persondays"].sum()), 1),
                "national_actual_prev": round(float(prv_df["person_days_lakhs"].sum()), 1),
            }
    if not opt_df.empty and "persondays_gain" in opt_df.columns:
        sq = float(opt_df["sq_persondays"].sum())
        gain = float(opt_df["persondays_gain"].sum())
        ctx["optimizer"] = {
            # NOTE(review): when "budget_allocated_lakhs" is absent this
            # falls back to summing sq_persondays — person-days, not a
            # budget in rupee lakhs; verify that fallback is intended.
            "total_budget_lakhs": round(float(opt_df.get("budget_allocated_lakhs", opt_df["sq_persondays"]).sum()), 0),
            "status_quo_persondays": round(sq, 1),
            "gain_lakhs": round(gain, 2),
            # Guard against division by zero when status-quo total is 0.
            "gain_pct": round(gain / sq * 100, 2) if sq else 0,
            "top_gain": opt_df.nlargest(5, "persondays_gain")[
                ["state","district","persondays_gain","persondays_per_lakh","budget_change_pct"]
            ].round(3).to_dict(orient="records"),
            "top_cut": opt_df.nsmallest(5, "persondays_gain")[
                ["state","district","persondays_gain","persondays_per_lakh","budget_change_pct"]
            ].round(3).to_dict(orient="records"),
            "by_state": (
                opt_df.groupby("state")["persondays_gain"]
                .sum().nlargest(8).round(2).to_dict()
            ),
            # High-efficiency districts sitting in the bottom third of budget
            # allocations; empty list when the budget column is missing.
            "underfunded": opt_df[
                opt_df["budget_allocated_lakhs"] < opt_df["budget_allocated_lakhs"].quantile(0.33)
            ].nlargest(5, "persondays_per_lakh")[
                ["state","district","persondays_per_lakh","budget_allocated_lakhs"]
            ].round(3).to_dict(orient="records") if "budget_allocated_lakhs" in opt_df.columns else [],
        }
    return ctx
def call_gemini(api_key: str, prompt: str, temperature: float = 0.35) -> str:
    """Send *prompt* to Gemini and return its answer as plain text.

    Failures are never raised: any SDK, network, or blocked-response error
    comes back as an inline warning string so Streamlit widgets stay alive.
    """
    try:
        model = configure_gemini(api_key)
        generation_config = genai.types.GenerationConfig(
            temperature=temperature,
            max_output_tokens=1024,
        )
        response = model.generate_content(prompt, generation_config=generation_config)
        return strip_markdown(response.text)
    except Exception as exc:
        return f"⚠️ Gemini error: {exc}"
def base_prompt(ctx: dict) -> str:
    """Render the shared analyst framing + live-data prefix for every Gemini call.

    The full context dict is embedded as pretty-printed JSON so the model can
    cite exact figures; ends with a trailing newline so page-specific
    instructions can be appended directly.
    """
    scope = ctx.get('scope', 'All India')
    segments = [
        "You are a senior policy analyst specialising in India's MNREGA rural employment scheme.",
        f"Scope: {scope}",
        "Live data from SchemeImpactNet (GradientBoostingRegressor, walk-forward CV RΒ²β‰ˆ0.91):",
        json.dumps(ctx, indent=2),
        "Rules:",
        "- Person-days in lakhs (1 lakh = 100,000). Budget in β‚Ή lakhs.",
        "- 2020: COVID surge (reverse migration drove demand spike).",
        "- 2022: West Bengal data anomaly (-93% to -98%) β€” not a real decline.",
        "- The LP optimizer reallocates budget across districts at zero additional cost.",
        "- Base every claim on the numbers above. Name specific districts and states.",
        "- Be direct, analytical, and avoid generic statements.",
        "",
    ]
    return "\n".join(segments)
def preset_prompt(ctx: dict, question_key: str) -> str:
    """Return the full Gemini prompt for one preset insights question.

    Looks up the question-specific instruction by *question_key* and appends
    it to the shared base prompt; unknown keys fall back to a generic
    strategic-analysis request.
    """
    instructions = {
        "declining": "Which districts are predicted to see the steepest employment decline? Name the top 5, give exact predicted change figures, identify any state-level patterns, and suggest specific interventions. (~300 words)",
        "roi": "Which districts offer the best return on additional budget investment based on efficiency (persondays_per_lakh) scores? Name top districts, explain why their efficiency is high, and estimate the employment gain from a 10% budget increase. (~300 words)",
        "forecast": "What does the model predict for national employment in the next cycle? Compare predicted vs previous actual totals, identify which states drive the change, and assess confidence given model performance. (~300 words)",
        "realloc": "Which states should be prioritised for budget reallocation and why? Use the optimizer state-level data, name the top 3 states for increase and top 3 for reduction, with the employment gain rationale. (~300 words)",
        "covid": "What is the predicted COVID recovery trajectory? Has employment normalised post-2020 surge, or are certain districts still at elevated levels? What does this imply for future demand planning? (~300 words)",
        "underfunded": "Which districts are most underfunded relative to their predicted demand and efficiency scores? Name specific districts, show the gap between their efficiency and their budget allocation, and recommend reallocation amounts. (~300 words)",
        "efficiency": "Who are the top 5 efficiency leaders (highest persondays_per_lakh)? What structural factors likely explain their high efficiency? What can other districts learn and replicate? (~300 words)",
        "summary": "Summarise the overall model prediction results in plain language for a non-technical policymaker. Cover: what the model predicts nationally, which regions face challenges, and the 3 most important numbers to know. (~300 words)",
    }
    suffix = instructions.get(question_key, "Provide a strategic analysis of the MNREGA data.")
    return base_prompt(ctx) + suffix
def page_summary_prompt(ctx: dict, page_key: str, extra_context: str = "") -> str:
    """Return the Gemini prompt for a page's short AI-summary widget.

    Combines the shared base prompt, optional page-specific context, and the
    per-page instruction from PAGE_SUMMARY_PROMPTS (generic fallback for
    unknown *page_key* values).
    """
    fallback = "Summarise the key insights from this page's data in 3–4 sentences."
    instruction = PAGE_SUMMARY_PROMPTS.get(page_key, fallback)
    pieces = [base_prompt(ctx)]
    if extra_context:
        pieces.append(f"\nAdditional page context:\n{extra_context}\n")
    pieces.append("\n")
    pieces.append(instruction)
    pieces.append("\nRespond in 3–4 sentences only. Be precise and use specific numbers.")
    return "".join(pieces)