Spaces:

build-small-hackathon
/

omniscient-reader

Running

App Files Files Community

omniscient-reader / modal_client.py

Aswini-Kumar

Initial commit — Omniscient Reader Scenario Simulator

7d99fde 19 days ago

Raw

History Blame Contribute Delete

15.6 kB

	"""
	modal_client.py — Dual AI Client: Groq (fast turns) + Modal (big moments)
	==========================================================================
	Architecture:
	• call_dokkaebi() → Groq API (llama-3.1-8b-instant, 8B) — fast gameplay turns (~2s)
	• call_dokkaebi_cinematic() → Modal llama.cpp (Qwen 2.5 14B) — cinematic moments only
	(scenario intros, end-game report card)

	Total model parameter budget: 8B + 14B = 22B — well under the 32B hackathon cap.

	Groq is free, LPU-accelerated, and ~10x faster than Modal for standard turns.
	Modal is kept for "big moments" to satisfy the Modal prize requirement and to
	leverage the larger 14B model for high-quality cinematic narrative generation.

	Part of the ORV (Omniscient Reader's Viewpoint) Scenario Simulator.
	Build Small Hackathon 2026.
	"""

	import json
	import os
	import re
	import time
	from typing import Any

	import requests

	# ---------------------------------------------------------------------------
	# Load .env if present (for local dev)
	# ---------------------------------------------------------------------------
	def _load_env_file(env_path: str) -> None:
	    """
	    Copy KEY=VALUE pairs from a .env-style file into ``os.environ``.

	    Blank lines, lines starting with ``#`` and lines without ``=`` are
	    skipped. Variables that already exist in the environment are left
	    untouched (``setdefault``), so real environment settings always win
	    over the file.

	    Parameters
	    ----------
	    env_path : str
	        Path of the file to read.

	    Raises
	    ------
	    OSError, UnicodeError
	        If the file cannot be opened or decoded.
	    """
	    # "utf-8-sig" reads plain UTF-8 and also tolerates a leading BOM.
	    with open(env_path, encoding="utf-8-sig") as env_file:
	        for raw_line in env_file:
	            entry = raw_line.strip()
	            if not entry or entry.startswith("#") or "=" not in entry:
	                continue
	            key, value = entry.split("=", 1)
	            key = key.strip()
	            value = value.strip()
	            if not key:
	                # "=value" has no usable name; os.environ rejects empty keys,
	                # which would otherwise abort loading of the remaining lines.
	                continue
	            # .env values are commonly wrapped in matching quotes; drop them
	            # so KEY="abc" yields abc rather than "abc".
	            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
	                value = value[1:-1]
	            os.environ.setdefault(key, value)


	# Best-effort convenience for local development: a missing or unreadable
	# .env must never prevent the module from importing, but the failure is
	# reported instead of being silently discarded.
	try:
	    _env_path = os.path.join(os.path.dirname(__file__), ".env")
	    if os.path.isfile(_env_path):
	        _load_env_file(_env_path)
	except Exception as _env_exc:  # noqa: BLE001
	    print(f"[env] Could not load .env: {type(_env_exc).__name__}: {_env_exc}")

	# ---------------------------------------------------------------------------
	# Configuration
	# ---------------------------------------------------------------------------

	# ── Groq (fast gameplay turns) ────────────────────────────────────────────
	GROQ_API_KEY: str = os.getenv("GROQ_API_KEY", "")
	GROQ_ENDPOINT: str = "https://api.groq.com/openai/v1/chat/completions"
	# Hackathon rule: the chosen model must stay below 32B parameters.
	GROQ_MODEL: str = os.getenv("GROQ_MODEL", "llama-3.1-8b-instant")
	GROQ_TIMEOUT: int = 30  # passed as the `timeout` of the Groq HTTP request

	# ── Modal (cinematic big moments only) ───────────────────────────────────
	MODAL_ENDPOINT_URL: str = os.getenv(
	    "MODAL_ENDPOINT_URL",
	    "https://aswinikumary--orv-dokkaebi-server-serve.modal.run",
	)
	# Kept short on purpose: a Modal cold start longer than 45s should fall
	# back to Groq instead of blocking the player.
	MODAL_TIMEOUT: int = 45

	# Keys that must be present in an AI response for it to count as valid.
	_REQUIRED_KEYS: set = {
	    "narrative", "dokkaebi_comment", "stat_changes",
	    "suggestions", "entertainment_score", "constellation_reactions",
	}


	# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	# FALLBACK RESPONSE
	# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━


	def get_fallback_response() -> dict[str, Any]:
	    """
	    Build the static, in-character response used when no AI backend
	    produced a usable answer.

	    A fresh dict (with fresh nested containers) is created on every call,
	    so callers may mutate the result freely.
	    """
	    narrative_text = (
	        "The probability wavers. "
	        "Something shifts in the fabric of reality. "
	        "You feel it — a moment of static, "
	        "as if the universe itself hesitated."
	    )

	    # Stat deltas applied for the fallback turn.
	    stat_deltas: dict[str, Any] = {
	        "hp": 0,
	        "coins": 10,
	        "meta_exposure": 0,
	        "prob_stability": 0,
	        "trust": {},
	        "constellation_affinity": {},
	    }

	    glitch_reaction = {
	        "modifier": "Prisoner of the Golden Headband",
	        "reaction": "Even the system glitches. Amusing.",
	        "coins": 20,
	    }

	    default_suggestions = [
	        "Look around carefully",
	        "Search for survivors",
	        "Do something unexpected",
	    ]

	    response: dict[str, Any] = {
	        "narrative": narrative_text,
	        "dokkaebi_comment": "...Technical difficulties. How amusing.",
	        "meta_detected": False,
	        "meta_reason": None,
	        "reality_subversion": None,
	        "stat_changes": stat_deltas,
	        "new_title": None,
	        "hidden_scenario": None,
	        "big_moment": False,
	        "suggestions": default_suggestions,
	        "entertainment_score": 3,
	        "constellation_reactions": [glitch_reaction],
	        "scenario_complete": False,
	        "scenario_rank": None,
	        "dokkaebi_internal": "fallback response used",
	    }
	    return response


	# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	# RESPONSE PARSER (shared)
	# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━


	def parse_ai_response(raw_text: str) -> dict[str, Any]:
	    """
	    Parse the raw text returned by the AI into a validated dict.

	    Handles common LLM output quirks:
	    - Strips leading/trailing whitespace
	    - Removes markdown code fences
	    - Extracts the substring between the first { and last }
	    - If that substring is not valid JSON (typically because the model
	      added chatter containing braces before or after the object), scans
	      each "{" for the first complete JSON object that carries every
	      required key
	    - Validates that all required keys are present

	    Parameters
	    ----------
	    raw_text : str
	        The message content returned by the model.

	    Returns
	    -------
	    dict
	        The decoded JSON object, containing at least ``_REQUIRED_KEYS``.

	    Raises
	    ------
	    ValueError
	        If ``raw_text`` is not a string, contains no JSON object, cannot
	        be decoded, is not a JSON object, or lacks a required key.
	    """
	    # A null/absent message content must surface as ValueError (the error
	    # type callers handle), not as AttributeError from .strip().
	    if not isinstance(raw_text, str):
	        raise ValueError(
	            f"Expected AI response text (str), got {type(raw_text).__name__}."
	        )

	    text = raw_text.strip()

	    # Strip markdown code fences if present
	    text = re.sub(r"^```(?:json)?\s*", "", text, flags=re.MULTILINE)
	    text = re.sub(r"```\s*$", "", text, flags=re.MULTILINE)
	    text = text.strip()

	    # Find the JSON object boundaries
	    first_brace = text.find("{")
	    last_brace = text.rfind("}")

	    if first_brace == -1 or last_brace == -1 or last_brace <= first_brace:
	        raise ValueError(
	            f"No valid JSON object found in AI response. "
	            f"Raw text (first 200 chars): {raw_text[:200]}"
	        )

	    json_str = text[first_brace : last_brace + 1]

	    try:
	        parsed = json.loads(json_str)
	    except json.JSONDecodeError as exc:
	        # The brace-to-brace slice is polluted by surrounding text. Try
	        # every "{" as a potential start; raw_decode stops at the end of
	        # the first complete value and ignores whatever follows it. Only a
	        # dict holding all required keys is accepted, so a nested fragment
	        # of a genuinely malformed response is never mistaken for it.
	        decoder = json.JSONDecoder()
	        parsed = None
	        start = first_brace
	        while start != -1:
	            try:
	                candidate, _ = decoder.raw_decode(text, start)
	            except json.JSONDecodeError:
	                candidate = None
	            if isinstance(candidate, dict) and _REQUIRED_KEYS.issubset(candidate):
	                parsed = candidate
	                break
	            start = text.find("{", start + 1)

	        if parsed is None:
	            raise ValueError(
	                f"Failed to parse JSON from AI response: {exc}. "
	                f"Extracted text (first 300 chars): {json_str[:300]}"
	            ) from exc

	    if not isinstance(parsed, dict):
	        raise ValueError(
	            f"Expected a JSON object (dict), got {type(parsed).__name__}."
	        )

	    # Validate required keys
	    missing = _REQUIRED_KEYS - set(parsed.keys())
	    if missing:
	        raise ValueError(
	            f"AI response is missing required keys: {missing}. "
	            f"Present keys: {set(parsed.keys())}"
	        )

	    return parsed


	# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	# GROQ CALLER — fast gameplay turns
	# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━


	def _call_groq(
	    system_prompt: str,
	    user_message: str,
	    max_retries: int = 2,
	) -> dict[str, Any]:
	    """
	    Call Groq's chat-completions API for fast gameplay turns and return
	    the parsed response.

	    Retry policy, per attempt:
	    - HTTP 429 (rate limit): wait ``3 * attempt_number`` seconds and
	      retry, unless no attempts remain.
	    - Any other HTTP error status: give up immediately.
	    - Timeout / connection error, or a response that cannot be parsed or
	      has an unexpected shape: wait 1 second and retry.
	    - Anything else: give up immediately.

	    Parameters
	    ----------
	    system_prompt : str
	        Content of the "system" message.
	    user_message : str
	        Content of the "user" message.
	    max_retries : int
	        Maximum number of attempts.

	    Returns
	    -------
	    dict
	        The validated dict produced by ``parse_ai_response``.

	    Raises
	    ------
	    RuntimeError
	        If ``GROQ_API_KEY`` is empty, or no attempt succeeded.
	    """
	    if not GROQ_API_KEY:
	        raise RuntimeError("GROQ_API_KEY is not set.")

	    headers = {
	        "Content-Type": "application/json",
	        "Authorization": f"Bearer {GROQ_API_KEY}",
	    }

	    payload = {
	        "model": GROQ_MODEL,
	        "messages": [
	            {"role": "system", "content": system_prompt},
	            {"role": "user", "content": user_message},
	        ],
	        "temperature": 0.85,
	        "max_tokens": 480,  # Reduced for speed — 350-450 tokens typical
	        "stream": False,
	        "stop": None,
	    }

	    last_error: Exception | None = None

	    for attempt in range(max_retries):
	        attempts_remain = attempt < max_retries - 1
	        try:
	            print(
	                f"[Groq] Attempt {attempt + 1}/{max_retries} — "
	                f"model={GROQ_MODEL}"
	            )
	            t0 = time.time()
	            response = requests.post(
	                GROQ_ENDPOINT,
	                json=payload,
	                headers=headers,
	                timeout=GROQ_TIMEOUT,
	            )
	            response.raise_for_status()
	            elapsed = time.time() - t0

	            data = response.json()
	            raw_text = data["choices"][0]["message"]["content"]
	            # The content field can be null; report that as a structure
	            # error (retryable) rather than crashing on len()/strip().
	            if not isinstance(raw_text, str):
	                raise ValueError(
	                    f"Groq returned non-text content: {type(raw_text).__name__}"
	                )
	            print(
	                f"[Groq] Response in {elapsed:.1f}s — "
	                f"{len(raw_text)} chars"
	            )

	            parsed = parse_ai_response(raw_text)
	            print("[Groq] Parsed successfully.")
	            return parsed

	        except requests.exceptions.HTTPError as exc:
	            last_error = exc
	            status = getattr(exc.response, "status_code", "???")
	            if status != 429:
	                print(f"[Groq] HTTP {status}: {exc}")
	                break  # Non-retryable HTTP error
	            # 429 = rate limit — wait and retry, but never sleep when the
	            # loop is about to end anyway.
	            if attempts_remain:
	                wait = 3 * (attempt + 1)
	                print(f"[Groq] Rate limited. Waiting {wait}s…")
	                time.sleep(wait)
	            else:
	                print("[Groq] Rate limited. No attempts left.")

	        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as exc:
	            last_error = exc
	            print(f"[Groq] Network error attempt {attempt + 1}: {exc}")
	            if attempts_remain:
	                time.sleep(1)

	        except (ValueError, KeyError, IndexError, TypeError) as exc:
	            # TypeError covers payloads whose shape is wrong in a way that
	            # breaks subscripting (e.g. "choices" being null).
	            last_error = exc
	            print(f"[Groq] Parse/structure error attempt {attempt + 1}: {exc}")
	            if attempts_remain:
	                time.sleep(1)

	        except Exception as exc:  # noqa: BLE001
	            last_error = exc
	            print(f"[Groq] Unexpected error: {type(exc).__name__}: {exc}")
	            break

	    raise RuntimeError(f"Groq failed after {max_retries} attempts: {last_error}")


	# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	# MODAL CALLER — cinematic big moments only
	# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━


	def _call_modal(
	    system_prompt: str,
	    user_message: str,
	    max_retries: int = 2,
	) -> dict[str, Any]:
	    """
	    Call the Modal-hosted chat-completions endpoint and return the parsed
	    response. Intended for cinematic big moments (scenario intros,
	    end-game report).

	    Every failure (network, HTTP status, malformed payload, parse error)
	    is retried with exponential backoff: 1s, 2s, 4s, … between attempts.

	    Parameters
	    ----------
	    system_prompt : str
	        Content of the "system" message.
	    user_message : str
	        Content of the "user" message.
	    max_retries : int
	        Maximum number of attempts.

	    Returns
	    -------
	    dict
	        The validated dict produced by ``parse_ai_response``.

	    Raises
	    ------
	    RuntimeError
	        If no attempt succeeded.
	    """
	    # Tolerate a trailing slash in the configured base URL so the request
	    # path never becomes "...//v1/chat/completions".
	    url = f"{MODAL_ENDPOINT_URL.rstrip('/')}/v1/chat/completions"

	    payload = {
	        "model": "dokkaebi",
	        "messages": [
	            {"role": "system", "content": system_prompt},
	            {"role": "user", "content": user_message},
	        ],
	        "temperature": 0.8,
	        "max_tokens": 600,
	        "stream": False,
	    }

	    headers = {"Content-Type": "application/json"}
	    last_error: Exception | None = None

	    for attempt in range(max_retries):
	        try:
	            print(
	                f"[Modal] Attempt {attempt + 1}/{max_retries} — "
	                f"POST {url}"
	            )
	            t0 = time.time()
	            response = requests.post(
	                url,
	                json=payload,
	                headers=headers,
	                timeout=MODAL_TIMEOUT,
	            )
	            response.raise_for_status()
	            elapsed = time.time() - t0

	            data = response.json()
	            raw_text = data["choices"][0]["message"]["content"]
	            # Give a clear error for null/non-text content instead of an
	            # opaque TypeError from len().
	            if not isinstance(raw_text, str):
	                raise ValueError(
	                    f"Modal returned non-text content: {type(raw_text).__name__}"
	                )
	            print(f"[Modal] Response in {elapsed:.1f}s — {len(raw_text)} chars")

	            parsed = parse_ai_response(raw_text)
	            print("[Modal] Parsed successfully.")
	            return parsed

	        except Exception as exc:  # noqa: BLE001
	            last_error = exc
	            print(f"[Modal] Error attempt {attempt + 1}: {type(exc).__name__}: {exc}")
	            if attempt < max_retries - 1:
	                wait = 2 ** attempt
	                print(f"[Modal] Retrying in {wait}s…")
	                time.sleep(wait)

	    raise RuntimeError(f"Modal failed after {max_retries} attempts: {last_error}")


	# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
	# PUBLIC API
	# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━


	def call_dokkaebi(
	    system_prompt: str,
	    user_message: str,
	    max_retries: int = 2,
	    use_modal: bool = False,
	) -> dict[str, Any]:
	    """
	    Main entry point for all AI calls.

	    The requested backend is tried first with ``max_retries`` attempts;
	    if it raises, the other backend gets a single attempt; if that raises
	    too, the static fallback response is returned. This function itself
	    does not propagate backend errors.

	    Parameters
	    ----------
	    system_prompt : str
	        The full Dokkaebi system prompt (interpolated).
	    user_message : str
	        The player's action or a trigger phrase.
	    max_retries : int
	        Max retry attempts for the primary backend.
	    use_modal : bool
	        If True, Modal is the primary backend and Groq the fallback.
	        If False (default), Groq is primary and Modal the fallback.

	    Returns
	    -------
	    dict
	        Parsed AI response, or ``get_fallback_response()`` when both
	        backends fail.
	    """
	    # Backends in the order they will be attempted.
	    backend_order = ("Modal", "Groq") if use_modal else ("Groq", "Modal")

	    for position, backend in enumerate(backend_order):
	        is_primary = position == 0
	        attempts = max_retries if is_primary else 1
	        try:
	            if backend == "Modal":
	                return _call_modal(system_prompt, user_message, attempts)
	            return _call_groq(system_prompt, user_message, attempts)
	        except Exception as backend_exc:
	            if is_primary:
	                print(f"[AI] {backend} failed: {backend_exc}")
	                print(f"[AI] Falling back to {backend_order[1]}…")
	            else:
	                print(f"[AI] {backend} also failed: {backend_exc}")

	    # ── Last resort: static fallback ─────────────────────────────────────
	    print("[AI] All backends failed. Using static fallback response.")
	    return get_fallback_response()


	def call_dokkaebi_cinematic(
	    system_prompt: str,
	    user_message: str,
	) -> dict[str, Any]:
	    """
	    Shortcut for cinematic big moments: delegates to ``call_dokkaebi``
	    with Modal as the primary backend and two attempts.

	    Intended call sites:
	    - Scenario intro generation (start of each new scenario)
	    - End-game report card generation
	    """
	    response = call_dokkaebi(
	        system_prompt=system_prompt,
	        user_message=user_message,
	        use_modal=True,
	        max_retries=2,
	    )
	    return response