AURA-Backend / overlay_engine.py
Vijayadhith7's picture
Upload 41 files
c84a937 verified
"""
AURA Universal Overlay Context Engine
Detects active workflow, builds contextual prompts, routes assistant modes.
"""
import os
import re
from typing import Any, Dict, Optional, Tuple
try:
from response_rules import REALTIME_SEARCH_RESPONSE_RULES, AURA_NATURAL_ASSISTANT_RULES
except ImportError:
REALTIME_SEARCH_RESPONSE_RULES = ""
AURA_NATURAL_ASSISTANT_RULES = ""
OVERLAY_CONVERSATION_ID = "aura_overlay"
# Fast overlay SLM β€” low latency, low token budget (Groq cloud; not bundled on device)
OVERLAY_SLM_MODEL = "llama-3.1-8b-instant"
# Groq multimodal model (replaces decommissioned llama-3.2-11b-vision-preview)
GROQ_VISION_MODEL = os.environ.get("GROQ_VISION_MODEL", "openbmb/minicpm-v-2.6")
GROQ_VISION_MAX_B64_BYTES = 3_500_000 # Groq limit is 4MB for base64 images
ASSISTANT_MODES = ("quick", "tutor", "copilot", "research", "focus")
# App β†’ tutor guideline templates (concise, actionable)
APP_GUIDELINES: Dict[str, str] = {
"photoshop": "Photoshop tutor: give step-by-step UI actions (layers, masks, curves, selections).",
"premiere": "Premiere Pro tutor: timeline cuts, transitions, Lumetri, audio sync, export settings.",
"excel": "Excel tutor: formulas (VLOOKUP, XLOOKUP), pivots, charts β€” show exact steps.",
"code": "Coding tutor: explain errors clearly, suggest fixes, no vague advice.",
"vscode": "VS Code tutor: debug steps, extensions, async patterns, linter fixes.",
"figma": "Figma tutor: Auto Layout, components, spacing, prototyping.",
"blender": "Blender tutor: modeling, materials, rendering workflow.",
"capcut": "Video editing tutor: pacing, transitions, captions, export.",
"game": "Gaming coach: strategy and mechanics only. NEVER cheat, inject, or automate gameplay.",
"browser": "Browsing assistant: summarize, explain, research β€” be concise.",
}
def classify_workflow(active_process: str, window_title: str, accessibility_text: str = "") -> str:
p = (active_process or "").lower()
t = (window_title or "").lower()
a = (accessibility_text or "").lower()
combined = f"{p} {t} {a}"
if any(x in combined for x in ["photoshop", "gimp", "lightroom"]):
return "photoshop"
if any(x in combined for x in ["premiere", "davinci", "resolve", "capcut", "after effects"]):
return "premiere" if "premiere" in combined or "after effects" in combined else "capcut"
if any(x in combined for x in ["excel", "spreadsheet", "sheets"]):
return "excel"
if any(x in combined for x in ["code.exe", "vscode", "cursor", "android studio", "intellij", "pycharm"]):
return "vscode"
if "figma" in combined:
return "figma"
if "blender" in combined:
return "blender"
if any(x in combined for x in ["chrome", "firefox", "edge", "brave", "opera", "safari"]):
return "browser"
if any(x in combined for x in ["valorant", "fortnite", "csgo", "lol", "gta", "fifa", "minecraft", "steam"]):
return "game"
return "general"
def get_app_guideline(workflow: str) -> str:
return APP_GUIDELINES.get(workflow, "")
OVERLAY_PERSONAS = {
"warm-narrative": (
"Sound like a calm, capable coworker: friendly, clear, and confident. "
"Use plain language. Say 'you' more than 'we'."
),
"ultra-technical": (
"Be precise and professional. Name tools, menus, and shortcuts when relevant. "
"Skip filler and small talk."
),
"minimalist-hacker": (
"Ultra-direct. Short sentences. Commands and fixes first, explanation second."
),
}
MODE_RULES = {
"quick": {
"style": "Answer in 1–3 short sentences. Lead with the answer.",
"max_tokens": 180,
"temperature": 0.4,
},
"tutor": {
"style": "Teach with numbered steps (max 5). Reference visible UI labels when you can.",
"max_tokens": 450,
"temperature": 0.5,
},
"copilot": {
"style": "Help finish the current task. End with one clear next action.",
"max_tokens": 380,
"temperature": 0.55,
},
"research": {
"style": "Answer from live data when provided. One short source line at the end if needed.",
"max_tokens": 500,
"temperature": 0.5,
},
"focus": {
"style": "One or two sentences only. No lists unless essential.",
"max_tokens": 120,
"temperature": 0.35,
},
}
def get_overlay_inference_params(sandbox: dict, has_screenshot: bool = False) -> dict:
mode = sandbox.get("assistant_mode", "copilot")
cfg = MODE_RULES.get(mode, MODE_RULES["copilot"])
max_tokens = cfg["max_tokens"]
if has_screenshot:
# Increase token limit significantly to prevent truncation due to reasoning tokens
max_tokens = max(max_tokens, 2048)
return {
"max_tokens": max_tokens,
"temperature": cfg["temperature"],
}
def build_overlay_user_prompt(prompt: str, has_screenshot: bool, active_app: str, window_title: str) -> str:
"""Wrap user message for overlay β€” especially screen/vision queries."""
if not has_screenshot:
return prompt
ctx = []
if window_title:
ctx.append(f"Active window: {window_title}")
if active_app:
ctx.append(f"App: {active_app}")
context_line = (" | ".join(ctx) + "\n\n") if ctx else ""
return (
f"{context_line}"
"The user shared a screenshot of their screen. Describe what you see in plain language, "
"then answer their question in a practical way.\n\n"
f"User question: {prompt}"
)
def build_overlay_system_prompt(
sandbox: dict,
workflow: str,
active_app: str = "",
window_title: str = "",
has_screenshot: bool = False,
) -> str:
"""Dedicated system prompt for overlay β€” short, human, no essay mode."""
mode = sandbox.get("assistant_mode", "copilot")
persona = sandbox.get("persona", "warm-narrative")
persona_line = OVERLAY_PERSONAS.get(persona, OVERLAY_PERSONAS["warm-narrative"])
mode_cfg = MODE_RULES.get(mode, MODE_RULES["copilot"])
guideline = get_app_guideline(workflow)
lines = [
"You are AURA Overlay β€” a realtime assistant floating over the user's desktop.",
"The user is mid-task and needs fast, usable help without leaving their app.",
"",
REALTIME_SEARCH_RESPONSE_RULES.strip() if REALTIME_SEARCH_RESPONSE_RULES else "",
AURA_NATURAL_ASSISTANT_RULES.strip() if AURA_NATURAL_ASSISTANT_RULES else "",
"",
"VOICE & TONE:",
f"- {persona_line}",
f"- Mode: {mode}. {mode_cfg['style']}",
"",
"FORMAT (important):",
"- Write for a small overlay panel β€” keep it scannable.",
"- Prefer short paragraphs or a tight bullet list (max 5 bullets).",
"- Use **bold** sparingly (1–3 phrases max). No walls of markdown.",
"- No H1/H2 headers unless the answer truly needs structure.",
"- Never open with 'Certainly!', 'Great question!', or 'I'd be happy to help'.",
"- Never mention being an AI, model, training data, or internal systems.",
"",
"BEHAVIOR:",
"- Answer the question first, then add brief context if useful.",
"- If you see a screenshot: say what's on screen in 2–4 sentences, then help.",
"- If something is unclear on screen, say what you can see and ask one short follow-up.",
"- Give concrete UI steps (menu names, buttons) when guiding software.",
"- For coding: show the fix or pattern, not a lecture.",
"- For games: coach only β€” never cheats, bots, or exploits.",
]
if window_title or active_app:
lines.append("")
lines.append("CURRENT CONTEXT:")
if window_title:
lines.append(f"- Window: {window_title}")
if active_app:
lines.append(f"- Process: {active_app}")
if guideline:
lines.append(f"- Focus: {guideline}")
if sandbox.get("selected_text"):
lines.append(f"- Selected text: {sandbox['selected_text'][:400]}")
if has_screenshot:
lines.append("- A screenshot of the user's screen is attached β€” use it as ground truth.")
return "\n".join(lines)
def overlay_system_addon(sandbox: dict, workflow: str) -> str:
"""Legacy hook β€” overlay now uses build_overlay_system_prompt instead."""
return ""
def merge_sandbox_defaults(sandbox: Optional[dict]) -> dict:
s = dict(sandbox or {})
s.setdefault("platform", "web")
s.setdefault("overlay_mode", False)
s.setdefault("assistant_mode", "copilot")
s.setdefault("incognito", False)
s.setdefault("persona", "warm-narrative")
s.setdefault("search_strategy", "multi-tier")
s.setdefault("ocr", True)
# Android legacy key
if not s.get("screenshot") and s.get("screenshot_data"):
s["screenshot"] = s["screenshot_data"]
return s
def get_overlay_model(has_screenshot: bool, assistant_mode: str = "copilot") -> str:
"""Dedicated overlay model router β€” never uses main-chat 70B unless research mode."""
if has_screenshot:
return GROQ_VISION_MODEL
if assistant_mode == "research":
return "llama-3.3-70b-versatile"
return OVERLAY_SLM_MODEL
def get_windows_active_window() -> Tuple[str, str]:
"""Returns (window_title, process_name). Windows only."""
try:
import ctypes
import subprocess
hwnd = ctypes.windll.user32.GetForegroundWindow()
length = ctypes.windll.user32.GetWindowTextLengthW(hwnd)
title_buf = ctypes.create_unicode_buffer(length + 1)
ctypes.windll.user32.GetWindowTextW(hwnd, title_buf, length + 1)
title = title_buf.value or ""
pid = ctypes.c_ulong()
ctypes.windll.user32.GetWindowThreadProcessId(hwnd, ctypes.byref(pid))
cmd = f'tasklist /FI "PID eq {pid.value}" /FO CSV /NH'
output = subprocess.check_output(cmd, shell=True, timeout=1.0).decode("utf-8", errors="ignore")
parts = output.strip().split(",")
process = parts[0].strip('"') if parts else ""
return title, process
except Exception:
return "", ""
def compress_screenshot_base64(pil_image) -> str:
"""Resize and compress a PIL image to fit Groq's base64 size limit."""
import io
import base64
from PIL import Image
img = pil_image.convert("RGB")
max_dim = 1280
w, h = img.size
if max(w, h) > max_dim:
ratio = max_dim / max(w, h)
img = img.resize((int(w * ratio), int(h * ratio)), Image.Resampling.LANCZOS)
for quality in (70, 55, 45, 35):
buf = io.BytesIO()
img.save(buf, format="JPEG", quality=quality, optimize=True)
raw = buf.getvalue()
if len(raw) <= GROQ_VISION_MAX_B64_BYTES:
return base64.b64encode(raw).decode("utf-8")
buf = io.BytesIO()
img.save(buf, format="JPEG", quality=30, optimize=True)
return base64.b64encode(buf.getvalue()).decode("utf-8")
def build_context_snapshot(platform: str = "windows") -> Dict[str, Any]:
title, process = ("", "")
if platform in ("windows", "web") and os.name == "nt":
title, process = get_windows_active_window()
workflow = classify_workflow(process, title)
return {
"active_app": process or "Unknown",
"window_title": title,
"workflow": workflow,
"platform": platform,
"guideline": get_app_guideline(workflow),
}