"""
data.py — ExecAssist Environment Data & Scoring

Contains:
- Scenario templates for easy/medium/hard tasks
- Reward functions (email quality, scheduling correctness, conflict resolution)
- Anti-reward hacking penalties
- Helper functions for time/calendar logic
"""

import logging
import math
import os
import random
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple

try:
    from openai import OpenAI
except ImportError:
    # The LLM judge is optional: without the client library the email score
    # falls back to the same heuristic used when no API key is configured.
    OpenAI = None

logger = logging.getLogger(__name__)

# ============================================================
# TASK DEFINITIONS
# ============================================================

TASK_DEFINITIONS = {
    "easy": {
        "description": (
            "Simple meeting request with clear calendar availability. "
            "Draft professional reply and book the meeting."
        ),
        "action_required": "Send email confirmation and book meeting in available slot",
        "reward_weights": {
            "email": 0.5,
            "scheduling": 0.5,
            "conflict": 0.0,
        },
    },
    "medium": {
        "description": (
            "Scheduling conflict — requested time is already booked. "
            "Identify conflict, propose 2-3 alternatives, explain professionally."
        ),
        "action_required": "Send email with alternative times and explain conflict",
        "reward_weights": {
            "email": 0.3,
            "scheduling": 0.3,
            "conflict": 0.4,
        },
    },
    "hard": {
        "description": (
            "Multi-party coordination with priority conflicts. "
            "3 emails requesting meetings, prioritize and reschedule."
        ),
        "action_required": "Coordinate multiple meetings, prioritize, and reschedule",
        "reward_weights": {
            "email": 0.34,
            "scheduling": 0.33,
            "conflict": 0.33,
        },
    },
}

# ============================================================
# SCENARIO DATA POOLS
# ============================================================

MEETING_TOPICS = [
    "Q2 roadmap review",
    "Budget planning session",
    "Project status update",
    "Team sync",
    "1-on-1 check-in",
    "Client presentation prep",
    "Sprint retrospective",
    "Product demo",
    "Strategy discussion",
    "Performance review",
]

SENDER_NAMES = [
    ("John Smith", "john.smith@company.com"),
    ("Sarah Johnson", "sarah.johnson@company.com"),
    ("Michael Chen", "michael.chen@company.com"),
    ("Emily Rodriguez", "emily.rodriguez@company.com"),
    ("David Kim", "david.kim@company.com"),
    ("Lisa Wang", "lisa.wang@company.com"),
    ("James Anderson", "james.anderson@company.com"),
    ("Maria Garcia", "maria.garcia@company.com"),
]

# Anchor of the scenario week: Monday 27 April 2026, 9:00 AM.
# The emails and expected behaviours talk about "Monday" and "Tuesday", so the
# anchor has to be a real Monday (28 April 2026 is a Tuesday).
_SCENARIO_WEEK_START = datetime(2026, 4, 27, 9, 0)

# Lower-case weekday keys used by calendar["working_hours"], indexed by
# datetime.weekday() (Monday == 0). Avoids locale-dependent strftime("%A").
_WEEKDAY_KEYS = (
    "monday",
    "tuesday",
    "wednesday",
    "thursday",
    "friday",
    "saturday",
    "sunday",
)

# Working window (open hour, close hour) used when a calendar carries no
# usable "working_hours" information.
_DEFAULT_WORKDAY = (9, 17)


# ============================================================
# SCENARIO GENERATION
# ============================================================

def _build_calendar(existing_meetings: List[dict]) -> dict:
    """Wrap *existing_meetings* in the calendar structure shared by all scenarios."""
    return {
        "existing_meetings": existing_meetings,
        "working_hours": {
            "monday": "9-17",
            "tuesday": "9-17",
            "wednesday": "9-17",
            "thursday": "9-17",
            "friday": "9-16",
        },
        "executive_name": "Alex Chen",
    }


def _build_contact(name: str, email: str, title: str) -> dict:
    """Build one contact record; every scenario contact shares the same timezone."""
    return {
        "name": name,
        "email": email,
        "timezone": "America/Los_Angeles",
        "title": title,
    }


def _build_email(sender: str, subject: str, body: str, priority: str) -> dict:
    """Build one inbound email record.

    NOTE: the timestamp is wall-clock "now", so it is the one scenario field
    that is not reproducible from the seed.
    """
    return {
        "sender": sender,
        "subject": subject,
        "body": body,
        "timestamp": datetime.now().isoformat(),
        "priority": priority,
    }


def generate_scenario(task: str, seed: Optional[int] = None) -> dict:
    """
    Generate a scenario for the given task difficulty.

    Args:
        task: One of "easy", "medium" or "hard".
        seed: Optional seed for the random sender/topic selection. Email
            timestamps are wall-clock and are not controlled by the seed.

    Returns dict with:
    - id: scenario identifier
    - task: the task difficulty
    - emails: list of email objects
    - calendar: calendar state with existing meetings
    - contacts: contact information
    - expected_behavior: what agent should do
    - has_conflict: True if scheduling conflict exists

    Raises:
        ValueError: if *task* is not a known difficulty.
    """
    generators = {
        "easy": _generate_easy_scenario,
        "medium": _generate_medium_scenario,
        "hard": _generate_hard_scenario,
    }
    generator = generators.get(task)
    if generator is None:
        raise ValueError(f"Unknown task: {task}")

    # random.Random(None) seeds from system entropy, same as random.Random().
    return generator(random.Random(seed))


def _generate_easy_scenario(rng: random.Random) -> dict:
    """Generate simple meeting request with clear availability."""
    sender_name, sender_email = rng.choice(SENDER_NAMES)
    topic = rng.choice(MEETING_TOPICS)

    base_date = _SCENARIO_WEEK_START  # Monday 9 AM

    existing_meetings = [
        {
            "id": "mtg_001",
            "participants": ["alex.chen@company.com", "team@company.com"],
            "start_time": (base_date + timedelta(hours=1)).isoformat(),  # Monday 10 AM
            "end_time": (base_date + timedelta(hours=2)).isoformat(),  # Monday 11 AM
            "subject": "Team standup",
            "priority": "normal",
        },
        {
            "id": "mtg_002",
            "participants": ["alex.chen@company.com", "client@external.com"],
            "start_time": (base_date + timedelta(days=1, hours=5)).isoformat(),  # Tuesday 2 PM
            "end_time": (base_date + timedelta(days=1, hours=6, minutes=30)).isoformat(),  # Tuesday 3:30 PM
            "subject": "Client call",
            "priority": "high",
        },
    ]

    email_body = (
        f"Hi Alex,\n\nCan we meet sometime next week to discuss {topic.lower()}? "
        f"30 minutes should be enough. I'm flexible on timing.\n\nBest,\n{sender_name}"
    )

    return {
        "id": "easy_001",
        "task": "easy",
        "emails": [
            _build_email(sender_email, f"Meeting request: {topic}", email_body, "normal"),
        ],
        "calendar": _build_calendar(existing_meetings),
        "contacts": {
            sender_email: _build_contact(sender_name, sender_email, "Senior Manager"),
        },
        "expected_behavior": "Book meeting in open slot",
        "has_conflict": False,
    }


def _generate_medium_scenario(rng: random.Random) -> dict:
    """Generate scenario with scheduling conflict."""
    sender_name, sender_email = rng.choice(SENDER_NAMES)
    topic = rng.choice(MEETING_TOPICS)

    base_date = _SCENARIO_WEEK_START  # Monday 9 AM

    # Conflict: Monday 2-4 PM is already booked
    conflict_start = base_date + timedelta(hours=5)
    conflict_end = base_date + timedelta(hours=7)

    existing_meetings = [
        {
            "id": "mtg_001",
            "participants": ["alex.chen@company.com", "board@company.com"],
            "start_time": conflict_start.isoformat(),
            "end_time": conflict_end.isoformat(),
            "subject": "Board meeting",
            "priority": "high",
        },
        {
            "id": "mtg_002",
            "participants": ["alex.chen@company.com", "manager@company.com"],
            "start_time": (base_date + timedelta(days=1, hours=0)).isoformat(),  # Tuesday 9 AM
            "end_time": (base_date + timedelta(days=1, hours=1)).isoformat(),  # Tuesday 10 AM
            "subject": "1-on-1 with manager",
            "priority": "normal",
        },
    ]

    email_body = (
        f"Hi Alex,\n\nWe need to discuss {topic.lower()}. "
        f"I'm available Monday 2-4pm or Tuesday morning. Can we make this work? "
        f"It's fairly urgent.\n\nThanks,\n{sender_name}"
    )

    return {
        "id": "medium_001",
        "task": "medium",
        "emails": [
            _build_email(sender_email, f"Urgent: {topic}", email_body, "high"),
        ],
        "calendar": _build_calendar(existing_meetings),
        "contacts": {
            sender_email: _build_contact(sender_name, sender_email, "Director"),
        },
        "expected_behavior": "Identify conflict, propose Tuesday 10-11 AM as alternative",
        "has_conflict": True,
    }


def _generate_hard_scenario(rng: random.Random) -> dict:
    """Generate multi-party coordination scenario with 3 emails and priority conflicts."""
    senders = rng.sample(SENDER_NAMES, 3)
    topics = rng.sample(MEETING_TOPICS, 3)

    base_date = _SCENARIO_WEEK_START  # Monday 9 AM

    # Existing calendar — Monday 2-3 PM blocked with team sync
    existing_meetings = [
        {
            "id": "mtg_001",
            "participants": ["alex.chen@company.com", "team@company.com"],
            "start_time": (base_date + timedelta(hours=5)).isoformat(),  # Monday 2 PM
            "end_time": (base_date + timedelta(hours=6)).isoformat(),  # Monday 3 PM
            "subject": "Team sync",
            "priority": "normal",
        },
        {
            "id": "mtg_002",
            "participants": ["alex.chen@company.com", "exec@company.com"],
            "start_time": (base_date + timedelta(days=2, hours=2)).isoformat(),  # Wed 11 AM
            "end_time": (base_date + timedelta(days=2, hours=3)).isoformat(),  # Wed 12 PM
            "subject": "Executive review",
            "priority": "high",
        },
    ]

    # Three competing email requests
    emails = [
        _build_email(
            senders[0][1],
            f"Meeting: {topics[0]}",
            (
                f"Hi Alex,\n\nCan we meet Monday 2:30-3:30 PM to discuss {topics[0].lower()}? "
                f"I'd really appreciate your input.\n\nThanks,\n{senders[0][0]}"
            ),
            "normal",
        ),
        _build_email(
            senders[1][1],
            f"URGENT: {topics[1]}",
            (
                f"Alex,\n\nWe need to discuss {topics[1].lower()} ASAP. "
                f"Monday afternoon works for me — ideally 2-3 PM. "
                f"This is time-sensitive and high priority.\n\nBest,\n{senders[1][0]}"
            ),
            "high",
        ),
        _build_email(
            senders[2][1],
            f"{topics[2]} discussion",
            (
                f"Hi Alex,\n\nCan we sync on {topics[2].lower()} sometime this week? "
                f"I'm flexible — any 30-minute slot works for me.\n\nThanks,\n{senders[2][0]}"
            ),
            "normal",
        ),
    ]

    contacts = {
        email: _build_contact(name, email, "Manager")
        for name, email in senders
    }

    return {
        "id": "hard_001",
        "task": "hard",
        "emails": emails,
        "calendar": _build_calendar(existing_meetings),
        "contacts": contacts,
        "expected_behavior": (
            "Prioritize URGENT email (sender 2). Book that meeting. "
            "Propose alternatives to sender 1 (conflicts with urgent). "
            "Offer flexible times to sender 3."
        ),
        "has_conflict": True,
    }


# ============================================================
# REWARD FUNCTION 1: EMAIL QUALITY
# ============================================================

def compute_email_quality(reply: str, scenario: dict) -> float:
    """
    Score email quality using rule-based checks + LLM judge.

    Args:
        reply: The email text written by the agent. ``None`` is treated as
            an empty reply.
        scenario: The active scenario (currently unused by the rules).

    Returns score 0.0 to 1.0.

    Components:
    - Politeness (15%)
    - Greeting/closing (10%)
    - Sufficient detail (15%)
    - Not overly uncertain (10%)
    - Professional tone (10%)
    - LLM judge for nuance (40%)
    """
    reply = reply or ""
    score = 0.0
    reply_lower = reply.lower()

    # Rule 1: Politeness markers (15%)
    if any(phrase in reply_lower for phrase in ["thank you", "thanks", "appreciate"]):
        score += 0.15

    # Rule 2: Proper greeting (5%) and closing (5%)
    if any(greeting in reply_lower for greeting in ["hi ", "hello", "dear"]):
        score += 0.05
    if any(closing in reply_lower for closing in ["best", "regards", "sincerely", "thanks,"]):
        score += 0.05

    # Rule 3: Sufficient detail (15%)
    word_count = len(reply.split())
    if word_count >= 20:
        score += 0.15
    elif word_count >= 10:
        score += 0.08

    # Rule 4: Not overly uncertain (10%)
    if reply.count("?") <= 2:
        score += 0.10

    # Rule 5: Professional tone — no negative phrases (10%)
    negative_phrases = ["can't", "won't", "impossible", "sorry but no", "unfortunately not", "no way"]
    if not any(neg in reply_lower for neg in negative_phrases):
        score += 0.10

    # Rule 6: LLM-as-judge for nuance (40%)
    score += _llm_judge_professionalism(reply) * 0.40

    return min(1.0, score)


def _heuristic_professionalism(reply: str) -> float:
    """Offline stand-in for the LLM judge: 0.7 for a multi-sentence reply of 50+ chars, else 0.4."""
    sentences = [s.strip() for s in reply.split('.') if s.strip()]
    if len(sentences) >= 2 and len(reply) >= 50:
        return 0.7
    return 0.4


def _parse_judge_score(score_text: str) -> Optional[float]:
    """Return the first finite number in *score_text* clamped to [0, 1], or None.

    Non-finite values are rejected explicitly: ``float("nan")`` and
    ``float("inf")`` parse fine and would otherwise clamp to a perfect 1.0.
    """
    for token in score_text.split():
        try:
            # Tolerate decoration such as "0.8." or "**0.8**".
            value = float(token.strip(".,;:!*`\"'()[]"))
        except ValueError:
            continue
        if math.isfinite(value):
            return max(0.0, min(1.0, value))
    return None


def _llm_judge_professionalism(reply: str) -> float:
    """
    LLM-as-judge for email professionalism via an OpenAI-compatible API.

    The endpoint defaults to OpenRouter and can be overridden with the
    APIBASEURL / API_BASE_URL environment variables; the model with
    MODELNAME / MODEL_NAME; the key is read from HFTOKEN, HF_TOKEN or API_KEY.

    Falls back to a heuristic if no API key is set, the ``openai`` package is
    not installed, or the request fails. Returns 0.5 when the model answers
    but no usable number can be extracted.
    """
    api_key = os.getenv("HFTOKEN") or os.getenv("HF_TOKEN") or os.getenv("API_KEY")

    # Fallback if no API key or no client library
    if not api_key or OpenAI is None:
        return _heuristic_professionalism(reply)

    # NOTE(review): the agent-written reply is embedded verbatim in the judge
    # prompt, so the agent could try to instruct the judge. The 40% weight
    # bounds the damage, but this is a known reward-hacking surface.
    prompt = (
        "Rate the professionalism of this email reply on a scale of 0.0 to 1.0.\n"
        "\n"
        "Email reply:\n"
        f'"""{reply}"""\n'
        "\n"
        "Criteria:\n"
        "- Clear and concise\n"
        "- Professional tone\n"
        "- No typos or grammar errors\n"
        "- Appropriate level of formality\n"
        "- Addresses the request directly\n"
        "\n"
        "Respond with ONLY a single decimal number between 0.0 and 1.0. "
        "No explanation, just the number."
    )

    try:
        client = OpenAI(
            base_url=os.getenv("APIBASEURL") or os.getenv("API_BASE_URL", "https://openrouter.ai/api/v1"),
            api_key=api_key,
        )
        response = client.chat.completions.create(
            model=os.getenv("MODELNAME") or os.getenv("MODEL_NAME", "nvidia/nemotron-3-super-120b-a12b:free"),
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            max_tokens=10,
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Boundary with an external service: scoring must never crash the
        # environment, so any failure degrades to the heuristic.
        logger.warning("LLM judge error: %s", e)
        return _heuristic_professionalism(reply)

    if not content:
        logger.warning("LLM judge error: empty response")
        return _heuristic_professionalism(reply)

    parsed = _parse_judge_score(content.strip())
    return 0.5 if parsed is None else parsed


# ============================================================
# REWARD FUNCTION 2: SCHEDULING CORRECTNESS
# ============================================================

def _parse_iso_naive(value: str) -> datetime:
    """Parse an ISO-8601 string into a naive datetime.

    Any UTC offset (or trailing "Z") is dropped and the wall-clock time kept,
    because scenario calendars store naive datetimes and Python refuses to
    compare naive with aware values.
    # NOTE(review): assumes agent-supplied offsets describe the executive's
    # local wall-clock time — confirm if real timezone conversion is wanted.

    Raises:
        TypeError: if *value* is not a string.
        ValueError: if *value* is not valid ISO-8601.
    """
    if isinstance(value, str) and value.endswith(("Z", "z")):
        value = value[:-1]
    return datetime.fromisoformat(value).replace(tzinfo=None)


def _working_window(calendar: dict, day: datetime) -> Optional[Tuple[int, int]]:
    """Return (open_hour, close_hour) for *day*, or None if it is not a working day.

    Reads ``calendar["working_hours"]`` ("9-17" style strings keyed by
    lower-case weekday). A calendar without that mapping, or with an
    unparseable entry, falls back to 9-17 so legacy scenarios keep working.
    """
    working_hours = calendar.get("working_hours")
    if not isinstance(working_hours, dict) or not working_hours:
        return _DEFAULT_WORKDAY

    spec = working_hours.get(_WEEKDAY_KEYS[day.weekday()])
    if spec is None:
        return None  # day not listed (e.g. weekend) => closed

    try:
        open_text, close_text = str(spec).split("-", 1)
        return int(open_text), int(close_text)
    except ValueError:
        return _DEFAULT_WORKDAY


def _within_working_hours(calendar: dict, start: datetime, end: datetime) -> bool:
    """True if [start, end] lies on one working day inside that day's window."""
    if start.date() != end.date():
        # Overnight / multi-day meetings can never fit a single working day.
        return False

    window = _working_window(calendar, start)
    if window is None:
        return False

    midnight = datetime.combine(start.date(), datetime.min.time())
    opens = midnight + timedelta(hours=window[0])
    closes = midnight + timedelta(hours=window[1])
    return opens <= start and end <= closes


def check_scheduling_correctness(meeting_details: Optional[dict], scenario: dict) -> dict:
    """
    Verify scheduling correctness with hard checks.

    Args:
        meeting_details: Agent-proposed meeting with ISO "start_time" and
            "end_time" strings, or None/empty when nothing was booked.
        scenario: Scenario dict; ``scenario["calendar"]`` must provide
            "existing_meetings" and may provide "working_hours".

    Returns:
        ``{"checks": {...}, "score": float}`` where checks holds four booleans
        (meeting_provided, no_double_booking, within_working_hours,
        appropriate_duration) and score is the fraction that passed.
        No meeting scores 0.0; unparseable times score 0.25.
    """
    logger.debug("check_scheduling_correctness meeting_details: %s", meeting_details)
    logger.debug("check_scheduling_correctness calendar: %s", scenario.get("calendar") if scenario else None)

    if not meeting_details:
        return {
            "checks": {
                "meeting_provided": False,
                "no_double_booking": False,
                "within_working_hours": False,
                "appropriate_duration": False,
            },
            "score": 0.0,
        }

    calendar = scenario["calendar"]
    existing_meetings = calendar["existing_meetings"]

    results = {
        "meeting_provided": True,
        "no_double_booking": True,
        "within_working_hours": True,
        "appropriate_duration": True,
    }

    # Parse meeting times
    try:
        meeting_start = _parse_iso_naive(meeting_details["start_time"])
        meeting_end = _parse_iso_naive(meeting_details["end_time"])
    except (KeyError, ValueError, TypeError):
        return {
            "checks": {
                "meeting_provided": True,
                "no_double_booking": False,
                "within_working_hours": False,
                "appropriate_duration": False,
            },
            "score": 0.25,  # Some credit for trying
        }

    # Check 1: No double booking
    for existing in existing_meetings:
        try:
            existing_start = _parse_iso_naive(existing["start_time"])
            existing_end = _parse_iso_naive(existing["end_time"])
        except (KeyError, ValueError, TypeError):
            continue  # malformed calendar entry: cannot conflict with it
        # Half-open interval overlap: touching end-to-start is allowed.
        if meeting_start < existing_end and existing_start < meeting_end:
            results["no_double_booking"] = False
            break

    # Check 2: Within the calendar's working hours for that weekday
    results["within_working_hours"] = _within_working_hours(calendar, meeting_start, meeting_end)

    # Check 3: Appropriate duration (15 min to 2 hours)
    duration_minutes = (meeting_end - meeting_start).total_seconds() / 60
    if not (15 <= duration_minutes <= 120):
        results["appropriate_duration"] = False

    # Compute overall score
    score = sum(results.values()) / len(results)

    return {
        "checks": results,
        "score": score,
    }


# ============================================================
# REWARD FUNCTION 3: CONFLICT RESOLUTION
# ============================================================

def compute_conflict_resolution(action: dict, scenario: dict) -> float:
    """
    Score how well the agent handled scheduling conflicts.

    Args:
        action: Agent action with optional "calendar_action", "email_reply"
            and "meeting_details" (which may hold "proposed_alternatives").
            Missing or None values are treated as empty.
        scenario: Scenario dict; only "has_conflict" is read.

    Returns score 0.0 to 1.0.
    """
    has_conflict = scenario.get("has_conflict", False)
    calendar_action = action.get("calendar_action", "")
    email_reply = action.get("email_reply") or ""
    meeting_details = action.get("meeting_details")
    if not isinstance(meeting_details, dict):
        meeting_details = {}

    if not has_conflict:
        # No conflict — agent should just book
        if calendar_action == "book":
            return 1.0
        if calendar_action == "propose_alternatives":
            return 0.5  # Partial credit
        return 0.3

    score = 0.0

    # Agent should recognize the conflict
    if calendar_action in ["propose_alternatives", "reschedule"]:
        score += 0.4
    elif calendar_action == "book":
        # Small flat credit for acting at all; whether the booked slot
        # actually avoids the conflict is scored by the scheduling check.
        score += 0.1

    # Check if alternatives were provided
    alternatives = meeting_details.get("proposed_alternatives")
    if isinstance(alternatives, (list, tuple)) and alternatives:
        score += min(0.4, len(alternatives) * 0.2)  # 2 alts = 0.4, 3+ = capped at 0.4

    # Check if email mentions the conflict
    conflict_keywords = ["conflict", "already booked", "unavailable", "scheduled", "occupied", "another meeting"]
    reply_lower = email_reply.lower()
    if any(word in reply_lower for word in conflict_keywords):
        score += 0.2

    return min(1.0, score)


# ============================================================
# ANTI-REWARD HACKING: PENALTIES
# ============================================================

def apply_penalties(action: dict, scenario: dict) -> float:
    """
    Detect and penalize reward hacking behaviors.

    Args:
        action: Agent action with optional "email_reply", "calendar_action"
            and "meeting_details". A missing or None reply counts as empty.
        scenario: The active scenario (currently unused).

    Returns penalty amount (0.0 = no penalty, higher = worse), capped at 1.0.
    """
    penalty = 0.0
    email_reply = action.get("email_reply") or ""
    calendar_action = action.get("calendar_action", "")
    meeting_details = action.get("meeting_details")

    # Penalty 1: Email too short (lazy response)
    if len(email_reply.strip()) < 30:
        penalty += 0.3

    # Penalty 2: Claimed to book but no details provided
    if calendar_action == "book" and not meeting_details:
        penalty += 0.4

    # Penalty 3: Generic templated phrases
    generic_phrases = [
        "as per your request",
        "please find attached",
        "hope this helps",
        "let me know if you have any questions",
        "do not hesitate to contact",
    ]
    reply_lower = email_reply.lower()
    if any(phrase in reply_lower for phrase in generic_phrases):
        penalty += 0.10

    # Penalty 4: Overly long email (rambling)
    words = reply_lower.split()
    if len(words) > 200:
        penalty += 0.15

    # Penalty 5: Repeating the same content multiple times
    if len(words) > 20:
        word_diversity = len(set(words)) / len(words)
        if word_diversity < 0.4:  # Less than 40% unique words = repetitive
            penalty += 0.20

    return min(1.0, penalty)


# ============================================================
# HELPER FUNCTIONS
# ============================================================

def parse_time_slot(time_str: str) -> Optional[datetime]:
    """Parse ISO time string to datetime object; return None if it cannot be parsed."""
    try:
        return datetime.fromisoformat(time_str)
    except (ValueError, TypeError):
        return None


def format_time_slot(dt: datetime) -> str:
    """Format datetime to readable string, e.g. "Monday, April 27 at 02:00 PM"."""
    return dt.strftime("%A, %B %d at %I:%M %p")