# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Doc Quality Env Environment Implementation.

A real-world environment for technical documentation quality assessment.
Agents evaluate documentation for clarity, completeness, accuracy, and structure.
"""

from uuid import uuid4
from typing import List, Dict

from openenv.core.env_server.interfaces import Environment
from openenv.core.env_server.types import State

try:
    from ..models import DocQualityAction, DocQualityObservation
except ImportError:
    from models import DocQualityAction, DocQualityObservation


# Task definitions with ground truth issues.
# Each task carries the document under review ("doc"), the ground-truth
# "known_issues" used for scoring, and the episode step budget ("max_steps").
TASKS = {
    "easy_api_doc": {
        "difficulty": "easy",
        "name": "Simple API Documentation Review",
        "description": "Review API documentation for missing sections",
        "doc": """
# User API Documentation

## Overview
The User API provides endpoints to manage user accounts and profiles.

## Endpoints

### GET /users/{id}
Retrieves user information.
- Parameter: user_id (integer)

### POST /users
Creates a new user account.

### PUT /users/{id}
Updates user information.
""",
        "known_issues": [
            "Missing response format documentation (what fields are returned)",
            "Missing error codes documentation",
            "Missing authentication requirements",
            "Missing rate limiting information"
        ],
        "max_steps": 8,
    },
    "medium_api_doc": {
        "difficulty": "medium",
        "name": "Complex API Documentation Review",
        "description": "Identify clarity, completeness, and consistency issues",
        "doc": """
# Payment Processing API

## Overview
The Payment API handles all transactions. It's robust and secure.

## Authentication
Use Bearer token in headers.

## Endpoints

### POST /transactions
Process a payment transaction.
Parameters:
- amount: decimal
- currency: string
- account_id: integer

Response: transaction_id, status

### GET /transactions/{id}
Get transaction details.
Returns: All transaction information

### POST /refunds
Issues a refund for a transaction.
Parameter: original_transaction_id
Returns: refund_id, refund_status

## Error Handling
Errors are returned as JSON with an error field containing the error message.

## Rate Limiting
API has rate limits but details in another document.
""",
        "known_issues": [
            "Vague language: 'robust and secure' - needs specifics",
            "Missing required vs optional parameters",
            "Inconsistent response documentation format",
            "Missing timeout values",
            "No example requests/responses shown",
            "Unclear which endpoints require authentication",
            "Missing field type specifications for responses"
        ],
        "max_steps": 10,
    },
    "hard_guide_review": {
        "difficulty": "hard",
        "name": "Comprehensive Documentation Guide Review",
        "description": "Identify structural, consistency, and cross-reference issues",
        "doc": """
# Complete Developer Guide

## Chapter 1: Getting Started
Install the SDK: `pip install myservice-sdk`

## Chapter 2: Authentication
Three authentication methods:
1. API Keys
2. OAuth 2.0
3. Service Accounts

See Chapter 5 for detailed implementation.

## Chapter 3: Making Requests
All requests use HTTP/REST. Use the format from Chapter 4.

## Chapter 4: Response Formats
JSON responses include:
- data: object with results
- error: null on success
- timestamp: ISO 8601 format

See examples in Chapter 6.

## Chapter 5: Authentication Deep Dive
API Keys: Generate in dashboard, pass as 'Authorization: Bearer '
OAuth: See Chapter 2 for overview
Service Accounts: Use for server-to-server auth

## Chapter 6: Examples
Coming soon...

## Chapter 7: Error Codes
200: Success
400: Bad request
401: Unauthorized
429: Rate limited

See error handling in Chapter 3.

## Appendix: FAQ
Q: How do I authenticate?
A: Use one of the three methods in Chapter 2
""",
        "known_issues": [
            "Chapter 6 (Examples) is incomplete - promised but marked 'Coming soon'",
            "Circular reference: Ch2 says see Ch5, Ch5 says see Ch2 for OAuth",
            "Missing chapter numbers: mentions 'Chapter 3' error handling but spread across multiple chapters",
            "Inconsistent terminology: sometimes 'API Key', sometimes 'key'",
            "Response format in Ch4 missing success status field definition",
            "No versioning information mentioned",
            "Missing SLA/availability information",
            "Chapter 5 OAuth description incomplete vs Chapter 2",
            "No mention of SDK vs REST trade-offs",
            "FAQ too brief - needs more questions"
        ],
        "max_steps": 12,
    }
}

# Issue texts at or below this length are never compared for keyword overlap.
_MIN_ISSUE_LENGTH = 20

# Fraction of shared keywords (relative to the larger keyword set) that must be
# exceeded for two issue descriptions to count as the same issue.
_KEYWORD_OVERLAP_THRESHOLD = 0.4


def _issues_match(identified_issue: str, known_issue: str) -> bool:
    """Return True when two issue descriptions share enough keywords.

    Both texts must be longer than ``_MIN_ISSUE_LENGTH`` characters. They are
    lower-cased and split on whitespace; the match succeeds when the number of
    shared words divided by the size of the larger word set exceeds
    ``_KEYWORD_OVERLAP_THRESHOLD``.
    """
    if len(identified_issue) <= _MIN_ISSUE_LENGTH or len(known_issue) <= _MIN_ISSUE_LENGTH:
        return False

    identified_words = set(identified_issue.lower().split())
    known_words = set(known_issue.lower().split())
    # Whitespace-only text is long enough to pass the length check but yields
    # no words; bail out instead of dividing by zero below.
    if not identified_words or not known_words:
        return False

    shared = len(identified_words & known_words)
    overlap = shared / max(len(identified_words), len(known_words))
    return overlap > _KEYWORD_OVERLAP_THRESHOLD


def calculate_issue_overlap(identified: List[str], known: List[str]) -> float:
    """Calculate how well identified issues match known issues.

    Matching is a keyword-overlap heuristic (see ``_issues_match``), not a
    semantic comparison.

    Args:
        identified: Issue descriptions submitted by the agent.
        known: Ground-truth issue descriptions for the task.

    Returns:
        The fraction of ``known`` issues matched by at least one entry of
        ``identified``, in ``[0.0, 1.0]``. Returns 1.0 when ``known`` is empty
        and 0.0 when ``identified`` is empty.
    """
    if not known:
        return 1.0
    if not identified:
        return 0.0

    # Each known issue is credited at most once, so repeating the same
    # identified issue cannot inflate the score.
    credited = set()
    for identified_issue in identified:
        for index, known_issue in enumerate(known):
            if index not in credited and _issues_match(identified_issue, known_issue):
                credited.add(index)
                break

    return len(credited) / len(known)


class DocQualityEnvironment(Environment):
    """
    Documentation Quality Assessment Environment.

    Agents evaluate technical documentation and identify quality issues.
    This is a real-world task: technical writers and product teams use
    similar processes to improve their documentation.

    The environment provides three tasks of increasing difficulty:
    1. Simple API docs (easy) - find missing sections
    2. Complex API docs (medium) - find clarity and completeness issues
    3. Guide structure (hard) - find structural and cross-reference issues
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self):
        """Initialize the environment with the default (easy) task."""
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._current_task_key: str = "easy_api_doc"
        self._current_task: Dict = TASKS[self._current_task_key]  # Initialize with default task
        self._identified_issues: List[str] = []
        self._final_score: float = 0.0
        # Tracks whether the agent submitted a rating, so that a legitimate
        # rating of 0.0 is not mistaken for "no rating yet".
        self._quality_rated: bool = False
        self._episode_rewards: List[float] = []

    def reset(self) -> DocQualityObservation:
        """Reset the environment to start a new task.

        Starts a fresh episode (new episode id, step count 0), clears all
        per-episode bookkeeping and selects the easy task.

        Returns:
            The initial observation, which exposes the first two known issues
            as hints.
        """
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._identified_issues = []
        self._final_score = 0.0
        self._quality_rated = False
        self._episode_rewards = []

        # Start with easy task
        self._current_task_key = "easy_api_doc"
        self._current_task = TASKS[self._current_task_key]

        return DocQualityObservation(
            task_name=self._current_task["name"],
            task_difficulty=self._current_task["difficulty"],  # type: ignore
            current_doc=self._current_task["doc"],
            doc_section="Overview",
            issues_identified=[],
            known_issues=self._current_task["known_issues"][:2],  # Show 2 hints initially
            quality_score=0.0,
            step_count=0,
            max_steps=self._current_task["max_steps"],
            feedback="Task started. Analyze the documentation and identify quality issues.",
            done=False,
            reward=0.0,
        )

    def step(self, action: DocQualityAction) -> DocQualityObservation:  # type: ignore[override]
        """Execute one step of documentation review.

        Supported ``action.action_type`` values:

        - ``"identify_issue"``: records ``action.content`` when it is longer
          than 10 characters; reward 0.2 if it matches a known issue, else 0.1.
        - ``"suggest_improvement"``: reward 0.15 when ``action.content`` is
          longer than 15 characters.
        - ``"rate_quality"``: parses the first whitespace-separated token of
          ``action.content`` as a score, clamps it to [0, 1], rewards up to
          0.25 by closeness to the computed score, and ends the episode.

        Any other action type yields zero reward. The episode also ends once
        the task's ``max_steps`` is reached.

        Args:
            action: The agent's action for this step.

        Returns:
            The observation after applying the action.
        """
        self._state.step_count += 1

        reward = 0.0
        feedback = ""
        done = False

        if action.action_type == "identify_issue":
            # Agent found an issue
            if action.content and len(action.content) > 10:
                self._identified_issues.append(action.content)

                # Compare this single submission against every known issue.
                # (Going through calculate_issue_overlap would cap the result
                # at 1 / len(known_issues), which never clears the threshold.)
                known_issues = self._current_task["known_issues"]
                if any(_issues_match(action.content, known) for known in known_issues):
                    reward = 0.2  # Good identification
                    feedback = "Valid issue identified."
                else:
                    reward = 0.1  # Partial credit for effort
                    feedback = "Issue noted, but may not be significant."
            else:
                reward = 0.0
                feedback = "Issue description too vague."

        elif action.action_type == "suggest_improvement":
            # Agent suggests how to fix an issue
            if action.content and len(action.content) > 15:
                reward = 0.15  # Credit for constructive suggestions
                feedback = "Good improvement suggestion."
            else:
                reward = 0.0
                feedback = "Suggestion too vague."

        elif action.action_type == "rate_quality":
            # Agent provides final quality assessment
            try:
                # Parse the content to extract a score (first token only)
                score = float(action.content.split()[0]) if action.content else 0.0
            except (AttributeError, IndexError, TypeError, ValueError):
                # Non-string, whitespace-only or non-numeric content.
                feedback = "Invalid quality rating format."
                reward = 0.0
            else:
                score = max(0.0, min(1.0, score))

                # Calculate accuracy of their assessment
                true_score = self._calculate_true_score()
                accuracy = 1.0 - abs(score - true_score)
                reward = accuracy * 0.25

                self._final_score = score
                self._quality_rated = True
                feedback = f"Quality rated at {score:.2f}. Episode complete."
                done = True

        else:
            feedback = "Unknown action type."
            reward = 0.0

        self._episode_rewards.append(reward)

        # Check if max steps reached
        if self._state.step_count >= self._current_task["max_steps"]:
            done = True
            if not self._quality_rated:
                # Auto-score based on issues found
                self._final_score = self._calculate_true_score()

        # Prepare observation. Lists are copied so the observation does not
        # alias the environment's internal state or the TASKS ground truth.
        # NOTE(review): reset() exposes only two known issues as hints, while
        # every step exposes the full list - confirm this is intended.
        obs = DocQualityObservation(
            task_name=self._current_task["name"],
            task_difficulty=self._current_task["difficulty"],  # type: ignore
            current_doc=self._current_task["doc"],
            doc_section=f"Section {self._state.step_count}",
            issues_identified=list(self._identified_issues),
            known_issues=list(self._current_task["known_issues"]),
            quality_score=self._final_score,
            step_count=self._state.step_count,
            max_steps=self._current_task["max_steps"],
            feedback=feedback,
            done=done,
            reward=reward,
            metadata={
                "issues_count": len(self._identified_issues),
                "step": self._state.step_count,
                "total_reward": sum(self._episode_rewards),
            }
        )

        return obs

    def _calculate_true_score(self) -> float:
        """Calculate the reference quality score from the issues found so far.

        Returns:
            1.0 when the task has no known issues, 0.0 when nothing has been
            identified, otherwise ``min(1.0, base + bonus)`` where ``base`` is
            the number of identified issues (capped at the number of known
            issues) divided by the number of known issues, and ``bonus`` is
            0.2 times the keyword overlap with the known issues.
        """
        known_issues = self._current_task["known_issues"]
        if not known_issues:
            return 1.0

        if not self._identified_issues:
            return 0.0

        # Fraction of known issues actually matched by the submissions
        overlap = calculate_issue_overlap(self._identified_issues, known_issues)

        # Base score counts submissions, whether or not they match.
        # NOTE(review): this rewards quantity over accuracy - confirm intended.
        found_count = min(len(self._identified_issues), len(known_issues))
        base_score = found_count / len(known_issues)

        # Bonus for submissions that match known issues
        bonus = overlap * 0.2

        return min(1.0, base_score + bonus)

    @property
    def state(self) -> State:
        """Get the current environment state."""
        return self._state