Spaces:
Sleeping
Sleeping
| # Copyright (c) Meta Platforms, Inc. and affiliates. | |
| # All rights reserved. | |
| # | |
| # This source code is licensed under the BSD-style license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| """ | |
| Doc Quality Env Environment Implementation. | |
| A real-world environment for technical documentation quality assessment. | |
| Agents evaluate documentation for clarity, completeness, accuracy, and structure. | |
| """ | |
| from uuid import uuid4 | |
| from typing import List, Dict | |
| from openenv.core.env_server.interfaces import Environment | |
| from openenv.core.env_server.types import State | |
| try: | |
| from ..models import DocQualityAction, DocQualityObservation | |
| except ImportError: | |
| from models import DocQualityAction, DocQualityObservation | |
# Task definitions with ground truth issues.
#
# TASKS maps a task key to one review task. Every entry has the same keys:
#   "difficulty"   - difficulty label ("easy", "medium" or "hard")
#   "name"         - human-readable task title
#   "description"  - one-line summary of what the reviewer should look for
#   "doc"          - the documentation text under review (a runtime string;
#                    it is handed to the agent verbatim, so do not reformat it)
#   "known_issues" - ground-truth issue descriptions that agent submissions
#                    are compared against
#   "max_steps"    - step budget for the task
TASKS = {
    "easy_api_doc": {
        "difficulty": "easy",
        "name": "Simple API Documentation Review",
        "description": "Review API documentation for missing sections",
        "doc": """
# User API Documentation
## Overview
The User API provides endpoints to manage user accounts and profiles.
## Endpoints
### GET /users/{id}
Retrieves user information.
- Parameter: user_id (integer)
### POST /users
Creates a new user account.
### PUT /users/{id}
Updates user information.
""",
        "known_issues": [
            "Missing response format documentation (what fields are returned)",
            "Missing error codes documentation",
            "Missing authentication requirements",
            "Missing rate limiting information"
        ],
        "max_steps": 8,
    },
    "medium_api_doc": {
        "difficulty": "medium",
        "name": "Complex API Documentation Review",
        "description": "Identify clarity, completeness, and consistency issues",
        "doc": """
# Payment Processing API
## Overview
The Payment API handles all transactions. It's robust and secure.
## Authentication
Use Bearer token in headers.
## Endpoints
### POST /transactions
Process a payment transaction.
Parameters:
- amount: decimal
- currency: string
- account_id: integer
Response: transaction_id, status
### GET /transactions/{id}
Get transaction details.
Returns: All transaction information
### POST /refunds
Issues a refund for a transaction.
Parameter: original_transaction_id
Returns: refund_id, refund_status
## Error Handling
Errors are returned as JSON with an error field containing the error message.
## Rate Limiting
API has rate limits but details in another document.
""",
        "known_issues": [
            "Vague language: 'robust and secure' - needs specifics",
            "Missing required vs optional parameters",
            "Inconsistent response documentation format",
            "Missing timeout values",
            "No example requests/responses shown",
            "Unclear which endpoints require authentication",
            "Missing field type specifications for responses"
        ],
        "max_steps": 10,
    },
    "hard_guide_review": {
        "difficulty": "hard",
        "name": "Comprehensive Documentation Guide Review",
        "description": "Identify structural, consistency, and cross-reference issues",
        "doc": """
# Complete Developer Guide
## Chapter 1: Getting Started
Install the SDK: `pip install myservice-sdk`
## Chapter 2: Authentication
Three authentication methods:
1. API Keys
2. OAuth 2.0
3. Service Accounts
See Chapter 5 for detailed implementation.
## Chapter 3: Making Requests
All requests use HTTP/REST. Use the format from Chapter 4.
## Chapter 4: Response Formats
JSON responses include:
- data: object with results
- error: null on success
- timestamp: ISO 8601 format
See examples in Chapter 6.
## Chapter 5: Authentication Deep Dive
API Keys: Generate in dashboard, pass as 'Authorization: Bearer <key>'
OAuth: See Chapter 2 for overview
Service Accounts: Use for server-to-server auth
## Chapter 6: Examples
Coming soon...
## Chapter 7: Error Codes
200: Success
400: Bad request
401: Unauthorized
429: Rate limited
See error handling in Chapter 3.
## Appendix: FAQ
Q: How do I authenticate?
A: Use one of the three methods in Chapter 2
""",
        "known_issues": [
            "Chapter 6 (Examples) is incomplete - promised but marked 'Coming soon'",
            "Circular reference: Ch2 says see Ch5, Ch5 says see Ch2 for OAuth",
            "Missing chapter numbers: mentions 'Chapter 3' error handling but spread across multiple chapters",
            "Inconsistent terminology: sometimes 'API Key', sometimes 'key'",
            "Response format in Ch4 missing success status field definition",
            "No versioning information mentioned",
            "Missing SLA/availability information",
            "Chapter 5 OAuth description incomplete vs Chapter 2",
            "No mention of SDK vs REST trade-offs",
            "FAQ too brief - needs more questions"
        ],
        "max_steps": 12,
    }
}
# Characters stripped from both ends of a word before comparison, so that
# "language:" and "'robust" compare equal to "language" and "robust".
_KEYWORD_STRIP_CHARS = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"


def _issue_keywords(text: str) -> set:
    """Return the lowercase word set of *text* with edge punctuation removed."""
    stripped = (word.strip(_KEYWORD_STRIP_CHARS) for word in text.lower().split())
    return {word for word in stripped if word}


def calculate_issue_overlap(identified: List[str], known: List[str]) -> float:
    """Return the fraction of known issues matched by the identified issues.

    Two issue descriptions match when both are longer than 20 characters and
    the keywords they share make up more than 40% of the larger of the two
    keyword sets. Each known issue can be matched at most once, so repeating
    the same finding does not raise the result.

    Args:
        identified: Issue descriptions submitted by the agent.
        known: Ground-truth issue descriptions.

    Returns:
        ``1.0`` when ``known`` is empty, ``0.0`` when ``identified`` is empty,
        otherwise ``matches / len(known)`` capped at ``1.0``.
    """
    if not known:
        return 1.0
    if not identified:
        return 0.0

    # Build each known issue's keyword set once. Entries that are too short
    # or contain no words can never match, so they are left out.
    unmatched = []
    for known_issue in known:
        if len(known_issue) > 20:
            known_words = _issue_keywords(known_issue)
            if known_words:
                unmatched.append(known_words)

    matches = 0
    for identified_issue in identified:
        if len(identified_issue) <= 20:
            continue
        identified_words = _issue_keywords(identified_issue)
        if not identified_words:
            # Nothing to compare; also avoids dividing by an empty set size.
            continue
        for position, known_words in enumerate(unmatched):
            shared = len(identified_words & known_words)
            larger = max(len(identified_words), len(known_words))
            if shared / larger > 0.4:  # more than 40% keyword overlap
                matches += 1
                # Consume the known issue so a duplicate cannot match it again.
                del unmatched[position]
                break

    return min(1.0, matches / len(known))
class DocQualityEnvironment(Environment):
    """
    Documentation Quality Assessment Environment.

    Agents evaluate technical documentation and identify quality issues.
    This is a real-world task: technical writers and product teams use similar processes
    to improve their documentation.

    The environment provides three tasks of increasing difficulty:
    1. Simple API docs (easy) - find missing sections
    2. Complex API docs (medium) - find clarity and completeness issues
    3. Guide structure (hard) - find structural and cross-reference issues

    Supported ``action_type`` values in :meth:`step` are ``"identify_issue"``,
    ``"suggest_improvement"`` and ``"rate_quality"``; ``"rate_quality"`` ends
    the episode.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self):
        """Initialize the environment with the default (easy) task selected."""
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._current_task_key: str = "easy_api_doc"
        self._current_task: Dict = TASKS[self._current_task_key]  # Initialize with default task
        self._identified_issues: List[str] = []
        self._final_score: float = 0.0
        # True once the agent has submitted a valid rating this episode, so a
        # deliberate rating of 0.0 is not mistaken for "no rating yet".
        self._has_rating: bool = False
        self._episode_rewards: List[float] = []

    def reset(self) -> DocQualityObservation:
        """Start a new episode on the easy task and return the first observation.

        All per-episode bookkeeping (episode id, step count, identified issues,
        final score, rewards) is cleared. The first observation exposes only
        the first two known issues as hints.
        """
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._identified_issues = []
        self._final_score = 0.0
        self._has_rating = False
        self._episode_rewards = []

        # Start with easy task
        self._current_task_key = "easy_api_doc"
        self._current_task = TASKS[self._current_task_key]

        return DocQualityObservation(
            task_name=self._current_task["name"],
            task_difficulty=self._current_task["difficulty"],  # type: ignore
            current_doc=self._current_task["doc"],
            doc_section="Overview",
            issues_identified=[],
            known_issues=self._current_task["known_issues"][:2],  # Show 2 hints initially
            quality_score=0.0,
            step_count=0,
            max_steps=self._current_task["max_steps"],
            feedback="Task started. Analyze the documentation and identify quality issues.",
            done=False,
            reward=0.0,
        )

    def step(self, action: DocQualityAction) -> DocQualityObservation:  # type: ignore[override]
        """Execute one step of documentation review.

        Rewards by action type:
            * ``identify_issue``: 0.2 when the description matches a known
              issue, 0.1 when it does not, 0.0 when it is 10 characters or
              shorter (and then it is not recorded).
            * ``suggest_improvement``: 0.15 when longer than 15 characters,
              otherwise 0.0.
            * ``rate_quality``: the first whitespace-separated token of the
              content is parsed as a float and clamped to [0, 1]; the reward
              is ``0.25 * (1 - |rating - true_score|)`` and the episode ends.
              Unparseable content yields 0.0 and does not end the episode.
            * anything else: 0.0.

        The episode also ends when the step count reaches the task's
        ``max_steps``; if no rating was given, the final score is derived
        from the issues found.

        Args:
            action: The agent's action (``action_type`` and ``content``).

        Returns:
            The observation after applying the action.
        """
        self._state.step_count += 1
        reward = 0.0
        feedback = ""
        done = False

        if action.action_type == "identify_issue":
            # Agent found an issue
            if action.content and len(action.content) > 10:
                self._identified_issues.append(action.content)
                known_issues = self._current_task["known_issues"]
                # With a single submitted issue the overlap is either 0 or
                # 1 / len(known_issues), so any positive value means a match.
                # (Comparing against 0.4 made this branch unreachable for
                # every task with more than two known issues.)
                overlap = calculate_issue_overlap([action.content], known_issues)
                if overlap > 0.0:
                    reward = 0.2  # Good identification
                    feedback = "Valid issue identified."
                else:
                    reward = 0.1  # Partial credit for effort
                    feedback = "Issue noted, but may not be significant."
            else:
                feedback = "Issue description too vague."
        elif action.action_type == "suggest_improvement":
            # Agent suggests how to fix an issue
            if action.content and len(action.content) > 15:
                reward = 0.15  # Credit for constructive suggestions
                feedback = "Good improvement suggestion."
            else:
                feedback = "Suggestion too vague."
        elif action.action_type == "rate_quality":
            # Agent provides final quality assessment
            try:
                # Only the parse can legitimately fail: non-numeric text,
                # whitespace-only content, or non-string content.
                score = float(action.content.split()[0]) if action.content else 0.0
            except (ValueError, IndexError, AttributeError):
                feedback = "Invalid quality rating format."
            else:
                score = max(0.0, min(1.0, score))
                # Reward how close the agent's rating is to the computed score.
                true_score = self._calculate_true_score()
                accuracy = 1.0 - abs(score - true_score)
                reward = accuracy * 0.25
                self._final_score = score
                self._has_rating = True
                feedback = f"Quality rated at {score:.2f}. Episode complete."
                done = True
        else:
            feedback = "Unknown action type."

        self._episode_rewards.append(reward)

        # Check if max steps reached
        if self._state.step_count >= self._current_task["max_steps"]:
            done = True
            if not self._has_rating and not self._final_score:
                # Auto-score based on issues found; never overrides a rating
                # the agent supplied, even one of 0.0.
                self._final_score = self._calculate_true_score()

        # NOTE(review): unlike reset(), which shows two hints, this returns the
        # full known_issues list to the agent - confirm that exposing the
        # ground truth after the first step is intended.
        obs = DocQualityObservation(
            task_name=self._current_task["name"],
            task_difficulty=self._current_task["difficulty"],  # type: ignore
            current_doc=self._current_task["doc"],
            doc_section=f"Section {self._state.step_count}",
            # Copy so later steps cannot mutate an already-returned observation.
            issues_identified=list(self._identified_issues),
            known_issues=self._current_task["known_issues"],
            quality_score=self._final_score,
            step_count=self._state.step_count,
            max_steps=self._current_task["max_steps"],
            feedback=feedback,
            done=done,
            reward=reward,
            metadata={
                "issues_count": len(self._identified_issues),
                "step": self._state.step_count,
                "total_reward": sum(self._episode_rewards),
            }
        )
        return obs

    def _calculate_true_score(self) -> float:
        """Calculate the reference quality score from the issues found so far.

        Returns ``1.0`` when the task has no known issues and ``0.0`` when
        nothing has been identified. Otherwise the score is the number of
        identified issues (capped at the number of known issues) divided by
        the number of known issues, plus ``0.2`` times the keyword overlap
        with the known issues, capped at ``1.0``.
        """
        known_issues = self._current_task["known_issues"]
        if not known_issues:
            return 1.0
        if not self._identified_issues:
            return 0.0

        # How well the submissions match the ground truth.
        overlap = calculate_issue_overlap(self._identified_issues, known_issues)

        # Base score on how many issues were submitted.
        # NOTE(review): this counts every recorded issue whether or not it
        # matches a known one - confirm this is the intended scoring.
        found_count = min(len(self._identified_issues), len(known_issues))
        base_score = found_count / len(known_issues)

        # Bonus for quality overlap
        bonus = overlap * 0.2
        return min(1.0, base_score + bonus)

    # NOTE(review): this is a plain method; if the Environment base class
    # expects `state` to be a property, callers reading `env.state` receive a
    # bound method instead of a State - confirm against the base interface.
    def state(self) -> State:
        """Get the current environment state (episode id and step count)."""
        return self._state