# doc_quality_env/server/doc_quality_env_environment.py
# Uploaded by akiii1234 using huggingface_hub (commit 02c0b36, verified).
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""
Doc Quality Env Environment Implementation.
A real-world environment for technical documentation quality assessment.
Agents evaluate documentation for clarity, completeness, accuracy, and structure.
"""
from uuid import uuid4
from typing import List, Dict
from openenv.core.env_server.interfaces import Environment
from openenv.core.env_server.types import State
try:
from ..models import DocQualityAction, DocQualityObservation
except ImportError:
from models import DocQualityAction, DocQualityObservation
# Task definitions with ground-truth issues, keyed by task id.
#
# Every entry has the same shape:
#   difficulty   - "easy" | "medium" | "hard" label reported in observations
#   name         - human-readable task title
#   description  - one-line summary of what the reviewer should look for
#   doc          - the documentation text (Markdown) the agent must review
#   known_issues - ground-truth issue descriptions that agent submissions are
#                  compared against by calculate_issue_overlap()
#   max_steps    - step budget after which the episode is forced to end
TASKS = {
# Easy: a short API reference in which whole sections are missing.
"easy_api_doc": {
"difficulty": "easy",
"name": "Simple API Documentation Review",
"description": "Review API documentation for missing sections",
"doc": """
# User API Documentation
## Overview
The User API provides endpoints to manage user accounts and profiles.
## Endpoints
### GET /users/{id}
Retrieves user information.
- Parameter: user_id (integer)
### POST /users
Creates a new user account.
### PUT /users/{id}
Updates user information.
""",
"known_issues": [
"Missing response format documentation (what fields are returned)",
"Missing error codes documentation",
"Missing authentication requirements",
"Missing rate limiting information"
],
"max_steps": 8,
},
# Medium: a fuller API reference with clarity, completeness and consistency gaps.
"medium_api_doc": {
"difficulty": "medium",
"name": "Complex API Documentation Review",
"description": "Identify clarity, completeness, and consistency issues",
"doc": """
# Payment Processing API
## Overview
The Payment API handles all transactions. It's robust and secure.
## Authentication
Use Bearer token in headers.
## Endpoints
### POST /transactions
Process a payment transaction.
Parameters:
- amount: decimal
- currency: string
- account_id: integer
Response: transaction_id, status
### GET /transactions/{id}
Get transaction details.
Returns: All transaction information
### POST /refunds
Issues a refund for a transaction.
Parameter: original_transaction_id
Returns: refund_id, refund_status
## Error Handling
Errors are returned as JSON with an error field containing the error message.
## Rate Limiting
API has rate limits but details in another document.
""",
"known_issues": [
"Vague language: 'robust and secure' - needs specifics",
"Missing required vs optional parameters",
"Inconsistent response documentation format",
"Missing timeout values",
"No example requests/responses shown",
"Unclear which endpoints require authentication",
"Missing field type specifications for responses"
],
"max_steps": 10,
},
# Hard: a multi-chapter guide with structural and cross-reference problems.
"hard_guide_review": {
"difficulty": "hard",
"name": "Comprehensive Documentation Guide Review",
"description": "Identify structural, consistency, and cross-reference issues",
"doc": """
# Complete Developer Guide
## Chapter 1: Getting Started
Install the SDK: `pip install myservice-sdk`
## Chapter 2: Authentication
Three authentication methods:
1. API Keys
2. OAuth 2.0
3. Service Accounts
See Chapter 5 for detailed implementation.
## Chapter 3: Making Requests
All requests use HTTP/REST. Use the format from Chapter 4.
## Chapter 4: Response Formats
JSON responses include:
- data: object with results
- error: null on success
- timestamp: ISO 8601 format
See examples in Chapter 6.
## Chapter 5: Authentication Deep Dive
API Keys: Generate in dashboard, pass as 'Authorization: Bearer <key>'
OAuth: See Chapter 2 for overview
Service Accounts: Use for server-to-server auth
## Chapter 6: Examples
Coming soon...
## Chapter 7: Error Codes
200: Success
400: Bad request
401: Unauthorized
429: Rate limited
See error handling in Chapter 3.
## Appendix: FAQ
Q: How do I authenticate?
A: Use one of the three methods in Chapter 2
""",
"known_issues": [
"Chapter 6 (Examples) is incomplete - promised but marked 'Coming soon'",
"Circular reference: Ch2 says see Ch5, Ch5 says see Ch2 for OAuth",
"Missing chapter numbers: mentions 'Chapter 3' error handling but spread across multiple chapters",
"Inconsistent terminology: sometimes 'API Key', sometimes 'key'",
"Response format in Ch4 missing success status field definition",
"No versioning information mentioned",
"Missing SLA/availability information",
"Chapter 5 OAuth description incomplete vs Chapter 2",
"No mention of SDK vs REST trade-offs",
"FAQ too brief - needs more questions"
],
"max_steps": 12,
}
}
def calculate_issue_overlap(
    identified: List[str],
    known: List[str],
    *,
    min_overlap: float = 0.4,
    min_chars: int = 20,
) -> float:
    """Return the fraction of known issues covered by the identified issues.

    Matching is a simple bag-of-words comparison: two descriptions match when
    the number of shared lower-cased, whitespace-separated words divided by
    the size of the larger word set exceeds ``min_overlap``.

    Matching is one-to-one: each identified issue claims at most one known
    issue (the first unclaimed one it matches) and each known issue can be
    claimed only once, so repeating the same finding does not raise the score.

    Args:
        identified: Issue descriptions submitted by the agent.
        known: Ground-truth issue descriptions.
        min_overlap: Word-overlap ratio that must be exceeded for a match.
        min_chars: Descriptions of this many characters or fewer are never
            matched (on either side).

    Returns:
        A value in ``[0.0, 1.0]``. ``1.0`` when ``known`` is empty, ``0.0``
        when ``known`` is non-empty but ``identified`` is empty.
    """
    if not known:
        return 1.0
    if not identified:
        return 0.0

    # Tokenise each known issue once instead of once per identified issue.
    # Too-short entries get an empty set so that they can never match.
    known_word_sets = [
        set(issue.lower().split()) if len(issue) > min_chars else set()
        for issue in known
    ]

    claimed = set()  # indices into `known` that already have a match
    for issue in identified:
        if len(issue) <= min_chars:
            continue
        words = set(issue.lower().split())
        if not words:
            # Whitespace-only text: nothing to compare (and avoids 0/0 below).
            continue
        for index, candidate in enumerate(known_word_sets):
            if index in claimed or not candidate:
                continue
            overlap = len(words & candidate) / max(len(words), len(candidate))
            if overlap > min_overlap:
                claimed.add(index)
                break

    return len(claimed) / len(known)
class DocQualityEnvironment(Environment):
    """
    Documentation Quality Assessment Environment.

    Agents evaluate technical documentation and identify quality issues.
    This is a real-world task: technical writers and product teams use similar
    processes to improve their documentation.

    The environment provides three tasks of increasing difficulty:
    1. Simple API docs (easy) - find missing sections
    2. Complex API docs (medium) - find clarity and completeness issues
    3. Guide structure (hard) - find structural and cross-reference issues

    Supported action types (``DocQualityAction.action_type``):
    - ``identify_issue``: record an issue description (``content``).
    - ``suggest_improvement``: propose a fix (``content``).
    - ``rate_quality``: give a final score; the first whitespace-separated
      token of ``content`` is parsed as a float and clamped to [0, 1]. This
      ends the episode.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self):
        """Initialize the environment with the easy task loaded."""
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._current_task_key: str = "easy_api_doc"
        self._current_task: Dict = TASKS[self._current_task_key]
        self._identified_issues: List[str] = []
        self._final_score: float = 0.0
        # Tracks whether the agent explicitly rated the document, so that a
        # legitimate rating of 0.0 is not mistaken for "no rating yet".
        self._score_rated: bool = False
        self._episode_rewards: List[float] = []

    def reset(self, task_key: str = "easy_api_doc") -> DocQualityObservation:
        """Start a new episode.

        Args:
            task_key: Key of the task in ``TASKS`` to load. Defaults to the
                easy task, which is what a bare ``reset()`` has always used.

        Returns:
            The initial observation, exposing the first two known issues as
            hints.

        Raises:
            ValueError: If ``task_key`` is not a key of ``TASKS``.
        """
        # Validate before touching any state so a bad key leaves the
        # previous episode intact.
        if task_key not in TASKS:
            raise ValueError(
                f"Unknown task {task_key!r}; expected one of {sorted(TASKS)}"
            )

        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._identified_issues = []
        self._final_score = 0.0
        self._score_rated = False
        self._episode_rewards = []
        self._current_task_key = task_key
        self._current_task = TASKS[task_key]

        return DocQualityObservation(
            task_name=self._current_task["name"],
            task_difficulty=self._current_task["difficulty"],  # type: ignore
            current_doc=self._current_task["doc"],
            doc_section="Overview",
            issues_identified=[],
            known_issues=self._current_task["known_issues"][:2],  # Show 2 hints initially
            quality_score=0.0,
            step_count=0,
            max_steps=self._current_task["max_steps"],
            feedback="Task started. Analyze the documentation and identify quality issues.",
            done=False,
            reward=0.0,
        )

    def step(self, action: DocQualityAction) -> DocQualityObservation:  # type: ignore[override]
        """Execute one step of documentation review.

        Args:
            action: The agent's action; ``action_type`` selects the branch and
                ``content`` carries the issue text, suggestion, or rating.

        Returns:
            An observation with the step's reward and feedback. ``done`` is
            set once the agent rates the document or the task's ``max_steps``
            is reached.
        """
        self._state.step_count += 1
        reward = 0.0
        feedback = ""
        done = False

        if action.action_type == "identify_issue":
            if action.content and len(action.content) > 10:
                self._identified_issues.append(action.content)
                known_issues = self._current_task["known_issues"]
                overlap = calculate_issue_overlap([action.content], known_issues)
                # A single submission can cover at most one known issue, so
                # the overlap is at most 1 / len(known_issues). Any positive
                # value therefore means the submission matched something.
                if overlap > 0:
                    reward = 0.2  # Good identification
                    feedback = "Valid issue identified."
                else:
                    reward = 0.1  # Partial credit for effort
                    feedback = "Issue noted, but may not be significant."
            else:
                feedback = "Issue description too vague."

        elif action.action_type == "suggest_improvement":
            if action.content and len(action.content) > 15:
                reward = 0.15  # Credit for constructive suggestions
                feedback = "Good improvement suggestion."
            else:
                feedback = "Suggestion too vague."

        elif action.action_type == "rate_quality":
            try:
                # Only the first token is parsed, so "0.7 because ..." works.
                # Empty content is treated as a rating of 0.0.
                score = float(action.content.split()[0]) if action.content else 0.0
            except (AttributeError, IndexError, TypeError, ValueError):
                # Non-string, whitespace-only, or non-numeric content.
                feedback = "Invalid quality rating format."
            else:
                score = max(0.0, min(1.0, score))
                # Reward is proportional to how close the rating is to the
                # environment's own score.
                true_score = self._calculate_true_score()
                accuracy = 1.0 - abs(score - true_score)
                reward = accuracy * 0.25
                self._final_score = score
                self._score_rated = True
                feedback = f"Quality rated at {score:.2f}. Episode complete."
                done = True

        else:
            feedback = "Unknown action type."

        self._episode_rewards.append(reward)

        # Force termination at the step budget; if the agent never rated the
        # document, fall back to the environment's own score.
        if self._state.step_count >= self._current_task["max_steps"]:
            done = True
            if not self._score_rated:
                self._final_score = self._calculate_true_score()

        # NOTE(review): unlike reset(), which exposes only two hints, every
        # step observation carries the full ground-truth list - confirm this
        # is intended.
        return DocQualityObservation(
            task_name=self._current_task["name"],
            task_difficulty=self._current_task["difficulty"],  # type: ignore
            current_doc=self._current_task["doc"],
            doc_section=f"Section {self._state.step_count}",
            # Copy so later steps cannot mutate an observation already returned.
            issues_identified=list(self._identified_issues),
            known_issues=self._current_task["known_issues"],
            quality_score=self._final_score,
            step_count=self._state.step_count,
            max_steps=self._current_task["max_steps"],
            feedback=feedback,
            done=done,
            reward=reward,
            metadata={
                "issues_count": len(self._identified_issues),
                "step": self._state.step_count,
                "total_reward": sum(self._episode_rewards),
            },
        )

    def _calculate_true_score(self) -> float:
        """Calculate the environment's own quality score for the episode.

        The base score is the number of recorded issues (capped at the number
        of known issues, and counted whether or not they matched) divided by
        the number of known issues. A bonus of up to 0.2 is added in
        proportion to the keyword overlap with the known issues.

        Returns:
            A value in ``[0.0, 1.0]``; ``1.0`` if the task has no known
            issues and ``0.0`` if nothing has been identified yet.
        """
        known_issues = self._current_task["known_issues"]
        if not known_issues:
            return 1.0
        if not self._identified_issues:
            return 0.0

        overlap = calculate_issue_overlap(self._identified_issues, known_issues)
        found_count = min(len(self._identified_issues), len(known_issues))
        base_score = found_count / len(known_issues)
        bonus = overlap * 0.2
        return min(1.0, base_score + bonus)

    @property
    def state(self) -> State:
        """Get the current environment state."""
        return self._state