Spaces:

akiii1234
/

doc_quality_env

Sleeping

App Files Files Community

doc_quality_env / server /doc_quality_env_environment.py

akiii1234

Upload folder using huggingface_hub

02c0b36 verified about 2 months ago

raw

history blame contribute delete

12.5 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	# All rights reserved.
	#
	# This source code is licensed under the BSD-style license found in the
	# LICENSE file in the root directory of this source tree.

	"""
	Doc Quality Env Environment Implementation.

	A real-world environment for technical documentation quality assessment.
	Agents evaluate documentation for clarity, completeness, accuracy, and structure.
	"""

	from uuid import uuid4
	from typing import List, Dict
	from openenv.core.env_server.interfaces import Environment
	from openenv.core.env_server.types import State

	try:
	from ..models import DocQualityAction, DocQualityObservation
	except ImportError:
	from models import DocQualityAction, DocQualityObservation


	# Task definitions with ground truth issues.
	#
	# TASKS maps a task key to a dict with these entries:
	#   "difficulty"   - difficulty label ("easy" / "medium" / "hard")
	#   "name"         - human-readable task title, surfaced as task_name
	#   "description"  - one-line summary of what the reviewer should look for
	#   "doc"          - the markdown document the agent is asked to review
	#   "known_issues" - ground-truth issue descriptions; agent submissions are
	#                    compared against these by calculate_issue_overlap()
	#   "max_steps"    - step budget; step() marks the episode done once the
	#                    step count reaches this value
	#
	# NOTE(review): in the code visible in this file, only "easy_api_doc" is ever
	# selected (by __init__ and reset()); nothing here switches to the medium or
	# hard task - confirm whether task selection is meant to live elsewhere.
	TASKS = {
	# Easy task: a short API reference with whole sections missing.
	"easy_api_doc": {
	"difficulty": "easy",
	"name": "Simple API Documentation Review",
	"description": "Review API documentation for missing sections",
	"doc": """
	# User API Documentation

	## Overview
	The User API provides endpoints to manage user accounts and profiles.

	## Endpoints

	### GET /users/{id}
	Retrieves user information.
	- Parameter: user_id (integer)

	### POST /users
	Creates a new user account.

	### PUT /users/{id}
	Updates user information.
	""",
	# Ground truth for the easy task (4 issues).
	"known_issues": [
	"Missing response format documentation (what fields are returned)",
	"Missing error codes documentation",
	"Missing authentication requirements",
	"Missing rate limiting information"
	],
	"max_steps": 8,
	},
	# Medium task: a fuller API reference with clarity/consistency problems.
	"medium_api_doc": {
	"difficulty": "medium",
	"name": "Complex API Documentation Review",
	"description": "Identify clarity, completeness, and consistency issues",
	"doc": """
	# Payment Processing API

	## Overview
	The Payment API handles all transactions. It's robust and secure.

	## Authentication
	Use Bearer token in headers.

	## Endpoints

	### POST /transactions
	Process a payment transaction.
	Parameters:
	- amount: decimal
	- currency: string
	- account_id: integer
	Response: transaction_id, status

	### GET /transactions/{id}
	Get transaction details.
	Returns: All transaction information

	### POST /refunds
	Issues a refund for a transaction.
	Parameter: original_transaction_id
	Returns: refund_id, refund_status

	## Error Handling
	Errors are returned as JSON with an error field containing the error message.

	## Rate Limiting
	API has rate limits but details in another document.
	""",
	# Ground truth for the medium task (7 issues).
	"known_issues": [
	"Vague language: 'robust and secure' - needs specifics",
	"Missing required vs optional parameters",
	"Inconsistent response documentation format",
	"Missing timeout values",
	"No example requests/responses shown",
	"Unclear which endpoints require authentication",
	"Missing field type specifications for responses"
	],
	"max_steps": 10,
	},
	# Hard task: a multi-chapter guide with structural and cross-reference problems.
	"hard_guide_review": {
	"difficulty": "hard",
	"name": "Comprehensive Documentation Guide Review",
	"description": "Identify structural, consistency, and cross-reference issues",
	"doc": """
	# Complete Developer Guide

	## Chapter 1: Getting Started
	Install the SDK: `pip install myservice-sdk`

	## Chapter 2: Authentication
	Three authentication methods:
	1. API Keys
	2. OAuth 2.0
	3. Service Accounts

	See Chapter 5 for detailed implementation.

	## Chapter 3: Making Requests
	All requests use HTTP/REST. Use the format from Chapter 4.

	## Chapter 4: Response Formats
	JSON responses include:
	- data: object with results
	- error: null on success
	- timestamp: ISO 8601 format

	See examples in Chapter 6.

	## Chapter 5: Authentication Deep Dive
	API Keys: Generate in dashboard, pass as 'Authorization: Bearer <key>'
	OAuth: See Chapter 2 for overview
	Service Accounts: Use for server-to-server auth

	## Chapter 6: Examples
	Coming soon...

	## Chapter 7: Error Codes
	200: Success
	400: Bad request
	401: Unauthorized
	429: Rate limited
	See error handling in Chapter 3.

	## Appendix: FAQ
	Q: How do I authenticate?
	A: Use one of the three methods in Chapter 2
	""",
	# Ground truth for the hard task (10 issues).
	"known_issues": [
	"Chapter 6 (Examples) is incomplete - promised but marked 'Coming soon'",
	"Circular reference: Ch2 says see Ch5, Ch5 says see Ch2 for OAuth",
	"Missing chapter numbers: mentions 'Chapter 3' error handling but spread across multiple chapters",
	"Inconsistent terminology: sometimes 'API Key', sometimes 'key'",
	"Response format in Ch4 missing success status field definition",
	"No versioning information mentioned",
	"Missing SLA/availability information",
	"Chapter 5 OAuth description incomplete vs Chapter 2",
	"No mention of SDK vs REST trade-offs",
	"FAQ too brief - needs more questions"
	],
	"max_steps": 12,
	}
	}


	def calculate_issue_overlap(identified: List[str], known: List[str]) -> float:
	    """Return the fraction of known issues covered by the identified issues.

	    Two issue descriptions match when both are longer than 20 characters and
	    the number of lower-cased, whitespace-separated words they share is more
	    than 40% of the word count of the larger description.

	    Each known issue can be matched at most once and each identified issue
	    contributes at most one match, so resubmitting the same finding does not
	    raise the result.

	    Args:
	        identified: Issue descriptions submitted by the agent.
	        known: Ground-truth issue descriptions.

	    Returns:
	        A value in [0.0, 1.0]: 1.0 when ``known`` is empty, 0.0 when
	        ``identified`` is empty, otherwise matched known issues divided by
	        ``len(known)``.
	    """
	    if not known:
	        return 1.0
	    if not identified:
	        return 0.0

	    # Tokenise every known issue once rather than once per identified issue.
	    # Known issues that fail the length gate are stored as None: they can
	    # never match, exactly as in the pairwise length check.
	    known_words = [
	        set(issue.lower().split()) if len(issue) > 20 else None
	        for issue in known
	    ]

	    matched = set()  # indices of known issues that have already been credited
	    for issue in identified:
	        if len(issue) <= 20:
	            continue  # too short to be compared meaningfully
	        words = set(issue.lower().split())
	        for index, candidate in enumerate(known_words):
	            if candidate is None or index in matched:
	                continue
	            larger = max(len(words), len(candidate))
	            # `larger` is 0 only when both descriptions are pure whitespace.
	            if larger and len(words & candidate) / larger > 0.4:
	                matched.add(index)
	                break  # one identified issue credits at most one known issue

	    return len(matched) / len(known)


	class DocQualityEnvironment(Environment):
	    """
	    Documentation Quality Assessment Environment.

	    Agents evaluate technical documentation and identify quality issues.
	    This is a real-world task: technical writers and product teams use similar
	    processes to improve their documentation.

	    TASKS defines three tasks of increasing difficulty:
	    1. Simple API docs (easy) - find missing sections
	    2. Complex API docs (medium) - find clarity and completeness issues
	    3. Guide structure (hard) - find structural and cross-reference issues

	    NOTE(review): __init__ and reset() always select the easy task; nothing in
	    this class switches to the medium or hard task.

	    Supported ``action_type`` values in step():
	    - "identify_issue": record an issue description (content > 10 chars)
	    - "suggest_improvement": propose a fix (content > 15 chars)
	    - "rate_quality": give a final 0.0-1.0 rating; ends the episode
	    """

	    SUPPORTS_CONCURRENT_SESSIONS: bool = True

	    def __init__(self):
	        """Initialize the environment on the default (easy) task."""
	        self._state = State(episode_id=str(uuid4()), step_count=0)
	        self._current_task_key: str = "easy_api_doc"
	        self._current_task: Dict = TASKS[self._current_task_key]
	        self._identified_issues: List[str] = []
	        self._final_score: float = 0.0
	        # True once the agent has submitted a rating. Tracked separately from
	        # _final_score because 0.0 is itself a legitimate rating.
	        self._has_rating: bool = False
	        self._episode_rewards: List[float] = []

	    def reset(self) -> DocQualityObservation:
	        """Start a new episode on the easy task and return the first observation.

	        Clears all per-episode state (episode id, step count, recorded issues,
	        final score, rewards). The initial observation exposes only the first
	        two known issues as hints.
	        """
	        self._state = State(episode_id=str(uuid4()), step_count=0)
	        self._identified_issues = []
	        self._final_score = 0.0
	        self._has_rating = False
	        self._episode_rewards = []

	        # Start with easy task
	        self._current_task_key = "easy_api_doc"
	        self._current_task = TASKS[self._current_task_key]

	        return DocQualityObservation(
	            task_name=self._current_task["name"],
	            task_difficulty=self._current_task["difficulty"],  # type: ignore
	            current_doc=self._current_task["doc"],
	            doc_section="Overview",
	            issues_identified=[],
	            known_issues=self._current_task["known_issues"][:2],  # Show 2 hints initially
	            quality_score=0.0,
	            step_count=0,
	            max_steps=self._current_task["max_steps"],
	            feedback="Task started. Analyze the documentation and identify quality issues.",
	            done=False,
	            reward=0.0,
	        )

	    def step(self, action: DocQualityAction) -> DocQualityObservation:  # type: ignore[override]
	        """Execute one step of documentation review.

	        Rewards:
	        - "identify_issue": 0.2 if the description matches a known issue,
	          0.1 if it does not, 0.0 if it is 10 characters or shorter.
	        - "suggest_improvement": 0.15 if the content is longer than 15
	          characters, else 0.0.
	        - "rate_quality": up to 0.25, scaled by how close the rating is to
	          the internally computed score; marks the episode done.
	        - anything else: 0.0.

	        The episode is also marked done once the step count reaches the
	        task's ``max_steps``; if the agent never rated the document, the
	        final score is then computed from the issues it recorded.
	        """
	        self._state.step_count += 1
	        reward = 0.0
	        feedback = ""
	        done = False

	        if action.action_type == "identify_issue":
	            # Agent found an issue
	            if action.content and len(action.content) > 10:
	                self._identified_issues.append(action.content)

	                # For a single submission calculate_issue_overlap() returns
	                # either 0 or 1/len(known_issues), which is always below 0.4
	                # for these tasks. Test for "matched anything" instead of
	                # comparing against 0.4, which made this branch unreachable.
	                known_issues = self._current_task["known_issues"]
	                overlap = calculate_issue_overlap([action.content], known_issues)

	                if overlap > 0.0:
	                    reward = 0.2  # Good identification
	                    feedback = "Valid issue identified."
	                else:
	                    reward = 0.1  # Partial credit for effort
	                    feedback = "Issue noted, but may not be significant."
	            else:
	                feedback = "Issue description too vague."

	        elif action.action_type == "suggest_improvement":
	            # Agent suggests how to fix an issue
	            if action.content and len(action.content) > 15:
	                reward = 0.15  # Credit for constructive suggestions
	                feedback = "Good improvement suggestion."
	            else:
	                feedback = "Suggestion too vague."

	        elif action.action_type == "rate_quality":
	            # Agent provides final quality assessment
	            try:
	                # The first whitespace-separated token is the score; empty
	                # content is treated as a rating of 0.0.
	                score = float(action.content.split()[0]) if action.content else 0.0
	            except (AttributeError, IndexError, TypeError, ValueError):
	                # Non-string, whitespace-only or non-numeric content.
	                feedback = "Invalid quality rating format."
	            else:
	                score = max(0.0, min(1.0, score))

	                # Reward is proportional to the accuracy of the assessment
	                true_score = self._calculate_true_score()
	                accuracy = 1.0 - abs(score - true_score)
	                reward = accuracy * 0.25

	                self._final_score = score
	                self._has_rating = True
	                feedback = f"Quality rated at {score:.2f}. Episode complete."
	                done = True

	        else:
	            feedback = "Unknown action type."

	        self._episode_rewards.append(reward)

	        # Check if max steps reached
	        if self._state.step_count >= self._current_task["max_steps"]:
	            done = True
	            if not self._has_rating:
	                # Auto-score based on issues found. An explicit rating, even
	                # one of 0.0, is never overwritten.
	                self._final_score = self._calculate_true_score()

	        # Prepare observation. Lists are copied so that later steps (or the
	        # consumer) cannot mutate this observation or the shared TASKS table.
	        # NOTE(review): unlike reset(), this exposes the full known_issues
	        # list to the agent - confirm that is intended.
	        return DocQualityObservation(
	            task_name=self._current_task["name"],
	            task_difficulty=self._current_task["difficulty"],  # type: ignore
	            current_doc=self._current_task["doc"],
	            doc_section=f"Section {self._state.step_count}",
	            issues_identified=list(self._identified_issues),
	            known_issues=list(self._current_task["known_issues"]),
	            quality_score=self._final_score,
	            step_count=self._state.step_count,
	            max_steps=self._current_task["max_steps"],
	            feedback=feedback,
	            done=done,
	            reward=reward,
	            metadata={
	                "issues_count": len(self._identified_issues),
	                "step": self._state.step_count,
	                "total_reward": sum(self._episode_rewards),
	            },
	        )

	    def _calculate_true_score(self) -> float:
	        """Compute the reference score from the issues recorded so far.

	        Returns 1.0 when the task has no known issues and 0.0 when nothing has
	        been recorded. Otherwise the score is the number of recorded issues
	        (capped at the number of known issues) divided by the number of known
	        issues, plus 0.2 times the keyword overlap, capped at 1.0.
	        """
	        known_issues = self._current_task["known_issues"]

	        if not known_issues:
	            return 1.0

	        if not self._identified_issues:
	            return 0.0

	        overlap = calculate_issue_overlap(self._identified_issues, known_issues)

	        # NOTE(review): the base score counts every recorded issue, whether or
	        # not it matches a known issue; only the bonus depends on matching.
	        found_count = min(len(self._identified_issues), len(known_issues))
	        base_score = found_count / len(known_issues)

	        # Bonus for quality overlap
	        bonus = overlap * 0.2

	        return min(1.0, base_score + bonus)

	    @property
	    def state(self) -> State:
	        """Get the current environment state."""
	        return self._state