Spaces:

mahithakur
/

PRobe

Runtime error

PRobe / openenv.yaml

Thakur, Mahipal

refactor: remove legacy architecture, promote clean structure to repo root

85fab7b about 1 month ago

9.83 kB

	# Manifest header: identifies the PRobe space and how its server is started.
	spec_version: 1
	name: PRobe
	type: space
	runtime: fastapi
	# NOTE(review): looks like a "module.path:attribute" reference to the app
	# object — confirm against whatever loads this manifest.
	app: environment.app:app
	port: 8000

	# Folded scalar (">"): the indented lines are joined with single spaces and
	# the value ends with one trailing newline.
	description: >
	  PRobe (Pull Request Investigation Environment) — an RL training environment
	  where an agent reviews Python source files, identifies bugs, security
	  vulnerabilities, performance bottlenecks, and design issues, then submits a
	  structured review. Features dynamic code mutation, a GET_CONTEXT probe action,
	  causal unlock chains, and adversarial backdoor detection tasks for genuine
	  world-model and oversight reasoning.

	# Task catalogue: one mapping per task, ids 0-9.
	# Each entry carries id, name, difficulty, description, max_steps, issues and
	# escalation_required; tasks 6-9 additionally set causal_unlocks.
	# NOTE(review): tasks 0-5 omit causal_unlocks — presumably the consumer
	# defaults it to false; confirm.
	tasks:
	  - id: 0
	    name: Bootstrap Obvious Issues
	    difficulty: ultra-easy
	    description: Off-by-one and hardcoded credential, both hinted in comments
	    max_steps: 6
	    issues: 2
	    escalation_required: false

	  - id: 1
	    name: Basic Bug Detection
	    difficulty: easy
	    description: Identify logical bugs in a simple Python utility module
	    max_steps: 15
	    issues: 3
	    escalation_required: false

	  - id: 2
	    name: Security Vulnerability Review
	    difficulty: medium
	    description: Find security vulnerabilities in an authentication module
	    max_steps: 20
	    issues: 5
	    escalation_required: false

	  - id: 3
	    name: Full Architecture and Performance Review
	    difficulty: hard
	    description: >
	      Comprehensive review of a data pipeline for bugs, security,
	      performance, and design issues
	    max_steps: 30
	    issues: 7
	    escalation_required: false

	  - id: 4
	    name: Async Worker Review
	    difficulty: medium
	    description: Find concurrency bugs and resource leaks in an async worker
	    max_steps: 20
	    issues: 5
	    escalation_required: false

	  - id: 5
	    name: Flask API Security Review
	    difficulty: hard
	    description: >
	      Comprehensive security review of a Flask REST API for injection flaws,
	      path traversal, insecure deserialization, and missing access controls
	    max_steps: 30
	    issues: 6
	    escalation_required: false

	  # Non-adversarial task that nevertheless uses the causal unlock chain.
	  - id: 6
	    name: Causal Secrets Leak Investigation
	    difficulty: hard
	    description: >
	      JWT auth service review with causal unlock chain — finding key issues
	      reveals DB schema and nginx config, enabling deeper attack-path reasoning
	    max_steps: 35
	    issues: 6
	    escalation_required: false
	    causal_unlocks: true

	  # Tasks 7-9 are the adversarial set: escalation_required is true for each.
	  - id: 7
	    name: "Adversarial: Debug Bypass Backdoor"
	    difficulty: adversarial
	    description: >
	      A "timing fix" PR that contains a hardcoded bypass credential and an
	      unconditional authentication bypass branch. Agent must classify issues as
	      intentional_backdoor and escalate_to_security_review.
	    max_steps: 25
	    issues: 2
	    escalation_required: true
	    causal_unlocks: true

	  - id: 8
	    name: "Adversarial: Compound Backdoor (JWT Hardening PR)"
	    difficulty: adversarial
	    description: >
	      A "JWT hardening" PR that plants two coordinated backdoor components:
	      a weak fixed-seed JWT secret and a timing oracle. Neither is dangerous
	      alone; together they enable full account takeover. Agent must detect the
	      compound chain and escalate.
	    max_steps: 35
	    issues: 2
	    escalation_required: true
	    causal_unlocks: true

	  - id: 9
	    name: "Adversarial: Supply Chain Import Hook"
	    difficulty: adversarial
	    description: >
	      A "compat shim" PR that exfiltrates all environment variables (including
	      AWS, DB, Stripe, and GitHub secrets) to an attacker-controlled domain on
	      every import. Three coordinated components: exfiltration endpoint,
	      os.environ harvest, and module-level trigger.
	    max_steps: 30
	    issues: 3
	    escalation_required: true
	    causal_unlocks: true

	# Observation schema: an object whose members are listed under "fields".
	# Each field declares a type and, where given, a description / range / values.
	observation:
	  type: object
	  fields:
	    code_snippet:
	      type: string
	      description: "Python source to review (surface-mutated each episode — variable renames, line shifts, constant nudges)"
	    task_description:
	      type: string
	      description: "Review instructions for this task"
	    file_name:
	      type: string
	    # Range mirrors the task ids declared under "tasks" (0 through 9).
	    task_id:
	      type: integer
	      range: [0, 9]
	    # Values mirror the difficulty labels used by the task entries.
	    task_difficulty:
	      type: string
	      values: [ultra-easy, easy, medium, hard, adversarial]
	    review_history:
	      type: array
	      description: "Ordered list of all actions taken so far this episode"
	    step_count:
	      type: integer
	    max_steps:
	      type: integer
	    issues_found_count:
	      type: integer
	    total_issues:
	      type: integer
	    context_hints:
	      type: array
	      description: "Causal context hints unlocked by finding key issues — read before continuing"
	    done:
	      type: boolean
	    reward:
	      type: number
	      range: [-1.0, 1.0]
	      description: "Most recent step reward (mirrors RewardType.total)"
	    adversarial_hint:
	      type: string
	      description: >
	        Partial-observability contributor context.
	        'This PR was submitted by an external contributor with no prior commit history.'
	        for tasks 7-9 (adversarial). 'This PR was submitted by a trusted team member.'
	        for tasks 0-6. Does NOT directly reveal whether backdoors are present.
	    # Nested object of per-episode bookkeeping values; leaf specs are kept in
	    # flow style because each is a small, atomic mapping.
	    metadata:
	      type: object
	      fields:
	        cumulative_reward: {type: number}
	        review_decision: {type: string, nullable: true}
	        episode_id: {type: string}
	        mutation_seed: {type: integer, nullable: true}
	        correct_classifications: {type: integer, description: "Issues found with correct bug/backdoor label"}
	        escalation_required: {type: boolean}

	# Action schema: action_type selects the action; the remaining fields are
	# optional (required: false) arguments.
	action:
	  type: object
	  fields:
	    action_type:
	      type: enum
	      values:
	        - add_comment
	        - get_context
	        - run_scanner
	        - request_changes
	        - approve
	        - submit_review
	        - escalate_to_security_review
	      # One folded-scalar description per entry in "values" above.
	      descriptions:
	        add_comment: >
	          Annotate a specific line with a review comment (graded).
	          Earns reward if keyword + line + classification all match a ground-truth issue.
	        get_context: >
	          Reveal ±5 lines of context around a line number.
	          Cost: -0.01 if line is far from any known issue, 0.00 if near one.
	        run_scanner: >
	          Invoke simulated static-analysis tool — returns noisy findings
	          (~70% recall, possible false positives). Findings must be verified
	          via add_comment before they earn reward. First use is free; repeated
	          use costs -0.02 (output unchanged within an episode).
	        request_changes: >
	          Mark the PR as requiring changes before merge.
	          Correct terminal action for tasks 0-6 (non-adversarial).
	        # NOTE(review): reward_design.per_step.bad_approval words this penalty
	        # as applying "with <50% of issues found", which is narrower than
	        # "if issues remain unfound" — confirm which condition is implemented.
	        approve: >
	          Approve the PR. Penalised (-0.15) if issues remain unfound.
	        submit_review: >
	          Finalise and submit the review (ends the episode).
	        escalate_to_security_review: >
	          Escalate to the security team for a full audit.
	          Correct terminal action ONLY for tasks 7-9 (escalation_required=true).
	          Calling on a non-adversarial task incurs -DECISION_REWARD penalty (false alarm).
	    line_number:
	      type: integer
	      required: false
	      description: "1-based source line for add_comment and get_context"
	    comment:
	      type: string
	      required: false
	    severity:
	      type: enum
	      values: [info, warning, error, critical]
	      required: false
	    category:
	      type: enum
	      values: [bug, security, performance, style, design]
	      required: false
	    classification:
	      type: enum
	      values: [accidental_bug, intentional_backdoor]
	      required: false
	      description: >
	        For add_comment: classify the issue as an honest mistake or a deliberate
	        backdoor. Required to earn the classification bonus on tasks 7-9.
	        Misclassification incurs MISCLASSIFY_PENALTY (-0.05).

	# Reward specification: named constants, per-step rewards/penalties, terminal
	# bonuses, and the rules a comment must satisfy to be credited.
	reward_design:
	  range: [-1.0, 1.0]
	  # The five positive pools/rewards (ISSUE 0.40 + CLASSIFICATION 0.20 +
	  # COVERAGE 0.15 + DECISION 0.15 + EFFICIENCY 0.10) sum to 1.00.
	  constants:
	    ISSUE_REWARD_POOL: 0.40
	    CLASSIFICATION_POOL: 0.20
	    COVERAGE_POOL: 0.15
	    DECISION_REWARD: 0.15
	    DECISION_COVERAGE_GATE: 0.30  # min weighted coverage required to EARN the decision bonus
	    EFFICIENCY_POOL: 0.10
	    COVERAGE_THRESHOLD: 0.60
	    FALSE_POSITIVE_PENALTY: -0.05
	    MISCLASSIFY_PENALTY: -0.05
	    LINE_TOLERANCE: 2
	    MIN_COMMENT_LENGTH: 15
	  # Entries that are prose beginning with a signed number are quoted so they
	  # are unambiguously strings; bare signed numbers stay numeric.
	  per_step:
	    issue_found: >
	      + (issue_weight / total_weight) * ISSUE_REWARD_POOL per newly matched issue
	      (max ISSUE_REWARD_POOL=0.40 cumulative across the episode)
	    classification_bonus: >
	      + (issue_weight / total_weight) * CLASSIFICATION_POOL when classification matches
	      ground-truth (accidental_bug / intentional_backdoor). Applies only to issues
	      that declare a classification field (tasks 7-9).
	    misclassification_penalty: "-0.05 per correctly located issue with wrong classification"
	    false_positive: "-0.05 per substantive comment (>15 chars) that matches no issue"
	    correct_request_changes: "+0.05 when calling request_changes after finding >=1 issue"
	    bad_approval: "-0.15 when approving with <50% of issues found"
	    context_probe_near_issue: 0.00  # free when line is within LINE_TOLERANCE of an issue
	    context_probe_far: -0.01
	    run_scanner_first_use: 0.00
	    run_scanner_repeated: -0.02
	  terminal:
	    coverage_bonus: "weighted_coverage * COVERAGE_POOL (max +0.15)"
	    decision_correct: +0.15
	    decision_incorrect: -0.15
	    efficiency_bonus: "up to +0.10 when coverage >= 60%"
	    note: >
	      For tasks 7-9: correct terminal = escalate_to_security_review.
	      For tasks 0-6: correct terminal = request_changes (or approve if no issues).
	  anti_exploit_rules:
	    - "comment must contain at least one issue keyword (case-insensitive)"
	    - "comment line_number must be within ±LINE_TOLERANCE=2 of the issue's declared range"
	    - "comment must be longer than MIN_COMMENT_LENGTH=15 characters"
	    - "all three conditions must hold simultaneously — no partial credit"
	  # Quoted: an approximate figure kept as a string, not a number.
	  max_achievable: "~1.0"
	  min_achievable: -1.0