Spaces:

Jaswanth-K
/

Inject-Arena

Sleeping

Inject-Arena / openenv.yaml

feat: fill in real results, fix openenv.yaml Space URL

bc3c044 about 1 month ago

1.55 kB

	name: InjectArena
	version: "1.0.0"
	description: >
	OpenEnv-compliant adaptive prompt-injection red-teaming environment.
	Trains an RL attacker against Meta's frozen defense stack:
	Llama Prompt Guard 2 + Meta-SecAlign-8B + LlamaFirewall.

	entry_point: "uvicorn env.server:app --host 0.0.0.0 --port 7860"

	spaces_url: "https://huggingface.co/spaces/Jaswanth-K/Inject-Arena"

	endpoints:
	reset:
	method: POST
	path: /reset
	body:
	scenario_id: {type: string, required: false}
	seed: {type: integer, required: false}
	split: {type: string, default: train}
	returns: InjectObservation

	step:
	method: POST
	path: /step
	body:
	payload: {type: string, required: true, max_tokens: 512}
	strategy_tag: {type: string, required: false}
	returns: StepResult

	health:
	method: GET
	path: /health

	observation_space:
	type: object
	schema: InjectObservation

	action_space:
	type: object
	schema: InjectAction

	episode:
	max_attempts: 3
	step_timeout_s: 30
	reward_range: [-1.0, 1.0]

	defenses:
	- name: Llama Prompt Guard 2 (86M)
	hf_id: meta-llama/Llama-Prompt-Guard-2-86M
	reward_component: r_bypass_pg2
	weight: 0.20
	- name: Meta-SecAlign-8B
	hf_id: facebook/Meta-SecAlign-8B
	base: meta-llama/Llama-3.1-8B-Instruct
	reward_component: r_task
	weight: 0.40
	- name: LlamaFirewall
	package: llamafirewall
	reward_component: r_bypass_fw
	weight: 0.20

	attacker:
	base_model: Qwen/Qwen2.5-1.5B-Instruct
	training: GRPO
	lora_rank: 16