Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- Dockerfile +89 -0
- EXPLANATION.md +829 -0
- Instructions.md +283 -0
- README.md +322 -4
- __init__.py +17 -0
- client.py +76 -0
- graders.py +161 -0
- inference.py +197 -0
- models.py +80 -0
- openenv.yaml +38 -0
- openenv_Procure_RL.egg-info/PKG-INFO +9 -0
- openenv_Procure_RL.egg-info/SOURCES.txt +14 -0
- openenv_Procure_RL.egg-info/dependency_links.txt +1 -0
- openenv_Procure_RL.egg-info/entry_points.txt +2 -0
- openenv_Procure_RL.egg-info/requires.txt +5 -0
- openenv_Procure_RL.egg-info/top_level.txt +1 -0
- opponent.py +213 -0
- plan.md +1228 -0
- pyproject.toml +45 -0
- server/Procure_RL_environment.py +316 -0
- server/__init__.py +11 -0
- server/app.py +637 -0
- server/requirements.txt +6 -0
- test_calibration.py +110 -0
- test_graders.py +76 -0
- test_rl_properties.py +119 -0
- uv.lock +0 -0
- web_ui.png +0 -0
Dockerfile
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# Multi-stage build using openenv-base
|
| 8 |
+
# This Dockerfile is flexible and works for both:
|
| 9 |
+
# - In-repo environments (with local OpenEnv sources)
|
| 10 |
+
# - Standalone environments (with openenv from PyPI/Git)
|
| 11 |
+
# The build script (openenv build) handles context detection and sets appropriate build args.
|
| 12 |
+
|
| 13 |
+
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
|
| 14 |
+
FROM ${BASE_IMAGE} AS builder
|
| 15 |
+
|
| 16 |
+
WORKDIR /app
|
| 17 |
+
|
| 18 |
+
# Ensure git is available (required for installing dependencies from VCS)
|
| 19 |
+
RUN apt-get update && \
|
| 20 |
+
apt-get install -y --no-install-recommends git && \
|
| 21 |
+
rm -rf /var/lib/apt/lists/*
|
| 22 |
+
|
| 23 |
+
# Build argument to control whether we're building standalone or in-repo
|
| 24 |
+
ARG BUILD_MODE=in-repo
|
| 25 |
+
ARG ENV_NAME=Procure_RL
|
| 26 |
+
|
| 27 |
+
# Copy environment code (always at root of build context)
|
| 28 |
+
COPY . /app/env
|
| 29 |
+
|
| 30 |
+
# For in-repo builds, openenv is already vendored in the build context
|
| 31 |
+
# For standalone builds, openenv will be installed via pyproject.toml
|
| 32 |
+
WORKDIR /app/env
|
| 33 |
+
|
| 34 |
+
# Ensure uv is available (for local builds where base image lacks it)
|
| 35 |
+
RUN if ! command -v uv >/dev/null 2>&1; then \
|
| 36 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh && \
|
| 37 |
+
mv /root/.local/bin/uv /usr/local/bin/uv && \
|
| 38 |
+
mv /root/.local/bin/uvx /usr/local/bin/uvx; \
|
| 39 |
+
fi
|
| 40 |
+
|
| 41 |
+
# Install dependencies using uv sync
|
| 42 |
+
# If uv.lock exists, use it; otherwise resolve on the fly
|
| 43 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 44 |
+
if [ -f uv.lock ]; then \
|
| 45 |
+
uv sync --frozen --no-install-project --no-editable; \
|
| 46 |
+
else \
|
| 47 |
+
uv sync --no-install-project --no-editable; \
|
| 48 |
+
fi
|
| 49 |
+
|
| 50 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 51 |
+
if [ -f uv.lock ]; then \
|
| 52 |
+
uv sync --frozen --no-editable; \
|
| 53 |
+
else \
|
| 54 |
+
uv sync --no-editable; \
|
| 55 |
+
fi
|
| 56 |
+
|
| 57 |
+
# Final runtime stage
|
| 58 |
+
FROM ${BASE_IMAGE}
|
| 59 |
+
|
| 60 |
+
WORKDIR /app
|
| 61 |
+
|
| 62 |
+
# Copy the virtual environment from builder
|
| 63 |
+
COPY --from=builder /app/env/.venv /app/.venv
|
| 64 |
+
|
| 65 |
+
# Copy the environment code
|
| 66 |
+
COPY --from=builder /app/env /app/env
|
| 67 |
+
|
| 68 |
+
# Also copy README.md to /app for the web interface
|
| 69 |
+
COPY --from=builder /app/env/README.md /app/README.md
|
| 70 |
+
|
| 71 |
+
# Set PATH to use the virtual environment
|
| 72 |
+
ENV PATH="/app/.venv/bin:$PATH"
|
| 73 |
+
|
| 74 |
+
# Set PYTHONPATH so imports work correctly
|
| 75 |
+
ENV PYTHONPATH="/app/env:$PYTHONPATH"
|
| 76 |
+
|
| 77 |
+
# Set PORT for HF Spaces compatibility
|
| 78 |
+
ENV PORT=7860
|
| 79 |
+
|
| 80 |
+
# Enable the web interface
|
| 81 |
+
ENV ENABLE_WEB_INTERFACE=true
|
| 82 |
+
|
| 83 |
+
# Health check
|
| 84 |
+
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
|
| 85 |
+
CMD curl -f http://localhost:7860/health || exit 1
|
| 86 |
+
|
| 87 |
+
# Run the FastAPI server
|
| 88 |
+
# The module path is constructed to work with the /app/env structure
|
| 89 |
+
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 7860"]
|
EXPLANATION.md
ADDED
|
@@ -0,0 +1,829 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ProcureRL: A Deep Dive
|
| 2 |
+
|
| 3 |
+
## Table of Contents
|
| 4 |
+
1. [What is ProcureRL?](#what-is-procurerl)
|
| 5 |
+
2. [Why Does This Exist?](#why-does-this-exist)
|
| 6 |
+
3. [The Big Picture Architecture](#the-big-picture-architecture)
|
| 7 |
+
4. [The Three Tasks](#the-three-tasks)
|
| 8 |
+
5. [Data Models: What's Floating Around](#data-models-whats-floating-around)
|
| 9 |
+
6. [The Scripted Opponent System](#the-scripted-opponent-system)
|
| 10 |
+
7. [The Grading System](#the-grading-system)
|
| 11 |
+
8. [The Environment Core](#the-environment-core)
|
| 12 |
+
9. [The Server API](#the-server-api)
|
| 13 |
+
10. [The Inference Script](#the-inference-script)
|
| 14 |
+
11. [End-to-End Example](#end-to-end-example)
|
| 15 |
+
12. [Docker Deployment](#docker-deployment)
|
| 16 |
+
13. [Calibration and Testing](#calibration-and-testing)
|
| 17 |
+
|
| 18 |
+
---
|
| 19 |
+
|
| 20 |
+
## What is ProcureRL?
|
| 21 |
+
|
| 22 |
+
ProcureRL is an **OpenEnv-compliant Reinforcement Learning environment** where an LLM (Large Language Model) agent learns to negotiate procurement deals against scripted supplier opponents.
|
| 23 |
+
|
| 24 |
+
In simpler terms: it's a training ground for AI to practice negotiation — like a flight simulator, but for procurement conversations.
|
| 25 |
+
|
| 26 |
+
### The Core Innovation: Language-Sensitive Opponent
|
| 27 |
+
|
| 28 |
+
What makes ProcureRL special is that the opponent's behavior **responds to the quality of the agent's natural language**, not just the prices offered. This means:
|
| 29 |
+
|
| 30 |
+
- An agent that outputs aggressive or low-effort language gets a **tough, unyielding opponent**
|
| 31 |
+
- An agent that outputs collaborative, professional language gets a **more cooperative, flexible opponent**
|
| 32 |
+
|
| 33 |
+
The language IS the policy — not just the action space. This makes an LLM genuinely required, not incidental.
|
| 34 |
+
|
| 35 |
+
---
|
| 36 |
+
|
| 37 |
+
## Why Does This Exist?
|
| 38 |
+
|
| 39 |
+
Real-world procurement negotiation is:
|
| 40 |
+
- **Sequential** — one decision affects the next
|
| 41 |
+
- **Hidden utility** — the opponent's real priorities are not revealed
|
| 42 |
+
- **Language-dependent** — how you say things matters as much as what you offer
|
| 43 |
+
- **High-stakes** — Walmart deployed AI (Pactum) for exactly this, and 90% of CPOs are adopting AI negotiation in 2025
|
| 44 |
+
|
| 45 |
+
Traditional rule-based negotiation tools are limited. An RL-trained LLM policy can learn to navigate this complexity in ways that static rules cannot.
|
| 46 |
+
|
| 47 |
+
---
|
| 48 |
+
|
| 49 |
+
## The Big Picture Architecture
|
| 50 |
+
|
| 51 |
+
```
|
| 52 |
+
┌─────────────────────────────────────────────────────────────────┐
|
| 53 |
+
│ ProcureRL System │
|
| 54 |
+
├─────────────────────────────────────────────────────────────────┤
|
| 55 |
+
│ │
|
| 56 |
+
│ ┌──────────────────┐ ┌──────────────────┐ │
|
| 57 |
+
│ │ LLM Agent │───▶│ Environment │ │
|
| 58 |
+
│ │ (inference.py) │ │ (Procure_RL_ │ │
|
| 59 |
+
│ │ │ │ environment.py)│ │
|
| 60 |
+
│ └──────────────────┘ └────────┬─────────┘ │
|
| 61 |
+
│ │ │
|
| 62 |
+
│ ▼ │
|
| 63 |
+
│ ┌──────────────────┐ │
|
| 64 |
+
│ │ Scripted │ │
|
| 65 |
+
│ │ Opponent │ │
|
| 66 |
+
│ │ (opponent.py) │ │
|
| 67 |
+
│ └────────┬─────────┘ │
|
| 68 |
+
│ │ │
|
| 69 |
+
│ ▼ │
|
| 70 |
+
│ ┌──────────────────┐ │
|
| 71 |
+
│ │ Graders │ │
|
| 72 |
+
│ │ (graders.py) │ │
|
| 73 |
+
│ └──────────────────┘ │
|
| 74 |
+
│ │
|
| 75 |
+
│ ┌──────────────────┐ ┌──────────────────┐ │
|
| 76 |
+
│ │ Server API │ │ OpenEnv.yaml │ │
|
| 77 |
+
│ │ (server/app.py) │ │ (manifest) │ │
|
| 78 |
+
│ └────────┬─────────┘ └──────────────────┘ │
|
| 79 |
+
│ │ │
|
| 80 |
+
│ ▼ │
|
| 81 |
+
│ ┌──────────────────┐ │
|
| 82 |
+
│ │ Docker Container │◀── HF Spaces Deployment │
|
| 83 |
+
│ │ (port 7860) │ │
|
| 84 |
+
│ └──────────────────┘ │
|
| 85 |
+
└─────────────────────────────────────────────────────────────────┘
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
The system is designed so that:
|
| 89 |
+
1. **Environment** is deterministic and reproducible (seeded RNG)
|
| 90 |
+
2. **Opponent** responds to language quality (via rapport system)
|
| 91 |
+
3. **Graders** produce bounded [0.0, 1.0] scores
|
| 92 |
+
4. **Server** exposes everything over HTTP for OpenEnv compliance
|
| 93 |
+
5. **Inference** runs a baseline LLM agent against the environment
|
| 94 |
+
|
| 95 |
+
---
|
| 96 |
+
|
| 97 |
+
## The Three Tasks
|
| 98 |
+
|
| 99 |
+
ProcureRL includes three tasks of increasing difficulty:
|
| 100 |
+
|
| 101 |
+
### Task 1: `single_issue` (Easy)
|
| 102 |
+
|
| 103 |
+
**Scenario:** Software license renewal. Price only.
|
| 104 |
+
|
| 105 |
+
```
|
| 106 |
+
Buyer Target: $36,000
|
| 107 |
+
Seller Opens: ~$52,000 (varies by seed)
|
| 108 |
+
Seller Floor: ~$44,000 (varies by seed)
|
| 109 |
+
Max Rounds: 6
|
| 110 |
+
Opponent Persona: Cooperative
|
| 111 |
+
```
|
| 112 |
+
|
| 113 |
+
The agent must negotiate the price down from opening to target. The cooperative opponent starts friendly and remains fairly flexible.
|
| 114 |
+
|
| 115 |
+
**Example Grading:**
|
| 116 |
+
- Deal at $38K in round 2: ~0.85 score
|
| 117 |
+
- Deal at $44K in round 6: ~0.35 score
|
| 118 |
+
- No deal: 0.0 score
|
| 119 |
+
|
| 120 |
+
### Task 2: `multi_issue` (Medium)
|
| 121 |
+
|
| 122 |
+
**Scenario:** Enterprise software negotiation with price AND payment terms.
|
| 123 |
+
|
| 124 |
+
```
|
| 125 |
+
Issues: price ($40K-$58K) + payment_days (30-90)
|
| 126 |
+
Opponent Persona: Cash Flow Stressed
|
| 127 |
+
→ Cares more about getting paid quickly (payment_weight: 0.65)
|
| 128 |
+
→ Cares less about final price (price_weight: 0.35)
|
| 129 |
+
Max Rounds: 8
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
**The Strategic Opportunity:** If the agent offers Net-30 or Net-45 payment terms, the opponent becomes more flexible on price. A naive agent treats both issues equally and scores low. A smart agent bundles payment speed with price negotiation.
|
| 133 |
+
|
| 134 |
+
**Example Grading:**
|
| 135 |
+
- Price $42K + Net-30 payment: ~0.60 score
|
| 136 |
+
- Price $42K + Net-90 payment: ~0.35 score
|
| 137 |
+
- No deal: 0.0 score
|
| 138 |
+
|
| 139 |
+
### Task 3: `adversarial` (Hard)
|
| 140 |
+
|
| 141 |
+
**Scenario:** Large contract with three issues — price, payment, and support hours.
|
| 142 |
+
|
| 143 |
+
```
|
| 144 |
+
Issues: price + payment_days + support_hours
|
| 145 |
+
Opponent Persona: Aggressive Anchor
|
| 146 |
+
→ Opens at ceiling on all issues
|
| 147 |
+
→ Hardens position if agent makes consecutive concessions
|
| 148 |
+
→ Rapport-sensitive but requires consistent collaborative framing
|
| 149 |
+
Max Rounds: 10
|
| 150 |
+
Survival Floor: 0.15 (completing any deal gets at least 0.15)
|
| 151 |
+
```
|
| 152 |
+
|
| 153 |
+
**The Challenge:** If the agent concedes on price in 2+ consecutive rounds, the opponent recognizes this pattern and becomes much harder to negotiate with. The agent must resist anchoring, break consecutive concession patterns, and maintain collaborative tone under pressure.
|
| 154 |
+
|
| 155 |
+
**Example Grading:**
|
| 156 |
+
- Strategic deal with no consecutive concessions: ~0.50 score
|
| 157 |
+
- Same deal but with consecutive concession pattern: ~0.40 score
|
| 158 |
+
- Survival deal (just complete): 0.15 score
|
| 159 |
+
|
| 160 |
+
---
|
| 161 |
+
|
| 162 |
+
## Data Models: What's Floating Around
|
| 163 |
+
|
| 164 |
+
The system uses three Pydantic models defined in `models.py`:
|
| 165 |
+
|
| 166 |
+
### `NegotiationAction`
|
| 167 |
+
|
| 168 |
+
What the agent sends to the environment:
|
| 169 |
+
|
| 170 |
+
```python
|
| 171 |
+
class NegotiationAction(BaseModel):
|
| 172 |
+
move_type: str # "make_offer" | "accept" | "reject" | "bundle"
|
| 173 |
+
terms: Dict[str, Any] # {"price": 42000, "payment_days": 45}
|
| 174 |
+
message: str = "" # Natural language — affects opponent rapport!
|
| 175 |
+
```
|
| 176 |
+
|
| 177 |
+
**Important:** The `message` field is not just flavor text. It directly affects opponent behavior through the rapport system.
|
| 178 |
+
|
| 179 |
+
### `NegotiationObservation`
|
| 180 |
+
|
| 181 |
+
What the environment sends back to the agent after each step:
|
| 182 |
+
|
| 183 |
+
```python
|
| 184 |
+
class NegotiationObservation(BaseModel):
|
| 185 |
+
task_id: str # Which task we're running
|
| 186 |
+
round_number: int # Current round (0 to max_rounds)
|
| 187 |
+
max_rounds: int # Task's round limit
|
| 188 |
+
supplier_message: str # Opponent's latest message
|
| 189 |
+
current_offer: Dict[str, Any] # Terms currently on the table
|
| 190 |
+
last_4_exchanges: List[Dict] # Recent conversation history
|
| 191 |
+
buyer_constraints: Dict[str, Any] # Agent's targets and limits
|
| 192 |
+
rapport_hint: str # "positive" | "neutral" | "negative"
|
| 193 |
+
done: bool # Is episode finished?
|
| 194 |
+
reward: Optional[float] = None # Reward (only on done)
|
| 195 |
+
metadata: Dict[str, Any] = Field(...) # Extra info (deal_price, errors)
|
| 196 |
+
```
|
| 197 |
+
|
| 198 |
+
### `NegotiationState`
|
| 199 |
+
|
| 200 |
+
The environment's internal state (accessible via `env.state`):
|
| 201 |
+
|
| 202 |
+
```python
|
| 203 |
+
class NegotiationState(BaseModel):
|
| 204 |
+
task_id: str = ""
|
| 205 |
+
episode_id: str = ""
|
| 206 |
+
round_number: int = 0
|
| 207 |
+
rapport_score: float = 0.5 # 0.0 to 1.0, starts neutral
|
| 208 |
+
consecutive_concessions: int = 0 # Tracks concession patterns
|
| 209 |
+
deal_reached: bool = False
|
| 210 |
+
final_terms: Optional[Dict] = None # Set when episode ends
|
| 211 |
+
cumulative_reward: float = 0.0
|
| 212 |
+
```
|
| 213 |
+
|
| 214 |
+
---
|
| 215 |
+
|
| 216 |
+
## The Scripted Opponent System
|
| 217 |
+
|
| 218 |
+
The opponent is implemented in `opponent.py` as the `ScriptedPersonaOpponent` class.
|
| 219 |
+
|
| 220 |
+
### The Rapport System (Language Sensitivity)
|
| 221 |
+
|
| 222 |
+
The key mechanism is **rapport** — a score from 0.0 to 1.0 that changes based on the agent's language quality.
|
| 223 |
+
|
| 224 |
+
**Collaborative Signals (increase rapport):**
|
| 225 |
+
```python
|
| 226 |
+
COLLABORATIVE_SIGNALS = [
|
| 227 |
+
"understand", "partnership", "mutual", "together", "value",
|
| 228 |
+
"appreciate", "flexible", "work with", "long-term", "relationship",
|
| 229 |
+
"reasonable", "fair", "both", "solution"
|
| 230 |
+
]
|
| 231 |
+
```
|
| 232 |
+
|
| 233 |
+
**Aggressive Signals (decrease rapport):**
|
| 234 |
+
```python
|
| 235 |
+
AGGRESSIVE_SIGNALS = [
|
| 236 |
+
"demand", "require", "final offer", "unacceptable", "must",
|
| 237 |
+
"non-negotiable", "take it or leave", "bottom line", "ultimatum",
|
| 238 |
+
"insist", "refuse", "absolutely not"
|
| 239 |
+
]
|
| 240 |
+
```
|
| 241 |
+
|
| 242 |
+
**How it works:**
|
| 243 |
+
```python
|
| 244 |
+
def update_rapport(self, agent_message: str) -> None:
|
| 245 |
+
msg_lower = agent_message.lower()
|
| 246 |
+
delta = 0.0
|
| 247 |
+
delta += sum(0.08 for w in COLLABORATIVE_SIGNALS if w in msg_lower)
|
| 248 |
+
delta -= sum(0.08 for w in AGGRESSIVE_SIGNALS if w in msg_lower)
|
| 249 |
+
delta = max(-0.20, min(0.20, delta)) # Cap per-round change
|
| 250 |
+
self.rapport = max(0.0, min(1.0, self.rapport + delta))
|
| 251 |
+
```
|
| 252 |
+
|
| 253 |
+
Every message the agent sends adjusts rapport by ±0.08 per keyword detected, capped at ±0.20 per round.
|
| 254 |
+
|
| 255 |
+
### Concession Rate: How Fast the Opponent Moves
|
| 256 |
+
|
| 257 |
+
Rapport directly modifies the opponent's concession rate:
|
| 258 |
+
|
| 259 |
+
```python
|
| 260 |
+
def get_concession_rate(self) -> float:
|
| 261 |
+
base_rates = {
|
| 262 |
+
"cooperative": 0.05, # 5% per round base
|
| 263 |
+
"cash_flow_stressed": 0.07,
|
| 264 |
+
"aggressive_anchor": 0.04,
|
| 265 |
+
}
|
| 266 |
+
base = base_rates[self.persona]
|
| 267 |
+
modifier = (self.rapport - 0.5) * base # +/- 50% of base
|
| 268 |
+
return max(0.01, base + modifier)
|
| 269 |
+
```
|
| 270 |
+
|
| 271 |
+
**Example:** Cooperative opponent with high rapport (0.8) concedes at 0.05 + (0.8 - 0.5) × 0.05 = **7.5% per round**. With low rapport (0.2), concedes at 0.05 + (0.2 - 0.5) × 0.05 = **2.5% per round**.
|
| 272 |
+
|
| 273 |
+
### Three Personas
|
| 274 |
+
|
| 275 |
+
#### 1. Cooperative (`single_issue`)
|
| 276 |
+
- Friendly, understanding tone
|
| 277 |
+
- 5% base concession rate, highly sensitive to rapport
|
| 278 |
+
- Accepts early if the price is at or above the floor and round ≥ 2
|
| 279 |
+
|
| 280 |
+
#### 2. Cash Flow Stressed (`multi_issue`)
|
| 281 |
+
- Cares about payment timing more than price
|
| 282 |
+
- 7% base concession rate, moderate rapport sensitivity
|
| 283 |
+
- Acceptance requires `payment_days ≤ 45`
|
| 284 |
+
- Comments on payment timing in responses
|
| 285 |
+
|
| 286 |
+
#### 3. Aggressive Anchor (`adversarial`)
|
| 287 |
+
- Opens at ceiling, hardens with pressure
|
| 288 |
+
- 4% base concession rate (least flexible)
|
| 289 |
+
- **Penalizes consecutive concessions** — if agent concedes 2+ rounds in a row, concession rate drops to 40% of normal
|
| 290 |
+
- Uses "hardening" templates when cornered
|
| 291 |
+
|
| 292 |
+
### Opponent Response Flow
|
| 293 |
+
|
| 294 |
+
```python
|
| 295 |
+
def respond(self, agent_message, agent_terms, round_number, consecutive_concessions):
|
| 296 |
+
# 1. Update rapport based on agent's language
|
| 297 |
+
self.update_rapport(agent_message)
|
| 298 |
+
|
| 299 |
+
# 2. Check acceptance (only from round 2 onward, and price must be ≥ floor)
|
| 300 |
+
if round_number >= 2 and agent_price >= self.price_floor and _acceptance_condition():
|
| 301 |
+
return self.templates["accept"], {**agent_terms, "_accepted": True}
|
| 302 |
+
|
| 303 |
+
# 3. Calculate concession rate
|
| 304 |
+
concession = self.get_concession_rate()
|
| 305 |
+
|
| 306 |
+
# 4. Aggressive anchor gets harder if detecting concession pattern
|
| 307 |
+
if self.persona == "aggressive_anchor" and consecutive_concessions >= 2:
|
| 308 |
+
concession = concession * 0.4 # 60% reduction!
|
| 309 |
+
template_key = "hardening"
|
| 310 |
+
elif round_number >= 70% of max_rounds:
|
| 311 |
+
template_key = "near_close"
|
| 312 |
+
else:
|
| 313 |
+
template_key = "counter"
|
| 314 |
+
|
| 315 |
+
# 5. Compute new position
|
| 316 |
+
new_position = self.current_position * (1 - concession)
|
| 317 |
+
new_position = max(self.price_floor, new_position) # Never go below floor
|
| 318 |
+
|
| 319 |
+
# 6. Return message and counter terms
|
| 320 |
+
return message, counter_terms
|
| 321 |
+
```
|
| 322 |
+
|
| 323 |
+
---
|
| 324 |
+
|
| 325 |
+
## The Grading System
|
| 326 |
+
|
| 327 |
+
Graders are in `graders.py` and produce scores in [0.0, 1.0]. They are **pure Python — zero LLM calls**, ensuring deterministic, reproducible scoring.
|
| 328 |
+
|
| 329 |
+
### Key Design: Relative Scoring
|
| 330 |
+
|
| 331 |
+
The graders score based on **how much the agent improved from the opponent's opening price**, not on absolute thresholds. This makes the environment learnable — the agent learns to negotiate better deals relative to where negotiations started.
|
| 332 |
+
|
| 333 |
+
```python
|
| 334 |
+
# Instead of scoring against a hardcoded floor, we score relative to the opening:
|
| 335 |
+
value = (opponent_opening - final_price) / (opponent_opening - BUYER_TARGET)
|
| 336 |
+
```
|
| 337 |
+
|
| 338 |
+
### Single Issue Grading
|
| 339 |
+
|
| 340 |
+
```python
|
| 341 |
+
def grade_single_issue(final_terms, deal_reached, rounds_taken, max_rounds=6, opponent_opening=52000.0):
|
| 342 |
+
if not deal_reached:
|
| 343 |
+
return 0.0
|
| 344 |
+
|
| 345 |
+
final_price = final_terms.get("price", opponent_opening)
|
| 346 |
+
BUYER_TARGET = 38000.0
|
| 347 |
+
|
| 348 |
+
# If price didn't improve from opening, minimal score
|
| 349 |
+
if final_price >= opponent_opening:
|
| 350 |
+
return 0.05
|
| 351 |
+
|
| 352 |
+
# How much did we improve relative to the possible improvement?
|
| 353 |
+
value = (opponent_opening - final_price) / (opponent_opening - BUYER_TARGET)
|
| 354 |
+
value = max(0.0, min(1.0, value))
|
| 355 |
+
|
| 356 |
+
# Efficiency penalty for taking too long
|
| 357 |
+
efficiency = 1.0 - (rounds_taken / max_rounds) ** 1.5 * 0.4
|
| 358 |
+
efficiency = max(0.1, efficiency) # Never below 0.1
|
| 359 |
+
|
| 360 |
+
return round(value * efficiency, 4)
|
| 361 |
+
```
|
| 362 |
+
|
| 363 |
+
**Example:**
|
| 364 |
+
- Opening: $52,000, Target: $38,000, Range: $14,000
|
| 365 |
+
- Final price: $45,000 → improvement: $7,000 → value = 0.50
|
| 366 |
+
- Round 3 → efficiency = 1.0 - (3/6)^1.5 × 0.4 ≈ 0.86
|
| 367 |
+
- **Score: 0.50 × 0.86 = 0.43**
|
| 368 |
+
|
| 369 |
+
### Multi-Issue Grading
|
| 370 |
+
|
| 371 |
+
```python
|
| 372 |
+
def grade_multi_issue(final_terms, deal_reached, rounds_taken, max_rounds=8, opponent_opening=52000.0):
|
| 373 |
+
# Two dimensions: price (70% weight) and payment_days (30% weight)
|
| 374 |
+
price_value = (opponent_opening - final_price) / (opponent_opening - 40000)
|
| 375 |
+
payment_score = (90 - payment_days) / (90 - 30)
|
| 376 |
+
|
| 377 |
+
value = 0.70 * price_value + 0.30 * payment_score
|
| 378 |
+
|
| 379 |
+
# If price didn't improve but payment did, still score on payment
|
| 380 |
+
if final_price >= opponent_opening:
|
| 381 |
+
value = 0.30 * payment_score # Only payment matters
|
| 382 |
+
```
|
| 383 |
+
|
| 384 |
+
**Example:**
|
| 385 |
+
- Price: $44,000 (good), Payment: Net-45 (good) → with the default $52,000 opening, price_value=0.67, payment_score=0.75
|
| 386 |
+
- value = 0.70×0.67 + 0.30×0.75 = 0.69
|
| 387 |
+
|
| 388 |
+
### Adversarial Grading
|
| 389 |
+
|
| 390 |
+
```python
|
| 391 |
+
def grade_adversarial(final_terms, deal_reached, rounds_taken, consecutive_concessions_flag, ...):
|
| 392 |
+
SURVIVAL_FLOOR = 0.15 # Completing any deal gets at least 0.15
|
| 393 |
+
|
| 394 |
+
# Three dimensions with weights
|
| 395 |
+
value = 0.40 * price_value + 0.35 * payment_score + 0.25 * support_score
|
| 396 |
+
|
| 397 |
+
# Pattern penalty: bad if you showed consecutive concessions
|
| 398 |
+
pattern_penalty = 0.10 if consecutive_concessions_flag else 0.0
|
| 399 |
+
|
| 400 |
+
raw = (value * efficiency) - pattern_penalty
|
| 401 |
+
return round(max(SURVIVAL_FLOOR, raw), 4)
|
| 402 |
+
```
|
| 403 |
+
|
| 404 |
+
---
|
| 405 |
+
|
| 406 |
+
## The Environment Core
|
| 407 |
+
|
| 408 |
+
The `ProcureRLEnvironment` class in `server/Procure_RL_environment.py` is the heart of the system.
|
| 409 |
+
|
| 410 |
+
### Reset Flow
|
| 411 |
+
|
| 412 |
+
```python
|
| 413 |
+
def reset(self, seed=None, episode_id=None, **kwargs):
|
| 414 |
+
task_id = kwargs.get("task_id", "single_issue")
|
| 415 |
+
|
| 416 |
+
# 1. Set up opponent with seeded RNG
|
| 417 |
+
opponent_seed = hash((seed, task_id)) % (2**32)
|
| 418 |
+
self._opponent = ScriptedPersonaOpponent(task_id=task_id, seed=opponent_seed, persona=...)
|
| 419 |
+
|
| 420 |
+
# 2. Get opponent's opening message and terms
|
| 421 |
+
opening_msg, opening_terms = self._opponent.get_opening_message()
|
| 422 |
+
self._opponent_opening_price = opening_terms.get("price", 52000.0)
|
| 423 |
+
|
| 424 |
+
# 3. Initialize state
|
| 425 |
+
self._state = NegotiationState(
|
| 426 |
+
task_id=task_id,
|
| 427 |
+
episode_id=episode_id or str(uuid.uuid4())[:8],
|
| 428 |
+
round_number=0,
|
| 429 |
+
rapport_score=0.5, # Neutral
|
| 430 |
+
...
|
| 431 |
+
)
|
| 432 |
+
|
| 433 |
+
# 4. Return initial observation
|
| 434 |
+
return NegotiationObservation(
|
| 435 |
+
...
|
| 436 |
+
supplier_message=opening_msg,
|
| 437 |
+
current_offer=opening_terms,
|
| 438 |
+
...
|
| 439 |
+
)
|
| 440 |
+
```
|
| 441 |
+
|
| 442 |
+
### Step Flow
|
| 443 |
+
|
| 444 |
+
```python
|
| 445 |
+
def step(self, action, **kwargs):
|
| 446 |
+
# 1. Validate action
|
| 447 |
+
if not isinstance(action, NegotiationAction):
|
| 448 |
+
action = NegotiationAction(...) # Convert from dict
|
| 449 |
+
|
| 450 |
+
# 2. Track consecutive concessions (for adversarial opponent)
|
| 451 |
+
if self._prev_agent_price is not None and "price" in action.terms:
|
| 452 |
+
if float(action.terms["price"]) > self._prev_agent_price:
|
| 453 |
+
self._consecutive_concessions += 1 # Agent moved toward opponent
|
| 454 |
+
else:
|
| 455 |
+
self._consecutive_concessions = 0
|
| 456 |
+
self._prev_agent_price = float(action.terms["price"])
|
| 457 |
+
|
| 458 |
+
# 3. Handle different move types
|
| 459 |
+
if action.move_type in ("make_offer", "bundle"):
|
| 460 |
+
# Get opponent response
|
| 461 |
+
opponent_msg, opponent_terms = self._opponent.respond(...)
|
| 462 |
+
|
| 463 |
+
# Check if opponent accepted
|
| 464 |
+
if opponent_terms.get("_accepted"):
|
| 465 |
+
# Episode ends, compute reward
|
| 466 |
+
reward = grade(...)
|
| 467 |
+
return obs_with_reward
|
| 468 |
+
|
| 469 |
+
# Otherwise, continue negotiation
|
| 470 |
+
self._last_offer = opponent_terms
|
| 471 |
+
return obs_with_current_state
|
| 472 |
+
|
| 473 |
+
if action.move_type == "accept":
|
| 474 |
+
# Agent accepts current terms, episode ends
|
| 475 |
+
reward = grade(...)
|
| 476 |
+
return obs_with_reward
|
| 477 |
+
|
| 478 |
+
if action.move_type == "reject":
|
| 479 |
+
if round_number >= max_rounds:
|
| 480 |
+
# Rejected at limit, no reward
|
| 481 |
+
return obs_done_no_reward
|
| 482 |
+
return obs_continue # Rejected early, keep going
|
| 483 |
+
```
|
| 484 |
+
|
| 485 |
+
### State Property
|
| 486 |
+
|
| 487 |
+
```python
|
| 488 |
+
@property
|
| 489 |
+
def state(self) -> NegotiationState:
|
| 490 |
+
return self._state
|
| 491 |
+
```
|
| 492 |
+
|
| 493 |
+
Returns the internal `NegotiationState` object, giving access to:
|
| 494 |
+
- `round_number`
|
| 495 |
+
- `rapport_score`
|
| 496 |
+
- `consecutive_concessions`
|
| 497 |
+
- `deal_reached`
|
| 498 |
+
- `final_terms`
|
| 499 |
+
- `cumulative_reward`
|
| 500 |
+
|
| 501 |
+
---
|
| 502 |
+
|
| 503 |
+
## The Server API
|
| 504 |
+
|
| 505 |
+
The FastAPI server in `server/app.py` exposes the environment over HTTP and WebSocket.
|
| 506 |
+
|
| 507 |
+
### Endpoints
|
| 508 |
+
|
| 509 |
+
| Endpoint | Method | Description |
|
| 510 |
+
|----------|--------|-------------|
|
| 511 |
+
| `/health` | GET | Health check |
|
| 512 |
+
| `/reset` | POST | Reset environment with `task_id` and `seed` |
|
| 513 |
+
| `/step` | POST | Execute an action |
|
| 514 |
+
| `/state` | GET | Get current `NegotiationState` |
|
| 515 |
+
| `/ws` | WS | WebSocket for persistent sessions |
|
| 516 |
+
|
| 517 |
+
### Request/Response Examples
|
| 518 |
+
|
| 519 |
+
**POST /reset**
|
| 520 |
+
```json
|
| 521 |
+
// Request
|
| 522 |
+
{"task_id": "single_issue", "seed": 42}
|
| 523 |
+
|
| 524 |
+
// Response
|
| 525 |
+
{
|
| 526 |
+
"task_id": "single_issue",
|
| 527 |
+
"round_number": 0,
|
| 528 |
+
"max_rounds": 6,
|
| 529 |
+
"supplier_message": "Thanks for reaching out. Our standard pricing for this package is $52,400. Happy to discuss.",
|
| 530 |
+
"current_offer": {"price": 52400.0},
|
| 531 |
+
"buyer_constraints": {"price": {"target": 36000, "worst": 55000, "budget": 53000}},
|
| 532 |
+
"rapport_hint": "neutral",
|
| 533 |
+
"done": false
|
| 534 |
+
}
|
| 535 |
+
```
|
| 536 |
+
|
| 537 |
+
**POST /step**
|
| 538 |
+
```json
|
| 539 |
+
// Request
|
| 540 |
+
{"move_type": "make_offer", "terms": {"price": 48000}, "message": "I appreciate your flexibility and would like to find a fair price for both parties."}
|
| 541 |
+
|
| 542 |
+
// Response
|
| 543 |
+
{
|
| 544 |
+
"observation": {
|
| 545 |
+
"task_id": "single_issue",
|
| 546 |
+
"round_number": 1,
|
| 547 |
+
"max_rounds": 6,
|
| 548 |
+
"supplier_message": "I appreciate you working with us. Based on our costs, $49,800 is where we can be.",
|
| 549 |
+
"current_offer": {"price": 49800.0},
|
| 550 |
+
"rapport_hint": "positive",
|
| 551 |
+
"done": false
|
| 552 |
+
},
|
| 553 |
+
"reward": 0.0,
|
| 554 |
+
"done": false,
|
| 555 |
+
"info": {}
|
| 556 |
+
}
|
| 557 |
+
```
|
| 558 |
+
|
| 559 |
+
### Key Implementation Detail: Lambda Closure
|
| 560 |
+
|
| 561 |
+
```python
|
| 562 |
+
_env_instance = ProcureRLEnvironment()
|
| 563 |
+
|
| 564 |
+
app = create_app(
|
| 565 |
+
lambda: _env_instance, # Lambda is CRITICAL - creates new env per request otherwise
|
| 566 |
+
NegotiationAction,
|
| 567 |
+
NegotiationObservation,
|
| 568 |
+
env_name="ProcureRL",
|
| 569 |
+
max_concurrent_envs=1,
|
| 570 |
+
)
|
| 571 |
+
```
|
| 572 |
+
|
| 573 |
+
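`create_app()` expects a callable that returns an environment, and it may call that callable for each request. If the callable constructed a new object (for example, the `ProcureRLEnvironment` class itself), each request would get a **fresh environment** instead of reusing the same one. The lambda closes over `_env_instance` and always returns that one object, so all requests share the same environment.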
|
| 574 |
+
|
| 575 |
+
---
|
| 576 |
+
|
| 577 |
+
## The Inference Script
|
| 578 |
+
|
| 579 |
+
`inference.py` is a baseline agent that runs an LLM against the environment.
|
| 580 |
+
|
| 581 |
+
### Output Format (Sacred)
|
| 582 |
+
|
| 583 |
+
The script MUST output exactly:
|
| 584 |
+
```
|
| 585 |
+
[START] task=single_issue env=procure-rl model=Qwen/Qwen2.5-72B-Instruct
|
| 586 |
+
[STEP] step=1 action=make_offer({"price": 45000}) reward=0.00 done=false error=null
|
| 587 |
+
[STEP] step=2 action=accept({}) reward=0.47 done=true error=null
|
| 588 |
+
[END] success=true steps=2 score=0.47 rewards=0.00,0.47
|
| 589 |
+
```
|
| 590 |
+
|
| 591 |
+
Any deviation from this format causes validation to fail.
|
| 592 |
+
|
| 593 |
+
### How It Works
|
| 594 |
+
|
| 595 |
+
```python
|
| 596 |
+
def run_task(task_id):
|
| 597 |
+
env = ProcureRLEnvironment()
|
| 598 |
+
obs = env.reset(task_id=task_id, seed=42)
|
| 599 |
+
|
| 600 |
+
print(f"[START] task={task_id} ...")
|
| 601 |
+
|
| 602 |
+
while not done and step < MAX_STEPS:
|
| 603 |
+
# 1. Get action from LLM
|
| 604 |
+
action_dict = get_agent_action(obs_to_dict(obs))
|
| 605 |
+
|
| 606 |
+
# 2. Convert to NegotiationAction
|
| 607 |
+
action = NegotiationAction(
|
| 608 |
+
move_type=action_dict.get("move_type", "make_offer"),
|
| 609 |
+
terms=action_dict.get("terms", {}),
|
| 610 |
+
message=action_dict.get("message", "")
|
| 611 |
+
)
|
| 612 |
+
|
| 613 |
+
# 3. Step environment
|
| 614 |
+
obs = env.step(action)
|
| 615 |
+
|
| 616 |
+
# 4. Print step result
|
| 617 |
+
print(f"[STEP] step={step} action={...} reward={obs.reward:.2f} ...")
|
| 618 |
+
|
| 619 |
+
if obs.done:
|
| 620 |
+
final_score = obs.reward
|
| 621 |
+
break
|
| 622 |
+
|
| 623 |
+
print(f"[END] success={...} steps={step} score={final_score:.2f} ...")
|
| 624 |
+
```
|
| 625 |
+
|
| 626 |
+
### LLM Prompt
|
| 627 |
+
|
| 628 |
+
```python
|
| 629 |
+
SYSTEM_PROMPT = """You are a professional procurement negotiator. Your goal is to negotiate the best possible deal for your company.
|
| 630 |
+
|
| 631 |
+
You will receive a supplier's message and current offer terms. You must respond with a JSON action:
|
| 632 |
+
{
|
| 633 |
+
"move_type": "make_offer",
|
| 634 |
+
"terms": {"price": 42000, "payment_days": 45},
|
| 635 |
+
"message": "Your natural language response to the supplier"
|
| 636 |
+
}
|
| 637 |
+
|
| 638 |
+
move_type must be one of: make_offer, accept, reject, bundle
|
| 639 |
+
message should be professional and collaborative when possible."""
|
| 640 |
+
```
|
| 641 |
+
|
| 642 |
+
---
|
| 643 |
+
|
| 644 |
+
## End-to-End Example
|
| 645 |
+
|
| 646 |
+
Here's a full negotiation episode for `single_issue`:
|
| 647 |
+
|
| 648 |
+
### Round 0: Reset
|
| 649 |
+
```python
|
| 650 |
+
env.reset(task_id="single_issue", seed=42)
|
| 651 |
+
# Returns:
|
| 652 |
+
# supplier_message: "Thanks for reaching out. Our standard pricing for this package is $52,400..."
|
| 653 |
+
# current_offer: {"price": 52400.0}
|
| 654 |
+
# buyer_constraints: {"price": {"target": 36000, ...}}
|
| 655 |
+
# rapport_hint: "neutral"
|
| 656 |
+
```
|
| 657 |
+
|
| 658 |
+
### Round 1: Agent Makes Offer with Collaborative Language
|
| 659 |
+
|
| 660 |
+
```python
|
| 661 |
+
action = NegotiationAction(
|
| 662 |
+
move_type="make_offer",
|
| 663 |
+
terms={"price": 48000},
|
| 664 |
+
message="I value our potential partnership and believe we can find a fair price that works for both of us. We're flexible on timeline."
|
| 665 |
+
)
|
| 666 |
+
obs = env.step(action)
|
| 667 |
+
# Returns:
|
| 668 |
+
# supplier_message: "I appreciate you working with us. Based on our costs, $49,600 is where we can be."
|
| 669 |
+
# current_offer: {"price": 49600.0}
|
| 670 |
+
# rapport_hint: "positive" (because message contained collaborative signals)
|
| 671 |
+
# reward: 0.0 (still negotiating, no reward yet)
|
| 672 |
+
```
|
| 673 |
+
|
| 674 |
+
### Round 2: Agent Concedes
|
| 675 |
+
|
| 676 |
+
```python
|
| 677 |
+
action = NegotiationAction(
|
| 678 |
+
move_type="make_offer",
|
| 679 |
+
terms={"price": 47000},
|
| 680 |
+
message="I understand your cost constraints. Let's work together to find a solution."
|
| 681 |
+
)
|
| 682 |
+
obs = env.step(action)
|
| 683 |
+
# Returns:
|
| 684 |
+
# supplier_message: "I think we're close. If you can do $46,700, I can get this approved today."
|
| 685 |
+
# current_offer: {"price": 46700.0}
|
| 686 |
+
# rapport_hint: "positive"
|
| 687 |
+
```
|
| 688 |
+
|
| 689 |
+
### Round 3: Agent Concedes Again (Consecutive!)
|
| 690 |
+
|
| 691 |
+
```python
|
| 692 |
+
action = NegotiationAction(
|
| 693 |
+
move_type="make_offer",
|
| 694 |
+
terms={"price": 46000},
|
| 695 |
+
message="We can move to $46,000 as a final compromise."
|
| 696 |
+
)
|
| 697 |
+
obs = env.step(action)
|
| 698 |
+
# Returns:
|
| 699 |
+
# supplier_message: "That works for us. Let's move forward at those terms."
|
| 700 |
+
# done: true
|
| 701 |
+
# reward: terminal score from the grader (non-zero only now that the deal has closed; the calculation is worked through under "Grading This Episode")
|
| 702 |
+
# info: {"deal_price": 46000}
|
| 703 |
+
```
|
| 704 |
+
|
| 705 |
+
### Grading This Episode
|
| 706 |
+
|
| 707 |
+
- Opening: $52,400
|
| 708 |
+
- Target: $36,000
|
| 709 |
+
- Range: $16,400
|
| 710 |
+
- Improvement: $52,400 - $46,000 = $6,400
|
| 711 |
+
- value = $6,400 / $16,400 = 0.39
|
| 712 |
+
- Round 3 → efficiency = 1.0 - (3/6)^1.5 × 0.4 ≈ 0.86
|
| 713 |
+
- **Score: 0.39 × 0.86 ≈ 0.34**
|
| 714 |
+
|
| 715 |
+
---
|
| 716 |
+
|
| 717 |
+
## Docker Deployment
|
| 718 |
+
|
| 719 |
+
### Dockerfile
|
| 720 |
+
|
| 721 |
+
```dockerfile
|
| 722 |
+
FROM python:3.11-slim
|
| 723 |
+
WORKDIR /app
|
| 724 |
+
COPY requirements.txt .
|
| 725 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 726 |
+
COPY . .
|
| 727 |
+
ENV PORT=7860
|
| 728 |
+
EXPOSE 7860
|
| 729 |
+
CMD ["python", "-m", "uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
| 730 |
+
```
|
| 731 |
+
|
| 732 |
+
Key points:
|
| 733 |
+
- Port **7860** (not 8000) — required by HF Spaces
|
| 734 |
+
- `ENV PORT=7860` — sets the `PORT` environment variable; note that the `CMD` above also passes `--port 7860` explicitly, so the two must be kept in sync
|
| 735 |
+
- Uses `python -m uvicorn` with full module path
|
| 736 |
+
|
| 737 |
+
### Running
|
| 738 |
+
|
| 739 |
+
```bash
|
| 740 |
+
# Build
|
| 741 |
+
docker build -t procure-rl .
|
| 742 |
+
|
| 743 |
+
# Run
|
| 744 |
+
docker run -p 7860:7860 procure-rl
|
| 745 |
+
|
| 746 |
+
# Test
|
| 747 |
+
curl -X POST http://localhost:7860/reset -H "Content-Type: application/json" -d '{"task_id": "single_issue"}'
|
| 748 |
+
```
|
| 749 |
+
|
| 750 |
+
### Health Check
|
| 751 |
+
|
| 752 |
+
The server exposes a health endpoint:
|
| 753 |
+
```json
|
| 754 |
+
GET /health → {"status": "ok", "service": "procure-rl"}
|
| 755 |
+
```
|
| 756 |
+
|
| 757 |
+
---
|
| 758 |
+
|
| 759 |
+
## Calibration and Testing
|
| 760 |
+
|
| 761 |
+
### Test Files
|
| 762 |
+
|
| 763 |
+
#### `test_graders.py`
|
| 764 |
+
Verifies that all graders return scores in the [0.0, 1.0] range, even for edge cases.
|
| 765 |
+
|
| 766 |
+
#### `test_rl_properties.py`
|
| 767 |
+
Tests fundamental RL properties:
|
| 768 |
+
1. **Reproducibility**: Same seed → Same opening message
|
| 769 |
+
2. **Language sensitivity**: Collaborative language → Higher rapport
|
| 770 |
+
3. **Sequential decisions**: Consecutive concessions tracked in state
|
| 771 |
+
4. **Delayed reward**: Only terminal state has non-zero reward
|
| 772 |
+
5. **Accept terminates**: `move_type="accept"` ends episode
|
| 773 |
+
6. **Reset cleans state**: Fresh state after reset
|
| 774 |
+
|
| 775 |
+
#### `test_calibration.py`
|
| 776 |
+
Verifies score spread between random and strategic agents:
|
| 777 |
+
|
| 778 |
+
```
|
| 779 |
+
single_issue: Random avg=0.371, Strategic avg=0.487, Spread=0.116 ✅
|
| 780 |
+
multi_issue: Random avg=0.364, Strategic avg=0.535, Spread=0.171 ✅
|
| 781 |
+
adversarial: Random avg=0.304, Strategic avg=0.607, Spread=0.303 ✅
|
| 782 |
+
```
|
| 783 |
+
|
| 784 |
+
A healthy spread means the environment actually differentiates good vs bad behavior.
|
| 785 |
+
|
| 786 |
+
### Score Calibration Targets
|
| 787 |
+
|
| 788 |
+
| Task | Random Agent | Base LLM | Goal (Trained) |
|
| 789 |
+
|------|-------------|----------|-----------------|
|
| 790 |
+
| single_issue | 0.15–0.25 | 0.35–0.45 | 0.68–0.78 |
|
| 791 |
+
| multi_issue | 0.08–0.15 | 0.20–0.30 | 0.55–0.65 |
|
| 792 |
+
| adversarial | 0.03–0.10 | 0.12–0.20 | 0.45–0.55 |
|
| 793 |
+
|
| 794 |
+
---
|
| 795 |
+
|
| 796 |
+
## Summary: How Everything Fits Together
|
| 797 |
+
|
| 798 |
+
```
|
| 799 |
+
User runs inference.py
|
| 800 |
+
│
|
| 801 |
+
▼
|
| 802 |
+
LLM agent receives observation (supplier message, current offer, constraints)
|
| 803 |
+
│
|
| 804 |
+
▼
|
| 805 |
+
LLM decides action (make_offer with terms + collaborative message)
|
| 806 |
+
│
|
| 807 |
+
▼
|
| 808 |
+
Environment.step(action) is called
|
| 809 |
+
│
|
| 810 |
+
├─▶ Opponent responds (language → rapport → concession rate → counter)
|
| 811 |
+
│
|
| 812 |
+
├─▶ State is updated (round_number++, rapport_score, consecutive_concessions)
|
| 813 |
+
│
|
| 814 |
+
└─▶ Observation returned (supplier_message, current_offer, rapport_hint)
|
| 815 |
+
│
|
| 816 |
+
▼
|
| 817 |
+
If episode done: Grader scores the deal (relative to opening price, efficiency, patterns)
|
| 818 |
+
│
|
| 819 |
+
▼
|
| 820 |
+
Score in [0.0, 1.0] returned
|
| 821 |
+
```
|
| 822 |
+
|
| 823 |
+
The agent learns through many episodes:
|
| 824 |
+
- **What language gets better rapport** → better concession rates
|
| 825 |
+
- **When to concede vs hold** → efficiency bonus
|
| 826 |
+
- **How to bundle multiple issues** → multi-issue tasks
|
| 827 |
+
- **How to avoid consecutive concession patterns** → adversarial task
|
| 828 |
+
|
| 829 |
+
The environment is designed to be learnable but not trivial — requiring genuine strategic thinking from an LLM agent.
|
Instructions.md
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Overview
|
| 2 |
+
|
| 3 |
+
Build a **deterministic OpenEnv environment** for real-world procurement negotiation.
|
| 4 |
+
|
| 5 |
+
- Must follow OpenEnv API (`reset / step / state`)
|
| 6 |
+
- Must include **3 tasks (easy → medium → hard)**
|
| 7 |
+
- Must produce **deterministic rewards in [0.0, 1.0]**
|
| 8 |
+
- Must be **fully reproducible and deployable**
|
| 9 |
+
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
## Core Requirements
|
| 13 |
+
|
| 14 |
+
### 1. Environment
|
| 15 |
+
|
| 16 |
+
Implement in:
|
| 17 |
+
|
| 18 |
+
```
|
| 19 |
+
procure_rl/environment.py
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
- `reset(task_id, seed)` → initial observation
|
| 23 |
+
- `step(action)` → `(observation, reward, done, info)`
|
| 24 |
+
- `state()` → internal state
|
| 25 |
+
|
| 26 |
+
Use typed models from:
|
| 27 |
+
|
| 28 |
+
```
|
| 29 |
+
procure_rl/models.py
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
---
|
| 33 |
+
|
| 34 |
+
### 2. Tasks (MANDATORY: 3)
|
| 35 |
+
|
| 36 |
+
Defined in:
|
| 37 |
+
|
| 38 |
+
```
|
| 39 |
+
procure_rl/environment.py (TASK_CONFIG)
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
| Task | Description |
|
| 43 |
+
| ------------ | --------------------------------- |
|
| 44 |
+
| single_issue | price-only negotiation |
|
| 45 |
+
| multi_issue | price + payment tradeoff |
|
| 46 |
+
| adversarial | multi-issue + aggressive opponent |
|
| 47 |
+
|
| 48 |
+
Each must:
|
| 49 |
+
|
| 50 |
+
- have different difficulty
|
| 51 |
+
- run within step limits
|
| 52 |
+
- produce score ∈ [0,1]
|
| 53 |
+
|
| 54 |
+
---
|
| 55 |
+
|
| 56 |
+
### 3. Opponent (CRITICAL)
|
| 57 |
+
|
| 58 |
+
Implemented in:
|
| 59 |
+
|
| 60 |
+
```
|
| 61 |
+
procure_rl/opponent.py
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
Requirements:
|
| 65 |
+
|
| 66 |
+
- deterministic (seeded RNG)
|
| 67 |
+
- no LLM usage
|
| 68 |
+
- **language-sensitive behavior** (via keyword detection)
|
| 69 |
+
|
| 70 |
+
👉 This is what makes an LLM useful without breaking reproducibility.
|
| 71 |
+
|
| 72 |
+
---
|
| 73 |
+
|
| 74 |
+
### 4. Reward / Graders
|
| 75 |
+
|
| 76 |
+
Implemented in:
|
| 77 |
+
|
| 78 |
+
```
|
| 79 |
+
procure_rl/graders.py
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
Requirements:
|
| 83 |
+
|
| 84 |
+
- deterministic
|
| 85 |
+
- bounded [0.0, 1.0]
|
| 86 |
+
- reflect:
|
| 87 |
+
- deal quality
|
| 88 |
+
- efficiency (rounds)
|
| 89 |
+
|
| 90 |
+
- no randomness, no LLM
|
| 91 |
+
|
| 92 |
+
---
|
| 93 |
+
|
| 94 |
+
### 5. API Server
|
| 95 |
+
|
| 96 |
+
Implemented in:
|
| 97 |
+
|
| 98 |
+
```
|
| 99 |
+
server/app.py
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
Endpoints:
|
| 103 |
+
|
| 104 |
+
- `/reset`
|
| 105 |
+
- `/step`
|
| 106 |
+
- `/state`
|
| 107 |
+
- `/health`
|
| 108 |
+
|
| 109 |
+
Must return valid JSON and HTTP 200.
|
| 110 |
+
|
| 111 |
+
---
|
| 112 |
+
|
| 113 |
+
### 6. OpenEnv Spec
|
| 114 |
+
|
| 115 |
+
File:
|
| 116 |
+
|
| 117 |
+
```
|
| 118 |
+
openenv.yaml
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
Must define:
|
| 122 |
+
|
| 123 |
+
- environment name
|
| 124 |
+
- tasks (3+)
|
| 125 |
+
- reward range
|
| 126 |
+
- action/observation description
|
| 127 |
+
|
| 128 |
+
Validate with:
|
| 129 |
+
|
| 130 |
+
```
|
| 131 |
+
openenv validate
|
| 132 |
+
```
|
| 133 |
+
|
| 134 |
+
---
|
| 135 |
+
|
| 136 |
+
### 7. Inference Script (MANDATORY)
|
| 137 |
+
|
| 138 |
+
File:
|
| 139 |
+
|
| 140 |
+
```
|
| 141 |
+
inference.py
|
| 142 |
+
```
|
| 143 |
+
|
| 144 |
+
Requirements:
|
| 145 |
+
|
| 146 |
+
- uses OpenAI client
|
| 147 |
+
- reads:
|
| 148 |
+
- `API_BASE_URL`
|
| 149 |
+
- `MODEL_NAME`
|
| 150 |
+
- `HF_TOKEN`
|
| 151 |
+
|
| 152 |
+
- interacts with env via loop
|
| 153 |
+
- prints EXACT format:
|
| 154 |
+
|
| 155 |
+
```
|
| 156 |
+
[START] ...
|
| 157 |
+
[STEP] ...
|
| 158 |
+
[END] ...
|
| 159 |
+
```
|
| 160 |
+
|
| 161 |
+
⚠️ Any formatting deviation → failure
|
| 162 |
+
|
| 163 |
+
---
|
| 164 |
+
|
| 165 |
+
### 8. Docker + Deployment
|
| 166 |
+
|
| 167 |
+
File:
|
| 168 |
+
|
| 169 |
+
```
|
| 170 |
+
Dockerfile
|
| 171 |
+
```
|
| 172 |
+
|
| 173 |
+
Must:
|
| 174 |
+
|
| 175 |
+
- build successfully
|
| 176 |
+
- expose port `7860`
|
| 177 |
+
- run FastAPI server
|
| 178 |
+
|
| 179 |
+
Test:
|
| 180 |
+
|
| 181 |
+
```
|
| 182 |
+
docker build -t procure-rl .
|
| 183 |
+
docker run -p 7860:7860 procure-rl
|
| 184 |
+
```
|
| 185 |
+
|
| 186 |
+
---
|
| 187 |
+
|
| 188 |
+
### 9. Hugging Face Space
|
| 189 |
+
|
| 190 |
+
Must:
|
| 191 |
+
|
| 192 |
+
- deploy successfully
|
| 193 |
+
- respond to `/reset` with HTTP 200
|
| 194 |
+
|
| 195 |
+
---
|
| 196 |
+
|
| 197 |
+
### 10. README
|
| 198 |
+
|
| 199 |
+
Must include:
|
| 200 |
+
|
| 201 |
+
- environment description
|
| 202 |
+
- action & observation formats
|
| 203 |
+
- task descriptions
|
| 204 |
+
- setup instructions
|
| 205 |
+
- baseline scores
|
| 206 |
+
|
| 207 |
+
---
|
| 208 |
+
|
| 209 |
+
## Validation Checklist (ALL REQUIRED)
|
| 210 |
+
|
| 211 |
+
Run before submission:
|
| 212 |
+
|
| 213 |
+
```
|
| 214 |
+
openenv validate
|
| 215 |
+
docker build .
|
| 216 |
+
python inference.py
|
| 217 |
+
```
|
| 218 |
+
|
| 219 |
+
Ensure:
|
| 220 |
+
|
| 221 |
+
- all 3 tasks run
|
| 222 |
+
- scores ∈ [0,1]
|
| 223 |
+
- runtime < 20 minutes
|
| 224 |
+
- no crashes
|
| 225 |
+
|
| 226 |
+
---
|
| 227 |
+
|
| 228 |
+
## Constraints
|
| 229 |
+
|
| 230 |
+
- No LLM inside environment
|
| 231 |
+
- No randomness without seed
|
| 232 |
+
- Must run on:
|
| 233 |
+
- 2 vCPU
|
| 234 |
+
- 8GB RAM
|
| 235 |
+
|
| 236 |
+
---
|
| 237 |
+
|
| 238 |
+
## Key Design Principle
|
| 239 |
+
|
| 240 |
+
> LLM is used for **decision-making**, not environment logic.
|
| 241 |
+
|
| 242 |
+
- Environment = deterministic
|
| 243 |
+
- Agent (LLM) = intelligent
|
| 244 |
+
|
| 245 |
+
---
|
| 246 |
+
|
| 247 |
+
## File Reference Summary
|
| 248 |
+
|
| 249 |
+
```
|
| 250 |
+
procure_rl/
|
| 251 |
+
models.py # dataclasses
|
| 252 |
+
environment.py # core logic
|
| 253 |
+
opponent.py # scripted opponent
|
| 254 |
+
graders.py # reward functions
|
| 255 |
+
|
| 256 |
+
server/
|
| 257 |
+
app.py # API
|
| 258 |
+
|
| 259 |
+
inference.py # baseline agent
|
| 260 |
+
openenv.yaml # spec
|
| 261 |
+
Dockerfile # deployment
|
| 262 |
+
README.md # docs
|
| 263 |
+
```
|
| 264 |
+
|
| 265 |
+
---
|
| 266 |
+
|
| 267 |
+
## Final Rule
|
| 268 |
+
|
| 269 |
+
If any of these fail:
|
| 270 |
+
|
| 271 |
+
- Docker build
|
| 272 |
+
- openenv validate
|
| 273 |
+
- inference script
|
| 274 |
+
|
| 275 |
+
👉 **Submission is disqualified**
|
| 276 |
+
|
| 277 |
+
---
|
| 278 |
+
|
| 279 |
+
## One-line Goal
|
| 280 |
+
|
| 281 |
+
> Build a deterministic, real-world negotiation environment where an LLM agent must make sequential decisions to maximize reward.
|
| 282 |
+
|
| 283 |
+
---
|
README.md
CHANGED
|
@@ -1,10 +1,328 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: green
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: ProcureRL Environment
|
| 3 |
+
emoji: 🤝
|
| 4 |
colorFrom: green
|
| 5 |
+
colorTo: blue
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
app_port: 7860
|
| 9 |
+
base_path: /web
|
| 10 |
+
tags:
|
| 11 |
+
- openenv
|
| 12 |
+
- negotiation
|
| 13 |
+
- procurement
|
| 14 |
+
- rl
|
| 15 |
+
- real-world
|
| 16 |
---
|
| 17 |
|
| 18 |
+
# ProcureRL: Procurement Negotiation RL Environment
|
| 19 |
+
|
| 20 |
+
An OpenEnv-compliant RL environment where an LLM agent learns to negotiate procurement deals against scripted supplier opponents with language-sensitive behavior.
|
| 21 |
+
|
| 22 |
+
## The Key Innovation: Language-Sensitive Opponent
|
| 23 |
+
|
| 24 |
+
The opponent's concession rate is directly affected by the **quality of the agent's natural language**:
|
| 25 |
+
|
| 26 |
+
- **Collaborative language** ("let's work together", "mutual benefit") → increases rapport → opponent concedes more
|
| 27 |
+
- **Neutral language** → opponent concedes at baseline rate
|
| 28 |
+
- **Aggressive language** ("final offer", "take it or leave it") → rapport drops → opponent hardens
|
| 29 |
+
|
| 30 |
+
This makes an LLM genuinely required — the quality of its output directly affects negotiation outcomes.
|
| 31 |
+
|
| 32 |
+
## Quick Start
|
| 33 |
+
|
| 34 |
+
```python
|
| 35 |
+
from server.Procure_RL_environment import ProcureRLEnvironment
|
| 36 |
+
from models import NegotiationAction
|
| 37 |
+
|
| 38 |
+
env = ProcureRLEnvironment()
|
| 39 |
+
obs = env.reset(task_id="single_issue", seed=42)
|
| 40 |
+
|
| 41 |
+
print(f"Supplier: {obs.supplier_message}")
|
| 42 |
+
print(f"Offer: {obs.current_offer}")
|
| 43 |
+
print(f"Your target: {obs.buyer_constraints}")
|
| 44 |
+
|
| 45 |
+
action = NegotiationAction(
|
| 46 |
+
move_type="make_offer",
|
| 47 |
+
terms={"price": 42000},
|
| 48 |
+
message="Let's find a mutually beneficial solution."
|
| 49 |
+
)
|
| 50 |
+
obs = env.step(action)
|
| 51 |
+
print(f"Response: {obs.supplier_message}")
|
| 52 |
+
print(f"New offer: {obs.current_offer}")
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
## Web Interface Example
|
| 56 |
+
|
| 57 |
+
The web interface at `/web` provides a visual playground. Here's how to use it:
|
| 58 |
+
|
| 59 |
+
### Step 1: Reset the Environment
|
| 60 |
+
|
| 61 |
+
Click **Reset** to start a new negotiation episode. You can customize the reset by passing JSON:
|
| 62 |
+
|
| 63 |
+
```json
|
| 64 |
+
{"task_id": "single_issue", "seed": 42}
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
**Available tasks:**
|
| 68 |
+
- `single_issue` — Price-only negotiation (6 rounds max)
|
| 69 |
+
- `multi_issue` — Price + payment terms (8 rounds max)
|
| 70 |
+
- `adversarial` — Price + payment + support hours (10 rounds max)
|
| 71 |
+
|
| 72 |
+
### Step 2: Make an Offer
|
| 73 |
+
|
| 74 |
+
Fill in the form fields:
|
| 75 |
+
|
| 76 |
+
| Field | Example Value | Notes |
|
| 77 |
+
|-------|--------------|-------|
|
| 78 |
+
| `move_type` | `make_offer` | Options: make_offer, accept, reject, bundle |
|
| 79 |
+
| `terms` | `{"price": 42000}` | JSON object with negotiation terms |
|
| 80 |
+
| `message` | `I value our partnership and believe we can find a fair solution.` | Your natural language message (affects opponent rapport!) |
|
| 81 |
+
|
| 82 |
+
**Example: Making a collaborative offer**
|
| 83 |
+
```
|
| 84 |
+
move_type: make_offer
|
| 85 |
+
terms: {"price": 45000}
|
| 86 |
+
message: We appreciate your flexibility and would like to work together to find a solution that benefits both parties.
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
### Step 3: Read the Response
|
| 90 |
+
|
| 91 |
+
After clicking **Step**, you'll see:
|
| 92 |
+
- `supplier_message` — The opponent's natural language response
|
| 93 |
+
- `current_offer` — Updated terms on the table
|
| 94 |
+
- `rapport_hint` — "positive", "neutral", or "negative" based on your language
|
| 95 |
+
- `round_number` — Current round (0-indexed)
|
| 96 |
+
|
| 97 |
+
### Step 4: Continue or Accept
|
| 98 |
+
|
| 99 |
+
- **Make another offer** to continue negotiating
|
| 100 |
+
- **Use `accept`** when you're satisfied with the current terms
|
| 101 |
+
- **Use `reject`** only at the final round, and only if you want to walk away (no reward)
|
| 102 |
+
|
| 103 |
+
**Example: Accepting current terms**
|
| 104 |
+
```
|
| 105 |
+
move_type: accept
|
| 106 |
+
terms: {}
|
| 107 |
+
message:
|
| 108 |
+
```
|
| 109 |
+
|
| 110 |
+
### Multi-Issue Negotiation (Task 2 & 3)
|
| 111 |
+
|
| 112 |
+
For `multi_issue` and `adversarial`, include multiple terms:
|
| 113 |
+
|
| 114 |
+
```json
|
| 115 |
+
{
|
| 116 |
+
"move_type": "make_offer",
|
| 117 |
+
"terms": {
|
| 118 |
+
"price": 44000,
|
| 119 |
+
"payment_days": 30
|
| 120 |
+
},
|
| 121 |
+
"message": "We can offer faster payment terms if that helps your cash flow."
|
| 122 |
+
}
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
**Key insight:** In `multi_issue`, the opponent cares more about payment timing than price. Offering Net-30 payment can get you a better price!
|
| 126 |
+
|
| 127 |
+
### Example Full Episode
|
| 128 |
+
|
| 129 |
+
**Round 0 (Reset):**
|
| 130 |
+
- Task: `single_issue`
|
| 131 |
+
- Supplier opens: ~$52,000
|
| 132 |
+
- Your target: $36,000
|
| 133 |
+
|
| 134 |
+
**Round 1:**
|
| 135 |
+
- `move_type`: `make_offer`
|
| 136 |
+
- `terms`: `{"price": 48000}`
|
| 137 |
+
- `message`: `We value your partnership and want to find a fair price for both parties.`
|
| 138 |
+
|
| 139 |
+
**Round 2:**
|
| 140 |
+
- Supplier counter-offers at ~$46,000 (rapport is positive!)
|
| 141 |
+
- `move_type`: `make_offer`
|
| 142 |
+
- `terms`: `{"price": 45000}`
|
| 143 |
+
- `message`: `I appreciate your movement. Let's see if we can get to $45,000.`
|
| 144 |
+
|
| 145 |
+
**Round 3:**
|
| 146 |
+
- Supplier accepts or counter-offers near your target
|
| 147 |
+
- `move_type`: `accept`
|
| 148 |
+
- `terms`: `{}`
|
| 149 |
+
- Final score: based on how close the final price is to your target and how few rounds were used
|
| 150 |
+
|
| 151 |
+
## The Three Tasks
|
| 152 |
+
|
| 153 |
+
### 1. `single_issue` (Easy)
|
| 154 |
+
Renew software license. Price only.
|
| 155 |
+
|
| 156 |
+
- Buyer target: $36,000, Budget: $53,000
|
| 157 |
+
- Seller opens: ~$52,000 (varies by seed)
|
| 158 |
+
- Opponent persona: Cooperative
|
| 159 |
+
- Max rounds: 6
|
| 160 |
+
|
| 161 |
+
**Scoring:** Deal quality (how close to target) × Efficiency (how few rounds)
|
| 162 |
+
|
| 163 |
+
### 2. `multi_issue` (Medium)
|
| 164 |
+
Enterprise software deal. Price + payment terms.
|
| 165 |
+
|
| 166 |
+
- Buyer weights: price 70%, payment 30%
|
| 167 |
+
- Seller persona: Cash Flow Stressed (cares more about payment timing)
|
| 168 |
+
- **Trade opportunity**: offer Net-30 payment to get lower price
|
| 169 |
+
- Max rounds: 8
|
| 170 |
+
|
| 171 |
+
**Scoring:** Weighted combination of price improvement + payment terms
|
| 172 |
+
|
| 173 |
+
### 3. `adversarial` (Hard)
|
| 174 |
+
Large contract negotiation. Price + payment + support hours.
|
| 175 |
+
|
| 176 |
+
- Opponent persona: Aggressive Anchor
|
| 177 |
+
- Opens at ceiling on all issues
|
| 178 |
+
- Hardens position if you make 2+ consecutive concessions
|
| 179 |
+
- Requires consistent collaborative framing
|
| 180 |
+
- Survival floor: any deal scores at least 0.15
|
| 181 |
+
- Max rounds: 10
|
| 182 |
+
|
| 183 |
+
**Scoring:** Multi-dimensional value minus pattern penalty for consecutive concessions
|
| 184 |
+
|
| 185 |
+
## Action Space
|
| 186 |
+
|
| 187 |
+
```python
|
| 188 |
+
NegotiationAction(
|
| 189 |
+
move_type="make_offer", # make_offer | accept | reject | bundle
|
| 190 |
+
terms={"price": 44000, "payment_days": 45, "support_hours": 120},
|
| 191 |
+
message="We appreciate your flexibility on this."
|
| 192 |
+
)
|
| 193 |
+
```
|
| 194 |
+
|
| 195 |
+
| move_type | Description |
|
| 196 |
+
|-----------|-------------|
|
| 197 |
+
| `make_offer` | Propose terms (price required, others optional) |
|
| 198 |
+
| `accept` | Accept current offer on table |
|
| 199 |
+
| `reject` | Walk away (only use at final round) |
|
| 200 |
+
| `bundle` | Alias for make_offer with multi-issue terms |
|
| 201 |
+
|
| 202 |
+
## Observation Space
|
| 203 |
+
|
| 204 |
+
```python
|
| 205 |
+
NegotiationObservation(
|
| 206 |
+
task_id="single_issue",
|
| 207 |
+
round_number=2,
|
| 208 |
+
max_rounds=6,
|
| 209 |
+
supplier_message="I appreciate your offer. Based on our costs...",
|
| 210 |
+
current_offer={"price": 46000},
|
| 211 |
+
last_4_exchanges=[...],
|
| 212 |
+
buyer_constraints={"price": {"target": 36000, "worst": 55000, "budget": 53000}},
|
| 213 |
+
rapport_hint="positive", # positive | neutral | negative
|
| 214 |
+
done=False
|
| 215 |
+
)
|
| 216 |
+
```
|
| 217 |
+
|
| 218 |
+
## Running the Server
|
| 219 |
+
|
| 220 |
+
```bash
|
| 221 |
+
# Build Docker image
|
| 222 |
+
docker build -t procure-rl -f server/Dockerfile .
|
| 223 |
+
|
| 224 |
+
# Run container (port 7860 - required for HF Spaces)
|
| 225 |
+
docker run -p 7860:7860 procure-rl
|
| 226 |
+
|
| 227 |
+
# Access web interface at http://localhost:7860/web
|
| 228 |
+
```
|
| 229 |
+
|
| 230 |
+
## API Endpoints
|
| 231 |
+
|
| 232 |
+
| Endpoint | Method | Description |
|
| 233 |
+
|----------|--------|-------------|
|
| 234 |
+
| `/health` | GET | Health check |
|
| 235 |
+
| `/metadata` | GET | Environment metadata |
|
| 236 |
+
| `/reset` | POST | Reset environment |
|
| 237 |
+
| `/step` | POST | Execute action |
|
| 238 |
+
| `/state` | GET | Get current state |
|
| 239 |
+
| `/ws` | WS | WebSocket for persistent sessions |
|
| 240 |
+
|
| 241 |
+
## Baseline Inference
|
| 242 |
+
|
| 243 |
+
Run inference against all three tasks:
|
| 244 |
+
|
| 245 |
+
```bash
|
| 246 |
+
cp .env.example .env
|
| 247 |
+
# Edit .env and add your HF_TOKEN
|
| 248 |
+
HF_TOKEN=your_token python inference.py
|
| 249 |
+
```
|
| 250 |
+
|
| 251 |
+
Output format (exact):
|
| 252 |
+
|
| 253 |
+
```
|
| 254 |
+
[START] task=single_issue env=procure-rl model=Qwen/Qwen2.5-72B-Instruct
|
| 255 |
+
[STEP] step=1 action=make_offer({"price": 42000}) reward=0.00 done=false error=null
|
| 256 |
+
[STEP] step=2 action=make_offer({"price": 41000}) reward=0.52 done=true error=null
|
| 257 |
+
[END] success=true steps=2 score=0.52 rewards=0.00,0.52
|
| 258 |
+
```
|
| 259 |
+
|
| 260 |
+
## Environment Design
|
| 261 |
+
|
| 262 |
+
### Rapport System
|
| 263 |
+
|
| 264 |
+
The opponent maintains a rapport score (0.0 to 1.0) updated per-round:
|
| 265 |
+
|
| 266 |
+
```python
|
| 267 |
+
COLLABORATIVE_SIGNALS = ["understand", "partnership", "mutual", "together", ...]
|
| 268 |
+
AGGRESSIVE_SIGNALS = ["demand", "require", "final offer", "unacceptable", ...]
|
| 269 |
+
|
| 270 |
+
delta = +0.08 per collaborative signal detected
|
| 271 |
+
delta = -0.08 per aggressive signal detected
|
| 272 |
+
delta = max(-0.20, min(0.20, delta)) # cap per round
|
| 273 |
+
```
|
| 274 |
+
|
| 275 |
+
### Opponent Personas
|
| 276 |
+
|
| 277 |
+
| Persona | Base Concession | Rapport Modifier | Special Behavior |
|
| 278 |
+
|---------|----------------|-------------------|-------------------|
|
| 279 |
+
| `cooperative` | 5% | ±50% | Responsive to language |
|
| 280 |
+
| `cash_flow_stressed` | 7% | ±50% | Accepts Net-45+, comments on payment |
|
| 281 |
+
| `aggressive_anchor` | 4% | ±50% | Hardens after 2+ consecutive concessions |
|
| 282 |
+
|
| 283 |
+
### Grading
|
| 284 |
+
|
| 285 |
+
Graders are pure Python — zero LLM calls. They combine:
|
| 286 |
+
- **Value**: how close to buyer's target
|
| 287 |
+
- **Efficiency**: penalty for taking too many rounds
|
| 288 |
+
- **Pattern penalty** (adversarial only): for consecutive concession behavior
|
| 289 |
+
|
| 290 |
+
Graders never crash on malformed input — they fall back to worst-case values.
|
| 291 |
+
|
| 292 |
+
## Project Structure
|
| 293 |
+
|
| 294 |
+
```
|
| 295 |
+
Procure_RL/
|
| 296 |
+
├── __init__.py # Package exports
|
| 297 |
+
├── client.py # EnvClient wrapper
|
| 298 |
+
├── models.py # NegotiationAction, NegotiationObservation, NegotiationState
|
| 299 |
+
├── opponent.py # ScriptedPersonaOpponent with 3 personas + rapport
|
| 300 |
+
├── graders.py # grade_single_issue, grade_multi_issue, grade_adversarial
|
| 301 |
+
├── inference.py # Baseline agent with [START][STEP][END] output
|
| 302 |
+
├── server/
|
| 303 |
+
│ ├── __init__.py
|
| 304 |
+
│ ├── app.py # FastAPI app
|
| 305 |
+
│ ├── Procure_RL_environment.py # ProcureRLEnvironment
|
| 306 |
+
│ ├── requirements.txt
|
| 307 |
+
│ └── Dockerfile
|
| 308 |
+
├── openenv.yaml # OpenEnv manifest
|
| 309 |
+
├── pyproject.toml
|
| 310 |
+
├── plan.md # Full design specification
|
| 311 |
+
└── README.md # This file
|
| 312 |
+
```
|
| 313 |
+
|
| 314 |
+
## Why This Environment?
|
| 315 |
+
|
| 316 |
+
**Market validation**: Walmart has deployed Pactum for AI-driven supplier negotiation, and 90% of CPOs are adopting AI negotiation in 2025.
|
| 317 |
+
|
| 318 |
+
**Research gap**: Zero negotiation environments in OpenEnv hub.
|
| 319 |
+
|
| 320 |
+
**LLM advantage**: Language quality directly affects opponent rapport — the language IS the policy.
|
| 321 |
+
|
| 322 |
+
**Reproducibility**: Deterministic scripted opponent, pure Python graders, no LLM in environment loop.
|
| 323 |
+
|
| 324 |
+
## Calibration
|
| 325 |
+
|
| 326 |
+
If the base LLM scores above 0.55 on `single_issue`, the opponent is too easy: reduce the `cooperative` persona's concession rate.
|
| 327 |
+
|
| 328 |
+
If the base LLM scores below 0.15 on `single_issue`, the opponent is too hard: increase the `cooperative` persona's concession rate.
|
__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""ProcureRL Environment."""
|
| 8 |
+
|
| 9 |
+
from .client import ProcureRLEnv
|
| 10 |
+
from .models import NegotiationAction, NegotiationObservation, NegotiationState
|
| 11 |
+
|
| 12 |
+
__all__ = [
|
| 13 |
+
"NegotiationAction",
|
| 14 |
+
"NegotiationObservation",
|
| 15 |
+
"NegotiationState",
|
| 16 |
+
"ProcureRLEnv",
|
| 17 |
+
]
|
client.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""ProcureRL Environment Client."""
|
| 8 |
+
|
| 9 |
+
from typing import Dict, Any
|
| 10 |
+
|
| 11 |
+
from openenv.core import EnvClient
|
| 12 |
+
from openenv.core.client_types import StepResult
|
| 13 |
+
|
| 14 |
+
from .models import NegotiationAction, NegotiationObservation, NegotiationState
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class ProcureRLEnv(
    EnvClient[NegotiationAction, NegotiationObservation, NegotiationState]
):
    """
    Client for the ProcureRL Environment.

    This client maintains a persistent WebSocket connection to the environment server,
    enabling efficient multi-step interactions with lower latency.
    Each client instance has its own dedicated environment session on the server.

    Example:
        >>> with ProcureRLEnv(base_url="http://localhost:8000") as client:
        ...     result = client.reset(task_id="single_issue")
        ...     print(result.observation.supplier_message)
        ...
        ...     action = NegotiationAction(move_type="make_offer", terms={"price": 42000}, message="Let's discuss")
        ...     result = client.step(action)
        ...     print(result.observation.supplier_message)
    """

    def _step_payload(self, action: NegotiationAction) -> Dict[str, Any]:
        """Serialize an action into the JSON body sent with a step request."""
        return {
            "move_type": action.move_type,
            "terms": action.terms,
            "message": action.message,
        }

    def _parse_result(
        self, payload: Dict[str, Any]
    ) -> StepResult[NegotiationObservation]:
        """Build a ``StepResult`` from a reset/step response payload.

        Missing keys fall back to the same defaults the observation model
        declares, so a partial payload never raises here.
        """
        # A payload carrying `"observation": null` must not crash the parser.
        obs_data = payload.get("observation") or {}

        # NOTE(review): OpenEnv servers appear to report `done` and `reward`
        # at the top level of the payload rather than inside `observation` —
        # confirm against the server serializer. Values nested inside the
        # observation still win when present, so existing payloads parse
        # exactly as before.
        observation = NegotiationObservation(
            task_id=obs_data.get("task_id", ""),
            round_number=obs_data.get("round_number", 0),
            max_rounds=obs_data.get("max_rounds", 0),
            supplier_message=obs_data.get("supplier_message", ""),
            current_offer=obs_data.get("current_offer", {}),
            last_4_exchanges=obs_data.get("last_4_exchanges", []),
            buyer_constraints=obs_data.get("buyer_constraints", {}),
            rapport_hint=obs_data.get("rapport_hint", "neutral"),
            done=obs_data.get("done", payload.get("done", False)),
            reward=obs_data.get("reward", payload.get("reward")),
            metadata=obs_data.get("metadata") or {},
        )

        return StepResult(
            observation=observation,
            reward=payload.get("reward", 0.0),
            done=payload.get("done", False),
        )

    def _parse_state(self, payload: Dict[str, Any]) -> NegotiationState:
        """Build a ``NegotiationState`` from a state response payload."""
        return NegotiationState(
            task_id=payload.get("task_id", ""),
            episode_id=payload.get("episode_id", ""),
            round_number=payload.get("round_number", 0),
            # step_count is declared on NegotiationState; forward it instead
            # of silently resetting it to the model default.
            step_count=payload.get("step_count", 0),
            rapport_score=payload.get("rapport_score", 0.5),
            consecutive_concessions=payload.get("consecutive_concessions", 0),
            deal_reached=payload.get("deal_reached", False),
            final_terms=payload.get("final_terms"),
            cumulative_reward=payload.get("cumulative_reward", 0.0),
        )
|
graders.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
Grading functions for procurement negotiation tasks.
|
| 9 |
+
|
| 10 |
+
Pure Python — zero LLM calls. Graders must never crash on malformed input.
|
| 11 |
+
|
| 12 |
+
Scoring is based on how much the agent improved from the opponent's opening price,
|
| 13 |
+
not on absolute thresholds. This makes the environment learnable — the agent learns
|
| 14 |
+
to negotiate better deals relative to where negotiations started.
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from typing import Dict, Optional
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def grade_single_issue(
    final_terms: Dict,
    deal_reached: bool,
    rounds_taken: int,
    max_rounds: int = 6,
    opponent_opening: float = 52000.0,
) -> float:
    """Score a price-only negotiation as deal value times round efficiency.

    Args:
        final_terms: Agreed terms; only ``"price"`` is read.
        deal_reached: Whether the negotiation ended in a deal.
        rounds_taken: Number of rounds used.
        max_rounds: Round budget used to normalize ``rounds_taken``.
        opponent_opening: The supplier's opening price; improvement is
            measured from here towards the buyer target of 38000.

    Returns:
        0.0 when there is no deal or no usable terms, 0.05 when the price did
        not improve on the opening, otherwise ``value * efficiency`` rounded
        to 4 decimals. Malformed input never raises: it is scored as the
        worst case.
    """
    BUYER_TARGET = 38000.0

    getter = getattr(final_terms, "get", None)
    if not deal_reached or not callable(getter):
        return 0.0

    # A missing, non-numeric or NaN price counts as "no movement from the
    # opening price", which is the worst outcome a closed deal can have.
    try:
        final_price = float(getter("price", opponent_opening))
    except (TypeError, ValueError, OverflowError):
        final_price = opponent_opening
    if final_price != final_price:  # NaN is the only value unequal to itself
        final_price = opponent_opening

    if final_price >= opponent_opening:
        return 0.05

    # When the opening is at or below the target there is no room to measure
    # improvement; score zero value instead of dividing by zero.
    span = opponent_opening - BUYER_TARGET
    if span > 0:
        value = max(0.0, min(1.0, (opponent_opening - final_price) / span))
    else:
        value = 0.0

    # Unusable round counts are treated as "whole budget used". Negative
    # progress is clamped because a negative base raised to 1.5 is complex.
    try:
        rounds = float(rounds_taken)
        limit = float(max_rounds)
    except (TypeError, ValueError, OverflowError):
        rounds, limit = 1.0, 1.0
    progress = max(0.0, rounds / limit) if limit > 0 else 1.0

    efficiency = max(0.1, 1.0 - progress ** 1.5 * 0.4)

    return round(value * efficiency, 4)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def grade_multi_issue(
    final_terms: Dict,
    deal_reached: bool,
    rounds_taken: int,
    max_rounds: int = 8,
    opponent_opening: float = 52000.0,
) -> float:
    """Score a price + payment-terms negotiation.

    Value is ``0.70 * price improvement + 0.30 * payment score``, multiplied
    by a linear round-efficiency factor.

    Args:
        final_terms: Agreed terms; ``"price"`` and ``"payment_days"`` are read.
        deal_reached: Whether the negotiation ended in a deal.
        rounds_taken: Number of rounds used.
        max_rounds: Round budget used to normalize ``rounds_taken``.
        opponent_opening: The supplier's opening price; price improvement is
            measured from here towards the buyer target of 40000.

    Returns:
        0.0 when there is no deal or no usable terms, 0.05 when neither issue
        improved, otherwise ``value * efficiency`` rounded to 4 decimals.
        Malformed input never raises: it is scored as the worst case.
    """
    BUYER_PRICE_TARGET = 40000.0
    PAYMENT_TARGET = 30
    PAYMENT_WORST = 90

    getter = getattr(final_terms, "get", None)
    if not deal_reached or not callable(getter):
        return 0.0

    # Missing, non-numeric or NaN issues fall back to their worst-case value.
    worst_case = {"price": opponent_opening, "payment_days": PAYMENT_WORST}
    terms = {}
    for issue, fallback in worst_case.items():
        try:
            number = float(getter(issue, fallback))
        except (TypeError, ValueError, OverflowError):
            number = fallback
        # NaN is the only value unequal to itself.
        terms[issue] = fallback if number != number else number
    final_price = terms["price"]
    payment_days = terms["payment_days"]

    if final_price >= opponent_opening and payment_days >= 90:
        return 0.05

    # No measurable price room when the opening is at or below the target.
    span = opponent_opening - BUYER_PRICE_TARGET
    if span > 0:
        price_value = max(0.0, min(1.0, (opponent_opening - final_price) / span))
    else:
        price_value = 0.0

    payment_score = (PAYMENT_WORST - payment_days) / (PAYMENT_WORST - PAYMENT_TARGET)
    payment_score = max(0.0, min(1.0, payment_score))

    if final_price >= opponent_opening:
        # No price improvement: only the payment terms carry value.
        value = 0.30 * payment_score
    else:
        value = 0.70 * price_value + 0.30 * payment_score

    # Unusable round counts are treated as "whole budget used".
    try:
        rounds = float(rounds_taken)
        limit = float(max_rounds)
    except (TypeError, ValueError, OverflowError):
        rounds, limit = 1.0, 1.0
    progress = rounds / limit if limit > 0 else 1.0

    efficiency = max(0.1, 1.0 - progress * 0.30)

    return round(value * efficiency, 4)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def grade_adversarial(
    final_terms: Dict,
    deal_reached: bool,
    rounds_taken: int,
    consecutive_concessions_flag: bool,
    max_rounds: int = 10,
    opponent_opening: float = 100000.0,
) -> float:
    """Score a three-issue negotiation (price, payment days, support hours).

    Value is ``0.40 * price + 0.35 * payment + 0.25 * support``, multiplied by
    a linear round-efficiency factor, minus a 0.10 penalty when
    ``consecutive_concessions_flag`` is set. Any closed deal scores at least
    the survival floor of 0.15.

    Args:
        final_terms: Agreed terms; ``"price"``, ``"payment_days"`` and
            ``"support_hours"`` are read.
        deal_reached: Whether the negotiation ended in a deal.
        rounds_taken: Number of rounds used.
        consecutive_concessions_flag: Whether the concession-pattern penalty
            applies.
        max_rounds: Round budget used to normalize ``rounds_taken``.
        opponent_opening: The supplier's opening price; price improvement is
            measured from here towards the buyer target of 80000.

    Returns:
        0.0 when there is no deal or no usable terms, otherwise the score
        rounded to 4 decimals and never below 0.15. Malformed input never
        raises: it is scored as the worst case.
    """
    SURVIVAL_FLOOR = 0.15
    BUYER_PRICE_TARGET = 80000.0

    getter = getattr(final_terms, "get", None)
    if not deal_reached or not callable(getter):
        return 0.0

    # Missing, non-numeric or NaN issues fall back to their worst-case value.
    worst_case = {
        "price": opponent_opening,
        "payment_days": 90,
        "support_hours": 80,
    }
    terms = {}
    for issue, fallback in worst_case.items():
        try:
            number = float(getter(issue, fallback))
        except (TypeError, ValueError, OverflowError):
            number = fallback
        # NaN is the only value unequal to itself.
        terms[issue] = fallback if number != number else number
    final_price = terms["price"]
    payment_days = terms["payment_days"]
    support_hours = terms["support_hours"]

    # No measurable price room when the opening is at or below the target.
    span = opponent_opening - BUYER_PRICE_TARGET
    if span > 0:
        price_value = max(0.0, min(1.0, (opponent_opening - final_price) / span))
    else:
        price_value = 0.0

    payment_score = max(0.0, min(1.0, (90 - payment_days) / (90 - 30)))
    support_score = max(0.0, min(1.0, (support_hours - 80) / (200 - 80)))

    if final_price >= opponent_opening:
        # No price improvement: the non-price issues are heavily discounted.
        value = 0.25 * (0.35 * payment_score + 0.25 * support_score)
    else:
        value = 0.40 * price_value + 0.35 * payment_score + 0.25 * support_score

    # Unusable round counts are treated as "whole budget used".
    try:
        rounds = float(rounds_taken)
        limit = float(max_rounds)
    except (TypeError, ValueError, OverflowError):
        rounds, limit = 1.0, 1.0
    progress = rounds / limit if limit > 0 else 1.0

    efficiency = max(0.1, 1.0 - progress * 0.25)

    pattern_penalty = 0.10 if consecutive_concessions_flag else 0.0

    raw = (value * efficiency) - pattern_penalty
    return round(max(SURVIVAL_FLOOR, raw), 4)
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def grade(
    task_id: str,
    final_terms: Dict,
    deal_reached: bool,
    rounds_taken: int,
    opponent_opening: Optional[float] = None,
    **kwargs,
) -> float:
    """Dispatch to the grader that matches ``task_id``.

    Args:
        task_id: ``"single_issue"``, ``"multi_issue"`` or ``"adversarial"``.
        final_terms: Agreed terms, forwarded unchanged.
        deal_reached: Whether the negotiation ended in a deal.
        rounds_taken: Number of rounds used.
        opponent_opening: The supplier's opening price. When omitted, each
            grader applies its own default (52000 for the first two tasks,
            100000 for ``adversarial``). A single shared 52000 default would
            sit below the adversarial price target of 80000 and discard all
            price value.
        **kwargs: Only ``consecutive_concessions_flag`` is read, and only for
            the ``adversarial`` task.

    Returns:
        The selected grader's score, or 0.0 for an unknown ``task_id``.
    """
    # Forward the opening price only when the caller supplied one, so each
    # grader's own default stays in effect otherwise.
    opening = {} if opponent_opening is None else {"opponent_opening": opponent_opening}

    if task_id == "single_issue":
        return grade_single_issue(final_terms, deal_reached, rounds_taken, **opening)
    if task_id == "multi_issue":
        return grade_multi_issue(final_terms, deal_reached, rounds_taken, **opening)
    if task_id == "adversarial":
        return grade_adversarial(
            final_terms,
            deal_reached,
            rounds_taken,
            kwargs.get("consecutive_concessions_flag", False),
            **opening,
        )
    return 0.0
|
inference.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 3 |
+
# All rights reserved.
|
| 4 |
+
#
|
| 5 |
+
# This source code is licensed under the BSD-style license found in the
|
| 6 |
+
# LICENSE file in the root directory of this source tree.
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
Baseline inference script for ProcureRL.
|
| 10 |
+
|
| 11 |
+
Runs an LLM agent against the procurement negotiation environment
|
| 12 |
+
and outputs results in exact [START][STEP][END] format.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import os
|
| 16 |
+
import sys
|
| 17 |
+
import json
|
| 18 |
+
|
| 19 |
+
# Runtime configuration, read from the environment once at import time.
# HF_TOKEN takes precedence over API_KEY when both are set.
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
# Value printed as `env=` in the [START] line.
BENCHMARK = "procure-rl"
# Upper bound on agent steps per task in run_task.
MAX_STEPS = 10

# NOTE(review): this runs on import, before the API_KEY check in the
# __main__ guard, so a client construction failure exits the process here.
try:
    from openai import OpenAI

    client = OpenAI(api_key=API_KEY, base_url=API_BASE_URL)
except Exception as e:
    print(f"[ERROR] Failed to initialize OpenAI client: {e}")
    sys.exit(1)
|
| 32 |
+
|
| 33 |
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
| 34 |
+
from server.Procure_RL_environment import ProcureRLEnvironment
|
| 35 |
+
from models import NegotiationAction
|
| 36 |
+
|
| 37 |
+
TASKS = ["single_issue", "multi_issue", "adversarial"]
|
| 38 |
+
|
| 39 |
+
SYSTEM_PROMPT = """You are a professional procurement negotiator. Your goal is to negotiate the best possible deal for your company.
|
| 40 |
+
|
| 41 |
+
You will receive a supplier's message and current offer terms. You must respond with a JSON action in this exact format:
|
| 42 |
+
{
|
| 43 |
+
"move_type": "make_offer",
|
| 44 |
+
"terms": {"price": 42000, "payment_days": 45},
|
| 45 |
+
"message": "Your natural language response to the supplier"
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
move_type must be one of: make_offer, accept, reject, bundle
|
| 49 |
+
terms must include price and any other issues being negotiated.
|
| 50 |
+
message should be professional and collaborative when possible.
|
| 51 |
+
|
| 52 |
+
Your buyer constraints will be provided. Do not exceed your budget. Try to reach the target price."""
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def get_agent_action(obs_dict: dict) -> dict:
    """Ask the LLM for the next negotiation action.

    Args:
        obs_dict: Observation fields as produced by ``obs_to_dict``.

    Returns:
        The JSON object parsed from the model reply. When the API call fails
        or the reply contains no parseable JSON object, a ``make_offer``
        action that repeats the current offer is returned instead, so the
        episode can continue.
    """
    task_id = obs_dict.get("task_id", "single_issue")
    supplier_msg = obs_dict.get("supplier_message", "")
    current_offer = obs_dict.get("current_offer", {})
    constraints = obs_dict.get("buyer_constraints", {})
    rapport_hint = obs_dict.get("rapport_hint", "neutral")
    round_num = obs_dict.get("round_number", 0)
    max_rounds = obs_dict.get("max_rounds", 10)

    def fallback(message: str) -> dict:
        # Single definition of the "repeat the current offer" action used by
        # every failure path below.
        return {
            "move_type": "make_offer",
            "terms": current_offer,
            "message": message,
        }

    user_content = f"""Task: {task_id}
Round: {round_num}/{max_rounds}
Supplier says: "{supplier_msg}"
Current offer on table: {json.dumps(current_offer)}
Your constraints: {json.dumps(constraints)}
Relationship rapport: {rapport_hint}

Respond with your negotiation action as JSON."""

    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_content},
            ],
            max_tokens=300,
            temperature=0.3,
        )
        # The reply content may be None; treat that as an empty reply.
        content = (response.choices[0].message.content or "").strip()
    except Exception as e:
        # Deliberate best-effort boundary around the remote call: any failure
        # degrades to a neutral action rather than aborting the task.
        return fallback(f"Error: {str(e)}")

    # Take the outermost {...} span so prose around the JSON is tolerated.
    start = content.find("{")
    end = content.rfind("}") + 1
    if start < 0 or end <= start:
        return fallback(
            content[:200] if content else "I'd like to continue our discussion."
        )

    try:
        action_dict = json.loads(content[start:end])
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return fallback("I'd like to continue our discussion.")

    if not isinstance(action_dict, dict):
        return fallback("I'd like to continue our discussion.")

    return action_dict
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def obs_to_dict(obs) -> dict:
    """Copy the observation attributes the agent prompt needs into a dict."""
    exported = (
        "task_id",
        "round_number",
        "max_rounds",
        "supplier_message",
        "current_offer",
        "buyer_constraints",
        "rapport_hint",
        "done",
    )
    return {field: getattr(obs, field) for field in exported}
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def run_task(task_id: str) -> dict:
    """Run one negotiation episode and print it in [START]/[STEP]/[END] format.

    Args:
        task_id: Task to reset the environment with.

    Returns:
        A dict with the task id, the final score and the number of steps taken.
    """
    env = ProcureRLEnvironment()
    obs = env.reset(task_id=task_id, seed=42)
    obs_dict = obs_to_dict(obs)

    print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}")

    rewards = []
    step = 0
    final_score = 0.0

    while step < MAX_STEPS:
        step += 1

        action_dict = get_agent_action(obs_dict)
        try:
            action = NegotiationAction(
                move_type=action_dict.get("move_type", "make_offer"),
                terms=action_dict.get("terms", {}),
                message=action_dict.get("message", ""),
            )
        except ValueError:
            # The model reply is untrusted: an unknown move_type or badly
            # typed terms/message makes NegotiationAction raise ValueError.
            # Repeat the current offer instead of aborting the whole task.
            action = NegotiationAction(
                move_type="make_offer",
                terms=obs_dict.get("current_offer") or {},
                message="I'd like to continue our discussion.",
            )

        obs = env.step(action)
        rewards.append(obs.reward if obs.reward is not None else 0.0)

        action_str = f"{action.move_type}({json.dumps(action.terms)})"
        error = obs.metadata.get("error", None) if obs.metadata else None

        print(
            f"[STEP] step={step} action={action_str} reward={obs.reward if obs.reward else 0.0:.2f} done={str(obs.done).lower()} error={error if error else 'null'}"
        )

        if obs.done:
            # Prefer the terminal reward; when it is missing or non-positive,
            # fall back to the best reward seen during the episode.
            final_score = (
                obs.reward
                if obs.reward is not None and obs.reward > 0
                else (max(rewards) if rewards else 0.0)
            )
            break

        obs_dict = obs_to_dict(obs)

    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    success = final_score > 0.1

    print(
        f"[END] success={str(success).lower()} steps={step} score={final_score:.2f} rewards={rewards_str}"
    )

    return {"task": task_id, "score": final_score, "steps": step}
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
if __name__ == "__main__":
    # Refuse to run without credentials for the LLM endpoint.
    if not API_KEY:
        print("[ERROR] HF_TOKEN or API_KEY environment variable not set")
        sys.exit(1)

    results = []
    for task in TASKS:
        # One failing task must not prevent the remaining tasks from running;
        # it is recorded with a zero score and the error text.
        try:
            result = run_task(task)
            results.append(result)
        except Exception as e:
            print(f"[ERROR] Task {task} failed: {e}")
            results.append({"task": task, "score": 0.0, "steps": 0, "error": str(e)})

    # Per-task summary printed after all episodes have run.
    print(f"\nBaseline Results:")
    for r in results:
        task = r["task"]
        score = r["score"]
        print(f"  {task}: {score:.3f}")
|
models.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
Data models for the ProcureRL Environment.
|
| 9 |
+
|
| 10 |
+
The ProcureRL environment is a procurement negotiation RL environment where
|
| 11 |
+
an LLM agent learns to negotiate against scripted supplier opponents.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from typing import Optional, List, Dict, Any
|
| 15 |
+
from pydantic import BaseModel, Field, ConfigDict
|
| 16 |
+
|
| 17 |
+
try:
|
| 18 |
+
from openenv.core.env_server.types import Action, Observation, State as OpenEnvState
|
| 19 |
+
except ImportError:
|
| 20 |
+
OpenEnvState = object
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class NegotiationAction(BaseModel):
    """One buyer move: a move type, the proposed terms and a message."""

    model_config = ConfigDict(extra="allow")

    move_type: str = Field(
        default="make_offer",
        description="Choose action: make_offer (propose), accept (take current deal), reject (walk away), bundle (multi-issue offer)",
    )
    terms: Dict[str, Any] = Field(
        default_factory=lambda: {"price": 45000},
        description='For single_issue: {"price": 45000}. For multi_issue: {"price": 45000, "payment_days": 30}. For adversarial: add "support_hours": 100',
    )
    message: str = Field(
        default="I value our partnership and believe we can reach a fair agreement together.",
        description="Write a collaborative message. Use: partnership, mutual, flexible, understand, solution. Avoid: demand, final offer, ultimatum",
    )

    def model_post_init(self, *args, **kwargs):
        """Reject any move type outside the four supported moves."""
        allowed = ("make_offer", "accept", "reject", "bundle")
        if self.move_type in allowed:
            return
        raise ValueError(
            f"Invalid move_type: {self.move_type}. Must be one of {allowed}"
        )
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class NegotiationObservation(BaseModel):
    """What the buyer agent sees after a reset or a step."""

    # Unknown fields are kept rather than rejected.
    model_config = ConfigDict(extra="allow")

    # Identifier of the task being played.
    task_id: str = ""
    # Current round and the round budget for the task.
    round_number: int = 0
    max_rounds: int = 0
    # Supplier's natural-language reply.
    supplier_message: str = ""
    # Terms currently on the table.
    current_offer: Dict[str, Any] = Field(default_factory=dict)
    # Recent exchange history — presumably capped at four entries; confirm
    # against the environment.
    last_4_exchanges: List[Dict] = Field(default_factory=list)
    # Buyer-side limits shown to the agent.
    buyer_constraints: Dict[str, Any] = Field(default_factory=dict)
    # Coarse rapport signal; defaults to "neutral".
    rapport_hint: str = "neutral"
    # Episode-termination flag and the reward attached to this observation.
    done: bool = False
    reward: Optional[float] = None
    # Free-form extras; the inference script reads an "error" key from it.
    metadata: Dict[str, Any] = Field(default_factory=dict)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class NegotiationState(BaseModel):
    """Server-side episode state, readable both as attributes and dict-style."""

    # Unknown fields are kept, and assignments are validated.
    model_config = ConfigDict(extra="allow", validate_assignment=True)

    task_id: str = ""
    episode_id: str = ""
    round_number: int = 0
    step_count: int = 0  # Required by OpenEnv web interface
    rapport_score: float = 0.5
    consecutive_concessions: int = 0
    deal_reached: bool = False
    final_terms: Optional[Dict] = None
    cumulative_reward: float = 0.0

    def __getitem__(self, key):
        """Return the field named ``key``; raise ``KeyError`` when absent.

        Subscript access follows the mapping convention, so callers can use
        ``try: state[k] / except KeyError`` as they would with a dict.
        """
        try:
            return getattr(self, key)
        except AttributeError:
            raise KeyError(key) from None

    def get(self, key, default=None):
        """Return the field named ``key``, or ``default`` when it is absent."""
        return getattr(self, key, default)
|
openenv.yaml
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: procure-rl
|
| 3 |
+
version: "1.0.0"
|
| 4 |
+
type: space
|
| 5 |
+
runtime: fastapi
|
| 6 |
+
app: server.app:app
|
| 7 |
+
port: 7860
|
| 8 |
+
description: "LLM agent learns procurement negotiation strategy against scripted supplier opponents with hidden utility functions"
|
| 9 |
+
author: "procure-rl"
|
| 10 |
+
tags:
|
| 11 |
+
- openenv
|
| 12 |
+
- negotiation
|
| 13 |
+
- procurement
|
| 14 |
+
- real-world
|
| 15 |
+
- rl
|
| 16 |
+
tasks:
|
| 17 |
+
- id: single_issue
|
| 18 |
+
description: "Negotiate software license price with cooperative supplier"
|
| 19 |
+
difficulty: easy
|
| 20 |
+
max_steps: 6
|
| 21 |
+
reward_range: [0.0, 1.0]
|
| 22 |
+
- id: multi_issue
|
| 23 |
+
description: "Negotiate price and payment terms with cash-flow-sensitive supplier"
|
| 24 |
+
difficulty: medium
|
| 25 |
+
max_steps: 8
|
| 26 |
+
reward_range: [0.0, 1.0]
|
| 27 |
+
- id: adversarial
|
| 28 |
+
description: "Negotiate multiple issues against aggressive anchoring supplier"
|
| 29 |
+
difficulty: hard
|
| 30 |
+
max_steps: 10
|
| 31 |
+
reward_range: [0.0, 1.0]
|
| 32 |
+
reward_range: [0.0, 1.0]
|
| 33 |
+
observation_space:
|
| 34 |
+
type: object
|
| 35 |
+
description: "Natural language supplier message with structured negotiation state and rapport signal"
|
| 36 |
+
action_space:
|
| 37 |
+
type: object
|
| 38 |
+
description: "Negotiation move type, structured terms, and natural language message"
|
openenv_Procure_RL.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: openenv-Procure_RL
|
| 3 |
+
Version: 0.1.0
|
| 4 |
+
Summary: Procure Rl environment for OpenEnv
|
| 5 |
+
Requires-Python: >=3.10
|
| 6 |
+
Requires-Dist: openenv-core[core]>=0.2.2
|
| 7 |
+
Provides-Extra: dev
|
| 8 |
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
| 9 |
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
openenv_Procure_RL.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
README.md
|
| 2 |
+
pyproject.toml
|
| 3 |
+
./__init__.py
|
| 4 |
+
./client.py
|
| 5 |
+
./models.py
|
| 6 |
+
openenv_Procure_RL.egg-info/PKG-INFO
|
| 7 |
+
openenv_Procure_RL.egg-info/SOURCES.txt
|
| 8 |
+
openenv_Procure_RL.egg-info/dependency_links.txt
|
| 9 |
+
openenv_Procure_RL.egg-info/entry_points.txt
|
| 10 |
+
openenv_Procure_RL.egg-info/requires.txt
|
| 11 |
+
openenv_Procure_RL.egg-info/top_level.txt
|
| 12 |
+
server/Procure_RL_environment.py
|
| 13 |
+
server/__init__.py
|
| 14 |
+
server/app.py
|
openenv_Procure_RL.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
openenv_Procure_RL.egg-info/entry_points.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[console_scripts]
|
| 2 |
+
server = Procure_RL.server.app:main
|
openenv_Procure_RL.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core[core]>=0.2.2
|
| 2 |
+
|
| 3 |
+
[dev]
|
| 4 |
+
pytest>=8.0.0
|
| 5 |
+
pytest-cov>=4.0.0
|
openenv_Procure_RL.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Procure_RL
|
opponent.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
Scripted persona opponent for procurement negotiation.
|
| 9 |
+
|
| 10 |
+
The opponent's behavior is deterministic given a seed AND sensitive to
|
| 11 |
+
the agent's language quality via the rapport system.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import random
|
| 15 |
+
from dataclasses import dataclass, field
|
| 16 |
+
from typing import Dict, Tuple
|
| 17 |
+
|
| 18 |
+
# Words and phrases that mark a cooperative tone in the agent's message.
# The opponent looks for them in the lower-cased message text.
COLLABORATIVE_SIGNALS = [
    "understand", "partnership", "mutual", "together", "value",
    "appreciate", "flexible", "work with", "long-term", "relationship",
    "reasonable", "fair", "both", "solution",
]
|
| 34 |
+
|
| 35 |
+
# Words and phrases that mark a confrontational tone in the agent's message.
# The opponent looks for them in the lower-cased message text.
AGGRESSIVE_SIGNALS = [
    "demand", "require", "final offer", "unacceptable", "must",
    "non-negotiable", "take it or leave", "bottom line", "ultimatum",
    "insist", "refuse", "absolutely not",
]
|
| 49 |
+
|
| 50 |
+
# Message templates for each supplier persona.  "opening", "counter",
# "near_close" and "hardening" hold lists of alternatives; "accept" and
# "reject" are single strings.  The ${target}, ${counter}, ${close} and
# ${floor} markers are placeholders for a formatted dollar amount.
_COOPERATIVE_TEMPLATES = {
    "opening": [
        "Thanks for reaching out. Our standard pricing for this package is ${target}. Happy to discuss.",
        "We value your interest. We're pricing this at ${target} based on current market rates.",
    ],
    "counter": [
        "I appreciate you working with us. Based on our costs, ${counter} is where we can be.",
        "Thank you for your offer. We can move to ${counter} given our margin requirements.",
    ],
    "near_close": [
        "I think we're close. If you can do ${close}, I can get this approved today.",
        "We're almost there. ${close} works for our team. Shall we finalize?",
    ],
    "accept": "That works for us. Let's move forward at those terms.",
    "reject": "That's below what we can accept, but we want to make this work.",
}

_CASH_FLOW_STRESSED_TEMPLATES = {
    "opening": [
        "Our pricing is ${target}. I should mention — payment timing is particularly important to us this quarter.",
        "We're at ${target}. Between us, our finance team has specific requirements around cash flow timing.",
    ],
    "counter": [
        "We can move on price if payment terms work for you. ${counter} with your payment preference?",
        "Price flexibility depends on receivables timing for us. ${counter} if we can discuss payment terms.",
    ],
    "near_close": [
        "If you can do Net-30 on payment, we can get to ${close} on price.",
        "Payment timing is our real constraint. ${close} with faster payment terms?",
    ],
    "accept": "Agreed. The payment structure works for our cash flow needs.",
    "reject": "The price is tight but we could explore it if payment terms align.",
}

_AGGRESSIVE_ANCHOR_TEMPLATES = {
    "opening": [
        "Our price is ${target}. This reflects our full service quality and market position.",
        "We're firm at ${target}. This is based on our cost structure and service level.",
    ],
    "counter": [
        "We can go to ${counter}. That's already a significant concession from our position.",
        "${counter} is our revised position. We're not in a position to move much further.",
    ],
    "hardening": [
        "We've already moved considerably. ${floor} is our absolute position.",
        "I need to be direct — we're at ${floor} and that's where we'll stay.",
    ],
    "near_close": [
        "Final position: ${close}. We need a decision today.",
        "${close} is where we are. This is our best and final offer.",
    ],
    "accept": "Accepted.",
    "reject": "That doesn't work. Come back with a serious offer.",
}

PERSONA_TEMPLATES = {
    "cooperative": _COOPERATIVE_TEMPLATES,
    "cash_flow_stressed": _CASH_FLOW_STRESSED_TEMPLATES,
    "aggressive_anchor": _AGGRESSIVE_ANCHOR_TEMPLATES,
}
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
class ScriptedPersonaOpponent:
    """Seeded, scripted supplier that counters the agent's price offers.

    The price floor and opening price are drawn from ``random.Random(seed)``,
    so two opponents built with the same arguments sample the same values.
    The size of each concession scales with a rapport score in [0.0, 1.0]
    that moves with the wording of every agent message.
    """

    # task_id -> ((floor_low, floor_high), (markup_low, markup_high)).
    # The opening price is the sampled floor times the sampled markup.
    _PRICE_RANGES = {
        "single_issue": ((42000, 46000), (1.28, 1.38)),
        "multi_issue": ((40000, 46000), (1.25, 1.35)),
        "adversarial": ((85000, 95000), (1.30, 1.40)),
    }

    # Round budget per task; used to decide when "near_close" wording starts.
    _MAX_ROUNDS = {"single_issue": 6, "multi_issue": 8, "adversarial": 10}

    # Fraction of the current position conceded per round at neutral rapport.
    _BASE_CONCESSION_RATES = {
        "cooperative": 0.05,
        "cash_flow_stressed": 0.07,
        "aggressive_anchor": 0.04,
    }

    # Placeholders that all render as the seller's current quoted price.
    _PRICE_PLACEHOLDERS = ("${counter}", "${floor}", "${close}")

    def __init__(self, task_id: str, seed: int, persona: str):
        """Sample the reservation values for ``task_id`` and reset the state.

        Raises:
            ValueError: if ``task_id`` is not a known task.
            KeyError: if ``persona`` has no entry in ``PERSONA_TEMPLATES``.
        """
        # Reject unknown tasks up front; otherwise no price would be sampled
        # and the failure would only show up later as an AttributeError.
        if task_id not in self._PRICE_RANGES:
            raise ValueError(
                f"Unknown task_id {task_id!r}; expected one of "
                f"{sorted(self._PRICE_RANGES)}"
            )

        self.rng = random.Random(seed)
        self.task_id = task_id
        self.persona = persona
        self.templates = PERSONA_TEMPLATES[persona]

        # Draw order (floor, markup, payment preference) is kept fixed so a
        # given seed always reproduces the same values.
        floor_range, markup_range = self._PRICE_RANGES[task_id]
        self.price_floor = self.rng.uniform(*floor_range)
        self.price_target = self.price_floor * self.rng.uniform(*markup_range)
        if task_id == "multi_issue":
            self.payment_preference = self.rng.choice([30, 45, 60])

        self.rapport = 0.5
        self.concession_count = 0
        self.current_position = self.price_target

    @staticmethod
    def _has_signal(text: str, signal: str) -> bool:
        """Return True if ``signal`` occurs in ``text`` at the start of a word.

        A plain substring test would count "unfair" as "fair" and
        "unreasonable" as "reasonable".  Only the left edge is checked, so
        inflected forms such as "appreciated" or "demands" still match.
        """
        start = text.find(signal)
        while start != -1:
            if start == 0 or not text[start - 1].isalnum():
                return True
            start = text.find(signal, start + 1)
        return False

    @staticmethod
    def _render_message(template: str, position: float) -> str:
        """Fill every price placeholder in ``template`` with ``position``.

        ``${floor}`` is rendered with the quoted position as well, never with
        the sampled price floor, so the message neither reveals the seller's
        reservation price nor disagrees with the counter terms.
        """
        price_text = f"${position:,.0f}"
        for placeholder in ScriptedPersonaOpponent._PRICE_PLACEHOLDERS:
            template = template.replace(placeholder, price_text)
        return template

    def update_rapport(self, agent_message: str) -> None:
        """Shift ``self.rapport`` according to the tone of ``agent_message``.

        Each distinct collaborative signal adds 0.08 and each distinct
        aggressive signal subtracts 0.08.  The net change is capped at
        +/-0.20 per message and the rapport is clamped to [0.0, 1.0].
        """
        msg_lower = agent_message.lower()
        delta = 0.0
        delta += sum(
            0.08 for w in COLLABORATIVE_SIGNALS if self._has_signal(msg_lower, w)
        )
        delta -= sum(
            0.08 for w in AGGRESSIVE_SIGNALS if self._has_signal(msg_lower, w)
        )
        delta = max(-0.20, min(0.20, delta))
        self.rapport = max(0.0, min(1.0, self.rapport + delta))

    def get_concession_rate(self) -> float:
        """Return the fraction of the current position to concede this round.

        The persona's base rate is scaled by ``1 + (rapport - 0.5)`` and
        floored at 0.01.
        """
        base = self._BASE_CONCESSION_RATES[self.persona]
        modifier = (self.rapport - 0.5) * base
        return max(0.01, base + modifier)

    def respond(
        self,
        agent_message: str,
        agent_terms: Dict,
        round_number: int,
        consecutive_concessions: int,
    ) -> Tuple[str, Dict]:
        """Answer one agent move with a message and counter terms.

        Args:
            agent_message: Free text from the agent; it updates the rapport.
            agent_terms: Terms proposed by the agent; ``"price"`` and, for
                the cash-flow persona, ``"payment_days"`` are read.
            round_number: Current round; acceptance requires round >= 2.
            consecutive_concessions: Count supplied by the caller; presumably
                the agent's consecutive conceding rounds. At 2 or more the
                aggressive persona hardens.

        Returns:
            ``(message, terms)``.  On acceptance ``terms`` is a copy of
            ``agent_terms`` with ``"_accepted": True``; otherwise it is a copy
            with ``"price"`` replaced by the seller's new position.
        """
        self.update_rapport(agent_message)
        self.concession_count += 1

        # A missing or None price is treated as 0, i.e. never acceptable.
        agent_price = agent_terms.get("price") or 0

        if (
            round_number >= 2
            and agent_price >= self.price_floor
            and self._acceptance_condition(agent_terms)
        ):
            return self.templates["accept"], {**agent_terms, "_accepted": True}

        concession = self.get_concession_rate()

        if self.persona == "aggressive_anchor" and consecutive_concessions >= 2:
            # The aggressive persona concedes less against a repeat conceder.
            concession = concession * 0.4
            template_key = "hardening"
        elif round_number >= self._max_rounds() * 0.7:
            template_key = "near_close"
        else:
            template_key = "counter"

        # The quoted position never drops below the price floor.
        new_position = self.current_position * (1 - concession)
        new_position = max(self.price_floor, new_position)
        self.current_position = new_position

        templates_for_key = self.templates.get(template_key, self.templates["counter"])
        template = self.rng.choice(templates_for_key)
        message = self._render_message(template, new_position)

        counter_terms = dict(agent_terms)
        counter_terms["price"] = round(new_position, 2)

        if self.persona == "cash_flow_stressed" and "payment_days" in agent_terms:
            if agent_terms["payment_days"] > 60:
                message += (
                    " Though I'll need to flag the payment timing to our finance team."
                )

        return message, counter_terms

    def _acceptance_condition(self, terms: Dict) -> bool:
        """Return True if the non-price terms are acceptable to this persona.

        Only the cash-flow persona adds a constraint: ``payment_days`` must be
        45 or less, and a missing value counts as 60.
        """
        if self.persona == "cash_flow_stressed":
            payment_ok = terms.get("payment_days", 60) <= 45
            return payment_ok
        return True

    def _max_rounds(self) -> int:
        """Return the round budget for this opponent's task."""
        return self._MAX_ROUNDS[self.task_id]

    def get_opening_message(self) -> Tuple[str, Dict]:
        """Return the opening message and terms at the sampled target price.

        ``payment_days`` (90) is included for the multi-issue and adversarial
        tasks and ``support_hours`` (80) for the adversarial task only.
        """
        template = self.rng.choice(self.templates["opening"])
        message = template.replace("${target}", f"${self.price_target:,.0f}")
        terms = {"price": round(self.price_target, 2)}
        if self.task_id in ["multi_issue", "adversarial"]:
            terms["payment_days"] = 90
        if self.task_id == "adversarial":
            terms["support_hours"] = 80
        return message, terms
|
plan.md
ADDED
|
@@ -0,0 +1,1228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# THE DEFINITIVE FINAL DESIGN
|
| 2 |
+
|
| 3 |
+
## Core Mechanic: Language-Sensitive Scripted Opponent
|
| 4 |
+
|
| 5 |
+
This is the one thing that makes everything work. The opponent's behavior is deterministic given a seed AND sensitive to the agent's language quality.
|
| 6 |
+
|
| 7 |
+
```python
|
| 8 |
+
# Deterministic keyword detection — pure Python
|
| 9 |
+
COLLABORATIVE_SIGNALS = [
    "understand", "partnership", "mutual", "together", "value",
    "appreciate", "flexible", "work with", "long-term", "relationship"
]

AGGRESSIVE_SIGNALS = [
    "demand", "require", "final offer", "unacceptable", "must",
    "non-negotiable", "take it or leave", "bottom line", "ultimatum"
]

def update_rapport(current_rapport: float, agent_message: str) -> float:
    """Return the rapport score after reading one agent message.

    Each distinct collaborative signal adds 0.08 and each distinct aggressive
    signal subtracts 0.08.  The net change is capped at +/-0.20 per round and
    the result is clamped to [0.0, 1.0].
    """
    msg_lower = agent_message.lower()

    def has_signal(signal: str) -> bool:
        # A signal must start at a word boundary: a plain substring test
        # would count "undervalue" as "value".  Inflected forms such as
        # "appreciated" still match.
        start = msg_lower.find(signal)
        while start != -1:
            if start == 0 or not msg_lower[start - 1].isalnum():
                return True
            start = msg_lower.find(signal, start + 1)
        return False

    delta = 0.0
    delta += sum(0.08 for w in COLLABORATIVE_SIGNALS if has_signal(w))
    delta -= sum(0.08 for w in AGGRESSIVE_SIGNALS if has_signal(w))
    delta = max(-0.20, min(0.20, delta))  # cap per-round change
    return max(0.0, min(1.0, current_rapport + delta))
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
The rapport score directly modifies the opponent's concession rate:
|
| 29 |
+
|
| 30 |
+
- Rapport 0.8: opponent concedes ~9.1% per round (7% base × 1.3)
|
| 31 |
+
- Rapport 0.5: opponent concedes 7% per round (neutral)
|
| 32 |
+
- Rapport 0.2: opponent concedes ~4.9% per round (7% base × 0.7, hardened)
|
| 33 |
+
|
| 34 |
+
A heuristic agent that sends no message, or that uses aggressive language, faces a neutral or hostile opponent. An LLM that learns collaborative framing faces a more cooperative one. This is the LLM advantage.
|
| 35 |
+
|
| 36 |
+
## The Three Tasks — Final, Locked
|
| 37 |
+
|
| 38 |
+
### Task 1: `single_issue` (Easy)
|
| 39 |
+
|
| 40 |
+
**Scenario:** Renew software license. Price only.
|
| 41 |
+
|
| 42 |
+
```
|
| 43 |
+
Buyer target: $38,000
|
| 44 |
+
Seller opens: $52,000
|
| 45 |
+
Seller floor: $44,000
|
| 46 |
+
Pareto optimal: $43,000
|
| 47 |
+
Max rounds: 6
|
| 48 |
+
Persona: Cooperative (concedes 10% baseline, rapport-sensitive)
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
**Calibration:** A base LLM that simply offers reasonable prices without collaborative language scores ~0.38. A base LLM that naturally uses professional language scores ~0.52. Scores above 0.75 require learning to time concessions correctly.
|
| 52 |
+
|
| 53 |
+
**Grader:**
|
| 54 |
+
|
| 55 |
+
```python
|
| 56 |
+
def grade_single_issue(final_price, deal_reached, rounds_taken):
    """Score a finished single-issue negotiation on a 0.0-1.0 scale.

    No deal scores 0.0.  Otherwise the score is a price term (1.0 at 38000 or
    below, falling linearly to 0.0 at 44000 or above) multiplied by a
    round-count term (1.0 - 0.4 * (rounds_taken / 6) ** 1.5, floored at 0.0),
    rounded to four decimals.

    NOTE(review): the price term is 0.0 for every price >= 44000 — confirm
    this reference point against the seller floor used for the task.
    """
    if deal_reached:
        # How far the agreed price sits below the 44000 reference.
        price_term = max(0.0, min(1.0, (44000 - final_price) / (44000 - 38000)))
        # Late rounds are penalised more than proportionally.
        round_term = max(0.0, 1.0 - (rounds_taken / 6) ** 1.5 * 0.4)
        return round(price_term * round_term, 4)
    return 0.0
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
### Task 2: `multi_issue` (Medium)
|
| 72 |
+
|
| 73 |
+
**Scenario:** Enterprise software. Price + payment terms.
|
| 74 |
+
|
| 75 |
+
```
|
| 76 |
+
Issues: price ($40K-$58K) + payment_days (30-90)
|
| 77 |
+
Seller persona: Cash Flow Stressed
|
| 78 |
+
→ price_weight: 0.35 (somewhat cares)
|
| 79 |
+
→ payment_weight: 0.65 (cares much more)
|
| 80 |
+
Buyer weights: price 0.70, payment 0.30
|
| 81 |
+
Pareto insight: buyer should offer Net-30 to get lower price
|
| 82 |
+
Max rounds: 8
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
**Why medium:** Base LLM treats both issues equally, misses the trade opportunity. Score ~0.25. LLM that discovers seller cares about payment can bundle correctly. Score ~0.50.
|
| 86 |
+
|
| 87 |
+
**Grader:**
|
| 88 |
+
|
| 89 |
+
```python
|
| 90 |
+
def grade_multi_issue(final_terms, deal_reached, rounds_taken):
    """Score a finished price-plus-payment negotiation.

    No deal scores 0.0.  Otherwise price (58000 -> 0.0, 40000 -> 1.0) and
    payment days (90 -> 0.0, 30 -> 1.0) are each clamped to [0.0, 1.0],
    combined 70/30, and discounted by up to 30% as rounds_taken approaches 8.
    The result is rounded to four decimals.
    """
    if not deal_reached:
        return 0.0

    # Raw buyer-side scores, price first and payment second.
    raw_scores = (
        (58000 - final_terms['price']) / (58000 - 40000),
        (90 - final_terms['payment_days']) / (90 - 30),
    )
    price_score, payment_score = [max(0.0, min(1.0, raw)) for raw in raw_scores]

    buyer_value = 0.70 * price_score + 0.30 * payment_score
    round_discount = 1.0 - (rounds_taken / 8) * 0.30

    return round(buyer_value * round_discount, 4)
|
| 105 |
+
```
|
| 106 |
+
|
| 107 |
+
### Task 3: `adversarial` (Hard)
|
| 108 |
+
|
| 109 |
+
**Scenario:** Large contract. Price + payment + support hours.
|
| 110 |
+
|
| 111 |
+
```
|
| 112 |
+
Issues: price + payment_days + support_hours
|
| 113 |
+
Seller persona: Aggressive Anchor
|
| 114 |
+
→ Opens at ceiling on all issues
|
| 115 |
+
→ Hardens position if agent makes consecutive concessions
|
| 116 |
+
→ Rapport-sensitive but requires consistent collaborative framing
|
| 117 |
+
Adaptation: if agent concedes 2+ rounds in a row, seller increases floor by 3%
|
| 118 |
+
Max rounds: 10
|
| 119 |
+
Survival floor: deal at any terms scores minimum 0.15
|
| 120 |
+
```
|
| 121 |
+
|
| 122 |
+
**Why hard:** Agent must resist anchoring, break consecutive concession patterns, maintain collaborative tone under pressure. Base LLM score ~0.15. Strong LLM ~0.40.
|
| 123 |
+
|
| 124 |
+
**Grader:**
|
| 125 |
+
|
| 126 |
+
```python
|
| 127 |
+
def grade_adversarial(final_terms, deal_reached, rounds_taken, consecutive_flag):
    """Score a finished three-issue negotiation against the aggressive seller.

    No deal scores 0.0; any deal scores at least 0.15.  Price (120000 -> 0.0,
    80000 -> 1.0), payment days (90 -> 0.0, 30 -> 1.0) and support hours
    (80 -> 0.0, 200 -> 1.0) are each clamped to [0.0, 1.0] and weighted
    0.40 / 0.35 / 0.25.  The value is discounted by up to 25% as rounds_taken
    approaches 10, then reduced by 0.1 when consecutive_flag is set.  The
    result is rounded to four decimals.
    """
    if not deal_reached:
        return 0.0

    # Survival floor — completing deal always scores at least 0.15
    floor = 0.15

    def clamp(score):
        return max(0.0, min(1.0, score))

    # Clamp each score where it is computed.  Rebinding a loop variable over
    # a list of the scores would leave the three names unchanged, letting an
    # out-of-range term push the weighted value below 0 or above 1.
    price_score = clamp((120000 - final_terms['price']) / (120000 - 80000))
    payment_score = clamp((90 - final_terms['payment_days']) / (90 - 30))
    support_score = clamp((final_terms['support_hours'] - 80) / (200 - 80))

    value = 0.40 * price_score + 0.35 * payment_score + 0.25 * support_score
    efficiency = 1.0 - (rounds_taken / 10) * 0.25

    # Penalty for consecutive concession pattern
    pattern_penalty = 0.1 if consecutive_flag else 0.0

    raw = (value * efficiency) - pattern_penalty
    return round(max(floor, raw), 4)
|
| 149 |
+
```
|
| 150 |
+
|
| 151 |
+
## Score Calibration Table
|
| 152 |
+
|
| 153 |
+
| Agent Type | single_issue | multi_issue | adversarial |
|
| 154 |
+
| --------------------------- | ------------ | ----------- | ----------- |
|
| 155 |
+
| Random/heuristic | 0.15–0.25 | 0.08–0.15 | 0.03–0.10 |
|
| 156 |
+
| Base LLM (no language) | 0.35–0.45 | 0.20–0.30 | 0.12–0.20 |
|
| 157 |
+
| Base LLM (natural language) | 0.48–0.58 | 0.28–0.38 | 0.18–0.28 |
|
| 158 |
+
| GRPO-trained LLM (goal) | 0.68–0.78 | 0.55–0.65 | 0.45–0.55 |
|
| 159 |
+
|
| 160 |
+
This gives clear score spread at every level. Phase 2 will show meaningful differentiation.
|
| 161 |
+
|
| 162 |
+
---
|
| 163 |
+
|
| 164 |
+
# THE CLAUDE CODE PROMPT
|
| 165 |
+
|
| 166 |
+
Paste this entire block into Claude Code:
|
| 167 |
+
|
| 168 |
+
---
|
| 169 |
+
|
| 170 |
+
**Build ProcureRL: A Procurement Negotiation RL Environment**
|
| 171 |
+
|
| 172 |
+
This is a complete OpenEnv-compliant environment. Build everything exactly as specified. No additions, no changes to the design.
|
| 173 |
+
|
| 174 |
+
---
|
| 175 |
+
|
| 176 |
+
**Project Structure:**
|
| 177 |
+
|
| 178 |
+
```
|
| 179 |
+
procure-rl/
|
| 180 |
+
├── procure_rl/
|
| 181 |
+
│ ├── __init__.py
|
| 182 |
+
│ ├── environment.py
|
| 183 |
+
│ ├── models.py
|
| 184 |
+
│ ├── opponent.py
|
| 185 |
+
│ ├── graders.py
|
| 186 |
+
│ └── scenarios.py
|
| 187 |
+
├── server/
|
| 188 |
+
│ └── app.py
|
| 189 |
+
├── inference.py
|
| 190 |
+
├── openenv.yaml
|
| 191 |
+
├── Dockerfile
|
| 192 |
+
├── requirements.txt
|
| 193 |
+
└── README.md
|
| 194 |
+
```
|
| 195 |
+
|
| 196 |
+
---
|
| 197 |
+
|
| 198 |
+
**models.py — exact dataclasses:**
|
| 199 |
+
|
| 200 |
+
```python
|
| 201 |
+
from dataclasses import dataclass, field
|
| 202 |
+
from typing import Optional, List, Dict, Any
|
| 203 |
+
|
| 204 |
+
try:
|
| 205 |
+
from openenv.core.env_server import Action, Observation, State
|
| 206 |
+
except ImportError:
|
| 207 |
+
Action = object
|
| 208 |
+
Observation = object
|
| 209 |
+
State = object
|
| 210 |
+
|
| 211 |
+
@dataclass
class NegotiationAction(Action):
    """One agent move: a move type, the proposed terms, and an optional message."""

    move_type: str  # make_offer | accept | reject | bundle
    terms: Dict[str, Any]  # {price: 44000, payment_days: 45, support_hours: 120}
    message: str = ""  # natural language — affects opponent rapport
|
| 216 |
+
|
| 217 |
+
@dataclass
class NegotiationObservation(Observation):
    """What the agent is shown for one round; every field is required."""

    task_id: str
    round_number: int
    max_rounds: int
    supplier_message: str
    current_offer: Dict[str, Any]
    last_4_exchanges: List[Dict]  # capped at 4 for token efficiency
    buyer_constraints: Dict[str, Any]  # buyer's targets and limits
    rapport_hint: str  # "positive" | "neutral" | "negative" — visible to agent
    done: bool
|
| 228 |
+
|
| 229 |
+
@dataclass
class NegotiationState(State):
    """Per-episode bookkeeping; every field has a default."""

    task_id: str = ""
    episode_id: str = ""
    round_number: int = 0
    rapport_score: float = 0.5  # starts at the midpoint of the 0.0-1.0 range
    consecutive_concessions: int = 0
    deal_reached: bool = False
    final_terms: Optional[Dict] = None  # None until terms are recorded
    cumulative_reward: float = 0.0
|
| 239 |
+
```
|
| 240 |
+
|
| 241 |
+
---
|
| 242 |
+
|
| 243 |
+
**opponent.py — ScriptedPersonaOpponent:**
|
| 244 |
+
|
| 245 |
+
```python
|
| 246 |
+
import random
|
| 247 |
+
from dataclasses import dataclass, field
|
| 248 |
+
from typing import Dict, Tuple
|
| 249 |
+
|
| 250 |
+
COLLABORATIVE_SIGNALS = [
|
| 251 |
+
"understand", "partnership", "mutual", "together", "value",
|
| 252 |
+
"appreciate", "flexible", "work with", "long-term", "relationship",
|
| 253 |
+
"reasonable", "fair", "both", "solution"
|
| 254 |
+
]
|
| 255 |
+
|
| 256 |
+
AGGRESSIVE_SIGNALS = [
|
| 257 |
+
"demand", "require", "final offer", "unacceptable", "must",
|
| 258 |
+
"non-negotiable", "take it or leave", "bottom line", "ultimatum",
|
| 259 |
+
"insist", "refuse", "absolutely not"
|
| 260 |
+
]
|
| 261 |
+
|
| 262 |
+
PERSONA_TEMPLATES = {
|
| 263 |
+
"cooperative": {
|
| 264 |
+
"opening": [
|
| 265 |
+
"Thanks for reaching out. Our standard pricing for this package is ${target}. Happy to discuss.",
|
| 266 |
+
"We value your interest. We're pricing this at ${target} based on current market rates.",
|
| 267 |
+
],
|
| 268 |
+
"counter": [
|
| 269 |
+
"I appreciate you working with us. Based on our costs, ${counter} is where we can be.",
|
| 270 |
+
"Thank you for your offer. We can move to ${counter} given our margin requirements.",
|
| 271 |
+
],
|
| 272 |
+
"near_close": [
|
| 273 |
+
"I think we're close. If you can do ${close}, I can get this approved today.",
|
| 274 |
+
"We're almost there. ${close} works for our team. Shall we finalize?"
|
| 275 |
+
],
|
| 276 |
+
"accept": "That works for us. Let's move forward at those terms.",
|
| 277 |
+
"reject": "That's below what we can accept, but we want to make this work."
|
| 278 |
+
},
|
| 279 |
+
"cash_flow_stressed": {
|
| 280 |
+
"opening": [
|
| 281 |
+
"Our pricing is ${target}. I should mention — payment timing is particularly important to us this quarter.",
|
| 282 |
+
"We're at ${target}. Between us, our finance team has specific requirements around cash flow timing.",
|
| 283 |
+
],
|
| 284 |
+
"counter": [
|
| 285 |
+
"We can move on price if payment terms work for you. ${counter} with your payment preference?",
|
| 286 |
+
"Price flexibility depends on receivables timing for us. ${counter} if we can discuss payment terms.",
|
| 287 |
+
],
|
| 288 |
+
"near_close": [
|
| 289 |
+
"If you can do Net-30 on payment, we can get to ${close} on price.",
|
| 290 |
+
"Payment timing is our real constraint. ${close} with faster payment terms?"
|
| 291 |
+
],
|
| 292 |
+
"accept": "Agreed. The payment structure works for our cash flow needs.",
|
| 293 |
+
"reject": "The price is tight but we could explore it if payment terms align."
|
| 294 |
+
},
|
| 295 |
+
"aggressive_anchor": {
|
| 296 |
+
"opening": [
|
| 297 |
+
"Our price is ${target}. This reflects our full service quality and market position.",
|
| 298 |
+
"We're firm at ${target}. This is based on our cost structure and service level.",
|
| 299 |
+
],
|
| 300 |
+
"counter": [
|
| 301 |
+
"We can go to ${counter}. That's already a significant concession from our position.",
|
| 302 |
+
"${counter} is our revised position. We're not in a position to move much further.",
|
| 303 |
+
],
|
| 304 |
+
"hardening": [
|
| 305 |
+
"We've already moved considerably. ${floor} is our absolute position.",
|
| 306 |
+
"I need to be direct — we're at ${floor} and that's where we'll stay.",
|
| 307 |
+
],
|
| 308 |
+
"near_close": [
|
| 309 |
+
"Final position: ${close}. We need a decision today.",
|
| 310 |
+
"${close} is where we are. This is our best and final offer."
|
| 311 |
+
],
|
| 312 |
+
"accept": "Accepted.",
|
| 313 |
+
"reject": "That doesn't work. Come back with a serious offer."
|
| 314 |
+
}
|
| 315 |
+
}
|
| 316 |
+
|
| 317 |
+
class ScriptedPersonaOpponent:
    """Seeded, rule-based supplier that the buyer agent negotiates against.

    The opponent samples a hidden price floor and an opening target from a
    seeded RNG, tracks a rapport score driven by keywords in the agent's
    messages, and concedes from its current position toward the floor at a
    persona-specific rate each time ``respond`` is called.

    Args:
        task_id: One of ``"single_issue"``, ``"multi_issue"``, ``"adversarial"``.
        seed: Seed for the private ``random.Random`` instance.
        persona: Key into ``PERSONA_TEMPLATES`` (also selects the base
            concession rate).

    Raises:
        KeyError: If ``persona`` has no entry in ``PERSONA_TEMPLATES``.
        ValueError: If ``task_id`` is not a known task.
    """

    def __init__(self, task_id: str, seed: int, persona: str):
        self.rng = random.Random(seed)
        self.task_id = task_id
        self.persona = persona
        self.templates = PERSONA_TEMPLATES[persona]

        # Sampled reservation values — never revealed to agent.
        # The order of rng calls is part of the seeded behaviour; keep it.
        if task_id == "single_issue":
            self.price_floor = self.rng.uniform(42000, 46000)
            self.price_target = self.price_floor * self.rng.uniform(1.28, 1.38)
        elif task_id == "multi_issue":
            self.price_floor = self.rng.uniform(40000, 46000)
            self.price_target = self.price_floor * self.rng.uniform(1.25, 1.35)
            self.payment_preference = self.rng.choice([30, 45, 60])  # preferred days
        elif task_id == "adversarial":
            self.price_floor = self.rng.uniform(85000, 95000)
            self.price_target = self.price_floor * self.rng.uniform(1.30, 1.40)
        else:
            # Fail fast instead of hitting an AttributeError on price_target below.
            raise ValueError(f"Unknown task: {task_id}")

        self.rapport = 0.5
        self.concession_count = 0
        self.current_position = self.price_target

    def update_rapport(self, agent_message: str) -> None:
        """Shift rapport by keyword hits in ``agent_message``.

        Each collaborative signal adds 0.08 and each aggressive signal
        subtracts 0.08; the per-message change is clamped to [-0.20, 0.20]
        and the resulting rapport to [0.0, 1.0].
        """
        msg_lower = agent_message.lower()
        delta = 0.0
        delta += sum(0.08 for w in COLLABORATIVE_SIGNALS if w in msg_lower)
        delta -= sum(0.08 for w in AGGRESSIVE_SIGNALS if w in msg_lower)
        delta = max(-0.20, min(0.20, delta))
        self.rapport = max(0.0, min(1.0, self.rapport + delta))

    def get_concession_rate(self) -> float:
        """Return the fractional price drop for the next counter-offer."""
        # Base rate by persona
        base_rates = {
            "cooperative": 0.10,
            "cash_flow_stressed": 0.07,
            "aggressive_anchor": 0.04,
        }
        base = base_rates[self.persona]
        # Rapport modifier: +/- 50% of base rate
        modifier = (self.rapport - 0.5) * base
        return max(0.01, base + modifier)

    def respond(self, agent_message: str, agent_terms: Dict,
                round_number: int, consecutive_concessions: int) -> Tuple[str, Dict]:
        """React to one buyer offer.

        Args:
            agent_message: Free-text message from the buyer (drives rapport).
            agent_terms: Structured terms proposed by the buyer.
            round_number: Current round, used to pick "near_close" wording.
            consecutive_concessions: How many times in a row the buyer raised
                its price; the aggressive persona hardens at 2 or more.

        Returns:
            ``(message, terms)``.  On acceptance ``terms`` is a copy of
            ``agent_terms`` with ``"_accepted": True`` added; otherwise it is
            ``agent_terms`` with ``price`` replaced by the new counter price.
        """
        self.update_rapport(agent_message)
        self.concession_count += 1

        # A missing or null price is treated as 0, i.e. never acceptable.
        agent_price = agent_terms.get('price') or 0

        # Check if we should accept
        if agent_price >= self.price_floor and self._acceptance_condition(agent_terms):
            return self.templates["accept"], {**agent_terms, "_accepted": True}

        # Compute counter position
        concession = self.get_concession_rate()

        # Aggressive anchor hardens if consecutive concessions detected
        if self.persona == "aggressive_anchor" and consecutive_concessions >= 2:
            concession = concession * 0.4  # barely moves
            template_key = "hardening"
        elif round_number >= self._max_rounds() * 0.7:
            template_key = "near_close"
        else:
            template_key = "counter"

        new_position = self.current_position * (1 - concession)
        new_position = max(self.price_floor, new_position)
        self.current_position = new_position

        # Select template (personas without the chosen key fall back to "counter")
        templates_for_key = self.templates.get(template_key, self.templates["counter"])
        template = self.rng.choice(templates_for_key)

        # Every price placeholder shows the price actually being offered.
        # "${floor}" previously expanded to the hidden reservation price,
        # which leaked it to the agent and contradicted counter_terms['price'].
        quoted = f"${new_position:,.0f}"
        message = template
        for placeholder in ("${counter}", "${floor}", "${close}"):
            message = message.replace(placeholder, quoted)

        counter_terms = dict(agent_terms)
        counter_terms['price'] = round(new_position, 2)

        # Cash flow stressed adds payment commentary
        if self.persona == "cash_flow_stressed" and 'payment_days' in agent_terms:
            if agent_terms['payment_days'] > 60:
                message += " Though I'll need to flag the payment timing to our finance team."

        return message, counter_terms

    def _acceptance_condition(self, terms: Dict) -> bool:
        """Return whether non-price terms are acceptable for this persona."""
        if self.persona == "cash_flow_stressed":
            # Missing payment_days defaults to 60, which is not acceptable.
            return terms.get('payment_days', 60) <= 45
        return True

    def _max_rounds(self) -> int:
        """Return the round limit associated with this opponent's task."""
        return {"single_issue": 6, "multi_issue": 8, "adversarial": 10}[self.task_id]

    def get_opening_message(self) -> Tuple[str, Dict]:
        """Return the supplier's opening message and its opening terms."""
        template = self.rng.choice(self.templates["opening"])
        message = template.replace("${target}", f"${self.price_target:,.0f}")
        terms = {"price": round(self.price_target, 2)}
        if self.task_id in ["multi_issue", "adversarial"]:
            terms["payment_days"] = 90
        if self.task_id == "adversarial":
            terms["support_hours"] = 80
        return message, terms
|
| 423 |
+
```
|
| 424 |
+
|
| 425 |
+
---
|
| 426 |
+
|
| 427 |
+
**graders.py — pure Python, zero LLM calls:**
|
| 428 |
+
|
| 429 |
+
```python
|
| 430 |
+
from typing import Dict, Optional
|
| 431 |
+
|
| 432 |
+
def grade_single_issue(
    final_terms: Dict,
    deal_reached: bool,
    rounds_taken: int,
    max_rounds: int = 6
) -> float:
    """Score a price-only negotiation from the buyer's point of view.

    Args:
        final_terms: Agreed terms; ``price`` defaults to 99999 when absent.
        deal_reached: Whether the episode ended with a deal.
        rounds_taken: Number of rounds used.
        max_rounds: Round budget for the task.

    Returns:
        0.0 without a deal; otherwise the clamped price value multiplied by
        an efficiency factor, rounded to four decimals.
    """
    if not deal_reached:
        return 0.0

    # Buyer target: $38K, seller floor: ~$44K
    buyer_target, seller_floor = 38000, 44000
    agreed_price = final_terms.get('price', 99999)

    # Share of the target-to-floor range captured by the buyer, clamped to [0, 1].
    captured = (seller_floor - agreed_price) / (seller_floor - buyer_target)
    captured = max(0.0, min(1.0, captured))

    # Efficiency penalty grows sharply in late rounds, but never below 0.1.
    late_penalty = (rounds_taken / max_rounds) ** 1.5 * 0.4
    speed_factor = max(0.1, 1.0 - late_penalty)

    return round(captured * speed_factor, 4)
|
| 455 |
+
|
| 456 |
+
|
| 457 |
+
def grade_multi_issue(
    final_terms: Dict,
    deal_reached: bool,
    rounds_taken: int,
    max_rounds: int = 8
) -> float:
    """Score a price + payment-terms negotiation from the buyer's side.

    Args:
        final_terms: Agreed terms; ``price`` defaults to 99999 and
            ``payment_days`` to 90 when absent.
        deal_reached: Whether the episode ended with a deal.
        rounds_taken: Number of rounds used.
        max_rounds: Round budget for the task.

    Returns:
        0.0 without a deal; otherwise ``0.70 * price + 0.30 * payment``
        scaled by a linear efficiency factor, rounded to four decimals.
    """
    if not deal_reached:
        return 0.0

    price_worst, price_target = 58000, 40000
    payment_worst, payment_target = 90, 30

    agreed_price = final_terms.get('price', 99999)
    agreed_days = final_terms.get('payment_days', 90)

    # Price component (buyer cares 70%), clamped to [0, 1].
    price_score = (price_worst - agreed_price) / (price_worst - price_target)
    price_score = max(0.0, min(1.0, price_score))

    # Payment component (buyer cares 30%), clamped to [0, 1].
    payment_score = (payment_worst - agreed_days) / (payment_worst - payment_target)
    payment_score = max(0.0, min(1.0, payment_score))

    value = 0.70 * price_score + 0.30 * payment_score

    # Using the full round budget costs 30% of the value; floor at 0.1.
    speed_factor = max(0.1, 1.0 - (rounds_taken / max_rounds) * 0.30)

    return round(value * speed_factor, 4)
|
| 486 |
+
|
| 487 |
+
|
| 488 |
+
def grade_adversarial(
    final_terms: Dict,
    deal_reached: bool,
    rounds_taken: int,
    consecutive_concessions_flag: bool,
    max_rounds: int = 10
) -> float:
    """Score the three-issue negotiation against the aggressive supplier.

    Args:
        final_terms: Agreed terms; defaults are ``price`` 999999,
            ``payment_days`` 90 and ``support_hours`` 80 when absent.
        deal_reached: Whether the episode ended with a deal.
        rounds_taken: Number of rounds used.
        consecutive_concessions_flag: True when the buyer showed the
            repeated-concession pattern; costs a flat 0.10.
        max_rounds: Round budget for the task.

    Returns:
        0.0 without a deal; otherwise the weighted value times efficiency,
        minus the pattern penalty, never below 0.15, rounded to four decimals.
    """
    if not deal_reached:
        return 0.0

    survival_floor = 0.15
    price_worst, price_target = 120000, 80000

    agreed_price = final_terms.get('price', 999999)
    agreed_days = final_terms.get('payment_days', 90)
    agreed_support = final_terms.get('support_hours', 80)

    # Price (buyer weight 40%)
    price_score = (price_worst - agreed_price) / (price_worst - price_target)
    price_score = max(0.0, min(1.0, price_score))

    # Payment (buyer weight 35%)
    # NOTE(review): 30 days scores best here, while TASK_CONFIG tells the buyer
    # payment_days target=60 / worst=30 — confirm which direction is intended.
    payment_score = (90 - agreed_days) / (90 - 30)
    payment_score = max(0.0, min(1.0, payment_score))

    # Support hours (buyer weight 25%)
    support_score = (agreed_support - 80) / (200 - 80)
    support_score = max(0.0, min(1.0, support_score))

    value = 0.40 * price_score + 0.35 * payment_score + 0.25 * support_score
    speed_factor = max(0.1, 1.0 - (rounds_taken / max_rounds) * 0.25)

    # Penalty for being exploited by consecutive concession pattern
    if consecutive_concessions_flag:
        pattern_penalty = 0.10
    else:
        pattern_penalty = 0.0

    raw_score = (value * speed_factor) - pattern_penalty
    return round(max(survival_floor, raw_score), 4)
|
| 527 |
+
|
| 528 |
+
|
| 529 |
+
def grade(task_id: str, final_terms: Dict, deal_reached: bool,
          rounds_taken: int, **kwargs) -> float:
    """Dispatch to the grader that matches ``task_id``.

    Only the adversarial grader consumes ``kwargs``: it reads
    ``consecutive_concessions_flag`` (default ``False``).  Each grader runs
    with its own default ``max_rounds``.

    Raises:
        ValueError: If ``task_id`` is not a known task.
    """
    if task_id == "single_issue":
        return grade_single_issue(final_terms, deal_reached, rounds_taken)

    if task_id == "multi_issue":
        return grade_multi_issue(final_terms, deal_reached, rounds_taken)

    if task_id == "adversarial":
        exploited = kwargs.get("consecutive_concessions_flag", False)
        return grade_adversarial(final_terms, deal_reached, rounds_taken, exploited)

    raise ValueError(f"Unknown task: {task_id}")
|
| 541 |
+
```
|
| 542 |
+
|
| 543 |
+
---
|
| 544 |
+
|
| 545 |
+
**environment.py:**
|
| 546 |
+
|
| 547 |
+
```python
|
| 548 |
+
import uuid
|
| 549 |
+
from typing import Optional
|
| 550 |
+
from procure_rl.models import NegotiationAction, NegotiationObservation, NegotiationState
|
| 551 |
+
from procure_rl.opponent import ScriptedPersonaOpponent
|
| 552 |
+
from procure_rl.graders import grade
|
| 553 |
+
|
| 554 |
+
# Per-task settings: supplier persona, round budget, and the constraints
# that are shown to the buyer agent in every observation.
TASK_CONFIG = {
    # Price-only negotiation.
    "single_issue": {
        "persona": "cooperative",
        "max_rounds": 6,
        "buyer_constraints": {
            "price": {"target": 38000, "worst": 52000, "budget": 50000},
        },
    },
    # Price plus payment terms.
    "multi_issue": {
        "persona": "cash_flow_stressed",
        "max_rounds": 8,
        "buyer_constraints": {
            "price": {"target": 40000, "worst": 58000, "budget": 55000},
            "payment_days": {"target": 60, "worst": 30, "preference": 60},
        },
    },
    # Price, payment terms and support hours.
    "adversarial": {
        "persona": "aggressive_anchor",
        "max_rounds": 10,
        "buyer_constraints": {
            "price": {"target": 80000, "worst": 120000, "budget": 115000},
            "payment_days": {"target": 60, "worst": 30, "preference": 60},
            "support_hours": {"target": 150, "worst": 80, "preference": 150},
        },
    },
}
|
| 580 |
+
|
| 581 |
+
try:
|
| 582 |
+
from openenv.core.env_server import Environment
|
| 583 |
+
except ImportError:
|
| 584 |
+
class Environment:
|
| 585 |
+
pass
|
| 586 |
+
|
| 587 |
+
class ProcureRLEnvironment(Environment):
    """Turn-based procurement negotiation between a buyer agent and a scripted supplier.

    ``reset`` starts an episode for one of the tasks in ``TASK_CONFIG`` and
    returns the supplier's opening offer.  ``step`` applies one buyer action
    and returns ``(observation, reward, done, info)``.  The reward is non-zero
    only on the step that closes a deal, where it equals the task grader's
    score; every other step yields 0.0.
    """

    def __init__(self):
        self._state = NegotiationState()
        self._opponent = None
        self._task_config = None  # stays None until reset() is called
        self._done = False
        self._last_offer = {}
        self._consecutive_concessions = 0
        self._prev_agent_price = None
        self._history = []  # chronological exchange log for the current episode

    def reset(self, task_id: str = "single_issue", seed: int = 42) -> NegotiationObservation:
        """Start a new episode and return the supplier's opening observation.

        Args:
            task_id: Key of ``TASK_CONFIG`` selecting persona and round budget.
            seed: Seed forwarded to the scripted opponent.

        Raises:
            ValueError: If ``task_id`` is not in ``TASK_CONFIG``.
        """
        if task_id not in TASK_CONFIG:
            raise ValueError(f"Unknown task: {task_id}. Valid: {list(TASK_CONFIG.keys())}")

        config = TASK_CONFIG[task_id]
        self._task_config = config
        self._done = False
        self._consecutive_concessions = 0
        self._prev_agent_price = None

        self._opponent = ScriptedPersonaOpponent(
            task_id=task_id,
            seed=seed,
            persona=config["persona"]
        )

        opening_msg, opening_terms = self._opponent.get_opening_message()
        self._last_offer = opening_terms
        self._history = [{"role": "supplier", "message": opening_msg, "terms": opening_terms}]

        self._state = NegotiationState(
            task_id=task_id,
            episode_id=str(uuid.uuid4())[:8],
            round_number=0,
            rapport_score=0.5,
            consecutive_concessions=0,
            deal_reached=False,
            final_terms=None,
            cumulative_reward=0.0
        )

        return NegotiationObservation(
            task_id=task_id,
            round_number=0,
            max_rounds=config["max_rounds"],
            supplier_message=opening_msg,
            current_offer=opening_terms,
            last_4_exchanges=list(self._history),
            buyer_constraints=config["buyer_constraints"],
            rapport_hint="neutral",
            done=False
        )

    def step(self, action: NegotiationAction):
        """Apply one buyer action.

        ``accept`` closes the deal on the supplier's last offer; ``reject``
        consumes a round without consulting the supplier; any other move type
        is forwarded to the opponent as an offer.

        Returns:
            ``(observation, reward, done, info)``.

        Raises:
            RuntimeError: If called before ``reset``.
        """
        if self._task_config is None:
            # Previously surfaced as an opaque TypeError on None["max_rounds"].
            raise RuntimeError("step() called before reset(); call reset() to start an episode.")

        if self._done:
            obs = self._make_obs("Episode finished. Call reset().")
            return obs, 0.0, True, {"error": "episode_done"}

        self._state.round_number += 1
        round_num = self._state.round_number
        max_rounds = self._task_config["max_rounds"]

        self._track_concessions(action.terms.get('price'))
        self._history.append({"role": "buyer", "message": action.message, "terms": action.terms})

        # Handle accept: the buyer takes the supplier's latest offer as-is.
        if action.move_type == "accept":
            return self._close_deal(self._last_offer, round_num)

        # Handle reject
        if action.move_type == "reject":
            if round_num >= max_rounds:
                self._done = True
                return self._make_obs(), 0.0, True, {"error": "rejected_at_limit"}
            return self._make_obs(), 0.0, False, {}

        # Handle make_offer or bundle
        opponent_msg, opponent_terms = self._opponent.respond(
            agent_message=action.message,
            agent_terms=action.terms,
            round_number=round_num,
            consecutive_concessions=self._consecutive_concessions
        )

        # Check if opponent accepted: the deal closes on the buyer's own terms.
        if opponent_terms.get("_accepted"):
            self._history.append({"role": "supplier", "message": opponent_msg, "terms": action.terms})
            return self._close_deal(action.terms, round_num, supplier_message=opponent_msg)

        self._last_offer = opponent_terms
        self._state.rapport_score = self._opponent.rapport
        self._history.append({"role": "supplier", "message": opponent_msg, "terms": opponent_terms})

        # Episode limit
        if round_num >= max_rounds:
            self._done = True
            obs = self._make_obs(supplier_message=opponent_msg)
            return obs, 0.0, True, {"error": "max_rounds_reached"}

        return self._make_obs(supplier_message=opponent_msg), 0.0, False, {}

    def state(self) -> NegotiationState:
        """Return the live episode state object."""
        return self._state

    def _track_concessions(self, offered_price) -> None:
        """Update the consecutive-concession counter from the buyer's price.

        A concession is a price strictly above the buyer's previous price.
        Actions that carry no price (for example a bare ``accept``) leave the
        counter untouched; previously they reset it to 0 and cleared the
        previous price, which hid the concession pattern from the grader.
        """
        if offered_price is not None:
            if self._prev_agent_price is not None:
                if offered_price > self._prev_agent_price:  # moved toward the seller
                    self._consecutive_concessions += 1
                else:
                    self._consecutive_concessions = 0
            self._prev_agent_price = offered_price
        self._state.consecutive_concessions = self._consecutive_concessions

    def _close_deal(self, terms, round_num: int, supplier_message: Optional[str] = None):
        """Mark the episode as a reached deal on ``terms`` and grade it."""
        self._done = True
        self._state.deal_reached = True
        self._state.final_terms = terms
        reward = grade(
            self._state.task_id,
            terms,
            True,
            round_num,
            consecutive_concessions_flag=(self._consecutive_concessions >= 2)
        )
        self._state.cumulative_reward = reward
        obs = self._make_obs(supplier_message=supplier_message)
        return obs, reward, True, {"deal_price": terms.get('price')}

    def _make_obs(self, supplier_message: Optional[str] = None) -> NegotiationObservation:
        """Build an observation from the current state.

        The rapport score is exposed only as a coarse hint: "positive" at
        0.65 and above, "negative" at 0.35 and below, otherwise "neutral".
        """
        rapport = self._state.rapport_score
        if rapport >= 0.65:
            hint = "positive"
        elif rapport <= 0.35:
            hint = "negative"
        else:
            hint = "neutral"

        return NegotiationObservation(
            task_id=self._state.task_id,
            round_number=self._state.round_number,
            max_rounds=self._task_config["max_rounds"],
            supplier_message=supplier_message or "",
            current_offer=self._last_offer,
            last_4_exchanges=self._history[-4:],
            buyer_constraints=self._task_config["buyer_constraints"],
            rapport_hint=hint,
            done=self._done
        )
|
| 753 |
+
```
|
| 754 |
+
|
| 755 |
+
---
|
| 756 |
+
|
| 757 |
+
**server/app.py:**
|
| 758 |
+
|
| 759 |
+
```python
|
| 760 |
+
import sys, os
|
| 761 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 762 |
+
|
| 763 |
+
from dataclasses import asdict
|
| 764 |
+
from fastapi import FastAPI, HTTPException
|
| 765 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 766 |
+
from pydantic import BaseModel
|
| 767 |
+
from typing import Optional, Dict, Any
|
| 768 |
+
|
| 769 |
+
from procure_rl.environment import ProcureRLEnvironment
|
| 770 |
+
from procure_rl.models import NegotiationAction
|
| 771 |
+
|
| 772 |
+
app = FastAPI(title="ProcureRL", version="1.0.0")
|
| 773 |
+
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
|
| 774 |
+
|
| 775 |
+
_env = ProcureRLEnvironment()
|
| 776 |
+
|
| 777 |
+
class ResetRequest(BaseModel):
    """Body of ``POST /reset``; both fields are optional."""

    # Task to start; forwarded to the environment's reset().
    task_id: Optional[str] = "single_issue"
    # Seed forwarded to the environment's reset().
    seed: Optional[int] = 42
|
| 780 |
+
|
| 781 |
+
class StepRequest(BaseModel):
    """Body of ``POST /step``: one buyer action."""

    # Kind of move, e.g. "make_offer".
    move_type: str = "make_offer"
    # Structured terms attached to the move.
    terms: Dict[str, Any] = {}
    # Free-text message sent to the supplier.
    message: str = ""
|
| 785 |
+
|
| 786 |
+
@app.get("/health")
async def health():
    """Liveness probe: always reports the service as up."""
    payload = {"status": "ok", "service": "procure-rl"}
    return payload
|
| 789 |
+
|
| 790 |
+
@app.get("/metadata")
async def metadata():
    """Describe the environment: its name, task ids and a short summary."""
    task_ids = ["single_issue", "multi_issue", "adversarial"]
    return {
        "name": "procure-rl",
        "tasks": task_ids,
        "description": "LLM agent learns procurement negotiation",
    }
|
| 797 |
+
|
| 798 |
+
@app.post("/reset")
async def reset(req: ResetRequest = ResetRequest()):
    """Start a new episode on the shared environment.

    Returns the opening observation as a plain dict.  A ``ValueError`` from
    the environment becomes HTTP 400; any other failure becomes HTTP 500.
    """
    try:
        opening = _env.reset(task_id=req.task_id, seed=req.seed)
        return asdict(opening)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc))
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Reset failed: {exc}")
|
| 807 |
+
|
| 808 |
+
@app.post("/step")
async def step(req: StepRequest):
    """Apply one buyer action to the shared environment.

    Returns a dict with ``observation``, ``reward``, ``done`` and ``info``.
    Any exception raised while stepping becomes HTTP 500.
    """
    buyer_action = NegotiationAction(
        move_type=req.move_type,
        terms=req.terms,
        message=req.message,
    )
    try:
        observation, reward, done, info = _env.step(buyer_action)
        return {
            "observation": asdict(observation),
            "reward": reward,
            "done": done,
            "info": info,
        }
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Step failed: {exc}")
|
| 820 |
+
|
| 821 |
+
@app.get("/state")
async def state():
    """Return the shared environment's current episode state as a dict."""
    current = _env.state()
    return asdict(current)
|
| 824 |
+
|
| 825 |
+
if __name__ == "__main__":
    # Local entry point: serve on $PORT (7860 when unset) on all interfaces.
    import uvicorn

    uvicorn.run(
        "server.app:app",
        host="0.0.0.0",
        port=int(os.getenv("PORT", 7860)),
    )
|
| 829 |
+
```
|
| 830 |
+
|
| 831 |
+
---
|
| 832 |
+
|
| 833 |
+
**inference.py — exact stdout format, no deviation:**
|
| 834 |
+
|
| 835 |
+
```python
|
| 836 |
+
import os
|
| 837 |
+
import json
|
| 838 |
+
from openai import OpenAI
|
| 839 |
+
|
| 840 |
+
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 841 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
|
| 842 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
|
| 843 |
+
BENCHMARK = "procure-rl"
|
| 844 |
+
MAX_STEPS = 10
|
| 845 |
+
|
| 846 |
+
client = OpenAI(api_key=API_KEY, base_url=API_BASE_URL)
|
| 847 |
+
|
| 848 |
+
# Import environment directly (not via HTTP for baseline)
|
| 849 |
+
import sys
|
| 850 |
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
| 851 |
+
from procure_rl.environment import ProcureRLEnvironment
|
| 852 |
+
from procure_rl.models import NegotiationAction
|
| 853 |
+
|
| 854 |
+
TASKS = ["single_issue", "multi_issue", "adversarial"]
|
| 855 |
+
|
| 856 |
+
SYSTEM_PROMPT = """You are a professional procurement negotiator. Your goal is to negotiate the best possible deal for your company.
|
| 857 |
+
|
| 858 |
+
You will receive a supplier's message and current offer terms. You must respond with a JSON action in this exact format:
|
| 859 |
+
{
|
| 860 |
+
"move_type": "make_offer",
|
| 861 |
+
"terms": {"price": 42000, "payment_days": 45},
|
| 862 |
+
"message": "Your natural language response to the supplier"
|
| 863 |
+
}
|
| 864 |
+
|
| 865 |
+
move_type must be one of: make_offer, accept, reject, bundle
|
| 866 |
+
terms must include price and any other issues being negotiated.
|
| 867 |
+
message should be professional and collaborative when possible.
|
| 868 |
+
|
| 869 |
+
Your buyer constraints will be provided. Do not exceed your budget. Try to reach the target price."""
|
| 870 |
+
|
| 871 |
+
def get_agent_action(obs_dict: dict) -> dict:
    """Ask the chat model for the next negotiation action.

    Builds a user prompt from the observation, sends it together with
    ``SYSTEM_PROMPT``, and extracts the outermost ``{...}`` span of the reply
    as JSON.

    Args:
        obs_dict: Observation fields; missing keys fall back to defaults.

    Returns:
        The parsed action dict.  When the reply contains no braces, a
        ``make_offer`` echoing the current offer with the first 200 characters
        of the reply as message; when the braces do not hold valid JSON, the
        same offer with a stock message.
    """
    task_id = obs_dict.get("task_id", "single_issue")
    supplier_msg = obs_dict.get("supplier_message", "")
    current_offer = obs_dict.get("current_offer", {})
    constraints = obs_dict.get("buyer_constraints", {})
    rapport_hint = obs_dict.get("rapport_hint", "neutral")
    round_num = obs_dict.get("round_number", 0)
    max_rounds = obs_dict.get("max_rounds", 10)

    user_content = f"""Task: {task_id}
Round: {round_num}/{max_rounds}
Supplier says: "{supplier_msg}"
Current offer on table: {json.dumps(current_offer)}
Your constraints: {json.dumps(constraints)}
Relationship rapport: {rapport_hint}

Respond with your negotiation action as JSON."""

    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_content}
        ],
        max_tokens=300,
        temperature=0.3
    )

    # The API may return no text content; treat that as an empty reply
    # instead of crashing on None.strip().
    content = (response.choices[0].message.content or "").strip()

    # Find JSON in response: outermost brace pair.
    start = content.find('{')
    end = content.rfind('}') + 1
    if start < 0 or end <= start:
        # Fallback: no JSON object at all, keep the offer on the table.
        return {
            "move_type": "make_offer",
            "terms": current_offer,
            "message": content[:200]
        }

    try:
        return json.loads(content[start:end])
    except json.JSONDecodeError:
        # Only malformed JSON is expected here; a bare ``except`` would also
        # swallow KeyboardInterrupt and genuine programming errors.
        return {
            "move_type": "make_offer",
            "terms": current_offer,
            "message": "I'd like to continue our discussion."
        }
|
| 923 |
+
|
| 924 |
+
def run_task(task_id: str) -> dict:
    """Run one baseline episode of ``task_id`` and print the trace lines.

    Emits one ``[START]`` line, one ``[STEP]`` line per environment step and
    one ``[END]`` line.  The episode stops when the environment reports done
    or after ``MAX_STEPS`` steps.

    Returns:
        A dict with the task id, the final score and the number of steps.
    """

    def as_prompt_fields(observation) -> dict:
        # Subset of the observation that get_agent_action() consumes.
        return {
            "task_id": observation.task_id,
            "round_number": observation.round_number,
            "max_rounds": observation.max_rounds,
            "supplier_message": observation.supplier_message,
            "current_offer": observation.current_offer,
            "buyer_constraints": observation.buyer_constraints,
            "rapport_hint": observation.rapport_hint,
            "done": observation.done
        }

    env = ProcureRLEnvironment()
    obs_dict = as_prompt_fields(env.reset(task_id=task_id, seed=42))

    print(f"[START] task={task_id} env={BENCHMARK} model={MODEL_NAME}")

    rewards = []
    final_score = 0.0
    step = 0

    for step in range(1, MAX_STEPS + 1):
        proposal = get_agent_action(obs_dict)
        action = NegotiationAction(
            move_type=proposal.get("move_type", "make_offer"),
            terms=proposal.get("terms", {}),
            message=proposal.get("message", "")
        )

        obs, reward, done, info = env.step(action)
        rewards.append(reward)

        action_str = f"{action.move_type}({json.dumps(action.terms)})"
        error = info.get("error")

        print(f"[STEP] step={step} action={action_str} reward={reward:.2f} done={str(done).lower()} error={error or 'null'}")

        if done:
            # Use the closing reward when positive, else the best reward seen.
            if reward > 0:
                final_score = reward
            else:
                final_score = max(rewards) if rewards else 0.0
            break

        obs_dict = as_prompt_fields(obs)

    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    success = final_score > 0.1

    print(f"[END] success={str(success).lower()} steps={step} score={final_score:.2f} rewards={rewards_str}")

    return {"task": task_id, "score": final_score, "steps": step}
|
| 984 |
+
|
| 985 |
+
if __name__ == "__main__":
    # Run every task in order, then print a per-task score summary.
    results = [run_task(task) for task in TASKS]

    print("\nBaseline Results:")
    for outcome in results:
        print(f"  {outcome['task']}: {outcome['score']:.3f}")
|
| 994 |
+
```
|
| 995 |
+
|
| 996 |
+
---
|
| 997 |
+
|
| 998 |
+
**openenv.yaml:**
|
| 999 |
+
|
| 1000 |
+
```yaml
|
| 1001 |
+
name: procure-rl
|
| 1002 |
+
version: "1.0.0"
|
| 1003 |
+
description: "LLM agent learns procurement negotiation strategy against scripted supplier opponents with hidden utility functions"
|
| 1004 |
+
author: "your-hf-username"
|
| 1005 |
+
tags:
|
| 1006 |
+
- openenv
|
| 1007 |
+
- negotiation
|
| 1008 |
+
- procurement
|
| 1009 |
+
- real-world
|
| 1010 |
+
- rl
|
| 1011 |
+
tasks:
|
| 1012 |
+
- id: single_issue
|
| 1013 |
+
description: "Negotiate software license price with cooperative supplier"
|
| 1014 |
+
difficulty: easy
|
| 1015 |
+
max_steps: 6
|
| 1016 |
+
reward_range: [0.0, 1.0]
|
| 1017 |
+
- id: multi_issue
|
| 1018 |
+
description: "Negotiate price and payment terms with cash-flow-sensitive supplier"
|
| 1019 |
+
difficulty: medium
|
| 1020 |
+
max_steps: 8
|
| 1021 |
+
reward_range: [0.0, 1.0]
|
| 1022 |
+
- id: adversarial
|
| 1023 |
+
description: "Negotiate multiple issues against aggressive anchoring supplier"
|
| 1024 |
+
difficulty: hard
|
| 1025 |
+
max_steps: 10
|
| 1026 |
+
reward_range: [0.0, 1.0]
|
| 1027 |
+
reward_range: [0.0, 1.0]
|
| 1028 |
+
observation_space:
|
| 1029 |
+
type: object
|
| 1030 |
+
description: "Natural language supplier message with structured negotiation state and rapport signal"
|
| 1031 |
+
action_space:
|
| 1032 |
+
type: object
|
| 1033 |
+
description: "Negotiation move type, structured terms, and natural language message"
|
| 1034 |
+
```
|
| 1035 |
+
|
| 1036 |
+
---
|
| 1037 |
+
|
| 1038 |
+
**Dockerfile:**
|
| 1039 |
+
|
| 1040 |
+
```dockerfile
|
| 1041 |
+
FROM python:3.11-slim
|
| 1042 |
+
WORKDIR /app
|
| 1043 |
+
COPY requirements.txt .
|
| 1044 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 1045 |
+
COPY . .
|
| 1046 |
+
ENV PORT=7860
|
| 1047 |
+
EXPOSE 7860
|
| 1048 |
+
CMD ["python", "-m", "uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
| 1049 |
+
```
|
| 1050 |
+
|
| 1051 |
+
---
|
| 1052 |
+
|
| 1053 |
+
**requirements.txt:**
|
| 1054 |
+
|
| 1055 |
+
```
|
| 1056 |
+
fastapi==0.109.0
|
| 1057 |
+
uvicorn==0.27.0
|
| 1058 |
+
pydantic>=2.0.0
|
| 1059 |
+
openai>=1.0.0
|
| 1060 |
+
openenv-core>=0.1.0
|
| 1061 |
+
```
|
| 1062 |
+
|
| 1063 |
+
---
|
| 1064 |
+
|
| 1065 |
+
Build all files exactly as specified. Run locally with:
|
| 1066 |
+
|
| 1067 |
+
```
|
| 1068 |
+
docker build -t procure-rl .
|
| 1069 |
+
docker run -p 7860:7860 procure-rl
|
| 1070 |
+
```
|
| 1071 |
+
|
| 1072 |
+
Test with:
|
| 1073 |
+
|
| 1074 |
+
```
|
| 1075 |
+
curl -X POST http://localhost:7860/reset -H "Content-Type: application/json" -d '{"task_id": "single_issue"}'
|
| 1076 |
+
```
|
| 1077 |
+
|
| 1078 |
+
Then run inference.py locally with HF_TOKEN set and verify that its stdout follows the [START] / [STEP] / [END] line format exactly.
|
| 1079 |
+
|
| 1080 |
+
---
|
| 1081 |
+
|
| 1082 |
+
# PLAN.MD
|
| 1083 |
+
|
| 1084 |
+
````markdown
|
| 1085 |
+
# ProcureRL — Implementation Plan
|
| 1086 |
+
|
| 1087 |
+
## What We Are Building
|
| 1088 |
+
|
| 1089 |
+
An OpenEnv-compliant RL environment where an LLM agent learns
|
| 1090 |
+
procurement negotiation strategy against scripted supplier opponents.
|
| 1091 |
+
|
| 1092 |
+
The key innovation: language-sensitive opponent behavior. The agent's
|
| 1093 |
+
natural-language quality affects opponent concession rates, making an LLM
|
| 1094 |
+
genuinely required — not just for parsing but for output quality.
|
| 1095 |
+
|
| 1096 |
+
## Why This Wins
|
| 1097 |
+
|
| 1098 |
+
- Zero negotiation environments in OpenEnv hub — confirmed
|
| 1099 |
+
- Documented LLM weakness in buyer negotiation (ACL 2024)
|
| 1100 |
+
- Walmart/Pactum market validation — real enterprise deployment exists
|
| 1101 |
+
- Nash-inspired grader with language mechanism — novel and memorable
|
| 1102 |
+
- Deterministic, reproducible, pure Python graders
|
| 1103 |
+
|
| 1104 |
+
## Implementation Order (strict)
|
| 1105 |
+
|
| 1106 |
+
### Phase 1: Core Logic (Day 1, first 4 hours)
|
| 1107 |
+
|
| 1108 |
+
- [ ] procure_rl/models.py — dataclasses only
|
| 1109 |
+
- [ ] procure_rl/opponent.py — ScriptedPersonaOpponent
|
| 1110 |
+
- [ ] procure_rl/graders.py — three grader functions
|
| 1111 |
+
- [ ] procure_rl/environment.py — ProcureRLEnvironment
|
| 1112 |
+
- [ ] Test: import and run reset() + step() in Python shell
|
| 1113 |
+
|
| 1114 |
+
### Phase 2: Server (Day 1, next 2 hours)
|
| 1115 |
+
|
| 1116 |
+
- [ ] server/app.py — FastAPI with /health /reset /step /state
|
| 1117 |
+
- [ ] requirements.txt
|
| 1118 |
+
- [ ] Test: uvicorn server.app:app, curl /health
|
| 1119 |
+
|
| 1120 |
+
### Phase 3: Spec Compliance (Day 1, final 2 hours)
|
| 1121 |
+
|
| 1122 |
+
- [ ] openenv.yaml — exact schema
|
| 1123 |
+
- [ ] Run: openenv validate
|
| 1124 |
+
- [ ] Fix any validation errors
|
| 1125 |
+
|
| 1126 |
+
### Phase 4: Dockerfile + HF Spaces (Day 2, first 3 hours)
|
| 1127 |
+
|
| 1128 |
+
- [ ] Dockerfile
|
| 1129 |
+
- [ ] docker build -t procure-rl .
|
| 1130 |
+
- [ ] docker run -p 7860:7860 procure-rl
|
| 1131 |
+
- [ ] curl http://localhost:7860/health
|
| 1132 |
+
- [ ] Push to HF Spaces
|
| 1133 |
+
|
| 1134 |
+
### Phase 5: Inference Script (Day 2, next 2 hours)
|
| 1135 |
+
|
| 1136 |
+
- [ ] inference.py
|
| 1137 |
+
- [ ] Run locally: HF_TOKEN=xxx python inference.py
|
| 1138 |
+
- [ ] Verify [START][STEP][END] format exactly
|
| 1139 |
+
- [ ] Verify runtime < 20 minutes
|
| 1140 |
+
|
| 1141 |
+
### Phase 6: README + Calibration (Day 2, final 2 hours)
|
| 1142 |
+
|
| 1143 |
+
- [ ] README.md with all required sections
|
| 1144 |
+
- [ ] Run inference.py with weak model (7B) and strong model (72B)
|
| 1145 |
+
- [ ] Verify score spread exists
|
| 1146 |
+
- [ ] Submit
|
| 1147 |
+
|
| 1148 |
+
## Critical Checks Before Submission
|
| 1149 |
+
|
| 1150 |
+
```bash
|
| 1151 |
+
# 1. Spec compliance
|
| 1152 |
+
openenv validate
|
| 1153 |
+
|
| 1154 |
+
# 2. Docker build
|
| 1155 |
+
docker build -t procure-rl .
|
| 1156 |
+
|
| 1157 |
+
# 3. Docker run
|
| 1158 |
+
docker run -p 7860:7860 procure-rl &
|
| 1159 |
+
curl -X POST http://localhost:7860/reset \
|
| 1160 |
+
-H "Content-Type: application/json" \
|
| 1161 |
+
-d '{"task_id": "single_issue"}'
|
| 1162 |
+
|
| 1163 |
+
# 4. Inference script
|
| 1164 |
+
HF_TOKEN=your_token python inference.py
|
| 1165 |
+
|
| 1166 |
+
# 5. Score verification
|
| 1167 |
+
# single_issue: should be 0.30-0.55
|
| 1168 |
+
# multi_issue: should be 0.15-0.35
|
| 1169 |
+
# adversarial: should be 0.10-0.25
|
| 1170 |
+
```
|
| 1171 |
+
````
|
| 1172 |
+
|
| 1173 |
+
## Score Calibration Targets
|
| 1174 |
+
|
| 1175 |
+
| Task | Random | Base LLM | Goal |
|
| 1176 |
+
| ------------ | --------- | --------- | --------- |
|
| 1177 |
+
| single_issue | 0.15-0.25 | 0.35-0.50 | 0.65-0.78 |
|
| 1178 |
+
| multi_issue | 0.08-0.15 | 0.20-0.32 | 0.52-0.65 |
|
| 1179 |
+
| adversarial | 0.03-0.10 | 0.12-0.22 | 0.42-0.55 |
|
| 1180 |
+
|
| 1181 |
+
If base LLM scores above 0.55 on single_issue → opponent too easy,
|
| 1182 |
+
reduce cooperative concession rate.
|
| 1183 |
+
If base LLM scores below 0.15 on single_issue → opponent too hard,
|
| 1184 |
+
increase cooperative concession rate.
|
| 1185 |
+
|
| 1186 |
+
## README Required Sections
|
| 1187 |
+
|
| 1188 |
+
1. Environment description and motivation (Walmart/Pactum reference)
|
| 1189 |
+
2. The Language-Sensitive Opponent (this is your wow factor)
|
| 1190 |
+
3. Action space definition with examples
|
| 1191 |
+
4. Observation space definition
|
| 1192 |
+
5. Task descriptions with expected scores
|
| 1193 |
+
6. Setup instructions (pip install + docker)
|
| 1194 |
+
7. Baseline scores (from inference.py run)
|
| 1195 |
+
|
| 1196 |
+
## What NOT To Add
|
| 1197 |
+
|
| 1198 |
+
- Nash bargaining (too complex, edge cases)
|
| 1199 |
+
- Step reward shaping (shaping bias risk)
|
| 1200 |
+
- LLM inside environment (reproducibility)
|
| 1201 |
+
- More than 3 tasks (scope creep)
|
| 1202 |
+
- Preference shift mechanics (complexity risk)
|
| 1203 |
+
|
| 1204 |
+
## The One Sentence For Every Judge Question
|
| 1205 |
+
|
| 1206 |
+
"Why RL?"
|
| 1207 |
+
→ Sequential decisions, delayed reward, hidden opponent utility — policy
|
| 1208 |
+
only emerges through thousands of negotiation episodes.
|
| 1209 |
+
|
| 1210 |
+
"Why LLM?"
|
| 1211 |
+
→ Language quality directly affects opponent rapport score and concession
|
| 1212 |
+
rate. A heuristic agent gets neutral rapport. An LLM that learns
|
| 1213 |
+
collaborative framing gets cooperative responses. The language IS the policy.
|
| 1214 |
+
|
| 1215 |
+
"Is this real?"
|
| 1216 |
+
→ Walmart deployed Pactum for exactly this. 90% of CPOs are adopting AI
|
| 1217 |
+
negotiation in 2025. The gap between rule-based current tools and
|
| 1218 |
+
trained LLM policy is the research contribution.
|
| 1219 |
+
|
| 1220 |
+
"Is this novel?"
|
| 1221 |
+
→ Zero negotiation environments in OpenEnv hub. Confirmed.
|
| 1222 |
+
|
| 1223 |
+
```
|
| 1224 |
+
|
| 1225 |
+
---
|
| 1226 |
+
|
| 1227 |
+
**This is the final version. Build it exactly as specified.**
|
| 1228 |
+
```
|
pyproject.toml
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
[build-system]
|
| 8 |
+
requires = ["setuptools>=45", "wheel"]
|
| 9 |
+
build-backend = "setuptools.build_meta"
|
| 10 |
+
|
| 11 |
+
[project]
|
| 12 |
+
name = "openenv-Procure_RL"
|
| 13 |
+
version = "0.1.0"
|
| 14 |
+
description = "Procure Rl environment for OpenEnv"
|
| 15 |
+
requires-python = ">=3.10"
|
| 16 |
+
dependencies = [
|
| 17 |
+
# Core OpenEnv runtime (provides FastAPI server + HTTP client types)
|
| 18 |
+
# install from github
|
| 19 |
+
# "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
|
| 20 |
+
"openenv-core[core]>=0.2.2",
|
| 21 |
+
# Environment-specific dependencies
|
| 22 |
+
# Add all dependencies needed for your environment here
|
| 23 |
+
# Examples:
|
| 24 |
+
# "numpy>=1.19.0",
|
| 25 |
+
# "torch>=2.0.0",
|
| 26 |
+
# "gymnasium>=0.29.0",
|
| 27 |
+
# "openspiel>=1.0.0",
|
| 28 |
+
# "smolagents>=1.22.0,<2",
|
| 29 |
+
]
|
| 30 |
+
|
| 31 |
+
[project.optional-dependencies]
|
| 32 |
+
dev = [
|
| 33 |
+
"pytest>=8.0.0",
|
| 34 |
+
"pytest-cov>=4.0.0",
|
| 35 |
+
]
|
| 36 |
+
|
| 37 |
+
[project.scripts]
|
| 38 |
+
# Server entry point - enables running via: uv run --project . server
|
| 39 |
+
# or: python -m Procure_RL.server.app
|
| 40 |
+
server = "Procure_RL.server.app:main"
|
| 41 |
+
|
| 42 |
+
[tool.setuptools]
|
| 43 |
+
include-package-data = true
|
| 44 |
+
packages = ["Procure_RL", "Procure_RL.server"]
|
| 45 |
+
package-dir = { "Procure_RL" = ".", "Procure_RL.server" = "server" }
|
server/Procure_RL_environment.py
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
ProcureRL Environment Implementation.
|
| 9 |
+
|
| 10 |
+
An OpenEnv-compliant RL environment for procurement negotiation where
|
| 11 |
+
an LLM agent learns to negotiate against scripted supplier opponents.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import uuid
|
| 15 |
+
from typing import Optional, Dict, Any
|
| 16 |
+
|
| 17 |
+
try:
|
| 18 |
+
from openenv.core.env_server.interfaces import Environment
|
| 19 |
+
except ImportError:
|
| 20 |
+
Environment = object
|
| 21 |
+
|
| 22 |
+
try:
|
| 23 |
+
from ..models import NegotiationAction, NegotiationObservation, NegotiationState
|
| 24 |
+
from ..opponent import ScriptedPersonaOpponent
|
| 25 |
+
from ..graders import grade
|
| 26 |
+
except ImportError:
|
| 27 |
+
import sys
|
| 28 |
+
import os
|
| 29 |
+
|
| 30 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 31 |
+
from models import NegotiationAction, NegotiationObservation, NegotiationState
|
| 32 |
+
from opponent import ScriptedPersonaOpponent
|
| 33 |
+
from graders import grade
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# Scenario table keyed by the ``task_id`` accepted by
# ``ProcureRLEnvironment.reset``. For each task:
#   persona           - forwarded to ScriptedPersonaOpponent
#   max_rounds        - round count at which step() ends the episode
#   buyer_constraints - returned verbatim in every observation
TASK_CONFIG = {
    "single_issue": {
        "persona": "cooperative",
        "max_rounds": 6,
        "buyer_constraints": {
            "price": {"target": 36000, "worst": 55000, "budget": 53000},
        },
    },
    "multi_issue": {
        "persona": "cash_flow_stressed",
        "max_rounds": 8,
        "buyer_constraints": {
            "price": {"target": 40000, "worst": 58000, "budget": 55000},
            "payment_days": {"target": 60, "worst": 30, "preference": 60},
        },
    },
    "adversarial": {
        "persona": "aggressive_anchor",
        "max_rounds": 10,
        "buyer_constraints": {
            "price": {"target": 80000, "worst": 120000, "budget": 115000},
            "payment_days": {"target": 60, "worst": 30, "preference": 60},
            "support_hours": {"target": 150, "worst": 80, "preference": 150},
        },
    },
}

# Move types step() accepts; anything else yields an "invalid_move_type" error.
VALID_MOVES = ("make_offer", "accept", "reject", "bundle")
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
class ProcureRLEnvironment(Environment):
    """Procurement negotiation environment with a scripted supplier opponent.

    ``reset()`` selects a task from ``TASK_CONFIG``, builds a
    ``ScriptedPersonaOpponent`` and returns the supplier's opening offer.
    ``step()`` applies one buyer move. An episode ends when either side
    accepts (reward comes from ``grade``) or when the task's ``max_rounds``
    is reached without a deal (reward ``0.0``).
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self):
        self._state = NegotiationState()
        self._opponent = None
        self._task_config = None
        self._done = False
        self._last_offer: Dict[str, Any] = {}
        self._consecutive_concessions = 0
        self._prev_agent_price: Optional[float] = None
        self._exchanges: list = []
        self._last_info: Dict[str, Any] = {}
        # Overwritten by reset(); initialised here so the attribute always
        # exists. 52000.0 is the same fallback reset() uses.
        self._opponent_opening_price: float = 52000.0

    @staticmethod
    def _derive_opponent_seed(seed: int, task_id: str) -> int:
        """Return a 32-bit opponent seed that is stable across processes.

        The builtin ``hash()`` is salted per interpreter process for
        strings, so hashing ``(seed, task_id)`` would give a different
        opponent on every run. CRC32 of the textual pair is deterministic.
        """
        import zlib

        return zlib.crc32(f"{seed}:{task_id}".encode("utf-8"))

    def reset(
        self, seed: Optional[int] = None, episode_id: Optional[str] = None, **kwargs
    ) -> NegotiationObservation:
        """Start a new episode and return the supplier's opening observation.

        Args:
            seed: Seed for the scripted opponent; defaults to 42 when None.
            episode_id: Optional identifier; a short random one is generated
                when omitted.
            **kwargs: ``task_id`` selects the scenario (default
                ``"single_issue"``).

        Returns:
            The opening observation, or a ``done`` observation whose metadata
            carries ``unknown_task:<id>`` when ``task_id`` is not configured.
        """
        task_id = kwargs.get("task_id", "single_issue")
        seed = seed if seed is not None else 42

        if task_id not in TASK_CONFIG:
            # Record the error before building the observation so it does not
            # inherit metadata left over from a previous step.
            self._last_info = {"error": f"unknown_task:{task_id}"}
            obs = self._make_obs(
                f"Unknown task: {task_id}. Valid: {list(TASK_CONFIG.keys())}"
            )
            obs.done = True
            return obs

        config = TASK_CONFIG[task_id]
        self._task_config = config
        self._done = False
        self._consecutive_concessions = 0
        self._prev_agent_price = None
        self._exchanges = []
        self._last_info = {}

        self._opponent = ScriptedPersonaOpponent(
            task_id=task_id,
            seed=self._derive_opponent_seed(seed, task_id),
            persona=config["persona"],
        )

        opening_msg, opening_terms = self._opponent.get_opening_message()
        self._last_offer = opening_terms
        self._opponent_opening_price = opening_terms.get("price", 52000.0)

        self._state = NegotiationState(
            task_id=task_id,
            episode_id=episode_id or str(uuid.uuid4())[:8],
            round_number=0,
            step_count=0,
            rapport_score=0.5,
            consecutive_concessions=0,
            deal_reached=False,
            final_terms=None,
            cumulative_reward=0.0,
        )

        self._exchanges.append(
            {"role": "supplier", "message": opening_msg, "terms": opening_terms}
        )

        return NegotiationObservation(
            task_id=task_id,
            round_number=0,
            max_rounds=config["max_rounds"],
            supplier_message=opening_msg,
            current_offer=opening_terms,
            last_4_exchanges=self._exchanges[-4:],
            buyer_constraints=config["buyer_constraints"],
            rapport_hint="neutral",
            done=False,
        )

    def step(self, action: NegotiationAction, **kwargs) -> NegotiationObservation:
        """Apply one buyer move and return the resulting observation.

        ``make_offer`` and ``bundle`` are forwarded to the opponent, which
        either accepts (terminal, graded) or counters. ``accept`` closes the
        deal on the supplier's last offer. ``reject`` only consumes a round.
        Errors are reported through ``metadata["error"]`` rather than raised.
        An invalid move type or an unparseable price does not consume a round.
        """
        self._last_info = {}

        if self._done:
            self._last_info = {"error": "episode_done"}
            obs = self._make_obs("Episode finished. Call reset().")
            obs.done = True
            return obs

        if self._task_config is None:
            self._last_info = {"error": "not_initialized"}
            obs = self._make_obs("Environment not initialized. Call reset() first.")
            obs.done = True
            return obs

        if not isinstance(action, NegotiationAction):
            action_dict = (
                action if isinstance(action, dict) else {"move_type": "make_offer"}
            )
            action = NegotiationAction(
                move_type=action_dict.get("move_type", "make_offer"),
                terms=action_dict.get("terms", {}),
                message=action_dict.get("message", ""),
            )

        if action.move_type not in VALID_MOVES:
            self._last_info["error"] = f"invalid_move_type:{action.move_type}"
            return self._make_obs()

        # Tolerate a missing terms payload instead of failing on `in None`.
        terms = action.terms if action.terms is not None else {}

        # Validate the price before the round counter moves, so a malformed
        # offer is reported the same way as an invalid move type.
        agent_price: Optional[float] = None
        if "price" in terms:
            try:
                agent_price = float(terms["price"])
            except (TypeError, ValueError):
                self._last_info["error"] = f"invalid_price:{terms['price']!r}"
                return self._make_obs()

        self._state.round_number += 1
        self._state.step_count += 1
        round_num = self._state.round_number
        max_rounds = self._task_config["max_rounds"]

        # A concession is the buyer raising its own price versus its previous
        # priced move; any non-increase resets the streak. Moves that carry
        # no price leave the streak untouched.
        if agent_price is not None:
            if self._prev_agent_price is not None:
                if agent_price > self._prev_agent_price:
                    self._consecutive_concessions += 1
                else:
                    self._consecutive_concessions = 0
            self._prev_agent_price = agent_price
        self._state.consecutive_concessions = self._consecutive_concessions

        if action.move_type in ("make_offer", "bundle"):
            opponent_msg, opponent_terms = self._opponent.respond(
                agent_message=action.message,
                agent_terms=terms,
                round_number=round_num,
                consecutive_concessions=self._consecutive_concessions,
            )
            self._exchanges.append(
                {"role": "agent", "message": action.message, "terms": terms}
            )
            # Keys starting with "_" are opponent-internal flags such as
            # "_accepted" and are never exposed to the agent.
            public_terms = {
                k: v for k, v in opponent_terms.items() if not k.startswith("_")
            }

            if opponent_terms.get("_accepted"):
                # Log the supplier's closing message before building the
                # observation so the terminal history includes it.
                self._exchanges.append(
                    {
                        "role": "supplier",
                        "message": opponent_msg,
                        "terms": public_terms,
                    }
                )
                reward = self._close_deal(terms, round_num)
                obs = self._make_obs(supplier_message=opponent_msg)
                obs.done = True
                obs.reward = reward
                return obs

            self._last_offer = public_terms
            self._state.rapport_score = self._opponent.rapport
            self._exchanges.append(
                {"role": "supplier", "message": opponent_msg, "terms": self._last_offer}
            )

            if round_num >= max_rounds:
                self._done = True
                self._last_info["error"] = "max_rounds_reached"
                obs = self._make_obs(supplier_message=opponent_msg)
                obs.done = True
                obs.reward = 0.0
                return obs

            obs = self._make_obs(supplier_message=opponent_msg)
            obs.reward = 0.0
            return obs

        if action.move_type == "accept":
            reward = self._close_deal(self._last_offer, round_num)
            obs = self._make_obs()
            obs.done = True
            obs.reward = reward
            return obs

        # Only "reject" reaches this point: it earns nothing, and ends the
        # episode once the round budget is exhausted.
        if action.move_type == "reject" and round_num >= max_rounds:
            self._done = True
            self._last_info["error"] = "rejected_at_limit"
            obs = self._make_obs()
            obs.done = True
            obs.reward = 0.0
            return obs

        obs = self._make_obs()
        obs.reward = 0.0
        return obs

    def _close_deal(self, terms: Dict[str, Any], round_num: int) -> float:
        """Mark the episode as a completed deal on ``terms`` and return its grade.

        ``deal_price`` is written to the step metadata here, before the
        caller builds the observation, so it is present even if the
        observation model copies the metadata dict.
        """
        self._done = True
        self._state.deal_reached = True
        self._state.final_terms = terms
        reward = grade(
            self._state.task_id,
            terms,
            True,
            round_num,
            opponent_opening=self._opponent_opening_price,
            consecutive_concessions_flag=(self._consecutive_concessions >= 2),
        )
        self._state.cumulative_reward = reward
        self._last_info["deal_price"] = terms.get("price")
        return reward

    @property
    def state(self) -> NegotiationState:
        """Current episode state."""
        return self._state

    def close(self) -> None:
        """No resources to release."""
        pass

    def _make_obs(self, supplier_message: Optional[str] = None) -> NegotiationObservation:
        """Build an observation from the current state.

        The numeric rapport score is exposed only as a coarse hint:
        ``>= 0.65`` is "positive", ``<= 0.35`` is "negative", otherwise
        "neutral". Works before ``reset()`` too, in which case
        ``max_rounds`` is 0 and ``buyer_constraints`` is empty.
        """
        rapport = self._state.rapport_score
        if rapport >= 0.65:
            hint = "positive"
        elif rapport <= 0.35:
            hint = "negative"
        else:
            hint = "neutral"

        return NegotiationObservation(
            task_id=self._state.task_id or "",
            round_number=self._state.round_number,
            max_rounds=self._task_config["max_rounds"] if self._task_config else 0,
            supplier_message=supplier_message or "",
            current_offer=self._last_offer,
            last_4_exchanges=self._exchanges[-4:] if self._exchanges else [],
            buyer_constraints=self._task_config["buyer_constraints"]
            if self._task_config
            else {},
            rapport_hint=hint,
            done=self._done,
            metadata=self._last_info,
        )
|
server/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""ProcureRL environment server components."""
|
| 8 |
+
|
| 9 |
+
from .Procure_RL_environment import ProcureRLEnvironment
|
| 10 |
+
|
| 11 |
+
__all__ = ["ProcureRLEnvironment"]
|
server/app.py
ADDED
|
@@ -0,0 +1,637 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
FastAPI application for the ProcureRL Environment.
|
| 9 |
+
|
| 10 |
+
This module creates an HTTP server that exposes the ProcureRLEnvironment
|
| 11 |
+
over HTTP and WebSocket endpoints, compatible with EnvClient.
|
| 12 |
+
|
| 13 |
+
Endpoints:
|
| 14 |
+
- POST /reset: Reset the environment
|
| 15 |
+
- POST /step: Execute an action
|
| 16 |
+
- GET /state: Get current environment state
|
| 17 |
+
- GET /schema: Get action/observation schemas
|
| 18 |
+
- WS /ws: WebSocket endpoint for persistent sessions
|
| 19 |
+
|
| 20 |
+
Usage:
|
| 21 |
+
# Development (with auto-reload):
|
| 22 |
+
uvicorn server.app:app --reload --host 0.0.0.0 --port 7860
|
| 23 |
+
|
| 24 |
+
# Production:
|
| 25 |
+
uvicorn server.app:app --host 0.0.0.0 --port 7860 --workers 4
|
| 26 |
+
|
| 27 |
+
# Or run directly:
|
| 28 |
+
python -m server.app
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
import sys
|
| 32 |
+
import os
|
| 33 |
+
import json
|
| 34 |
+
|
| 35 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 36 |
+
|
| 37 |
+
try:
|
| 38 |
+
from openenv.core.env_server.http_server import create_app
|
| 39 |
+
except Exception as e:
|
| 40 |
+
raise ImportError(
|
| 41 |
+
"openenv is required for the web interface. Install dependencies with '\n uv sync\n'"
|
| 42 |
+
) from e
|
| 43 |
+
|
| 44 |
+
import gradio as gr
|
| 45 |
+
from models import NegotiationAction, NegotiationObservation, NegotiationState
|
| 46 |
+
from server.Procure_RL_environment import ProcureRLEnvironment
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
_env_instance = ProcureRLEnvironment()
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def build_custom_gradio_ui(
|
| 53 |
+
web_manager,
|
| 54 |
+
action_fields,
|
| 55 |
+
metadata,
|
| 56 |
+
is_chat_env,
|
| 57 |
+
title,
|
| 58 |
+
quick_start_md,
|
| 59 |
+
):
|
| 60 |
+
"""Custom Gradio UI with interactive negotiation simulation."""
|
| 61 |
+
|
| 62 |
+
readme_content = _load_readme_content(metadata)
|
| 63 |
+
display_title = metadata.name if metadata else title
|
| 64 |
+
|
| 65 |
+
# Example actions for the Example tab
|
| 66 |
+
EXAMPLE_1 = {
|
| 67 |
+
"move_type": "make_offer",
|
| 68 |
+
"terms": {"price": 48000},
|
| 69 |
+
"message": "I value our partnership and believe we can reach a fair agreement together. Let's work collaboratively to find a solution.",
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
EXAMPLE_2 = {
|
| 73 |
+
"move_type": "make_offer",
|
| 74 |
+
"terms": {"price": 45000},
|
| 75 |
+
"message": "We appreciate your flexibility. Here's our counter-offer to move us closer to a mutual agreement.",
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
# Agent strategies for auto-play
|
| 79 |
+
AGENT_STRATEGY = [
|
| 80 |
+
(
|
| 81 |
+
"make_offer",
|
| 82 |
+
{"price": 48000},
|
| 83 |
+
"I value our partnership and believe we can reach a fair agreement together.",
|
| 84 |
+
),
|
| 85 |
+
(
|
| 86 |
+
"make_offer",
|
| 87 |
+
{"price": 46000},
|
| 88 |
+
"I appreciate your movement. Let's see if we can meet in the middle.",
|
| 89 |
+
),
|
| 90 |
+
(
|
| 91 |
+
"make_offer",
|
| 92 |
+
{"price": 44000},
|
| 93 |
+
"We're getting closer. I think we can finalize this at a fair price for both parties.",
|
| 94 |
+
),
|
| 95 |
+
(
|
| 96 |
+
"make_offer",
|
| 97 |
+
{"price": 42000},
|
| 98 |
+
"I believe we've found a good deal. Let's accept these terms.",
|
| 99 |
+
),
|
| 100 |
+
("accept", {}, ""),
|
| 101 |
+
]
|
| 102 |
+
|
| 103 |
+
async def reset_env(task_id, seed):
|
| 104 |
+
try:
|
| 105 |
+
data = await web_manager.reset_environment(
|
| 106 |
+
{"task_id": task_id, "seed": int(seed)}
|
| 107 |
+
)
|
| 108 |
+
obs_d = _format_observation_full(data)
|
| 109 |
+
conv_h = _build_conversation_hist([])
|
| 110 |
+
price_d = _build_price_display(0, 52000, 36000, 52000)
|
| 111 |
+
status = "✅ Reset successful! Make your offer."
|
| 112 |
+
json_d = json.dumps(data, indent=2)
|
| 113 |
+
return obs_d, conv_h, price_d, status, json_d
|
| 114 |
+
except Exception as e:
|
| 115 |
+
return f"Error: {e}", "", "", f"Error: {e}", ""
|
| 116 |
+
|
| 117 |
+
async def step_manual(move_type, terms_str, message, conversation_state):
|
| 118 |
+
try:
|
| 119 |
+
terms = json.loads(terms_str) if terms_str.strip() else {}
|
| 120 |
+
action_data = {"move_type": move_type, "terms": terms, "message": message}
|
| 121 |
+
data = await web_manager.step_environment(action_data)
|
| 122 |
+
|
| 123 |
+
# Update conversation
|
| 124 |
+
new_conv = conversation_state.copy() if conversation_state else []
|
| 125 |
+
new_conv.append(
|
| 126 |
+
{
|
| 127 |
+
"role": "you",
|
| 128 |
+
"message": message or f"[{move_type}: {terms}]",
|
| 129 |
+
"terms": terms,
|
| 130 |
+
}
|
| 131 |
+
)
|
| 132 |
+
if not data.get("observation", {}).get("done"):
|
| 133 |
+
supplier_msg = data.get("observation", {}).get("supplier_message", "")
|
| 134 |
+
new_conv.append(
|
| 135 |
+
{
|
| 136 |
+
"role": "supplier",
|
| 137 |
+
"message": supplier_msg,
|
| 138 |
+
"terms": data.get("observation", {}).get("current_offer", {}),
|
| 139 |
+
}
|
| 140 |
+
)
|
| 141 |
+
|
| 142 |
+
# Get price info for chart
|
| 143 |
+
obs = data.get("observation", {})
|
| 144 |
+
current_price = obs.get("current_offer", {}).get("price", 0)
|
| 145 |
+
opponent_opening = 52000 # Will be extracted from state
|
| 146 |
+
|
| 147 |
+
reward = obs.get("reward")
|
| 148 |
+
done = obs.get("done", False)
|
| 149 |
+
status_msg = f"Step complete! Round {obs.get('round_number', 0)}/{obs.get('max_rounds', 6)}"
|
| 150 |
+
if done and reward is not None:
|
| 151 |
+
status_msg = f"🏁 Deal done! Final score: {reward:.4f}"
|
| 152 |
+
elif done:
|
| 153 |
+
status_msg = "❌ No deal reached."
|
| 154 |
+
|
| 155 |
+
obs_display = _format_observation_full(data)
|
| 156 |
+
conv_hist = _build_conversation_hist(new_conv)
|
| 157 |
+
price_disp = _build_price_display(
|
| 158 |
+
obs.get("round_number", 0), current_price, 36000, 52000
|
| 159 |
+
)
|
| 160 |
+
json_data = json.dumps(data, indent=2)
|
| 161 |
+
|
| 162 |
+
return obs_display, conv_hist, price_disp, status_msg, json_data
|
| 163 |
+
except json.JSONDecodeError:
|
| 164 |
+
return "", "", "", "❌ Invalid JSON in terms field", ""
|
| 165 |
+
except Exception as e:
|
| 166 |
+
return "", "", "", f"Error: {e}", f"Error: {str(e)}"
|
| 167 |
+
|
| 168 |
+
async def run_agent_example(task_id="single_issue", seed=42):
    """Play the scripted ``AGENT_STRATEGY`` against a freshly reset environment.

    The environment is reset with ``task_id`` and ``seed``, then each
    ``(move_type, terms, message)`` entry of ``AGENT_STRATEGY`` is submitted
    in order until the observation reports ``done`` or the script runs out.

    Returns:
        A 3-tuple ``(result_markdown, raw_json, status)`` matching the three
        output components the "Run Agent Demo" button is wired to. On failure
        the same 3-tuple shape is returned with the error text in the first
        and last slots.
    """
    try:
        # Reset first so the demo always starts from a known state.
        await web_manager.reset_environment({"task_id": task_id, "seed": seed})

        conv = []
        steps_log = []
        price_points = []
        # Bound up front so the final json.dumps cannot hit an unbound name
        # when AGENT_STRATEGY is empty.
        data = {}

        for i, (move_type, terms, message) in enumerate(AGENT_STRATEGY):
            data = await web_manager.step_environment(
                {"move_type": move_type, "terms": terms, "message": message}
            )
            obs = data.get("observation", {})

            current_price = obs.get("current_offer", {}).get("price", 0)
            price_points.append(current_price)

            conv.append(
                {
                    "role": "you",
                    "message": message or f"[{move_type}: {terms}]",
                    "terms": terms,
                }
            )
            steps_log.append(
                f"**Step {i + 1}:** `{move_type}` → ${current_price:,.0f}"
            )

            done = obs.get("done")
            if done:
                reward = obs.get("reward")
                if reward is None:
                    # An episode can end without a reward (no deal); formatting
                    # None with ':.4f' would raise and discard the whole demo.
                    steps_log.append("❌ No deal reached.")
                else:
                    steps_log.append(
                        f"✅ Deal completed! Reward: **{reward:.4f}**"
                    )

            # The supplier's reply is recorded for every step, terminal or not.
            conv.append(
                {
                    "role": "supplier",
                    "message": obs.get("supplier_message", ""),
                    "terms": obs.get("current_offer", {}),
                }
            )

            if done:
                break

        return (
            _build_agent_demo_result(steps_log, conv, price_points),
            json.dumps(data, indent=2),
            "✅ Agent demo complete!",
        )
    except Exception as e:
        # UI boundary: report the failure instead of crashing the handler.
        # Exactly three values, one per wired output component.
        return f"Error: {e}", "", f"Error: {e}"
|
| 229 |
+
|
| 230 |
+
def apply_example(example_data):
    """Unpack an example record into ``(move_type, terms_json, message)``.

    ``terms`` is serialised with ``json.dumps`` so it can be placed in a
    text field; the other two values are passed through unchanged.
    """
    move = example_data["move_type"]
    terms_json = json.dumps(example_data["terms"])
    note = example_data["message"]
    return move, terms_json, note
|
| 236 |
+
|
| 237 |
+
def _format_observation_full(data):
    """Format an environment response as rich Markdown.

    Args:
        data: Response dict. The observation is read from its
            ``"observation"`` key when present, otherwise ``data`` itself is
            treated as the observation.

    Returns:
        A Markdown string, or ``"No data"`` when ``data`` is empty. For a
        finished episode only the header and (if available) the final score
        are shown; otherwise the supplier message, the current offer and the
        buyer's constraints are listed.
    """
    if not data:
        return "No data"

    def _money(value):
        # The ',' format spec is only valid for numbers. The previous inline
        # f-string applied it to the 'N/A' fallback and raised ValueError
        # whenever a constraint lacked 'target' or 'worst'.
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return f"${value:,}"
        return "N/A"

    obs = data.get("observation", data)
    rapport = obs.get("rapport_hint", "neutral")
    lines = [
        f"## 🎯 Round {obs.get('round_number', 0)}/{obs.get('max_rounds', 6)}",
        f"**Task:** `{obs.get('task_id', '')}`",
        f"**Rapport:** {_get_rapport_emoji(rapport)} {rapport}",
    ]

    if obs.get("done"):
        lines.append("\n### 🏁 Episode Complete!")
        r = obs.get("reward")
        if r is not None:
            lines.append(f"**Final Score:** `{r:.4f}`")
        return "\n".join(lines)

    lines.append("\n### 💬 Supplier says:")
    lines.append(f"> {obs.get('supplier_message', '')}")

    offer = obs.get("current_offer", {})
    if offer:
        lines.append("\n### 📋 Current Offer:")
        for k, v in offer.items():
            # Floats get thousands separators and two decimals; anything else
            # is shown verbatim.
            shown = f"{v:,.2f}" if isinstance(v, float) else f"{v}"
            lines.append(f"- **{k.title()}:** `{shown}`")

    constraints = obs.get("buyer_constraints", {})
    if constraints:
        lines.append("\n### 🎯 Your Targets:")
        for k, v in constraints.items():
            # Only dict-valued constraints carry target/worst bounds.
            if isinstance(v, dict):
                lines.append(
                    f"- **{k.title()}:** target `{_money(v.get('target'))}`"
                    f" | worst `{_money(v.get('worst'))}`"
                )
    return "\n".join(lines)
|
| 273 |
+
|
| 274 |
+
def _get_rapport_emoji(rapport):
|
| 275 |
+
if rapport == "positive":
|
| 276 |
+
return "😊"
|
| 277 |
+
elif rapport == "negative":
|
| 278 |
+
return "😤"
|
| 279 |
+
return "😐"
|
| 280 |
+
|
| 281 |
+
def _build_conversation_hist(conv):
|
| 282 |
+
"""Build conversation history HTML."""
|
| 283 |
+
if not conv:
|
| 284 |
+
return "**Conversation will appear here...**\n\nMake your first offer to start the negotiation!"
|
| 285 |
+
lines = ["## 💬 Conversation History\n"]
|
| 286 |
+
for msg in conv:
|
| 287 |
+
if msg["role"] == "you":
|
| 288 |
+
lines.append(f"**🧑 You:** {msg['message']}")
|
| 289 |
+
if msg.get("terms"):
|
| 290 |
+
lines.append(f" → Terms: `{json.dumps(msg['terms'])}`")
|
| 291 |
+
else:
|
| 292 |
+
lines.append(f"**🏪 Supplier:** {msg['message']}")
|
| 293 |
+
return "\n".join(lines)
|
| 294 |
+
|
| 295 |
+
def _build_price_display(round_num, current_price, target, opening):
|
| 296 |
+
"""Build price tracker display."""
|
| 297 |
+
range_price = opening - target
|
| 298 |
+
progress = (
|
| 299 |
+
((opening - current_price) / range_price * 100) if range_price > 0 else 0
|
| 300 |
+
)
|
| 301 |
+
progress = max(0, min(100, progress))
|
| 302 |
+
bar = "█" * int(progress / 5) + "░" * (20 - int(progress / 5))
|
| 303 |
+
lines = [
|
| 304 |
+
f"## 📊 Price Tracker\n",
|
| 305 |
+
f"Opening: `${opening:,.0f}`",
|
| 306 |
+
f"Target: `${target:,.0f}`",
|
| 307 |
+
f"Current: `${current_price:,.0f}`",
|
| 308 |
+
f"\n**Progress:** `{progress:.1f}%`",
|
| 309 |
+
f"\n[{bar}]",
|
| 310 |
+
]
|
| 311 |
+
return "\n".join(lines)
|
| 312 |
+
|
| 313 |
+
def _build_agent_demo_result(steps_log, conv, price_points):
|
| 314 |
+
"""Build agent demo result display."""
|
| 315 |
+
lines = ["## 🤖 Agent Negotiation Demo\n"]
|
| 316 |
+
lines.append("Watch how a strategic agent negotiates:\n")
|
| 317 |
+
lines.append("### 📜 Steps:")
|
| 318 |
+
lines.extend(steps_log)
|
| 319 |
+
lines.append("\n### 💬 Full Conversation:")
|
| 320 |
+
for msg in conv:
|
| 321 |
+
if msg["role"] == "you":
|
| 322 |
+
lines.append(f"**🧑 You:** {msg['message']}")
|
| 323 |
+
else:
|
| 324 |
+
lines.append(f"**🏪 Supplier:** {msg['message']}")
|
| 325 |
+
if price_points:
|
| 326 |
+
lines.append(f"\n### 📈 Price Journey:")
|
| 327 |
+
lines.append(f"`{' → '.join(f'${p:,.0f}' for p in price_points)}`")
|
| 328 |
+
return "\n".join(lines)
|
| 329 |
+
|
| 330 |
+
with gr.Blocks(title=display_title) as demo:
|
| 331 |
+
gr.Markdown(f"# 🤝 {display_title}")
|
| 332 |
+
gr.Markdown("### Interactive Procurement Negotiation Simulation")
|
| 333 |
+
|
| 334 |
+
with gr.Tabs() as tabs:
|
| 335 |
+
with gr.TabItem("🎮 Play Now"):
|
| 336 |
+
"""Interactive tab where user plays against the opponent."""
|
| 337 |
+
with gr.Row():
|
| 338 |
+
with gr.Column(scale=2):
|
| 339 |
+
conversation_display = gr.Markdown(
|
| 340 |
+
"*Click Reset to start a new negotiation!*"
|
| 341 |
+
)
|
| 342 |
+
price_tracker = gr.Markdown(
|
| 343 |
+
"## 📊 Price Tracker\n*Reset to see price tracker*"
|
| 344 |
+
)
|
| 345 |
+
obs_display = gr.Markdown("*Reset to see current state*")
|
| 346 |
+
with gr.Column(scale=1):
|
| 347 |
+
gr.Markdown("### ⚙️ Controls")
|
| 348 |
+
task_dropdown = gr.Dropdown(
|
| 349 |
+
choices=["single_issue", "multi_issue", "adversarial"],
|
| 350 |
+
value="single_issue",
|
| 351 |
+
label="Task",
|
| 352 |
+
info="Choose which negotiation scenario",
|
| 353 |
+
)
|
| 354 |
+
seed_input = gr.Number(
|
| 355 |
+
value=42,
|
| 356 |
+
label="Seed",
|
| 357 |
+
info="Random seed for reproducibility",
|
| 358 |
+
)
|
| 359 |
+
move_type_input = gr.Textbox(
|
| 360 |
+
label="Move Type",
|
| 361 |
+
info="make_offer | accept | reject | bundle",
|
| 362 |
+
value="make_offer",
|
| 363 |
+
)
|
| 364 |
+
terms_input = gr.Textbox(
|
| 365 |
+
label="Terms (JSON)",
|
| 366 |
+
info='Example: {"price": 45000}',
|
| 367 |
+
value='{"price": 48000}',
|
| 368 |
+
)
|
| 369 |
+
message_input = gr.Textbox(
|
| 370 |
+
label="Your Message",
|
| 371 |
+
info="Be collaborative for better rapport!",
|
| 372 |
+
value="I value our partnership and believe we can reach a fair agreement.",
|
| 373 |
+
lines=2,
|
| 374 |
+
)
|
| 375 |
+
|
| 376 |
+
gr.Markdown("**💡 Quick Examples:**")
|
| 377 |
+
with gr.Row():
|
| 378 |
+
eg1_btn = gr.Button(
|
| 379 |
+
"😊 Friendly", variant="secondary", size="sm"
|
| 380 |
+
)
|
| 381 |
+
eg2_btn = gr.Button(
|
| 382 |
+
"💼 Professional", variant="secondary", size="sm"
|
| 383 |
+
)
|
| 384 |
+
eg3_btn = gr.Button(
|
| 385 |
+
"⚡ Counter-Offer", variant="secondary", size="sm"
|
| 386 |
+
)
|
| 387 |
+
|
| 388 |
+
with gr.Row():
|
| 389 |
+
step_btn = gr.Button("📤 Submit Offer", variant="primary")
|
| 390 |
+
accept_btn = gr.Button("✅ Accept Deal", variant="primary")
|
| 391 |
+
reset_btn = gr.Button("🔄 Reset", variant="secondary")
|
| 392 |
+
|
| 393 |
+
status_output = gr.Textbox(
|
| 394 |
+
label="Status", interactive=False, lines=1
|
| 395 |
+
)
|
| 396 |
+
|
| 397 |
+
with gr.Accordion("📋 Raw JSON", open=False):
|
| 398 |
+
raw_json = gr.Code(
|
| 399 |
+
label="", language="json", interactive=False, lines=10
|
| 400 |
+
)
|
| 401 |
+
|
| 402 |
+
# Example messages for quick fill
|
| 403 |
+
FRIENDLY_EX = (
|
| 404 |
+
"make_offer",
|
| 405 |
+
'{"price": 48000}',
|
| 406 |
+
"I truly value our partnership and believe we can find a fair solution that benefits both parties. I'm flexible and want to work with you.",
|
| 407 |
+
)
|
| 408 |
+
PROF_EX = (
|
| 409 |
+
"make_offer",
|
| 410 |
+
'{"price": 46000}',
|
| 411 |
+
"Based on market research and our long-term relationship potential, I believe $46,000 is a fair price. What do you think?",
|
| 412 |
+
)
|
| 413 |
+
COUNTER_EX = (
|
| 414 |
+
"make_offer",
|
| 415 |
+
'{"price": 44000}',
|
| 416 |
+
"We've made good progress. I can meet you at $44,000 if you can agree to these terms today.",
|
| 417 |
+
)
|
| 418 |
+
|
| 419 |
+
def get_friendly():
|
| 420 |
+
return FRIENDLY_EX[0], FRIENDLY_EX[1], FRIENDLY_EX[2]
|
| 421 |
+
|
| 422 |
+
def get_prof():
|
| 423 |
+
return PROF_EX[0], PROF_EX[1], PROF_EX[2]
|
| 424 |
+
|
| 425 |
+
def get_counter():
|
| 426 |
+
return COUNTER_EX[0], COUNTER_EX[1], COUNTER_EX[2]
|
| 427 |
+
|
| 428 |
+
eg1_btn.click(
|
| 429 |
+
fn=get_friendly,
|
| 430 |
+
outputs=[move_type_input, terms_input, message_input],
|
| 431 |
+
)
|
| 432 |
+
eg2_btn.click(
|
| 433 |
+
fn=get_prof,
|
| 434 |
+
outputs=[move_type_input, terms_input, message_input],
|
| 435 |
+
)
|
| 436 |
+
eg3_btn.click(
|
| 437 |
+
fn=get_counter,
|
| 438 |
+
outputs=[move_type_input, terms_input, message_input],
|
| 439 |
+
)
|
| 440 |
+
|
| 441 |
+
async def do_reset(task_id, seed):
|
| 442 |
+
return await reset_env(task_id, seed)
|
| 443 |
+
|
| 444 |
+
reset_btn.click(
|
| 445 |
+
fn=do_reset,
|
| 446 |
+
inputs=[task_dropdown, seed_input],
|
| 447 |
+
outputs=[
|
| 448 |
+
conversation_display,
|
| 449 |
+
price_tracker,
|
| 450 |
+
obs_display,
|
| 451 |
+
status_output,
|
| 452 |
+
raw_json,
|
| 453 |
+
],
|
| 454 |
+
)
|
| 455 |
+
|
| 456 |
+
async def do_step(mt, ts, msg):
|
| 457 |
+
return await step_manual(mt, ts, msg, [])
|
| 458 |
+
|
| 459 |
+
step_btn.click(
|
| 460 |
+
fn=do_step,
|
| 461 |
+
inputs=[move_type_input, terms_input, message_input],
|
| 462 |
+
outputs=[
|
| 463 |
+
obs_display,
|
| 464 |
+
conversation_display,
|
| 465 |
+
price_tracker,
|
| 466 |
+
status_output,
|
| 467 |
+
raw_json,
|
| 468 |
+
],
|
| 469 |
+
)
|
| 470 |
+
|
| 471 |
+
async def do_accept():
|
| 472 |
+
return await step_manual("accept", "{}", "", [])
|
| 473 |
+
|
| 474 |
+
accept_btn.click(
|
| 475 |
+
fn=do_accept,
|
| 476 |
+
outputs=[
|
| 477 |
+
obs_display,
|
| 478 |
+
conversation_display,
|
| 479 |
+
price_tracker,
|
| 480 |
+
status_output,
|
| 481 |
+
raw_json,
|
| 482 |
+
],
|
| 483 |
+
)
|
| 484 |
+
|
| 485 |
+
with gr.TabItem("🤖 Watch Agent"):
|
| 486 |
+
"""Example tab showing agent negotiation demo."""
|
| 487 |
+
gr.Markdown("### Watch a Strategic Agent Negotiate")
|
| 488 |
+
gr.Markdown(
|
| 489 |
+
"This demo shows how an LLM agent would approach the negotiation with collaborative language and strategic pricing."
|
| 490 |
+
)
|
| 491 |
+
with gr.Row():
|
| 492 |
+
task_selector = gr.Dropdown(
|
| 493 |
+
choices=["single_issue", "multi_issue", "adversarial"],
|
| 494 |
+
value="single_issue",
|
| 495 |
+
label="Select Task",
|
| 496 |
+
)
|
| 497 |
+
run_btn = gr.Button(
|
| 498 |
+
"▶️ Run Agent Demo", variant="primary", size="lg"
|
| 499 |
+
)
|
| 500 |
+
|
| 501 |
+
agent_result = gr.Markdown(
|
| 502 |
+
"*Click 'Run Agent Demo' to watch the agent negotiate*"
|
| 503 |
+
)
|
| 504 |
+
agent_json = gr.Code(
|
| 505 |
+
label="Full JSON", language="json", interactive=False, lines=15
|
| 506 |
+
)
|
| 507 |
+
agent_status = gr.Textbox(label="Status", interactive=False)
|
| 508 |
+
|
| 509 |
+
async def do_agent_run(tid):
|
| 510 |
+
return await run_agent_example(tid, 42)
|
| 511 |
+
|
| 512 |
+
run_btn.click(
|
| 513 |
+
fn=do_agent_run,
|
| 514 |
+
inputs=[task_selector],
|
| 515 |
+
outputs=[agent_result, agent_json, agent_status],
|
| 516 |
+
)
|
| 517 |
+
|
| 518 |
+
with gr.TabItem("📖 Instructions"):
|
| 519 |
+
"""Instructions tab."""
|
| 520 |
+
gr.Markdown("""
|
| 521 |
+
## 🎮 How to Play
|
| 522 |
+
|
| 523 |
+
### 1. Choose Your Task
|
| 524 |
+
- **single_issue**: Negotiate only the price (easiest)
|
| 525 |
+
- **multi_issue**: Negotiate price + payment terms (medium)
|
| 526 |
+
- **adversarial**: Negotiate price + payment + support (hardest)
|
| 527 |
+
|
| 528 |
+
### 2. Make Offers
|
| 529 |
+
- **Move Type**: `make_offer` to propose terms, `accept` to take current deal, `reject` to walk away
|
| 530 |
+
- **Terms**: JSON with your offered price (and payment_days for multi_issue/adversarial)
|
| 531 |
+
- **Message**: Be collaborative! Use words like "partnership", "mutual", "flexible" to increase rapport
|
| 532 |
+
|
| 533 |
+
### 3. Watch the Response
|
| 534 |
+
- The supplier will counter-offer or accept
|
| 535 |
+
- Your **rapport** changes based on your language quality
|
| 536 |
+
- Higher rapport → opponent gives better concessions
|
| 537 |
+
|
| 538 |
+
### 4. Goal
|
| 539 |
+
- Get the price as close to your target (shown in observations) as possible
|
| 540 |
+
- Use fewer rounds for a better efficiency score
|
| 541 |
+
- **Don't make 2+ consecutive concessions** in adversarial mode!
|
| 542 |
+
|
| 543 |
+
## 🎯 Quick Tips
|
| 544 |
+
|
| 545 |
+
| Do | Don't |
|
| 546 |
+
|---|---|
|
| 547 |
+
| Use collaborative language | Use aggressive language ("final offer", "ultimatum") |
|
| 548 |
+
| Make strategic concessions | Concede every round (adversarial mode) |
|
| 549 |
+
| Offer Net-30 payment (multi_issue) | Ignore payment terms |
|
| 550 |
+
| Accept when terms are good | Wait until max rounds |
|
| 551 |
+
|
| 552 |
+
## 🤖 Agent Demo
|
| 553 |
+
The "Watch Agent" tab shows how a strategic agent negotiates step-by-step.
|
| 554 |
+
""")
|
| 555 |
+
|
| 556 |
+
# Quick Start and README accordions
|
| 557 |
+
with gr.Accordion("📘 Quick Start Guide", open=False):
|
| 558 |
+
if quick_start_md:
|
| 559 |
+
gr.Markdown(quick_start_md)
|
| 560 |
+
with gr.Accordion("📚 Full README", open=False):
|
| 561 |
+
gr.Markdown(readme_content)
|
| 562 |
+
|
| 563 |
+
return demo
|
| 564 |
+
|
| 565 |
+
|
| 566 |
+
def _load_readme_content(metadata):
|
| 567 |
+
"""Load README content from metadata or filesystem."""
|
| 568 |
+
if metadata and hasattr(metadata, "readme_content") and metadata.readme_content:
|
| 569 |
+
return metadata.readme_content
|
| 570 |
+
try:
|
| 571 |
+
from pathlib import Path
|
| 572 |
+
|
| 573 |
+
readme_path = Path("/app/README.md")
|
| 574 |
+
if readme_path.exists():
|
| 575 |
+
return readme_path.read_text(encoding="utf-8")
|
| 576 |
+
except:
|
| 577 |
+
pass
|
| 578 |
+
return "No README available."
|
| 579 |
+
|
| 580 |
+
|
| 581 |
+
def _format_observation(data):
|
| 582 |
+
"""Format observation as markdown for display."""
|
| 583 |
+
if not data:
|
| 584 |
+
return "No data"
|
| 585 |
+
|
| 586 |
+
obs = data.get("observation", data)
|
| 587 |
+
lines = []
|
| 588 |
+
|
| 589 |
+
task_id = obs.get("task_id", "")
|
| 590 |
+
round_num = obs.get("round_number", 0)
|
| 591 |
+
max_rounds = obs.get("max_rounds", 0)
|
| 592 |
+
done = obs.get("done", False)
|
| 593 |
+
reward = obs.get("reward")
|
| 594 |
+
|
| 595 |
+
lines.append(f"### Round {round_num}/{max_rounds}")
|
| 596 |
+
lines.append(f"**Task:** {task_id}")
|
| 597 |
+
lines.append(f"**Done:** {done}")
|
| 598 |
+
|
| 599 |
+
if reward is not None:
|
| 600 |
+
lines.append(f"**Reward:** {reward:.4f}")
|
| 601 |
+
|
| 602 |
+
supplier_msg = obs.get("supplier_message", "")
|
| 603 |
+
if supplier_msg:
|
| 604 |
+
lines.append(f"\n**Supplier:** {supplier_msg}")
|
| 605 |
+
|
| 606 |
+
current_offer = obs.get("current_offer", {})
|
| 607 |
+
if current_offer:
|
| 608 |
+
lines.append(f"\n**Current Offer:** {json.dumps(current_offer)}")
|
| 609 |
+
|
| 610 |
+
rapport = obs.get("rapport_hint", "neutral")
|
| 611 |
+
lines.append(f"\n**Rapport:** {rapport}")
|
| 612 |
+
|
| 613 |
+
return "\n".join(lines)
|
| 614 |
+
|
| 615 |
+
|
| 616 |
+
# Application object built by create_app from the shared environment instance,
# the action/observation models and the custom Gradio UI builder.
app = create_app(
    lambda: _env_instance,
    NegotiationAction,
    NegotiationObservation,
    env_name="ProcureRL",
    max_concurrent_envs=1,
    gradio_builder=build_custom_gradio_ui,
)


if __name__ == "__main__":
    # Direct execution: serve on $PORT, falling back to 7860.
    listen_port = int(os.getenv("PORT", 7860))
    import uvicorn

    uvicorn.run("server.app:app", host="0.0.0.0", port=listen_port)
|
| 631 |
+
|
| 632 |
+
|
| 633 |
+
def main():
    """Entry point: serve ``server.app:app`` with uvicorn on ``$PORT`` (default 7860)."""
    import uvicorn

    uvicorn.run(
        "server.app:app",
        host="0.0.0.0",
        port=int(os.getenv("PORT", 7860)),
    )
|
server/requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv[core]>=0.2.0
|
| 2 |
+
fastapi>=0.115.0
|
| 3 |
+
uvicorn>=0.24.0
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
|
test_calibration.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# test_calibration.py
|
| 3 |
+
import sys
|
| 4 |
+
|
| 5 |
+
sys.path.insert(0, ".")
|
| 6 |
+
|
| 7 |
+
from server.Procure_RL_environment import ProcureRLEnvironment
|
| 8 |
+
from models import NegotiationAction
|
| 9 |
+
import random
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def run_random_agent(task_id, seed=42):
    """Simulate a naive agent that offers uniformly random terms.

    Up to 15 random offers are made, each issue drawn from a fixed per-task
    range; if the episode is still open afterwards a final ``accept`` is
    sent. Returns the terminal reward, or ``0.0`` when it is missing/falsy.
    """
    env = ProcureRLEnvironment()
    obs = env.reset(seed=seed, task_id=task_id)
    # Offset the seed so the agent's draws use a different seed than the
    # one passed to the environment reset.
    rng = random.Random(seed + 1)

    config = {
        "single_issue": {"price": (38000, 52000)},
        "multi_issue": {"price": (40000, 58000), "payment_days": (30, 90)},
        "adversarial": {
            "price": (80000, 120000),
            "payment_days": (30, 90),
            "support_hours": (80, 200),
        },
    }
    issue_ranges = config[task_id]

    for _ in range(15):
        terms = {issue: rng.uniform(lo, hi) for issue, (lo, hi) in issue_ranges.items()}
        obs = env.step(
            NegotiationAction(
                move_type="make_offer", terms=terms, message="Here is my offer."
            )
        )
        if obs.done:
            return obs.reward or 0.0

    # Force accept at end
    final_obs = env.step(NegotiationAction(move_type="accept", terms={}, message=""))
    return final_obs.reward or 0.0
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def run_good_agent(task_id, seed=42):
    """Simulate a strategic agent: collaborative wording plus an adaptive price.

    The price target is a fixed discount off the supplier's opening price
    (22% for ``single_issue``, 20% otherwise) but never less than 5% above
    the opponent's floor. The same offer is repeated for up to 10 rounds,
    then a final ``accept`` is sent. Returns the terminal reward, or ``0.0``
    when it is missing/falsy.
    """
    env = ProcureRLEnvironment()
    obs = env.reset(seed=seed, task_id=task_id)

    # Anchor on the supplier's opening offer.
    opening_price = obs.current_offer.get("price", 52000)
    # Opponent's floor, read from the opponent object; falls back to 80% of
    # the opening price when the attribute is absent.
    floor = getattr(env._opponent, "price_floor", opening_price * 0.80)

    discount = 0.78 if task_id == "single_issue" else 0.80
    target_price = max(opening_price * discount, floor * 1.05)

    targets = {"price": target_price}
    if task_id == "multi_issue":
        targets["payment_days"] = 45
    elif task_id != "single_issue":  # adversarial
        targets["payment_days"] = 50
        targets["support_hours"] = 160

    pitch = "I value our partnership and believe this offer reflects fair market value for both parties. I'm flexible and want to find a solution that works for us both."
    for _ in range(10):
        obs = env.step(
            NegotiationAction(move_type="make_offer", terms=targets, message=pitch)
        )
        if obs.done:
            return obs.reward or 0.0

    final_obs = env.step(NegotiationAction(move_type="accept", terms={}, message=""))
    return final_obs.reward or 0.0
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# Compare the random and strategic agents over five seeds per task and flag
# tasks whose score spread looks suspicious.
print("=== Score Spread Calibration ===")
for task in ("single_issue", "multi_issue", "adversarial"):
    random_scores = [run_random_agent(task, seed=i) for i in range(5)]
    good_scores = [run_good_agent(task, seed=i) for i in range(5)]

    random_avg = sum(random_scores) / len(random_scores)
    good_avg = sum(good_scores) / len(good_scores)
    spread = good_avg - random_avg

    random_rounded = [round(s, 3) for s in random_scores]
    good_rounded = [round(s, 3) for s in good_scores]

    print(f"\n{task}:")
    print(f" Random agent: {random_rounded} avg={random_avg:.3f}")
    print(f" Strategic agent: {good_rounded} avg={good_avg:.3f}")
    print(f" Spread: {spread:.3f}")

    if spread < 0.05:
        verdict = " ⚠️ WARNING: spread too small — environment may be trivial or broken"
    elif good_avg < 0.10:
        verdict = " ⚠️ WARNING: even good agent scores very low — too hard"
    else:
        verdict = " ✅ Score spread looks healthy"
    print(verdict)
|
test_graders.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# test_graders.py
|
| 3 |
+
import sys
|
| 4 |
+
|
| 5 |
+
sys.path.insert(0, ".")
|
| 6 |
+
|
| 7 |
+
from graders import grade_single_issue, grade_multi_issue, grade_adversarial
|
| 8 |
+
|
| 9 |
+
# Smoke-print each grader on hand-picked scenarios, then check that a
# representative set of scores stays inside [0.0, 1.0].
sections = [
    (
        "=== single_issue grader ===",
        grade_single_issue,
        [
            # Perfect deal — should be near 1.0
            ("Perfect:", ({"price": 38000}, True, 1, 6)),
            # Worst acceptable deal
            ("Worst deal:", ({"price": 44000}, True, 6, 6)),
            # No deal
            ("No deal:", ({}, False, 0, 6)),
            # Late deal — efficiency penalty
            ("Late deal:", ({"price": 40000}, True, 5, 6)),
            # Boundary — price above floor
            ("Above floor:", ({"price": 46000}, True, 3, 6)),
        ],
    ),
    (
        "\n=== multi_issue grader ===",
        grade_multi_issue,
        [
            # Best possible
            ("Best:", ({"price": 40000, "payment_days": 30}, True, 1, 8)),
            # Price good, payment bad
            ("Price only:", ({"price": 40000, "payment_days": 90}, True, 4, 8)),
            # Payment good, price bad
            ("Payment only:", ({"price": 58000, "payment_days": 30}, True, 4, 8)),
            # No deal
            ("No deal:", ({}, False, 0, 8)),
        ],
    ),
    (
        "\n=== adversarial grader ===",
        grade_adversarial,
        [
            # Best possible
            (
                "Best:",
                ({"price": 80000, "payment_days": 30, "support_hours": 200}, True, 1, False, 10),
            ),
            # Survival floor — bad deal still completed
            (
                "Bad deal (floor):",
                ({"price": 120000, "payment_days": 90, "support_hours": 80}, True, 10, True, 10),
            ),
            # No deal
            ("No deal:", ({}, False, 0, False, 10)),
            # Consecutive concession penalty
            (
                "Pattern penalty:",
                ({"price": 90000, "payment_days": 60, "support_hours": 140}, True, 5, True, 10),
            ),
        ],
    ),
]

for header, grader, cases in sections:
    print(header)
    for label, args in cases:
        print(label, grader(*args))

print("\n=== Verify all scores are in [0.0, 1.0] ===")
all_scores = [
    grade_single_issue({"price": 38000}, True, 1, 6),
    grade_single_issue({"price": 44000}, True, 6, 6),
    grade_single_issue({}, False, 0, 6),
    grade_multi_issue({"price": 40000, "payment_days": 30}, True, 1, 8),
    grade_multi_issue({}, False, 0, 8),
    grade_adversarial(
        {"price": 80000, "payment_days": 30, "support_hours": 200}, True, 1, False, 10
    ),
    grade_adversarial({}, False, 0, False, 10),
]
print("All scores:", all_scores)
in_range = all(0.0 <= s <= 1.0 for s in all_scores)
assert in_range, f"FAIL: scores outside [0.0, 1.0]: {all_scores}"
print("All scores in range: PASS")
|
test_rl_properties.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# test_rl_properties.py
|
| 3 |
+
import sys
|
| 4 |
+
|
| 5 |
+
sys.path.insert(0, ".")
|
| 6 |
+
|
| 7 |
+
from server.Procure_RL_environment import ProcureRLEnvironment
|
| 8 |
+
from models import NegotiationAction
|
| 9 |
+
|
| 10 |
+
print("=== Test 1: Reproducibility ===")
|
| 11 |
+
env1 = ProcureRLEnvironment()
|
| 12 |
+
obs1 = env1.reset(seed=42, task_id="single_issue")
|
| 13 |
+
|
| 14 |
+
env2 = ProcureRLEnvironment()
|
| 15 |
+
obs2 = env2.reset(seed=42, task_id="single_issue")
|
| 16 |
+
|
| 17 |
+
assert obs1.supplier_message == obs2.supplier_message, (
|
| 18 |
+
"FAIL: same seed gives different opening"
|
| 19 |
+
)
|
| 20 |
+
print("Same seed = same opening message: PASS")
|
| 21 |
+
print(f"Opening: {obs1.supplier_message[:80]}...")
|
| 22 |
+
|
| 23 |
+
print("\n=== Test 2: Different seeds give different behavior ===")
|
| 24 |
+
env3 = ProcureRLEnvironment()
|
| 25 |
+
obs3 = env3.reset(seed=99, task_id="single_issue")
|
| 26 |
+
print(f"Seed 42 opening price: {obs1.current_offer}")
|
| 27 |
+
print(f"Seed 99 opening price: {obs3.current_offer}")
|
| 28 |
+
|
| 29 |
+
print("\n=== Test 3: Rapport affects opponent ===")
|
| 30 |
+
# Agent with collaborative language
|
| 31 |
+
env_collab = ProcureRLEnvironment()
|
| 32 |
+
env_collab.reset(seed=42, task_id="single_issue")
|
| 33 |
+
action_collab = NegotiationAction(
|
| 34 |
+
move_type="make_offer",
|
| 35 |
+
terms={"price": 40000},
|
| 36 |
+
message="I genuinely value a long-term partnership and believe this price reflects our mutual interests.",
|
| 37 |
+
)
|
| 38 |
+
obs_c = env_collab.step(action_collab)
|
| 39 |
+
rapport_collab = env_collab.state.rapport_score
|
| 40 |
+
|
| 41 |
+
# Agent with aggressive language
|
| 42 |
+
env_aggro = ProcureRLEnvironment()
|
| 43 |
+
env_aggro.reset(seed=42, task_id="single_issue")
|
| 44 |
+
action_aggro = NegotiationAction(
|
| 45 |
+
move_type="make_offer",
|
| 46 |
+
terms={"price": 40000},
|
| 47 |
+
message="This is my final offer. Non-negotiable. Take it or leave it.",
|
| 48 |
+
)
|
| 49 |
+
obs_a = env_aggro.step(action_aggro)
|
| 50 |
+
rapport_aggro = env_aggro.state.rapport_score
|
| 51 |
+
|
| 52 |
+
print(f"Collaborative rapport: {rapport_collab:.3f}")
|
| 53 |
+
print(f"Aggressive rapport: {rapport_aggro:.3f}")
|
| 54 |
+
assert rapport_collab > rapport_aggro, "FAIL: rapport not sensitive to language"
|
| 55 |
+
print("Language affects rapport: PASS")
|
| 56 |
+
|
| 57 |
+
print("\n=== Test 4: Sequential decisions matter ===")
|
| 58 |
+
env = ProcureRLEnvironment()
|
| 59 |
+
obs = env.reset(seed=42, task_id="single_issue")
|
| 60 |
+
print(f"Round 0: {obs.current_offer}")
|
| 61 |
+
# Make 3 consecutive concessions
|
| 62 |
+
for i in range(3):
|
| 63 |
+
action = NegotiationAction(
|
| 64 |
+
move_type="make_offer",
|
| 65 |
+
terms={"price": 40000 + i * 1000},
|
| 66 |
+
message="We can move slightly on price.",
|
| 67 |
+
)
|
| 68 |
+
obs = env.step(action)
|
| 69 |
+
print(
|
| 70 |
+
f"Round {i + 1}: consecutive_concessions={env.state.consecutive_concessions}, reward={obs.reward}"
|
| 71 |
+
)
|
| 72 |
+
if obs.done:
|
| 73 |
+
break
|
| 74 |
+
print("Sequential state tracking: PASS")
|
| 75 |
+
|
| 76 |
+
print("\n=== Test 5: Delayed reward ===")
|
| 77 |
+
env = ProcureRLEnvironment()
|
| 78 |
+
env.reset(seed=42, task_id="single_issue")
|
| 79 |
+
rewards = []
|
| 80 |
+
for i in range(5):
|
| 81 |
+
action = NegotiationAction(
|
| 82 |
+
move_type="make_offer",
|
| 83 |
+
terms={"price": 41000},
|
| 84 |
+
message="I think this is a fair price for both parties.",
|
| 85 |
+
)
|
| 86 |
+
obs = env.step(action)
|
| 87 |
+
rewards.append(obs.reward)
|
| 88 |
+
if obs.done:
|
| 89 |
+
break
|
| 90 |
+
|
| 91 |
+
print(f"Intermediate rewards: {rewards[:-1]}")
|
| 92 |
+
print(f"Final reward: {rewards[-1]}")
|
| 93 |
+
# NOTE(review): this passes when EITHER every intermediate reward is exactly 0.0
# OR the last recorded reward is positive, so it does not by itself prove that
# reward is withheld until the episode ends — confirm the intended contract.
assert all(r == 0.0 for r in rewards[:-1]) or rewards[-1] > 0, "Reward structure check"
|
| 94 |
+
print("Reward is delayed to episode end: PASS")
|
| 95 |
+
|
| 96 |
+
print("\n=== Test 6: Accept terminates correctly ===")
|
| 97 |
+
env = ProcureRLEnvironment()
|
| 98 |
+
env.reset(seed=42, task_id="single_issue")
|
| 99 |
+
# First make an offer
|
| 100 |
+
env.step(
|
| 101 |
+
NegotiationAction(
|
| 102 |
+
move_type="make_offer", terms={"price": 43000}, message="Reasonable offer."
|
| 103 |
+
)
|
| 104 |
+
)
|
| 105 |
+
# Then accept current terms
|
| 106 |
+
obs = env.step(NegotiationAction(move_type="accept", terms={}, message=""))
|
| 107 |
+
print(f"Accept: done={obs.done}, reward={obs.reward:.4f}")
|
| 108 |
+
# An "accept" move must end the episode; test truthiness rather than `== True`.
assert obs.done, "FAIL: accept should terminate episode"
|
| 109 |
+
assert obs.reward >= 0.0, "FAIL: reward should be non-negative on accept"
|
| 110 |
+
print("Accept terminates episode: PASS")
|
| 111 |
+
|
| 112 |
+
print("\n=== Test 7: Reset produces clean state ===")
|
| 113 |
+
env.reset(seed=42, task_id="multi_issue")
|
| 114 |
+
# After reset() the round counter must be back at zero.
assert env.state.round_number == 0, "FAIL: round_number not reset to 0"
|
| 115 |
+
# After reset() no deal may be carried over from the previous episode.
assert not env.state.deal_reached, "FAIL: deal_reached not cleared by reset"
|
| 116 |
+
# After reset() the accumulated reward must start again from exactly 0.0.
assert env.state.cumulative_reward == 0.0, "FAIL: cumulative_reward not reset to 0.0"
|
| 117 |
+
print("Reset produces clean state: PASS")
|
| 118 |
+
|
| 119 |
+
print("\n=== All RL property tests passed ===")
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
web_ui.png
ADDED
|