diff --git a/.DS_Store b/.DS_Store index 4312e6f6eacbbb8ae393a975b2ce66e7c32c78bc..58136bb9fa3d73f344681011cd4d0a219eb0584a 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/workflowTwin/.env.example b/.env.example similarity index 100% rename from workflowTwin/.env.example rename to .env.example diff --git a/workflowTwin/.gitignore b/.gitignore similarity index 100% rename from workflowTwin/.gitignore rename to .gitignore diff --git a/workflowTwin/Dockerfile b/Dockerfile similarity index 100% rename from workflowTwin/Dockerfile rename to Dockerfile diff --git a/README.md b/README.md index 61ee163302b7d5aa0529ee7f7661e2d6d7169e38..161c8961bd3cfa676e148b26c8fa8f91f79d4507 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,215 @@ --- -title: Workflow Twin -emoji: 🌍 -colorFrom: purple -colorTo: gray sdk: docker -pinned: false -license: mit -short_description: OpenEnv environment for workflow simulation under memory con +app_port: 8000 --- -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +# WorkflowTwin + +An OpenEnv-compatible environment for training and evaluating agents under memory and resource constraints. + +This environment simulates multi-step ticket resolution pipelines with: +- queueing, prioritization, and dependencies +- stochastic arrivals and agent failures +- strict memory budgets on agent state + +We introduce a **quantized memory policy** based on: +- random orthogonal projection +- scalar vector quantization +- random projection residual sketching + +to study how compression affects agent performance under resource constraints. + +## Motivation + +Real-world agents must operate under limited memory and compute. + +Without compression: +- state grows unbounded +- agents violate system constraints + +With quantized memory: +- state is compressed +- agents remain feasible under tight budgets + +This environment enables controlled evaluation of this tradeoff. 
+ +## Key Results + +We evaluate two modes: +- **baseline**: no compression (truncation under pressure) +- **quant**: rotated quantized memory compression + +This establishes a clear crossover point where compression transitions from unnecessary to essential. + +### Memory Budget vs Feasibility + +![Memory Budget vs Compliance Rate](experiments/figures/memory_budget_vs_compliance.svg) + +### Key Findings + +- **Feasibility threshold shift:** + Baseline requires ~6000 memory, while quantized memory achieves full compliance at ~3000. + +- **2× efficiency gain:** + Compression halves the memory required for feasible operation. + +- **No-regret behavior:** + Under no memory pressure, both methods perform identically. + +- **Constraint robustness:** + Under tight budgets, baseline fails (0% compliance) while quantized memory remains fully feasible (100%). + +**Conclusion:** Compression extends the feasible operating regime without degrading task performance. + +## Structure + +- `env/`: core environment logic, models, scoring, reward + - includes `quantizer.py` with rotated vector quantization primitives +- `server/`: FastAPI app exposing `reset`, `step`, `state` +- `tasks/`: JSON task definitions by difficulty +- `baseline/`: non-LLM heuristic policy +- `baselines/`: research evaluation baselines for `workflow_twin` +- `inference.py`: local rollout entrypoint +- `openenv.yaml`: environment spec + +## Quickstart + +```bash +python -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +uvicorn server.app:app --reload +``` + +Server endpoints: + +- `POST /reset` +- `POST /step` with body `{ "action_type": "triage|respond|resolve|escalate", "note": "..." 
}` +- `GET /state` +- `GET /config` (resolved runtime config loaded from env vars) + +Run baseline inference: + +```bash +python inference.py +``` + +Inference environment variables: + +- `API_BASE_URL`: OpenAI-compatible endpoint base URL +- `HF_TOKEN`: API token (used as `api_key`) +- `MODEL_NAME`: chat model name (default: `gpt-4o-mini`) + +If `API_BASE_URL` or `HF_TOKEN` is missing, inference automatically falls back to the heuristic policy. + +`inference.py` result fields: + +- `score`: final reported score (`env_score` when available, otherwise `partial_score`) +- `env_score`: environment-provided score from `env.state()` +- `partial_score`: fallback score from normalized accumulated reward +- `openai_client_configured`: `true` when both `API_BASE_URL` and `HF_TOKEN` are present + +## Method: Quantized Memory Policy + +We implement a rotated vector quantization pipeline: + +1. **Random Orthogonal Projection** + - decorrelates embedding dimensions + +2. **Scalar Quantization** + - coordinate-wise discretization + +3. **Residual Random Projection Sketch** + - preserves inner-product structure + +Reward shaping includes: +- distortion penalty (MSE) +- inner-product preservation penalty + +## Research-Grade WorkflowTwin (L1-L5) + +The `workflow_twin/` package evolves the simulator from a single-ticket MVP into a multi-ticket workflow research environment. 
+ +### Included + +- `workflow_twin/core/entities.py`: multi-ticket state, agents, time, SLA/resource fields +- `workflow_twin/core/dynamics.py`: queue logic, SLA penalties, dependencies, stochastic arrivals/failures +- `workflow_twin/core/config.py`: level configs (L1-L5) +- `workflow_twin/environment.py`: main level-aware environment (`WorkflowTwinEnv`) +- `workflow_twin/memory.py`: `MemoryBoundedEnv` wrapper using rotated quantized memory compression +- `workflow_twin/levels/`: level hooks for L1 simple → L5 memory pressure +- `baselines/heuristics.py`: simple queue baseline policy +- `tasks/level1..level5/`: task scaffolding per level + +### Quick Example + +```bash +python - <<'PY' +from workflow_twin.environment import WorkflowTwinEnv +from baselines.heuristics import greedy_queue_policy + +env = WorkflowTwinEnv(level=3, seed=42) +obs = env.reset() + +for _ in range(10): + action = greedy_queue_policy(obs) + obs, reward, done, info = env.step(action) + print(info["step_count"], reward, info["queue"]) + if done: + break +PY +``` + +### Memory-Bounded Wrapper Example (L5) + +```bash +python - <<'PY' +from workflow_twin.environment import WorkflowTwinEnv +from workflow_twin.memory import MemoryBoundedEnv + +base_env = WorkflowTwinEnv(level=5, seed=42) +env = MemoryBoundedEnv(base_env, memory_budget=3500, bits=3) +obs = env.reset() +obs, reward, done, info = env.step({"action_type": "triage", "note": "memory-check"}) +print(info["memory"]) +PY +``` + +## Docker + +```bash +docker build -t workflowtwin . 
+docker run -p 8000:8000 workflowtwin +``` + +## Controlled A/B Quantized Memory Evaluation + +Run the controlled experiment suite: + +```bash +python -m experiments.ab_quantized_memory_eval +``` + +This executes three tests with shared metrics: + +- control_no_memory_pressure (Level 1, large memory budget) +- critical_memory_constrained_long_horizon (Level 5, tight memory budget) +- memory_budget_sweep (budgets: 2000, 3000, 4000, 6000) + +Modes compared: + +- baseline: no compression, truncation under pressure +- quant: rotated quantized memory compression under pressure + +Reported metrics: + +- avg_reward +- success_rate (resolved/total) +- avg_sla_violations +- avg_memory_used vs avg_memory_budget +- memory_compliance_rate +- steps_per_sec + +Figure (generated by the experiment runner): + +![Memory Budget vs Compliance Rate](experiments/figures/memory_budget_vs_compliance.svg) diff --git a/workflowTwin/baseline/policy.py b/baseline/policy.py similarity index 100% rename from workflowTwin/baseline/policy.py rename to baseline/policy.py diff --git a/workflowTwin/baselines/heuristics.py b/baselines/heuristics.py similarity index 100% rename from workflowTwin/baselines/heuristics.py rename to baselines/heuristics.py diff --git a/workflowTwin/baselines/rl_agents.py b/baselines/rl_agents.py similarity index 100% rename from workflowTwin/baselines/rl_agents.py rename to baselines/rl_agents.py diff --git a/workflowTwin/env/__init__.py b/env/__init__.py similarity index 100% rename from workflowTwin/env/__init__.py rename to env/__init__.py diff --git a/workflowTwin/env/dynamics.py b/env/dynamics.py similarity index 100% rename from workflowTwin/env/dynamics.py rename to env/dynamics.py diff --git a/workflowTwin/env/entities.py b/env/entities.py similarity index 100% rename from workflowTwin/env/entities.py rename to env/entities.py diff --git a/workflowTwin/env/environment.py b/env/environment.py similarity index 100% rename from workflowTwin/env/environment.py rename to 
env/environment.py diff --git a/workflowTwin/env/graders.py b/env/graders.py similarity index 100% rename from workflowTwin/env/graders.py rename to env/graders.py diff --git a/workflowTwin/env/models.py b/env/models.py similarity index 100% rename from workflowTwin/env/models.py rename to env/models.py diff --git a/workflowTwin/env/quantizer.py b/env/quantizer.py similarity index 100% rename from workflowTwin/env/quantizer.py rename to env/quantizer.py diff --git a/workflowTwin/env/reward.py b/env/reward.py similarity index 100% rename from workflowTwin/env/reward.py rename to env/reward.py diff --git a/workflowTwin/env/runtime_config.py b/env/runtime_config.py similarity index 100% rename from workflowTwin/env/runtime_config.py rename to env/runtime_config.py diff --git a/workflowTwin/env/tasks.py b/env/tasks.py similarity index 100% rename from workflowTwin/env/tasks.py rename to env/tasks.py diff --git a/workflowTwin/experiments/ab_quantized_memory_eval.py b/experiments/ab_quantized_memory_eval.py similarity index 100% rename from workflowTwin/experiments/ab_quantized_memory_eval.py rename to experiments/ab_quantized_memory_eval.py diff --git a/workflowTwin/experiments/ab_turboquant_eval.py b/experiments/ab_turboquant_eval.py similarity index 100% rename from workflowTwin/experiments/ab_turboquant_eval.py rename to experiments/ab_turboquant_eval.py diff --git a/workflowTwin/experiments/figures/memory_budget_vs_compliance.svg b/experiments/figures/memory_budget_vs_compliance.svg similarity index 100% rename from workflowTwin/experiments/figures/memory_budget_vs_compliance.svg rename to experiments/figures/memory_budget_vs_compliance.svg diff --git a/workflowTwin/inference.py b/inference.py similarity index 100% rename from workflowTwin/inference.py rename to inference.py diff --git a/workflowTwin/openenv.yaml b/openenv.yaml similarity index 100% rename from workflowTwin/openenv.yaml rename to openenv.yaml diff --git a/workflowTwin/requirements.txt 
b/requirements.txt similarity index 100% rename from workflowTwin/requirements.txt rename to requirements.txt diff --git a/workflowTwin/server/app.py b/server/app.py similarity index 100% rename from workflowTwin/server/app.py rename to server/app.py diff --git a/workflowTwin/server/routes.py b/server/routes.py similarity index 100% rename from workflowTwin/server/routes.py rename to server/routes.py diff --git a/workflowTwin/tasks/easy.json b/tasks/easy.json similarity index 100% rename from workflowTwin/tasks/easy.json rename to tasks/easy.json diff --git a/workflowTwin/tasks/hard.json b/tasks/hard.json similarity index 100% rename from workflowTwin/tasks/hard.json rename to tasks/hard.json diff --git a/workflowTwin/tasks/level1/tasks.json b/tasks/level1/tasks.json similarity index 100% rename from workflowTwin/tasks/level1/tasks.json rename to tasks/level1/tasks.json diff --git a/workflowTwin/tasks/level2/tasks.json b/tasks/level2/tasks.json similarity index 100% rename from workflowTwin/tasks/level2/tasks.json rename to tasks/level2/tasks.json diff --git a/workflowTwin/tasks/level3/tasks.json b/tasks/level3/tasks.json similarity index 100% rename from workflowTwin/tasks/level3/tasks.json rename to tasks/level3/tasks.json diff --git a/workflowTwin/tasks/level4/tasks.json b/tasks/level4/tasks.json similarity index 100% rename from workflowTwin/tasks/level4/tasks.json rename to tasks/level4/tasks.json diff --git a/workflowTwin/tasks/level5/tasks.json b/tasks/level5/tasks.json similarity index 100% rename from workflowTwin/tasks/level5/tasks.json rename to tasks/level5/tasks.json diff --git a/workflowTwin/tasks/medium.json b/tasks/medium.json similarity index 100% rename from workflowTwin/tasks/medium.json rename to tasks/medium.json diff --git a/workflowTwin/.DS_Store b/workflowTwin/.DS_Store deleted file mode 100644 index 58136bb9fa3d73f344681011cd4d0a219eb0584a..0000000000000000000000000000000000000000 Binary files a/workflowTwin/.DS_Store and /dev/null differ 
diff --git a/workflowTwin/README.md b/workflowTwin/README.md deleted file mode 100644 index 161c8961bd3cfa676e148b26c8fa8f91f79d4507..0000000000000000000000000000000000000000 --- a/workflowTwin/README.md +++ /dev/null @@ -1,215 +0,0 @@ ---- -sdk: docker -app_port: 8000 ---- - -# WorkflowTwin - -An OpenEnv-compatible environment for training and evaluating agents under memory and resource constraints. - -This environment simulates multi-step ticket resolution pipelines with: -- queueing, prioritization, and dependencies -- stochastic arrivals and agent failures -- strict memory budgets on agent state - -We introduce a **quantized memory policy** based on: -- random orthogonal projection -- scalar vector quantization -- random projection residual sketching - -to study how compression affects agent performance under resource constraints. - -## Motivation - -Real-world agents must operate under limited memory and compute. - -Without compression: -- state grows unbounded -- agents violate system constraints - -With quantized memory: -- state is compressed -- agents remain feasible under tight budgets - -This environment enables controlled evaluation of this tradeoff. - -## Key Results - -We evaluate two modes: -- **baseline**: no compression (truncation under pressure) -- **quant**: rotated quantized memory compression - -This establishes a clear crossover point where compression transitions from unnecessary to essential. - -### Memory Budget vs Feasibility - -![Memory Budget vs Compliance Rate](experiments/figures/memory_budget_vs_compliance.svg) - -### Key Findings - -- **Feasibility threshold shift:** - Baseline requires ~6000 memory, while quantized memory achieves full compliance at ~3000. - -- **2× efficiency gain:** - Compression halves the memory required for feasible operation. - -- **No-regret behavior:** - Under no memory pressure, both methods perform identically. 
- -- **Constraint robustness:** - Under tight budgets, baseline fails (0% compliance) while quantized memory remains fully feasible (100%). - -**Conclusion:** Compression extends the feasible operating regime without degrading task performance. - -## Structure - -- `env/`: core environment logic, models, scoring, reward - - includes `quantizer.py` with rotated vector quantization primitives -- `server/`: FastAPI app exposing `reset`, `step`, `state` -- `tasks/`: JSON task definitions by difficulty -- `baseline/`: non-LLM heuristic policy -- `baselines/`: research evaluation baselines for `workflow_twin` -- `inference.py`: local rollout entrypoint -- `openenv.yaml`: environment spec - -## Quickstart - -```bash -python -m venv .venv -source .venv/bin/activate -pip install -r requirements.txt -uvicorn server.app:app --reload -``` - -Server endpoints: - -- `POST /reset` -- `POST /step` with body `{ "action_type": "triage|respond|resolve|escalate", "note": "..." }` -- `GET /state` -- `GET /config` (resolved runtime config loaded from env vars) - -Run baseline inference: - -```bash -python inference.py -``` - -Inference environment variables: - -- `API_BASE_URL`: OpenAI-compatible endpoint base URL -- `HF_TOKEN`: API token (used as `api_key`) -- `MODEL_NAME`: chat model name (default: `gpt-4o-mini`) - -If `API_BASE_URL` or `HF_TOKEN` is missing, inference automatically falls back to heuristic policy. - -`inference.py` result fields: - -- `score`: final reported score (`env_score` when available, otherwise `partial_score`) -- `env_score`: environment-provided score from `env.state()` -- `partial_score`: fallback score from normalized accumulated reward -- `openai_client_configured`: `true` when both `API_BASE_URL` and `HF_TOKEN` are present - -## Method: Quantized Memory Policy - -We implement a rotated vector quantization pipeline: - -1. **Random Orthogonal Projection** - - decorrelates embedding dimensions - -2. 
**Scalar Quantization** - - coordinate-wise discretization - -3. **Residual Random Projection Sketch** - - preserves inner-product structure - -Reward shaping includes: -- distortion penalty (MSE) -- inner-product preservation penalty - -## Research-Grade WorkflowTwin (L1-L5) - -A new package `workflow_twin/` is now implemented to evolve the simulator from single-ticket MVP to multi-ticket workflow research environment. - -### Included - -- `workflow_twin/core/entities.py`: multi-ticket state, agents, time, SLA/resource fields -- `workflow_twin/core/dynamics.py`: queue logic, SLA penalties, dependencies, stochastic arrivals/failures -- `workflow_twin/core/config.py`: level configs (L1-L5) -- `workflow_twin/environment.py`: main level-aware environment (`WorkflowTwinEnv`) -- `workflow_twin/memory.py`: `MemoryBoundedEnv` wrapper using rotated quantized memory compression -- `workflow_twin/levels/`: level hooks for L1 simple → L5 memory pressure -- `baselines/heuristics.py`: simple queue baseline policy -- `tasks/level1..level5/`: task scaffolding per level - -### Quick Example - -```bash -python - <<'PY' -from workflow_twin.environment import WorkflowTwinEnv -from baselines.heuristics import greedy_queue_policy - -env = WorkflowTwinEnv(level=3, seed=42) -obs = env.reset() - -for _ in range(10): - action = greedy_queue_policy(obs) - obs, reward, done, info = env.step(action) - print(info["step_count"], reward, info["queue"]) - if done: - break -PY -``` - -### Memory-Bounded Wrapper Example (L5) - -```bash -python - <<'PY' -from workflow_twin.environment import WorkflowTwinEnv -from workflow_twin.memory import MemoryBoundedEnv - -base_env = WorkflowTwinEnv(level=5, seed=42) -env = MemoryBoundedEnv(base_env, memory_budget=3500, bits=3) -obs = env.reset() -obs, reward, done, info = env.step({"action_type": "triage", "note": "memory-check"}) -print(info["memory"]) -PY -``` - -## Docker - -```bash -docker build -t workflowtwin . 
-docker run -p 8000:8000 workflowtwin -``` - -## Controlled A/B Quantized Memory Evaluation - -Run the controlled experiment suite: - -```bash -python -m experiments.ab_quantized_memory_eval -``` - -This executes two tests with shared metrics: - -- control_no_memory_pressure (Level 1, large memory budget) -- critical_memory_constrained_long_horizon (Level 5, tight memory budget) -- memory_budget_sweep (budgets: 2000, 3000, 4000, 6000) - -Modes compared: - -- baseline: no compression, truncation under pressure -- quant: rotated quantized memory compression under pressure - -Reported metrics: - -- avg_reward -- success_rate (resolved/total) -- avg_sla_violations -- avg_memory_used vs avg_memory_budget -- memory_compliance_rate -- steps_per_sec - -Figure (generated by the experiment runner): - -![Memory Budget vs Compliance Rate](experiments/figures/memory_budget_vs_compliance.svg) diff --git a/workflowTwin/workflow_twin/.DS_Store b/workflow_twin/.DS_Store similarity index 100% rename from workflowTwin/workflow_twin/.DS_Store rename to workflow_twin/.DS_Store diff --git a/workflowTwin/workflow_twin/__init__.py b/workflow_twin/__init__.py similarity index 100% rename from workflowTwin/workflow_twin/__init__.py rename to workflow_twin/__init__.py diff --git a/workflowTwin/workflow_twin/core/__init__.py b/workflow_twin/core/__init__.py similarity index 100% rename from workflowTwin/workflow_twin/core/__init__.py rename to workflow_twin/core/__init__.py diff --git a/workflowTwin/workflow_twin/core/config.py b/workflow_twin/core/config.py similarity index 100% rename from workflowTwin/workflow_twin/core/config.py rename to workflow_twin/core/config.py diff --git a/workflowTwin/workflow_twin/core/dynamics.py b/workflow_twin/core/dynamics.py similarity index 100% rename from workflowTwin/workflow_twin/core/dynamics.py rename to workflow_twin/core/dynamics.py diff --git a/workflowTwin/workflow_twin/core/entities.py b/workflow_twin/core/entities.py similarity index 100% rename 
from workflowTwin/workflow_twin/core/entities.py rename to workflow_twin/core/entities.py diff --git a/workflowTwin/workflow_twin/environment.py b/workflow_twin/environment.py similarity index 100% rename from workflowTwin/workflow_twin/environment.py rename to workflow_twin/environment.py diff --git a/workflowTwin/workflow_twin/levels/__init__.py b/workflow_twin/levels/__init__.py similarity index 100% rename from workflowTwin/workflow_twin/levels/__init__.py rename to workflow_twin/levels/__init__.py diff --git a/workflowTwin/workflow_twin/levels/level1_simple.py b/workflow_twin/levels/level1_simple.py similarity index 100% rename from workflowTwin/workflow_twin/levels/level1_simple.py rename to workflow_twin/levels/level1_simple.py diff --git a/workflowTwin/workflow_twin/levels/level2_sla.py b/workflow_twin/levels/level2_sla.py similarity index 100% rename from workflowTwin/workflow_twin/levels/level2_sla.py rename to workflow_twin/levels/level2_sla.py diff --git a/workflowTwin/workflow_twin/levels/level3_approval.py b/workflow_twin/levels/level3_approval.py similarity index 100% rename from workflowTwin/workflow_twin/levels/level3_approval.py rename to workflow_twin/levels/level3_approval.py diff --git a/workflowTwin/workflow_twin/levels/level4_stochastic.py b/workflow_twin/levels/level4_stochastic.py similarity index 100% rename from workflowTwin/workflow_twin/levels/level4_stochastic.py rename to workflow_twin/levels/level4_stochastic.py diff --git a/workflowTwin/workflow_twin/levels/level5_memory.py b/workflow_twin/levels/level5_memory.py similarity index 100% rename from workflowTwin/workflow_twin/levels/level5_memory.py rename to workflow_twin/levels/level5_memory.py diff --git a/workflowTwin/workflow_twin/memory.py b/workflow_twin/memory.py similarity index 100% rename from workflowTwin/workflow_twin/memory.py rename to workflow_twin/memory.py diff --git a/workflowTwin/workflow_twin/models.py b/workflow_twin/models.py similarity index 100% rename from 
workflowTwin/workflow_twin/models.py rename to workflow_twin/models.py diff --git a/workflowTwin/workflow_twin/quantizer.py b/workflow_twin/quantizer.py similarity index 100% rename from workflowTwin/workflow_twin/quantizer.py rename to workflow_twin/quantizer.py