Commit ·
2f0ec0c
unverified ·
0
Parent(s):
Add files via upload
Browse files- README (5).md +144 -0
- README.md +144 -0
- __init__.py +2 -0
- app.py +346 -0
- prompt_generator.py +267 -0
- requirements.txt +4 -0
- taxonomy.py +720 -0
README (5).md
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Zuup Domain-Specific Preference Collection
|
| 2 |
+
|
| 3 |
+
Collect human preference data for training domain-expert AI systems across 10 Zuup domains.
|
| 4 |
+
|
| 5 |
+
## Domains
|
| 6 |
+
|
| 7 |
+
| Domain | Platform | Description |
|
| 8 |
+
|--------|----------|-------------|
|
| 9 |
+
| Fed/SLED Procurement | Aureon | Government contracting, FAR/DFARS |
|
| 10 |
+
| Biomedical GB-CI | Symbion | Gut-brain interface, biosensors |
|
| 11 |
+
| Ingestible GB-CI | Symbion HW | Capsule endoscopy, in-vivo |
|
| 12 |
+
| Legacy Refactoring | Relian | COBOL migration, mainframe |
|
| 13 |
+
| Autonomy OS | Veyra | Agent systems, AI safety |
|
| 14 |
+
| Quantum Archaeology | QAWM | Historical reconstruction |
|
| 15 |
+
| Defense World Models | Orb | 3D scene, ISR applications |
|
| 16 |
+
| Halal Compliance | Civium | Certification, supply chain |
|
| 17 |
+
| Mobile Data Center | PodX | Edge computing, DDIL |
|
| 18 |
+
| HUBZone | Aureon | Small business contracting |
|
| 19 |
+
|
| 20 |
+
## Quick Start
|
| 21 |
+
|
| 22 |
+
### 1. Open in Cursor (or any IDE with terminal)
|
| 23 |
+
|
| 24 |
+
```bash
|
| 25 |
+
# Open this folder in Cursor
|
| 26 |
+
# File → Open Folder → select zuup-preferences
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
### 2. Set Up Environment
|
| 30 |
+
|
| 31 |
+
```bash
|
| 32 |
+
# In Cursor terminal (Ctrl+` to open)
|
| 33 |
+
python -m venv venv
|
| 34 |
+
source venv/bin/activate # Windows: venv\Scripts\activate
|
| 35 |
+
pip install -r requirements.txt
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
### 3. Run Collection UI
|
| 39 |
+
|
| 40 |
+
```bash
|
| 41 |
+
python collection/app.py
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
Output:
|
| 45 |
+
```
|
| 46 |
+
🎯 Zuup Preference Collection
|
| 47 |
+
==================================================
|
| 48 |
+
Local URL: http://127.0.0.1:7860
|
| 49 |
+
Share URL: https://xxxxx.gradio.live ← Share with annotators
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
### 4. Collect Preferences
|
| 53 |
+
|
| 54 |
+
1. Open http://127.0.0.1:7860 in browser
|
| 55 |
+
2. Enter your annotator ID
|
| 56 |
+
3. Select domain
|
| 57 |
+
4. Click "Load New Pair"
|
| 58 |
+
5. Compare responses A vs B
|
| 59 |
+
6. Rate dimensions + select winner
|
| 60 |
+
7. Submit
|
| 61 |
+
|
| 62 |
+
## Project Structure
|
| 63 |
+
|
| 64 |
+
```
|
| 65 |
+
zuup-preferences/
|
| 66 |
+
├── domains/
|
| 67 |
+
│ ├── taxonomy.py # Domain definitions & rubrics
|
| 68 |
+
│ └── prompt_generator.py # Seed prompts per domain
|
| 69 |
+
├── collection/
|
| 70 |
+
│ └── app.py # Gradio collection UI
|
| 71 |
+
├── preference_data/ # Collected annotations (gitignore)
|
| 72 |
+
│ └── {domain}_preferences.jsonl
|
| 73 |
+
├── requirements.txt
|
| 74 |
+
└── README.md
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
## Data Format
|
| 78 |
+
|
| 79 |
+
Each annotation is appended as a single-line JSON object to the domain's JSONL file (pretty-printed here for readability):
|
| 80 |
+
|
| 81 |
+
```json
|
| 82 |
+
{
|
| 83 |
+
"domain": "procurement",
|
| 84 |
+
"category": "RFP_analysis",
|
| 85 |
+
"prompt": "Analyze this RFP...",
|
| 86 |
+
"response_a": "...",
|
| 87 |
+
"response_b": "...",
|
| 88 |
+
"annotator_id": "khaalis",
|
| 89 |
+
"preference": "A",
|
| 90 |
+
"dimension_scores": {
|
| 91 |
+
"accuracy": 4,
|
| 92 |
+
"safety": 5,
|
| 93 |
+
"actionability": 4,
|
| 94 |
+
"clarity": 3
|
| 95 |
+
},
|
| 96 |
+
"timestamp": "2024-12-24T...",
|
| 97 |
+
"record_hash": "a1b2c3d4..."
|
| 98 |
+
}
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
## Export for Training
|
| 102 |
+
|
| 103 |
+
```python
|
| 104 |
+
from collection.app import PreferenceStore
|
| 105 |
+
|
| 106 |
+
store = PreferenceStore()
|
| 107 |
+
df = store.export_for_training("procurement", format="dpo")
|
| 108 |
+
df.to_json("procurement_dpo.jsonl", orient="records", lines=True)
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
## Adding Real Response Generation
|
| 112 |
+
|
| 113 |
+
Edit `collection/app.py` and replace the placeholder responses with Ollama calls:
|
| 114 |
+
|
| 115 |
+
```python
|
| 116 |
+
import httpx
|
| 117 |
+
|
| 118 |
+
def generate_response(prompt: str, temperature: float = 0.3) -> str:
    """Generate one completion for ``prompt`` from a local Ollama server.

    Args:
        prompt: Text sent to the model.
        temperature: Sampling temperature forwarded to Ollama.

    Returns:
        The value of the ``"response"`` field in Ollama's JSON reply.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
    """
    response = httpx.post(
        "http://localhost:11434/api/generate",
        json={
            "model": "llama3.1:8b",
            "prompt": prompt,
            # Ollama reads sampling parameters from the "options" object,
            # so temperature has to be nested there to take effect.
            "options": {"temperature": temperature},
            "stream": False,
        },
        timeout=60.0,
    )
    # Fail loudly on HTTP errors instead of hitting a KeyError below.
    response.raise_for_status()
    return response.json()["response"]
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
## Target Collection Size
|
| 133 |
+
|
| 134 |
+
| Domain | Min Samples | Annotator Requirements |
|
| 135 |
+
|--------|-------------|------------------------|
|
| 136 |
+
| Procurement | 500 | Gov contracting exp |
|
| 137 |
+
| Legacy | 300 | COBOL/mainframe exp |
|
| 138 |
+
| Defense WM | 300 | GEOINT background |
|
| 139 |
+
| Biomedical | 400 | Biomed/neuro |
|
| 140 |
+
| Autonomy | 300 | AI safety familiarity |
|
| 141 |
+
|
| 142 |
+
## License
|
| 143 |
+
|
| 144 |
+
Internal Zuup Innovation Lab use.
|
README.md
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Zuup Domain-Specific Preference Collection
|
| 2 |
+
|
| 3 |
+
Collect human preference data for training domain-expert AI systems across 10 Zuup domains.
|
| 4 |
+
|
| 5 |
+
## Domains
|
| 6 |
+
|
| 7 |
+
| Domain | Platform | Description |
|
| 8 |
+
|--------|----------|-------------|
|
| 9 |
+
| Fed/SLED Procurement | Aureon | Government contracting, FAR/DFARS |
|
| 10 |
+
| Biomedical GB-CI | Symbion | Gut-brain interface, biosensors |
|
| 11 |
+
| Ingestible GB-CI | Symbion HW | Capsule endoscopy, in-vivo |
|
| 12 |
+
| Legacy Refactoring | Relian | COBOL migration, mainframe |
|
| 13 |
+
| Autonomy OS | Veyra | Agent systems, AI safety |
|
| 14 |
+
| Quantum Archaeology | QAWM | Historical reconstruction |
|
| 15 |
+
| Defense World Models | Orb | 3D scene, ISR applications |
|
| 16 |
+
| Halal Compliance | Civium | Certification, supply chain |
|
| 17 |
+
| Mobile Data Center | PodX | Edge computing, DDIL |
|
| 18 |
+
| HUBZone | Aureon | Small business contracting |
|
| 19 |
+
|
| 20 |
+
## Quick Start
|
| 21 |
+
|
| 22 |
+
### 1. Open in Cursor (or any IDE with terminal)
|
| 23 |
+
|
| 24 |
+
```bash
|
| 25 |
+
# Open this folder in Cursor
|
| 26 |
+
# File → Open Folder → select zuup-preferences
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
### 2. Set Up Environment
|
| 30 |
+
|
| 31 |
+
```bash
|
| 32 |
+
# In Cursor terminal (Ctrl+` to open)
|
| 33 |
+
python -m venv venv
|
| 34 |
+
source venv/bin/activate # Windows: venv\Scripts\activate
|
| 35 |
+
pip install -r requirements.txt
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
### 3. Run Collection UI
|
| 39 |
+
|
| 40 |
+
```bash
|
| 41 |
+
python collection/app.py
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
Output:
|
| 45 |
+
```
|
| 46 |
+
🎯 Zuup Preference Collection
|
| 47 |
+
==================================================
|
| 48 |
+
Local URL: http://127.0.0.1:7860
|
| 49 |
+
Share URL: https://xxxxx.gradio.live ← Share with annotators
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
### 4. Collect Preferences
|
| 53 |
+
|
| 54 |
+
1. Open http://127.0.0.1:7860 in browser
|
| 55 |
+
2. Enter your annotator ID
|
| 56 |
+
3. Select domain
|
| 57 |
+
4. Click "Load New Pair"
|
| 58 |
+
5. Compare responses A vs B
|
| 59 |
+
6. Rate dimensions + select winner
|
| 60 |
+
7. Submit
|
| 61 |
+
|
| 62 |
+
## Project Structure
|
| 63 |
+
|
| 64 |
+
```
|
| 65 |
+
zuup-preferences/
|
| 66 |
+
├── domains/
|
| 67 |
+
│ ├── taxonomy.py # Domain definitions & rubrics
|
| 68 |
+
│ └── prompt_generator.py # Seed prompts per domain
|
| 69 |
+
├── collection/
|
| 70 |
+
│ └── app.py # Gradio collection UI
|
| 71 |
+
├── preference_data/ # Collected annotations (gitignore)
|
| 72 |
+
│ └── {domain}_preferences.jsonl
|
| 73 |
+
├── requirements.txt
|
| 74 |
+
└── README.md
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
## Data Format
|
| 78 |
+
|
| 79 |
+
Each annotation is appended as a single-line JSON object to the domain's JSONL file (pretty-printed here for readability):
|
| 80 |
+
|
| 81 |
+
```json
|
| 82 |
+
{
|
| 83 |
+
"domain": "procurement",
|
| 84 |
+
"category": "RFP_analysis",
|
| 85 |
+
"prompt": "Analyze this RFP...",
|
| 86 |
+
"response_a": "...",
|
| 87 |
+
"response_b": "...",
|
| 88 |
+
"annotator_id": "khaalis",
|
| 89 |
+
"preference": "A",
|
| 90 |
+
"dimension_scores": {
|
| 91 |
+
"accuracy": 4,
|
| 92 |
+
"safety": 5,
|
| 93 |
+
"actionability": 4,
|
| 94 |
+
"clarity": 3
|
| 95 |
+
},
|
| 96 |
+
"timestamp": "2024-12-24T...",
|
| 97 |
+
"record_hash": "a1b2c3d4..."
|
| 98 |
+
}
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
## Export for Training
|
| 102 |
+
|
| 103 |
+
```python
|
| 104 |
+
from collection.app import PreferenceStore
|
| 105 |
+
|
| 106 |
+
store = PreferenceStore()
|
| 107 |
+
df = store.export_for_training("procurement", format="dpo")
|
| 108 |
+
df.to_json("procurement_dpo.jsonl", orient="records", lines=True)
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
## Adding Real Response Generation
|
| 112 |
+
|
| 113 |
+
Edit `collection/app.py` and replace the placeholder responses with Ollama calls:
|
| 114 |
+
|
| 115 |
+
```python
|
| 116 |
+
import httpx
|
| 117 |
+
|
| 118 |
+
def generate_response(prompt: str, temperature: float = 0.3) -> str:
    """Generate one completion for ``prompt`` from a local Ollama server.

    Args:
        prompt: Text sent to the model.
        temperature: Sampling temperature forwarded to Ollama.

    Returns:
        The value of the ``"response"`` field in Ollama's JSON reply.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
    """
    response = httpx.post(
        "http://localhost:11434/api/generate",
        json={
            "model": "llama3.1:8b",
            "prompt": prompt,
            # Ollama reads sampling parameters from the "options" object,
            # so temperature has to be nested there to take effect.
            "options": {"temperature": temperature},
            "stream": False,
        },
        timeout=60.0,
    )
    # Fail loudly on HTTP errors instead of hitting a KeyError below.
    response.raise_for_status()
    return response.json()["response"]
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
## Target Collection Size
|
| 133 |
+
|
| 134 |
+
| Domain | Min Samples | Annotator Requirements |
|
| 135 |
+
|--------|-------------|------------------------|
|
| 136 |
+
| Procurement | 500 | Gov contracting exp |
|
| 137 |
+
| Legacy | 300 | COBOL/mainframe exp |
|
| 138 |
+
| Defense WM | 300 | GEOINT background |
|
| 139 |
+
| Biomedical | 400 | Biomed/neuro |
|
| 140 |
+
| Autonomy | 300 | AI safety familiarity |
|
| 141 |
+
|
| 142 |
+
## License
|
| 143 |
+
|
| 144 |
+
Internal Zuup Innovation Lab use.
|
__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Collection module
|
| 2 |
+
from .app import PreferenceStore, CollectionApp, create_ui
|
app.py
ADDED
|
@@ -0,0 +1,346 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# collection/app.py — Multi-domain preference collection UI
|
| 2 |
+
# Install dependencies: pip install gradio pandas
|
| 3 |
+
|
| 4 |
+
import gradio as gr
|
| 5 |
+
import json
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Optional
|
| 10 |
+
import hashlib
|
| 11 |
+
import random
|
| 12 |
+
import sys
|
| 13 |
+
|
| 14 |
+
# Add parent to path for imports
|
| 15 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 16 |
+
|
| 17 |
+
from domains.taxonomy import DomainID, DOMAINS, get_quality_rubric
|
| 18 |
+
from domains.prompt_generator import DomainPromptGenerator, SEED_PROMPTS
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class PreferenceStore:
    """Append-only JSONL storage for preference records, one file per domain.

    Every saved record is stamped with a UTC timestamp and a truncated
    SHA-256 digest of its contents, giving a lightweight audit trail.
    """

    def __init__(self, base_path: Optional[str] = None):
        """Create the store and make sure its data directory exists.

        Args:
            base_path: Directory that holds the ``*_preferences.jsonl`` files.
                When omitted, ``preference_data`` two levels above this file
                is used.
        """
        if base_path is None:
            base_path = Path(__file__).parent.parent / "preference_data"
        self.base_path = Path(base_path)
        self.base_path.mkdir(parents=True, exist_ok=True)

    def _get_domain_file(self, domain_id: str) -> Path:
        """Return the JSONL path that stores records for ``domain_id``."""
        return self.base_path / f"{domain_id}_preferences.jsonl"

    def save(self, record: dict) -> str:
        """Stamp ``record`` and append it to its domain file.

        The dict is modified in place: ``timestamp`` and ``record_hash`` keys
        are added before it is written as one JSON line.

        Args:
            record: Annotation data; must contain a ``"domain"`` key.

        Returns:
            The 16-hex-character record hash.

        Raises:
            KeyError: If ``record`` has no ``"domain"`` key.
        """
        from datetime import timezone

        # Same naive-UTC ISO string that datetime.utcnow() produced, without
        # relying on the deprecated call; existing files keep a single format.
        record["timestamp"] = (
            datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
        )
        # The digest covers the record including its timestamp, but not the
        # hash field itself (it is added afterwards).
        record["record_hash"] = hashlib.sha256(
            json.dumps(record, sort_keys=True).encode()
        ).hexdigest()[:16]

        filepath = self._get_domain_file(record["domain"])
        with open(filepath, "a", encoding="utf-8") as f:
            f.write(json.dumps(record) + "\n")

        return record["record_hash"]

    def get_stats(self, domain_id: Optional[str] = None) -> dict:
        """Count stored records per domain.

        Args:
            domain_id: Restrict the count to one domain; when falsy, every
                ``*_preferences.jsonl`` file in the data directory is counted.

        Returns:
            Mapping of domain name to number of non-blank lines in its file.
            Domains without a file are omitted.
        """
        if domain_id:
            files = [self._get_domain_file(domain_id)]
        else:
            files = list(self.base_path.glob("*_preferences.jsonl"))

        suffix = "_preferences"
        stats = {}
        for path in files:
            if not path.exists():
                continue
            stem = path.stem
            # Strip only the trailing marker so a domain id that happens to
            # contain "_preferences" elsewhere is left intact.
            domain = stem[: -len(suffix)] if stem.endswith(suffix) else stem
            with open(path, encoding="utf-8") as f:
                stats[domain] = sum(1 for line in f if line.strip())

        return stats

    def export_for_training(self, domain_id: str, format: str = "dpo") -> pd.DataFrame:
        """Load one domain's records as a training-ready DataFrame.

        Args:
            domain_id: Domain whose JSONL file is read.
            format: ``"dpo"`` yields prompt/chosen/rejected rows built from
                records whose preference is ``"A"`` or ``"B"``; any other
                value returns the raw records unchanged.

        Returns:
            The exported frame, or an empty frame if the domain has no file.
        """
        filepath = self._get_domain_file(domain_id)
        if not filepath.exists():
            return pd.DataFrame()

        # Blank lines (e.g. from manual edits) are skipped rather than
        # crashing json.loads.
        with open(filepath, encoding="utf-8") as f:
            records = [json.loads(line) for line in f if line.strip()]

        if format != "dpo":
            return pd.DataFrame(records)

        # Direct Preference Optimization format: ties, "both_bad" and skipped
        # records carry no chosen/rejected signal and are dropped.
        data = []
        for r in records:
            pref = r.get("preference")
            if pref not in ("A", "B"):
                continue
            if pref == "A":
                chosen, rejected = r["response_a"], r["response_b"]
            else:
                chosen, rejected = r["response_b"], r["response_a"]
            data.append({
                "prompt": r["prompt"],
                "chosen": chosen,
                "rejected": rejected,
                "domain": r["domain"],
                "category": r.get("category", "unknown"),
            })
        return pd.DataFrame(data)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
class CollectionApp:
    """Multi-domain preference collection application.

    Holds the preference store, one prompt generator per ``DomainID`` member,
    and the prompt/response pair currently shown to the annotator.
    """

    def __init__(self):
        """Set up storage and build a prompt generator for every domain."""
        self.store = PreferenceStore()
        # Pair currently awaiting annotation; None until one is loaded.
        self.current_pair = None
        self.generators = {
            domain_id: DomainPromptGenerator(domain_id)
            for domain_id in DomainID
        }

    def get_next_pair(self, domain: str, category: Optional[str] = None) -> tuple:
        """Pick a prompt for ``domain`` and build a response pair for it.

        The pair is remembered in ``self.current_pair`` so a later submit or
        skip can be attached to it.

        Args:
            domain: Value of a ``DomainID`` member.
            category: Prompt category; ``"all"`` or ``None`` means no filter.

        Returns:
            ``(prompt, response_a, response_b)``.

        Raises:
            ValueError: If ``domain`` is not a valid ``DomainID`` value.
        """
        domain_id = DomainID(domain)
        generator = self.generators[domain_id]

        # "all" is the UI's sentinel for "no category filter".
        prompt_data = generator.get_random_prompt(category if category != "all" else None)
        prompt = prompt_data["prompt"]

        # For demo, generate placeholder responses.
        # In production, call your generator model (Ollama, API, etc.)
        response_a = f"""[Response A]

This is a placeholder response for the prompt. In production, this would be generated by your LLM (e.g., Ollama llama3.1:8b).

The response would address: {prompt[:100]}...

To enable real generation:
1. Set up Ollama: `ollama serve && ollama pull llama3.1:8b`
2. Uncomment the generation code in this file
3. Responses will be generated with different temperatures for quality variance"""

        response_b = f"""[Response B]

This is an alternative placeholder response. In production, this would be generated with higher temperature (0.9) to create natural quality variance.

The response addresses: {prompt[:100]}...

Quality differences emerge from:
- Temperature variation (0.3 vs 0.9)
- Token limits (1024 vs 512)
- Different model checkpoints"""

        self.current_pair = {
            "domain": domain,
            "category": prompt_data.get("category", "unknown"),
            "prompt": prompt,
            "response_a": response_a,
            "response_b": response_b,
        }

        return prompt, response_a, response_b

    def submit_preference(self,
                          annotator_id: str,
                          preference: str,
                          dimension_scores: dict,
                          notes: str) -> str:
        """Validate and store an annotation for the current pair.

        Args:
            annotator_id: Annotator identifier; surrounding whitespace is
                stripped and an empty result is rejected.
            preference: The annotator's choice (the UI offers A, B, tie,
                both_bad).
            dimension_scores: Per-dimension ratings stored with the record.
            notes: Free-text notes stored with the record.

        Returns:
            A status message: an error prefixed with "❌" when validation
            fails, otherwise a confirmation with the record hash and the
            domain's record count.
        """
        if not self.current_pair:
            return "❌ No active pair. Load a new pair first."

        # Treat None and whitespace-only IDs as missing so records are never
        # attributed to a blank annotator.
        annotator_id = (annotator_id or "").strip()
        if not annotator_id:
            return "❌ Please enter your annotator ID."

        if not preference:
            return "❌ Please select a preference (A, B, tie, or both_bad)."

        record = {
            **self.current_pair,
            "annotator_id": annotator_id,
            "preference": preference,
            "dimension_scores": dimension_scores,
            "notes": notes,
        }

        record_hash = self.store.save(record)
        domain = self.current_pair["domain"]
        stats = self.store.get_stats(domain)

        return f"✓ Saved [{record_hash}]. Domain total: {stats.get(domain, 0)}"

    def get_rubric(self, domain: str) -> str:
        """Return the quality rubric for ``domain``.

        Returns the string "Invalid domain" when a ``ValueError`` is raised
        while resolving the domain or building the rubric.
        """
        try:
            domain_id = DomainID(domain)
            return get_quality_rubric(domain_id)
        except ValueError:
            return "Invalid domain"
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def create_ui():
    """Build the Gradio Blocks interface for preference collection.

    Creates a ``CollectionApp``, lays out the annotator/domain controls, the
    prompt and the two responses, the evaluation widgets, and wires the
    load / submit / skip buttons to the app.

    Returns:
        The ``gr.Blocks`` object; the caller is responsible for launching it.
    """
    app = CollectionApp()

    # (label, value) pairs: human-readable enum name, stored enum value.
    domain_choices = [(d.name.replace("_", " ").title(), d.value) for d in DomainID]

    with gr.Blocks(title="Zuup Preference Collection", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🎯 Zuup Domain-Specific Preference Collection")
        gr.Markdown("Collect human preferences for training domain-expert AI systems.")

        with gr.Row():
            with gr.Column(scale=1):
                annotator_id = gr.Textbox(
                    label="Annotator ID",
                    placeholder="your_name",
                    info="Your unique identifier for tracking"
                )
                # NOTE(review): the default assumes "procurement" is one of
                # the DomainID values — confirm against domains/taxonomy.py.
                domain_select = gr.Dropdown(
                    choices=domain_choices,
                    label="Domain",
                    value="procurement"
                )
                category_select = gr.Dropdown(
                    choices=["all"],
                    label="Category",
                    value="all"
                )
                load_btn = gr.Button("🔄 Load New Pair", variant="primary")

            with gr.Column(scale=3):
                stats_display = gr.Markdown("*Click 'Load New Pair' to start*")

        with gr.Row():
            with gr.Column():
                prompt_display = gr.Textbox(
                    label="📝 Prompt",
                    lines=4,
                    interactive=False
                )

        with gr.Row():
            with gr.Column():
                response_a = gr.Textbox(
                    label="Response A",
                    lines=12,
                    interactive=False
                )
            with gr.Column():
                response_b = gr.Textbox(
                    label="Response B",
                    lines=12,
                    interactive=False
                )

        gr.Markdown("### ⚖️ Evaluation")

        with gr.Row():
            with gr.Column():
                preference = gr.Radio(
                    choices=["A", "B", "tie", "both_bad"],
                    label="Which response is better?",
                    info="Select the better response or indicate a tie/both bad"
                )
            with gr.Column():
                # Dimension scoring
                dim_accuracy = gr.Slider(1, 5, step=1, label="Accuracy/Correctness", value=3)
                dim_safety = gr.Slider(1, 5, step=1, label="Safety/Compliance", value=3)
                dim_actionability = gr.Slider(1, 5, step=1, label="Actionability", value=3)
                dim_clarity = gr.Slider(1, 5, step=1, label="Clarity", value=3)

        notes = gr.Textbox(
            label="Notes (optional)",
            placeholder="Any observations about quality differences...",
            lines=2
        )

        with gr.Row():
            submit_btn = gr.Button("✅ Submit Preference", variant="primary", size="lg")
            skip_btn = gr.Button("⏭️ Skip (Low Quality Pair)", variant="secondary")

        output = gr.Textbox(label="Status", interactive=False)

        with gr.Accordion("📋 Quality Rubric (click to expand)", open=False):
            rubric_display = gr.Markdown()

        # Update categories when domain changes
        def update_categories(domain):
            """Return a category dropdown listing the seed categories of ``domain``."""
            try:
                domain_id = DomainID(domain)
            except ValueError:
                # Unknown or empty domain: fall back to the unfiltered choice
                # instead of swallowing every exception type.
                return gr.Dropdown(choices=["all"], value="all")
            if domain_id in SEED_PROMPTS:
                cats = list(SEED_PROMPTS[domain_id].keys())
                return gr.Dropdown(choices=["all"] + cats, value="all")
            return gr.Dropdown(choices=["all"], value="all")

        domain_select.change(
            update_categories,
            inputs=[domain_select],
            outputs=[category_select]
        )

        # Load new pair
        def load_pair(domain, category):
            """Fetch a new pair and refresh the stats line and rubric."""
            prompt, resp_a, resp_b = app.get_next_pair(domain, category)
            stats = app.store.get_stats()
            if stats:
                stats_md = "**📊 Collection Stats:** " + ", ".join([f"{k}: {v}" for k, v in stats.items()])
            else:
                stats_md = "**📊 Collection Stats:** No data yet"
            rubric = app.get_rubric(domain)
            return prompt, resp_a, resp_b, stats_md, rubric

        load_btn.click(
            load_pair,
            inputs=[domain_select, category_select],
            outputs=[prompt_display, response_a, response_b, stats_display, rubric_display]
        )

        # Submit preference
        def submit(annotator, pref, acc, safety, action, clarity, notes_text):
            """Bundle the slider values and hand the annotation to the app."""
            dims = {
                "accuracy": acc,
                "safety": safety,
                "actionability": action,
                "clarity": clarity
            }
            return app.submit_preference(annotator, pref, dims, notes_text)

        submit_btn.click(
            submit,
            inputs=[annotator_id, preference, dim_accuracy, dim_safety, dim_actionability, dim_clarity, notes],
            outputs=[output]
        )

        # Skip
        def skip(annotator):
            """Log the current pair as skipped; anonymous if no ID was given."""
            if app.current_pair:
                record = {
                    **app.current_pair,
                    "annotator_id": annotator or "anonymous",
                    "preference": "skipped",
                    "skip_reason": "low_quality_pair"
                }
                app.store.save(record)
                return "⏭️ Skipped and logged."
            return "No pair to skip."

        skip_btn.click(skip, inputs=[annotator_id], outputs=[output])

    return demo
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
if __name__ == "__main__":
    # Startup banner, printed line by line before Gradio takes over stdout.
    banner = "=" * 50
    for line in (
        banner,
        "🎯 Zuup Preference Collection",
        banner,
        "\nStarting Gradio server...",
        "Local URL: http://127.0.0.1:7860",
        "Share URL: Will be generated below\n",
    ):
        print(line)

    demo = create_ui()
    # share=True creates a public URL for annotators; the server binds to
    # all interfaces on port 7860.
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    demo.launch(**launch_options)
|
prompt_generator.py
ADDED
|
@@ -0,0 +1,267 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# domains/prompt_generator.py — Generate domain-specific prompts
|
| 2 |
+
import random
from typing import Dict, List, Optional

from domains.taxonomy import DomainID, DOMAINS
|
| 5 |
+
|
| 6 |
+
# Seed prompts per domain.
#
# Layout: {DomainID: {category_name: [prompt, ...]}}.
# Every category listed here carries three hand-written prompts.
# DomainPromptGenerator looks a domain up with SEED_PROMPTS.get(domain_id, {}),
# so a domain missing from this table simply yields no prompts.
SEED_PROMPTS: Dict[DomainID, Dict[str, List[str]]] = {

    # Federal/SLED procurement
    DomainID.FED_SLED_PROCUREMENT: {
        "RFP_analysis": [
            "Analyze this RFP for a cloud migration contract. What are the key evaluation factors and how should we weight our response?",
            "The solicitation mentions 'best value' but doesn't specify weights. How should we interpret this?",
            "What are the protest risks in this sole-source justification?",
        ],
        "proposal_writing": [
            "Write a technical approach section for a cybersecurity assessment contract.",
            "How should we structure our past performance volume for a DoD contract?",
            "Draft an executive summary for a $50M IT modernization proposal.",
        ],
        "compliance_check": [
            "Review this subcontracting plan for FAR 52.219-9 compliance.",
            "Does our teaming arrangement create an OCI? How do we mitigate?",
            "What CMMC level is required for this CUI-handling contract?",
        ],
    },

    # Gut-brain computer interface (external biosensors)
    DomainID.BIOMEDICAL_GBCI: {
        "signal_processing": [
            "Design a filtering pipeline for EGG signals to extract gastric slow wave activity.",
            "How do I handle motion artifacts in wearable gut biosensor data?",
            "What's the optimal sampling rate for detecting gut-brain vagal signaling?",
        ],
        "microbiome_analysis": [
            "Design a study to correlate gut microbiome composition with anxiety symptoms.",
            "What are the confounders in microbiome-mood association studies?",
            "How should we handle the compositional nature of 16S data in our analysis?",
        ],
        "regulatory_pathway": [
            "What FDA classification would a gut motility monitoring patch fall under?",
            "Design a clinical validation study for a gut-brain biomarker device.",
            "What's the predicate device strategy for a novel intestinal biosensor?",
        ],
    },

    # Ingestible gut-brain interface (capsule hardware)
    DomainID.INGESTIBLE_GBCI: {
        "capsule_design": [
            "What are the size constraints for an ingestible capsule to ensure safe GI transit?",
            "Design a biocompatible encapsulation strategy for an electronic capsule.",
            "How do we ensure the capsule passes naturally without retention?",
        ],
        "telemetry": [
            "Calculate the RF link budget for in-body to external receiver communication.",
            "What frequencies are approved for medical ingestible device telemetry?",
            "Design a low-power protocol for continuous gut parameter transmission.",
        ],
        "clinical_validation": [
            "Design a clinical study comparing our ingestible sensor to colonoscopy.",
            "What are the primary endpoints for an ingestible gut motility monitor trial?",
            "How do we handle capsule retention as an adverse event in our protocol?",
        ],
    },

    # Legacy system refactoring (COBOL / mainframe)
    DomainID.LEGACY_REFACTORING: {
        "code_translation": [
            "Translate this COBOL PERFORM VARYING loop to Python.",
            "How do I handle COBOL REDEFINES clauses in a modern data model?",
            "Convert this CICS transaction to a REST API while preserving semantics.",
        ],
        "testing_strategy": [
            "Design characterization tests for a COBOL batch job with no documentation.",
            "How do we ensure decimal precision parity between COBOL COMP-3 and Python?",
            "Create a parallel run strategy to validate our migrated system.",
        ],
        "strangler_pattern": [
            "Design a strangler fig architecture for migrating a mainframe banking system.",
            "How do we route traffic between legacy and new systems during migration?",
            "What's the rollback strategy if the new component fails in production?",
        ],
    },

    # Autonomous agent systems
    DomainID.AUTONOMY_OS: {
        "agent_design": [
            "Design a tool permission system for an autonomous coding agent.",
            "How should multi-agent systems handle conflicting goals?",
            "What's the architecture for a self-improving agent with safety constraints?",
        ],
        "safety_constraints": [
            "Implement a human approval gate for high-impact autonomous actions.",
            "How do we ensure an agent can always be shut down?",
            "Design a monitoring system to detect agent capability jumps.",
        ],
        "capability_assessment": [
            "How do we measure if an autonomous agent is safe to deploy?",
            "What benchmarks should we use for tool-use safety evaluation?",
            "Design an eval suite for multi-agent coordination correctness.",
        ],
    },

    # Historical reconstruction
    DomainID.QUANTUM_ARCHAEOLOGY: {
        "event_reconstruction": [
            "Reconstruct the logistics of Alexander's army crossing the Hindu Kush.",
            "What's the uncertainty range for the population of Rome in 100 CE?",
            "Synthesize archaeological and textual evidence for the Exodus route.",
        ],
        "source_analysis": [
            "How should we weight Herodotus vs archaeological evidence for Persian forces at Thermopylae?",
            "Design a provenance tracking system for historical source documents.",
            "What's the methodology for detecting interpolations in ancient manuscripts?",
        ],
        "uncertainty_modeling": [
            "Build a Bayesian model for dating the Thera eruption.",
            "How do we quantify uncertainty in historical population estimates?",
            "Design a confidence framework for AI-reconstructed historical events.",
        ],
    },

    # Defense world models (3D scene reconstruction)
    DomainID.DEFENSE_WORLD_MODELS: {
        "scene_reconstruction": [
            "Design a pipeline for 3D reconstruction from drone imagery in contested environments.",
            "How do we handle GPS-denied localization for world model construction?",
            "What's the uncertainty quantification approach for terrain reconstruction?",
        ],
        "sensor_fusion": [
            "Fuse EO, IR, and SAR data for a unified 3D scene representation.",
            "How do we handle temporal misalignment in multi-sensor fusion?",
            "Design a confidence metric for fused intelligence products.",
        ],
        "tactical_planning": [
            "Generate terrain analysis for route planning with concealment optimization.",
            "How should the world model support line-of-sight calculations?",
            "Design an interface for human-AI collaborative mission planning.",
        ],
    },

    # Halal certification and supply chain
    DomainID.HALAL_COMPLIANCE: {
        "ingredient_analysis": [
            "Analyze this ingredient list for halal compliance across GSO and JAKIM standards.",
            "How do we handle E471 (mono- and diglycerides) which may be plant or animal derived?",
            "What's the ruling on alcohol in vanilla extract under different madhabs?",
        ],
        "certification_mapping": [
            "Map our product certification to OIC/SMIIC mutual recognition requirements.",
            "What additional testing is required for UAE vs Malaysian halal certification?",
            "Design a system to track certification status across multiple jurisdictions.",
        ],
        "supply_chain": [
            "Design a blockchain-based provenance system for halal meat supply chain.",
            "How do we prevent cross-contamination in shared manufacturing facilities?",
            "What's the audit protocol for verifying halal slaughter compliance?",
        ],
    },

    # Mobile data center
    DomainID.MOBILE_DATA_CENTER: {
        "architecture_design": [
            "Design a compute architecture for a 20kW mobile data center in a transit case.",
            "How do we handle storage redundancy in a single-node deployable unit?",
            "What's the network topology for a mesh of mobile data centers?",
        ],
        "power_systems": [
            "Calculate the power budget for a GPU-heavy edge AI workload in a PodX unit.",
            "Design a power management strategy for generator + battery hybrid operation.",
            "How do we handle graceful shutdown on power loss?",
        ],
        "ddil_operations": [
            "Design a data synchronization strategy for intermittent connectivity.",
            "How should applications degrade gracefully in bandwidth-limited scenarios?",
            "What's the PACE plan for a deployed mobile data center?",
        ],
    },

    # HUBZone eligibility and contracting
    DomainID.HUBZONE: {
        "eligibility_assessment": [
            "Does our company qualify for HUBZone if 30% of employees live in the zone but we're headquartered outside?",
            "How do we count remote employees for HUBZone residency calculation?",
            "What happens to our certification if the HUBZone map is redrawn?",
        ],
        "contracting_strategy": [
            "Identify HUBZone set-aside opportunities matching our IT capabilities.",
            "How do we compete effectively when a HUBZone contract is full and open?",
            "Design a teaming strategy that preserves our HUBZone status.",
        ],
        "compliance_maintenance": [
            "Create an annual recertification checklist for HUBZone compliance.",
            "How do we document employee residency for SBA audit?",
            "What triggers require us to notify SBA of material changes?",
        ],
    },
}
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
class DomainPromptGenerator:
    """Serve seed prompts for a single domain.

    Instance attributes set by ``__init__``:

    * ``domain`` -- the ``DOMAINS`` entry for the requested domain.
    * ``seed_prompts`` -- ``{category: [prompt, ...]}`` taken from
      ``SEED_PROMPTS``; an empty dict when the domain has no seeds.
    """

    # Category label reported when a prompt is drawn from the pooled categories.
    _MIXED_CATEGORY = "mixed"

    # Rewrite instruction prepended to a prompt, keyed by evolution type.
    _EVOLUTION_INSTRUCTIONS = {
        "complexity": "Make this task more complex by adding regulatory constraints:",
        "specificity": "Make this more specific with concrete numbers and requirements:",
        "constraint": "Add a difficult constraint that requires creative problem-solving:",
        "multi_step": "Expand this into a multi-step problem requiring planning:",
    }

    def __init__(self, domain_id: "DomainID"):
        """Bind the generator to ``domain_id``.

        Raises:
            KeyError: if ``domain_id`` has no entry in ``DOMAINS``.
        """
        self.domain = DOMAINS[domain_id]
        self.seed_prompts = SEED_PROMPTS.get(domain_id, {})

    def get_random_prompt(self, category: Optional[str] = None) -> dict:
        """Return one randomly chosen seed prompt plus annotation context.

        Args:
            category: Seed category to draw from. When it is omitted, empty,
                or has no seed prompts for this domain, the prompt is drawn
                from all categories pooled together.

        Returns:
            A dict with the keys ``domain``, ``category``, ``prompt``,
            ``quality_dimensions`` and ``key_terms``, or
            ``{"error": ...}`` when there is nothing to choose from.
            ``category`` echoes the requested category only when the prompt
            really came from it; pooled draws are labelled ``"mixed"``.
        """
        if category and category in self.seed_prompts:
            pool = self.seed_prompts[category]
            label = category
        else:
            # Fall back to every category. The label must say so: echoing an
            # unknown requested category would tag the prompt with a category
            # it does not belong to.
            pool = [p for group in self.seed_prompts.values() for p in group]
            label = self._MIXED_CATEGORY

        if not pool:
            return {"error": "No prompts available for this domain"}

        return {
            "domain": self.domain.id.value,
            "category": label,
            "prompt": random.choice(pool),
            "quality_dimensions": [d.name for d in self.domain.dimensions],
            # Copy so callers cannot mutate the shared domain specification.
            "key_terms": dict(self.domain.key_terms),
        }

    def get_all_prompts(self) -> List[dict]:
        """Return every seed prompt as ``{"domain", "category", "prompt"}``.

        Records follow the iteration order of ``seed_prompts`` and, within a
        category, the order of its prompt list.
        """
        domain_value = self.domain.id.value
        return [
            {"domain": domain_value, "category": category, "prompt": prompt}
            for category, prompts in self.seed_prompts.items()
            for prompt in prompts
        ]

    def evolve_prompt(self, base_prompt: str, evolution_type: str = "complexity") -> str:
        """Wrap ``base_prompt`` in an Evol-Instruct style rewrite instruction.

        Args:
            base_prompt: The prompt to evolve.
            evolution_type: One of ``complexity``, ``specificity``,
                ``constraint`` or ``multi_step``.

        Returns:
            The instruction, a blank line, then ``base_prompt``. An
            unrecognised ``evolution_type`` returns ``base_prompt`` unchanged.
        """
        instruction = self._EVOLUTION_INSTRUCTIONS.get(evolution_type)
        if instruction is None:
            return base_prompt
        return f"{instruction}\n\n{base_prompt}"
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def generate_response_pair(prompt: str, generator_model, temperature_high: float = 0.9) -> tuple:
    """Produce two candidate responses to ``prompt`` in shuffled order.

    ``generator_model.generate`` is called twice, first with
    ``temperature=0.3, max_tokens=1024`` and then with
    ``temperature=temperature_high, max_tokens=512``.

    Returns:
        ``(first, second, label)``. ``label`` is ``"A"`` when the
        temperature-0.3 response is ``first`` and ``"B"`` when it is
        ``second``; the slot is picked with ``random.random()`` so the
        position of that response varies between calls.
    """
    sampling_plan = (
        {"temperature": 0.3, "max_tokens": 1024},               # conservative draw
        {"temperature": temperature_high, "max_tokens": 512},   # exploratory draw
    )
    conservative, exploratory = [
        generator_model.generate(prompt, **settings) for settings in sampling_plan
    ]

    # Shuffle presentation order so annotators cannot learn a position bias.
    conservative_first = random.random() > 0.5
    if not conservative_first:
        return exploratory, conservative, "B"
    return conservative, exploratory, "A"
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Zuup Preference Collection - Dependencies
|
| 2 |
+
gradio>=4.0.0
|
| 3 |
+
pandas>=2.0.0
|
| 4 |
+
numpy>=1.24.0
|
taxonomy.py
ADDED
|
@@ -0,0 +1,720 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# domains/taxonomy.py — Domain definitions and quality criteria
|
| 2 |
+
from dataclasses import dataclass, field
|
| 3 |
+
from enum import Enum
|
| 4 |
+
from typing import List, Dict
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class DomainID(Enum):
    """Identifiers for the ten preference-collection domains.

    Each member's value is a short string slug for that domain.
    """
    FED_SLED_PROCUREMENT = "procurement"
    BIOMEDICAL_GBCI = "gbci"
    LEGACY_REFACTORING = "legacy"
    AUTONOMY_OS = "autonomy"
    QUANTUM_ARCHAEOLOGY = "qawm"
    DEFENSE_WORLD_MODELS = "defense_wm"
    HALAL_COMPLIANCE = "halal"
    MOBILE_DATA_CENTER = "podx"
    HUBZONE = "hubzone"
    INGESTIBLE_GBCI = "ingestible"
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@dataclass
class QualityDimension:
    """One weighted criterion used to judge responses within a domain."""
    name: str  # short identifier, e.g. "regulatory_accuracy"
    description: str  # what the criterion measures
    weight: float  # 0.0-1.0, must sum to 1.0 across dimensions (not checked by this class)
    examples_good: List[str]  # sample behaviours that satisfy the criterion
    examples_bad: List[str]  # sample behaviours that violate the criterion
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@dataclass
class DomainSpec:
    """Full specification of one domain: quality criteria, safety and
    compliance notes, annotator requirements, prompt categories and glossary.
    """
    id: DomainID
    name: str  # human-readable domain name
    description: str
    zuup_platform: str  # Which Zuup platform this maps to

    # Quality criteria
    dimensions: List[QualityDimension]

    # Safety & compliance
    safety_considerations: List[str]
    compliance_frameworks: List[str]

    # Annotation requirements
    required_expertise: List[str]
    min_annotator_agreement: float  # Krippendorff's alpha threshold

    # Prompt categories
    prompt_categories: List[str]

    # Domain-specific terminology (term -> expansion); defaults to a fresh
    # empty dict per instance
    key_terms: Dict[str, str] = field(default_factory=dict)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# === Domain Specifications ===
|
| 55 |
+
|
| 56 |
+
DOMAINS: Dict[DomainID, DomainSpec] = {
|
| 57 |
+
|
| 58 |
+
DomainID.FED_SLED_PROCUREMENT: DomainSpec(
|
| 59 |
+
id=DomainID.FED_SLED_PROCUREMENT,
|
| 60 |
+
name="Federal/SLED Procurement",
|
| 61 |
+
description="Government procurement, contracting, FAR/DFARS compliance, proposal writing, and acquisition strategy.",
|
| 62 |
+
zuup_platform="Aureon",
|
| 63 |
+
dimensions=[
|
| 64 |
+
QualityDimension(
|
| 65 |
+
name="regulatory_accuracy",
|
| 66 |
+
description="Correct citation and interpretation of FAR/DFARS, CJIS, FedRAMP requirements",
|
| 67 |
+
weight=0.30,
|
| 68 |
+
examples_good=["Correctly cites FAR 15.306 for competitive range determination"],
|
| 69 |
+
examples_bad=["Vague reference to 'federal regulations' without specifics"]
|
| 70 |
+
),
|
| 71 |
+
QualityDimension(
|
| 72 |
+
name="actionability",
|
| 73 |
+
description="Provides concrete, implementable steps for procurement actions",
|
| 74 |
+
weight=0.25,
|
| 75 |
+
examples_good=["Step-by-step RFP response checklist with deadlines"],
|
| 76 |
+
examples_bad=["Generic advice to 'review the solicitation carefully'"]
|
| 77 |
+
),
|
| 78 |
+
QualityDimension(
|
| 79 |
+
name="traceability",
|
| 80 |
+
description="Maintains clear audit trail and decision rationale",
|
| 81 |
+
weight=0.20,
|
| 82 |
+
examples_good=["Documents evaluation criteria mapping to PWS requirements"],
|
| 83 |
+
examples_bad=["Scores proposals without explaining methodology"]
|
| 84 |
+
),
|
| 85 |
+
QualityDimension(
|
| 86 |
+
name="risk_awareness",
|
| 87 |
+
description="Identifies compliance risks, protest grounds, and mitigation strategies",
|
| 88 |
+
weight=0.15,
|
| 89 |
+
examples_good=["Flags OCI concerns with specific mitigation plan"],
|
| 90 |
+
examples_bad=["Ignores potential bid protest vulnerabilities"]
|
| 91 |
+
),
|
| 92 |
+
QualityDimension(
|
| 93 |
+
name="clarity",
|
| 94 |
+
description="Clear, professional communication suitable for government audiences",
|
| 95 |
+
weight=0.10,
|
| 96 |
+
examples_good=["Structured proposal section with clear headers"],
|
| 97 |
+
examples_bad=["Jargon-heavy text without definitions"]
|
| 98 |
+
),
|
| 99 |
+
],
|
| 100 |
+
safety_considerations=[
|
| 101 |
+
"No disclosure of procurement-sensitive information",
|
| 102 |
+
"No advice that could constitute bid-rigging",
|
| 103 |
+
"No circumvention of competition requirements",
|
| 104 |
+
"Protect source selection information"
|
| 105 |
+
],
|
| 106 |
+
compliance_frameworks=["FAR", "DFARS", "CJIS", "FedRAMP", "CMMC", "Section 508"],
|
| 107 |
+
required_expertise=["Government contracting experience", "FAR/DFARS familiarity"],
|
| 108 |
+
min_annotator_agreement=0.7,
|
| 109 |
+
prompt_categories=[
|
| 110 |
+
"RFP_analysis", "proposal_writing", "pricing_strategy",
|
| 111 |
+
"compliance_check", "protest_risk", "teaming_agreements",
|
| 112 |
+
"past_performance", "capability_statements", "CPARS_response"
|
| 113 |
+
],
|
| 114 |
+
key_terms={
|
| 115 |
+
"PWS": "Performance Work Statement",
|
| 116 |
+
"LPTA": "Lowest Price Technically Acceptable",
|
| 117 |
+
"OCI": "Organizational Conflict of Interest",
|
| 118 |
+
"CPARS": "Contractor Performance Assessment Reporting System"
|
| 119 |
+
}
|
| 120 |
+
),
|
| 121 |
+
|
| 122 |
+
DomainID.BIOMEDICAL_GBCI: DomainSpec(
|
| 123 |
+
id=DomainID.BIOMEDICAL_GBCI,
|
| 124 |
+
name="Gut-Brain Computer Interface (External)",
|
| 125 |
+
description="Biosensor systems, gut microbiome analysis, neural signal processing, and brain-gut axis research.",
|
| 126 |
+
zuup_platform="Symbion",
|
| 127 |
+
dimensions=[
|
| 128 |
+
QualityDimension(
|
| 129 |
+
name="scientific_accuracy",
|
| 130 |
+
description="Correct understanding of gut-brain axis physiology, microbiome science",
|
| 131 |
+
weight=0.30,
|
| 132 |
+
examples_good=["Accurate description of vagal afferent signaling pathways"],
|
| 133 |
+
examples_bad=["Conflates correlation with causation in microbiome studies"]
|
| 134 |
+
),
|
| 135 |
+
QualityDimension(
|
| 136 |
+
name="safety_primacy",
|
| 137 |
+
description="Prioritizes patient/user safety, acknowledges limitations",
|
| 138 |
+
weight=0.25,
|
| 139 |
+
examples_good=["Recommends physician consultation before intervention"],
|
| 140 |
+
examples_bad=["Suggests unvalidated treatments without disclaimers"]
|
| 141 |
+
),
|
| 142 |
+
QualityDimension(
|
| 143 |
+
name="technical_rigor",
|
| 144 |
+
description="Correct signal processing, biosensor engineering principles",
|
| 145 |
+
weight=0.20,
|
| 146 |
+
examples_good=["Proper SNR calculations for biosensor specifications"],
|
| 147 |
+
examples_bad=["Ignores noise floor in sensitivity claims"]
|
| 148 |
+
),
|
| 149 |
+
QualityDimension(
|
| 150 |
+
name="regulatory_awareness",
|
| 151 |
+
description="Understands FDA pathways, IRB requirements, HIPAA constraints",
|
| 152 |
+
weight=0.15,
|
| 153 |
+
examples_good=["Identifies 510(k) predicate device strategy"],
|
| 154 |
+
examples_bad=["Ignores medical device classification requirements"]
|
| 155 |
+
),
|
| 156 |
+
QualityDimension(
|
| 157 |
+
name="ethical_grounding",
|
| 158 |
+
description="Addresses informed consent, data privacy, vulnerable populations",
|
| 159 |
+
weight=0.10,
|
| 160 |
+
examples_good=["Discusses consent protocols for cognitive research"],
|
| 161 |
+
examples_bad=["No mention of data governance for health data"]
|
| 162 |
+
),
|
| 163 |
+
],
|
| 164 |
+
safety_considerations=[
|
| 165 |
+
"No medical advice without appropriate disclaimers",
|
| 166 |
+
"No claims of FDA approval without evidence",
|
| 167 |
+
"Acknowledge research vs clinical evidence distinction",
|
| 168 |
+
"Protect PHI in all examples",
|
| 169 |
+
"No promotion of unvalidated interventions"
|
| 170 |
+
],
|
| 171 |
+
compliance_frameworks=["FDA 21 CFR", "HIPAA", "IRB/Common Rule", "GDPR (health data)", "ISO 13485"],
|
| 172 |
+
required_expertise=["Biomedical engineering", "Neuroscience background", "Regulatory familiarity"],
|
| 173 |
+
min_annotator_agreement=0.75,
|
| 174 |
+
prompt_categories=[
|
| 175 |
+
"signal_processing", "microbiome_analysis", "biosensor_design",
|
| 176 |
+
"clinical_study_design", "regulatory_pathway", "data_architecture",
|
| 177 |
+
"neural_decoding", "intervention_protocols"
|
| 178 |
+
],
|
| 179 |
+
key_terms={
|
| 180 |
+
"ENS": "Enteric Nervous System",
|
| 181 |
+
"SCFAs": "Short-Chain Fatty Acids",
|
| 182 |
+
"HRV": "Heart Rate Variability",
|
| 183 |
+
"EGG": "Electrogastrography"
|
| 184 |
+
}
|
| 185 |
+
),
|
| 186 |
+
|
| 187 |
+
DomainID.INGESTIBLE_GBCI: DomainSpec(
|
| 188 |
+
id=DomainID.INGESTIBLE_GBCI,
|
| 189 |
+
name="Ingestible Gut-Brain Interface",
|
| 190 |
+
description="Ingestible biosensors, capsule endoscopy, in-vivo diagnostics, and wireless telemetry.",
|
| 191 |
+
zuup_platform="Symbion (Hardware)",
|
| 192 |
+
dimensions=[
|
| 193 |
+
QualityDimension(
|
| 194 |
+
name="biocompatibility",
|
| 195 |
+
description="Materials safety, degradation pathways, toxicity considerations",
|
| 196 |
+
weight=0.25,
|
| 197 |
+
examples_good=["Specifies USP Class VI materials for encapsulation"],
|
| 198 |
+
examples_bad=["Ignores GI transit variability in design"]
|
| 199 |
+
),
|
| 200 |
+
QualityDimension(
|
| 201 |
+
name="engineering_feasibility",
|
| 202 |
+
description="Realistic power budgets, form factors, telemetry constraints",
|
| 203 |
+
weight=0.25,
|
| 204 |
+
examples_good=["Calculates RF link budget for in-body transmission"],
|
| 205 |
+
examples_bad=["Assumes unlimited battery life in capsule form"]
|
| 206 |
+
),
|
| 207 |
+
QualityDimension(
|
| 208 |
+
name="clinical_validity",
|
| 209 |
+
description="Correlation with gold-standard diagnostics, clinical utility",
|
| 210 |
+
weight=0.20,
|
| 211 |
+
examples_good=["Validates against colonoscopy findings"],
|
| 212 |
+
examples_bad=["Claims diagnostic accuracy without clinical trial data"]
|
| 213 |
+
),
|
| 214 |
+
QualityDimension(
|
| 215 |
+
name="regulatory_pathway",
|
| 216 |
+
description="Clear FDA classification, predicate strategy, clinical evidence requirements",
|
| 217 |
+
weight=0.20,
|
| 218 |
+
examples_good=["Maps to PillCam as predicate for 510(k)"],
|
| 219 |
+
examples_bad=["Ignores De Novo pathway for novel devices"]
|
| 220 |
+
),
|
| 221 |
+
QualityDimension(
|
| 222 |
+
name="safety_engineering",
|
| 223 |
+
description="Failure modes, retention protocols, emergency procedures",
|
| 224 |
+
weight=0.10,
|
| 225 |
+
examples_good=["Defines capsule retention management protocol"],
|
| 226 |
+
examples_bad=["No consideration of obstruction scenarios"]
|
| 227 |
+
),
|
| 228 |
+
],
|
| 229 |
+
safety_considerations=[
|
| 230 |
+
"Capsule retention/obstruction protocols mandatory",
|
| 231 |
+
"Biocompatibility testing requirements",
|
| 232 |
+
"Wireless emission limits (SAR)",
|
| 233 |
+
"Contraindications for GI pathology",
|
| 234 |
+
"No pediatric use without specific validation"
|
| 235 |
+
],
|
| 236 |
+
compliance_frameworks=["FDA 21 CFR 876", "IEC 60601", "ISO 10993", "FCC Part 95"],
|
| 237 |
+
required_expertise=["Medical device engineering", "GI physiology", "RF engineering"],
|
| 238 |
+
min_annotator_agreement=0.8,
|
| 239 |
+
prompt_categories=[
|
| 240 |
+
"capsule_design", "power_systems", "telemetry", "biocompatibility",
|
| 241 |
+
"clinical_validation", "manufacturing", "regulatory_submission"
|
| 242 |
+
],
|
| 243 |
+
key_terms={
|
| 244 |
+
"GITT": "GI Transit Time",
|
| 245 |
+
"WCE": "Wireless Capsule Endoscopy",
|
| 246 |
+
"MICS": "Medical Implant Communication Service"
|
| 247 |
+
}
|
| 248 |
+
),
|
| 249 |
+
|
| 250 |
+
DomainID.LEGACY_REFACTORING: DomainSpec(
|
| 251 |
+
id=DomainID.LEGACY_REFACTORING,
|
| 252 |
+
name="Legacy System Refactoring",
|
| 253 |
+
description="COBOL migration, mainframe modernization, strangler pattern implementation, and technical debt reduction.",
|
| 254 |
+
zuup_platform="Relian",
|
| 255 |
+
dimensions=[
|
| 256 |
+
QualityDimension(
|
| 257 |
+
name="correctness_preservation",
|
| 258 |
+
description="Maintains functional equivalence with legacy system",
|
| 259 |
+
weight=0.30,
|
| 260 |
+
examples_good=["Characterization tests verify behavior parity"],
|
| 261 |
+
examples_bad=["Assumes modern code 'should work the same'"]
|
| 262 |
+
),
|
| 263 |
+
QualityDimension(
|
| 264 |
+
name="risk_mitigation",
|
| 265 |
+
description="Incremental migration, rollback strategies, blast radius containment",
|
| 266 |
+
weight=0.25,
|
| 267 |
+
examples_good=["Implements strangler fig with feature flags"],
|
| 268 |
+
examples_bad=["Big-bang migration with no fallback"]
|
| 269 |
+
),
|
| 270 |
+
QualityDimension(
|
| 271 |
+
name="technical_accuracy",
|
| 272 |
+
description="Correct understanding of COBOL, JCL, VSAM, CICS, IMS",
|
| 273 |
+
weight=0.20,
|
| 274 |
+
examples_good=["Handles COBOL COMP-3 packed decimal correctly"],
|
| 275 |
+
examples_bad=["Ignores EBCDIC encoding issues"]
|
| 276 |
+
),
|
| 277 |
+
QualityDimension(
|
| 278 |
+
name="business_continuity",
|
| 279 |
+
description="Maintains operations during migration, handles batch windows",
|
| 280 |
+
weight=0.15,
|
| 281 |
+
examples_good=["Parallel run strategy with reconciliation"],
|
| 282 |
+
examples_bad=["Requires production downtime for cutover"]
|
| 283 |
+
),
|
| 284 |
+
QualityDimension(
|
| 285 |
+
name="documentation",
|
| 286 |
+
description="Captures tribal knowledge, maps business rules",
|
| 287 |
+
weight=0.10,
|
| 288 |
+
examples_good=["Documents undocumented COPYBOOK business logic"],
|
| 289 |
+
examples_bad=["Assumes self-documenting code"]
|
| 290 |
+
),
|
| 291 |
+
],
|
| 292 |
+
safety_considerations=[
|
| 293 |
+
"Production data protection during migration",
|
| 294 |
+
"Audit trail continuity across systems",
|
| 295 |
+
"Compliance evidence preservation",
|
| 296 |
+
"No loss of business logic"
|
| 297 |
+
],
|
| 298 |
+
compliance_frameworks=["SOX", "PCI-DSS", "GLBA", "HIPAA (if healthcare)"],
|
| 299 |
+
required_expertise=["COBOL experience", "Mainframe operations", "Modern architecture"],
|
| 300 |
+
min_annotator_agreement=0.7,
|
| 301 |
+
prompt_categories=[
|
| 302 |
+
"code_translation", "data_migration", "testing_strategy",
|
| 303 |
+
"strangler_pattern", "batch_modernization", "API_wrapping",
|
| 304 |
+
"performance_parity", "knowledge_capture"
|
| 305 |
+
],
|
| 306 |
+
key_terms={
|
| 307 |
+
"COPYBOOK": "COBOL data structure definition",
|
| 308 |
+
"JCL": "Job Control Language",
|
| 309 |
+
"VSAM": "Virtual Storage Access Method",
|
| 310 |
+
"CICS": "Customer Information Control System"
|
| 311 |
+
}
|
| 312 |
+
),
|
| 313 |
+
|
| 314 |
+
DomainID.AUTONOMY_OS: DomainSpec(
|
| 315 |
+
id=DomainID.AUTONOMY_OS,
|
| 316 |
+
name="Autonomy OS / Post-ASI LLM",
|
| 317 |
+
description="Autonomous agent systems, tool use safety, multi-agent coordination, and post-superintelligence architectures.",
|
| 318 |
+
zuup_platform="Veyra",
|
| 319 |
+
dimensions=[
|
| 320 |
+
QualityDimension(
|
| 321 |
+
name="safety_alignment",
|
| 322 |
+
description="Proper constraints, human oversight, corrigibility",
|
| 323 |
+
weight=0.30,
|
| 324 |
+
examples_good=["Implements approval gates for high-impact actions"],
|
| 325 |
+
examples_bad=["Autonomous execution without human checkpoints"]
|
| 326 |
+
),
|
| 327 |
+
QualityDimension(
|
| 328 |
+
name="capability_grounding",
|
| 329 |
+
description="Realistic assessment of current vs speculative capabilities",
|
| 330 |
+
weight=0.25,
|
| 331 |
+
examples_good=["Clearly labels TRL for each capability claim"],
|
| 332 |
+
examples_bad=["Conflates research concepts with production readiness"]
|
| 333 |
+
),
|
| 334 |
+
QualityDimension(
|
| 335 |
+
name="tool_safety",
|
| 336 |
+
description="Proper sandboxing, permission models, rollback mechanisms",
|
| 337 |
+
weight=0.20,
|
| 338 |
+
examples_good=["Defines tool permission matrix with escalation"],
|
| 339 |
+
examples_bad=["Gives agents unrestricted filesystem access"]
|
| 340 |
+
),
|
| 341 |
+
QualityDimension(
|
| 342 |
+
name="coordination_correctness",
|
| 343 |
+
description="Multi-agent consensus, conflict resolution, resource management",
|
| 344 |
+
weight=0.15,
|
| 345 |
+
examples_good=["Implements Byzantine fault tolerance for agent voting"],
|
| 346 |
+
examples_bad=["Assumes agents always agree"]
|
| 347 |
+
),
|
| 348 |
+
QualityDimension(
|
| 349 |
+
name="interpretability",
|
| 350 |
+
description="Explainable decisions, audit trails, reasoning transparency",
|
| 351 |
+
weight=0.10,
|
| 352 |
+
examples_good=["Logs full reasoning chain for each action"],
|
| 353 |
+
examples_bad=["Black-box decision making"]
|
| 354 |
+
),
|
| 355 |
+
],
|
| 356 |
+
safety_considerations=[
|
| 357 |
+
"Human-in-the-loop for consequential decisions",
|
| 358 |
+
"Containment strategies for capability overhang",
|
| 359 |
+
"No self-modification without approval",
|
| 360 |
+
"Shutdown/rollback always available",
|
| 361 |
+
"Distinguish speculation from engineering"
|
| 362 |
+
],
|
| 363 |
+
compliance_frameworks=["NIST AI RMF", "EU AI Act (high-risk)", "DoD AI Ethics"],
|
| 364 |
+
required_expertise=["AI safety research", "Distributed systems", "Agent architectures"],
|
| 365 |
+
min_annotator_agreement=0.75,
|
| 366 |
+
prompt_categories=[
|
| 367 |
+
"agent_design", "tool_permissions", "multi_agent_coord",
|
| 368 |
+
"safety_constraints", "capability_assessment", "deployment_strategy",
|
| 369 |
+
"failure_modes", "alignment_verification"
|
| 370 |
+
],
|
| 371 |
+
key_terms={
|
| 372 |
+
"HITL": "Human-in-the-Loop",
|
| 373 |
+
"TRL": "Technology Readiness Level",
|
| 374 |
+
"Corrigibility": "Ability to be corrected/shut down"
|
| 375 |
+
}
|
| 376 |
+
),
|
| 377 |
+
|
| 378 |
+
DomainID.QUANTUM_ARCHAEOLOGY: DomainSpec(
|
| 379 |
+
id=DomainID.QUANTUM_ARCHAEOLOGY,
|
| 380 |
+
name="Quantum Archaeological World Models",
|
| 381 |
+
description="Historical event reconstruction, evidence synthesis, uncertainty quantification, and temporal reasoning.",
|
| 382 |
+
zuup_platform="QAWM / QAL",
|
| 383 |
+
dimensions=[
|
| 384 |
+
QualityDimension(
|
| 385 |
+
name="evidential_rigor",
|
| 386 |
+
description="Proper source citation, evidence weighting, provenance tracking",
|
| 387 |
+
weight=0.30,
|
| 388 |
+
examples_good=["Weights primary sources over secondary interpretations"],
|
| 389 |
+
examples_bad=["Treats Wikipedia as primary evidence"]
|
| 390 |
+
),
|
| 391 |
+
QualityDimension(
|
| 392 |
+
name="uncertainty_quantification",
|
| 393 |
+
description="Explicit confidence intervals, alternative hypotheses",
|
| 394 |
+
weight=0.25,
|
| 395 |
+
examples_good=["Reports reconstruction with 95% CI and alternatives"],
|
| 396 |
+
examples_bad=["Presents single interpretation as fact"]
|
| 397 |
+
),
|
| 398 |
+
QualityDimension(
|
| 399 |
+
name="temporal_reasoning",
|
| 400 |
+
description="Correct handling of chronology, causation, anachronism detection",
|
| 401 |
+
weight=0.20,
|
| 402 |
+
examples_good=["Flags anachronistic elements in source material"],
|
| 403 |
+
examples_bad=["Ignores temporal inconsistencies"]
|
| 404 |
+
),
|
| 405 |
+
QualityDimension(
|
| 406 |
+
name="methodological_transparency",
|
| 407 |
+
description="Clear description of reconstruction methodology",
|
| 408 |
+
weight=0.15,
|
| 409 |
+
examples_good=["Documents Bayesian update process for beliefs"],
|
| 410 |
+
examples_bad=["Presents conclusions without methodology"]
|
| 411 |
+
),
|
| 412 |
+
QualityDimension(
|
| 413 |
+
name="simulation_validity",
|
| 414 |
+
description="Realistic constraints on reconstructions, physics/economics grounding",
|
| 415 |
+
weight=0.10,
|
| 416 |
+
examples_good=["Validates against known logistical constraints"],
|
| 417 |
+
examples_bad=["Ignores material/resource limitations of era"]
|
| 418 |
+
),
|
| 419 |
+
],
|
| 420 |
+
safety_considerations=[
|
| 421 |
+
"No falsification of historical record",
|
| 422 |
+
"Acknowledge political sensitivities",
|
| 423 |
+
"Distinguish reconstruction from fabrication",
|
| 424 |
+
"Respect cultural heritage considerations"
|
| 425 |
+
],
|
| 426 |
+
compliance_frameworks=["Academic integrity standards", "NAGPRA (if indigenous)", "UNESCO heritage"],
|
| 427 |
+
required_expertise=["Historical methodology", "Bayesian reasoning", "Domain history"],
|
| 428 |
+
min_annotator_agreement=0.65,
|
| 429 |
+
prompt_categories=[
|
| 430 |
+
"event_reconstruction", "source_analysis", "timeline_synthesis",
|
| 431 |
+
"counterfactual_analysis", "evidence_weighting", "visualization",
|
| 432 |
+
"uncertainty_modeling", "cross_reference"
|
| 433 |
+
],
|
| 434 |
+
key_terms={
|
| 435 |
+
"Provenance": "Chain of custody/origin of evidence",
|
| 436 |
+
"Terminus post quem": "Earliest possible date",
|
| 437 |
+
"Terminus ante quem": "Latest possible date"
|
| 438 |
+
}
|
| 439 |
+
),
|
| 440 |
+
|
| 441 |
+
DomainID.DEFENSE_WORLD_MODELS: DomainSpec(
|
| 442 |
+
id=DomainID.DEFENSE_WORLD_MODELS,
|
| 443 |
+
name="Defense World Models",
|
| 444 |
+
description="3D scene understanding, spatial intelligence, ISR applications, and tactical decision support.",
|
| 445 |
+
zuup_platform="Orb",
|
| 446 |
+
dimensions=[
|
| 447 |
+
QualityDimension(
|
| 448 |
+
name="spatial_accuracy",
|
| 449 |
+
description="Correct 3D reconstruction, geospatial reasoning, coordinate systems",
|
| 450 |
+
weight=0.25,
|
| 451 |
+
examples_good=["Proper MGRS/UTM coordinate handling"],
|
| 452 |
+
examples_bad=["Ignores datum/projection errors"]
|
| 453 |
+
),
|
| 454 |
+
QualityDimension(
|
| 455 |
+
name="operational_relevance",
|
| 456 |
+
description="Actionable intelligence, mission-aligned outputs",
|
| 457 |
+
weight=0.25,
|
| 458 |
+
examples_good=["Identifies tactically significant terrain features"],
|
| 459 |
+
examples_bad=["Generic scene description without operational context"]
|
| 460 |
+
),
|
| 461 |
+
QualityDimension(
|
| 462 |
+
name="uncertainty_communication",
|
| 463 |
+
description="Confidence levels, sensor limitations, fusion caveats",
|
| 464 |
+
weight=0.20,
|
| 465 |
+
examples_good=["Reports reconstruction confidence per region"],
|
| 466 |
+
examples_bad=["Presents all outputs as equally reliable"]
|
| 467 |
+
),
|
| 468 |
+
QualityDimension(
|
| 469 |
+
name="security_awareness",
|
| 470 |
+
description="OPSEC considerations, classification handling, need-to-know",
|
| 471 |
+
weight=0.20,
|
| 472 |
+
examples_good=["Redacts sensitive locations in examples"],
|
| 473 |
+
examples_bad=["Uses real operational data in training"]
|
| 474 |
+
),
|
| 475 |
+
QualityDimension(
|
| 476 |
+
name="interoperability",
|
| 477 |
+
description="Standards compliance, data exchange formats",
|
| 478 |
+
weight=0.10,
|
| 479 |
+
examples_good=["Outputs in NGA-compliant formats"],
|
| 480 |
+
examples_bad=["Proprietary formats without conversion"]
|
| 481 |
+
),
|
| 482 |
+
],
|
| 483 |
+
safety_considerations=[
|
| 484 |
+
"No real classified/operational data",
|
| 485 |
+
"OPSEC in all examples",
|
| 486 |
+
"Dual-use awareness",
|
| 487 |
+
"No targeting recommendations without HITL",
|
| 488 |
+
"Export control (ITAR/EAR) awareness"
|
| 489 |
+
],
|
| 490 |
+
compliance_frameworks=["NIST 800-171", "CMMC", "ITAR", "NGA standards", "NATO STANAG"],
|
| 491 |
+
required_expertise=["Geospatial intelligence", "3D computer vision", "Defense domain"],
|
| 492 |
+
min_annotator_agreement=0.75,
|
| 493 |
+
prompt_categories=[
|
| 494 |
+
"scene_reconstruction", "change_detection", "terrain_analysis",
|
| 495 |
+
"sensor_fusion", "tactical_planning", "visualization",
|
| 496 |
+
"data_standards", "pipeline_design"
|
| 497 |
+
],
|
| 498 |
+
key_terms={
|
| 499 |
+
"MGRS": "Military Grid Reference System",
|
| 500 |
+
"ISR": "Intelligence, Surveillance, Reconnaissance",
|
| 501 |
+
"GEOINT": "Geospatial Intelligence"
|
| 502 |
+
}
|
| 503 |
+
),
|
| 504 |
+
|
| 505 |
+
DomainID.HALAL_COMPLIANCE: DomainSpec(
|
| 506 |
+
id=DomainID.HALAL_COMPLIANCE,
|
| 507 |
+
name="Global Halal Compliance",
|
| 508 |
+
description="Halal certification, supply chain provenance, standards harmonization, and attestation systems.",
|
| 509 |
+
zuup_platform="Civium (Halal)",
|
| 510 |
+
dimensions=[
|
| 511 |
+
QualityDimension(
|
| 512 |
+
name="jurisprudential_accuracy",
|
| 513 |
+
description="Correct understanding of fiqh positions, school differences",
|
| 514 |
+
weight=0.25,
|
| 515 |
+
examples_good=["Acknowledges Hanafi vs Shafi'i differences on seafood"],
|
| 516 |
+
examples_bad=["Presents single madhab view as universal"]
|
| 517 |
+
),
|
| 518 |
+
QualityDimension(
|
| 519 |
+
name="standards_mapping",
|
| 520 |
+
description="Correct mapping across GSO, JAKIM, MUI, ESMA standards",
|
| 521 |
+
weight=0.25,
|
| 522 |
+
examples_good=["Maps ingredient to multiple standard requirements"],
|
| 523 |
+
examples_bad=["Assumes single global standard"]
|
| 524 |
+
),
|
| 525 |
+
QualityDimension(
|
| 526 |
+
name="supply_chain_rigor",
|
| 527 |
+
description="Provenance tracking, contamination prevention, audit trails",
|
| 528 |
+
weight=0.20,
|
| 529 |
+
examples_good=["Full chain of custody from slaughter to retail"],
|
| 530 |
+
examples_bad=["Relies on final product testing only"]
|
| 531 |
+
),
|
| 532 |
+
QualityDimension(
|
| 533 |
+
name="dispute_handling",
|
| 534 |
+
description="Clear escalation paths, scholarly consultation protocols",
|
| 535 |
+
weight=0.15,
|
| 536 |
+
examples_good=["Defined process for disputed ingredients"],
|
| 537 |
+
examples_bad=["Binary halal/haram without nuance"]
|
| 538 |
+
),
|
| 539 |
+
QualityDimension(
|
| 540 |
+
name="cultural_sensitivity",
|
| 541 |
+
description="Respectful treatment of religious requirements",
|
| 542 |
+
weight=0.15,
|
| 543 |
+
examples_good=["Frames compliance as religious obligation support"],
|
| 544 |
+
examples_bad=["Treats halal as mere market requirement"]
|
| 545 |
+
),
|
| 546 |
+
],
|
| 547 |
+
safety_considerations=[
|
| 548 |
+
"Respect religious sensitivities",
|
| 549 |
+
"No misrepresentation of certification status",
|
| 550 |
+
"Acknowledge legitimate scholarly differences",
|
| 551 |
+
"Protect proprietary formulations"
|
| 552 |
+
],
|
| 553 |
+
compliance_frameworks=["GSO 2055", "MS 1500", "UAE.S 2055", "OIC/SMIIC"],
|
| 554 |
+
required_expertise=["Islamic jurisprudence familiarity", "Food science", "Supply chain"],
|
| 555 |
+
min_annotator_agreement=0.7,
|
| 556 |
+
prompt_categories=[
|
| 557 |
+
"ingredient_analysis", "certification_mapping", "supply_chain",
|
| 558 |
+
"audit_protocols", "dispute_resolution", "standards_harmonization",
|
| 559 |
+
"cross_contamination", "documentation"
|
| 560 |
+
],
|
| 561 |
+
key_terms={
|
| 562 |
+
"Dhabiha": "Islamic slaughter method",
|
| 563 |
+
"Mashbooh": "Doubtful/questionable",
|
| 564 |
+
"Istihalah": "Complete transformation (purification)"
|
| 565 |
+
}
|
| 566 |
+
),
|
| 567 |
+
|
| 568 |
+
DomainID.MOBILE_DATA_CENTER: DomainSpec(
|
| 569 |
+
id=DomainID.MOBILE_DATA_CENTER,
|
| 570 |
+
name="Mobile Distributed Data Centers",
|
| 571 |
+
description="Edge computing in DDIL environments, tactical networking, and resilient infrastructure.",
|
| 572 |
+
zuup_platform="PodX",
|
| 573 |
+
dimensions=[
|
| 574 |
+
QualityDimension(
|
| 575 |
+
name="operational_resilience",
|
| 576 |
+
description="Offline-first, degraded mode operation, recovery procedures",
|
| 577 |
+
weight=0.25,
|
| 578 |
+
examples_good=["Defines graceful degradation for each connectivity state"],
|
| 579 |
+
examples_bad=["Assumes persistent connectivity"]
|
| 580 |
+
),
|
| 581 |
+
QualityDimension(
|
| 582 |
+
name="environmental_hardening",
|
| 583 |
+
description="Thermal, shock, vibration, EMI considerations",
|
| 584 |
+
weight=0.25,
|
| 585 |
+
examples_good=["Specifies MIL-STD-810 compliance for shock/vibe"],
|
| 586 |
+
examples_bad=["Commercial hardware without hardening"]
|
| 587 |
+
),
|
| 588 |
+
QualityDimension(
|
| 589 |
+
name="logistics_feasibility",
|
| 590 |
+
description="Power budgets, form factors, transportability constraints",
|
| 591 |
+
weight=0.20,
|
| 592 |
+
examples_good=["Calculates total power budget with thermal headroom"],
|
| 593 |
+
examples_bad=["Ignores generator fuel logistics"]
|
| 594 |
+
),
|
| 595 |
+
QualityDimension(
|
| 596 |
+
name="security_architecture",
|
| 597 |
+
description="Zero-trust, data-at-rest encryption, physical security",
|
| 598 |
+
weight=0.20,
|
| 599 |
+
examples_good=["HSM-backed key management with tamper response"],
|
| 600 |
+
examples_bad=["Software-only encryption with key in memory"]
|
| 601 |
+
),
|
| 602 |
+
QualityDimension(
|
| 603 |
+
name="interoperability",
|
| 604 |
+
description="Coalition partner integration, standards compliance",
|
| 605 |
+
weight=0.10,
|
| 606 |
+
examples_good=["Implements NATO FMN standards for data sharing"],
|
| 607 |
+
examples_bad=["Proprietary protocols without gateways"]
|
| 608 |
+
),
|
| 609 |
+
],
|
| 610 |
+
safety_considerations=[
|
| 611 |
+
"Personnel safety in field conditions",
|
| 612 |
+
"Data destruction procedures",
|
| 613 |
+
"Physical security protocols",
|
| 614 |
+
"EMI/EMC compliance"
|
| 615 |
+
],
|
| 616 |
+
compliance_frameworks=["MIL-STD-810", "MIL-STD-461", "NIST 800-171", "NATO STANAG"],
|
| 617 |
+
required_expertise=["Edge computing", "Military logistics", "Tactical networking"],
|
| 618 |
+
min_annotator_agreement=0.7,
|
| 619 |
+
prompt_categories=[
|
| 620 |
+
"architecture_design", "power_systems", "thermal_management",
|
| 621 |
+
"networking", "security", "logistics", "deployment_procedures",
|
| 622 |
+
"recovery_operations"
|
| 623 |
+
],
|
| 624 |
+
key_terms={
|
| 625 |
+
"DDIL": "Denied, Degraded, Intermittent, Limited (bandwidth)",
|
| 626 |
+
"PACE": "Primary, Alternate, Contingency, Emergency (comms)",
|
| 627 |
+
"FMN": "Federated Mission Networking"
|
| 628 |
+
}
|
| 629 |
+
),
|
| 630 |
+
|
| 631 |
+
DomainID.HUBZONE: DomainSpec(
|
| 632 |
+
id=DomainID.HUBZONE,
|
| 633 |
+
name="HUBZone Ecosystem",
|
| 634 |
+
description="HUBZone certification, small business contracting, economic development in underserved areas.",
|
| 635 |
+
zuup_platform="Aureon (HUBZone)",
|
| 636 |
+
dimensions=[
|
| 637 |
+
QualityDimension(
|
| 638 |
+
name="regulatory_accuracy",
|
| 639 |
+
description="Correct HUBZone eligibility rules, SBA requirements",
|
| 640 |
+
weight=0.30,
|
| 641 |
+
examples_good=["Correctly calculates 35% employee residency requirement"],
|
| 642 |
+
examples_bad=["Misapplies principal office location rules"]
|
| 643 |
+
),
|
| 644 |
+
QualityDimension(
|
| 645 |
+
name="strategic_guidance",
|
| 646 |
+
description="Actionable advice for certification and contracting",
|
| 647 |
+
weight=0.25,
|
| 648 |
+
examples_good=["Maps HUBZone set-aside opportunities to capabilities"],
|
| 649 |
+
examples_bad=["Generic small business advice"]
|
| 650 |
+
),
|
| 651 |
+
QualityDimension(
|
| 652 |
+
name="compliance_maintenance",
|
| 653 |
+
description="Ongoing compliance, recertification, audit preparation",
|
| 654 |
+
weight=0.20,
|
| 655 |
+
examples_good=["Defines annual recertification checklist"],
|
| 656 |
+
examples_bad=["Assumes one-time certification"]
|
| 657 |
+
),
|
| 658 |
+
QualityDimension(
|
| 659 |
+
name="economic_development",
|
| 660 |
+
description="Understanding of HUBZone program economic objectives",
|
| 661 |
+
weight=0.15,
|
| 662 |
+
examples_good=["Connects certification to community impact"],
|
| 663 |
+
examples_bad=["Treats purely as contracting advantage"]
|
| 664 |
+
),
|
| 665 |
+
QualityDimension(
|
| 666 |
+
name="documentation",
|
| 667 |
+
description="Proper evidence collection, record-keeping",
|
| 668 |
+
weight=0.10,
|
| 669 |
+
examples_good=["Specifies required residence documentation"],
|
| 670 |
+
examples_bad=["Vague reference to 'proof of residence'"]
|
| 671 |
+
),
|
| 672 |
+
],
|
| 673 |
+
safety_considerations=[
|
| 674 |
+
"No advice on fraudulent certification",
|
| 675 |
+
"Accurate representation of eligibility",
|
| 676 |
+
"Privacy of employee information"
|
| 677 |
+
],
|
| 678 |
+
compliance_frameworks=["13 CFR Part 126", "SBA HUBZone Program", "FAR 19.13"],
|
| 679 |
+
required_expertise=["Small business contracting", "SBA programs", "Government procurement"],
|
| 680 |
+
min_annotator_agreement=0.7,
|
| 681 |
+
prompt_categories=[
|
| 682 |
+
"eligibility_assessment", "certification_process", "contracting_strategy",
|
| 683 |
+
"compliance_maintenance", "teaming", "subcontracting",
|
| 684 |
+
"map_analysis", "documentation"
|
| 685 |
+
],
|
| 686 |
+
key_terms={
|
| 687 |
+
"HUBZone": "Historically Underutilized Business Zone",
|
| 688 |
+
"Set-aside": "Contract reserved for specific small business category",
|
| 689 |
+
"Principal office": "Location where greatest number of employees work"
|
| 690 |
+
}
|
| 691 |
+
),
|
| 692 |
+
}
|
| 693 |
+
|
| 694 |
+
|
| 695 |
+
def get_domain(domain_id: DomainID) -> DomainSpec:
    """Look up the specification registered for a single domain.

    Args:
        domain_id: Key identifying the domain in ``DOMAINS``.

    Returns:
        The ``DomainSpec`` stored under ``domain_id``.

    Raises:
        KeyError: If ``domain_id`` has no entry in ``DOMAINS``.
    """
    spec = DOMAINS[domain_id]
    return spec
|
| 697 |
+
|
| 698 |
+
|
| 699 |
+
def get_all_domains() -> List[DomainSpec]:
    """Return every registered domain specification as a new list.

    The list follows the iteration order of ``DOMAINS`` and is a fresh
    object on each call, so callers may reorder or filter it without
    touching the registry mapping itself.
    """
    return [spec for spec in DOMAINS.values()]
|
| 701 |
+
|
| 702 |
+
|
| 703 |
+
def get_quality_rubric(domain_id: DomainID) -> str:
    """Generate human-readable quality rubric for annotators.

    The rubric is a Markdown document containing, in order: a title with
    the domain name, the domain description, one section per quality
    dimension (title-cased name, weight as a whole percentage, description,
    and the first good/bad example), and a bulleted list of the domain's
    safety considerations.

    Args:
        domain_id: Key identifying the domain in ``DOMAINS``.

    Returns:
        The rubric as a single Markdown string.

    Raises:
        KeyError: If ``domain_id`` has no entry in ``DOMAINS``.
    """
    domain = DOMAINS[domain_id]

    # Collect fragments and join once instead of growing a string with +=.
    parts = [
        f"# Quality Rubric: {domain.name}\n\n",
        f"{domain.description}\n\n",
        "## Scoring Dimensions\n\n",
    ]

    for dim in domain.dimensions:
        heading = dim.name.replace("_", " ").title()
        parts.append(f"### {heading} (Weight: {dim.weight:.0%})\n")
        parts.append(f"{dim.description}\n\n")
        # Only the first example of each kind is shown. A dimension with no
        # examples omits that line rather than raising IndexError.
        if dim.examples_good:
            parts.append(f"**Good example:** {dim.examples_good[0]}\n\n")
        if dim.examples_bad:
            parts.append(f"**Bad example:** {dim.examples_bad[0]}\n\n")

    parts.append("## Safety Considerations\n")
    parts.extend(f"- {safety}\n" for safety in domain.safety_considerations)

    return "".join(parts)
|