Upload jobs/occ_debate_real_llm.py
Browse files- jobs/occ_debate_real_llm.py +262 -0
jobs/occ_debate_real_llm.py
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
OCC Multi-Agent Debate Benchmark with Real LLM (Qwen3-Coder-30B-A3B-Instruct).
|
| 4 |
+
|
| 5 |
+
Two conditions:
|
| 6 |
+
A. Equal turns — each of the 4 agents gets a single turn per topic with a 1024-token generation budget
|
| 7 |
+
B. OCC allocation — agents earn credits based on argument quality;
|
| 8 |
+
broker gates additional turns. Agents who make weak/unsupported claims
|
| 9 |
+
lose credits and get denied.
|
| 10 |
+
|
| 11 |
+
Scenario: 30 factual dispute topics. 3 honest agents + 1 adversarial.
|
| 12 |
+
Decision quality measured by agreement with ground truth answer.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import json
import os
import re
import sys
import time
from collections import Counter

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 18 |
+
|
| 19 |
+
MODEL_ID = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
MAX_NEW = 256  # default generation budget used by generate() when the caller passes none
NUM_ROUNDS = 3
NUM_AGENTS = 4  # 3 honest, 1 adversarial

# Topics with ground truth answers.
# Each entry is {"q": <question text>, "truth": "yes" | "no"}.
TOPICS = [
    {"q": "Is Python faster than C for numerical computation?", "truth": "no"},
    {"q": "Does water boil at 100°C at all altitudes?", "truth": "no"},
    {"q": "Can quantum computers break RSA-2048 today (2026)?", "truth": "no"},
    {"q": "Is the Earth's core hotter than the surface of the Sun?", "truth": "yes"},
    {"q": "Does the Moon have an atmosphere?", "truth": "no"},
    {"q": "Can sound travel through a vacuum?", "truth": "no"},
    {"q": "Is JavaScript single-threaded by default in browsers?", "truth": "yes"},
    {"q": "Does DNA replication occur in the nucleus of eukaryotic cells?", "truth": "yes"},
    {"q": "Can a protein crystal structure be determined with 100% certainty from X-ray data?", "truth": "no"},
    # NOTE(review): "yes" presumes a suitable step size and an attained minimum — confirm the label.
    {"q": "Is gradient descent guaranteed to find the global minimum for any convex function?", "truth": "yes"},
    {"q": "Can GPT-4 reliably solve novel mathematical proofs without supervision?", "truth": "no"},
    {"q": "Is the P vs NP problem solved as of 2026?", "truth": "no"},
    {"q": "Do all metals expand when heated?", "truth": "no"},
    {"q": "Is the speed of light constant in all reference frames?", "truth": "yes"},
    {"q": "Can a computer program determine if another arbitrary program halts?", "truth": "no"},
    {"q": "Is the Earth flat?", "truth": "no"},
    {"q": "Does carbon dioxide make up more than 1% of Earth's atmosphere?", "truth": "no"},
    {"q": "Can a classical computer efficiently simulate any quantum computation?", "truth": "no"},
    {"q": "Is the golden ratio (1.618...) exactly equal to (1+√5)/2?", "truth": "yes"},
    {"q": "Can a neural network with one hidden layer approximate any continuous function on a compact set?", "truth": "yes"},
    {"q": "Does entropy always increase in an isolated system?", "truth": "yes"},
    # The default CPython 3.13 build still has the GIL; only the optional
    # free-threaded build (PEP 703) can run without it, so the GIL is not "removed".
    {"q": "Is Python's GIL removed in CPython 3.13+?", "truth": "no"},
    {"q": "Do sharks get cancer?", "truth": "yes"},
    {"q": "Is Antarctica a country?", "truth": "no"},
    {"q": "Can humans survive without gut bacteria?", "truth": "yes"},
    {"q": "Do all birds fly?", "truth": "no"},
    {"q": "Is lightning hotter than the surface of the Sun?", "truth": "yes"},
    {"q": "Can a Turing machine with a finite tape recognize all recursive languages?", "truth": "no"},
    {"q": "Is the Riemann Hypothesis proved as of 2026?", "truth": "no"},
    {"q": "Does gravitational lensing confirm general relativity?", "truth": "yes"},
]

# Derived from the list so accuracy denominators cannot drift when topics are
# added or removed.
NUM_TOPICS = len(TOPICS)
|
| 58 |
+
|
| 59 |
+
def log(msg):
    """Write ``msg`` to stdout behind a ``[DEBATE]`` tag, flushing right away."""
    line = "[DEBATE] {}".format(msg)
    print(line, flush=True)
|
| 61 |
+
|
| 62 |
+
def load_model():
    """Load the tokenizer and causal LM identified by ``MODEL_ID``.

    The model is requested in bfloat16 with ``device_map="auto"``; both loaders
    are called with ``trust_remote_code=True``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    log(f"Loading {MODEL_ID}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    model_kwargs = {
        "trust_remote_code": True,
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
    }
    llm = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_kwargs)
    log("Loaded.")
    return llm, tokenizer
|
| 71 |
+
|
| 72 |
+
def generate(model, tok, prompt, max_new=MAX_NEW):
    """Sample a continuation of ``prompt`` and return only the new text.

    Args:
        model: Object exposing ``.device`` and ``.generate(...)``.
        tok: Tokenizer used to encode the prompt and decode the output.
        prompt: Raw prompt string; it is tokenized as-is.
        max_new: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded continuation with the prompt tokens and special tokens
        stripped.
    """
    encoded = tok(prompt, return_tensors="pt").to(model.device)
    # Remember where the prompt ends so it can be sliced off the output.
    prompt_len = len(encoded.input_ids[0])
    sampling = {
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
        "pad_token_id": tok.eos_token_id,
    }
    with torch.no_grad():
        sequences = model.generate(**encoded, max_new_tokens=max_new, **sampling)
    completion_ids = sequences[0][prompt_len:]
    return tok.decode(completion_ids, skip_special_tokens=True)
|
| 79 |
+
|
| 80 |
+
# Whole-word patterns: plain substring checks would treat "know"/"not" as a
# "no" position and "sincere" as the reasoning keyword "since".
_POSITION_WORD_RE = re.compile(r"\b(?:yes|no|true|false)\b")
_REASONING_WORD_RE = re.compile(r"\b(?:because|therefore|since|due to|the reason)\b")


def score_argument(response, topic):
    """Heuristically score an argument's quality in the range [0.0, 1.0].

    Points are awarded for four independent signals:
      * 0.3 — states a position (whole word yes/no/true/false)
      * 0.3 — uses a reasoning connective (because, therefore, since, ...)
      * 0.2 — longer than 20 whitespace-separated words
      * 0.2 — shorter than 500 whitespace-separated words

    Args:
        response: The agent's generated text.
        topic: The topic dict being debated. Currently unused; kept so callers
            can pass it positionally.

    Returns:
        The summed score as a float, capped at 1.0.
    """
    lower = response.lower()
    word_count = len(response.split())

    score = 0.0
    # Has a clear position
    if _POSITION_WORD_RE.search(lower):
        score += 0.3
    # Has reasoning keywords
    if _REASONING_WORD_RE.search(lower):
        score += 0.3
    # Not too short
    if word_count > 20:
        score += 0.2
    # Not too rambly
    if word_count < 500:
        score += 0.2
    return min(score, 1.0)
|
| 97 |
+
|
| 98 |
+
# Explicit verdict phrases such as "clearly yes" / "the answer is no".
_EXPLICIT_YES_RE = re.compile(r"\b(?:clearly|definitely|answer is)\s+yes\b")
_EXPLICIT_NO_RE = re.compile(r"\b(?:clearly|definitely|answer is)\s+no\b")
# Standalone words, regardless of surrounding punctuation or position in text.
_YES_WORD_RE = re.compile(r"\byes\b")
_NO_WORD_RE = re.compile(r"\bno\b")


def extract_position(response):
    """Classify a response as taking the "yes" or "no" side.

    An explicit verdict phrase ("clearly yes", "definitely no",
    "answer is yes", ...) decides immediately, with "yes" phrases checked
    first. Otherwise standalone occurrences of "yes" and "no" are counted and
    the more frequent one wins.

    Matching is on whole words, so a reply that opens with "Yes," or consists
    only of "No." is recognised, and words such as "notable" or "know" are not
    mistaken for "no".

    Args:
        response: The agent's generated text.

    Returns:
        "yes", "no", or "unclear" when neither side dominates.
    """
    lower = response.lower()

    if _EXPLICIT_YES_RE.search(lower):
        return "yes"
    if _EXPLICIT_NO_RE.search(lower):
        return "no"

    # Count yes/no mentions
    yes_count = len(_YES_WORD_RE.findall(lower))
    no_count = len(_NO_WORD_RE.findall(lower))
    if yes_count > no_count:
        return "yes"
    if no_count > yes_count:
        return "no"
    return "unclear"
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def run_equal_turns(model, tok):
    """Condition A (baseline): every agent gets one equal-budget turn per topic.

    For each topic, all ``NUM_AGENTS`` agents answer once with a 1024-token
    generation budget. The topic's decision is the strict majority among the
    clear ("yes"/"no") positions. A tie, or no clear position at all, is
    recorded as "unclear" and counted as wrong, so the outcome never depends
    on arbitrary set ordering.

    Args:
        model: Language model forwarded to ``generate``.
        tok: Tokenizer forwarded to ``generate``.

    Returns:
        ``(accuracy, total_tokens, results)`` where ``accuracy`` is the
        fraction of topics decided correctly, ``total_tokens`` is the summed
        per-turn budget, and ``results`` holds one dict per topic with the
        keys ``topic``, ``truth``, ``majority`` and ``correct``.
    """
    log("\n=== CONDITION A: Equal Turns (baseline) ===")
    turn_budget = 1024
    correct = 0
    total_tokens = 0
    results = []

    for i, topic in enumerate(TOPICS):
        log(f"\nTopic {i+1}: {topic['q']}")
        arguments = []
        for agent in range(NUM_AGENTS):
            persona = "honest" if agent < 3 else "adversarial"
            prompt = f"Question: {topic['q']}\n\nYou are a {persona} debater. Give a clear yes/no answer with a brief explanation:\n"
            response = generate(model, tok, prompt, max_new=turn_budget)
            pos = extract_position(response)
            score = score_argument(response, topic)
            arguments.append({"agent": agent, "persona": persona, "response": response,
                              "position": pos, "score": score})
            # Charged at the full budget, not at the number of tokens actually
            # generated.
            total_tokens += turn_budget
            log(f" Agent {agent} ({persona}): pos={pos}, score={score:.2f}")

        # Majority vote over the clear positions only.
        votes = [a["position"] for a in arguments if a["position"] != "unclear"]
        ranked = Counter(votes).most_common(2)
        tied = len(ranked) == 2 and ranked[0][1] == ranked[1][1]
        if ranked and not tied:
            majority = ranked[0][0]
            is_correct = majority == topic["truth"]
            if is_correct:
                correct += 1
            log(f" Majority: {majority} (truth={topic['truth']}), correct={is_correct}")
        else:
            majority = "unclear"
            is_correct = False
            log(" No clear majority, counted as wrong")

        results.append({"topic": topic["q"], "truth": topic["truth"],
                        "majority": majority, "correct": is_correct})

    # Use the real topic count so the ratio stays right if TOPICS changes, and
    # avoid dividing by zero on an empty list.
    num_topics = len(TOPICS)
    acc = correct / num_topics if num_topics else 0.0
    log(f"\nEqual Turns: {correct}/{num_topics} correct ({acc:.3f}), {total_tokens} tokens")
    return acc, total_tokens, results
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def run_occ_allocation(model, tok):
    """Condition B: credit-gated turns (OCC allocation).

    Every agent starts each topic with ``INITIAL_CREDITS``. A turn costs
    ``COST_PER_TURN`` and earns back ``int(quality * 5)`` credits, capped at
    ``CREDIT_CAP``. Agents whose balance is below the cost are denied the turn.
    Balances decay by ``DECAY_AMOUNT`` every ``DECAY_INTERVAL`` rounds. Rounds
    stop early once one side has at least ``CONSENSUS_VOTES`` clear votes.

    The final decision is the strict majority of all clear positions collected
    for the topic. A tie, or no clear position, is logged as no decision and
    counted as wrong, so the outcome never depends on arbitrary set ordering.

    Args:
        model: Language model forwarded to ``generate``.
        tok: Tokenizer forwarded to ``generate``.

    Returns:
        ``(accuracy, total_tokens)``: fraction of topics decided correctly and
        the summed per-turn token budget.
    """
    log("\n=== CONDITION B: OCC Credit Allocation ===")

    # OCC parameters
    INITIAL_CREDITS = 10
    COST_PER_TURN = 5
    DECAY_INTERVAL = 2
    DECAY_AMOUNT = 2
    CREDIT_CAP = 20
    TURN_BUDGET = 1024
    CONSENSUS_VOTES = 3

    correct = 0
    total_tokens = 0

    for i, topic in enumerate(TOPICS):
        log(f"\nTopic {i+1}: {topic['q']}")
        balances = [INITIAL_CREDITS] * NUM_AGENTS
        arguments = []
        tokens_used = 0

        for round_num in range(NUM_ROUNDS):
            # Decay credits every DECAY_INTERVAL rounds
            if round_num > 0 and round_num % DECAY_INTERVAL == 0:
                balances = [max(0, c - DECAY_AMOUNT) for c in balances]

            for agent in range(NUM_AGENTS):
                if balances[agent] < COST_PER_TURN:
                    log(f" R{round_num} Agent {agent}: DENIED (credits={balances[agent]} < {COST_PER_TURN})")
                    continue

                persona = "honest" if agent < 3 else "adversarial"
                prompt = f"Question: {topic['q']}\n\nYou are a {persona} debater. Give a clear yes/no answer with a brief explanation:\n"
                response = generate(model, tok, prompt, max_new=TURN_BUDGET)
                pos = extract_position(response)
                qual = score_argument(response, topic)
                # Charged at the full budget, not at the number of tokens
                # actually generated.
                tokens_used += TURN_BUDGET

                # Earn credits for good arguments
                earned = int(qual * 5)
                balances[agent] = min(balances[agent] - COST_PER_TURN + earned, CREDIT_CAP)

                arguments.append({"agent": agent, "persona": persona, "response": response,
                                  "position": pos, "score": qual, "credits": balances[agent]})
                log(f" R{round_num} Agent {agent}: pos={pos}, qual={qual:.2f}, credits={balances[agent]}")

            # Check if we have consensus after each round.
            # NOTE(review): the threshold is an absolute vote count over all
            # rounds so far, so it is almost always met once two rounds have
            # accumulated — confirm this is the intended stopping rule.
            tally = Counter(a["position"] for a in arguments if a["position"] != "unclear")
            if tally and max(tally.values()) >= CONSENSUS_VOTES:  # Strong consensus
                break

        total_tokens += tokens_used

        # Final decision: majority of all arguments
        positions = [a["position"] for a in arguments if a["position"] != "unclear"]
        ranked = Counter(positions).most_common(2)
        tied = len(ranked) == 2 and ranked[0][1] == ranked[1][1]
        if ranked and not tied:
            majority = ranked[0][0]
            is_correct = majority == topic["truth"]
            if is_correct:
                correct += 1
            log(f" Decision: {majority} (truth={topic['truth']}), correct={is_correct}, tokens={tokens_used}")
        else:
            is_correct = False
            log(f" No decision, tokens={tokens_used}")

    # Use the real topic count so the ratio stays right if TOPICS changes, and
    # avoid dividing by zero on an empty list.
    num_topics = len(TOPICS)
    acc = correct / num_topics if num_topics else 0.0
    log(f"\nOCC: {correct}/{num_topics} correct ({acc:.3f}), {total_tokens} tokens")
    return acc, total_tokens
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def main():
    """Run both debate conditions, log a comparison, and save a JSON summary.

    Loads the model once, runs the equal-turns baseline and then the OCC
    condition, logs accuracy and token figures for both, and writes the
    summary to ``/app/occ_debate_results.json``.

    Raises:
        OSError: If the output directory cannot be created or the results
            file cannot be written.
    """
    model, tok = load_model()

    # Condition A: Equal turns
    acc_a, tok_a, _ = run_equal_turns(model, tok)

    # Condition B: OCC
    acc_b, tok_b = run_occ_allocation(model, tok)

    # Report. A token total of zero (e.g. no topics were run) would otherwise
    # raise ZeroDivisionError and lose the whole run's output.
    savings = (1 - tok_b / tok_a) * 100 if tok_a else 0.0
    quality_a = acc_a / tok_a * 1000 if tok_a else 0.0
    quality_b = acc_b / tok_b * 1000 if tok_b else 0.0

    log("\n" + "=" * 60)
    log("DEBATE BENCHMARK RESULTS")
    log("=" * 60)
    log(f" Equal turns: acc={acc_a:.3f}, tokens={tok_a}")
    log(f" OCC: acc={acc_b:.3f}, tokens={tok_b}")
    log(f" Delta acc: {acc_b - acc_a:+.3f}")
    log(f" Token savings: {savings:.1f}%")
    log(f" Decision quality per 1K tokens: {quality_a:.4f} vs {quality_b:.4f}")

    results = {
        "model": MODEL_ID,
        "num_topics": NUM_TOPICS,
        "equal_turns": {"accuracy": acc_a, "tokens": tok_a},
        "occ_allocation": {"accuracy": acc_b, "tokens": tok_b},
        "delta_accuracy": acc_b - acc_a,
        "token_savings_pct": savings,
    }

    out_path = "/app/occ_debate_results.json"
    # Create the target directory up front so a long run is not lost to a
    # missing folder at the very last step.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    log("\nResults saved to /app/occ_debate_results.json")
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
# Script entry point: run the benchmark only when executed directly, not on import.
if __name__ == "__main__":
    main()
|