narcolepticchicken committed on
Commit
11812f7
·
verified ·
1 Parent(s): cc93099

Upload jobs/occ_debate_real_llm.py

Browse files
Files changed (1) hide show
  1. jobs/occ_debate_real_llm.py +262 -0
jobs/occ_debate_real_llm.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ OCC Multi-Agent Debate Benchmark with Real LLM (Qwen3-Coder-30B-A3B-Instruct).
4
+
5
+ Two conditions:
6
+ A. Equal turns — all 4 agents get 1024 tokens each per round
7
+ B. OCC allocation — agents earn credits based on argument quality;
8
+ broker gates additional turns. Agents who make weak/unsupported claims
9
+ lose credits and get denied.
10
+
11
+ Scenario: 30 factual dispute topics. 3 honest agents + 1 adversarial.
12
+ Decision quality measured by agreement with ground truth answer.
13
+ """
14
+
15
+ import json, time, os, sys
16
+ import torch
17
+ from transformers import AutoModelForCausalLM, AutoTokenizer
18
+
19
MODEL_ID = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
MAX_NEW = 256  # default new-token budget used by generate()
NUM_TOPICS = 30  # must not exceed len(TOPICS)
NUM_ROUNDS = 3
NUM_AGENTS = 4  # 3 honest, 1 adversarial

# Topics with ground truth answers.
# Each entry is {"q": <yes/no question>, "truth": "yes" | "no"}.
#
# NOTE(review): several labels depend on how strictly the question is read
# (e.g. gradient descent on "any" convex function, entropy "always" increasing,
# speed of light in "all" frames, Earth's core vs. the Sun's surface, survival
# without gut bacteria). They are left as originally labelled; confirm or
# reword them before treating accuracy numbers as authoritative.
TOPICS = [
    {"q": "Is Python faster than C for numerical computation?", "truth": "no"},
    {"q": "Does water boil at 100°C at all altitudes?", "truth": "no"},
    {"q": "Can quantum computers break RSA-2048 today (2026)?", "truth": "no"},
    {"q": "Is the Earth's core hotter than the surface of the Sun?", "truth": "yes"},
    {"q": "Does the Moon have an atmosphere?", "truth": "no"},
    {"q": "Can sound travel through a vacuum?", "truth": "no"},
    {"q": "Is JavaScript single-threaded by default in browsers?", "truth": "yes"},
    {"q": "Does DNA replication occur in the nucleus of eukaryotic cells?", "truth": "yes"},
    {"q": "Can a protein crystal structure be determined with 100% certainty from X-ray data?", "truth": "no"},
    {"q": "Is gradient descent guaranteed to find the global minimum for any convex function?", "truth": "yes"},
    {"q": "Can GPT-4 reliably solve novel mathematical proofs without supervision?", "truth": "no"},
    {"q": "Is the P vs NP problem solved as of 2026?", "truth": "no"},
    {"q": "Do all metals expand when heated?", "truth": "no"},
    {"q": "Is the speed of light constant in all reference frames?", "truth": "yes"},
    {"q": "Can a computer program determine if another arbitrary program halts?", "truth": "no"},
    {"q": "Is the Earth flat?", "truth": "no"},
    {"q": "Does carbon dioxide make up more than 1% of Earth's atmosphere?", "truth": "no"},
    {"q": "Can a classical computer efficiently simulate any quantum computation?", "truth": "no"},
    {"q": "Is the golden ratio (1.618...) exactly equal to (1+√5)/2?", "truth": "yes"},
    {"q": "Can a neural network with one hidden layer approximate any continuous function on a compact set?", "truth": "yes"},
    {"q": "Does entropy always increase in an isolated system?", "truth": "yes"},
    # The GIL is not removed in CPython 3.13+: it can only be disabled in the
    # separate, optional free-threaded build, and the default build keeps it.
    # The previous label ("yes") marked the factually correct answer as wrong.
    {"q": "Is Python's GIL removed in CPython 3.13+?", "truth": "no"},
    {"q": "Do sharks get cancer?", "truth": "yes"},
    {"q": "Is Antarctica a country?", "truth": "no"},
    {"q": "Can humans survive without gut bacteria?", "truth": "yes"},
    {"q": "Do all birds fly?", "truth": "no"},
    {"q": "Is lightning hotter than the surface of the Sun?", "truth": "yes"},
    {"q": "Can a Turing machine with a finite tape recognize all recursive languages?", "truth": "no"},
    {"q": "Is the Riemann Hypothesis proved as of 2026?", "truth": "no"},
    {"q": "Does gravitational lensing confirm general relativity?", "truth": "yes"},
]
58
+
59
def log(msg):
    """Print *msg* to stdout behind a ``[DEBATE]`` tag and flush right away."""
    line = "[DEBATE] {}".format(msg)
    print(line, flush=True)
61
+
62
def load_model():
    """Load the tokenizer and causal LM named by ``MODEL_ID``.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is requested in bfloat16
        with ``device_map="auto"``.
    """
    log(f"Loading {MODEL_ID}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    # Keyword options for the model load, gathered in one place.
    load_kwargs = {
        "trust_remote_code": True,
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
    }
    lm = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)
    log("Loaded.")
    return lm, tokenizer
71
+
72
def generate(model, tok, prompt, max_new=MAX_NEW):
    """Sample a continuation of *prompt* and return only the new text.

    Args:
        model: Object exposing ``generate`` and ``device`` (as returned by
            ``load_model``).
        tok: Tokenizer used to encode the prompt and decode the output.
        prompt: Raw prompt string; it is tokenized as-is.
        max_new: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded continuation with the prompt tokens and special tokens
        stripped.
    """
    encoded = tok(prompt, return_tensors="pt").to(model.device)
    sampling = dict(
        max_new_tokens=max_new,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tok.eos_token_id,
    )
    with torch.no_grad():
        sequences = model.generate(**encoded, **sampling)
    # The output starts with the prompt tokens; skip past them.
    prompt_len = len(encoded.input_ids[0])
    return tok.decode(sequences[0][prompt_len:], skip_special_tokens=True)
79
+
80
def score_argument(response, topic):
    """Heuristically rate the *form* of an argument on a 0.0-1.0 scale.

    The score is the sum of four independent checks:

    * +0.3 if the text contains a stance word (yes / no / true / false),
    * +0.3 if it contains a reasoning marker (because, therefore, since,
      due to, the reason),
    * +0.2 if it is longer than 20 whitespace-separated words,
    * +0.2 if it is shorter than 500 whitespace-separated words.

    Stance words and reasoning markers are matched on word boundaries, so
    "know", "not" or "nothing" do not count as "no", and "sincerely" does not
    count as "since".

    Args:
        response: The agent's generated text.
        topic: The topic dict; currently unused, kept for interface
            compatibility.

    Returns:
        A float in ``[0.0, 1.0]``.
    """
    import re  # function-scope import: the module does not import re at top level

    lower = response.lower()
    word_count = len(response.split())

    score = 0.0
    # Has a clear position (whole words only).
    if re.search(r"\b(?:yes|no|true|false)\b", lower):
        score += 0.3
    # Has reasoning keywords (whole words / phrases only).
    if re.search(r"\b(?:because|therefore|since|due to|the reason)\b", lower):
        score += 0.3
    # Not too short
    if word_count > 20:
        score += 0.2
    # Not too rambly
    if word_count < 500:
        score += 0.2
    return min(score, 1.0)
97
+
98
def extract_position(response):
    """Classify a response as ``"yes"``, ``"no"`` or ``"unclear"``.

    Resolution order:

    1. An explicit verdict phrase ("clearly yes", "definitely no",
       "answer is yes", ...) decides the result, provided only one side has
       such a phrase.
    2. Otherwise the stand-alone words "yes" and "no" are counted and the more
       frequent one wins.
    3. Equal counts (including zero) yield ``"unclear"``.

    All matching is done on word boundaries. This recognises answers at the
    very start of the text ("Yes, ..."), before any punctuation or newline,
    and inside markdown emphasis ("**No**"). It also stops "answer is not ..."
    from being read as "answer is no".

    Args:
        response: The agent's generated text.

    Returns:
        One of ``"yes"``, ``"no"``, ``"unclear"``.
    """
    import re  # function-scope import: the module does not import re at top level

    lower = response.lower()

    explicit_yes = re.search(r"\b(?:clearly|definitely|answer is) yes\b", lower) is not None
    explicit_no = re.search(r"\b(?:clearly|definitely|answer is) no\b", lower) is not None
    # Only trust an explicit verdict when the text does not also contain the
    # opposite one; contradictory texts fall through to counting.
    if explicit_yes != explicit_no:
        return "yes" if explicit_yes else "no"

    # Count yes/no mentions
    yes_count = len(re.findall(r"\byes\b", lower))
    no_count = len(re.findall(r"\bno\b", lower))
    if yes_count > no_count:
        return "yes"
    if no_count > yes_count:
        return "no"
    return "unclear"
112
+
113
+
114
def run_equal_turns(model, tok):
    """Baseline: equal 1024 tokens per agent, decided by majority vote.

    For each topic every agent is prompted once with a 1024-token generation
    budget; the last agent gets the "adversarial" persona, the others
    "honest". The decision is the position held by a strict plurality of the
    non-"unclear" answers. A tie or no clear answers counts as wrong.

    NOTE(review): this baseline performs a single round per topic, whereas
    ``run_occ_allocation`` may run up to ``NUM_ROUNDS`` rounds. With the current
    parameters the OCC condition therefore never spends fewer budgeted tokens
    than this one; confirm whether the baseline was meant to run
    ``NUM_ROUNDS`` rounds as well.

    Args:
        model: Model passed through to ``generate``.
        tok: Tokenizer passed through to ``generate``.

    Returns:
        ``(accuracy, total_tokens, results)`` where ``total_tokens`` is the
        budgeted (not measured) token count and ``results`` is a list of
        per-topic dicts with keys ``topic``, ``truth``, ``majority`` and
        ``correct``.
    """
    from collections import Counter  # function-scope: not imported at module level

    log("\n=== CONDITION A: Equal Turns (baseline) ===")

    turn_budget = 1024  # generation budget charged per agent turn
    honest_agents = NUM_AGENTS - 1  # the last agent plays the adversary
    topics = TOPICS[:NUM_TOPICS]  # evaluate exactly the topics accuracy is divided by

    correct = 0
    total_tokens = 0
    results = []

    for i, topic in enumerate(topics):
        log(f"\nTopic {i+1}: {topic['q']}")
        arguments = []
        for agent in range(NUM_AGENTS):
            persona = "honest" if agent < honest_agents else "adversarial"
            prompt = f"Question: {topic['q']}\n\nYou are a {persona} debater. Give a clear yes/no answer with a brief explanation:\n"
            response = generate(model, tok, prompt, max_new=turn_budget)
            pos = extract_position(response)
            score = score_argument(response, topic)
            arguments.append({"agent": agent, "persona": persona, "response": response,
                              "position": pos, "score": score})
            # Charged at the full budget regardless of how many tokens were produced.
            total_tokens += turn_budget
            log(f"  Agent {agent} ({persona}): pos={pos}, score={score:.2f}")

        # Majority vote. A tie is reported as "unclear": picking a winner via
        # set iteration order would vary between runs (string hashing is
        # randomised), which makes the benchmark irreproducible.
        tally = Counter(a["position"] for a in arguments if a["position"] != "unclear")
        ranked = tally.most_common(2)
        tied = len(ranked) == 2 and ranked[0][1] == ranked[1][1]
        majority = ranked[0][0] if ranked and not tied else "unclear"

        if majority != "unclear":
            is_correct = majority == topic["truth"]
            if is_correct:
                correct += 1
            log(f"  Majority: {majority} (truth={topic['truth']}), correct={is_correct}")
        else:
            is_correct = False
            log(f"  No clear majority, counted as wrong")

        results.append({"topic": topic["q"], "truth": topic["truth"],
                        "majority": majority, "correct": is_correct})

    evaluated = len(topics)
    acc = correct / evaluated if evaluated else 0.0
    log(f"\nEqual Turns: {correct}/{evaluated} correct ({acc:.3f}), {total_tokens} tokens")
    return acc, total_tokens, results
153
+
154
+
155
def run_occ_allocation(model, tok):
    """OCC: agents earn credits based on argument quality. Broker gates turns.

    Per topic, every agent starts with ``INITIAL_CREDITS``. In each round an
    agent may speak only if it can pay ``COST_PER_TURN``; it then earns back
    ``int(quality * 5)`` credits (capped at ``CREDIT_CAP``). Credits decay by
    ``DECAY_AMOUNT`` every ``DECAY_INTERVAL`` rounds. Rounds stop early once
    one position is the unique leader with at least ``CONSENSUS_VOTES`` votes
    across all arguments collected so far for the topic. The final decision is
    the unique plurality position; a tie or no clear answers counts as wrong.

    Args:
        model: Model passed through to ``generate``.
        tok: Tokenizer passed through to ``generate``.

    Returns:
        ``(accuracy, total_tokens)`` where ``total_tokens`` is the budgeted
        (not measured) token count, 1024 per granted turn.
    """
    from collections import Counter  # function-scope: not imported at module level

    log("\n=== CONDITION B: OCC Credit Allocation ===")

    # OCC parameters
    INITIAL_CREDITS = 10
    COST_PER_TURN = 5
    DECAY_INTERVAL = 2
    DECAY_AMOUNT = 2
    CREDIT_CAP = 20
    TURN_TOKENS = 1024  # generation budget charged per granted turn
    CONSENSUS_VOTES = 3  # votes the leading position needs to stop early

    honest_agents = NUM_AGENTS - 1  # the last agent plays the adversary
    topics = TOPICS[:NUM_TOPICS]  # evaluate exactly the topics accuracy is divided by

    def unique_leader(args):
        """Return ``(position, votes)`` of the sole plurality winner, else ``(None, 0)``."""
        tally = Counter(a["position"] for a in args if a["position"] != "unclear")
        ranked = tally.most_common(2)
        if not ranked:
            return None, 0
        if len(ranked) == 2 and ranked[0][1] == ranked[1][1]:
            # A tie has no winner; choosing one by set/dict order would make
            # the outcome depend on hash randomisation.
            return None, 0
        return ranked[0]

    correct = 0
    total_tokens = 0

    for i, topic in enumerate(topics):
        log(f"\nTopic {i+1}: {topic['q']}")
        credits = [INITIAL_CREDITS] * NUM_AGENTS
        arguments = []
        tokens_used = 0

        for round_num in range(NUM_ROUNDS):
            # Decay credits every DECAY_INTERVAL rounds
            if round_num > 0 and round_num % DECAY_INTERVAL == 0:
                credits = [max(0, c - DECAY_AMOUNT) for c in credits]

            round_args = []
            for agent in range(NUM_AGENTS):
                if credits[agent] < COST_PER_TURN:
                    log(f"  R{round_num} Agent {agent}: DENIED (credits={credits[agent]} < {COST_PER_TURN})")
                    continue

                persona = "honest" if agent < honest_agents else "adversarial"
                prompt = f"Question: {topic['q']}\n\nYou are a {persona} debater. Give a clear yes/no answer with a brief explanation:\n"
                response = generate(model, tok, prompt, max_new=TURN_TOKENS)
                pos = extract_position(response)
                qual = score_argument(response, topic)
                tokens_used += TURN_TOKENS

                # Earn credits for good arguments
                earned = int(qual * 5)
                credits[agent] = min(credits[agent] - COST_PER_TURN + earned, CREDIT_CAP)

                round_args.append({"agent": agent, "persona": persona, "response": response,
                                   "position": pos, "score": qual, "credits": credits[agent]})
                log(f"  R{round_num} Agent {agent}: pos={pos}, qual={qual:.2f}, credits={credits[agent]}")

            arguments.extend(round_args)

            # Check if we have consensus after each round. Require a unique
            # leader so that an evenly split pool (e.g. 4 vs 4) is not treated
            # as consensus just because one side reached the vote threshold.
            leader, votes = unique_leader(arguments)
            if leader is not None and votes >= CONSENSUS_VOTES:  # Strong consensus
                break

        total_tokens += tokens_used

        # Final decision: majority of all arguments
        majority, _ = unique_leader(arguments)
        if majority is not None:
            is_correct = majority == topic["truth"]
            if is_correct:
                correct += 1
            log(f"  Decision: {majority} (truth={topic['truth']}), correct={is_correct}, tokens={tokens_used}")
        else:
            log(f"  No decision, tokens={tokens_used}")

    evaluated = len(topics)
    acc = correct / evaluated if evaluated else 0.0
    log(f"\nOCC: {correct}/{evaluated} correct ({acc:.3f}), {total_tokens} tokens")
    return acc, total_tokens
226
+
227
+
228
def main():
    """Run both benchmark conditions, log a comparison and save a JSON summary.

    The summary is written to ``/app/occ_debate_results.json``; the directory
    is created if missing. If the file cannot be written, the summary is
    logged to stdout before the error is re-raised, so the results of the
    (expensive) run are not lost.

    Raises:
        OSError: If the results file cannot be written.
    """
    model, tok = load_model()

    # Condition A: Equal turns
    acc_a, tok_a, _ = run_equal_turns(model, tok)

    # Condition B: OCC
    acc_b, tok_b = run_occ_allocation(model, tok)

    # Report. Guard the ratios so that a zero token total (e.g. no topics or
    # no agents configured) yields 0.0 instead of ZeroDivisionError.
    savings = (1 - tok_b / tok_a) * 100 if tok_a else 0.0
    per_1k_a = acc_a / tok_a * 1000 if tok_a else 0.0
    per_1k_b = acc_b / tok_b * 1000 if tok_b else 0.0

    log("\n" + "=" * 60)
    log("DEBATE BENCHMARK RESULTS")
    log("=" * 60)
    log(f"  Equal turns:  acc={acc_a:.3f}, tokens={tok_a}")
    log(f"  OCC:          acc={acc_b:.3f}, tokens={tok_b}")
    log(f"  Delta acc:    {acc_b - acc_a:+.3f}")
    log(f"  Token savings: {savings:.1f}%")
    log(f"  Decision quality per 1K tokens: {per_1k_a:.4f} vs {per_1k_b:.4f}")

    results = {
        "model": MODEL_ID,
        "num_topics": NUM_TOPICS,
        "equal_turns": {"accuracy": acc_a, "tokens": tok_a},
        "occ_allocation": {"accuracy": acc_b, "tokens": tok_b},
        "delta_accuracy": acc_b - acc_a,
        "token_savings_pct": savings,
    }

    out_path = "/app/occ_debate_results.json"
    try:
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)
    except OSError as err:
        # Keep the numbers recoverable from the job log, then surface the failure.
        log(f"Could not write {out_path}: {err}")
        log(json.dumps(results, indent=2))
        raise
    log(f"\nResults saved to {out_path}")
259
+
260
+
261
# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()