roc-hci committed on
Commit
bbd38e8
·
1 Parent(s): 8859fb6

locke-logo (#1)

Browse files

- added locke logo, fixed utils (b65a1fc85513287d351302d7b367dda472ce5105)
- removed old leaderboard (679963a518a78b23fbf27b8ea801253eb288ee29)

__pycache__/about.cpython-312.pyc ADDED
Binary file (1.64 kB). View file
 
__pycache__/theme.cpython-312.pyc ADDED
Binary file (8.32 kB). View file
 
__pycache__/utils.cpython-312.pyc ADDED
Binary file (11.2 kB). View file
 
app.py CHANGED
@@ -1,99 +1,17 @@
1
- import html
2
- import re
3
-
4
  import pandas as pd
5
  import gradio as gr
6
 
7
  from about import TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_TEXT, DESCRIPTION_TEXT
8
  from theme import build_theme, CUSTOM_CSS
9
- from utils import load_results, submit_prediction
10
 
11
  GIT_CLONE_COMMAND = "git clone https://github.com/Masum06/Turing-Bench.git"
12
 
 
 
13
 
14
- def _format_inline(text: str) -> str:
15
- escaped = html.escape(text.strip())
16
- escaped = re.sub(r"\*\*(.+?)\*\*", r"<strong>\1</strong>", escaped)
17
- escaped = re.sub(r"`([^`]+)`", r"<code>\1</code>", escaped)
18
- return escaped
19
-
20
-
21
- def markdown_to_html(markdown: str, elem_classes: str = "html-block") -> str:
22
- lines = markdown.strip().splitlines()
23
- blocks: list[str] = []
24
- paragraph: list[str] = []
25
- list_items: list[str] = []
26
- code_lines: list[str] = []
27
- code_language = ""
28
- in_code_block = False
29
-
30
- def flush_paragraph():
31
- if paragraph:
32
- content = " ".join(part.strip() for part in paragraph if part.strip())
33
- if content:
34
- blocks.append(f"<p>{_format_inline(content)}</p>")
35
- paragraph.clear()
36
-
37
- def flush_list():
38
- if list_items:
39
- items_html = "".join(f"<li>{item}</li>" for item in list_items)
40
- blocks.append(f"<ul>{items_html}</ul>")
41
- list_items.clear()
42
-
43
- for raw_line in lines:
44
- stripped = raw_line.strip()
45
-
46
- if stripped.startswith("```"):
47
- flush_paragraph()
48
- flush_list()
49
- if in_code_block:
50
- code_html = html.escape("\n".join(code_lines))
51
- language_class = f' class="language-{code_language}"' if code_language else ""
52
- blocks.append(f"<pre><code{language_class}>{code_html}</code></pre>")
53
- code_lines.clear()
54
- code_language = ""
55
- in_code_block = False
56
- else:
57
- in_code_block = True
58
- code_language = stripped.removeprefix("```").strip()
59
- continue
60
-
61
- if in_code_block:
62
- code_lines.append(raw_line.rstrip())
63
- continue
64
-
65
- if not stripped:
66
- flush_paragraph()
67
- flush_list()
68
- continue
69
-
70
- heading_match = re.match(r"^(#{1,6})\s+(.*)$", stripped)
71
- if heading_match:
72
- flush_paragraph()
73
- flush_list()
74
- level = len(heading_match.group(1))
75
- blocks.append(f"<h{level}>{_format_inline(heading_match.group(2))}</h{level}>")
76
- continue
77
-
78
- if stripped.startswith("- "):
79
- flush_paragraph()
80
- list_items.append(_format_inline(stripped[2:]))
81
- continue
82
-
83
- flush_list()
84
- paragraph.append(stripped)
85
-
86
- flush_paragraph()
87
- flush_list()
88
-
89
- return f'<div class="{elem_classes}">{"".join(blocks)}</div>'
90
-
91
-
92
- def _format_accuracy(value) -> str:
93
- if pd.isna(value):
94
- return "N/A"
95
- return f"{float(value):.4f}"
96
-
97
 
98
  def build_leaderboard_summary(df: pd.DataFrame) -> str:
99
  if df.empty:
@@ -151,7 +69,6 @@ def refresh_leaderboard_view():
151
  df = load_results()
152
  return df, build_leaderboard_summary(df)
153
 
154
-
155
  def submit_prediction_html(model_name, predictions_file, profile: gr.OAuthProfile | None):
156
  message = submit_prediction(model_name, predictions_file, profile)
157
  return markdown_to_html(message, "html-block status-message")
@@ -271,7 +188,6 @@ with gr.Blocks(theme=build_theme(), css=CUSTOM_CSS, fill_width=True) as demo:
271
  gr.HTML(
272
  """
273
  <div class="section-kicker">Submission workflow</div>
274
- <div class="section-heading">Evaluate locally, then upload predictions</div>
275
  <div class="steps-row">
276
  <div class="step-chip">1. Log in</div>
277
  <div class="step-chip">2. Clone git repository and run evaluation locally</div>
@@ -361,13 +277,12 @@ with gr.Blocks(theme=build_theme(), css=CUSTOM_CSS, fill_width=True) as demo:
361
  )
362
 
363
  gr.HTML(
364
- """
365
- <div class="html-block>
366
  <p class="p-small">Thanks Locke (https://lockeidentity.com/) for sponsoring part of this research</p>
 
367
  </div>
368
  """
369
  )
370
- gr.Image(value="images/locke-logo.jpg", type="filepath", elem_classes="logo-small")
371
-
372
 
373
  demo.launch()
 
 
 
 
1
  import pandas as pd
2
  import gradio as gr
3
 
4
  from about import TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_TEXT, DESCRIPTION_TEXT
5
  from theme import build_theme, CUSTOM_CSS
6
+ from utils import load_results, submit_prediction, _format_inline, markdown_to_html, _format_accuracy
7
 
8
  GIT_CLONE_COMMAND = "git clone https://github.com/Masum06/Turing-Bench.git"
9
 
10
+ import base64
11
+ from pathlib import Path
12
 
13
+ img_path = Path(__file__).parent / "images" / "locke-logo.png"
14
+ b64 = base64.b64encode(img_path.read_bytes()).decode()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  def build_leaderboard_summary(df: pd.DataFrame) -> str:
17
  if df.empty:
 
69
  df = load_results()
70
  return df, build_leaderboard_summary(df)
71
 
 
72
  def submit_prediction_html(model_name, predictions_file, profile: gr.OAuthProfile | None):
73
  message = submit_prediction(model_name, predictions_file, profile)
74
  return markdown_to_html(message, "html-block status-message")
 
188
  gr.HTML(
189
  """
190
  <div class="section-kicker">Submission workflow</div>
 
191
  <div class="steps-row">
192
  <div class="step-chip">1. Log in</div>
193
  <div class="step-chip">2. Clone git repository and run evaluation locally</div>
 
277
  )
278
 
279
  gr.HTML(
280
+ f"""
281
+ <div class="html-block">
282
  <p class="p-small">Thanks Locke (https://lockeidentity.com/) for sponsoring part of this research</p>
283
+ <a href="https://lockeidentity.com/" target="_blank" rel="noopener noreferrer"><img class="logo-small" src="data:image/png;base64,{b64}"/></a>
284
  </div>
285
  """
286
  )
 
 
287
 
288
  demo.launch()
leaderboard.json DELETED
@@ -1,16 +0,0 @@
1
- [
2
- {
3
- "username": "ROC-HCI",
4
- "model": "human_judge",
5
- "accuracy": 0.5458,
6
- "date": "2026-03-23",
7
- "timestamp": "2026-03-23 14:49:20"
8
- },
9
- {
10
- "username": "ROC-HCI",
11
- "model": "GPT-4o",
12
- "accuracy": 0.4363,
13
- "date": "2026-03-01",
14
- "timestamp": "2026-03-01 16:37:58"
15
- }
16
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
run_eval.py DELETED
@@ -1,393 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Turing Test Judge Benchmark — Evaluation Script
4
- ================================================
5
- Given a dataset of paired dialogues (A and B), predict which is the human-human dialogue.
6
-
7
- SETUP
8
- -----
9
- 1. Install core dependencies:
10
- pip install pandas tqdm datasets
11
-
12
- 2. Install whatever library your model needs (see examples below).
13
-
14
- 3. Fill in the `predict()` function with your model.
15
-
16
- 4. Run:
17
- # Load from HuggingFace (default)
18
- python run_eval.py
19
-
20
- # Load from a local CSV
21
- python run_eval.py --input data.csv
22
-
23
- # Save output to a custom path
24
- python run_eval.py --output my_predictions.csv
25
-
26
- # Add a delay between API calls (seconds, useful for rate limits)
27
- python run_eval.py --delay 0.5
28
-
29
- OUTPUT FORMAT
30
- -------------
31
- A single-column CSV: who_is_human ∈ {"A", "B"}
32
- Ready to submit to the leaderboard.
33
-
34
- MULTITHREADING
35
- --------------
36
- Set USE_THREADS = True below to enable parallel inference.
37
- Set N_THREADS to control the number of worker threads.
38
- Recommended for API-based models (OpenAI, Groq, Together, etc.).
39
- NOT recommended for local models (transformers, Ollama) — use N_THREADS = 1.
40
- """
41
-
42
- import argparse
43
- import sys
44
- import time
45
- from concurrent.futures import ThreadPoolExecutor, as_completed
46
- from threading import Lock
47
-
48
- import pandas as pd
49
- from tqdm import tqdm
50
-
51
- # ══════════════════════════════════════════════════════════════════════════════
52
- # MULTITHREADING CONFIG
53
- # ══════════════════════════════════════════════════════════════════════════════
54
-
55
- USE_THREADS = True # Set True to enable parallel inference
56
- N_THREADS = 4 # Number of worker threads (ignored if USE_THREADS = False)
57
-
58
- # ══════════════════════════════════════════════════════════════════════════════
59
- # PROMPTS
60
- # ══════════════════════════════════════════════════════════════════════════════
61
-
62
- SYSTEM_PROMPT = """\
63
- INSTRUCTIONS:
64
- The following a task is an offline Turing test where you will be presented with two dialogues and your job is to identify the human.
65
-
66
- In this experiment, you will be presented with a
67
- pair of conversations, labeled "Transcript A" and "Transcript B", each one between an investigator I and a witness W.
68
- One of them is a human/human conversation and another is a human/AI conversation. Ignore all statements or actions by investigator I,
69
- and only focus on the witness W. Your job will be to decide in which conversation (A or B)
70
- the Witness W is human. The AI (LLM or otherwise) is highly advanced, so the differences may be very subtle,
71
- hence pay close attention.
72
-
73
- Each conversation involves:
74
- A Witness, labeled 'W'
75
- An Investigator, labeled 'I'
76
-
77
- The Investigator (I) is always human. In exactly one
78
- of the conversations, the Witness W is human, and
79
- in the other, the Witness W is Artificial Intelligence.
80
-
81
- Please provide your response in JSON format:
82
-
83
- {
84
- "result": {
85
- "verdict": <"A" or "B">,
86
- }
87
- }
88
- """
89
-
90
- USER_TEMPLATE = """\
91
- === Transcript A ===
92
- {dialogueA}
93
-
94
- === Transcript B ===
95
- {dialogueB}
96
-
97
- Output:
98
- """
99
-
100
- # ══════════════════════════════════════════════════════════════════════════════
101
- # DEFINE YOUR MODEL HERE
102
- #
103
- # Fill in the predict() function below. It receives the two dialogue
104
- # transcripts as plain strings and must return either "A" or "B".
105
- #
106
- # Use SYSTEM_PROMPT and USER_TEMPLATE.format(dialogueA=..., dialogueB=...)
107
- # to build your prompt.
108
- #
109
- # A few copy-paste starter examples are included as comments beneath
110
- # the function.
111
- #
112
- # Thread safety: if USE_THREADS = True, predict() will be called from
113
- # multiple threads simultaneously. Stateless API clients (OpenAI, Groq, etc.)
114
- # are safe by default. For local models, set USE_THREADS = False or ensure
115
- # your pipeline/model object is thread-safe.
116
- # ══════════════════════════════════════════════════════════════════════════════
117
-
118
- MAX_RETRIES = 5
119
- BASE_DELAY = 1.0 # seconds — doubles each attempt: 1, 2, 4, 8, 16
120
-
121
- def predict(dialogueA: str, dialogueB: str) -> str:
122
- """
123
- Output the following information in JSON format:
124
- {
125
- "result": {
126
- "verdict": <"A" or "B">,
127
- "confidence": <0,(Total guess) - 100 (Totally sure)>,
128
- "reasoning": <0200 characters>
129
- }
130
- }
131
- For the "verdict" key, return "A" if dialogueA is the human-human conversation, "B" if dialogueB is the human-human conversation.
132
- Replace the body of this function with your own model call.
133
- """
134
- raise NotImplementedError(
135
- "Please fill in the predict() function with your model. "
136
- "See the examples in the comments below."
137
- )
138
-
139
- # EXAMPLE A — OpenAI-compatible API (OpenAI, Together, Groq, Ollama, etc.)
140
- # Works with any provider that follows the OpenAI chat completion format.
141
- # Safe with USE_THREADS = True
142
- """
143
- Terminal: pip install openai
144
-
145
- import os
146
- import time
147
- from openai import OpenAI, RateLimitError, APIError
148
-
149
- client = OpenAI(
150
- api_key=os.environ["OPENAI_API_KEY"], # or your provider's key
151
- base_url="https://api.openai.com/v1", # swap for Groq/Together/etc.
152
- )
153
-
154
- MAX_RETRIES = 5
155
- BASE_DELAY = 1.0 # seconds — doubles each attempt: 1, 2, 4, 8, 16
156
-
157
- def predict(dialogueA: str, dialogueB: str) -> str:
158
- prompt = USER_TEMPLATE.format(dialogueA=dialogueA, dialogueB=dialogueB)
159
- for attempt in range(MAX_RETRIES):
160
- try:
161
- resp = client.chat.completions.create(
162
- model="gpt-4o", # swap for any model name
163
- messages=[
164
- {"role": "system", "content": SYSTEM_PROMPT},
165
- {"role": "user", "content": prompt},
166
- ],
167
- max_completion_tokens=1024,
168
- temperature=1,
169
- )
170
- return resp.choices[0].message.content
171
- except RateLimitError:
172
- wait = BASE_DELAY * (2 ** attempt)
173
- print(f"Rate limited (attempt {attempt + 1}/{MAX_RETRIES}), retrying in {wait:.1f}s...")
174
- time.sleep(wait)
175
- except APIError as e:
176
- wait = BASE_DELAY * (2 ** attempt)
177
- print(f"API error: {e} (attempt {attempt + 1}/{MAX_RETRIES}), retrying in {wait:.1f}s...")
178
- time.sleep(wait)
179
- raise RuntimeError(f"predict() failed after {MAX_RETRIES} attempts")
180
- """
181
-
182
- # EXAMPLE B — Hugging Face transformers (local model)
183
- # Set USE_THREADS = False for local models
184
- """
185
- Terminal: pip install transformers torch
186
-
187
- from transformers import pipeline
188
-
189
- pipe = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2")
190
-
191
- def predict(dialogueA: str, dialogueB: str) -> str:
192
- prompt = SYSTEM_PROMPT + "\\n\\n" + USER_TEMPLATE.format(
193
- dialogueA=dialogueA, dialogueB=dialogueB
194
- )
195
- out = pipe(prompt, max_new_tokens=5, temperature=0.0)[0]["generated_text"]
196
- return out
197
- """
198
-
199
- # EXAMPLE C — Ollama (local server, any model pulled via `ollama pull`)
200
- # Set USE_THREADS = False for local models
201
- """
202
- Terminal: pip install ollama
203
-
204
- import ollama
205
-
206
- def predict(dialogueA: str, dialogueB: str) -> str:
207
- prompt = USER_TEMPLATE.format(dialogueA=dialogueA, dialogueB=dialogueB)
208
- resp = ollama.chat(
209
- model="llama3",
210
- messages=[
211
- {"role": "system", "content": SYSTEM_PROMPT},
212
- {"role": "user", "content": prompt},
213
- ],
214
- )
215
- return resp["message"]["content"]
216
- """
217
-
218
-
219
- # ══════════════════════════════════════════════════════════════════════════════
220
- # Internals — no need to edit below this line
221
- # ══════════════════════════════════════════════════════════════════════════════
222
-
223
- HF_DATASET_PATH = "hf://datasets/roc-hci/Turing-Bench/turing_bench_public_shuffled.csv"
224
- HF_SPLIT = "train"
225
-
226
-
227
- def load_json(s: str) -> dict | None:
228
- import json
229
- try:
230
- return json.loads(s)
231
- except json.JSONDecodeError:
232
- return None
233
-
234
-
235
- def parse_json(reply: str) -> dict | None:
236
- if not reply:
237
- print("Empty reply")
238
- return None
239
-
240
- reply = reply.strip()
241
- if reply.startswith("```json"):
242
- reply = reply[len("```json"):].strip()
243
- if reply.endswith("```"):
244
- reply = reply[:-3].strip()
245
-
246
- if not (reply.startswith("{") and reply.endswith("}")):
247
- print("Not JSON structure")
248
- return None
249
-
250
- try:
251
- return load_json(reply)
252
- except Exception:
253
- print("Error parsing JSON")
254
- return None
255
-
256
-
257
- def load_data(input_path: str | None) -> pd.DataFrame:
258
- if input_path:
259
- print(f"Loading data from local file: {input_path}")
260
- df = pd.read_csv(input_path)
261
- else:
262
- print(f"Loading data from HuggingFace: {HF_DATASET_PATH}")
263
- try:
264
- from datasets import load_dataset
265
- except ImportError:
266
- sys.exit("datasets package not found. Run: pip install datasets")
267
- ds = load_dataset("csv", data_files=HF_DATASET_PATH, split=HF_SPLIT)
268
- df = ds.to_pandas()
269
-
270
- missing = {"dialogueA", "dialogueB"} - set(df.columns)
271
- if missing:
272
- sys.exit(f"Input data is missing required columns: {missing}")
273
-
274
- return df
275
-
276
-
277
- def run_single(rows: list[dict], delay: float) -> list[tuple[int, str]]:
278
- """Sequential inference with a progress bar."""
279
- results = []
280
- for row in tqdm(rows, desc="Running predictions (single-threaded)"):
281
- try:
282
- pred = parse_json(
283
- predict(str(row["dialogueA"]), str(row["dialogueB"]))
284
- )["result"]["verdict"]
285
- if pred not in ("A", "B"):
286
- raise ValueError(f"predict() returned {pred!r} — must be 'A' or 'B'")
287
- except NotImplementedError:
288
- sys.exit(
289
- "\n✗ predict() is not implemented yet.\n"
290
- " Open this script and fill in the predict() function with your model."
291
- )
292
- except Exception as exc:
293
- print(f"\nError on row {row['_idx']}: {exc} — defaulting to 'NA'")
294
- pred = "NA"
295
-
296
- results.append((row["_idx"], pred))
297
-
298
- if delay > 0:
299
- time.sleep(delay)
300
-
301
- return results
302
-
303
-
304
- def run_threaded(rows: list[dict], delay: float, n_threads: int) -> list[tuple[int, str]]:
305
- """Parallel inference across n_threads workers."""
306
- results = {}
307
- errors = 0
308
- lock = Lock()
309
- completed = 0
310
-
311
- print(f"Running predictions with {n_threads} threads...")
312
- pbar = tqdm(total=len(rows), desc=f"Running predictions ({n_threads} threads)")
313
-
314
- def worker(row: dict) -> tuple[int, str]:
315
- nonlocal errors, completed
316
- try:
317
- pred = parse_json(
318
- predict(str(row["dialogueA"]), str(row["dialogueB"]))
319
- )["result"]["verdict"]
320
- if pred not in ("A", "B"):
321
- raise ValueError(f"predict() returned {pred!r} — must be 'A' or 'B'")
322
- except NotImplementedError:
323
- sys.exit(
324
- "\npredict() is not implemented yet.\n"
325
- " Open this script and fill in the predict() function with your model."
326
- )
327
- except Exception as exc:
328
- print(f"\nError on row {row['_idx']}: {type(exc).__name__}: {exc} — defaulting to 'NA'")
329
- with lock:
330
- errors += 1
331
- pred = "NA"
332
-
333
- if delay > 0:
334
- time.sleep(delay)
335
-
336
- return row["_idx"], pred
337
-
338
- with ThreadPoolExecutor(max_workers=n_threads) as executor:
339
- futures = {executor.submit(worker, row): row for row in rows}
340
- for future in as_completed(futures):
341
- idx, pred = future.result()
342
- results[idx] = pred
343
- pbar.update(1)
344
-
345
- pbar.close()
346
- return sorted(results.items()) # return in original row order
347
-
348
-
349
- def main():
350
- parser = argparse.ArgumentParser(
351
- description="Turing Test Judge Benchmark — generate predictions with your model."
352
- )
353
- parser.add_argument(
354
- "--input", default=None,
355
- help="Path to a local CSV file. If omitted, data is loaded from HuggingFace.",
356
- )
357
- parser.add_argument(
358
- "--output", default="predictions.csv",
359
- help="Output CSV file path (default: predictions.csv).",
360
- )
361
- parser.add_argument(
362
- "--delay", type=float, default=0.0,
363
- help="Seconds to wait between calls (useful for rate-limited APIs, default: 0).",
364
- )
365
- args = parser.parse_args()
366
-
367
- df = load_data(args.input)
368
- print(f"Loaded {len(df)} examples.\n")
369
-
370
- # Attach index so threaded results can be re-ordered correctly
371
- rows = [{"_idx": i, **row} for i, row in df.iterrows()]
372
-
373
- if USE_THREADS:
374
- ordered = run_threaded(rows, args.delay, N_THREADS)
375
- else:
376
- ordered = run_single(rows, args.delay)
377
-
378
- preds = [pred for _, pred in ordered]
379
- errors = preds.count("NA")
380
-
381
- out_df = pd.DataFrame({"who_is_human": preds})
382
- out_df.to_csv(args.output, index=False)
383
-
384
- print(f"\n✓ Predictions saved to: {args.output}")
385
- print(f" Total : {len(preds)} | A: {preds.count('A')} | B: {preds.count('B')} | NA: {errors}")
386
- if USE_THREADS:
387
- print(f" Threads used: {N_THREADS}")
388
- if errors:
389
- print(f"{errors} row(s) errored and defaulted to 'NA'")
390
- print("\nNext step: submit your predictions CSV to the leaderboard at https://huggingface.co/spaces/roc-hci/Turing-Bench-Leaderboard")
391
-
392
- if __name__ == "__main__":
393
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
theme.py CHANGED
@@ -385,12 +385,12 @@ CUSTOM_CSS = """
385
  }
386
 
387
  .p-small {
388
- font-size: 0.5rem;
389
  }
390
 
391
  .logo-small {
392
  height: auto;
393
- max-width: 125px
394
  }
395
 
396
  @media (max-width: 900px) {
 
385
  }
386
 
387
  .p-small {
388
+ font-size: 0.8rem;
389
  }
390
 
391
  .logo-small {
392
  height: auto;
393
+ max-width: 70px
394
  }
395
 
396
  @media (max-width: 900px) {
utils.py CHANGED
@@ -6,11 +6,14 @@ import pandas as pd
6
  from huggingface_hub import HfApi
7
  import gradio as gr
8
 
 
 
 
9
  API = HfApi()
10
  SUBMISSIONS_REPO = "roc-hci/turing-bench-submissions"
11
  RESULTS_REPO = "roc-hci/turing-bench-results"
12
  HF_TOKEN = os.environ.get("HF_TOKEN")
13
- GOLD_LABELS = json.loads(os.environ.get("PRIVATE_LABELS"))
14
 
15
 
16
  def submit_prediction(model_name: str, predictions_file, profile: gr.OAuthProfile | None) -> str:
@@ -158,3 +161,87 @@ def load_results() -> pd.DataFrame:
158
  except Exception as e:
159
  print(f"Error loading results: {e}")
160
  return pd.DataFrame(columns=["Model", "User", "Time" "Accuracy"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  from huggingface_hub import HfApi
7
  import gradio as gr
8
 
9
+ import html
10
+ import re
11
+
12
  API = HfApi()
13
  SUBMISSIONS_REPO = "roc-hci/turing-bench-submissions"
14
  RESULTS_REPO = "roc-hci/turing-bench-results"
15
  HF_TOKEN = os.environ.get("HF_TOKEN")
16
+ #GOLD_LABELS = json.loads(os.environ.get("PRIVATE_LABELS"))
17
 
18
 
19
  def submit_prediction(model_name: str, predictions_file, profile: gr.OAuthProfile | None) -> str:
 
161
  except Exception as e:
162
  print(f"Error loading results: {e}")
163
  return pd.DataFrame(columns=["Model", "User", "Time" "Accuracy"])
164
+
165
+
166
+ def _format_inline(text: str) -> str:
167
+ escaped = html.escape(text.strip())
168
+ escaped = re.sub(r"\*\*(.+?)\*\*", r"<strong>\1</strong>", escaped)
169
+ escaped = re.sub(r"`([^`]+)`", r"<code>\1</code>", escaped)
170
+ return escaped
171
+
172
+
173
def markdown_to_html(markdown: str, elem_classes: str = "html-block") -> str:
    """Convert a small Markdown subset to HTML wrapped in a styled <div>.

    Supported syntax: #..###### headings, "- " bullet lists, ``` fenced code
    blocks (with optional language tag), blank-line-separated paragraphs, and
    the inline **bold** / `code` markers handled by _format_inline.  All text
    is HTML-escaped.

    Args:
        markdown: Markdown source text.
        elem_classes: CSS class(es) placed on the wrapping <div>.

    Returns:
        A single HTML string with no whitespace between block elements.
    """
    lines = markdown.strip().splitlines()
    blocks: list[str] = []
    paragraph: list[str] = []   # pending plain-text lines of the current paragraph
    list_items: list[str] = []  # pending <li> bodies of the current bullet list
    code_lines: list[str] = []  # raw lines inside the current fenced code block
    code_language = ""
    in_code_block = False

    def flush_paragraph():
        # Join pending lines into one <p>; no-op when nothing is pending.
        if paragraph:
            content = " ".join(part.strip() for part in paragraph if part.strip())
            if content:
                blocks.append(f"<p>{_format_inline(content)}</p>")
            paragraph.clear()

    def flush_list():
        if list_items:
            items_html = "".join(f"<li>{item}</li>" for item in list_items)
            blocks.append(f"<ul>{items_html}</ul>")
            list_items.clear()

    def flush_code_block():
        # Emit collected fenced code (escaped, optional language-* class).
        nonlocal code_language, in_code_block
        code_html = html.escape("\n".join(code_lines))
        language_class = f' class="language-{code_language}"' if code_language else ""
        blocks.append(f"<pre><code{language_class}>{code_html}</code></pre>")
        code_lines.clear()
        code_language = ""
        in_code_block = False

    for raw_line in lines:
        stripped = raw_line.strip()

        if stripped.startswith("```"):
            flush_paragraph()
            flush_list()
            if in_code_block:
                flush_code_block()
            else:
                in_code_block = True
                code_language = stripped.removeprefix("```").strip()
            continue

        if in_code_block:
            # Preserve the line verbatim (minus trailing whitespace).
            code_lines.append(raw_line.rstrip())
            continue

        if not stripped:
            # Blank line terminates the current paragraph/list.
            flush_paragraph()
            flush_list()
            continue

        heading_match = re.match(r"^(#{1,6})\s+(.*)$", stripped)
        if heading_match:
            flush_paragraph()
            flush_list()
            level = len(heading_match.group(1))
            blocks.append(f"<h{level}>{_format_inline(heading_match.group(2))}</h{level}>")
            continue

        if stripped.startswith("- "):
            flush_paragraph()
            list_items.append(_format_inline(stripped[2:]))
            continue

        flush_list()
        paragraph.append(stripped)

    flush_paragraph()
    flush_list()
    # Fix: a fence left unterminated at EOF previously dropped its contents
    # entirely; close it like CommonMark does and emit the collected code.
    if in_code_block:
        flush_code_block()

    return f'<div class="{elem_classes}">{"".join(blocks)}</div>'
242
+
243
+
244
+ def _format_accuracy(value) -> str:
245
+ if pd.isna(value):
246
+ return "N/A"
247
+ return f"{float(value):.4f}"