locke-logo (#1)

- added locke logo, fixed utils (b65a1fc85513287d351302d7b367dda472ce5105)
- removed old leaderboard (679963a518a78b23fbf27b8ea801253eb288ee29)

Files changed:
- __pycache__/about.cpython-312.pyc +0 -0
- __pycache__/theme.cpython-312.pyc +0 -0
- __pycache__/utils.cpython-312.pyc +0 -0
- app.py +8 -93
- leaderboard.json +0 -16
- run_eval.py +0 -393
- theme.py +2 -2
- utils.py +88 -1

__pycache__/about.cpython-312.pyc   ADDED (binary file, 1.64 kB)
__pycache__/theme.cpython-312.pyc   ADDED (binary file, 8.32 kB)
__pycache__/utils.cpython-312.pyc   ADDED (binary file, 11.2 kB)

app.py  CHANGED

@@ -1,99 +1,17 @@
-import html
-import re
-
 import pandas as pd
 import gradio as gr
 
 from about import TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_TEXT, DESCRIPTION_TEXT
 from theme import build_theme, CUSTOM_CSS
-from utils import load_results, submit_prediction
+from utils import load_results, submit_prediction, _format_inline, markdown_to_html, _format_accuracy
 
 GIT_CLONE_COMMAND = "git clone https://github.com/Masum06/Turing-Bench.git"
 
+import base64
+from pathlib import Path
+
+img_path = Path(__file__).parent / "images" / "locke-logo.png"
+b64 = base64.b64encode(img_path.read_bytes()).decode()
 
-
-def _format_inline(text: str) -> str:
-    escaped = html.escape(text.strip())
-    escaped = re.sub(r"\*\*(.+?)\*\*", r"<strong>\1</strong>", escaped)
-    escaped = re.sub(r"`([^`]+)`", r"<code>\1</code>", escaped)
-    return escaped
-
-
-def markdown_to_html(markdown: str, elem_classes: str = "html-block") -> str:
-    lines = markdown.strip().splitlines()
-    blocks: list[str] = []
-    paragraph: list[str] = []
-    list_items: list[str] = []
-    code_lines: list[str] = []
-    code_language = ""
-    in_code_block = False
-
-    def flush_paragraph():
-        if paragraph:
-            content = " ".join(part.strip() for part in paragraph if part.strip())
-            if content:
-                blocks.append(f"<p>{_format_inline(content)}</p>")
-            paragraph.clear()
-
-    def flush_list():
-        if list_items:
-            items_html = "".join(f"<li>{item}</li>" for item in list_items)
-            blocks.append(f"<ul>{items_html}</ul>")
-            list_items.clear()
-
-    for raw_line in lines:
-        stripped = raw_line.strip()
-
-        if stripped.startswith("```"):
-            flush_paragraph()
-            flush_list()
-            if in_code_block:
-                code_html = html.escape("\n".join(code_lines))
-                language_class = f' class="language-{code_language}"' if code_language else ""
-                blocks.append(f"<pre><code{language_class}>{code_html}</code></pre>")
-                code_lines.clear()
-                code_language = ""
-                in_code_block = False
-            else:
-                in_code_block = True
-                code_language = stripped.removeprefix("```").strip()
-            continue
-
-        if in_code_block:
-            code_lines.append(raw_line.rstrip())
-            continue
-
-        if not stripped:
-            flush_paragraph()
-            flush_list()
-            continue
-
-        heading_match = re.match(r"^(#{1,6})\s+(.*)$", stripped)
-        if heading_match:
-            flush_paragraph()
-            flush_list()
-            level = len(heading_match.group(1))
-            blocks.append(f"<h{level}>{_format_inline(heading_match.group(2))}</h{level}>")
-            continue
-
-        if stripped.startswith("- "):
-            flush_paragraph()
-            list_items.append(_format_inline(stripped[2:]))
-            continue
-
-        flush_list()
-        paragraph.append(stripped)
-
-    flush_paragraph()
-    flush_list()
-
-    return f'<div class="{elem_classes}">{"".join(blocks)}</div>'
-
-
-def _format_accuracy(value) -> str:
-    if pd.isna(value):
-        return "N/A"
-    return f"{float(value):.4f}"
-
 
 def build_leaderboard_summary(df: pd.DataFrame) -> str:
     if df.empty:

@@ -151,7 +69,6 @@ def refresh_leaderboard_view():
     df = load_results()
     return df, build_leaderboard_summary(df)
 
-
 def submit_prediction_html(model_name, predictions_file, profile: gr.OAuthProfile | None):
     message = submit_prediction(model_name, predictions_file, profile)
     return markdown_to_html(message, "html-block status-message")

@@ -271,7 +188,6 @@ with gr.Blocks(theme=build_theme(), css=CUSTOM_CSS, fill_width=True) as demo:
     gr.HTML(
         """
         <div class="section-kicker">Submission workflow</div>
-        <div class="section-heading">Evaluate locally, then upload predictions</div>
         <div class="steps-row">
             <div class="step-chip">1. Log in</div>
             <div class="step-chip">2. Clone git repository and run evaluation locally</div>

@@ -361,13 +277,12 @@ with gr.Blocks(theme=build_theme(), css=CUSTOM_CSS, fill_width=True) as demo:
     )
 
     gr.HTML(
-        """
-        <div class="html-block>
+        f"""
+        <div class="html-block">
         <p class="p-small">Thanks Locke (https://lockeidentity.com/) for sponsoring part of this research</p>
+        <a href="https://lockeidentity.com/" target="_blank" rel="noopener noreferrer"><img class="logo-small" src="data:image/png;base64,{b64}"/></a>
         </div>
         """
     )
-    gr.Image(value="images/locke-logo.jpg", type="filepath", elem_classes="logo-small")
-
 
 demo.launch()
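
The net effect of the app.py change: the old standalone gr.Image component is dropped, and the logo is instead read once at startup, base64-encoded, and inlined into the sponsor gr.HTML block as a data URI, which also makes it clickable via the surrounding <a> tag. A minimal standalone sketch of the same pattern, using the file path and CSS class from the diff (the bare Blocks scaffold here is illustrative):

    import base64
    from pathlib import Path

    import gradio as gr

    # Encode the logo once at import time; a data URI avoids serving
    # the image through a separate static-file route.
    img_path = Path(__file__).parent / "images" / "locke-logo.png"
    b64 = base64.b64encode(img_path.read_bytes()).decode()

    with gr.Blocks() as demo:
        gr.HTML(
            f'<a href="https://lockeidentity.com/" target="_blank" rel="noopener noreferrer">'
            f'<img class="logo-small" src="data:image/png;base64,{b64}"/></a>'
        )

    demo.launch()

Note that app.py now reads images/locke-logo.png while the deleted gr.Image call pointed at images/locke-logo.jpg, so the commit assumes the PNG exists in the repository.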

leaderboard.json  DELETED

@@ -1,16 +0,0 @@
-[
-  {
-    "username": "ROC-HCI",
-    "model": "human_judge",
-    "accuracy": 0.5458,
-    "date": "2026-03-23",
-    "timestamp": "2026-03-23 14:49:20"
-  },
-  {
-    "username": "ROC-HCI",
-    "model": "GPT-4o",
-    "accuracy": 0.4363,
-    "date": "2026-03-01",
-    "timestamp": "2026-03-01 16:37:58"
-  }
-]

run_eval.py  DELETED

@@ -1,393 +0,0 @@
-#!/usr/bin/env python3
-"""
-Turing Test Judge Benchmark — Evaluation Script
-================================================
-Given a dataset of paired dialogues (A and B), predict which is the human-human dialogue.
-
-SETUP
------
-1. Install core dependencies:
-   pip install pandas tqdm datasets
-
-2. Install whatever library your model needs (see examples below).
-
-3. Fill in the `predict()` function with your model.
-
-4. Run:
-   # Load from HuggingFace (default)
-   python run_eval.py
-
-   # Load from a local CSV
-   python run_eval.py --input data.csv
-
-   # Save output to a custom path
-   python run_eval.py --output my_predictions.csv
-
-   # Add a delay between API calls (seconds, useful for rate limits)
-   python run_eval.py --delay 0.5
-
-OUTPUT FORMAT
--------------
-A single-column CSV: who_is_human ∈ {"A", "B"}
-Ready to submit to the leaderboard.
-
-MULTITHREADING
---------------
-Set USE_THREADS = True below to enable parallel inference.
-Set N_THREADS to control the number of worker threads.
-Recommended for API-based models (OpenAI, Groq, Together, etc.).
-NOT recommended for local models (transformers, Ollama) — use N_THREADS = 1.
-"""
-
-import argparse
-import sys
-import time
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from threading import Lock
-
-import pandas as pd
-from tqdm import tqdm
-
-# ══════════════════════════════════════════════════════════════════════════════
-# MULTITHREADING CONFIG
-# ══════════════════════════════════════════════════════════════════════════════
-
-USE_THREADS = True  # Set True to enable parallel inference
-N_THREADS = 4       # Number of worker threads (ignored if USE_THREADS = False)
-
-# ══════════════════════════════════════════════════════════════════════════════
-# PROMPTS
-# ══════════════════════════════════════════════════════════════════════════════
-
-SYSTEM_PROMPT = """\
-INSTRUCTIONS:
-The following a task is an offline Turing test where you will be presented with two dialogues and your job is to identify the human.
-
-In this experiment, you will be presented with a
-pair of conversations, labeled "Transcript A" and "Transcript B", each one between an investigator I and a witness W.
-One of them is a human/human conversation and another is a human/AI conversation. Ignore all statements or actions by investigator I,
-and only focus on the witness W. Your job will be to decide in which conversation (A or B)
-the Witness W is human. The AI (LLM or otherwise) is highly advanced, so the differences may be very subtle,
-hence pay close attention.
-
-Each conversation involves:
-A Witness, labeled 'W'
-An Investigator, labeled 'I'
-
-The Investigator (I) is always human. In exactly one
-of the conversations, the Witness W is human, and
-in the other, the Witness W is Artificial Intelligence.
-
-Please provide your response in JSON format:
-
-{
-  "result": {
-    "verdict": <"A" or "B">,
-  }
-}
-"""
-
-USER_TEMPLATE = """\
-=== Transcript A ===
-{dialogueA}
-
-=== Transcript B ===
-{dialogueB}
-
-Output:
-"""
-
-# ══════════════════════════════════════════════════════════════════════════════
-# DEFINE YOUR MODEL HERE
-#
-# Fill in the predict() function below. It receives the two dialogue
-# transcripts as plain strings and must return either "A" or "B".
-#
-# Use SYSTEM_PROMPT and USER_TEMPLATE.format(dialogueA=..., dialogueB=...)
-# to build your prompt.
-#
-# A few copy-paste starter examples are included as comments beneath
-# the function.
-#
-# Thread safety: if USE_THREADS = True, predict() will be called from
-# multiple threads simultaneously. Stateless API clients (OpenAI, Groq, etc.)
-# are safe by default. For local models, set USE_THREADS = False or ensure
-# your pipeline/model object is thread-safe.
-# ══════════════════════════════════════════════════════════════════════════════
-
-MAX_RETRIES = 5
-BASE_DELAY = 1.0  # seconds — doubles each attempt: 1, 2, 4, 8, 16
-
-def predict(dialogueA: str, dialogueB: str) -> str:
-    """
-    Output the following information in JSON format:
-    {
-      "result": {
-        "verdict": <"A" or "B">,
-        "confidence": <0 (Total guess) - 100 (Totally sure)>,
-        "reasoning": <0-200 characters>
-      }
-    }
-    For the "verdict" key, return "A" if dialogueA is the human-human conversation, "B" if dialogueB is the human-human conversation.
-    Replace the body of this function with your own model call.
-    """
-    raise NotImplementedError(
-        "Please fill in the predict() function with your model. "
-        "See the examples in the comments below."
-    )
-
-# EXAMPLE A — OpenAI-compatible API (OpenAI, Together, Groq, Ollama, etc.)
-# Works with any provider that follows the OpenAI chat completion format.
-# Safe with USE_THREADS = True
-"""
-Terminal: pip install openai
-
-import os
-import time
-from openai import OpenAI, RateLimitError, APIError
-
-client = OpenAI(
-    api_key=os.environ["OPENAI_API_KEY"],  # or your provider's key
-    base_url="https://api.openai.com/v1",  # swap for Groq/Together/etc.
-)
-
-MAX_RETRIES = 5
-BASE_DELAY = 1.0  # seconds — doubles each attempt: 1, 2, 4, 8, 16
-
-def predict(dialogueA: str, dialogueB: str) -> str:
-    prompt = USER_TEMPLATE.format(dialogueA=dialogueA, dialogueB=dialogueB)
-    for attempt in range(MAX_RETRIES):
-        try:
-            resp = client.chat.completions.create(
-                model="gpt-4o",  # swap for any model name
-                messages=[
-                    {"role": "system", "content": SYSTEM_PROMPT},
-                    {"role": "user", "content": prompt},
-                ],
-                max_completion_tokens=1024,
-                temperature=1,
-            )
-            return resp.choices[0].message.content
-        except RateLimitError:
-            wait = BASE_DELAY * (2 ** attempt)
-            print(f"Rate limited (attempt {attempt + 1}/{MAX_RETRIES}), retrying in {wait:.1f}s...")
-            time.sleep(wait)
-        except APIError as e:
-            wait = BASE_DELAY * (2 ** attempt)
-            print(f"API error: {e} (attempt {attempt + 1}/{MAX_RETRIES}), retrying in {wait:.1f}s...")
-            time.sleep(wait)
-    raise RuntimeError(f"predict() failed after {MAX_RETRIES} attempts")
-"""
-
-# EXAMPLE B — Hugging Face transformers (local model)
-# Set USE_THREADS = False for local models
-"""
-Terminal: pip install transformers torch
-
-from transformers import pipeline
-
-pipe = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2")
-
-def predict(dialogueA: str, dialogueB: str) -> str:
-    prompt = SYSTEM_PROMPT + "\\n\\n" + USER_TEMPLATE.format(
-        dialogueA=dialogueA, dialogueB=dialogueB
-    )
-    out = pipe(prompt, max_new_tokens=5, temperature=0.0)[0]["generated_text"]
-    return out
-"""
-
-# EXAMPLE C — Ollama (local server, any model pulled via `ollama pull`)
-# Set USE_THREADS = False for local models
-"""
-Terminal: pip install ollama
-
-import ollama
-
-def predict(dialogueA: str, dialogueB: str) -> str:
-    prompt = USER_TEMPLATE.format(dialogueA=dialogueA, dialogueB=dialogueB)
-    resp = ollama.chat(
-        model="llama3",
-        messages=[
-            {"role": "system", "content": SYSTEM_PROMPT},
-            {"role": "user", "content": prompt},
-        ],
-    )
-    return resp["message"]["content"]
-"""
-
-
-# ══════════════════════════════════════════════════════════════════════════════
-# Internals — no need to edit below this line
-# ══════════════════════════════════════════════════════════════════════════════
-
-HF_DATASET_PATH = "hf://datasets/roc-hci/Turing-Bench/turing_bench_public_shuffled.csv"
-HF_SPLIT = "train"
-
-
-def load_json(s: str) -> dict | None:
-    import json
-    try:
-        return json.loads(s)
-    except json.JSONDecodeError:
-        return None
-
-
-def parse_json(reply: str) -> dict | None:
-    if not reply:
-        print("Empty reply")
-        return None
-
-    reply = reply.strip()
-    if reply.startswith("```json"):
-        reply = reply[len("```json"):].strip()
-    if reply.endswith("```"):
-        reply = reply[:-3].strip()
-
-    if not (reply.startswith("{") and reply.endswith("}")):
-        print("Not JSON structure")
-        return None
-
-    try:
-        return load_json(reply)
-    except Exception:
-        print("Error parsing JSON")
-        return None
-
-
-def load_data(input_path: str | None) -> pd.DataFrame:
-    if input_path:
-        print(f"Loading data from local file: {input_path}")
-        df = pd.read_csv(input_path)
-    else:
-        print(f"Loading data from HuggingFace: {HF_DATASET_PATH}")
-        try:
-            from datasets import load_dataset
-        except ImportError:
-            sys.exit("datasets package not found. Run: pip install datasets")
-        ds = load_dataset("csv", data_files=HF_DATASET_PATH, split=HF_SPLIT)
-        df = ds.to_pandas()
-
-    missing = {"dialogueA", "dialogueB"} - set(df.columns)
-    if missing:
-        sys.exit(f"Input data is missing required columns: {missing}")
-
-    return df
-
-
-def run_single(rows: list[dict], delay: float) -> list[tuple[int, str]]:
-    """Sequential inference with a progress bar."""
-    results = []
-    for row in tqdm(rows, desc="Running predictions (single-threaded)"):
-        try:
-            pred = parse_json(
-                predict(str(row["dialogueA"]), str(row["dialogueB"]))
-            )["result"]["verdict"]
-            if pred not in ("A", "B"):
-                raise ValueError(f"predict() returned {pred!r} — must be 'A' or 'B'")
-        except NotImplementedError:
-            sys.exit(
-                "\n✗ predict() is not implemented yet.\n"
-                "  Open this script and fill in the predict() function with your model."
-            )
-        except Exception as exc:
-            print(f"\nError on row {row['_idx']}: {exc} — defaulting to 'NA'")
-            pred = "NA"
-
-        results.append((row["_idx"], pred))
-
-        if delay > 0:
-            time.sleep(delay)
-
-    return results
-
-
-def run_threaded(rows: list[dict], delay: float, n_threads: int) -> list[tuple[int, str]]:
-    """Parallel inference across n_threads workers."""
-    results = {}
-    errors = 0
-    lock = Lock()
-    completed = 0
-
-    print(f"Running predictions with {n_threads} threads...")
-    pbar = tqdm(total=len(rows), desc=f"Running predictions ({n_threads} threads)")
-
-    def worker(row: dict) -> tuple[int, str]:
-        nonlocal errors, completed
-        try:
-            pred = parse_json(
-                predict(str(row["dialogueA"]), str(row["dialogueB"]))
-            )["result"]["verdict"]
-            if pred not in ("A", "B"):
-                raise ValueError(f"predict() returned {pred!r} — must be 'A' or 'B'")
-        except NotImplementedError:
-            sys.exit(
-                "\npredict() is not implemented yet.\n"
-                "  Open this script and fill in the predict() function with your model."
-            )
-        except Exception as exc:
-            print(f"\nError on row {row['_idx']}: {type(exc).__name__}: {exc} — defaulting to 'NA'")
-            with lock:
-                errors += 1
-            pred = "NA"
-
-        if delay > 0:
-            time.sleep(delay)
-
-        return row["_idx"], pred
-
-    with ThreadPoolExecutor(max_workers=n_threads) as executor:
-        futures = {executor.submit(worker, row): row for row in rows}
-        for future in as_completed(futures):
-            idx, pred = future.result()
-            results[idx] = pred
-            pbar.update(1)
-
-    pbar.close()
-    return sorted(results.items())  # return in original row order
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Turing Test Judge Benchmark — generate predictions with your model."
-    )
-    parser.add_argument(
-        "--input", default=None,
-        help="Path to a local CSV file. If omitted, data is loaded from HuggingFace.",
-    )
-    parser.add_argument(
-        "--output", default="predictions.csv",
-        help="Output CSV file path (default: predictions.csv).",
-    )
-    parser.add_argument(
-        "--delay", type=float, default=0.0,
-        help="Seconds to wait between calls (useful for rate-limited APIs, default: 0).",
-    )
-    args = parser.parse_args()
-
-    df = load_data(args.input)
-    print(f"Loaded {len(df)} examples.\n")
-
-    # Attach index so threaded results can be re-ordered correctly
-    rows = [{"_idx": i, **row} for i, row in df.iterrows()]
-
-    if USE_THREADS:
-        ordered = run_threaded(rows, args.delay, N_THREADS)
-    else:
-        ordered = run_single(rows, args.delay)
-
-    preds = [pred for _, pred in ordered]
-    errors = preds.count("NA")
-
-    out_df = pd.DataFrame({"who_is_human": preds})
-    out_df.to_csv(args.output, index=False)
-
-    print(f"\n✓ Predictions saved to: {args.output}")
-    print(f"  Total : {len(preds)} | A: {preds.count('A')} | B: {preds.count('B')} | NA: {errors}")
-    if USE_THREADS:
-        print(f"  Threads used: {N_THREADS}")
-    if errors:
-        print(f"{errors} row(s) errored and defaulted to 'NA'")
-    print("\nNext step: submit your predictions CSV to the leaderboard at https://huggingface.co/spaces/roc-hci/Turing-Bench-Leaderboard")
-
-if __name__ == "__main__":
-    main()
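
For anyone who still has local predictions to submit, the OUTPUT FORMAT section of the deleted script specifies a single-column CSV with the header who_is_human and one "A" or "B" per row, for example:

    who_is_human
    A
    B
    A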

theme.py  CHANGED

@@ -385,12 +385,12 @@ CUSTOM_CSS = """
 }
 
 .p-small {
-    font-size: 0.
+    font-size: 0.8rem;
 }
 
 .logo-small {
     height: auto;
-    max-width:
+    max-width: 70px
 }
 
 @media (max-width: 900px) {
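
One caution on the new rule: "max-width: 70px" omits the trailing semicolon, which CSS tolerates only for the last declaration in a block, so appending another property to .logo-small later would silently break this one unless a ";" is added.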

utils.py  CHANGED

@@ -6,11 +6,14 @@ import pandas as pd
 from huggingface_hub import HfApi
 import gradio as gr
 
+import html
+import re
+
 API = HfApi()
 SUBMISSIONS_REPO = "roc-hci/turing-bench-submissions"
 RESULTS_REPO = "roc-hci/turing-bench-results"
 HF_TOKEN = os.environ.get("HF_TOKEN")
-GOLD_LABELS = json.loads(os.environ.get("PRIVATE_LABELS"))
+#GOLD_LABELS = json.loads(os.environ.get("PRIVATE_LABELS"))
 
 
 def submit_prediction(model_name: str, predictions_file, profile: gr.OAuthProfile | None) -> str:
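
A note on the commented-out line in the hunk above: json.loads(os.environ.get("PRIVATE_LABELS")) raises a TypeError whenever the PRIVATE_LABELS environment variable is unset, because os.environ.get returns None, and that crashes the Space at import time. Commenting it out sidesteps the crash; if the gold labels are needed again, a guarded variant (hypothetical, not part of this commit) would be:

    import json
    import os

    # Hypothetical guard, not in the commit: parse the gold labels only
    # when the PRIVATE_LABELS env var is actually set.
    raw = os.environ.get("PRIVATE_LABELS")
    GOLD_LABELS = json.loads(raw) if raw else None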
|
@@ -158,3 +161,87 @@ def load_results() -> pd.DataFrame:
|
|
| 158 |
except Exception as e:
|
| 159 |
print(f"Error loading results: {e}")
|
| 160 |
return pd.DataFrame(columns=["Model", "User", "Time" "Accuracy"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
from huggingface_hub import HfApi
|
| 7 |
import gradio as gr
|
| 8 |
|
| 9 |
+
import html
|
| 10 |
+
import re
|
| 11 |
+
|
| 12 |
API = HfApi()
|
| 13 |
SUBMISSIONS_REPO = "roc-hci/turing-bench-submissions"
|
| 14 |
RESULTS_REPO = "roc-hci/turing-bench-results"
|
| 15 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 16 |
+
#GOLD_LABELS = json.loads(os.environ.get("PRIVATE_LABELS"))
|
| 17 |
|
| 18 |
|
| 19 |
def submit_prediction(model_name: str, predictions_file, profile: gr.OAuthProfile | None) -> str:
|
|
|
|
| 161 |
except Exception as e:
|
| 162 |
print(f"Error loading results: {e}")
|
| 163 |
return pd.DataFrame(columns=["Model", "User", "Time" "Accuracy"])
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def _format_inline(text: str) -> str:
|
| 167 |
+
escaped = html.escape(text.strip())
|
| 168 |
+
escaped = re.sub(r"\*\*(.+?)\*\*", r"<strong>\1</strong>", escaped)
|
| 169 |
+
escaped = re.sub(r"`([^`]+)`", r"<code>\1</code>", escaped)
|
| 170 |
+
return escaped
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def markdown_to_html(markdown: str, elem_classes: str = "html-block") -> str:
|
| 174 |
+
lines = markdown.strip().splitlines()
|
| 175 |
+
blocks: list[str] = []
|
| 176 |
+
paragraph: list[str] = []
|
| 177 |
+
list_items: list[str] = []
|
| 178 |
+
code_lines: list[str] = []
|
| 179 |
+
code_language = ""
|
| 180 |
+
in_code_block = False
|
| 181 |
+
|
| 182 |
+
def flush_paragraph():
|
| 183 |
+
if paragraph:
|
| 184 |
+
content = " ".join(part.strip() for part in paragraph if part.strip())
|
| 185 |
+
if content:
|
| 186 |
+
blocks.append(f"<p>{_format_inline(content)}</p>")
|
| 187 |
+
paragraph.clear()
|
| 188 |
+
|
| 189 |
+
def flush_list():
|
| 190 |
+
if list_items:
|
| 191 |
+
items_html = "".join(f"<li>{item}</li>" for item in list_items)
|
| 192 |
+
blocks.append(f"<ul>{items_html}</ul>")
|
| 193 |
+
list_items.clear()
|
| 194 |
+
|
| 195 |
+
for raw_line in lines:
|
| 196 |
+
stripped = raw_line.strip()
|
| 197 |
+
|
| 198 |
+
if stripped.startswith("```"):
|
| 199 |
+
flush_paragraph()
|
| 200 |
+
flush_list()
|
| 201 |
+
if in_code_block:
|
| 202 |
+
code_html = html.escape("\n".join(code_lines))
|
| 203 |
+
language_class = f' class="language-{code_language}"' if code_language else ""
|
| 204 |
+
blocks.append(f"<pre><code{language_class}>{code_html}</code></pre>")
|
| 205 |
+
code_lines.clear()
|
| 206 |
+
code_language = ""
|
| 207 |
+
in_code_block = False
|
| 208 |
+
else:
|
| 209 |
+
in_code_block = True
|
| 210 |
+
code_language = stripped.removeprefix("```").strip()
|
| 211 |
+
continue
|
| 212 |
+
|
| 213 |
+
if in_code_block:
|
| 214 |
+
code_lines.append(raw_line.rstrip())
|
| 215 |
+
continue
|
| 216 |
+
|
| 217 |
+
if not stripped:
|
| 218 |
+
flush_paragraph()
|
| 219 |
+
flush_list()
|
| 220 |
+
continue
|
| 221 |
+
|
| 222 |
+
heading_match = re.match(r"^(#{1,6})\s+(.*)$", stripped)
|
| 223 |
+
if heading_match:
|
| 224 |
+
flush_paragraph()
|
| 225 |
+
flush_list()
|
| 226 |
+
level = len(heading_match.group(1))
|
| 227 |
+
blocks.append(f"<h{level}>{_format_inline(heading_match.group(2))}</h{level}>")
|
| 228 |
+
continue
|
| 229 |
+
|
| 230 |
+
if stripped.startswith("- "):
|
| 231 |
+
flush_paragraph()
|
| 232 |
+
list_items.append(_format_inline(stripped[2:]))
|
| 233 |
+
continue
|
| 234 |
+
|
| 235 |
+
flush_list()
|
| 236 |
+
paragraph.append(stripped)
|
| 237 |
+
|
| 238 |
+
flush_paragraph()
|
| 239 |
+
flush_list()
|
| 240 |
+
|
| 241 |
+
return f'<div class="{elem_classes}">{"".join(blocks)}</div>'
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def _format_accuracy(value) -> str:
|
| 245 |
+
if pd.isna(value):
|
| 246 |
+
return "N/A"
|
| 247 |
+
return f"{float(value):.4f}"
|
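
A quick usage sketch of the relocated markdown_to_html helper, assuming utils.py is importable as in the new app.py; the comment shows the exact output the block rules above produce:

    from utils import markdown_to_html

    print(markdown_to_html("# Title\n\n- item with `code`\n\nSome **bold** text."))
    # <div class="html-block"><h1>Title</h1><ul><li>item with <code>code</code></li></ul><p>Some <strong>bold</strong> text.</p></div>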