Spaces:

ping98k
/

LLM-Brainstorming

Sleeping

App Files Community

ping98k committed on Jun 26, 2025

Commit

3bd9ad6

1 Parent(s): c43a3f3

Log scoring and pairwise completions

Browse files

Files changed (3) hide show

main.py +70 -27
tests/test_main.py +8 -5
tournament_utils.py +23 -7

main.py CHANGED Viewed

@@ -61,47 +61,84 @@ def run_tournament(
     process_log = []
     hist_fig = None
     top_picks_str = ""
     def log(msg):
         process_log.append(msg)
         tqdm.write(msg)
-        yield "\n".join(process_log), hist_fig, top_picks_str
     yield from log("Generating players …")
-    all_players = generate_players(
         instruction,
         n_gen,
         model=generate_model,
         api_base=api_base,
         api_key=api_token,
     )
     yield from log(f"{len(all_players)} players generated")
     def criteria_block():
         return "\n".join(f"{i + 1}) {c}" for i, c in enumerate(criteria_list))
     if enable_score_filter:
         def score(player):
-            data = _clean_json(
-                prompt_score(
-                    instruction,
-                    criteria_list,
-                    criteria_block(),
-                    player,
-                    model=score_model,
-                    api_base=api_base,
-                    api_key=api_token,
-                )
             )
             if "scores" in data and isinstance(data["scores"], list):
                 vals = data["scores"]
                 return sum(vals) / len(vals) if vals else 0.0
             return float(data.get("score", 0))
-        yield from log("Scoring players …")
         with ThreadPoolExecutor(max_workers=max_workers) as ex:
             scores = {
                 p: s
                 for p, s in zip(
                     all_players,
-                    list(tqdm(ex.map(score, all_players), total=len(all_players))),
                 )
             }
         hist_fig = plt.figure()
@@ -109,21 +146,25 @@ def run_tournament(
         yield from log("Histogram generated")
         top_players = sorted(all_players, key=scores.get, reverse=True)[:pool_size]
         yield from log(f"Filtered to {len(top_players)} players with best scores")
     else:
         top_players = all_players
     if enable_pairwise_filter:
         def play(a, b):
-            winner_label = _clean_json(
-                prompt_pairwise(
-                    instruction,
-                    criteria_block(),
-                    a,
-                    b,
-                    model=pairwise_model,
-                    api_base=api_base,
-                    api_key=api_token,
-                )
-            ).get("winner", "A")
             return a if winner_label == "A" else b
         def tournament_round(pairs, executor):
@@ -172,13 +213,14 @@ def run_tournament(
             candidates = list(set(finalists + semifinalists + get_candidates(champion, lost_to)))
             return playoff(candidates, executor)[:num_top_picks]
-        yield from log("Running tournament …")
         with ThreadPoolExecutor(max_workers=max_workers) as ex:
             top_k = get_top(top_players, ex)
     else:
         top_k = top_players[:num_top_picks]
     top_picks_str = "\n\n\n=====================================================\n\n\n".join(top_k)
-    yield "\n".join(process_log + ["Done"]), hist_fig, top_picks_str
 demo = gr.Interface(
     fn=run_tournament,
@@ -201,6 +243,7 @@ demo = gr.Interface(
         gr.Textbox(lines=10, label="Process"),
         gr.Plot(label="Score Distribution"),
         gr.Textbox(lines=50, label="Top picks"),
     ],
     description="Generate multiple completions and use score and pairwise filters to find the best answers.",
 )

     process_log = []
     hist_fig = None
     top_picks_str = ""
+    prompt_tokens = 0
+    completion_tokens = 0
+    score_outputs: list[str] = []
+    pairwise_outputs: list[str] = []
+    def add_usage(usage):
+        nonlocal prompt_tokens, completion_tokens
+        if not usage:
+            return
+        pt = getattr(usage, "prompt_tokens", None)
+        if pt is None and isinstance(usage, dict):
+            pt = usage.get("prompt_tokens")
+        ct = getattr(usage, "completion_tokens", None)
+        if ct is None and isinstance(usage, dict):
+            ct = usage.get("completion_tokens")
+        if pt:
+            prompt_tokens += pt
+        if ct:
+            completion_tokens += ct
+    def usage_str():
+        return (
+            f"Prompt tokens: {prompt_tokens}\n"
+            f"Completion tokens: {completion_tokens}\n"
+            f"Total tokens: {prompt_tokens + completion_tokens}"
+        )
+    def log_completion(prefix: str, text: str):
+        disp = text.replace("\n", " ")
+        if len(disp) > 100:
+            disp = disp[:100] + "…"
+        return log(f"{prefix}{disp}")
     def log(msg):
         process_log.append(msg)
         tqdm.write(msg)
+        yield "\n".join(process_log), hist_fig, top_picks_str, usage_str()
     yield from log("Generating players …")
+    all_players, usage = generate_players(
         instruction,
         n_gen,
         model=generate_model,
         api_base=api_base,
         api_key=api_token,
+        return_usage=True,
     )
+    add_usage(usage)
     yield from log(f"{len(all_players)} players generated")
+    for i, p in enumerate(all_players, 1):
+        yield from log_completion(f"Completion {i}: ", p)
     def criteria_block():
         return "\n".join(f"{i + 1}) {c}" for i, c in enumerate(criteria_list))
     if enable_score_filter:
         def score(player):
+            text, usage = prompt_score(
+                instruction,
+                criteria_list,
+                criteria_block(),
+                player,
+                model=score_model,
+                api_base=api_base,
+                api_key=api_token,
+                return_usage=True,
             )
+            add_usage(usage)
+            score_outputs.append(text)
+            data = _clean_json(text)
             if "scores" in data and isinstance(data["scores"], list):
                 vals = data["scores"]
                 return sum(vals) / len(vals) if vals else 0.0
             return float(data.get("score", 0))
         with ThreadPoolExecutor(max_workers=max_workers) as ex:
             scores = {
                 p: s
                 for p, s in zip(
                     all_players,
+                    tqdm(ex.map(score, all_players), total=len(all_players)),
                 )
             }
         hist_fig = plt.figure()
         yield from log("Histogram generated")
         top_players = sorted(all_players, key=scores.get, reverse=True)[:pool_size]
         yield from log(f"Filtered to {len(top_players)} players with best scores")
+        for i, txt in enumerate(score_outputs, 1):
+            yield from log_completion(f"Score completion {i}: ", txt)
     else:
         top_players = all_players
     if enable_pairwise_filter:
         def play(a, b):
+            text, usage = prompt_pairwise(
+                instruction,
+                criteria_block(),
+                a,
+                b,
+                model=pairwise_model,
+                api_base=api_base,
+                api_key=api_token,
+                return_usage=True,
+            )
+            add_usage(usage)
+            pairwise_outputs.append(text)
+            winner_label = _clean_json(text).get("winner", "A")
             return a if winner_label == "A" else b
         def tournament_round(pairs, executor):
             candidates = list(set(finalists + semifinalists + get_candidates(champion, lost_to)))
             return playoff(candidates, executor)[:num_top_picks]
         with ThreadPoolExecutor(max_workers=max_workers) as ex:
             top_k = get_top(top_players, ex)
+        for i, txt in enumerate(pairwise_outputs, 1):
+            yield from log_completion(f"Pairwise completion {i}: ", txt)
     else:
         top_k = top_players[:num_top_picks]
     top_picks_str = "\n\n\n=====================================================\n\n\n".join(top_k)
+    yield "\n".join(process_log + ["Done"]), hist_fig, top_picks_str, usage_str()
 demo = gr.Interface(
     fn=run_tournament,
         gr.Textbox(lines=10, label="Process"),
         gr.Plot(label="Score Distribution"),
         gr.Textbox(lines=50, label="Top picks"),
+        gr.Textbox(lines=5, label="Token Usage"),
     ],
     description="Generate multiple completions and use score and pairwise filters to find the best answers.",
 )

tests/test_main.py CHANGED Viewed

@@ -82,10 +82,10 @@ def test_run_tournament_full_loop():
          patch('main.tqdm', new=dummy_tqdm), \
          patch('main.plt.figure', return_value='fig'), \
          patch('main.plt.hist'):
-        mock_gen.return_value = ['p1', 'p2', 'p3', 'p4']
         scores = {'p1':3, 'p2':2, 'p3':1, 'p4':0}
-        mock_score.side_effect = lambda instr, cl, block, player, **kw: json.dumps({'score': scores[player]})
-        mock_pair.side_effect = lambda instr, block, a, b, **kw: json.dumps({'winner': 'A'})
         results = list(main.run_tournament(
             api_base='b',
@@ -103,10 +103,13 @@ def test_run_tournament_full_loop():
             enable_pairwise_filter=True,
         ))
-    process_log, hist_fig, top_picks = results[-1]
     assert 'Done' in process_log
     assert hist_fig == 'fig'
     assert top_picks.strip() in {'p1', 'p2'}
-    mock_gen.assert_called_once_with('instr', 4, model='gm', api_base='b', api_key='k')
     assert mock_score.call_count == 4
     assert mock_pair.called

          patch('main.tqdm', new=dummy_tqdm), \
          patch('main.plt.figure', return_value='fig'), \
          patch('main.plt.hist'):
+        mock_gen.return_value = (['p1', 'p2', 'p3', 'p4'], {'prompt_tokens':1,'completion_tokens':1})
         scores = {'p1':3, 'p2':2, 'p3':1, 'p4':0}
+        mock_score.side_effect = lambda instr, cl, block, player, **kw: (json.dumps({'score': scores[player]}), {'prompt_tokens':1,'completion_tokens':1})
+        mock_pair.side_effect = lambda instr, block, a, b, **kw: (json.dumps({'winner': 'A'}), {'prompt_tokens':1,'completion_tokens':1})
         results = list(main.run_tournament(
             api_base='b',
             enable_pairwise_filter=True,
         ))
+    process_log, hist_fig, top_picks, usage = results[-1]
     assert 'Done' in process_log
     assert hist_fig == 'fig'
     assert top_picks.strip() in {'p1', 'p2'}
+    mock_gen.assert_called_once_with('instr', 4, model='gm', api_base='b', api_key='k', return_usage=True)
+    assert 'Score completion' in process_log
+    assert 'Pairwise completion' in process_log
+    assert 'Prompt tokens' in usage
     assert mock_score.call_count == 4
     assert mock_pair.called

tournament_utils.py CHANGED Viewed

@@ -18,15 +18,23 @@ def generate_players(
     *,
     api_base: str | None = None,
     api_key: str | None = None,
-):
-    """Request `n` completions for the instruction using the given model."""
     response = completion(
         model=model,
         messages=[{"role": "user", "content": instruction}],
         n=n,
         **_completion_kwargs(api_base, api_key),
     )
-    return [c.message.content.strip() for c in response.choices]
 def prompt_score(
@@ -38,7 +46,8 @@ def prompt_score(
     *,
     api_base: str | None = None,
     api_key: str | None = None,
-) -> str:
     """Return a JSON score string evaluating `player` on the criteria."""
     example_scores = ", ".join(["1-10"] * len(criteria_list)) or "1-10"
     prompt = f"""Evaluate the output below on the following criteria:
@@ -56,7 +65,10 @@ Output:
         messages=[{"role": "system", "content": prompt}],
         **_completion_kwargs(api_base, api_key),
     )
-    return response.choices[0].message.content.strip()
 def prompt_pairwise(
@@ -68,7 +80,8 @@ def prompt_pairwise(
     *,
     api_base: str | None = None,
     api_key: str | None = None,
-) -> str:
     """Return which player wins in JSON using the given criteria."""
     prompt = f"""Compare the two players below using:
 {criteria_block}
@@ -86,4 +99,7 @@ Players:
         messages=[{"role": "system", "content": prompt}],
         **_completion_kwargs(api_base, api_key),
     )
-    return response.choices[0].message.content.strip()

     *,
     api_base: str | None = None,
     api_key: str | None = None,
+    return_usage: bool = False,
+) -> list[str] | tuple[list[str], object]:
+    """Request ``n`` completions for the instruction using the given model.
+    When ``return_usage`` is ``True`` the ``usage`` object from the completion
+    response is also returned.
+    """
     response = completion(
         model=model,
         messages=[{"role": "user", "content": instruction}],
         n=n,
         **_completion_kwargs(api_base, api_key),
     )
+    players = [c.message.content.strip() for c in response.choices]
+    if return_usage:
+        return players, getattr(response, "usage", None)
+    return players
 def prompt_score(
     *,
     api_base: str | None = None,
     api_key: str | None = None,
+    return_usage: bool = False,
+) -> str | tuple[str, object]:
     """Return a JSON score string evaluating `player` on the criteria."""
     example_scores = ", ".join(["1-10"] * len(criteria_list)) or "1-10"
     prompt = f"""Evaluate the output below on the following criteria:
         messages=[{"role": "system", "content": prompt}],
         **_completion_kwargs(api_base, api_key),
     )
+    text = response.choices[0].message.content.strip()
+    if return_usage:
+        return text, getattr(response, "usage", None)
+    return text
 def prompt_pairwise(
     *,
     api_base: str | None = None,
     api_key: str | None = None,
+    return_usage: bool = False,
+) -> str | tuple[str, object]:
     """Return which player wins in JSON using the given criteria."""
     prompt = f"""Compare the two players below using:
 {criteria_block}
         messages=[{"role": "system", "content": prompt}],
         **_completion_kwargs(api_base, api_key),
     )
+    text = response.choices[0].message.content.strip()
+    if return_usage:
+        return text, getattr(response, "usage", None)
+    return text