Spaces:

ping98k
/

LLM-Brainstorming

Sleeping

App Files Files Community

ping98k committed on Jun 28, 2025

Commit

e6b4ba0

1 Parent(s): d95d0b3

Add Elo bar plot

Browse files

Files changed (2) hide show

main.py +9 -3
tests/test_main.py +9 -4

main.py CHANGED Viewed

@@ -122,6 +122,7 @@ def run_tournament(
     process_log = []
     hist_fig = None
     top_picks_str = ""
     prompt_tokens = 0
     completion_tokens = 0
@@ -161,7 +162,7 @@ def run_tournament(
     def log(msg):
         process_log.append(msg)
         tqdm.write(msg)
-        yield "\n".join(process_log), hist_fig, top_picks_str, usage_str()
     yield from log("Generating answers …")
     all_players, usage = generate_players(
         instruction,
@@ -279,7 +280,11 @@ def run_tournament(
         yield from log("Pairwise generating")
         with ThreadPoolExecutor(max_workers=max_workers) as ex:
             rating = yield from rate(top_players, ex)
-        top_k = sorted(top_players, key=rating.get, reverse=True)[:num_top_picks]
         for i, txt in enumerate(pairwise_outputs, 1):
             yield from log_completion(f"Pairwise completion {i}: ", txt)
         top_picks_str = "\n\n\n=====================================================\n\n\n".join(
@@ -288,7 +293,7 @@ def run_tournament(
     else:
         top_k = top_players[:num_top_picks]
         top_picks_str = "\n\n\n=====================================================\n\n\n".join(top_k)
-    yield "\n".join(process_log + ["Done"]), hist_fig, top_picks_str, usage_str()
 demo = gr.Interface(
     fn=run_tournament,
@@ -320,6 +325,7 @@ demo = gr.Interface(
     outputs=[
         gr.Textbox(lines=10, label="Process"),
         gr.Plot(label="Score Distribution"),
         gr.Textbox(lines=50, label="Top picks"),
         gr.Textbox(lines=5, label="Token Usage"),
     ],

     process_log = []
     hist_fig = None
+    elo_fig = None
     top_picks_str = ""
     prompt_tokens = 0
     completion_tokens = 0
     def log(msg):
         process_log.append(msg)
         tqdm.write(msg)
+        yield "\n".join(process_log), hist_fig, elo_fig, top_picks_str, usage_str()
     yield from log("Generating answers …")
     all_players, usage = generate_players(
         instruction,
         yield from log("Pairwise generating")
         with ThreadPoolExecutor(max_workers=max_workers) as ex:
             rating = yield from rate(top_players, ex)
+        elo_fig = plt.figure()
+        players_sorted = sorted(rating, key=rating.get, reverse=True)
+        plt.bar(range(len(players_sorted)), [rating[p] for p in players_sorted])
+        plt.xticks(range(len(players_sorted)), [str(i + 1) for i in range(len(players_sorted))])
+        top_k = players_sorted[:num_top_picks]
         for i, txt in enumerate(pairwise_outputs, 1):
             yield from log_completion(f"Pairwise completion {i}: ", txt)
         top_picks_str = "\n\n\n=====================================================\n\n\n".join(
     else:
         top_k = top_players[:num_top_picks]
         top_picks_str = "\n\n\n=====================================================\n\n\n".join(top_k)
+    yield "\n".join(process_log + ["Done"]), hist_fig, elo_fig, top_picks_str, usage_str()
 demo = gr.Interface(
     fn=run_tournament,
     outputs=[
         gr.Textbox(lines=10, label="Process"),
         gr.Plot(label="Score Distribution"),
+        gr.Plot(label="Elo Ratings"),
         gr.Textbox(lines=50, label="Top picks"),
         gr.Textbox(lines=5, label="Token Usage"),
     ],

tests/test_main.py CHANGED Viewed

@@ -39,6 +39,8 @@ sys.modules.setdefault('tqdm', fake_tqdm_mod)
 fake_plt = types.ModuleType('matplotlib.pyplot')
 fake_plt.figure = MagicMock(return_value='fig')
 fake_plt.hist = MagicMock()
 fake_matplotlib = types.ModuleType('matplotlib')
 fake_matplotlib.pyplot = fake_plt
 sys.modules.setdefault('matplotlib', fake_matplotlib)
@@ -81,7 +83,8 @@ def test_run_tournament_full_loop():
          patch('main.as_completed', new=lambda futs: futs), \
          patch('main.tqdm', new=dummy_tqdm), \
          patch('main.plt.figure', return_value='fig'), \
-         patch('main.plt.hist'):
         mock_gen.return_value = (['p1', 'p2', 'p3', 'p4'], {'prompt_tokens':1,'completion_tokens':1})
         scores = {'p1':3, 'p2':2, 'p3':1, 'p4':0}
         mock_score.side_effect = lambda instr, cl, block, player, **kw: (json.dumps({'score': scores[player]}), {'prompt_tokens':1,'completion_tokens':1})
@@ -111,9 +114,10 @@ def test_run_tournament_full_loop():
             pairwise_thinking=True,
         ))
-    process_log, hist_fig, top_picks, usage = results[-1]
     assert 'Done' in process_log
     assert hist_fig == 'fig'
     assert any(p in top_picks for p in {'p1', 'p2'})
     mock_gen.assert_called_once_with('instr', 4, model='gm', api_base='b', api_key='k', temperature=1, thinking=True, return_usage=True)
     assert 'Score completion' in process_log
@@ -131,7 +135,8 @@ def test_run_tournament_pairwise_odd_players():
          patch('main.as_completed', new=lambda futs: futs), \
          patch('main.tqdm', new=dummy_tqdm), \
          patch('main.plt.figure', return_value='fig'), \
-         patch('main.plt.hist'):
         mock_gen.return_value = (['p1', 'p2', 'p3'], {'prompt_tokens':1,'completion_tokens':1})
         mock_pair.side_effect = lambda instr, block, a, b, **kw: (json.dumps({'winner':'A'}), {'prompt_tokens':1,'completion_tokens':1})
@@ -159,7 +164,7 @@ def test_run_tournament_pairwise_odd_players():
             pairwise_thinking=True,
         ))
-    process_log, fig, top_picks, usage = results[-1]
     assert 'Done' in process_log
     assert any(p in top_picks for p in {'p1', 'p2', 'p3'})
     assert mock_pair.call_count == 3

 fake_plt = types.ModuleType('matplotlib.pyplot')
 fake_plt.figure = MagicMock(return_value='fig')
 fake_plt.hist = MagicMock()
+fake_plt.bar = MagicMock()
+fake_plt.xticks = MagicMock()
 fake_matplotlib = types.ModuleType('matplotlib')
 fake_matplotlib.pyplot = fake_plt
 sys.modules.setdefault('matplotlib', fake_matplotlib)
          patch('main.as_completed', new=lambda futs: futs), \
          patch('main.tqdm', new=dummy_tqdm), \
          patch('main.plt.figure', return_value='fig'), \
+         patch('main.plt.hist'), \
+         patch('main.plt.bar'):
         mock_gen.return_value = (['p1', 'p2', 'p3', 'p4'], {'prompt_tokens':1,'completion_tokens':1})
         scores = {'p1':3, 'p2':2, 'p3':1, 'p4':0}
         mock_score.side_effect = lambda instr, cl, block, player, **kw: (json.dumps({'score': scores[player]}), {'prompt_tokens':1,'completion_tokens':1})
             pairwise_thinking=True,
         ))
+    process_log, hist_fig, elo_fig, top_picks, usage = results[-1]
     assert 'Done' in process_log
     assert hist_fig == 'fig'
+    assert elo_fig == 'fig'
     assert any(p in top_picks for p in {'p1', 'p2'})
     mock_gen.assert_called_once_with('instr', 4, model='gm', api_base='b', api_key='k', temperature=1, thinking=True, return_usage=True)
     assert 'Score completion' in process_log
          patch('main.as_completed', new=lambda futs: futs), \
          patch('main.tqdm', new=dummy_tqdm), \
          patch('main.plt.figure', return_value='fig'), \
+         patch('main.plt.hist'), \
+         patch('main.plt.bar'):
         mock_gen.return_value = (['p1', 'p2', 'p3'], {'prompt_tokens':1,'completion_tokens':1})
         mock_pair.side_effect = lambda instr, block, a, b, **kw: (json.dumps({'winner':'A'}), {'prompt_tokens':1,'completion_tokens':1})
             pairwise_thinking=True,
         ))
+    process_log, hist_fig, elo_fig, top_picks, usage = results[-1]
     assert 'Done' in process_log
     assert any(p in top_picks for p in {'p1', 'p2', 'p3'})
     assert mock_pair.call_count == 3