| import json |
| import re |
| from datetime import datetime |
| import hashlib |
| import gradio as gr |
|
|
| from gen_api_answer import ( |
| atla_parse_model_response, |
| get_atla_response |
| ) |
|
|
| from prompts import ( |
| ATLA_PROMPT, |
| ATLA_PROMPT_WITH_REFERENCE |
| ) |
|
|
| from random_sample_generation import ( |
| get_random_human_ai_pair, |
| get_random_human_ai_ground_truth_pair, |
| generate_ai_response |
| ) |
|
|
| from utils import Vote |
|
|
| from prompts import ( |
| DEFAULT_EVAL_PROMPT, |
| DEFAULT_EVAL_PROMPT_EDITABLE, |
| FIXED_EVAL_SUFFIX, |
| DEFAULT_EVAL_CRITERIA |
| ) |
|
|
| from common import CSS_STYLES, MAIN_TITLE, HOW_IT_WORKS |
|
|
|
|
| |
def load_model_data(path="data/models.jsonl"):
    """Load judge-model metadata from a JSONL file.

    Each non-blank line must be a JSON object carrying at least the keys
    ``name``, ``organization``, ``license`` and ``api_model``.

    Args:
        path: Path to the JSONL file. Defaults to the bundled model list,
            preserving the original hard-coded behavior.

    Returns:
        dict: Mapping of model name -> {"organization", "license",
        "api_model"}. Empty dict when the file is missing.
    """
    model_data = {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines between records
                model = json.loads(line)
                model_data[model["name"]] = {
                    "organization": model["organization"],
                    "license": model["license"],
                    "api_model": model["api_model"],
                }
    except FileNotFoundError:
        # Best-effort: a missing model list degrades to an empty mapping.
        print(f"Warning: {path} not found")
        return {}
    return model_data
|
|
|
|
| model_data = load_model_data() |
|
|
def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
    """Build a Vote record for an A/B comparison and persist it.

    ``prompt`` may be either a plain string or a Gradio component, in
    which case its ``.value`` is used.
    """
    # Unwrap a Gradio component if one was passed instead of a raw string.
    prompt_value = prompt.value if hasattr(prompt, 'value') else prompt

    vote = Vote(
        timestamp=datetime.now().isoformat(),
        prompt=prompt_value,
        response_a=response_a,
        response_b=response_b,
        model_a=model_a,
        model_b=model_b,
        winner=winner,
        judge_id=judge_id,
    )
    # NOTE(review): `add_vote` and `db` are not defined or imported anywhere
    # in this file — calling this function as-is raises NameError. Confirm
    # which module they should be imported from.
    add_vote(vote, db)
|
|
|
|
def parse_variables(prompt):
    """Extract the unique ``{{variable}}`` placeholder names from a prompt.

    Args:
        prompt: Template string containing zero or more ``{{name}}`` markers.

    Returns:
        list[str]: Stripped placeholder names in first-appearance order,
        with duplicates removed.
    """
    # dict.fromkeys dedupes while preserving insertion order (dicts are
    # ordered since Python 3.7) — clearer than the seen-set/short-circuit
    # trick and strips each name only once.
    names = (name.strip() for name in re.findall(r"{{(.*?)}}", prompt))
    return list(dict.fromkeys(names))
|
|
|
|
def get_final_prompt(eval_prompt, variable_values):
    """Return *eval_prompt* with every ``{{var}}`` placeholder substituted.

    Args:
        eval_prompt: Template containing ``{{name}}`` markers.
        variable_values: Mapping of placeholder name -> replacement string.

    Returns:
        str: The template with all known placeholders replaced.
    """
    filled = eval_prompt
    for name, value in variable_values.items():
        # f-string: "{{{{" renders a literal "{{", "}}}}" a literal "}}".
        placeholder = f"{{{{{name}}}}}"
        filled = filled.replace(placeholder, value)
    return filled
|
|
|
|
|
|
def get_ip(request: gr.Request) -> str:
    """Return a truncated SHA-256 hash of the client IP address.

    Hashing gives a stable, privacy-preserving voter identifier. Header
    preference: Cloudflare's ``cf-connecting-ip``, then the first hop of
    ``x-forwarded-for``, then the direct socket address.

    Args:
        request: The incoming Gradio request.

    Returns:
        str: First 16 hex chars (64 bits) of the SHA-256 of the IP —
        enough to distinguish voters without storing the raw address.
    """
    if "cf-connecting-ip" in request.headers:
        ip = request.headers["cf-connecting-ip"]
    elif "x-forwarded-for" in request.headers:
        # X-Forwarded-For may hold a chain "client, proxy1, proxy2"; the
        # first entry is the original client. Entries are comma-space
        # separated, so strip the surrounding whitespace the original
        # code left behind.
        ip = request.headers["x-forwarded-for"].split(",")[0].strip()
    else:
        ip = request.client.host

    return hashlib.sha256(ip.encode()).hexdigest()[:16]
|
|
|
|
def get_vote_message(choice: str, model_a: str, model_b: str) -> tuple[str, str]:
    """Generate appropriate message based on vote and model rankings.
    Returns (title, message) tuple."""
    # NOTE(review): `get_current_votes`, `get_leaderboard` and
    # `get_model_rankings` are not imported in this file — calling this
    # function as-is raises NameError. Confirm the intended source module.
    voting_data = get_current_votes()
    leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
    rankings = get_model_rankings(leaderboard)
    # Missing models default to position 0; presumably a smaller position
    # means a better leaderboard rank — TODO confirm against
    # get_model_rankings.
    pos_a = rankings.get(model_a, 0)
    pos_b = rankings.get(model_b, 0)

    if choice == "Tie":
        return "It's a tie!", "Keep voting responsibly 🤗"

    # The better-ranked model winning is the expected ("favourite") outcome.
    if (choice == "A" and pos_a < pos_b) or (choice == "B" and pos_b < pos_a):
        return "The favourite wins!", "Keep voting responsibly 🤗"
    else:
        return "The underdog wins!", "Keep voting responsibly 🤗"
|
|
|
|
def populate_random_example(request: gr.Request, compatible_mode: bool):
    """Generate a random human-AI conversation example and reset judge outputs.

    When *compatible_mode* is on the sample includes a ground-truth answer;
    otherwise the ground-truth field is cleared and hidden.
    """
    if compatible_mode:
        human_msg, ai_msg, gt_msg = get_random_human_ai_ground_truth_pair()
    else:
        (human_msg, ai_msg), gt_msg = get_random_human_ai_pair(), ""

    # Order matches the component list wired up by the caller:
    # human input, AI response, dice button, score, critique, ground truth.
    updates = [
        gr.update(value=human_msg),
        gr.update(value=ai_msg),
        gr.update(value="🎲", variant="secondary"),
        gr.update(value=""),
        gr.update(value=""),
        gr.update(value=gt_msg, visible=compatible_mode),
    ]
    return updates
|
|
|
|
# Application UI: a single "Playground" tab where a human/AI exchange is
# entered (or randomly sampled) and evaluated by the Selene-Mini judge.
# NOTE(review): the source paste lost its indentation; the nesting below is
# reconstructed from standard Gradio layout conventions — confirm against
# the original file.
with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
    gr.Markdown(MAIN_TITLE)
    gr.Markdown(HOW_IT_WORKS)

    # Hidden textbox holding the canonical (non-editable) default prompt.
    eval_prompt = gr.Textbox(
        value=DEFAULT_EVAL_PROMPT,
        visible=False
    )

    with gr.Tabs():
        with gr.TabItem("Playground"):
            with gr.Row():
                # Left column: conversation inputs.
                with gr.Column(scale=1):
                    with gr.Group():
                        human_input = gr.TextArea(
                            label="👩 User Input",
                            lines=5,
                            placeholder="Enter the human message here..."
                        )
                        with gr.Row():
                            # Disabled until the user message is non-empty
                            # (see the human_input.change handler below).
                            generate_btn = gr.Button(
                                "Generate AI Response",
                                size="sm",
                                interactive=False
                            )

                        ai_response = gr.TextArea(
                            label="🤖 AI Response",
                            lines=10,
                            placeholder="Enter the AI response here..."
                        )

                        # Shown only when "use a reference response" is on.
                        ground_truth = gr.TextArea(
                            label="🎯 Ground truth response",
                            lines=10,
                            placeholder="Enter the ground truth response here...",
                            visible=False
                        )

                    with gr.Row():
                        random_btn = gr.Button("🎲", scale=2)
                        send_btn = gr.Button(
                            value="Run evaluation",
                            variant="primary",
                            size="lg",
                            scale=8
                        )

                # Right column: judge output.
                with gr.Column(scale=1):
                    gr.Markdown("### 👩⚖️ Selene-Mini Evaluation")
                    with gr.Group():
                        with gr.Row():
                            score = gr.Textbox(label="Score", lines=1, interactive=False)
                        critique = gr.TextArea(label="Critique", lines=12, interactive=False)
                    gr.Markdown("<br>")

            # Collapsible editor for the judge prompt / evaluation criteria.
            with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
                gr.Markdown("<br>")
                use_reference_toggle = gr.Checkbox(
                    label="Use a reference response",
                    value=False
                )

                # Legacy full-prompt editor; kept hidden but still wired up.
                with gr.Column(visible=False) as default_prompt_editor:
                    eval_prompt_editable = gr.TextArea(
                        value=DEFAULT_EVAL_PROMPT_EDITABLE,
                        label="Evaluation Criteria",
                        lines=12
                    )

                    # Save/Cancel only appear once the text is edited.
                    with gr.Row(visible=False) as edit_buttons_row:
                        cancel_prompt_btn = gr.Button("Cancel")
                        save_prompt_btn = gr.Button("Save", variant="primary")
                    gr.Markdown("*The sample being evaluated is always appended as:*")
                    # NOTE(review): the markdown code fence is never closed —
                    # confirm a trailing "```" isn't missing here.
                    gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")

                # Active criteria editor shown to the user.
                with gr.Column(visible=True) as compatible_prompt_editor:
                    eval_criteria_text = gr.TextArea(
                        label="Evaluation Criteria",
                        lines=12,
                        value=DEFAULT_EVAL_CRITERIA,
                        placeholder="Enter the complete evaluation criteria and scoring rubric..."
                    )

    # Per-session state.
    model_a_state = gr.State()
    model_b_state = gr.State()
    final_prompt_state = gr.State()
    eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE)
    is_editing = gr.State(False)
    compatible_mode_state = gr.State(False)

    def update_model_names(model_a, model_b):
        # NOTE(review): not referenced by any event handler in this file —
        # likely dead code left from an earlier A/B-voting UI.
        return gr.update(value=f"*Model: {model_a}*"), gr.update(
            value=f"*Model: {model_b}*"
        )

    # NOTE(review): never written to in this file — confirm still needed.
    last_submission = gr.State({})

    def save_prompt(new_prompt, previous_prompt):
        """Persist the edited prompt and hide the Save/Cancel row."""
        return [
            gr.update(value=new_prompt),
            new_prompt,
            gr.update(visible=False)
        ]

    def cancel_prompt(previous_prompt):
        """Revert the editor to the previously saved prompt and hide buttons."""
        return [
            gr.update(value=previous_prompt),
            previous_prompt,
            gr.update(visible=False)
        ]

    def show_edit_buttons(current_value, previous_value):
        # Show Save/Cancel only when the text differs from the saved version.
        return gr.update(visible=current_value != previous_value)

    # Wiring for the (hidden) legacy prompt editor.
    save_prompt_btn.click(
        fn=save_prompt,
        inputs=[eval_prompt_editable, eval_prompt_previous],
        outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
    )

    cancel_prompt_btn.click(
        fn=cancel_prompt,
        inputs=[eval_prompt_previous],
        outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
    )

    eval_prompt_editable.change(
        fn=show_edit_buttons,
        inputs=[eval_prompt_editable, eval_prompt_previous],
        outputs=edit_buttons_row
    )

    def toggle_use_reference(checked):
        """Show/hide the ground-truth box; on enable, load a fresh sample.

        Returns a dict of component -> update so only the relevant
        components are touched in each branch.
        """
        if checked:
            human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
            return {
                ground_truth: gr.update(visible=True, value=ground_truth_msg),
                human_input: gr.update(value=human_msg),
                ai_response: gr.update(value=ai_msg),
                score: gr.update(value=""),
                critique: gr.update(value=""),
                random_btn: gr.update(value="🎲", variant="secondary"),
            }
        else:
            # Leaving reference mode only hides the box; the other fields
            # keep their current values.
            return {
                ground_truth: gr.update(visible=False)
            }

    use_reference_toggle.change(
        fn=toggle_use_reference,
        inputs=[use_reference_toggle],
        outputs=[
            ground_truth,
            human_input,
            ai_response,
            score,
            critique,
            random_btn,
        ]
    )

    # NOTE(review): never read or updated anywhere — confirm still needed.
    first_game_state = gr.State(True)

    def submit_and_store(
        use_reference,
        eval_criteria_text,
        human_input,
        ai_response,
        ground_truth_input,
    ):
        """Run the Selene-Mini judge on the current sample.

        Parameter names intentionally shadow the module-level components;
        inside this function they are the submitted string values.
        Returns [score, critique, send_btn update, random_btn update].
        """
        prompt_data = {
            'human_input': human_input,
            'ai_response': ai_response,
            # Only pass a ground truth when reference mode is active.
            'ground_truth_input': ground_truth_input if use_reference else '',
            'eval_criteria': eval_criteria_text,
        }

        # Pick the template that matches whether a reference is supplied.
        base_prompt = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT

        final_prompt = base_prompt.format(
            human_input=prompt_data['human_input'],
            ai_response=prompt_data['ai_response'],
            ground_truth_input=prompt_data['ground_truth_input'],
            eval_criteria=prompt_data['eval_criteria']
        )

        # Near-zero temperature keeps the judge's output deterministic.
        response = get_atla_response(
            model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B",
            prompt=final_prompt,
            max_tokens=500,
            temperature=0.01
        )

        score, critique = atla_parse_model_response(response)

        return [
            score,
            critique,
            gr.update(value="Regenerate evaluation", variant="secondary", interactive=True),
            gr.update(value="🎲"),
        ]

    def create_submit_handler():
        # NOTE(review): unused — send_btn is wired directly to
        # submit_and_store below; likely dead code.
        first_game = True

        def handler(*args):
            nonlocal first_game
            result = submit_and_store(*args)
            first_game = False
            return result

        return handler

    send_btn.click(
        fn=submit_and_store,
        inputs=[
            use_reference_toggle,
            eval_criteria_text,
            human_input,
            ai_response,
            ground_truth,
        ],
        outputs=[
            score,
            critique,
            send_btn,
            random_btn,
        ],
    )

    # Gradio injects the gr.Request for populate_random_example's first arg.
    random_btn.click(
        fn=populate_random_example,
        inputs=[use_reference_toggle],
        outputs=[
            human_input,
            ai_response,
            random_btn,
            score,
            critique,
            ground_truth,
        ]
    )

    def handle_input_change():
        """Reset UI state when inputs are changed"""
        return [
            gr.update(value="Run evaluation", variant="primary"),
            gr.update(value="🎲", variant="secondary"),
        ]

    # Editing either field resets the action buttons to their default look.
    human_input.change(
        fn=handle_input_change,
        inputs=[],
        outputs=[send_btn, random_btn]
    )

    ai_response.change(
        fn=handle_input_change,
        inputs=[],
        outputs=[send_btn, random_btn]
    )

    # generate_ai_response presumably returns a tuple whose first element is
    # the response text — TODO confirm; the button is disabled again after use.
    generate_btn.click(
        fn=lambda msg: (
            generate_ai_response(msg)[0],
            gr.update(
                value="Generate AI Response",
                interactive=False
            )
        ),
        inputs=[human_input],
        outputs=[ai_response, generate_btn]
    )

    # Enable "Generate" only while the user message is non-empty.
    # (Second .change handler on human_input; both run on each edit.)
    human_input.change(
        fn=lambda x: gr.update(interactive=bool(x.strip())),
        inputs=[human_input],
        outputs=[generate_btn]
    )

    # Seed the page with a random non-reference example on load.
    demo.load(
        fn=lambda: populate_random_example(None, False),
        inputs=[],
        outputs=[
            human_input,
            ai_response,
            random_btn,
            score,
            critique,
            ground_truth,
        ]
    )
|
|
| if __name__ == "__main__": |
| demo.launch() |