| import gradio as gr |
| from gradio_modal import Modal |
| from huggingface_hub import hf_hub_download, list_repo_files |
| import os |
| import csv |
| import datetime |
| import sys |
| import json |
| from utils import format_chat, append_to_sheet, read_sheet_to_df |
| import random |
| import base64 |
| import io |
| from PIL import Image |
| import re |
|
|
| |
# Hugging Face dataset repo holding the evaluation questions and model outputs.
REPO_ID = "agenticx/TxAgentEvalData"
# Sub-directory inside the repo with the crowdsourcing question files (one JSON per model).
CROWDSOURCING_DATA_DIRECTORY = "crowdsourcing_questions_0516"
# Sheet name used by utils.read_sheet_to_df / utils.append_to_sheet for results.
TXAGENT_RESULTS_SHEET_BASE_NAME = "TxAgent_Human_Eval_Results_CROWDSOURCED_0516"
# Repo-level JSON maps: disease/drug -> {"specialties": [...], "subspecialties": [...]}.
DISEASE_SPECIALTY_MAP_FILENAME = "disease_specialty_map.json"
DRUG_SPECIALTY_MAP_FILENAME = "drug_specialty_map.json"


# Sampling weights per dataset when drawing the next question to evaluate.
DATASET_WEIGHTS = {
    "drugPC": 0.2,
    "treatment_clear": 0.8
}


# Result-file keys (filenames without .json) that count as "our" methods;
# each is paired against every baseline model for comparison.
our_methods = ['TxAgent-T1-Llama-3.1-8B', 'Q3-8B-qlora-biov13_merged']
|
|
| |
# Directory of JSON tool lists; each file holds a list of tool definitions
# (dicts with at least a "name" key).
tools_dir = os.path.join(os.getcwd(), 'tool_lists')

# Maps <json filename without extension> -> list of tool names.
results = {}

for filename in os.listdir(tools_dir):
    if filename.endswith('.json'):
        filepath = os.path.join(tools_dir, filename)
        key = os.path.splitext(filename)[0]
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Keep only well-formed entries that declare a tool name.
            names = [item['name'] for item in data if isinstance(
                item, dict) and 'name' in item]
            results[key] = names
        except Exception as e:
            # Fix: the messages previously contained the literal "(unknown)"
            # (f-strings with no placeholders), so failures never identified
            # which file was at fault.
            print(f"Error processing {filename}: {e}")
            results[key] = [f"Error loading {filename}"]
|
|
| |
# Human-readable markdown label for each tool-list key; shown to evaluators
# to identify which database a tool call came from.
tool_database_labels_raw = {
    "chembl_tools": "**from the ChEMBL database**",
    "efo_tools": "**from the Experimental Factor Ontology**",
    "europe_pmc_tools": "**from the Europe PMC database**",
    "fda_drug_adverse_event_tools": "**from the FDA Adverse Event Reporting System**",
    "fda_drug_labeling_tools": "**from approved FDA drug labels**",
    "monarch_tools": "**from the Monarch Initiative databases**",
    "opentarget_tools": "**from the Open Targets database**",
    "pubtator_tools": "**from PubTator-accessible PubMed and PMC biomedical literature**",
    "semantic_scholar_tools": "**from Semantic-Scholar-accessible literature**"
}

# Re-key the loaded tool-name lists by their display label, skipping any
# loaded list that has no label entry. Iterates `results` to preserve the
# original insertion order.
tool_database_labels = {}
for tools_key in results:
    if tools_key in tool_database_labels_raw:
        tool_database_labels[tool_database_labels_raw[tools_key]] = results[tools_key]
|
|
|
|
def encode_image_to_base64(image_path):
    """Return the base64 (UTF-8) encoding of the file at *image_path*.

    Prints an error and returns None when the file does not exist.
    """
    try:
        with open(image_path, "rb") as image_file:
            raw = image_file.read()
    except FileNotFoundError:
        print(f"Error: Image file not found at {image_path}")
        return None
    return base64.b64encode(raw).decode("utf-8")
|
|
|
|
| |
# Load the project page HTML and inline all referenced PNG images as base64
# data URIs so the page renders self-contained inside the Gradio app.
html_file_path = "index.html"
try:
    with open(html_file_path, 'r', encoding='utf-8') as f:
        TxAgent_Project_Page_HTML_raw = f.read()
    TxAgent_Project_Page_HTML = TxAgent_Project_Page_HTML_raw

    # Collect every distinct PNG path referenced as static/images/<name>.png.
    image_path_pattern = r'static/images/([^"]*\.png)'
    image_paths = re.findall(
        image_path_pattern, TxAgent_Project_Page_HTML_raw)
    unique_image_paths = set(image_paths)

    for img_file in unique_image_paths:
        full_image_path = os.path.join("static/images", img_file)
        encoded_image = encode_image_to_base64(full_image_path)
        if encoded_image:
            original_path = f"static/images/{img_file}"
            # Fix: these are .png files, but the data URI previously declared
            # image/jpeg; use the correct MIME type.
            base64_url = f'data:image/png;base64,{encoded_image}'
            TxAgent_Project_Page_HTML = TxAgent_Project_Page_HTML.replace(
                original_path, base64_url)

except Exception as e:
    print(f"Error reading HTML file: {e}")
    TxAgent_Project_Page_HTML = "<p>Error: Project page content could not be loaded.</p>"
|
|
| |
# Per-model Likert rating criteria. Each entry carries a short label (used to
# build sheet column names), the question shown to the evaluator, and the
# selectable score options (1-5 plus "Unable to Judge.").
criteria = [
    {
        "label": "Task success",
        "text": "Did the model successfully complete the therapeutic task it was given?",
        "scores": [
            "1 Did not address the task. ",
            "2 Attempted the task but produced an incorrect or incomplete response. ",
            "3 Addressed the task but with notable limitations. ",
            "4 Mostly correct, with only minor issues. ",
            "5 Fully and correctly completed the task.",
            "Unable to Judge."
        ]
    },
    {
        "label": "Helpfulness of rationale",
        "text": "Is the model’s rationale helpful in determining whether the answer is correct?",
        "scores": [
            "1 No usable rationale. ",
            "2 Vague or generic explanation; limited value. ",
            "3 Explanation provided, but with clear gaps. ",
            "4 Clear and mostly complete explanation. ",
            "5 Thorough and transparent explanation that supports evaluation.",
            "Unable to Judge."
        ]
    },
    {
        "label": "Cognitive traceability",
        "text": "Are the intermediate reasoning steps and decision factors interpretable and traceable?",
        "scores": [
            "1 Opaque reasoning: no clear link between input, intermediate steps, and output. ",
            "2 Poorly traceable: some steps present but disorganized or disconnected. ",
            "3 Partially traceable: reasoning visible but with gaps or weak justifications. ",
            "4 Mostly traceable: coherent progression with minor ambiguities. ",
            "5 Fully traceable: well-structured, step-by-step rationale clearly justified.",
            "Unable to Judge."
        ]
    },
    {
        "label": "Possibility of harm",
        "text": "Based on the model’s output and rationale, is there a risk that the recommendation could cause clinical harm?",
        "scores": [
            "1 High likelihood of serious harm. ",
            "2 Clear risk of harm. ",
            "3 Some risks in specific scenarios. ",
            "4 Low likelihood of harm. ",
            "5 No identifiable risk of harm.",
            "Unable to Judge."
        ]
    },
    {
        "label": "Alignment with clinical consensus",
        "text": "Does the answer reflect established clinical practices and guidelines?",
        "scores": [
            "1 Contradicts established clinical consensus. ",
            "2 Misaligned with key aspects of consensus care. ",
            "3 Generally aligned but lacks clarity or rigor. ",
            "4 Largely consistent with clinical standards, with minor issues. ",
            "5 Fully consistent with current clinical consensus.",
            "Unable to Judge."
        ]
    },
    {
        "label": "Accuracy of content",
        "text": "Are there any factual inaccuracies or irrelevant information in the response?",
        "scores": [
            "1 Entirely inaccurate or off-topic. ",
            "2 Mostly inaccurate; few correct elements. ",
            "3 Partially accurate; some errors or omissions. ",
            "4 Largely accurate with minor issues. ",
            "5 Completely accurate and relevant.",
            "Unable to Judge."
        ]
    },
    {
        "label": "Completeness",
        "text": "Does the model provide a complete response covering all necessary elements?",
        "scores": [
            "1 Major omissions; response is inadequate. ",
            "2 Missing key content. ",
            "3 Covers the basics but lacks depth. ",
            "4 Mostly complete; minor omissions. ",
            "5 Fully complete; no relevant information missing.",
            "Unable to Judge."
        ]
    },
    {
        "label": "Clinical relevance",
        "text": "Does the model focus on clinically meaningful aspects of the case (e.g., appropriate drug choices, patient subgroups, relevant outcomes)?",
        "scores": [
            "1 Focuses on tangential or irrelevant issues. ",
            "2 Includes few clinically related points, overall focus unclear. ",
            "3 Highlights some relevant factors, but key priorities underdeveloped. ",
            "4 Centers on important clinical aspects with minor omissions. ",
            "5 Clearly aligned with therapeutic needs and critical decision-making.",
            "Unable to Judge."
        ]
    }
]
|
|
|
|
# Pairwise (Model A vs Model B) comparison criteria. Labels mirror `criteria`
# above, but each question asks which response is better rather than for a
# per-model score.
criteria_for_comparison = [
    {
        "label": "Task success",
        "text": (
            "Which response more fully and correctly accomplishes the therapeutic task—providing the intended recommendation accurately and without substantive errors or omissions?"
        )
    },
    {
        "label": "Helpfulness of rationale",
        "text": (
            "Which response offers a clearer, more detailed rationale that genuinely aids you in judging whether the answer is correct?"
        )
    },
    {
        "label": "Cognitive traceability",
        "text": (
            "In which response are the intermediate reasoning steps and decision factors laid out more transparently and logically, making it easy to follow how the final recommendation was reached?"
        )
    },
    {
        "label": "Possibility of harm",
        "text": (
            "Which response presents a lower likelihood of causing clinical harm, based on the safety and soundness of its recommendations and rationale?"
        )
    },
    {
        "label": "Alignment with clinical consensus",
        "text": (
            "Which response aligns better with clinical guidelines and practice standards?"
        )
    },
    {
        "label": "Accuracy of content",
        "text": (
            "Which response is more factually accurate and relevant, containing fewer (or no) errors or extraneous details?"
        )
    },
    {
        "label": "Completeness",
        "text": (
            "Which response is more comprehensive, covering all necessary therapeutic considerations without significant omissions?"
        )
    },
    {
        "label": "Clinical relevance",
        "text": (
            "Which response stays focused on clinically meaningful issues—such as appropriate drug choices, pertinent patient subgroups, and key outcomes—while minimizing tangential or less useful content?"
        )
    }
]
|
|
# Maps the pairwise-comparison radio labels shown in the UI to the compact
# codes stored in the results sheet (see build_row_dict).
mapping = {
    "Model A is better.": "A",
    "Model B is better.": "B",
    "Both models are equally good.": "tie",
    "Neither model did well.": "neither"
}
|
|
|
|
def preprocess_question_id(question_id):
    """Normalize a question id to a plain string.

    Accepts either a string or a single-element list; anything else is
    reported and mapped to None.
    """
    if isinstance(question_id, list) and len(question_id) == 1:
        return question_id[0]
    if isinstance(question_id, str):
        return question_id
    print(
        "Error: Invalid question ID format. Expected a string or a single-element list.")
    return None
|
|
|
|
def get_evaluator_questions(email, disease_map_data, drug_map_data, user_all_specs, all_files, evaluator_directory, our_methods):
    """Build the list of (question, model-pair) combinations this evaluator may rate.

    Questions are kept when their diseases/drugs overlap the evaluator's
    (sub)specialties; pairs the evaluator already submitted (per the results
    sheet) are removed.

    Returns:
        (full_question_ids_list, data_by_filename) where each list entry is a
        tuple (question_id, our_model_name, other_model_name, dataset), and
        data_by_filename maps model name -> loaded result entries.
    """
    # Diseases whose mapped specialties or subspecialties intersect the
    # evaluator's declared ones.
    relevant_diseases = []
    for disease, specs in disease_map_data.items():
        disease_specs = set(specs.get('specialties', []))
        disease_subspecs = set(specs.get('subspecialties', []))

        if user_all_specs.intersection(disease_specs) or user_all_specs.intersection(disease_subspecs):
            relevant_diseases.append(disease)

    # Same overlap test for drugs.
    relevant_drugs = []
    for drug, specs in drug_map_data.items():
        drug_specs = set(specs.get('specialties', []))
        drug_subspecs = set(specs.get('subspecialties', []))

        if user_all_specs.intersection(drug_specs) or user_all_specs.intersection(drug_subspecs):
            relevant_drugs.append(drug)

    # Download every result file under the evaluator directory; one JSON file
    # per model, keyed by model name (filename without extension).
    evaluator_files = [f for f in all_files if f.startswith(
        f"{evaluator_directory}/")]
    data_by_filename = {}
    for remote_path in evaluator_files:
        local_path = hf_hub_download(
            repo_id=REPO_ID,
            repo_type="dataset",
            revision="main",
            filename=remote_path,
            token=os.getenv("HF_TOKEN")
        )
        with open(local_path, "r") as f:
            model_name_key = os.path.basename(remote_path).replace('.json', '')
            data_by_filename[model_name_key] = json.load(f)

    # Collect (question_id, dataset) pairs relevant to this evaluator, using
    # the first of our methods as the canonical question list.
    evaluator_question_ids = []
    relevant_diseases_lower = {disease.lower()
                               for disease in relevant_diseases}
    relevant_drugs_lower = {drug.lower() for drug in relevant_drugs}

    question_reference_method = our_methods[0]
    if question_reference_method in data_by_filename:
        for entry in data_by_filename[question_reference_method]:
            question_id = preprocess_question_id(entry.get("id"))
            dataset = entry.get("dataset", "")

            question_diseases = entry.get("disease", [])

            question_drugs = entry.get("drug", [])
            # Questions with no disease or drug annotations are skipped.
            if question_id is not None and question_diseases and question_drugs:
                # Case-insensitive overlap test against the relevant sets.
                question_diseases_lower = {
                    disease.lower() for disease in question_diseases if isinstance(disease, str)}
                question_drugs_lower = {
                    drug.lower() for drug in question_drugs if isinstance(drug, str)}

                if (
                    question_diseases_lower.intersection(
                        relevant_diseases_lower)
                    or question_drugs_lower.intersection(relevant_drugs_lower)
                ):
                    evaluator_question_ids.append((question_id, dataset))

    if not evaluator_question_ids:
        return [], data_by_filename

    # Cross every relevant question with every (our model, baseline) pair.
    model_names = [key for key in data_by_filename.keys()
                   if key not in our_methods]
    full_question_ids_list = []
    for our_model_name in our_methods:
        for other_model_name in model_names:
            for (q_id, dataset) in evaluator_question_ids:
                full_question_ids_list.append(
                    (q_id, our_model_name, other_model_name, dataset))

    # Drop pairs this evaluator has already rated, as recorded in the sheet.
    results_df = read_sheet_to_df(
        custom_sheet_name=str(TXAGENT_RESULTS_SHEET_BASE_NAME))
    if (results_df is not None) and (not results_df.empty):
        matched_pairs = set()
        for _, row in results_df.iterrows():
            if row["Email"] == email:
                q = row["Question ID"]
                # Normalize to (question, our model, other model) regardless of
                # which side (A or B) our model was shown on.
                a, b = row["ResponseA_Model"], row["ResponseB_Model"]
                if a in our_methods and b not in our_methods:
                    matched_pairs.add((q, a, b))
                elif b in our_methods and a not in our_methods:
                    matched_pairs.add((q, b, a))

        full_question_ids_list = [
            (q_id, our_model, other_model, dataset)
            for (q_id, our_model, other_model, dataset) in full_question_ids_list
            if (q_id, our_model, other_model) not in matched_pairs
        ]
        print(
            f"Length of filtered question IDs: {len(full_question_ids_list)}")

    return full_question_ids_list, data_by_filename
|
|
|
|
def get_next_eval_question(
    name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, our_methods,
    return_user_info=True,
    include_correct_answer=True
):
    """Draw a weighted-random unanswered question and build its display widgets.

    Downloads the specialty maps and result files, filters questions via
    get_evaluator_questions, samples one by DATASET_WEIGHTS, shuffles which
    model appears as A vs B, and returns freshly constructed Gradio components.

    Returns a 9-tuple: (user_info, chat A answer, chat B answer, chat A
    reasoning, chat B reasoning, prompt HTML, reference-answer markdown,
    question_for_eval dict, number of remaining questions). All slots except
    the count are None when nothing remains.
    """
    # Normalize (sub)specialty inputs, which may arrive as a list, a scalar,
    # or empty, into a single set of specialty strings.
    user_specialties = set(specialty_dd if isinstance(
        specialty_dd, list) else ([specialty_dd] if specialty_dd else []))
    user_subspecialties = set(subspecialty_dd if isinstance(
        subspecialty_dd, list) else ([subspecialty_dd] if subspecialty_dd else []))
    user_all_specs = user_specialties.union(user_subspecialties)

    # Enumerate repo files and fetch the disease/drug -> specialty maps.
    evaluator_directory = CROWDSOURCING_DATA_DIRECTORY
    all_files = list_repo_files(
        repo_id=REPO_ID,
        repo_type="dataset",
        revision="main",
        token=os.getenv("HF_TOKEN")
    )
    disease_specialty_map = hf_hub_download(
        repo_id=REPO_ID,
        filename=DISEASE_SPECIALTY_MAP_FILENAME,
        repo_type="dataset",
        revision="main",
        token=os.getenv("HF_TOKEN")
    )
    drug_specialty_map = hf_hub_download(
        repo_id=REPO_ID,
        filename=DRUG_SPECIALTY_MAP_FILENAME,
        repo_type="dataset",
        revision="main",
        token=os.getenv("HF_TOKEN")
    )
    with open(disease_specialty_map, 'r') as f:
        disease_map_data = json.load(f)
    with open(drug_specialty_map, 'r') as f:
        drug_map_data = json.load(f)

    # Remaining (question, model-pair) combinations for this evaluator.
    full_question_ids_list, data_by_filename = get_evaluator_questions(
        email, disease_map_data, drug_map_data, user_all_specs, all_files, evaluator_directory, our_methods
    )

    if len(full_question_ids_list) == 0:
        return None, None, None, None, None, None, None, None, 0

    # Weighted draw: weight comes from the entry's dataset (last tuple slot).
    weights = [DATASET_WEIGHTS[entry[-1]] for entry in full_question_ids_list]
    q_id, our_model_name, other_model_name, _ = random.choices(
        full_question_ids_list, weights=weights, k=1)[0]
    print("Selected question ID:", q_id)

    models_list = []

    # Locate this question's entry in each model's result file.
    # NOTE(review): if no entry matches q_id, next() yields None and the
    # .get("solution") below would raise AttributeError — assumed not to
    # happen because q_id was drawn from these files; confirm upstream.
    txagent_matched_entry = next(
        (entry for entry in data_by_filename[our_model_name] if preprocess_question_id(
            entry.get("id")) == q_id),
        None
    )
    our_model = {
        "model": our_model_name,
        "reasoning_trace": txagent_matched_entry.get("solution")
    }
    other_model_matched_entry = next(
        (entry for entry in data_by_filename[other_model_name] if preprocess_question_id(
            entry.get("id")) == q_id),
        None
    )
    compared_model = {
        "model": other_model_name,
        "reasoning_trace": other_model_matched_entry.get("solution")
    }

    models_list = [our_model, compared_model]

    # Randomize which model is shown as "Model A" vs "Model B".
    random.shuffle(models_list)

    question_for_eval = {
        "question": txagent_matched_entry.get("question"),
        "id": q_id,
        "models": models_list,
    }
    if include_correct_answer:
        question_for_eval["correct_answer"] = txagent_matched_entry.get(
            "correct_answer")

    # Split each trace into a short answer chat and a reasoning chat.
    chat_A_answer, chat_A_reasoning, _ = format_chat(
        question_for_eval['models'][0]['reasoning_trace'], tool_database_labels)
    chat_B_answer, chat_B_reasoning, _ = format_chat(
        question_for_eval['models'][1]['reasoning_trace'], tool_database_labels)
    prompt_text = question_for_eval['question']

    # Build the Gradio components that the caller wires into the page.
    page1_prompt = gr.HTML(
        f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; border-radius: 5px; color: black;"><strong style="color: black;">Question:</strong> {prompt_text}</div>')
    page1_reference_answer = gr.Markdown(txagent_matched_entry.get(
        "correct_answer")) if include_correct_answer else None
    chat_a_answer = gr.Chatbot(
        value=chat_A_answer,
        type="messages",
        height=200,
        label="Model A Answer",
        show_copy_button=False,
        show_label=True,
        render_markdown=True,
        avatar_images=None,
        rtl=False,
        autoscroll=False,
    )
    chat_b_answer = gr.Chatbot(
        value=chat_B_answer,
        type="messages",
        height=200,
        label="Model B Answer",
        show_copy_button=False,
        show_label=True,
        render_markdown=True,
        avatar_images=None,
        rtl=False,
        autoscroll=False,
    )
    chat_a_reasoning = gr.Chatbot(
        value=chat_A_reasoning,
        type="messages",
        height=300,
        label="Model A Reasoning - Rationale",
        show_copy_button=False,
        show_label=True,
        render_markdown=True,
        avatar_images=None,
        rtl=False,
        autoscroll=False,
    )
    chat_b_reasoning = gr.Chatbot(
        value=chat_B_reasoning,
        type="messages",
        height=300,
        label="Model B Reasoning - Rationale",
        show_copy_button=False,
        show_label=True,
        render_markdown=True,
        avatar_images=None,
        rtl=False,
        autoscroll=False,
    )

    # user_info carries the evaluator profile plus the drawn question id.
    user_info = (name, email, specialty_dd, subspecialty_dd, years_exp_radio,
                 exp_explanation_tb, npi_id, q_id) if return_user_info else None
    return user_info, chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, page1_prompt, page1_reference_answer, question_for_eval, len(full_question_ids_list)
|
|
|
|
def go_to_page0_from_minus1(question_in_progress_state):
    """Route from the intro page to whichever page matches the progress state.

    Emits visibility updates for four pages; exactly one becomes visible:
    state 1 -> third page, state 2 -> fourth page, anything else -> second.
    """
    if question_in_progress_state == 1:
        visible_index = 2
    elif question_in_progress_state == 2:
        visible_index = 3
    else:
        visible_index = 1
    return tuple(gr.update(visible=(i == visible_index)) for i in range(4))
|
|
|
|
def go_to_eval_progress_modal(name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, our_methods=our_methods):
    """Validate the sign-up form, load the first question, and switch pages.

    Returns a 10-tuple: (intro-page visibility, eval-page visibility,
    user_info state, status message, chat A answer, chat B answer, chat A
    reasoning, chat B reasoning, prompt HTML, question state). On validation
    failure or when no questions remain, empty placeholder components are
    returned and the intro page stays visible.
    """
    # Required fields; subspecialty, explanation and NPI are optional.
    if not name or not email or not specialty_dd or not years_exp_radio:
        gr.Info("Please fill out all the required fields (name, email, specialty, years of experience). If you are not a licensed physician with a specific specialty, please choose the specialty that most closely aligns with your biomedical expertise.", duration=5)
        return gr.update(visible=True), gr.update(visible=False), None, "Please fill out all the required fields (name, email, specialty, years of experience). If you are not a licensed physician with a specific specialty, please choose the specialty that most closely aligns with your biomedical expertise.", gr.Chatbot(), gr.Chatbot(), gr.Chatbot(), gr.Chatbot(), gr.HTML(), gr.State()

    gr.Info("Loading the data...", duration=3)
    user_info, chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, page1_prompt, page1_reference_answer, question_for_eval, remaining_count = get_next_eval_question(
        name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, our_methods
    )
    if remaining_count == 0:
        gr.Info("Based on your submitted data, you have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!", duration=5)
        return gr.update(visible=True), gr.update(visible=False), None, "Based on your submitted data, you have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!", gr.Chatbot(), gr.Chatbot(), gr.Chatbot(), gr.Chatbot(), gr.HTML(), gr.State()
    gr.Info(f"You are about to evaluate the next question.", duration=3)
    return gr.update(visible=False), gr.update(visible=True), user_info, "", chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, page1_prompt, question_for_eval
|
|
| |
|
|
|
|
def go_to_page1(show_page_1):
    """Show page 1 when requested; otherwise fall back to page 0.

    The first update always hides the landing panel; the remaining two toggle
    page 0 and page 1 so exactly one of them is visible.
    """
    show = bool(show_page_1)
    return [
        gr.update(visible=False),
        gr.update(visible=not show),
        gr.update(visible=show),
    ]
|
|
|
|
| |
def skip_question_and_load_new(user_info_state, our_methods):
    """Discard the current question and fetch a replacement.

    Returns an 11-tuple: (page visibility x2, user_info, status message,
    chat A answer, chat B answer, chat A reasoning, chat B reasoning,
    prompt HTML, reference answer, question state). Placeholder components
    are returned when there is no active session or no questions remain.
    """
    # No active session: nothing to skip.
    if user_info_state is None:
        return gr.update(visible=False), gr.update(visible=False), None, "", gr.Chatbot(), gr.Chatbot(), gr.Chatbot(), gr.Chatbot(), gr.HTML(), gr.Markdown(), gr.State()

    # Re-draw using the stored evaluator profile (last slot is the old q_id).
    name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, _ = user_info_state
    user_info, chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, page1_prompt, page1_reference_answer, question_for_eval, remaining_count = get_next_eval_question(
        name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, our_methods
    )
    if remaining_count == 0:
        return gr.update(visible=False), gr.update(visible=False), None, "Based on your submitted data, you have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!", gr.Chatbot(), gr.Chatbot(), gr.Chatbot(), gr.Chatbot(), gr.HTML(), gr.Markdown(), gr.State()
    return gr.update(visible=False), gr.update(visible=True), user_info, "", chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, page1_prompt, page1_reference_answer, question_for_eval
|
|
| |
|
|
|
|
def skip_current_question(user_info_state, our_methods: list = our_methods):
    """Load a fresh question without recording anything for the current one.

    Returns an 8-tuple matching the handler's outputs: (user_info,
    status-message update, chat A answer, chat B answer, chat A reasoning,
    chat B reasoning, prompt HTML, question state).
    """
    gr.Info("Skipping this question and loading the next one…", duration=5)
    if user_info_state is None:
        # Bug fix: this branch previously returned only 6 values while the
        # other branches (and thus the registered outputs) expect 8, causing
        # an output-count mismatch in Gradio. Pad to the full 8-tuple.
        return (
            None,
            gr.update(
                value="Please start the evaluation before skipping questions."),
            gr.update(value=[]),
            gr.update(value=[]),
            gr.update(value=[]),
            gr.update(value=[]),
            gr.update(value=""),
            gr.State()
        )

    # Unpack the evaluator profile; the last slot is the old question id.
    name, email, specialty_dd, subspecialty_dd, yrs_exp, exp_desc, npi_id, _ = user_info_state

    # The pre-built components are discarded; only the question dict and the
    # remaining count are used here (chats are re-formatted below).
    (
        user_info_new,
        _chat_a_answer,
        _chat_b_answer,
        _chat_a_reasoning,
        _chat_b_reasoning,
        _prompt_comp,
        _ref_comp,
        question_for_eval,
        remaining,
    ) = get_next_eval_question(
        name, email, specialty_dd, subspecialty_dd, yrs_exp, exp_desc, npi_id, our_methods
    )

    if remaining == 0 or question_for_eval is None:
        final_msg = (
            "Based on your submitted data, you have no more questions to evaluate. "
            "You may exit the page; we will follow‑up if we require anything else from you. "
            "Thank you!"
        )
        return (
            user_info_state,
            gr.update(value=final_msg),
            gr.update(value=[]),
            gr.update(value=[]),
            gr.update(value=[]),
            gr.update(value=[]),
            gr.update(value=""),
            gr.State()
        )

    # Re-format the chats for the freshly drawn question.
    chat_a_answer, chat_a_reasoning, _ = format_chat(
        question_for_eval['models'][0]['reasoning_trace'], tool_database_labels)
    chat_b_answer, chat_b_reasoning, _ = format_chat(
        question_for_eval['models'][1]['reasoning_trace'], tool_database_labels)

    prompt_html = (
        f"<div style='background-color: #FFEFD5; border: 2px solid #FF8C00; padding: 10px; "
        f"border-radius: 5px; color: black;'><strong style='color: black;'>Question:</strong> "
        f"{question_for_eval['question']}</div>"
    )
    gr.Info("New question loaded…", duration=3)

    return (
        user_info_new,
        gr.update(value=""),
        gr.update(value=chat_a_answer),
        gr.update(value=chat_b_answer),
        gr.update(value=chat_a_reasoning),
        gr.update(value=chat_b_reasoning),
        gr.update(value=prompt_html),
        question_for_eval
    )
|
|
| |
|
|
|
|
def flag_nonsense_and_skip(user_info_state, skip_comments=""):
    """
    When the evaluator clicks the “Wrong Question?” button, immediately
    record that this question was flagged as nonsensical/irrelevant and
    then load the next question (re‑using the existing skip logic).
    """
    # Record the flag only when a session exists; the row carries just the
    # identifying fields plus the flag and optional comments.
    if user_info_state is not None:
        name, email, specialty_dd, subspecialty_dd, yrs_exp, exp_desc, npi_id, q_id = user_info_state
        timestamp = datetime.datetime.now().isoformat()
        row = {
            "Timestamp": timestamp,
            "Name": name,
            "Email": email,
            "Question ID": q_id,
            "Question Makes No Sense or Biomedically Irrelevant": True,
            "Skip Comments": skip_comments,
        }
        append_to_sheet(
            user_data=None,
            custom_row_dict=row,
            custom_sheet_name=str(TXAGENT_RESULTS_SHEET_BASE_NAME),
            add_header_when_create_sheet=True,
        )

    # Delegate the page/state updates to the normal skip flow.
    return skip_current_question(user_info_state)
|
|
| |
|
|
|
|
def make_restrict_function(base_choices):
    """Build a callback that keeps per-model scores consistent with the
    pairwise choice (e.g. "Model A is better." forces score A >= score B).

    base_choices: the full list of score options for one criterion.
    """
    def restrict_choices_page1(radio_choice, score_a, score_b):
        """
        Returns (update_for_A, update_for_B).
        Enforces rating constraints based on the radio choice for page 1.
        """
        # Parse the leading digit of a score label ("3 Addressed..." -> 3);
        # returns None for unselected values and "Unable to Judge.".
        def to_int(x):
            try:
                return int(x.split()[0])
            except (ValueError, TypeError, AttributeError):
                return None

        # Default: full choice lists, keeping current selections when valid.
        upd_A = gr.update(choices=base_choices,
                          value=score_a if score_a in base_choices else None)
        upd_B = gr.update(choices=base_choices,
                          value=score_b if score_b in base_choices else None)

        # No comparison chosen, or "neither": no constraint to enforce.
        if radio_choice is None or radio_choice == "Neither model did well.":
            return upd_A, upd_B

        a_int = to_int(score_a)
        b_int = to_int(score_b)

        # A better than B: require score A >= score B.
        if radio_choice == "Model A is better.":
            if a_int is not None and b_int is not None:
                # Contradictory selections: reset both.
                if a_int < b_int:
                    upd_A = gr.update(choices=base_choices, value=None)
                    upd_B = gr.update(choices=base_choices, value=None)
                else:
                    # Restrict each side to values that keep A >= B.
                    allowed_a_choices = [choice for choice in base_choices if to_int(
                        choice) is None or to_int(choice) >= b_int]
                    allowed_b_choices = [choice for choice in base_choices if to_int(
                        choice) is None or to_int(choice) <= a_int]
                    upd_A = gr.update(
                        choices=allowed_a_choices, value=score_a if score_a in allowed_a_choices else None)
                    upd_B = gr.update(
                        choices=allowed_b_choices, value=score_b if score_b in allowed_b_choices else None)
            elif a_int is not None:
                # Only A chosen: cap B at A's value.
                allowed_b_choices = [choice for choice in base_choices if to_int(
                    choice) is None or to_int(choice) <= a_int]
                upd_B = gr.update(
                    choices=allowed_b_choices, value=score_b if score_b in allowed_b_choices else None)
            elif b_int is not None:
                # Only B chosen: floor A at B's value.
                allowed_a_choices = [choice for choice in base_choices if to_int(
                    choice) is None or to_int(choice) >= b_int]
                upd_A = gr.update(
                    choices=allowed_a_choices, value=score_a if score_a in allowed_a_choices else None)

        # B better than A: mirror image, require score B >= score A.
        elif radio_choice == "Model B is better.":
            if a_int is not None and b_int is not None:
                # Contradictory selections: reset both.
                if b_int < a_int:
                    upd_A = gr.update(choices=base_choices, value=None)
                    upd_B = gr.update(choices=base_choices, value=None)
                else:
                    allowed_a_choices = [choice for choice in base_choices if to_int(
                        choice) is None or to_int(choice) <= b_int]
                    allowed_b_choices = [choice for choice in base_choices if to_int(
                        choice) is None or to_int(choice) >= a_int]
                    upd_A = gr.update(
                        choices=allowed_a_choices, value=score_a if score_a in allowed_a_choices else None)
                    upd_B = gr.update(
                        choices=allowed_b_choices, value=score_b if score_b in allowed_b_choices else None)
            elif a_int is not None:
                allowed_b_choices = [choice for choice in base_choices if to_int(
                    choice) is None or to_int(choice) >= a_int]
                upd_B = gr.update(
                    choices=allowed_b_choices, value=score_b if score_b in allowed_b_choices else None)
            elif b_int is not None:
                allowed_a_choices = [choice for choice in base_choices if to_int(
                    choice) is None or to_int(choice) <= b_int]
                upd_A = gr.update(
                    choices=allowed_a_choices, value=score_a if score_a in allowed_a_choices else None)

        # Tie: both scores must be identical (including "Unable to Judge.").
        elif radio_choice == "Both models are equally good.":
            if a_int is not None and b_int is not None:
                if a_int == b_int:
                    # Lock both dropdowns to the agreed value.
                    upd_A = gr.update(choices=[score_a], value=score_a)
                    upd_B = gr.update(choices=[score_b], value=score_b)
                else:
                    # Unequal scores contradict a tie: reset both.
                    upd_A = gr.update(choices=base_choices, value=None)
                    upd_B = gr.update(choices=base_choices, value=None)
            elif a_int is not None:
                # Force the other side to match the chosen numeric score.
                upd_B = gr.update(choices=[score_a], value=score_a)
            elif b_int is not None:
                upd_A = gr.update(choices=[score_b], value=score_b)
            elif score_a == "Unable to Judge." and score_b == "Unable to Judge.":
                upd_A = gr.update(
                    choices=["Unable to Judge."], value="Unable to Judge.")
                upd_B = gr.update(
                    choices=["Unable to Judge."], value="Unable to Judge.")
            elif score_a == "Unable to Judge.":
                upd_B = gr.update(
                    choices=["Unable to Judge."], value="Unable to Judge.")
            elif score_b == "Unable to Judge.":
                upd_A = gr.update(
                    choices=["Unable to Judge."], value="Unable to Judge.")

        return upd_A, upd_B
    return restrict_choices_page1
|
|
| |
|
|
|
|
def build_row_dict(data_subset_state, user_info, pairwise, comparisons_reasons, nonsense_btn_clicked, *args):
    """Assemble a single results-sheet row from the evaluator's answers.

    *args carries the per-criterion scores: first len(criteria) values for
    Model A, then len(criteria) values for Model B.
    """
    n = len(criteria)
    scores_a = list(args[:n])
    scores_b = list(args[n:])

    row = {
        "Timestamp": datetime.datetime.now().isoformat(),
        "Name": user_info[0],
        "Email": user_info[1],
        "Specialty": str(user_info[2]),
        "Subspecialty": str(user_info[3]),
        "Years of Experience": user_info[4],
        "Experience Explanation": user_info[5],
        "NPI ID": user_info[6],
        "Question ID": user_info[7],
        "Prompt": data_subset_state['question'],
        "ResponseA_Model": data_subset_state['models'][0]['model'],
        "ResponseB_Model": data_subset_state['models'][1]['model'],
        "Question Makes No Sense or Biomedically Irrelevant": nonsense_btn_clicked,
    }

    # Translate the radio labels to compact codes; unknown values pass through.
    normalized = [mapping.get(val, val) for val in pairwise]
    for idx, crit in enumerate(criteria):
        label = crit['label']
        row[f"Criterion_{label} Comparison: Which is Better?"] = normalized[idx]
        row[f"Criterion_{label} Comments"] = comparisons_reasons[idx]
        row[f"ScoreA_{label}"] = scores_a[idx]
        row[f"ScoreB_{label}"] = scores_b[idx]

    return row
|
|
|
|
def final_submit(data_subset_state, user_info, pairwise, comparisons_reasons, nonsense_btn_clicked, *args):
    """Persist the completed evaluation and load the next question.

    Returns an 11-tuple: (clear error text, eval-page visibility, done-page
    visibility, clear status text, chat A answer, chat B answer, chat A
    reasoning, chat B reasoning, prompt HTML, question state, user_info).
    When no questions remain, the component slots are None and the done page
    becomes visible.
    """
    # Write the row for the just-finished question to the results sheet.
    row_dict = build_row_dict(data_subset_state, user_info,
                              pairwise, comparisons_reasons, nonsense_btn_clicked, *args)
    append_to_sheet(user_data=None, custom_row_dict=row_dict, custom_sheet_name=str(
        TXAGENT_RESULTS_SHEET_BASE_NAME), add_header_when_create_sheet=True)

    # Draw the next question for the same evaluator profile.
    name, email, specialty, subspecialty, years_exp_radio, exp_explanation_tb, npi_id, _ = user_info
    user_info_new, chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, page1_prompt, page1_reference_answer, question_for_eval, remaining_count = get_next_eval_question(
        name, email, specialty, subspecialty, years_exp_radio, exp_explanation_tb, npi_id, our_methods
    )

    if remaining_count == 0:
        return (
            "",
            gr.update(visible=False),
            gr.update(visible=True),
            "",
            None,
            None,
            None,
            None,
            None,
            None,
            user_info_new,
        )
    return (
        "",
        gr.update(visible=True),
        gr.update(visible=False),
        "",
        chat_a_answer,
        chat_b_answer,
        chat_a_reasoning,
        chat_b_reasoning,
        page1_prompt,
        question_for_eval,
        user_info_new
    )
|
|
|
|
| |
def validate_and_submit_page1(data_subset_state, user_info, *combined_values):
    """Validate the page-1 form, then submit it and load the next question.

    combined_values packs four equal-length groups, in order: pairwise
    choices, per-criterion comments, model-A ratings, model-B ratings.
    On a validation failure the current widget values are echoed back
    unchanged; on a successful submit the widgets are cleared when another
    question follows, or left as-is when the final page is shown.
    """
    n = len(criteria_for_comparison)
    pairwise_list = list(combined_values[:n])
    comparison_reasons_list = list(combined_values[n:n * 2])
    ratings_A_list = list(combined_values[n * 2:n * 3])
    ratings_B_list = list(combined_values[n * 3:])

    def _reject(error_msg):
        # Surface the problem, keep page 1 visible, and echo every widget
        # value back untouched (8 no-op updates cover the chat/prompt slots).
        gr.Info(error_msg)
        return ((gr.update(value=f"Error: {error_msg}"),
                 gr.update(visible=True),
                 gr.update(visible=False))
                + tuple(gr.update() for _ in range(8))
                + tuple(combined_values))

    # Every criterion needs a pairwise "which is better" choice.
    unanswered = [criteria_for_comparison[i]['label']
                  for i, choice in enumerate(pairwise_list) if choice is None]
    if unanswered:
        return _reject(f"Your response is missing for: {', '.join(unanswered)}")

    # Every criterion needs a rating for both model responses.
    if any(r is None for r in ratings_A_list) or any(r is None for r in ratings_B_list):
        missing_ratings = []
        for i in range(len(criteria)):
            gaps = []
            if ratings_A_list[i] is None:
                gaps.append("Model A Response")
            if ratings_B_list[i] is None:
                gaps.append("Model B Response")
            if gaps:
                missing_ratings.append(
                    f"{criteria[i]['label']} ({', '.join(gaps)})")
        return _reject(f"Please provide ratings for: {'; '.join(missing_ratings)}")

    gr.Info("Submitting your evaluation and loading the next question...")
    submit_result = final_submit(data_subset_state, user_info, pairwise_list,
                                 comparison_reasons_list, False,
                                 *ratings_A_list, *ratings_B_list)

    # final_submit's second slot is the page-1 visibility update; page 1
    # staying visible means another question follows, so clear the form.
    page1_update = submit_result[1]
    page1_visible = page1_update.get('visible', False) if isinstance(
        page1_update, dict) else False
    gr.Info("Your evaluation has been submitted. You are about to evaluate the next question...")

    if page1_visible:
        return submit_result + (None,) * len(combined_values)
    # Final page is being shown; leave widget values as they are.
    return submit_result + tuple(combined_values)
|
|
|
|
# Custom CSS injected into the Gradio app: page layout, button theming
# (incl. a dark-mode override for the reference button), and forced
# vertical stacking of the per-criterion score radio groups.
centered_col_css = """
#centered-column {
    margin-left: auto;
    margin-right: auto;
    max-width: 800px; /* Adjust this width as desired */
    width: 100%;
}
#participate-btn {
    background-color: purple !important;
    color: white !important;
    border-color: purple !important;
}
#answer-reference-btn {
    /* Light‑mode palette */
    --btn-bg: #E0F2FF; /* soft pastel blue */
    --btn-text: #00334D; /* dark slate for good contrast */
    --btn-border: #E0F2FF;

    background-color: var(--btn-bg) !important;
    color: var(--btn-text) !important;
    border: 1px solid var(--btn-border) !important;
}

/* Dark‑mode overrides */
@media (prefers-color-scheme: dark) {
    #answer-reference-btn {
        --btn-bg: #2C6E98; /* muted steel blue for dark backgrounds */
        --btn-text: #FFFFFF; /* switch to white text for contrast */
        --btn-border: #2C6E98;
    }
}

#clear_btn {
    background-color: #F08080 !important;
    color: white !important;
    border-color: #F08080 !important;
}
.reference-box {
    border: 1px solid #ccc;
    padding: 10px;
    border-radius: 5px;
}
.short-btn { min-width: 80px !important; max-width: 120px !important; width: 100px !important; padding-left: 4px !important; padding-right: 4px !important; }
.light-stop-btn { background-color: #ffcccc !important; color: #b30000 !important; border-color: #ffcccc !important; }

/* --- Added for larger criteria font --- */
.criteria-font-large {
    font-size: 1.2em !important;
}
/* Radio component labels (the title above the choices) */
.criteria-radio-label label[data-testid="block-label"] {
    font-weight: bold !important;
    font-size: 1.1em !important;
}
/* Textbox labels */
.textbox-bold-label label[data-testid="block-label"] {
    font-weight: bold !important;
}
#participate-btn button {
    font-size: 24px !important;   /* Large readable text */
    font-weight: 700 !important;  /* Bold for emphasis */
    padding: 28px 40px !important; /* Extra padding for height */
    min-height: 120px !important; /* Make button visibly taller (multi‑line) */
    width: 100% !important;       /* Occupy full width of its column */
    white-space: normal !important; /* Allow text to wrap onto multiple lines */
}
.criteria-radio-score-label [role="radiogroup"],
.criteria-radio-score-label .gr-radio-group,
.criteria-radio-score-label .flex {
    display: flex !important;
    flex-direction: column !important;
    gap: 4px !important; /* 行间距,可按需调整 */
}

/* 更具体的选择器来确保垂直布局 */
.criteria-radio-score-label fieldset {
    display: flex !important;
    flex-direction: column !important;
    gap: 4px !important;
}

.criteria-radio-score-label .wrap {
    display: flex !important;
    flex-direction: column !important;
    gap: 4px !important;
}

/* 确保每个单选按钮选项垂直排列 */
.criteria-radio-score-label label {
    display: block !important;
    margin-bottom: 4px !important;
}
"""
with gr.Blocks(css=centered_col_css) as demo:
    # --- Per-session Gradio state holders (one value per browser session) ---
    user_info_state = gr.State()
    pairwise_state = gr.State()
    scores_A_state = gr.State()
    comparison_reasons = gr.State()
    nonsense_btn_clicked = gr.State(False)  # whether the current question was flagged
    unqualified_A_state = gr.State()
    data_subset_state = gr.State()  # question + model-response payload under evaluation
    question_in_progress = gr.State(0)

    # --- Load specialty / subspecialty choices for the sign-up dropdowns ---
    specialties_path = "specialties.json"
    subspecialties_path = "subspecialties.json"

    try:
        with open(specialties_path, 'r') as f:
            specialties_list = json.load(f)
        with open(subspecialties_path, 'r') as f:
            subspecialties_list = json.load(f)
    except FileNotFoundError:
        print(
            f"Error: Could not find specialty files at {specialties_path} or {subspecialties_path}. Please ensure these files exist.")
        # Fall back to sentinel entries so the UI still renders.
        specialties_list = ["Error loading specialties"]
        subspecialties_list = ["Error loading subspecialties"]
    except json.JSONDecodeError:
        print(f"Error: Could not parse JSON from specialty files.")
        specialties_list = ["Error parsing specialties"]
        subspecialties_list = ["Error parsing subspecialties"]

    # --- Landing page: choose between evaluating models or submitting questions ---
    with gr.Column(visible=True, elem_id="page-1") as page_minus1:
        gr.HTML("""
        <div>
            <h1>TxAgent Portal: AI Evaluation and Crowdsourcing of Therapeutic Questions</h1>
        </div>
        """)

        # Call-to-action: start the evaluation flow.
        with gr.Column(scale=1):
            participate_eval_btn = gr.Button(
                value="Evaluate TxAgent",
                variant="primary",
                size="lg",
                elem_id="participate-btn"
            )
        with gr.Column(scale=1):
            gr.Markdown(
                """
                When you join Evaluate TxAgent, you will:
                - See model responses to diverse prompts.
                - Provide instant thumbs-up or thumbs-down ratings.
                - Influence the roadmap for future releases.

                Thank you for helping improve TxAgent!
                """
            )
        # Call-to-action: contribute new therapeutic questions (Google Form).
        with gr.Column(scale=1):
            submit_questions_btn = gr.Button(
                value="Submit Your Therapeutic Questions",
                variant="primary",
                size="lg",
                elem_id="submit-btn"
            )

        with gr.Column(scale=1):
            gr.Markdown(
                """
                By submitting therapeutic questions, you will:
                - Help identify edge cases and blind spots for AI models.
                - Help extend AI models to reason in new domains.
                - Directly shape future model improvements.

                We look forward to seeing your feedback!
                """
            )

        contact_info_markdown = """
        ## Contact

        For questions or suggestions, email [Shanghua Gao](mailto:shanghuagao@gmail.com) and [Marinka Zitnik](mailto:marinka@hms.harvard.edu).
        """

        gr.Markdown(contact_info_markdown)

        gr.HTML(TxAgent_Project_Page_HTML)

    # Open the question-submission Google Form in a new tab.
    # fn=None + js means this handler runs entirely client-side.
    google_form_url = "https://forms.gle/pYvyvEQQwS5gdupQA"
    submit_questions_btn.click(
        fn=None,
        inputs=None,
        outputs=None,
        js=f"() => {{ window.open('{google_form_url}', '_blank'); }}"
    )
|
|
| |
    # --- Page 0: evaluator sign-up form (shown after "Evaluate TxAgent") ---
    with gr.Column(visible=False, elem_id="page0") as page0:

        gr.Markdown("## Sign Up")
        name = gr.Textbox(label="Name (required)")
        email = gr.Textbox(
            label="Email (required). Use the same email each time you log into this evaluation portal to avoid receiving repeat questions.")
        specialty_dd = gr.Dropdown(
            choices=specialties_list, label="Primary Medical Specialty (required). Visit https://www.abms.org/member-boards/specialty-subspecialty-certificates/ for categories.", multiselect=True)
        subspecialty_dd = gr.Dropdown(
            choices=subspecialties_list, label="Subspecialty (if applicable). Visit https://www.abms.org/member-boards/specialty-subspecialty-certificates/ for categories.", multiselect=True)
        npi_id = gr.Textbox(
            label="National Provider Identifier ID (optional). Visit https://npiregistry.cms.hhs.gov/search to find your NPI ID. Leave blank if you do not have an NPI ID.")
        years_exp_radio = gr.Radio(
            choices=["0-2 years", "3-5 years", "6-10 years",
                     "11-20 years", "20+ years", "Not Applicable"],
            label="Years of experience in clinical and/or research activities related to your biomedical expertise (required)."
        )
        exp_explanation_tb = gr.Textbox(
            label="Briefly describe your expertise in AI (optional).")

        # Inline validation feedback populated by the Next handler.
        page0_error_box = gr.Markdown("")
        with gr.Row():
            next_btn_0 = gr.Button("Next")
        gr.Markdown("""Click Next to start the study. Your progress will be saved after you submit each question. For questions or concerns, contact us directly. Thank you for participating!
        """)
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| |
    # --- Page 1: side-by-side model comparison and per-criterion ratings ---
    with gr.Column(visible=False) as page1:
        with gr.Accordion("Instructions", open=False):
            gr.Markdown("""
            ## Instructions:
            Please review these instructions and enter your information to begin:

            - Each session requires at least 5-10 minutes per question.
            - You can evaluate multiple questions; you will not repeat evaluations.
            - For each question, compare responses from two models and rate them (scale: 1-5).
            - If a question is unclear or irrelevant to biomedicine, click the RED BUTTON at the top of the comparison page.
            - Use the Back and Next buttons to edit responses before submission.
            - Use the Home Page button to return to the homepage; progress will save but not submit.
            - Submit answers to the current question before moving to the next.
            - You can pause between questions and return later; ensure current answers are submitted to save them.
            """)

        # Prompt under evaluation (filled dynamically by the page handlers).
        page1_prompt = gr.HTML()
        with gr.Row():
            # Flag the question as nonsensical / irrelevant and move on.
            nonsense_btn = gr.Button(
                "Skip Question",
                size="sm",
                variant="stop",
                elem_id="invalid-question-btn",
                elem_classes=["short-btn"],
                scale=1
            )
            skip_comments = gr.Textbox(
                placeholder="(Optional) Why do you want to skip this question...",
                show_label=False,
                scale=3,
                container=False,
            )

        # Inline validation feedback populated on submit.
        page1_error_box = gr.Markdown("")

        # Two anonymized model responses shown side by side.
        with gr.Row():
            with gr.Column():
                gr.Markdown("**Model A Response:**")
                chat_a_answer = gr.Chatbot(
                    value=[],
                    type="messages",
                    height=200,
                    label="Model A Answer",
                    show_copy_button=False,
                    show_label=True,
                    render_markdown=True,
                    avatar_images=None,
                    rtl=False
                )
                chat_a_reasoning = gr.Chatbot(
                    value=[],
                    type="messages",
                    height=300,
                    label="Model A Reasoning - Rationale",
                    show_copy_button=False,
                    show_label=True,
                    render_markdown=True,
                    avatar_images=None,
                    rtl=False
                )

            with gr.Column():
                gr.Markdown("**Model B Response:**")
                chat_b_answer = gr.Chatbot(
                    value=[],
                    type="messages",
                    height=200,
                    label="Model B Answer",
                    show_copy_button=False,
                    show_label=True,
                    render_markdown=True,
                    avatar_images=None,
                    rtl=False
                )
                chat_b_reasoning = gr.Chatbot(
                    value=[],
                    type="messages",
                    height=300,
                    label="Model B Reasoning - Rationale",
                    show_copy_button=False,
                    show_label=True,
                    render_markdown=True,
                    avatar_images=None,
                    rtl=False
                )

        # Widget lists collected per criterion; their order must match the
        # slicing performed in validate_and_submit_page1.
        comparison_reasons_inputs = []
        pairwise_inputs = []
        ratings_A_page1 = []
        ratings_B_page1 = []

        for i, crit_comp in enumerate(criteria_for_comparison):
            # NOTE(review): criteria and criteria_for_comparison are assumed
            # index-aligned — confirm they are built from the same source.
            crit_score = criteria[i]

            # Callback that keeps the two numeric scores consistent with the
            # pairwise choice (e.g. "A is better" constrains A >= B).
            restrict_fn = make_restrict_function(sorted(crit_score["scores"]))

            gr.Markdown(f"**{crit_comp['label']}**",
                        elem_classes="criteria-font-large")
            radio = gr.Radio(
                choices=[
                    "Model A is better.",
                    "Model B is better.",
                    "Both models are equally good.",
                    "Neither model did well."
                ],
                label=crit_comp['text'],
                elem_classes="criteria-radio-label"
            )
            pairwise_inputs.append(radio)

            # NOTE(review): hidden index component appears unused by any
            # handler below — confirm before removing.
            index_component = gr.Number(
                value=i, visible=False, interactive=False)

            with gr.Row():
                with gr.Column(scale=1):
                    rating_a = gr.Radio(choices=sorted(crit_score["scores"]),
                                        label=f"Model A Response - {crit_score['text']}",
                                        interactive=True,
                                        elem_classes="criteria-radio-score-label")
                with gr.Column(scale=1):
                    rating_b = gr.Radio(choices=sorted(crit_score["scores"]),
                                        label=f"Model B Response - {crit_score['text']}",
                                        interactive=True,
                                        elem_classes="criteria-radio-score-label")

            with gr.Row():
                # Re-apply the consistency restriction whenever any of the
                # three linked widgets changes.
                radio.change(
                    fn=restrict_fn,
                    inputs=[radio, rating_a, rating_b],
                    outputs=[rating_a, rating_b]
                )
                rating_a.change(
                    fn=restrict_fn,
                    inputs=[radio, rating_a, rating_b],
                    outputs=[rating_a, rating_b]
                )
                rating_b.change(
                    fn=restrict_fn,
                    inputs=[radio, rating_a, rating_b],
                    outputs=[rating_a, rating_b]
                )

            ratings_A_page1.append(rating_a)
            ratings_B_page1.append(rating_b)

            text_input = gr.Textbox(
                placeholder="Comments for your selection (optional)",
                show_label=False,
            )
            comparison_reasons_inputs.append(text_input)

        with gr.Row():
            submit_btn_1 = gr.Button(
                "Submit Evaluation", variant="primary", elem_id="submit_btn")
|
| |
    # --- Final page: shown when the evaluator has exhausted all questions ---
    with gr.Column(visible=False, elem_id="final_page") as final_page:
        gr.Markdown(
            "## You have no questions left to evaluate. Thank you for your participation!")

    # --- Modal for surfacing blocking errors; OK simply dismisses it ---
    with Modal("Error", visible=False, elem_id="error_modal") as error_modal:
        error_message_box = gr.Markdown()
        ok_btn = gr.Button("OK")

        ok_btn.click(lambda: gr.update(visible=False), None, error_modal)

    # --- Event wiring ---

    # Landing page -> sign-up page.
    participate_eval_btn.click(
        fn=go_to_page0_from_minus1,
        inputs=[question_in_progress],
        outputs=[page_minus1, page0, page1, final_page]
    )

    # Sign-up -> evaluation page: validates user info and loads the first question.
    next_btn_0.click(
        fn=go_to_eval_progress_modal,
        inputs=[name, email, specialty_dd, subspecialty_dd,
                years_exp_radio, exp_explanation_tb, npi_id],
        outputs=[page0, page1, user_info_state, page0_error_box, chat_a_answer,
                 chat_b_answer, chat_a_reasoning, chat_b_reasoning, page1_prompt, data_subset_state],
        scroll_to_output=True
    )

    # Flag the current question as nonsense and load the next one.
    nonsense_btn.click(
        fn=flag_nonsense_and_skip,
        inputs=[user_info_state, skip_comments],
        outputs=[user_info_state, page1_error_box, chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning,
                 page1_prompt, data_subset_state],
        scroll_to_output=True
    )

    # Validate + submit the evaluation; outputs cover both the page widgets
    # and every dynamic input so they can be reset between questions.
    submit_btn_1.click(
        fn=validate_and_submit_page1,
        inputs=[data_subset_state, user_info_state, *pairwise_inputs,
                *comparison_reasons_inputs, *ratings_A_page1, *ratings_B_page1],
        outputs=[page1_error_box, page1, final_page, page0_error_box, chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning,
                 page1_prompt, data_subset_state, user_info_state, *pairwise_inputs, *comparison_reasons_inputs, *ratings_A_page1, *ratings_B_page1],
        scroll_to_output=True
    )


demo.launch(share=True, allowed_paths=["."])
|
|