"""Streamlit app that estimates the API cost of benchmarking LLMs on MedMCQA.

The sidebar holds an editable JSON price table (USD per 1M input/output
tokens).  The main page lets the user pick models and simulation parameters,
then shows the estimated cost per model, charts, and CSV/JSON exports.
"""

import json
from typing import Dict, List, Tuple

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st

st.set_page_config(
    page_title="LLM Healthcare Benchmarking Budgeting",
    page_icon="🩺",
    layout="wide",
)

blue_to_gray_palette = ["#0077b6", "#4a98c9", "#7ba7c5", "#a6b5c1", "#d0d7dc"]

# Prices in the models table are quoted per this many tokens.
TOKENS_PER_PRICING_UNIT = 1_000_000

# Columns that hold dollar amounts in the per-model result table.
COST_COLUMNS = ["Input Cost", "Output Cost", "Total Cost"]

# NOTE(review): this call injects only whitespace; it looks like a custom
# stylesheet was meant to live here -- confirm against the original file.
st.markdown(""" """, unsafe_allow_html=True)

# NOTE(review): the section headers below contain bare text only; presumably
# they were wrapped in HTML tags (hence unsafe_allow_html) -- confirm.
st.markdown(
    "\nBudgeting for LLM Healthcare Benchmarking\n", unsafe_allow_html=True
)

# Default price table, USD per 1M tokens.  "Anthropic Claude 3 Opus" was
# previously a copy of the 3.5 Haiku row (0.8 / 4); its list price is 15 / 75.
default_models_json = """{
    "OpenAI gpt-4.5-preview": {"input_cost": 75, "output_cost": 150},
    "OpenAI gpt-4o": {"input_cost": 2.5, "output_cost": 10},
    "OpenAI gpt-4o-mini": {"input_cost": 0.15, "output_cost": 0.6},
    "OpenAI o1": {"input_cost": 15, "output_cost": 60},
    "OpenAI o1-mini": {"input_cost": 1.1, "output_cost": 4.4},
    "OpenAI o3-mini": {"input_cost": 1.1, "output_cost": 4.4},
    "Anthropic Claude 3.7 Sonnet": {"input_cost": 3, "output_cost": 15},
    "Anthropic Claude 3.5 Haiku": {"input_cost": 0.8, "output_cost": 4},
    "Anthropic Claude 3 Opus": {"input_cost": 15, "output_cost": 75},
    "Anthropic Claude 3.5 Sonnet": {"input_cost": 3, "output_cost": 15},
    "Anthropic Claude 3 Haiku": {"input_cost": 0.25, "output_cost": 1.25},
    "TogetherAI DeepSeek-R1": {"input_cost": 3, "output_cost": 7},
    "Llama 3.2 3B Instruct Turbo": {"input_cost": 0.06, "output_cost": 0.06},
    "Gemini 2.0 Flash": {"input_cost": 0.1, "output_cost": 0.4},
    "Gemini 2.0 Flash-Lite": {"input_cost": 0.075, "output_cost": 0.3},
    "Gemini 1.5 Pro": {"input_cost": 1.25, "output_cost": 5},
    "Gemini Pro": {"input_cost": 0.5, "output_cost": 1.5},
    "Mistral Small": {"input_cost": 0.1, "output_cost": 0.3},
    "Mistral Large": {"input_cost": 2, "output_cost": 6}
}"""


def _parse_models_config(raw_json: str) -> Dict[str, Dict[str, float]]:
    """Parse the sidebar JSON and check it has the shape the app relies on.

    Args:
        raw_json: Text of a JSON object mapping model name to an object with
            numeric ``input_cost`` and ``output_cost`` entries.

    Returns:
        The decoded mapping.

    Raises:
        json.JSONDecodeError: If ``raw_json`` is not valid JSON.
        ValueError: If the JSON is valid but is not an object of
            ``{"input_cost": number, "output_cost": number}`` entries.
    """
    parsed = json.loads(raw_json)
    if not isinstance(parsed, dict):
        raise ValueError("top-level value must be an object keyed by model name")
    for name, pricing in parsed.items():
        if not isinstance(pricing, dict):
            raise ValueError(f"entry for '{name}' must be an object")
        for key in ("input_cost", "output_cost"):
            value = pricing.get(key)
            # bool is a subclass of int, so reject it explicitly.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                raise ValueError(f"'{name}' needs a numeric '{key}'")
    return parsed


# Add JSON editor to sidebar
st.sidebar.markdown("\nLLM Models Configuration\n", unsafe_allow_html=True)
st.sidebar.markdown(
    "Edit the JSON below to modify existing models or add new ones:"
)

# Display JSON in a text area for editing
models_json = st.sidebar.text_area("Models JSON", default_models_json, height=400)

# Parse the JSON input; on any problem report it and fall back to the
# defaults so the rest of the page keeps working.
try:
    llm_models = _parse_models_config(models_json)
except json.JSONDecodeError as e:
    st.sidebar.error(f"Invalid JSON: {str(e)}")
    llm_models = json.loads(default_models_json)
except ValueError as e:
    # Valid JSON with the wrong shape would otherwise crash further down
    # (AttributeError on .keys(), KeyError on 'input_cost', ...).
    st.sidebar.error(f"Invalid models configuration: {e}")
    llm_models = json.loads(default_models_json)

medmcqa_splits = {
    "Single-Select Questions": {
        "questions": 120765,
        "avg_q_tokens": 12.77,  # Using the train dataset average
        "description": "Single-select questions from the MedMCQA train dataset",
    }
}

col1, col2 = st.columns([2, 1])

with col1:
    st.markdown("\nSelect LLM Models\n", unsafe_allow_html=True)
    model_names = list(llm_models)
    selected_models = st.multiselect(
        "Choose one or more LLM models:",
        options=model_names,
        default=model_names[:2],
    )

    with st.expander("View Model Details"):
        models_df = pd.DataFrame(
            [
                {
                    "Model": name,
                    "Input Cost (per 1M tokens)": f"${pricing['input_cost']:.2f}",
                    "Output Cost (per 1M tokens)": f"${pricing['output_cost']:.2f}",
                }
                for name, pricing in llm_models.items()
            ]
        )
        st.dataframe(models_df, use_container_width=True)

with col2:
    st.markdown("\nMedMCQA Dataset\n", unsafe_allow_html=True)
    split_info = medmcqa_splits["Single-Select Questions"]
    st.markdown(
        f"**Single-Select Questions:** {split_info['questions']:,}\n\n"
        f"**Average Question Tokens:** {split_info['avg_q_tokens']}\n\n"
        f"**Description:** {split_info['description']}"
    )

st.markdown("\nCost Simulation Parameters\n", unsafe_allow_html=True)

col1, col2 = st.columns(2)

with col1:
    prompt_tokens = st.number_input(
        "Prompt Tokens per Question",
        min_value=1,
        max_value=1000,
        value=200,
        step=10,
        help="Number of tokens in each prompt (including the question and any additional instructions)",
    )

with col2:
    output_tokens = st.number_input(
        "Output Tokens per Question",
        min_value=1,
        max_value=1000,
        value=100,
        step=10,
        help="Average number of tokens in the model's response",
    )

col1, col2, col3 = st.columns(3)

with col1:
    num_runs = st.number_input(
        "Number of Evaluation Runs",
        min_value=1,
        max_value=1000,
        value=1,
        step=1,
        help="How many times each dataset will be processed by each model",
    )

with col2:
    # Empty spacer column between the two inputs.
    st.write("")

with col3:
    sampling_percentage = st.slider(
        "Dataset Sampling Percentage",
        min_value=1,
        max_value=100,
        value=100,
        step=1,
        help="Percentage of questions to process from each split",
    )


def calculate_costs(
    models: List[str],
    prompt_token_count: int,
    output_token_count: int,
    runs: int,
    sampling_pct: float,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Estimate the cost of running each model over the sampled dataset.

    Prices are read from the module-level ``llm_models`` table and the
    question count from ``medmcqa_splits``.

    Args:
        models: Names of the models to price; each must be a key of
            ``llm_models``.
        prompt_token_count: Prompt tokens sent per question.
        output_token_count: Output tokens generated per question.
        runs: Number of passes over the sampled questions.
        sampling_pct: Percentage (0-100) of the split's questions to process.

    Returns:
        A ``(cost_df, model_summary, split_summary)`` tuple: one row per
        model, the cost columns summed per model, and question/token/cost
        totals summed per split.

    Raises:
        KeyError: If a name in ``models`` is not in ``llm_models``.
    """
    split_name = "Single-Select Questions"
    total_questions = medmcqa_splits[split_name]["questions"]

    # Floor-divide the exact product rather than truncating
    # total * (pct / 100): the float form can land just below an integer
    # and lose one question.
    num_questions = int(total_questions * sampling_pct // 100)

    # Token volumes do not depend on the model, so compute them once.
    total_input_tokens = num_questions * prompt_token_count * runs
    total_output_tokens = num_questions * output_token_count * runs

    results = []
    for model in models:
        pricing = llm_models[model]
        input_cost = (
            total_input_tokens / TOKENS_PER_PRICING_UNIT
        ) * pricing["input_cost"]
        output_cost = (
            total_output_tokens / TOKENS_PER_PRICING_UNIT
        ) * pricing["output_cost"]
        results.append(
            {
                "Model": model,
                "Questions": num_questions,
                "Number of Prompt Tokens per Question": prompt_token_count,
                "Number of Output Tokens per Question": output_token_count,
                "Total Input Tokens": total_input_tokens,
                "Total Output Tokens": total_output_tokens,
                "Input Cost": input_cost,
                "Output Cost": output_cost,
                "Total Cost": input_cost + output_cost,
                "Split": split_name,
            }
        )

    cost_df = pd.DataFrame(results)

    model_summary = (
        cost_df.groupby("Model")
        .agg({"Input Cost": "sum", "Output Cost": "sum", "Total Cost": "sum"})
        .reset_index()
    )

    split_summary = (
        cost_df.groupby("Split")
        .agg(
            {
                "Questions": "sum",
                "Total Input Tokens": "sum",
                "Total Output Tokens": "sum",
                "Total Cost": "sum",
            }
        )
        .reset_index()
    )

    return cost_df, model_summary, split_summary


if selected_models:
    detailed_costs, model_summary, split_summary = calculate_costs(
        selected_models,
        prompt_tokens,
        output_tokens,
        num_runs,
        sampling_percentage,
    )

    total_cost = float(detailed_costs["Total Cost"].sum())
    # Every row carries the same question count; read it positionally so the
    # lookup does not depend on the index labels.
    total_questions = int(detailed_costs["Questions"].iloc[0])
    total_input_tokens = int(detailed_costs["Total Input Tokens"].sum())
    total_output_tokens = int(detailed_costs["Total Output Tokens"].sum())

    st.markdown("\nCost Calculation Breakdown\n", unsafe_allow_html=True)

    with st.expander("View Detailed Cost Calculation Formula", expanded=False):
        st.markdown(
            "\n".join(
                [
                    "### Cost Calculation Formula",
                    "",
                    "For each model, the cost is calculated as:",
                    "",
                    "```",
                    "Input Cost = (Number of Questions × Prompt Tokens per Question × Number of Runs ÷ 1,000,000) × Input Cost per Million Tokens",
                    "Output Cost = (Number of Questions × Output Tokens per Question × Number of Runs ÷ 1,000,000) × Output Cost per Million Tokens",
                    "Total Cost = Input Cost + Output Cost",
                    "```",
                ]
            )
        )

        for row in detailed_costs.to_dict(orient="records"):
            model = row["Model"]
            pricing = llm_models[model]
            # Dollar signs are escaped: st.markdown renders text between a
            # pair of '$' as LaTeX, which mangled "× $a = $b".
            st.markdown(
                f"#### {model}:\n\n"
                f"**Input Cost Calculation:** "
                f"({total_questions:,} questions × {prompt_tokens} tokens × {num_runs} runs ÷ 1,000,000) "
                f"× \\${pricing['input_cost']:.2f} = \\${row['Input Cost']:.2f}\n\n"
                f"**Output Cost Calculation:** "
                f"({total_questions:,} questions × {output_tokens} tokens × {num_runs} runs ÷ 1,000,000) "
                f"× \\${pricing['output_cost']:.2f} = \\${row['Output Cost']:.2f}\n\n"
                f"**Total Cost for {model}:** \\${row['Total Cost']:.2f}"
            )

    # Headline summary.  The leading/trailing empty items reproduce the blank
    # lines of the original multi-line literal.
    summary_lines = [
        "",
        "Total Estimated Cost",
        f"${total_cost:.2f}",
        "",
        f"For processing {total_questions:,} questions ({sampling_percentage}% of total) with {len(selected_models)} models, {num_runs} time{'s' if num_runs > 1 else ''}.",
        "",
        f"Using {prompt_tokens} prompt tokens and {output_tokens} output tokens per question.",
        "",
        f"Total tokens processed: {total_input_tokens:,} input tokens + {total_output_tokens:,} output tokens = {total_input_tokens + total_output_tokens:,} total tokens",
        "",
        "",
    ]
    st.markdown("\n".join(summary_lines), unsafe_allow_html=True)

    tab1, tab2 = st.tabs(["Cost Breakdown", "Detailed Costs"])

    with tab1:
        col1, col2 = st.columns(2)

        with col1:
            cost_types = ["Input Cost", "Output Cost"]
            fig1 = px.bar(
                model_summary,
                x="Model",
                y=cost_types,
                title="Cost Breakdown by Model",
                labels={"value": "Cost ($)", "variable": "Cost Type"},
                color_discrete_sequence=blue_to_gray_palette,
            )
            fig1.update_layout(
                legend=dict(
                    orientation="h",
                    yanchor="bottom",
                    y=1.02,
                    xanchor="right",
                    x=1,
                )
            )
            st.plotly_chart(fig1, use_container_width=True)

        with col2:
            fig2 = go.Figure(
                data=[
                    go.Pie(
                        labels=model_summary["Model"],
                        values=model_summary["Total Cost"],
                        hole=0.4,
                        textinfo="label+percent",
                        marker_colors=blue_to_gray_palette,
                    )
                ]
            )
            # The figure was previously built but never rendered.
            st.plotly_chart(fig2, use_container_width=True)

        # The heatmap only makes sense with several splits; with the single
        # split defined above this branch is skipped.
        if "Split" in detailed_costs.columns and detailed_costs["Split"].nunique() > 1:
            pivot_df = detailed_costs.pivot(
                index="Split", columns="Model", values="Total Cost"
            )
            fig4 = px.imshow(
                pivot_df,
                labels=dict(x="Model", y="Split", color="Cost ($)"),
                x=pivot_df.columns,
                y=pivot_df.index,
                color_continuous_scale=blue_to_gray_palette,
                title="Cost Heatmap (Model vs Split)",
                text_auto=".2f",
            )
            fig4.update_layout(height=400)
            st.plotly_chart(fig4, use_container_width=True)

    with tab2:
        display_cols = [
            "Model",
            "Questions",
            "Number of Prompt Tokens per Question",
            "Number of Output Tokens per Question",
            "Total Input Tokens",
            "Total Output Tokens",
            "Input Cost",
            "Output Cost",
            "Total Cost",
        ]
        formatted_df = detailed_costs[display_cols].copy()

        # Format currency columns
        for col in COST_COLUMNS:
            formatted_df[col] = formatted_df[col].map("${:.2f}".format)

        # Format number columns with thousands separators
        for col in ["Questions", "Total Input Tokens", "Total Output Tokens"]:
            formatted_df[col] = formatted_df[col].map("{:,}".format)

        st.dataframe(formatted_df, use_container_width=True)

    st.markdown("\nExport Results\n", unsafe_allow_html=True)

    # Round the money columns to cents once; both exports share the result.
    rounded_costs = detailed_costs.round({col: 2 for col in COST_COLUMNS})

    col1, col2 = st.columns(2)

    with col1:
        st.download_button(
            label="Download Full Results (CSV)",
            data=rounded_costs.to_csv(index=False),
            file_name="medmcqa_llm_cost_analysis.csv",
            mime="text/csv",
        )

    with col2:
        export_json = {
            "parameters": {
                "models": selected_models,
                "dataset": "MedMCQA Single-Select Questions",
                "total_questions": medmcqa_splits["Single-Select Questions"]["questions"],
                "prompt_tokens": prompt_tokens,
                "output_tokens": output_tokens,
                "sampling_percentage": sampling_percentage,
                "num_runs": num_runs,
            },
            "results": {
                "total_cost": round(total_cost, 2),
                "detailed_costs": rounded_costs.to_dict(orient="records"),
                "model_summary": model_summary.round(2).to_dict(orient="records"),
            },
        }
        st.download_button(
            label="Download Full Results (JSON)",
            data=json.dumps(export_json, indent=4),
            file_name="medmcqa_llm_cost_analysis.json",
            mime="application/json",
        )
else:
    st.info("Please select at least one model and one dataset split to calculate costs.")