llm-pricing-calculator

Running

App Files Files Community

Presidentlin committed on Jun 22

Commit

2336e04

1 Parent(s): e955c4c

x

Browse files

Files changed (1) hide show

src/lib/benchmarks/openai.ts +151 -12

src/lib/benchmarks/openai.ts CHANGED Viewed

@@ -71,15 +71,34 @@ export const openaiBenchmarks: Benchmark[] = [
         provider: "OpenAI",
         inputPrice: 2.0,
         outputPrice: 8.0,
-        source: "https://github.com/openai/simple-evals",
         benchmark: {
             mmlu: 90.2,
             gpqa: 66.3,
             humaneval: 94.5,
             simpleqa: 41.6,
-            // math: 82.1,
-            // mgsm: 86.9,
-            // drop: 79.4,
         },
     },
     {
@@ -87,15 +106,21 @@ export const openaiBenchmarks: Benchmark[] = [
         provider: "OpenAI",
         inputPrice: 0.4,
         outputPrice: 1.6,
-        source: "https://github.com/openai/simple-evals",
         benchmark: {
             mmlu: 87.5,
             gpqa: 65.0,
             humaneval: 93.8,
             simpleqa: 16.8,
-            // math: 81.4,
-            // mgsm: 88.2,
-            // drop: 81.0,
         },
     },
     {
@@ -103,17 +128,23 @@ export const openaiBenchmarks: Benchmark[] = [
         provider: "OpenAI",
         inputPrice: 0.1,
         outputPrice: 0.4,
-        source: "https://github.com/openai/simple-evals",
         benchmark: {
             mmlu: 80.1,
             gpqa: 50.3,
             humaneval: 87.0,
             simpleqa: 7.6,
-            // math: 62.3,
-            // mgsm: 73.0,
-            // drop: 82.2,
         },
     },
     {
         model: "GPT-4.5-preview-2025-02-27",
         provider: "OpenAI",
@@ -178,4 +209,112 @@ export const openaiBenchmarks: Benchmark[] = [
             // drop: 83.2,
         },
     },
 ];

         provider: "OpenAI",
         inputPrice: 2.0,
         outputPrice: 8.0,
+        source: "https://openai.com/index/gpt-4-1/",
         benchmark: {
             mmlu: 90.2,
             gpqa: 66.3,
+            gpqa_diamond: 66.3,
             humaneval: 94.5,
             simpleqa: 41.6,
+            swe_bench_verified: 54.6,
+            aider_polyglot: 52.9,
+            mmmlu: 90.2,
+            video_mme: 72.0,
+            // Not yet in BenchmarkMetric
+            aime_24: 48.1,
+            // aime_2025: undefined,
+            // mmlu_pro: undefined,
+            // egoschema: undefined,
+            // loft: undefined,
+            // lcb: undefined,
+            // bigcodebench: undefined,
+            // mbpp: undefined,
+            // livecodebench_v6: undefined,
+            // lbpp_v2: undefined,
+            // bigbench_extra_hard: undefined,
+            // global_mmlu_lite: undefined,
+            // facts_grounding: undefined,
+            // humanitys_last_exam: undefined,
+            mrcr_v2_avg_128k: 57.2,
+            mrcr_v2_pointwise_1m: 46.3,
         },
     },
     {
         provider: "OpenAI",
         inputPrice: 0.4,
         outputPrice: 1.6,
+        source: "https://openai.com/index/gpt-4-1/",
         benchmark: {
             mmlu: 87.5,
             gpqa: 65.0,
+            gpqa_diamond: 65.0,
             humaneval: 93.8,
             simpleqa: 16.8,
+            swe_bench_verified: 23.6,
+            aider_polyglot: 31.6,
+            mmmlu: 87.5,
+            aime_24: 49.6,
+            mrcr_v2_avg_128k: 47.2,
+            mrcr_v2_pointwise_1m: 33.3,
+            // video_mme: undefined,
         },
     },
     {
         provider: "OpenAI",
         inputPrice: 0.1,
         outputPrice: 0.4,
+        source: "https://openai.com/index/gpt-4-1/",
         benchmark: {
             mmlu: 80.1,
             gpqa: 50.3,
+            gpqa_diamond: 50.3,
             humaneval: 87.0,
             simpleqa: 7.6,
+            swe_bench_verified: 9.8,
+            aider_polyglot: 6.2,
+            mmmlu: 80.1,
+            aime_24: 29.4,
+            mrcr_v2_avg_128k: 36.6,
+            mrcr_v2_pointwise_1m: 12.0,
+            // video_mme: undefined,
         },
     },
     {
         model: "GPT-4.5-preview-2025-02-27",
         provider: "OpenAI",
             // drop: 83.2,
         },
     },
+    {
+        model: "OpenAI o3",
+        provider: "OpenAI",
+        inputPrice: 2.0,
+        outputPrice: 8.0,
+        source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
+        benchmark: {
+            aime_24: 91.6, // "o3 (no tools)"
+            aime_2025: 88.9, // "o3 (no tools)"
+            //codeforces: 2706, // "o3 (with terminal)"
+            gpqa_diamond: 83.3, // "o3 (no tools)"
+            humanitys_last_exam: 20.32, // "o3 (no tools)"
+            mmmu: 82.9,
+            //mathvista: 86.8,
+            //charxiv_reasoning: 78.6,
+            //swe_lancer_ic_swe_diamond: 65250, // "o3-high"
+            swe_bench_verified: 69.1,
+            aider_polyglot: 81.3, // "(whole)"
+            //scale_multichallenge: 56.51,
+            //browsecomp: 8.35, // "o3 with python +browsing*"
+            //tau_bench: 52.0, // "(Airline)"
+            //  tau_bench_retail: 73.9, // "(Retail)"
+        },
+    },
+    {
+        model: "OpenAI o3-pro",
+        provider: "OpenAI",
+        inputPrice: 20.0,
+        outputPrice: 80.0,
+        source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
+        benchmark: {
+            // Benchmarks for o3-pro are not explicitly listed, but it's described as "designed to think longer and provide the most reliable responses."
+            // Assuming similar or slightly better performance than o3 in relevant areas.
+            gpqa_diamond: 83.3, // Placeholder, likely similar or slightly better than o3
+            humanitys_last_exam: 24.90, // "o3 (python + browsing**tools)" - this is likely the "pro" version's capability
+        },
+    },
+    {
+        model: "OpenAI o4-mini",
+        provider: "OpenAI",
+        inputPrice: 1.10,
+        outputPrice: 4.40,
+        source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
+        benchmark: {
+            aime_24: 93.4, // "o4-mini (no tools)"
+            aime_2025: 92.7, // "o4-mini (no tools)"
+            //codeforces: 2719, // "o4-mini (with terminal)"
+            gpqa_diamond: 81.4, // "o4-mini (no tools)"
+            humanitys_last_exam: 14.28, // "o4-mini (no tools)"
+            mmmu: 81.6,
+            //mathvista: 84.3,
+            //charxiv_reasoning: 72.0,
+            //swe_lancer_ic_swe_diamond: 56375, // "o4-mini-high"
+            swe_bench_verified: 68.1,
+            aider_polyglot: 68.9, // "(whole)"
+            //scale_multichallenge: 42.99,
+            //browsecomp: 1.5, // "o4-mini with python +browsing** tools"
+            //tau_bench: 49.2, // "(Airline)"
+            //tau_bench_retail: 71.8, // "(Retail)"
+        },
+    },
+    {
+        model: "OpenAI o1",
+        provider: "OpenAI",
+        inputPrice: 15.0,
+        outputPrice: 60.0,
+        source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
+        benchmark: {
+            aime_24: 74.3,
+            aime_2025: 79.2,
+            //codeforces: 189,
+            gpqa_diamond: 78.0,
+            humanitys_last_exam: 8.12, // "o1-pro"
+            mmmu: 77.6,
+            //mathvista: 71.8,
+            //charxiv_reasoning: 55.1,
+            //swe_lancer_ic_swe_diamond: 28500, // "o1-high"
+            swe_bench_verified: 48.9,
+            aider_polyglot: 64.4, // "(whole)"
+            //scale_multichallenge: 44.93,
+            //browsecomp: 1.94, // "4o + browsing" - this seems to be a typo in the source, likely refers to o1's browsing capability
+            //tau_bench: 50.0, // "(Airline)"
+            //tau_bench_retail: 70.8, // "(Retail)"
+        },
+    },
+    {
+        model: "OpenAI o3-mini",
+        provider: "OpenAI",
+        inputPrice: 1.10,
+        outputPrice: 4.40,
+        source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
+        benchmark: {
+            aime_24: 87.3,
+            aime_2025: 86.5,
+            //codeforces: 1207,
+            gpqa_diamond: 77.0,
+            humanitys_last_exam: 13.40,
+            // MMMU, MathVista, CharXiv-Reasoning not explicitly listed for o3-mini, assuming lower than o4-mini
+            //swe_lancer_ic_swe_diamond: 17375, // "o3-mini-high"
+            swe_bench_verified: 49.3,
+            aider_polyglot: 61.7, // "(diff)"
+            //scale_multichallenge: 39.89,
+            // BrowseComp not explicitly listed for o3-mini
+            //tau_bench: 32.4, // "(Airline)"
+            //tau_bench_retail: 57.6, // "(Retail)"
+        },
+    },
 ];