| import json |
| import urllib.request |
| import csv |
| import time |
| import os |
|
|
# Benchmark prompts keyed by their category label. Each category holds eight
# student-style queries; dict insertion order fixes the order of `samples`.
_QUERIES_BY_CATEGORY = {
    "confusion": (
        "Wait, why does a negative times a negative make a positive? I don't get it.",
        "I'm looking at this cell diagram and I can't tell the difference between the cell wall and the cell membrane.",
        "Our teacher said the Earth is tilted, but how does that make summer and winter? It doesn't make sense.",
        "Is a virus alive or is it not? My textbook says both and I'm really mixed up.",
        "What is the difference between a variable and a constant in algebra? I'm lost.",
        "Why does dividing by a fraction mean multiplying by its reciprocal? It seems arbitrary.",
        "What is the difference between speed and velocity? They sound like the same thing.",
        "Why is the mitochondria called the powerhouse of the cell? What does it actually do?",
    ),
    "frustration": (
        "I've tried to solve this quadratic equation three times using the formula, but I keep getting a negative under the square root!",
        "My science experiment failed again! The volcano didn't bubble at all and I did everything exactly right!",
        "This long division with decimals is taking forever and I keep getting the wrong remainder! I hate this!",
        "This word problem about two trains leaving different cities is making my head spin. I hate word problems!",
        "I don't understand how to convert grams to moles. I keep getting the wrong conversion factor and it's so frustrating!",
        "I've tried balancing this chemical equation five times and the numbers never match up!",
        "I'm trying to draw this ray diagram for a concave lens and the lines are crossing in the wrong place. I give up!",
        "This physics problem about friction has too many variables and I don't even know where to start!",
    ),
    "boredom": (
        "Ugh, why do we have to learn about sedimentary rocks? They just sit there. Who cares?",
        "This math worksheet is just 50 of the same exact addition problems. This is so boring.",
        "We are just copying definitions of different math properties from the board. This is so boring.",
        "Another lecture on the phases of mitosis... we've covered this three years in a row now.",
        "I finished all my science reading early. There's nothing else to do except stare at the wall.",
        "We have to measure the temperature of this water every two minutes for an hour. This is so tedious.",
        "Calculating the area of twenty slightly different rectangles is putting me to sleep.",
        "This lecture on cell organelles is just slides of definitions. I'm falling asleep.",
    ),
    "confidence": (
        "I totally mastered multiplying fractions! Give me a hard practice problem to try!",
        "I just derived the formula for the volume of a sphere all by myself!",
        "I know exactly how to balance any redox reaction now. Try me!",
        "I got a perfect score on the calculus midterm today! I really understand derivatives now!",
        "I can explain the entire water cycle in my sleep! Evaporation, condensation, precipitation, easy!",
        "I just solved the hardest logic puzzle in the workbook on my very first try!",
        "I can calculate the trajectory of a projectile in my head now, it's so easy!",
        "I fully understand how DNA replication works and could draw every step from memory!",
    ),
    "neutral": (
        "How do I calculate the hypotenuse of a right triangle when the sides are 3 and 4?",
        "What are the three main types of rocks found in the Earth's crust?",
        "Can you explain how photosynthesis converts sunlight into chemical energy?",
        "What is the chemical formula for photosynthesis and cellular respiration?",
        "How do you find the slope of a line from two points on a graph?",
        "What is the difference between an isotope and an ion in chemistry?",
        "Could you list the steps of the scientific method in order?",
        "What is the value of the constant pi, and how is it calculated?",
    ),
}

# Flat list of (category, query) tuples, grouped by category in the order
# declared above. This is the sequence the benchmark loop iterates over.
samples = [
    (category, query)
    for category, queries in _QUERIES_BY_CATEGORY.items()
    for query in queries
]
|
|
# Successful measurements, one dict per sample query. Failed queries are
# reported on stdout and leave no entry here.
results = []

# Chat endpoint the benchmark posts each query to.
url = "http://127.0.0.1:8000/api/chat"

# Upper bound, in seconds, on a single request. Without a timeout urlopen()
# blocks indefinitely, so one stalled response would hang the whole run.
# NOTE(review): 120 s is a guess at a generous ceiling; tune to the backend.
REQUEST_TIMEOUT_S = 120

# Response fields copied into each result row: a latency and a token count
# for each of the options "a" through "d", in the order
# latency_a, tokens_a, latency_b, tokens_b, ...
_OPTION_KEYS = ("a", "b", "c", "d")
_METRIC_FIELDS = tuple(
    f"{metric}_{option}"
    for option in _OPTION_KEYS
    for metric in ("latency", "tokens")
)


print(f"Starting benchmark for {len(samples)} sample queries...")
for idx, (cat, q) in enumerate(samples, 1):
    print(f"[{idx}/{len(samples)}] Query ({cat}): \"{q}\"")
    # Supplying `data` makes urllib issue a POST with this JSON body.
    req = urllib.request.Request(
        url,
        data=json.dumps({"message": q}).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )

    try:
        with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT_S) as response:
            res_data = json.loads(response.read().decode("utf-8"))

        # Index rather than .get(): a reply missing any expected field counts
        # as a failed query and is skipped, not recorded half-filled.
        row = {"category": cat, "query": q}
        for field in _METRIC_FIELDS:
            row[field] = res_data[field]
    except Exception as e:
        # Deliberately broad: this is the script's top-level boundary and a
        # single bad sample (network error, timeout, malformed or incomplete
        # JSON) must not abort the remaining queries. The exception type is
        # printed because some messages (e.g. KeyError) are just a bare key.
        print(f" Error processing query: {type(e).__name__}: {e}")
        continue

    results.append(row)
    done_parts = " | ".join(
        f"{option.upper()} ({row[f'latency_{option}']}s, {row[f'tokens_{option}']}t)"
        for option in _OPTION_KEYS
    )
    print(f" Done: {done_parts}")

    # Pause between successful requests; failed queries skip the pause and
    # move straight on to the next sample.
    time.sleep(1.5)
|
|
| |
# Column layout of the export as (CSV header, key in each result dict) pairs.
# Header row and data rows are both derived from this table so they cannot
# drift out of step.
_csv_columns = [
    ("Category", "category"),
    ("Query", "query"),
    ("Latency A (s)", "latency_a"),
    ("Tokens A", "tokens_a"),
    ("Latency B (s)", "latency_b"),
    ("Tokens B", "tokens_b"),
    ("Latency C (s)", "latency_c"),
    ("Tokens C", "tokens_c"),
    ("Latency D (s)", "latency_d"),
    ("Tokens D", "tokens_d"),
]

# The export is written next to this script, not into the current working
# directory.
script_dir = os.path.dirname(os.path.abspath(__file__))
csv_file = os.path.join(script_dir, "benchmark_results.csv")

# newline="" hands line-ending control to the csv module.
with open(csv_file, mode="w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow([header for header, _ in _csv_columns])
    writer.writerows(
        [r[key] for _, key in _csv_columns]
        for r in results
    )


print(f"\nResults successfully saved to: {csv_file}")
|
|
| |
def _mean_or_none(values):
    """Return the arithmetic mean of the non-None entries of *values*.

    Returns None when there is no non-None entry, so the caller can tell
    "no data" apart from a genuine average of zero.
    """
    valid = [v for v in values if v is not None]
    return sum(valid) / len(valid) if valid else None


# (result-key suffix, heading line) for every benchmarked option, in print order.
_SUMMARY_OPTIONS = (
    ("a", "Option A (Gemini LLM-Classifier):"),
    ("b", "Option B (Gemini Single-Pass):"),
    ("c", "Option C (Raw Distribution + Gemini):"),
    ("d", "Option D (Local-Classifier + Gemini):"),
)

# Print per-option averages over the collected results. Entries whose value
# is None are left out of that option's average.
num_queries = len(results)
if num_queries > 0:
    print("\n" + "=" * 50)
    print("BENCHMARK SUMMARY AVERAGES:")
    print("=" * 50)
    for option, heading in _SUMMARY_OPTIONS:
        avg_latency = _mean_or_none(r[f"latency_{option}"] for r in results)
        avg_tokens = _mean_or_none(r[f"tokens_{option}"] for r in results)

        # An option with no usable measurement is shown as "n/a" rather than
        # 0, which would read as the fastest and cheapest result.
        latency_text = "n/a" if avg_latency is None else f"{avg_latency:.3f}s"
        tokens_text = "n/a" if avg_tokens is None else f"{avg_tokens:.1f}"

        print(heading)
        print(f" - Avg Latency: {latency_text}")
        print(f" - Avg Tokens: {tokens_text}")
    print("=" * 50)
else:
    # Say so explicitly instead of ending without any summary output.
    print("\nNo successful responses were recorded; nothing to summarize.")
|
|