File size: 8,942 Bytes
import json
import urllib.request
import csv
import time
import os
# Benchmark prompts, keyed by the category label attached to each one.
# Insertion order of the mapping (and of each list) fixes the order in which
# the queries end up in `samples`.
_QUERIES_BY_CATEGORY = {
    "confusion": [
        "Wait, why does a negative times a negative make a positive? I don't get it.",
        "I'm looking at this cell diagram and I can't tell the difference between the cell wall and the cell membrane.",
        "Our teacher said the Earth is tilted, but how does that make summer and winter? It doesn't make sense.",
        "Is a virus alive or is it not? My textbook says both and I'm really mixed up.",
        "What is the difference between a variable and a constant in algebra? I'm lost.",
        "Why does dividing by a fraction mean multiplying by its reciprocal? It seems arbitrary.",
        "What is the difference between speed and velocity? They sound like the same thing.",
        "Why is the mitochondria called the powerhouse of the cell? What does it actually do?",
    ],
    "frustration": [
        "I've tried to solve this quadratic equation three times using the formula, but I keep getting a negative under the square root!",
        "My science experiment failed again! The volcano didn't bubble at all and I did everything exactly right!",
        "This long division with decimals is taking forever and I keep getting the wrong remainder! I hate this!",
        "This word problem about two trains leaving different cities is making my head spin. I hate word problems!",
        "I don't understand how to convert grams to moles. I keep getting the wrong conversion factor and it's so frustrating!",
        "I've tried balancing this chemical equation five times and the numbers never match up!",
        "I'm trying to draw this ray diagram for a concave lens and the lines are crossing in the wrong place. I give up!",
        "This physics problem about friction has too many variables and I don't even know where to start!",
    ],
    "boredom": [
        "Ugh, why do we have to learn about sedimentary rocks? They just sit there. Who cares?",
        "This math worksheet is just 50 of the same exact addition problems. This is so boring.",
        "We are just copying definitions of different math properties from the board. This is so boring.",
        "Another lecture on the phases of mitosis... we've covered this three years in a row now.",
        "I finished all my science reading early. There's nothing else to do except stare at the wall.",
        "We have to measure the temperature of this water every two minutes for an hour. This is so tedious.",
        "Calculating the area of twenty slightly different rectangles is putting me to sleep.",
        "This lecture on cell organelles is just slides of definitions. I'm falling asleep.",
    ],
    "confidence": [
        "I totally mastered multiplying fractions! Give me a hard practice problem to try!",
        "I just derived the formula for the volume of a sphere all by myself!",
        "I know exactly how to balance any redox reaction now. Try me!",
        "I got a perfect score on the calculus midterm today! I really understand derivatives now!",
        "I can explain the entire water cycle in my sleep! Evaporation, condensation, precipitation, easy!",
        "I just solved the hardest logic puzzle in the workbook on my very first try!",
        "I can calculate the trajectory of a projectile in my head now, it's so easy!",
        "I fully understand how DNA replication works and could draw every step from memory!",
    ],
    "neutral": [
        "How do I calculate the hypotenuse of a right triangle when the sides are 3 and 4?",
        "What are the three main types of rocks found in the Earth's crust?",
        "Can you explain how photosynthesis converts sunlight into chemical energy?",
        "What is the chemical formula for photosynthesis and cellular respiration?",
        "How do you find the slope of a line from two points on a graph?",
        "What is the difference between an isotope and an ion in chemistry?",
        "Could you list the steps of the scientific method in order?",
        "What is the value of the constant pi, and how is it calculated?",
    ],
}

# Flat list of (category, query) tuples consumed by the benchmark loop.
samples = [
    (category, query)
    for category, queries in _QUERIES_BY_CATEGORY.items()
    for query in queries
]
# One dict per query that was answered and parsed successfully; failed
# queries are reported on stdout and leave no entry here.
results = []
url = "http://127.0.0.1:8000/api/chat"

# urlopen() has no timeout by default, so a stalled server would hang the
# whole benchmark. Bound every request; a timeout is reported like any other
# per-query error and the run continues with the next sample.
request_timeout = 120  # seconds

# Fields copied verbatim from the JSON response into each result row.
metric_keys = (
    "latency_a", "tokens_a",
    "latency_b", "tokens_b",
    "latency_c", "tokens_c",
    "latency_d", "tokens_d",
)

print(f"Starting benchmark for {len(samples)} sample queries...")
for idx, (cat, q) in enumerate(samples, 1):
    print(f"[{idx}/{len(samples)}] Query ({cat}): \"{q}\"")
    # Supplying a request body makes urllib send this as a POST.
    req_data = json.dumps({"message": q}).encode('utf-8')
    req = urllib.request.Request(
        url,
        data=req_data,
        headers={'Content-Type': 'application/json'}
    )
    try:
        with urllib.request.urlopen(req, timeout=request_timeout) as response:
            res_data = json.loads(response.read().decode('utf-8'))
        # Record result (a missing field raises KeyError and is reported below)
        row = {"category": cat, "query": q}
        row.update({key: res_data[key] for key in metric_keys})
        results.append(row)
        print(f" Done: A ({res_data['latency_a']}s, {res_data['tokens_a']}t) | B ({res_data['latency_b']}s, {res_data['tokens_b']}t) | C ({res_data['latency_c']}s, {res_data['tokens_c']}t) | D ({res_data['latency_d']}s, {res_data['tokens_d']}t)")
        # Add a small delay between requests
        time.sleep(1.5)
    except Exception as e:
        # Best-effort boundary: one bad query must not abort the benchmark.
        print(f" Error processing query: {e}")
# Write every collected result row to benchmark_results.csv, placed in the
# same directory as this script.
script_dir = os.path.dirname(os.path.abspath(__file__))
csv_file = os.path.join(script_dir, "benchmark_results.csv")

# Column titles and the result-dict keys that feed them, in matching order.
csv_header = [
    "Category", "Query",
    "Latency A (s)", "Tokens A",
    "Latency B (s)", "Tokens B",
    "Latency C (s)", "Tokens C",
    "Latency D (s)", "Tokens D",
]
csv_fields = (
    "category", "query",
    "latency_a", "tokens_a",
    "latency_b", "tokens_b",
    "latency_c", "tokens_c",
    "latency_d", "tokens_d",
)

with open(csv_file, mode="w", newline="", encoding="utf-8") as out:
    sheet = csv.writer(out)
    sheet.writerow(csv_header)
    sheet.writerows([entry[field] for field in csv_fields] for entry in results)

print(f"\nResults successfully saved to: {csv_file}")
# Print per-option mean latency and mean token count over the collected
# results. Nothing is printed when no query succeeded.
if results:
    # Option suffix (as used in the result keys) -> label shown in the summary.
    option_labels = {
        "a": "Gemini LLM-Classifier",
        "b": "Gemini Single-Pass",
        "c": "Raw Distribution + Gemini",
        "d": "Local-Classifier + Gemini",
    }

    # Compute every average before printing anything.
    summary = []
    for suffix, label in option_labels.items():
        latency_key = f"latency_{suffix}"
        tokens_key = f"tokens_{suffix}"
        # None entries are excluded from both the sum and the count.
        latencies = [row[latency_key] for row in results if row[latency_key] is not None]
        tokens = [row[tokens_key] for row in results if row[tokens_key] is not None]
        # An option with no usable values is reported as 0.
        mean_latency = sum(latencies) / len(latencies) if latencies else 0
        mean_tokens = sum(tokens) / len(tokens) if tokens else 0
        summary.append((suffix.upper(), label, mean_latency, mean_tokens))

    rule = "=" * 50
    print("\n" + rule)
    print("BENCHMARK SUMMARY AVERAGES:")
    print(rule)
    for letter, label, mean_latency, mean_tokens in summary:
        print(f"Option {letter} ({label}):")
        print(f" - Avg Latency: {mean_latency:.3f}s")
        print(f" - Avg Tokens: {mean_tokens:.1f}")
    print(rule)
|