orgoflu committed on
Commit
317a31a
·
verified ·
1 Parent(s): 2e6f84b
Files changed (1) hide show
  1. app.py +105 -0
app.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import json
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
4
+ import torch
5
+
6
# Candidate models to benchmark. (Original comment was mojibake-encoded
# Korean; it appears to say these are public, non-gated models — TODO confirm.)
MODELS = [
    "naver-clova/HyperCLOVA-X-Seed-3B",
    "NousResearch/Hermes-3-Llama-3.2-3B",
    "tiiuae/Falcon-7B-Instruct",
    "openchat/openchat-3.5-0106",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "Qwen/Qwen2.5-7B-Instruct",
    "deepseek-ai/deepseek-coder-6.7b-instruct",
    "microsoft/Phi-3-mini-4k-instruct",
]

# Hugging Face access token; only needed when benchmarking gated repos.
# Set to a string of the form "hf_xxx" to authenticate.
HF_TOKEN = None

# Benchmark prompt suite. Each entry pairs a prompt with the keywords whose
# presence in the model output counts toward the quality score.
# NOTE(review): the prompt/keyword strings below are reproduced byte-for-byte
# from the original file, which appears to be mojibake-encoded Korean.
TESTS = [
    {"name": "์š”์•ฝ",
     "prompt": "๋‹ค์Œ ๋ฌธ์žฅ์„ ํ•œ ์ค„๋กœ ์š”์•ฝํ•ด ์ฃผ์„ธ์š”: ์ธ๊ณต์ง€๋Šฅ์€ ๋‹ค์–‘ํ•œ ์‚ฐ์—…์—์„œ ํ˜์‹ ์„ ์ด๋Œ๊ณ  ์žˆ๋‹ค.",
     "keywords": ["ํ˜์‹ ", "์‚ฐ์—…"]},
    {"name": "QA",
     "prompt": "๋‹ฌ์˜ ์ค‘๋ ฅ์€ ์ง€๊ตฌ์˜ ๋ช‡ ๋ถ„์˜ ๋ช‡์ธ๊ฐ€์š”?",
     "keywords": ["6๋ถ„์˜1", "1/6"]},
    {"name": "์ฝ”๋“œ",
     "prompt": "ํŒŒ์ด์ฌ์œผ๋กœ ํ”ผ๋ณด๋‚˜์น˜ ์ˆ˜์—ด์„ ์ถœ๋ ฅํ•˜๋Š” ํ•จ์ˆ˜๋ฅผ ์ž‘์„ฑํ•ด ์ฃผ์„ธ์š”.",
     "keywords": ["def", "fibonacci"]},
]
39
+
40
def benchmark_model(model_id):
    """Load ``model_id``, run every prompt in TESTS, and return timing/quality stats.

    Returns a dict with ``model``, ``load_time_sec``, ``total_time_sec``,
    ``total_quality_score`` and per-test entries (``tests``).  On any failure
    (gated repo, OOM, download error, ...) returns
    ``{"model": model_id, "error": str(e)}`` so the caller can continue with
    the next model.
    """
    model = tokenizer = pipe = None
    try:
        start_time = time.time()
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            # fp16 halves GPU memory; fall back to fp32 on CPU where half
            # precision is slow or unsupported.
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto",
            token=HF_TOKEN
        )
        # BUGFIX: do not pass `device=` here. The model was already placed by
        # accelerate via device_map="auto", and transformers raises a
        # ValueError when a pipeline tries to move an accelerate-dispatched
        # model to a specific device.
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

        elapsed_load = round(time.time() - start_time, 2)
        test_results = []
        total_quality = 0

        for test in TESTS:
            t_start = time.time()
            output = pipe(test["prompt"], max_new_tokens=100, do_sample=False)[0]['generated_text']
            t_elapsed = round(time.time() - t_start, 2)
            # BUGFIX: score only the model's continuation. `generated_text`
            # echoes the prompt, so matching on the full text credits keywords
            # that already appear in the prompt itself (e.g. the summary
            # test's keywords), making that score maximal for every model.
            if output.startswith(test["prompt"]):
                continuation = output[len(test["prompt"]):]
            else:
                continuation = output
            quality_score = sum(1 for kw in test["keywords"] if kw in continuation)
            total_quality += quality_score
            test_results.append({
                "test_name": test["name"],
                "time_sec": t_elapsed,
                "quality_score": quality_score,
                "output": output.strip()
            })

        total_time = round(time.time() - start_time, 2)
        return {
            "model": model_id,
            "load_time_sec": elapsed_load,
            "total_time_sec": total_time,
            "total_quality_score": total_quality,
            "tests": test_results
        }
    except Exception as e:
        # Boundary handler: report the failure instead of aborting the whole
        # benchmark run; main() prints and records the error.
        return {"model": model_id, "error": str(e)}
    finally:
        # Release model memory before the caller loads the next model; the
        # list includes several ~7B checkpoints and holding two at once in
        # fp16 can OOM a single GPU.
        del model, tokenizer, pipe
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
79
+
80
def main():
    """Benchmark every model in MODELS, persist results to JSON, print a top-3 ranking.

    Results (including per-model error entries) are written to
    ``benchmark_results.json`` after every model so a mid-run crash keeps the
    partial results instead of losing everything.
    """
    import gc  # local import: only needed for the cleanup pass below

    results = []
    for m in MODELS:
        print(f"\n=== {m} ===")
        res = benchmark_model(m)
        results.append(res)
        if "error" in res:
            print(f"โŒ Error: {res['error']}")
        else:
            print(f"โฑ ๋กœ๋“œ {res['load_time_sec']}s | ์ด {res['total_time_sec']}s | ํ’ˆ์งˆํ•ฉ {res['total_quality_score']}")
            for t in res["tests"]:
                print(f" - {t['test_name']}: {t['time_sec']}s | ํ’ˆ์งˆ {t['quality_score']}")
                print(f" {t['output'][:80]}...")

        # Reclaim memory between models: several entries are ~7B checkpoints
        # and loading them back-to-back without a cleanup pass can OOM.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Persist after every model so partial results survive a crash.
        with open("benchmark_results.json", "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

    valid_results = [r for r in results if "error" not in r]
    # Rank by quality score (descending), ties broken by total time (ascending).
    ranked = sorted(valid_results, key=lambda x: (-x["total_quality_score"], x["total_time_sec"]))

    print("\n๐Ÿ† ์ถ”์ฒœ ๋ชจ๋ธ TOP:")
    for r in ranked[:3]:
        print(f"{r['model']} | ํ’ˆ์งˆ {r['total_quality_score']} | ์ด {r['total_time_sec']}s")


if __name__ == "__main__":
    main()