orgoflu committed on
Commit
0966925
Β·
verified Β·
1 Parent(s): 1ce904a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -81
app.py CHANGED
@@ -1,9 +1,9 @@
1
  import time
2
  import json
 
3
  import torch
4
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
5
  from tqdm import tqdm
6
- import matplotlib.pyplot as plt
7
 
8
  MODELS = [
9
  "naver-clova/HyperCLOVA-X-Seed-3B",
@@ -15,103 +15,99 @@ MODELS = [
15
  "deepseek-ai/deepseek-coder-6.7b-instruct"
16
  ]
17
 
18
- HF_TOKEN = None # "hf_xxx" ν•„μš” μ‹œ μž…λ ₯
19
 
20
  TESTS = [
21
  {
22
  "name": "μš”μ•½",
23
- "prompt": "λ‹€μŒ λ¬Έμž₯을 ν•œ μ€„λ‘œ μš”μ•½ν•΄ μ£Όμ„Έμš”: 인곡지λŠ₯은 λ‹€μ–‘ν•œ μ‚°μ—…μ—μ„œ ν˜μ‹ μ„ 이끌고 μžˆλ‹€.",
24
  "keywords": ["ν˜μ‹ ", "μ‚°μ—…"]
25
  },
26
  {
27
  "name": "QA",
28
- "prompt": "λ‹¬μ˜ 쀑λ ₯은 μ§€κ΅¬μ˜ λͺ‡ λΆ„μ˜ λͺ‡μΈκ°€μš”?",
29
  "keywords": ["6λΆ„μ˜1", "1/6"]
30
  },
31
  {
32
  "name": "μ½”λ“œ",
33
- "prompt": "파이썬으둜 ν”Όλ³΄λ‚˜μΉ˜ μˆ˜μ—΄μ„ 좜λ ₯ν•˜λŠ” ν•¨μˆ˜λ₯Ό μž‘μ„±ν•΄ μ£Όμ„Έμš”.",
34
  "keywords": ["def", "fibonacci"]
35
  }
36
  ]
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  def benchmark_model(model_id):
 
39
  try:
40
- start_time = time.time()
41
- tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
42
- model = AutoModelForCausalLM.from_pretrained(
43
- model_id,
44
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
45
- device_map="auto",
46
- low_cpu_mem_usage=True,
47
- token=HF_TOKEN
48
- )
49
- pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
50
-
51
- elapsed_load = round(time.time() - start_time, 2)
52
- test_results = []
53
- total_quality = 0
54
-
55
- for test in TESTS:
56
  t_start = time.time()
57
- output = pipe(test["prompt"], max_new_tokens=100, do_sample=False)[0]['generated_text']
58
  t_elapsed = round(time.time() - t_start, 2)
59
- quality_score = sum(1 for kw in test["keywords"] if kw in output)
60
- total_quality += quality_score
61
  test_results.append({
62
  "test_name": test["name"],
63
  "time_sec": t_elapsed,
64
- "quality_score": quality_score,
65
  "output": output.strip()
66
  })
 
 
 
 
 
 
 
 
67
 
68
- total_time = round(time.time() - start_time, 2)
69
-
70
- # λ©”λͺ¨λ¦¬ ν•΄μ œ
71
- del model
72
- del tokenizer
73
- del pipe
74
- torch.cuda.empty_cache()
75
-
76
- return {
77
- "model": model_id,
78
- "load_time_sec": elapsed_load,
79
- "total_time_sec": total_time,
80
- "total_quality_score": total_quality,
81
- "tests": test_results
82
- }
83
- except Exception as e:
84
- torch.cuda.empty_cache()
85
- return {"model": model_id, "error": str(e)}
86
-
87
- def visualize_results(results):
88
- valid_results = [r for r in results if "error" not in r]
89
- models = [r["model"] for r in valid_results]
90
- qualities = [r["total_quality_score"] for r in valid_results]
91
- times = [r["total_time_sec"] for r in valid_results]
92
-
93
- fig, ax1 = plt.subplots()
94
-
95
- ax1.set_xlabel("λͺ¨λΈ")
96
- ax1.set_ylabel("ν’ˆμ§ˆ 점수", color="tab:blue")
97
- ax1.bar(models, qualities, color="tab:blue", alpha=0.6, label="ν’ˆμ§ˆ 점수")
98
- ax1.tick_params(axis="y", labelcolor="tab:blue")
99
- plt.xticks(rotation=45, ha="right")
100
 
101
- ax2 = ax1.twinx()
102
- ax2.set_ylabel("총 μ‹œκ°„(초)", color="tab:red")
103
- ax2.plot(models, times, color="tab:red", marker="o", label="총 μ‹œκ°„")
104
- ax2.tick_params(axis="y", labelcolor="tab:red")
105
 
106
- fig.tight_layout()
107
- plt.title("LLM 벀치마크 결과")
108
- plt.show()
 
 
 
 
109
 
110
  def main():
111
  results = []
112
  with tqdm(total=len(MODELS), desc="λͺ¨λΈ 벀치마크 μ§„ν–‰", unit="model") as pbar:
113
  for m in MODELS:
114
- start_model_time = time.time()
115
  res = benchmark_model(m)
116
  results.append(res)
117
 
@@ -121,29 +117,22 @@ def main():
121
  print(f"\n=== {m} ===")
122
  print(f"⏱ λ‘œλ“œ {res['load_time_sec']}s | 총 {res['total_time_sec']}s | ν’ˆμ§ˆν•© {res['total_quality_score']}")
123
  for t in res["tests"]:
124
- print(f" - {t['test_name']}: {t['time_sec']}s | ν’ˆμ§ˆ {t['quality_score']}")
125
- print(f" {t['output'][:80]}...")
126
-
127
- # ETA 계산
128
- avg_time = sum(r['total_time_sec'] for r in results if "error" not in r) / max(1, len([r for r in results if "error" not in r]))
129
- remaining_models = len(MODELS) - pbar.n - 1
130
- eta_seconds = int(avg_time * remaining_models)
131
- eta_min, eta_sec = divmod(eta_seconds, 60)
132
- pbar.set_postfix({"ETA": f"{eta_min}m {eta_sec}s"})
133
  pbar.update(1)
134
 
135
  with open("benchmark_results.json", "w", encoding="utf-8") as f:
136
  json.dump(results, f, ensure_ascii=False, indent=2)
137
 
138
- valid_results = [r for r in results if "error" not in r]
139
- ranked = sorted(valid_results, key=lambda x: (-x["total_quality_score"], x["total_time_sec"]))
140
-
141
- print("\nπŸ† μΆ”μ²œ λͺ¨λΈ TOP:")
142
  for r in ranked[:3]:
143
  print(f"{r['model']} | ν’ˆμ§ˆ {r['total_quality_score']} | 총 {r['total_time_sec']}s")
144
 
145
- # μ‹œκ°ν™”
146
- visualize_results(results)
147
-
148
  if __name__ == "__main__":
149
  main()
 
1
  import time
2
  import json
3
+ import os
4
  import torch
5
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
6
  from tqdm import tqdm
 
7
 
8
  MODELS = [
9
  "naver-clova/HyperCLOVA-X-Seed-3B",
 
15
  "deepseek-ai/deepseek-coder-6.7b-instruct"
16
  ]
17
 
18
# Hugging Face access token, read from the environment (None when unset);
# passed through to from_pretrained for gated/private models.
HF_TOKEN = os.getenv("HF_TOKEN", None)

# Benchmark cases. Each entry pairs a Korean-language prompt with the
# keywords whose presence in the model output counts toward its quality
# score (simple substring matching, see quality_score).
TESTS = [
    {
        "name": "μš”μ•½",
        "prompt": "λ‹€μŒ λ¬Έμž₯을 ν•œ μ€„λ‘œ μš”μ•½ν•΄ μ£Όμ„Έμš”. μΆ”κ°€ μ •λ³΄λŠ” μ ˆλŒ€ λ„£μ§€ λ§ˆμ„Έμš”:\n인곡지λŠ₯은 λ‹€μ–‘ν•œ μ‚°μ—…μ—μ„œ ν˜μ‹ μ„ 이끌고 μžˆλ‹€.",
        "keywords": ["ν˜μ‹ ", "μ‚°μ—…"]
    },
    {
        "name": "QA",
        "prompt": "λ‹¬μ˜ 쀑λ ₯은 μ§€κ΅¬μ˜ λͺ‡ λΆ„μ˜ λͺ‡μΈκ°€μš”? κ°„λ‹¨νžˆ 숫자둜만 λ‹΅ν•˜μ„Έμš”.",
        "keywords": ["6λΆ„μ˜1", "1/6"]
    },
    {
        "name": "μ½”λ“œ",
        "prompt": "파이썬으둜 ν”Όλ³΄λ‚˜μΉ˜ μˆ˜μ—΄μ„ 좜λ ₯ν•˜λŠ” ν•¨μˆ˜λ₯Ό μž‘μ„±ν•΄ μ£Όμ„Έμš”. λΆˆν•„μš”ν•œ μ„€λͺ… 없이 μ½”λ“œλ§Œ μ£Όμ„Έμš”.",
        "keywords": ["def", "fibonacci"]
    }
]
37
 
38
def load_pipeline(model_id):
    """Load tokenizer and model for *model_id* and wrap them in a CPU
    text-generation pipeline.

    Returns a (pipeline, model, tokenizer) triple so the caller can
    explicitly delete the heavyweight objects when finished with them.
    Raises whatever from_pretrained raises (caller handles load errors).
    """
    tok = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
    mdl = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float32,  # CPU inference: keep full precision
        low_cpu_mem_usage=True,
        token=HF_TOKEN,
        trust_remote_code=True
    ).to("cpu")
    # device=-1 keeps the pipeline on CPU, matching the .to("cpu") above.
    return pipeline("text-generation", model=mdl, tokenizer=tok, device=-1), mdl, tok
48
+
49
def run_generation(pipe, prompt, max_new_tokens=128):
    """Generate text for *prompt* with greedy decoding and return it.

    do_sample=False selects deterministic greedy decoding. The original
    code also passed temperature=0.0, but transformers ignores (and warns
    about) sampling parameters in greedy mode and rejects 0.0 as a
    temperature value, so it is deliberately omitted — greedy output is
    unchanged.
    """
    outputs = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        repetition_penalty=1.1  # mild guard against degenerate repetition
    )
    return outputs[0]["generated_text"]
58
+
59
def quality_score(output, keywords):
    """Count how many of *keywords* occur as substrings of *output*."""
    return sum(1 for kw in keywords if kw in output)
61
+
62
def benchmark_model(model_id):
    """Benchmark a single model over all TESTS on CPU.

    Returns a dict with load/total wall-clock times, the summed quality
    score, and per-test results. If the model cannot be loaded, returns
    {"model": ..., "error": "load_error: ..."} instead. A failure in one
    generation is recorded in that test's entry rather than aborting the
    remaining tests.
    """
    start_time = time.time()
    try:
        pipe, model, tokenizer = load_pipeline(model_id)
    except Exception as e:
        return {"model": model_id, "error": f"load_error: {e}"}

    elapsed_load = round(time.time() - start_time, 2)
    test_results = []
    total_quality = 0

    for test in TESTS:
        try:
            t_start = time.time()
            output = run_generation(pipe, test["prompt"])
            t_elapsed = round(time.time() - t_start, 2)
            q = quality_score(output, test["keywords"])
            total_quality += q
            test_results.append({
                "test_name": test["name"],
                "time_sec": t_elapsed,
                "quality_score": q,
                "output": output.strip()
            })
        except Exception as e:
            # Record the failure but keep benchmarking the other prompts.
            test_results.append({
                "test_name": test["name"],
                "time_sec": None,
                "quality_score": 0,
                "output": "",
                "error": f"gen_error: {e}"
            })

    total_time = round(time.time() - start_time, 2)

    # Drop the references so the next model's weights do not stack on top
    # of this one in memory.
    del model, tokenizer, pipe

    return {
        "model": model_id,
        "load_time_sec": elapsed_load,
        "total_time_sec": total_time,
        "total_quality_score": total_quality,
        "tests": test_results
    }
106
 
107
def main():
    """Benchmark every model in MODELS, print a per-model report, dump
    all results to benchmark_results.json, and print the top-3 models
    ranked by quality (desc) then total time (asc)."""
    results = []
    with tqdm(total=len(MODELS), desc="λͺ¨λΈ 벀치마크 μ§„ν–‰", unit="model") as pbar:
        for m in MODELS:
            res = benchmark_model(m)
            results.append(res)

            # NOTE(review): the source diff omits a few lines here (new-file
            # lines 114-116). An error guard is reconstructed because the
            # success prints below would KeyError on a load-failure result
            # ({"model", "error"}) — confirm against the full file.
            if "error" in res:
                print(f"\n=== {m} ===")
                print(f"ERROR: {res['error']}")
            else:
                print(f"\n=== {m} ===")
                print(f"⏱ λ‘œλ“œ {res['load_time_sec']}s | 총 {res['total_time_sec']}s | ν’ˆμ§ˆν•© {res['total_quality_score']}")
                for t in res["tests"]:
                    if "error" in t:
                        print(f" - {t['test_name']}: ERROR {t['error']}")
                    else:
                        print(f" - {t['test_name']}: {t['time_sec']}s | ν’ˆμ§ˆ {t['quality_score']}")
                        print(f" {t['output'][:80]}...")

            pbar.update(1)

    with open("benchmark_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    # Rank successful runs: highest quality first, ties broken by speed.
    valid = [r for r in results if "error" not in r]
    ranked = sorted(valid, key=lambda x: (-x["total_quality_score"], x["total_time_sec"]))
    print("\nπŸ† μΆ”μ²œ TOP 3")
    for r in ranked[:3]:
        print(f"{r['model']} | ν’ˆμ§ˆ {r['total_quality_score']} | 총 {r['total_time_sec']}s")

if __name__ == "__main__":
    main()