Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
import time
|
| 2 |
import json
|
|
|
|
| 3 |
import torch
|
| 4 |
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
|
| 5 |
from tqdm import tqdm
|
| 6 |
-
import matplotlib.pyplot as plt
|
| 7 |
|
| 8 |
MODELS = [
|
| 9 |
"naver-clova/HyperCLOVA-X-Seed-3B",
|
|
@@ -15,103 +15,99 @@ MODELS = [
|
|
| 15 |
"deepseek-ai/deepseek-coder-6.7b-instruct"
|
| 16 |
]
|
| 17 |
|
| 18 |
-
HF_TOKEN =
|
| 19 |
|
| 20 |
TESTS = [
|
| 21 |
{
|
| 22 |
"name": "μμ½",
|
| 23 |
-
"prompt": "λ€μ λ¬Έμ₯μ ν μ€λ‘ μμ½ν΄
|
| 24 |
"keywords": ["νμ ", "μ°μ
"]
|
| 25 |
},
|
| 26 |
{
|
| 27 |
"name": "QA",
|
| 28 |
-
"prompt": "λ¬μ μ€λ ₯μ μ§κ΅¬μ λͺ λΆμ λͺμΈκ°μ?",
|
| 29 |
"keywords": ["6λΆμ1", "1/6"]
|
| 30 |
},
|
| 31 |
{
|
| 32 |
"name": "μ½λ",
|
| 33 |
-
"prompt": "νμ΄μ¬μΌλ‘ νΌλ³΄λμΉ μμ΄μ μΆλ ₯νλ ν¨μλ₯Ό μμ±ν΄ μ£ΌμΈμ.",
|
| 34 |
"keywords": ["def", "fibonacci"]
|
| 35 |
}
|
| 36 |
]
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
def benchmark_model(model_id):
|
|
|
|
| 39 |
try:
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
model
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
elapsed_load = round(time.time() - start_time, 2)
|
| 52 |
-
test_results = []
|
| 53 |
-
total_quality = 0
|
| 54 |
-
|
| 55 |
-
for test in TESTS:
|
| 56 |
t_start = time.time()
|
| 57 |
-
output = pipe
|
| 58 |
t_elapsed = round(time.time() - t_start, 2)
|
| 59 |
-
|
| 60 |
-
total_quality +=
|
| 61 |
test_results.append({
|
| 62 |
"test_name": test["name"],
|
| 63 |
"time_sec": t_elapsed,
|
| 64 |
-
"quality_score":
|
| 65 |
"output": output.strip()
|
| 66 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
# λ©λͺ¨λ¦¬ ν΄μ
|
| 71 |
-
del model
|
| 72 |
-
del tokenizer
|
| 73 |
-
del pipe
|
| 74 |
-
torch.cuda.empty_cache()
|
| 75 |
-
|
| 76 |
-
return {
|
| 77 |
-
"model": model_id,
|
| 78 |
-
"load_time_sec": elapsed_load,
|
| 79 |
-
"total_time_sec": total_time,
|
| 80 |
-
"total_quality_score": total_quality,
|
| 81 |
-
"tests": test_results
|
| 82 |
-
}
|
| 83 |
-
except Exception as e:
|
| 84 |
-
torch.cuda.empty_cache()
|
| 85 |
-
return {"model": model_id, "error": str(e)}
|
| 86 |
-
|
| 87 |
-
def visualize_results(results):
|
| 88 |
-
valid_results = [r for r in results if "error" not in r]
|
| 89 |
-
models = [r["model"] for r in valid_results]
|
| 90 |
-
qualities = [r["total_quality_score"] for r in valid_results]
|
| 91 |
-
times = [r["total_time_sec"] for r in valid_results]
|
| 92 |
-
|
| 93 |
-
fig, ax1 = plt.subplots()
|
| 94 |
-
|
| 95 |
-
ax1.set_xlabel("λͺ¨λΈ")
|
| 96 |
-
ax1.set_ylabel("νμ§ μ μ", color="tab:blue")
|
| 97 |
-
ax1.bar(models, qualities, color="tab:blue", alpha=0.6, label="νμ§ μ μ")
|
| 98 |
-
ax1.tick_params(axis="y", labelcolor="tab:blue")
|
| 99 |
-
plt.xticks(rotation=45, ha="right")
|
| 100 |
|
| 101 |
-
|
| 102 |
-
ax2.set_ylabel("μ΄ μκ°(μ΄)", color="tab:red")
|
| 103 |
-
ax2.plot(models, times, color="tab:red", marker="o", label="μ΄ μκ°")
|
| 104 |
-
ax2.tick_params(axis="y", labelcolor="tab:red")
|
| 105 |
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
def main():
|
| 111 |
results = []
|
| 112 |
with tqdm(total=len(MODELS), desc="λͺ¨λΈ λ²€μΉλ§ν¬ μ§ν", unit="model") as pbar:
|
| 113 |
for m in MODELS:
|
| 114 |
-
start_model_time = time.time()
|
| 115 |
res = benchmark_model(m)
|
| 116 |
results.append(res)
|
| 117 |
|
|
@@ -121,29 +117,22 @@ def main():
|
|
| 121 |
print(f"\n=== {m} ===")
|
| 122 |
print(f"β± λ‘λ {res['load_time_sec']}s | μ΄ {res['total_time_sec']}s | νμ§ν© {res['total_quality_score']}")
|
| 123 |
for t in res["tests"]:
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
eta_seconds = int(avg_time * remaining_models)
|
| 131 |
-
eta_min, eta_sec = divmod(eta_seconds, 60)
|
| 132 |
-
pbar.set_postfix({"ETA": f"{eta_min}m {eta_sec}s"})
|
| 133 |
pbar.update(1)
|
| 134 |
|
| 135 |
with open("benchmark_results.json", "w", encoding="utf-8") as f:
|
| 136 |
json.dump(results, f, ensure_ascii=False, indent=2)
|
| 137 |
|
| 138 |
-
|
| 139 |
-
ranked = sorted(
|
| 140 |
-
|
| 141 |
-
print("\nπ μΆμ² λͺ¨λΈ TOP:")
|
| 142 |
for r in ranked[:3]:
|
| 143 |
print(f"{r['model']} | νμ§ {r['total_quality_score']} | μ΄ {r['total_time_sec']}s")
|
| 144 |
|
| 145 |
-
# μκ°ν
|
| 146 |
-
visualize_results(results)
|
| 147 |
-
|
| 148 |
if __name__ == "__main__":
|
| 149 |
main()
|
|
|
|
| 1 |
import time
|
| 2 |
import json
|
| 3 |
+
import os
|
| 4 |
import torch
|
| 5 |
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
|
| 6 |
from tqdm import tqdm
|
|
|
|
| 7 |
|
| 8 |
MODELS = [
|
| 9 |
"naver-clova/HyperCLOVA-X-Seed-3B",
|
|
|
|
| 15 |
"deepseek-ai/deepseek-coder-6.7b-instruct"
|
| 16 |
]
|
| 17 |
|
| 18 |
+
HF_TOKEN = os.getenv("HF_TOKEN", None)
|
| 19 |
|
| 20 |
TESTS = [
|
| 21 |
{
|
| 22 |
"name": "μμ½",
|
| 23 |
+
"prompt": "λ€μ λ¬Έμ₯μ ν μ€λ‘ μμ½ν΄ μ£ΌμΈμ. μΆκ° μ 보λ μ λ λ£μ§ λ§μΈμ:\nμΈκ³΅μ§λ₯μ λ€μν μ°μ
μμ νμ μ μ΄λκ³ μλ€.",
|
| 24 |
"keywords": ["νμ ", "μ°μ
"]
|
| 25 |
},
|
| 26 |
{
|
| 27 |
"name": "QA",
|
| 28 |
+
"prompt": "λ¬μ μ€λ ₯μ μ§κ΅¬μ λͺ λΆμ λͺμΈκ°μ? κ°λ¨ν μ«μλ‘λ§ λ΅νμΈμ.",
|
| 29 |
"keywords": ["6λΆμ1", "1/6"]
|
| 30 |
},
|
| 31 |
{
|
| 32 |
"name": "μ½λ",
|
| 33 |
+
"prompt": "νμ΄μ¬μΌλ‘ νΌλ³΄λμΉ μμ΄μ μΆλ ₯νλ ν¨μλ₯Ό μμ±ν΄ μ£ΌμΈμ. λΆνμν μ€λͺ
μμ΄ μ½λλ§ μ£ΌμΈμ.",
|
| 34 |
"keywords": ["def", "fibonacci"]
|
| 35 |
}
|
| 36 |
]
|
| 37 |
|
| 38 |
+
def load_pipeline(model_id):
    """Load the tokenizer and CPU causal-LM for *model_id*, wrapped in a pipeline.

    Returns a ``(pipeline, model, tokenizer)`` triple so the caller can
    explicitly free the heavyweight objects once benchmarking is done.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        model_id, token=HF_TOKEN, trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
        token=HF_TOKEN,
        trust_remote_code=True,
    )
    model = model.to("cpu")
    # device=-1 keeps the generation pipeline on CPU as well.
    text_gen = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
    return text_gen, model, tokenizer
|
| 48 |
+
|
| 49 |
+
def run_generation(pipe, prompt, max_new_tokens=128):
    """Generate text for *prompt* with *pipe* and return the generated string.

    Greedy decoding (``do_sample=False``) keeps benchmark runs deterministic.

    NOTE(fix): the original also passed ``temperature=0.0``.  Temperature is
    ignored under greedy decoding, and recent transformers versions validate
    that temperature is strictly positive, so the argument is dropped to
    avoid warnings/errors while producing the same greedy output.
    """
    outputs = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        repetition_penalty=1.1,
    )
    # text-generation pipelines return a list of dicts with "generated_text".
    return outputs[0]["generated_text"]
|
| 58 |
+
|
| 59 |
+
def quality_score(output, keywords):
    """Count how many of the expected *keywords* appear verbatim in *output*."""
    hits = [kw for kw in keywords if kw in output]
    return len(hits)
|
| 61 |
+
|
| 62 |
def benchmark_model(model_id):
    """Run every prompt in TESTS against *model_id* and collect timing/quality.

    Returns a dict with the load time, total wall time (load included), the
    summed keyword-based quality score, and per-test details.  If the model
    fails to load, the dict contains only ``model`` and ``error`` so callers
    can detect and skip it.
    """
    start_time = time.time()
    try:
        pipe, model, tokenizer = load_pipeline(model_id)
    except Exception as e:  # gated/renamed/broken repos must not kill the run
        return {"model": model_id, "error": f"load_error: {e}"}

    elapsed_load = round(time.time() - start_time, 2)
    test_results = []
    total_quality = 0

    for test in TESTS:
        try:
            t_start = time.time()
            output = run_generation(pipe, test["prompt"])
            t_elapsed = round(time.time() - t_start, 2)
            q = quality_score(output, test["keywords"])
            total_quality += q
            test_results.append({
                "test_name": test["name"],
                "time_sec": t_elapsed,
                "quality_score": q,
                "output": output.strip()
            })
        except Exception as e:  # one failing prompt must not abort the model
            test_results.append({
                "test_name": test["name"],
                "time_sec": None,
                "quality_score": 0,
                "output": "",
                "error": f"gen_error: {e}"
            })

    # total_time deliberately includes the load phase (measured from start_time).
    total_time = round(time.time() - start_time, 2)

    # Fix: the previous revision dropped the explicit memory cleanup that the
    # original code had.  Deleting the references alone may leave multi-GB
    # tensors alive until the next gc cycle, which can OOM a Space that loads
    # several models back to back — collect immediately and flush CUDA cache
    # when a GPU is present (no-op on CPU-only hosts).
    del model, tokenizer, pipe
    import gc
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return {
        "model": model_id,
        "load_time_sec": elapsed_load,
        "total_time_sec": total_time,
        "total_quality_score": total_quality,
        "tests": test_results
    }
|
| 106 |
|
| 107 |
def main():
|
| 108 |
results = []
|
| 109 |
with tqdm(total=len(MODELS), desc="λͺ¨λΈ λ²€μΉλ§ν¬ μ§ν", unit="model") as pbar:
|
| 110 |
for m in MODELS:
|
|
|
|
| 111 |
res = benchmark_model(m)
|
| 112 |
results.append(res)
|
| 113 |
|
|
|
|
| 117 |
print(f"\n=== {m} ===")
|
| 118 |
print(f"β± λ‘λ {res['load_time_sec']}s | μ΄ {res['total_time_sec']}s | νμ§ν© {res['total_quality_score']}")
|
| 119 |
for t in res["tests"]:
|
| 120 |
+
if "error" in t:
|
| 121 |
+
print(f" - {t['test_name']}: ERROR {t['error']}")
|
| 122 |
+
else:
|
| 123 |
+
print(f" - {t['test_name']}: {t['time_sec']}s | νμ§ {t['quality_score']}")
|
| 124 |
+
print(f" {t['output'][:80]}...")
|
| 125 |
+
|
|
|
|
|
|
|
|
|
|
| 126 |
pbar.update(1)
|
| 127 |
|
| 128 |
with open("benchmark_results.json", "w", encoding="utf-8") as f:
|
| 129 |
json.dump(results, f, ensure_ascii=False, indent=2)
|
| 130 |
|
| 131 |
+
valid = [r for r in results if "error" not in r]
|
| 132 |
+
ranked = sorted(valid, key=lambda x: (-x["total_quality_score"], x["total_time_sec"]))
|
| 133 |
+
print("\nπ μΆμ² TOP 3")
|
|
|
|
| 134 |
for r in ranked[:3]:
|
| 135 |
print(f"{r['model']} | νμ§ {r['total_quality_score']} | μ΄ {r['total_time_sec']}s")
|
| 136 |
|
|
|
|
|
|
|
|
|
|
| 137 |
if __name__ == "__main__":
|
| 138 |
main()
|