Spaces:
Runtime error
Runtime error
File size: 4,702 Bytes
317a31a 0966925 317a31a 959633d 0fe9daa 317a31a 395e379 317a31a 959633d 317a31a 395e379 0966925 317a31a 395e379 317a31a 0966925 317a31a 0966925 317a31a 0966925 317a31a 0966925 317a31a 0966925 317a31a 0966925 317a31a 0966925 317a31a 0966925 317a31a 0966925 317a31a 0966925 317a31a 0966925 0fe9daa 0966925 0fe9daa 0966925 0fe9daa 317a31a 47d9b67 0966925 47d9b67 317a31a 0966925 317a31a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import time
import json
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from tqdm import tqdm
# Models to benchmark (Hugging Face Hub model ids).
# NOTE(review): the original comments and prompt strings were Korean but were
# corrupted by an encoding round-trip before this review; the runtime string
# bytes are kept exactly as found. Only comments were rewritten in English and
# line breaks that had split string literals / comments were repaired.
MODELS = [
    "naver-clova/HyperCLOVA-X-Seed-3B",
    "NousResearch/Hermes-3-Llama-3.2-3B",
    "tiiuae/Falcon-7B-Instruct",
    "openchat/openchat-3.5-0106",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "Qwen/Qwen2.5-7B-Instruct",
    "deepseek-ai/deepseek-coder-6.7b-instruct",
]
# Hugging Face access token (optional; supply via the HF_TOKEN env var).
HF_TOKEN = os.getenv("HF_TOKEN", None)
# Test scenarios: each case has a display name, the prompt sent to the model,
# and the keywords whose presence in the output counts toward the quality score.
TESTS = [
    {
        "name": "μμ½",
        "prompt": "λ€μ λ¬Έμ₯μ ν μ€λ‘ μμ½ν΄ μ£ΌμΈμ. μΆκ° μ 보λ μ λ λ£μ§ λ§μΈμ:\nμΈκ³΅μ§λ₯μ λ€μν μ°μμμ νμ μ μ΄λκ³ μλ€.",
        "keywords": ["νμ ", "μ°μ"]
    },
    {
        "name": "QA",
        "prompt": "λ¬μ μ€λ ₯μ μ§κ΅¬μ λͺ λΆμ λͺμΈκ°μ? κ°λ¨ν μ«μλ‘λ§ λ΅νμΈμ.",
        "keywords": ["6λΆμ1", "1/6"]
    },
    {
        "name": "μ½λ",
        "prompt": "νμ΄μ¬μΌλ‘ νΌλ³΄λμΉ μμ΄μ μΆλ ₯νλ ν¨μλ₯Ό μμ±ν΄ μ£ΌμΈμ. λΆνμν μ€λͺμμ΄ μ½λλ§ μ£ΌμΈμ.",
        "keywords": ["def", "fibonacci"]
    }
]
def load_pipeline(model_id):
    """Load *model_id* as a CPU-only causal LM.

    Returns a ``(pipeline, model, tokenizer)`` triple so the caller can both
    generate text and explicitly free the model/tokenizer afterwards.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        model_id, token=HF_TOKEN, trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
        token=HF_TOKEN,
        trust_remote_code=True,
    )
    # Force CPU placement; device=-1 below keeps the pipeline on CPU too.
    model = model.to("cpu")
    generator = pipeline(
        "text-generation", model=model, tokenizer=tokenizer, device=-1
    )
    return generator, model, tokenizer
def run_generation(pipe, prompt, max_new_tokens=128):
    """Generate a completion for *prompt* with greedy decoding.

    Args:
        pipe: a text-generation pipeline (or any callable with the same
            signature) returning ``[{"generated_text": ...}, ...]``.
        prompt: the input text.
        max_new_tokens: generation length cap (default 128).

    Returns:
        The ``generated_text`` of the first candidate.

    Fix: the original also passed ``temperature=0.0``. With ``do_sample=False``
    greedy decoding ignores temperature entirely, and recent transformers
    releases reject non-positive temperatures — so the argument is dropped
    (behavior unchanged, warning/error avoided).
    """
    outputs = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        repetition_penalty=1.1,
    )
    return outputs[0]["generated_text"]
def quality_score(output, keywords):
    """Return how many of *keywords* occur as substrings of *output*."""
    hits = 0
    for keyword in keywords:
        if keyword in output:
            hits += 1
    return hits
def benchmark_model(model_id):
    """Run every scenario in TESTS against one model and collect timings/scores.

    Returns a dict with load/total times, the summed quality score and the
    per-test results. If the model fails to load, returns only
    ``{"model", "error"}``.
    """
    started = time.time()
    try:
        pipe, mdl, tok = load_pipeline(model_id)
    except Exception as exc:
        return {"model": model_id, "error": f"load_error: {exc}"}
    load_secs = round(time.time() - started, 2)

    per_test = []
    quality_sum = 0
    for case in TESTS:
        try:
            tick = time.time()
            text = run_generation(pipe, case["prompt"])
            gen_secs = round(time.time() - tick, 2)
            score = quality_score(text, case["keywords"])
            quality_sum += score
            entry = {
                "test_name": case["name"],
                "time_sec": gen_secs,
                "quality_score": score,
                "output": text.strip(),
            }
        except Exception as exc:
            # Generation failure for one scenario: record it, keep going.
            entry = {
                "test_name": case["name"],
                "time_sec": None,
                "quality_score": 0,
                "output": "",
                "error": f"gen_error: {exc}",
            }
        per_test.append(entry)

    wall_secs = round(time.time() - started, 2)
    # Drop references so the (large) model can be garbage-collected before
    # the next one is loaded.
    del mdl, tok, pipe
    return {
        "model": model_id,
        "load_time_sec": load_secs,
        "total_time_sec": wall_secs,
        "total_quality_score": quality_sum,
        "tests": per_test,
    }
def main():
    """Benchmark every model in MODELS, print a live report, persist the raw
    results to benchmark_results.json, and print a top-3 ranking.

    NOTE(review): the progress-bar description and report labels are Korean
    text that was mangled by an encoding round-trip upstream of this review;
    they are reproduced verbatim — do not "fix" them without the original file.
    """
    results = []
    # One progress-bar tick per model; each iteration loads, runs and frees a
    # full model, so ticks are slow.
    with tqdm(total=len(MODELS), desc="λͺ¨λΈ λ²€μΉλ§ν¬ μ§ν", unit="model") as pbar:
        for m in MODELS:
            res = benchmark_model(m)
            results.append(res)
            if "error" in res:
                # Load failure: the result dict has only "model" and "error".
                print(f"\nβ {m} Error: {res['error']}")
            else:
                print(f"\n=== {m} ===")
                print(f"β± λ‘λ {res['load_time_sec']}s | μ΄ {res['total_time_sec']}s | νμ§ν© {res['total_quality_score']}")
                for t in res["tests"]:
                    if "error" in t:
                        print(f" - {t['test_name']}: ERROR {t['error']}")
                    else:
                        print(f" - {t['test_name']}: {t['time_sec']}s | νμ§ {t['quality_score']}")
                        # First 80 characters of the generation as a preview.
                        print(f" {t['output'][:80]}...")
            pbar.update(1)
    # Persist everything; ensure_ascii=False keeps non-ASCII text readable.
    with open("benchmark_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    # Rank only models that loaded: quality descending, then total time ascending.
    valid = [r for r in results if "error" not in r]
    ranked = sorted(valid, key=lambda x: (-x["total_quality_score"], x["total_time_sec"]))
    print("\nπ μΆμ² TOP 3")
    for r in ranked[:3]:
        print(f"{r['model']} | νμ§ {r['total_quality_score']} | μ΄ {r['total_time_sec']}s")
# Script entry point. Fix: removed a stray trailing "|" (extraction artifact)
# that made the file a syntax error.
if __name__ == "__main__":
    main()