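"""CPU benchmark harness for a handful of small instruct LLMs.

For each model in MODELS, this script loads the weights on CPU, runs the
Korean prompts in TESTS with greedy decoding, scores each completion by
keyword presence, prints a per-model summary, and writes the full results
to benchmark_results.json.
"""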

import gc
import json
import os
import time

import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Models to benchmark
MODELS = [
"naver-clova/HyperCLOVA-X-Seed-3B",
"NousResearch/Hermes-3-Llama-3.2-3B",
"tiiuae/Falcon-7B-Instruct",
"openchat/openchat-3.5-0106",
"mistralai/Mistral-7B-Instruct-v0.3",
"Qwen/Qwen2.5-7B-Instruct",
"deepseek-ai/deepseek-coder-6.7b-instruct"
]
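# NOTE: these IDs are passed to from_pretrained verbatim; some of the repos
# may be gated on the Hub, in which case HF_TOKEN below must be set.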

# Hugging Face token (set via the HF_TOKEN environment variable if needed)
HF_TOKEN = os.getenv("HF_TOKEN", None)

# Test scenarios. The prompts are deliberately Korean (the benchmark targets
# Korean-language output), so most expected keywords are Korean too. Scoring
# is an exact substring match; see quality_score below.
TESTS = [
    {
        # "Summarize the following sentence in one line; add absolutely no extra
        #  information: Artificial intelligence is driving innovation across many industries."
        "name": "summary",
        "prompt": "λ‹€μŒ λ¬Έμž₯을 ν•œ μ€„λ‘œ μš”μ•½ν•΄ μ£Όμ„Έμš”. μΆ”κ°€ μ •λ³΄λŠ” μ ˆλŒ€ λ„£μ§€ λ§ˆμ„Έμš”:\n인곡지λŠ₯은 λ‹€μ–‘ν•œ μ‚°μ—…μ—μ„œ ν˜μ‹ μ„ 이끌고 μžˆλ‹€.",
        "keywords": ["ν˜μ‹ ", "μ‚°μ—…"]
    },
    {
        # "What fraction of Earth's gravity is the Moon's? Answer with just the number."
        "name": "QA",
        "prompt": "λ‹¬μ˜ 쀑λ ₯은 μ§€κ΅¬μ˜ λͺ‡ λΆ„μ˜ λͺ‡μΈκ°€μš”? κ°„λ‹¨νžˆ 숫자둜만 λ‹΅ν•˜μ„Έμš”.",
        "keywords": ["6λΆ„μ˜1", "1/6"]
    },
    {
        # "Write a Python function that prints the Fibonacci sequence. Code only, no explanation."
        "name": "code",
        "prompt": "파이썬으둜 ν”Όλ³΄λ‚˜μΉ˜ μˆ˜μ—΄μ„ 좜λ ₯ν•˜λŠ” ν•¨μˆ˜λ₯Ό μž‘μ„±ν•΄ μ£Όμ„Έμš”. λΆˆν•„μš”ν•œ μ„€λͺ… 없이 μ½”λ“œλ§Œ μ£Όμ„Έμš”.",
        "keywords": ["def", "fibonacci"]
    }
]

def load_pipeline(model_id):
    """Load tokenizer and model on CPU and wrap them in a text-generation pipeline."""
    tok = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
    mdl = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float32,  # full precision: slow but safe on CPU
        low_cpu_mem_usage=True,
        token=HF_TOKEN,
        trust_remote_code=True,
    ).to("cpu")
    return pipeline("text-generation", model=mdl, tokenizer=tok, device=-1), mdl, tok
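
# Usage sketch (note: a 7B model in float32 needs roughly 28 GB of RAM):
#   pipe, mdl, tok = load_pipeline(MODELS[0])
#   text = run_generation(pipe, TESTS[0]["prompt"])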

def run_generation(pipe, prompt, max_new_tokens=128):
    """Greedy-decode a completion and return only the newly generated text."""
    outputs = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=False,  # greedy decoding; temperature is ignored (and warns) when sampling is off
        repetition_penalty=1.1,
        return_full_text=False,  # score only the completion, not the echoed prompt
        pad_token_id=pipe.tokenizer.eos_token_id,  # avoid pad-token warnings on models without one
    )
    return outputs[0]["generated_text"]

def quality_score(output, keywords):
    """Count how many of the expected keywords appear verbatim in the output."""
    return sum(1 for kw in keywords if kw in output)
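
# e.g. quality_score("def fibonacci(n): ...", ["def", "fibonacci"]) == 2,
# while an off-topic completion containing neither keyword scores 0.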

def benchmark_model(model_id):
    """Load one model, run every test in TESTS, and return timings plus quality scores."""
start_time = time.time()
try:
pipe, model, tokenizer = load_pipeline(model_id)
except Exception as e:
return {"model": model_id, "error": f"load_error: {e}"}
elapsed_load = round(time.time() - start_time, 2)
test_results = []
total_quality = 0
for test in TESTS:
try:
t_start = time.time()
output = run_generation(pipe, test["prompt"])
t_elapsed = round(time.time() - t_start, 2)
q = quality_score(output, test["keywords"])
total_quality += q
test_results.append({
"test_name": test["name"],
"time_sec": t_elapsed,
"quality_score": q,
"output": output.strip()
})
except Exception as e:
test_results.append({
"test_name": test["name"],
"time_sec": None,
"quality_score": 0,
"output": "",
"error": f"gen_error: {e}"
})
total_time = round(time.time() - start_time, 2)
    # Drop references and force a collection so the next model's weights
    # don't pile up on top of this one's in RAM.
    del model, tokenizer, pipe
    gc.collect()
return {
"model": model_id,
"load_time_sec": elapsed_load,
"total_time_sec": total_time,
"total_quality_score": total_quality,
"tests": test_results
}
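
# Per-model result schema:
#   {"model", "load_time_sec", "total_time_sec", "total_quality_score",
#    "tests": [{"test_name", "time_sec", "quality_score", "output"[, "error"]}]}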

def main():
    results = []
    with tqdm(total=len(MODELS), desc="Benchmarking models", unit="model") as pbar:
for m in MODELS:
res = benchmark_model(m)
results.append(res)
if "error" in res:
print(f"\n❌ {m} Error: {res['error']}")
else:
print(f"\n=== {m} ===")
print(f"⏱ λ‘œλ“œ {res['load_time_sec']}s | 총 {res['total_time_sec']}s | ν’ˆμ§ˆν•© {res['total_quality_score']}")
for t in res["tests"]:
if "error" in t:
print(f" - {t['test_name']}: ERROR {t['error']}")
else:
print(f" - {t['test_name']}: {t['time_sec']}s | ν’ˆμ§ˆ {t['quality_score']}")
print(f" {t['output'][:80]}...")
pbar.update(1)
with open("benchmark_results.json", "w", encoding="utf-8") as f:
json.dump(results, f, ensure_ascii=False, indent=2)
valid = [r for r in results if "error" not in r]
ranked = sorted(valid, key=lambda x: (-x["total_quality_score"], x["total_time_sec"]))
print("\nπŸ† μΆ”μ²œ TOP 3")
for r in ranked[:3]:
print(f"{r['model']} | ν’ˆμ§ˆ {r['total_quality_score']} | 총 {r['total_time_sec']}s")

if __name__ == "__main__":
    main()