File size: 4,702 Bytes
317a31a
 
0966925
317a31a
959633d
0fe9daa
317a31a
395e379
317a31a
 
 
 
 
 
 
959633d
317a31a
 
395e379
0966925
317a31a
395e379
317a31a
 
 
0966925
317a31a
 
 
 
0966925
317a31a
 
 
 
0966925
317a31a
 
 
 
0966925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317a31a
0966925
317a31a
0966925
 
 
 
 
 
 
 
 
 
317a31a
0966925
317a31a
0966925
 
317a31a
 
 
0966925
317a31a
 
0966925
 
 
 
 
 
 
 
317a31a
0966925
0fe9daa
0966925
0fe9daa
0966925
 
 
 
 
 
 
0fe9daa
317a31a
 
47d9b67
 
 
 
 
 
 
 
 
 
 
0966925
 
 
 
 
 
47d9b67
317a31a
 
 
 
0966925
 
 
317a31a
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import time
import json
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from tqdm import tqdm

# ν…ŒμŠ€νŠΈν•  λͺ¨λΈ λͺ©λ‘
MODELS = [
    "naver-clova/HyperCLOVA-X-Seed-3B",
    "NousResearch/Hermes-3-Llama-3.2-3B",
    "tiiuae/Falcon-7B-Instruct",
    "openchat/openchat-3.5-0106",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "Qwen/Qwen2.5-7B-Instruct",
    "deepseek-ai/deepseek-coder-6.7b-instruct"
]

# Hugging Face 토큰 (ν•„μš” μ‹œ ν™˜κ²½λ³€μˆ˜λ‘œ μ„€μ •)
HF_TOKEN = os.getenv("HF_TOKEN", None)

# Test scenarios: each entry has a display name, a prompt sent to the model,
# and keywords whose presence in the output contributes to the quality score.
TESTS = [
    {
        "name": "μš”μ•½",
        "prompt": "λ‹€μŒ λ¬Έμž₯을 ν•œ μ€„λ‘œ μš”μ•½ν•΄ μ£Όμ„Έμš”. μΆ”κ°€ μ •λ³΄λŠ” μ ˆλŒ€ λ„£μ§€ λ§ˆμ„Έμš”:\n인곡지λŠ₯은 λ‹€μ–‘ν•œ μ‚°μ—…μ—μ„œ ν˜μ‹ μ„ 이끌고 μžˆλ‹€.",
        "keywords": ["ν˜μ‹ ", "μ‚°μ—…"]
    },
    {
        "name": "QA",
        "prompt": "λ‹¬μ˜ 쀑λ ₯은 μ§€κ΅¬μ˜ λͺ‡ λΆ„μ˜ λͺ‡μΈκ°€μš”? κ°„λ‹¨νžˆ 숫자둜만 λ‹΅ν•˜μ„Έμš”.",
        "keywords": ["6λΆ„μ˜1", "1/6"]
    },
    {
        "name": "μ½”λ“œ",
        "prompt": "파이썬으둜 ν”Όλ³΄λ‚˜μΉ˜ μˆ˜μ—΄μ„ 좜λ ₯ν•˜λŠ” ν•¨μˆ˜λ₯Ό μž‘μ„±ν•΄ μ£Όμ„Έμš”. λΆˆν•„μš”ν•œ μ„€λͺ… 없이 μ½”λ“œλ§Œ μ£Όμ„Έμš”.",
        "keywords": ["def", "fibonacci"]
    }
]

def load_pipeline(model_id):
    """Build a CPU-only text-generation pipeline for *model_id*.

    Returns a ``(pipeline, model, tokenizer)`` triple so the caller can
    explicitly release the underlying objects when finished.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        model_id, token=HF_TOKEN, trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
        token=HF_TOKEN,
        trust_remote_code=True,
    )
    model = model.to("cpu")
    # device=-1 keeps the pipeline on CPU, matching the explicit .to("cpu") above.
    gen = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
    return gen, model, tokenizer

def run_generation(pipe, prompt, max_new_tokens=128):
    """Generate a greedy completion for *prompt* and return only the new text.

    Fixes two defects in the original:
    - ``temperature=0.0`` was passed alongside ``do_sample=False``; recent
      transformers versions reject a zero temperature with a ValueError, and
      under greedy decoding the value is ignored anyway, so it is dropped.
    - text-generation pipelines echo the prompt at the start of
      ``generated_text``, so keyword-based quality scoring was crediting
      keywords that appear in the prompt itself; the echoed prompt is now
      stripped before returning.
    """
    outputs = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        repetition_penalty=1.1
    )
    text = outputs[0]["generated_text"]
    # Strip the echoed prompt; fall back to the full text if the pipeline
    # was configured not to include it (e.g. return_full_text=False).
    if text.startswith(prompt):
        return text[len(prompt):]
    return text

def quality_score(output, keywords):
    """Count how many of *keywords* occur verbatim in *output*."""
    hits = 0
    for keyword in keywords:
        if keyword in output:
            hits += 1
    return hits

def benchmark_model(model_id):
    """Load *model_id*, run every scenario in TESTS, and collect timings/scores.

    Returns a result dict with per-test details; if the model fails to load,
    the dict contains only the model id and an ``error`` message.
    """
    t0 = time.time()
    try:
        pipe, model, tokenizer = load_pipeline(model_id)
    except Exception as exc:
        return {"model": model_id, "error": f"load_error: {exc}"}

    load_secs = round(time.time() - t0, 2)
    per_test = []
    quality_sum = 0

    for case in TESTS:
        try:
            gen_start = time.time()
            text = run_generation(pipe, case["prompt"])
        except Exception as exc:
            # Record the failure but keep benchmarking the remaining tests.
            per_test.append({
                "test_name": case["name"],
                "time_sec": None,
                "quality_score": 0,
                "output": "",
                "error": f"gen_error: {exc}"
            })
        else:
            gen_secs = round(time.time() - gen_start, 2)
            score = quality_score(text, case["keywords"])
            quality_sum += score
            per_test.append({
                "test_name": case["name"],
                "time_sec": gen_secs,
                "quality_score": score,
                "output": text.strip()
            })

    total_secs = round(time.time() - t0, 2)

    # Drop references so the (large) model can be garbage-collected before
    # the next model is loaded.
    del model, tokenizer, pipe

    return {
        "model": model_id,
        "load_time_sec": load_secs,
        "total_time_sec": total_secs,
        "total_quality_score": quality_sum,
        "tests": per_test
    }

def main():
    """Benchmark every model in MODELS, save JSON results, and print a top-3 ranking."""
    all_results = []
    with tqdm(total=len(MODELS), desc="λͺ¨λΈ 벀치마크 μ§„ν–‰", unit="model") as bar:
        for model_id in MODELS:
            outcome = benchmark_model(model_id)
            all_results.append(outcome)

            if "error" in outcome:
                print(f"\n❌ {model_id} Error: {outcome['error']}")
            else:
                print(f"\n=== {model_id} ===")
                print(f"⏱ λ‘œλ“œ {outcome['load_time_sec']}s | 총 {outcome['total_time_sec']}s | ν’ˆμ§ˆν•© {outcome['total_quality_score']}")
                for detail in outcome["tests"]:
                    if "error" in detail:
                        print(f"  - {detail['test_name']}: ERROR {detail['error']}")
                    else:
                        print(f"  - {detail['test_name']}: {detail['time_sec']}s | ν’ˆμ§ˆ {detail['quality_score']}")
                        print(f"    {detail['output'][:80]}...")

            bar.update(1)

    with open("benchmark_results.json", "w", encoding="utf-8") as fh:
        json.dump(all_results, fh, ensure_ascii=False, indent=2)

    # Rank successfully benchmarked models: highest quality first, ties
    # broken by lowest total runtime.
    succeeded = [r for r in all_results if "error" not in r]
    ranked = sorted(succeeded, key=lambda r: (-r["total_quality_score"], r["total_time_sec"]))
    print("\nπŸ† μΆ”μ²œ TOP 3")
    for winner in ranked[:3]:
        print(f"{winner['model']} | ν’ˆμ§ˆ {winner['total_quality_score']} | 총 {winner['total_time_sec']}s")

# Script entry point: run the full benchmark suite when executed directly.
if __name__ == "__main__":
    main()