import gc
import json
import os
import time

import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Models to benchmark
MODELS = [
    "naver-clova/HyperCLOVA-X-Seed-3B",
    "NousResearch/Hermes-3-Llama-3.2-3B",
    "tiiuae/Falcon-7B-Instruct",
    "openchat/openchat-3.5-0106",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "Qwen/Qwen2.5-7B-Instruct",
    "deepseek-ai/deepseek-coder-6.7b-instruct",
]

# Hugging Face token (set the HF_TOKEN environment variable if the models require it)
HF_TOKEN = os.getenv("HF_TOKEN", None)

# Test scenarios. Prompts and keywords are intentionally in Korean, since the
# benchmark targets Korean-capable models; English glosses are in the comments.
TESTS = [
    {
        "name": "summarization",
        # "Summarize the following sentence in one line. Do not add any extra
        #  information: AI is driving innovation across many industries."
        "prompt": "다음 문장을 한 줄로 요약해 주세요. 추가 정보는 절대 넣지 마세요:\n인공지능은 다양한 산업에서 혁신을 이끌고 있다.",
        "keywords": ["혁신", "산업"],  # "innovation", "industry"
    },
    {
        "name": "QA",
        # "The Moon's gravity is what fraction of Earth's? Answer with just the number."
        "prompt": "달의 중력은 지구의 몇 분의 몇인가요? 간단히 숫자로만 답하세요.",
        "keywords": ["6분의1", "1/6"],  # "one sixth", "1/6"
    },
    {
        "name": "code",
        # "Write a Python function that prints the Fibonacci sequence. Code only,
        #  no unnecessary explanation."
        "prompt": "파이썬으로 피보나치 수열을 출력하는 함수를 작성해 주세요. 불필요한 설명 없이 코드만 주세요.",
        "keywords": ["def", "fibonacci"],
    },
]


def load_pipeline(model_id):
    tok = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
    mdl = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
        token=HF_TOKEN,
        trust_remote_code=True,
    ).to("cpu")
    return pipeline("text-generation", model=mdl, tokenizer=tok, device=-1), mdl, tok


def run_generation(pipe, prompt, max_new_tokens=128):
    outputs = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=False,  # greedy decoding; a temperature setting would be ignored here
        repetition_penalty=1.1,
        return_full_text=False,  # score only the generated text, not the echoed prompt
    )
    return outputs[0]["generated_text"]


def quality_score(output, keywords):
    # Crude quality proxy: count how many expected keywords appear in the output.
    return sum(1 for kw in keywords if kw in output)


def benchmark_model(model_id):
    start_time = time.time()
    try:
        pipe, model, tokenizer = load_pipeline(model_id)
    except Exception as e:
        return {"model": model_id, "error": f"load_error: {e}"}
    elapsed_load = round(time.time() - start_time, 2)

    test_results = []
    total_quality = 0
    for test in TESTS:
        try:
            t_start = time.time()
            output = run_generation(pipe, test["prompt"])
            t_elapsed = round(time.time() - t_start, 2)
            q = quality_score(output, test["keywords"])
            total_quality += q
            test_results.append({
                "test_name": test["name"],
                "time_sec": t_elapsed,
                "quality_score": q,
                "output": output.strip(),
            })
        except Exception as e:
            test_results.append({
                "test_name": test["name"],
                "time_sec": None,
                "quality_score": 0,
                "output": "",
                "error": f"gen_error: {e}",
            })

    total_time = round(time.time() - start_time, 2)

    # Free the current model before loading the next one; several 3B-7B models
    # will not all fit in RAM at once on a typical machine.
    del model, tokenizer, pipe
    gc.collect()

    return {
        "model": model_id,
        "load_time_sec": elapsed_load,
        "total_time_sec": total_time,
        "total_quality_score": total_quality,
        "tests": test_results,
    }


def main():
    results = []
    with tqdm(total=len(MODELS), desc="Benchmarking models", unit="model") as pbar:
        for m in MODELS:
            res = benchmark_model(m)
            results.append(res)
            if "error" in res:
                print(f"\n❌ {m} Error: {res['error']}")
            else:
                print(f"\n=== {m} ===")
                print(f"⏱ load {res['load_time_sec']}s | total {res['total_time_sec']}s | quality sum {res['total_quality_score']}")
                for t in res["tests"]:
                    if "error" in t:
                        print(f" - {t['test_name']}: ERROR {t['error']}")
                    else:
                        print(f" - {t['test_name']}: {t['time_sec']}s | quality {t['quality_score']}")
                        print(f"   {t['output'][:80]}...")
            pbar.update(1)

    with open("benchmark_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    # Rank by quality (descending), breaking ties by total runtime (ascending).
    valid = [r for r in results if "error" not in r]
    ranked = sorted(valid, key=lambda x: (-x["total_quality_score"], x["total_time_sec"]))
    print("\n🏆 Top 3 recommendations")
    for r in ranked[:3]:
        print(f"{r['model']} | quality {r['total_quality_score']} | total {r['total_time_sec']}s")


if __name__ == "__main__":
    main()
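# ----------------------------------------------------------------------
# Usage note (a sketch; the file name "benchmark_models.py" is assumed,
# not taken from the source):
#
#   HF_TOKEN=hf_xxx python benchmark_models.py
#
# Results are written to benchmark_results.json. A minimal way to re-rank
# saved results later, using the same sort key as main(), without
# re-running the benchmark:
#
#   import json
#   with open("benchmark_results.json", encoding="utf-8") as f:
#       results = json.load(f)
#   valid = [r for r in results if "error" not in r]
#   for r in sorted(valid, key=lambda x: (-x["total_quality_score"], x["total_time_sec"])):
#       print(r["model"], r["total_quality_score"], r["total_time_sec"])
# ----------------------------------------------------------------------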