# NOTE(review): the three lines below were residue from the Hugging Face Spaces
# error page this script was copied from ("Spaces: / Runtime error"); kept as a
# comment so the file remains valid Python.
import time
import json
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from tqdm import tqdm

# NOTE(review): the Korean text in the comments/strings below is mojibake
# (double-encoded UTF-8) — preserved byte-for-byte here; verify the original
# encoding before shipping.

# List of models to benchmark (Hugging Face model IDs).
MODELS = [
    "naver-clova/HyperCLOVA-X-Seed-3B",
    "NousResearch/Hermes-3-Llama-3.2-3B",
    "tiiuae/Falcon-7B-Instruct",
    "openchat/openchat-3.5-0106",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "Qwen/Qwen2.5-7B-Instruct",
    "deepseek-ai/deepseek-coder-6.7b-instruct"
]

# Hugging Face access token (set via the HF_TOKEN environment variable when
# gated models are used; None means anonymous access).
HF_TOKEN = os.getenv("HF_TOKEN", None)

# Test scenarios: each entry has a display name, the prompt sent to the model,
# and the keywords whose presence in the output is counted as quality points.
TESTS = [
    {
        "name": "μμ½",
        "prompt": "λ€μ λ¬Έμ₯μ ν μ€λ‘ μμ½ν΄ μ£ΌμΈμ. μΆκ° μ 보λ μ λ λ£μ§ λ§μΈμ:\nμΈκ³΅μ§λ₯μ λ€μν μ°μ μμ νμ μ μ΄λκ³ μλ€.",
        "keywords": ["νμ ", "μ°μ "]
    },
    {
        "name": "QA",
        "prompt": "λ¬μ μ€λ ₯μ μ§κ΅¬μ λͺ λΆμ λͺμΈκ°μ? κ°λ¨ν μ«μλ‘λ§ λ΅νμΈμ.",
        "keywords": ["6λΆμ1", "1/6"]
    },
    {
        "name": "μ½λ",
        "prompt": "νμ΄μ¬μΌλ‘ νΌλ³΄λμΉ μμ΄μ μΆλ ₯νλ ν¨μλ₯Ό μμ±ν΄ μ£ΌμΈμ. λΆνμν μ€λͺ μμ΄ μ½λλ§ μ£ΌμΈμ.",
        "keywords": ["def", "fibonacci"]
    }
]
def load_pipeline(model_id):
    """Build a CPU text-generation pipeline for *model_id*.

    Returns:
        (pipeline, model, tokenizer) — the model and tokenizer are returned
        separately so the caller can delete them explicitly to free memory.

    NOTE(review): trust_remote_code=True executes Python shipped inside the
    model repo — acceptable only for the vetted models listed in MODELS.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        token=HF_TOKEN,
        trust_remote_code=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float32,  # full precision: safest default on CPU
        low_cpu_mem_usage=True,
        token=HF_TOKEN,
        trust_remote_code=True,
    ).to("cpu")
    generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device=-1,  # -1 = CPU
    )
    return generator, model, tokenizer
def run_generation(pipe, prompt, max_new_tokens=128):
    """Generate a deterministic completion for *prompt*.

    Args:
        pipe: a transformers text-generation pipeline (or compatible callable).
        prompt: the input text.
        max_new_tokens: generation budget per call.

    Returns:
        The generated continuation only. ``return_full_text=False`` excludes
        the prompt from the returned text, so keyword-based quality scoring is
        not inflated by keywords that already appear in the prompt itself.
    """
    outputs = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=False,  # greedy decoding for reproducible benchmark runs
        # BUGFIX: temperature=0.0 removed — recent transformers versions raise
        # "temperature has to be a strictly positive float", and the value is
        # ignored under greedy decoding anyway.
        repetition_penalty=1.1,
        return_full_text=False,  # score only the model's continuation
    )
    return outputs[0]["generated_text"]
def quality_score(output, keywords):
    """Return how many of the expected *keywords* occur in *output* (substring match)."""
    hits = 0
    for keyword in keywords:
        if keyword in output:
            hits += 1
    return hits
def benchmark_model(model_id):
    """Run every TESTS scenario against one model and collect timing/quality stats.

    Returns:
        On load failure: {"model", "error"} only.
        Otherwise: {"model", "load_time_sec", "total_time_sec",
        "total_quality_score", "tests"} where "tests" holds one record per
        scenario (failed generations get time_sec=None, score 0, and an
        "error" key instead of aborting the whole model).
    """
    t0 = time.time()
    try:
        pipe, mdl, tok = load_pipeline(model_id)
    except Exception as exc:  # model may be gated, missing, or too big to load
        return {"model": model_id, "error": f"load_error: {exc}"}
    load_secs = round(time.time() - t0, 2)

    records = []
    score_sum = 0
    for case in TESTS:
        try:
            case_t0 = time.time()
            text = run_generation(pipe, case["prompt"])
            elapsed = round(time.time() - case_t0, 2)
            score = quality_score(text, case["keywords"])
        except Exception as exc:  # one failing scenario must not stop the rest
            records.append({
                "test_name": case["name"],
                "time_sec": None,
                "quality_score": 0,
                "output": "",
                "error": f"gen_error: {exc}"
            })
            continue
        score_sum += score
        records.append({
            "test_name": case["name"],
            "time_sec": elapsed,
            "quality_score": score,
            "output": text.strip()
        })

    wall_secs = round(time.time() - t0, 2)
    # Drop references so the next model's weights can reclaim the memory.
    del mdl, tok, pipe
    return {
        "model": model_id,
        "load_time_sec": load_secs,
        "total_time_sec": wall_secs,
        "total_quality_score": score_sum,
        "tests": records
    }
def main():
    """Benchmark every model in MODELS, print per-model reports, persist all
    results to benchmark_results.json, and print the top-3 ranking
    (quality descending, then total time ascending)."""
    results = []
    with tqdm(total=len(MODELS), desc="λͺ¨λΈ λ²€μΉλ§ν¬ μ§ν", unit="model") as bar:
        for model_id in MODELS:
            outcome = benchmark_model(model_id)
            results.append(outcome)
            if "error" in outcome:
                print(f"\nβ {model_id} Error: {outcome['error']}")
            else:
                print(f"\n=== {model_id} ===")
                print(f"β± λ‘λ {outcome['load_time_sec']}s | μ΄ {outcome['total_time_sec']}s | νμ§ν© {outcome['total_quality_score']}")
                for case in outcome["tests"]:
                    if "error" in case:
                        print(f" - {case['test_name']}: ERROR {case['error']}")
                    else:
                        print(f" - {case['test_name']}: {case['time_sec']}s | νμ§ {case['quality_score']}")
                        print(f" {case['output'][:80]}...")
            bar.update(1)

    with open("benchmark_results.json", "w", encoding="utf-8") as handle:
        json.dump(results, handle, ensure_ascii=False, indent=2)

    # Rank only the models that loaded and ran; best quality first, ties
    # broken by lower total wall time.
    scored = [entry for entry in results if "error" not in entry]
    scored.sort(key=lambda entry: (-entry["total_quality_score"], entry["total_time_sec"]))
    print("\nπ μΆμ² TOP 3")
    for entry in scored[:3]:
        print(f"{entry['model']} | νμ§ {entry['total_quality_score']} | μ΄ {entry['total_time_sec']}s")


if __name__ == "__main__":
    main()