orgoflu committed on
Commit
0966925
Β·
verified Β·
1 Parent(s): 1ce904a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -81
app.py CHANGED
@@ -1,9 +1,9 @@
1
  import time
2
  import json
 
3
  import torch
4
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
5
  from tqdm import tqdm
6
- import matplotlib.pyplot as plt
7
 
8
  MODELS = [
9
  "naver-clova/HyperCLOVA-X-Seed-3B",
@@ -15,103 +15,99 @@ MODELS = [
15
  "deepseek-ai/deepseek-coder-6.7b-instruct"
16
  ]
17
 
18
- HF_TOKEN = None # "hf_xxx" ν•„μš” μ‹œ μž…λ ₯
19
 
20
  TESTS = [
21
  {
22
  "name": "μš”μ•½",
23
- "prompt": "λ‹€μŒ λ¬Έμž₯을 ν•œ μ€„λ‘œ μš”μ•½ν•΄ μ£Όμ„Έμš”: 인곡지λŠ₯은 λ‹€μ–‘ν•œ μ‚°μ—…μ—μ„œ ν˜μ‹ μ„ 이끌고 μžˆλ‹€.",
24
  "keywords": ["ν˜μ‹ ", "μ‚°μ—…"]
25
  },
26
  {
27
  "name": "QA",
28
- "prompt": "λ‹¬μ˜ 쀑λ ₯은 μ§€κ΅¬μ˜ λͺ‡ λΆ„μ˜ λͺ‡μΈκ°€μš”?",
29
  "keywords": ["6λΆ„μ˜1", "1/6"]
30
  },
31
  {
32
  "name": "μ½”λ“œ",
33
- "prompt": "파이썬으둜 ν”Όλ³΄λ‚˜μΉ˜ μˆ˜μ—΄μ„ 좜λ ₯ν•˜λŠ” ν•¨μˆ˜λ₯Ό μž‘μ„±ν•΄ μ£Όμ„Έμš”.",
34
  "keywords": ["def", "fibonacci"]
35
  }
36
  ]
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  def benchmark_model(model_id):
 
39
  try:
40
- start_time = time.time()
41
- tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
42
- model = AutoModelForCausalLM.from_pretrained(
43
- model_id,
44
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
45
- device_map="auto",
46
- low_cpu_mem_usage=True,
47
- token=HF_TOKEN
48
- )
49
- pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
50
-
51
- elapsed_load = round(time.time() - start_time, 2)
52
- test_results = []
53
- total_quality = 0
54
-
55
- for test in TESTS:
56
  t_start = time.time()
57
- output = pipe(test["prompt"], max_new_tokens=100, do_sample=False)[0]['generated_text']
58
  t_elapsed = round(time.time() - t_start, 2)
59
- quality_score = sum(1 for kw in test["keywords"] if kw in output)
60
- total_quality += quality_score
61
  test_results.append({
62
  "test_name": test["name"],
63
  "time_sec": t_elapsed,
64
- "quality_score": quality_score,
65
  "output": output.strip()
66
  })
 
 
 
 
 
 
 
 
67
 
68
- total_time = round(time.time() - start_time, 2)
69
-
70
- # λ©”λͺ¨λ¦¬ ν•΄μ œ
71
- del model
72
- del tokenizer
73
- del pipe
74
- torch.cuda.empty_cache()
75
-
76
- return {
77
- "model": model_id,
78
- "load_time_sec": elapsed_load,
79
- "total_time_sec": total_time,
80
- "total_quality_score": total_quality,
81
- "tests": test_results
82
- }
83
- except Exception as e:
84
- torch.cuda.empty_cache()
85
- return {"model": model_id, "error": str(e)}
86
-
87
- def visualize_results(results):
88
- valid_results = [r for r in results if "error" not in r]
89
- models = [r["model"] for r in valid_results]
90
- qualities = [r["total_quality_score"] for r in valid_results]
91
- times = [r["total_time_sec"] for r in valid_results]
92
-
93
- fig, ax1 = plt.subplots()
94
-
95
- ax1.set_xlabel("λͺ¨λΈ")
96
- ax1.set_ylabel("ν’ˆμ§ˆ 점수", color="tab:blue")
97
- ax1.bar(models, qualities, color="tab:blue", alpha=0.6, label="ν’ˆμ§ˆ 점수")
98
- ax1.tick_params(axis="y", labelcolor="tab:blue")
99
- plt.xticks(rotation=45, ha="right")
100
 
101
- ax2 = ax1.twinx()
102
- ax2.set_ylabel("총 μ‹œκ°„(초)", color="tab:red")
103
- ax2.plot(models, times, color="tab:red", marker="o", label="총 μ‹œκ°„")
104
- ax2.tick_params(axis="y", labelcolor="tab:red")
105
 
106
- fig.tight_layout()
107
- plt.title("LLM 벀치마크 결과")
108
- plt.show()
 
 
 
 
109
 
110
  def main():
111
  results = []
112
  with tqdm(total=len(MODELS), desc="λͺ¨λΈ 벀치마크 μ§„ν–‰", unit="model") as pbar:
113
  for m in MODELS:
114
- start_model_time = time.time()
115
  res = benchmark_model(m)
116
  results.append(res)
117
 
@@ -121,29 +117,22 @@ def main():
121
  print(f"\n=== {m} ===")
122
  print(f"⏱ λ‘œλ“œ {res['load_time_sec']}s | 총 {res['total_time_sec']}s | ν’ˆμ§ˆν•© {res['total_quality_score']}")
123
  for t in res["tests"]:
124
- print(f" - {t['test_name']}: {t['time_sec']}s | ν’ˆμ§ˆ {t['quality_score']}")
125
- print(f" {t['output'][:80]}...")
126
-
127
- # ETA 계산
128
- avg_time = sum(r['total_time_sec'] for r in results if "error" not in r) / max(1, len([r for r in results if "error" not in r]))
129
- remaining_models = len(MODELS) - pbar.n - 1
130
- eta_seconds = int(avg_time * remaining_models)
131
- eta_min, eta_sec = divmod(eta_seconds, 60)
132
- pbar.set_postfix({"ETA": f"{eta_min}m {eta_sec}s"})
133
  pbar.update(1)
134
 
135
  with open("benchmark_results.json", "w", encoding="utf-8") as f:
136
  json.dump(results, f, ensure_ascii=False, indent=2)
137
 
138
- valid_results = [r for r in results if "error" not in r]
139
- ranked = sorted(valid_results, key=lambda x: (-x["total_quality_score"], x["total_time_sec"]))
140
-
141
- print("\nπŸ† μΆ”μ²œ λͺ¨λΈ TOP:")
142
  for r in ranked[:3]:
143
  print(f"{r['model']} | ν’ˆμ§ˆ {r['total_quality_score']} | 총 {r['total_time_sec']}s")
144
 
145
- # μ‹œκ°ν™”
146
- visualize_results(results)
147
-
148
  if __name__ == "__main__":
149
  main()
 
1
  import time
2
  import json
3
+ import os
4
  import torch
5
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
6
  from tqdm import tqdm
 
7
 
8
  MODELS = [
9
  "naver-clova/HyperCLOVA-X-Seed-3B",
 
15
  "deepseek-ai/deepseek-coder-6.7b-instruct"
16
  ]
17
 
18
# Hugging Face access token, read from the environment (None when unset);
# passed through to from_pretrained for gated/private models.
HF_TOKEN = os.getenv("HF_TOKEN", None)

# Benchmark cases. Each entry pairs a Korean-language prompt with the
# keywords whose presence in the model output counts toward its quality
# score (simple substring matching, see quality_score).
TESTS = [
    {
        "name": "μš”μ•½",
        "prompt": "λ‹€μŒ λ¬Έμž₯을 ν•œ μ€„λ‘œ μš”μ•½ν•΄ μ£Όμ„Έμš”. μΆ”κ°€ μ •λ³΄λŠ” μ ˆλŒ€ λ„£μ§€ λ§ˆμ„Έμš”:\n인곡지λŠ₯은 λ‹€μ–‘ν•œ μ‚°μ—…μ—μ„œ ν˜μ‹ μ„ 이끌고 μžˆλ‹€.",
        "keywords": ["ν˜μ‹ ", "μ‚°μ—…"]
    },
    {
        "name": "QA",
        "prompt": "λ‹¬μ˜ 쀑λ ₯은 μ§€κ΅¬μ˜ λͺ‡ λΆ„μ˜ λͺ‡μΈκ°€μš”? κ°„λ‹¨νžˆ 숫자둜만 λ‹΅ν•˜μ„Έμš”.",
        "keywords": ["6λΆ„μ˜1", "1/6"]
    },
    {
        "name": "μ½”λ“œ",
        "prompt": "파이썬으둜 ν”Όλ³΄λ‚˜μΉ˜ μˆ˜μ—΄μ„ 좜λ ₯ν•˜λŠ” ν•¨μˆ˜λ₯Ό μž‘μ„±ν•΄ μ£Όμ„Έμš”. λΆˆν•„μš”ν•œ μ„€λͺ… 없이 μ½”λ“œλ§Œ μ£Όμ„Έμš”.",
        "keywords": ["def", "fibonacci"]
    }
]
37
 
38
def load_pipeline(model_id):
    """Load tokenizer and model for *model_id* and wrap them in a CPU
    text-generation pipeline.

    Returns a (pipeline, model, tokenizer) triple so the caller can
    explicitly delete the heavyweight objects when finished with them.
    Raises whatever from_pretrained raises (caller handles load errors).
    """
    tok = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
    mdl = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float32,  # CPU inference: keep full precision
        low_cpu_mem_usage=True,
        token=HF_TOKEN,
        trust_remote_code=True
    ).to("cpu")
    # device=-1 keeps the pipeline on CPU, matching the .to("cpu") above.
    return pipeline("text-generation", model=mdl, tokenizer=tok, device=-1), mdl, tok
48
+
49
def run_generation(pipe, prompt, max_new_tokens=128):
    """Generate text for *prompt* with greedy decoding and return it.

    do_sample=False selects deterministic greedy decoding. The original
    code also passed temperature=0.0, but transformers ignores (and warns
    about) sampling parameters in greedy mode and rejects 0.0 as a
    temperature value, so it is deliberately omitted — greedy output is
    unchanged.
    """
    outputs = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        repetition_penalty=1.1  # mild guard against degenerate repetition
    )
    return outputs[0]["generated_text"]
58
+
59
def quality_score(output, keywords):
    """Count how many of *keywords* occur as substrings of *output*."""
    return sum(1 for kw in keywords if kw in output)
61
+
62
def benchmark_model(model_id):
    """Benchmark a single model over all TESTS on CPU.

    Returns a dict with load/total wall-clock times, the summed quality
    score, and per-test results. If the model cannot be loaded, returns
    {"model": ..., "error": "load_error: ..."} instead. A failure in one
    generation is recorded in that test's entry rather than aborting the
    remaining tests.
    """
    start_time = time.time()
    try:
        pipe, model, tokenizer = load_pipeline(model_id)
    except Exception as e:
        return {"model": model_id, "error": f"load_error: {e}"}

    elapsed_load = round(time.time() - start_time, 2)
    test_results = []
    total_quality = 0

    for test in TESTS:
        try:
            t_start = time.time()
            output = run_generation(pipe, test["prompt"])
            t_elapsed = round(time.time() - t_start, 2)
            q = quality_score(output, test["keywords"])
            total_quality += q
            test_results.append({
                "test_name": test["name"],
                "time_sec": t_elapsed,
                "quality_score": q,
                "output": output.strip()
            })
        except Exception as e:
            # Record the failure but keep benchmarking the other prompts.
            test_results.append({
                "test_name": test["name"],
                "time_sec": None,
                "quality_score": 0,
                "output": "",
                "error": f"gen_error: {e}"
            })

    total_time = round(time.time() - start_time, 2)

    # Drop the references so the next model's weights do not stack on top
    # of this one in memory.
    del model, tokenizer, pipe

    return {
        "model": model_id,
        "load_time_sec": elapsed_load,
        "total_time_sec": total_time,
        "total_quality_score": total_quality,
        "tests": test_results
    }
106
 
107
def main():
    """Benchmark every model in MODELS, print a per-model report, dump
    all results to benchmark_results.json, and print the top-3 models
    ranked by quality (desc) then total time (asc)."""
    results = []
    with tqdm(total=len(MODELS), desc="λͺ¨λΈ 벀치마크 μ§„ν–‰", unit="model") as pbar:
        for m in MODELS:
            res = benchmark_model(m)
            results.append(res)

            # NOTE(review): the source diff omits a few lines here (new-file
            # lines 114-116). An error guard is reconstructed because the
            # success prints below would KeyError on a load-failure result
            # ({"model", "error"}) — confirm against the full file.
            if "error" in res:
                print(f"\n=== {m} ===")
                print(f"ERROR: {res['error']}")
            else:
                print(f"\n=== {m} ===")
                print(f"⏱ λ‘œλ“œ {res['load_time_sec']}s | 총 {res['total_time_sec']}s | ν’ˆμ§ˆν•© {res['total_quality_score']}")
                for t in res["tests"]:
                    if "error" in t:
                        print(f" - {t['test_name']}: ERROR {t['error']}")
                    else:
                        print(f" - {t['test_name']}: {t['time_sec']}s | ν’ˆμ§ˆ {t['quality_score']}")
                        print(f" {t['output'][:80]}...")

            pbar.update(1)

    with open("benchmark_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    # Rank successful runs: highest quality first, ties broken by speed.
    valid = [r for r in results if "error" not in r]
    ranked = sorted(valid, key=lambda x: (-x["total_quality_score"], x["total_time_sec"]))
    print("\nπŸ† μΆ”μ²œ TOP 3")
    for r in ranked[:3]:
        print(f"{r['model']} | ν’ˆμ§ˆ {r['total_quality_score']} | 총 {r['total_time_sec']}s")

if __name__ == "__main__":
    main()