"""Multi-Agent vs Single LLM comparison experiment (D5).

For the same alarm, run
- Multi-Agent: the existing 4-tier orchestrator (Tier 1 ML + one LLM call each for Tier 2/3/4)
- Single LLM: the same model producing Tier 2/3/4 in a single call

N times each, then compare quantitative metrics:
- latency
- cost (input/output tokens)
- schema compliance rate (structured-output success rate)
- citation accuracy (whether only the provided doc IDs are cited)
- response consistency (Jaccard of the cause sets, standard deviation of yield_loss)

Run: python -m experiments.multi_vs_single.benchmark
Output: results.md + plots/
"""
| import json |
| import statistics |
| import time |
| from collections import Counter |
| from pathlib import Path |
|
|
| import matplotlib |
|
|
| matplotlib.use("Agg") |
| import matplotlib.pyplot as plt |
|
|
| from agents.cause import TIER2_SCHEMA, SYSTEM_PROMPT as CAUSE_SYSTEM, run_cause |
| from agents.detection import run_detection |
| from agents.impact import LLM_PART_SCHEMA as TIER3_LLM_SCHEMA, run_impact |
| from agents.llm import SUBAGENT_MODEL, client |
| from agents.rag.store import _knowledge_docs |
| from agents.response import LLM_PART_SCHEMA as TIER4_LLM_SCHEMA, run_response |
| from data.demo import DEFAULT_ALARMS |
| from data.wip import get_affected_wip |
|
|
# Number of times each method is executed.
N_RUNS = 3
# ID of the entry in DEFAULT_ALARMS that the experiment replays.
TARGET_ALARM = "A3"
# Outputs (results.md and the plots directory) live next to this module.
OUT_DIR = Path(__file__).parent
PLOTS_DIR = OUT_DIR.joinpath("plots")

# Dollar price of a single token, written as "dollars per 10**6 tokens".
PRICE_IN = 0.25 / 10**6
PRICE_OUT = 2.00 / 10**6
|
|
| |
# JSON schema for the single-call baseline: the three per-tier schemas nested
# under one object, all required, with no extra top-level keys allowed.
COMBINED_SCHEMA = dict(
    type="object",
    properties=dict(
        tier2=TIER2_SCHEMA,
        tier3=TIER3_LLM_SCHEMA,
        tier4=TIER4_LLM_SCHEMA,
    ),
    required=["tier2", "tier3", "tier4"],
    additionalProperties=False,
)
|
|
# System prompt for the single-call baseline, one literal per prompt line.
# NOTE(review): the non-ASCII text is reproduced exactly as found; it looks
# like mis-decoded UTF-8 — confirm the file's encoding before relying on it.
SINGLE_SYSTEM = (
    "๋น์ ์ ๋ฐ๋์ฒด ๊ณต์ ์ด์ ํตํฉ ๋ถ์ ์ ๋ฌธ๊ฐ์๋๋ค.\n"
    "์๋๊ณผ Tier 1 ์ด์ ํ์ง ๊ฒฐ๊ณผ, ์ฌ๋ด ์ง์ ๋ฌธ์ ์ ์ฒด๋ฅผ ์ขํฉํด์ Tier 2(์์ธ ๋ถ์),\n"
    "Tier 3(์ํฅ ํ๊ฐ์ downstream_dependencies์ yield_loss), Tier 4(์ฆ์ยท์ค์ฅ๊ธฐ ์กฐ์น +\n"
    "๊ทผ๊ฑฐ ์๋ฃ)๋ฅผ ํ ๋ฒ์ ์๋ต์ผ๋ก ์ฐ์ถํฉ๋๋ค.\n"
    "citations์ refs๋ ์ ๊ณต๋ ๋ฌธ์ ID๋ง ์ฌ์ฉํ๊ณ ๊ทผ๊ฑฐ ์๋ ๋ด์ฉ์ ํฌํจํ์ง ์์ต๋๋ค."
)
|
|
|
|
def run_single_llm(alarm, tier1):
    """Run the single-call baseline: one LLM request producing Tier 2/3/4 together.

    Every document returned by ``_knowledge_docs()`` is inlined into the user
    prompt together with the alarm fields and the Tier 1 result, and the reply
    is constrained to ``COMBINED_SCHEMA`` through ``response_format``.

    Args:
        alarm: Alarm mapping. ``title`` and ``lot_id`` are required;
            ``feature`` and ``feature_arrow`` are read with ``.get``.
        tier1: Tier 1 result mapping with ``score`` and ``features`` (each
            feature is a mapping with a ``name``).

    Returns:
        ``(parsed, elapsed, usage)``: the decoded JSON reply, the duration of
        the request in seconds, and the response's ``usage`` object.

    Raises:
        ValueError: If the reply has no text content, or (as
            ``json.JSONDecodeError``) if the content is not valid JSON.
    """
    docs = _knowledge_docs()
    knowledge = "\n\n".join(f"[{doc_id}]\n{body}" for doc_id, body in docs.items())
    sensors = ", ".join(feature["name"] for feature in tier1["features"])

    # NOTE(review): the non-ASCII prompt text is reproduced exactly as found; it
    # looks like mis-decoded UTF-8 — confirm the file's encoding.
    user_prompt = f"""## ์ด์ ์๋
- ๊ณต์ : {alarm['title']}
- lot: {alarm['lot_id']}
- ์ด์ ํผ์ฒ: {alarm.get('feature')} {alarm.get('feature_arrow') or ''}

## Tier 1 ์ด์ ํ์ง
- ์ด์ ์ ์: {tier1['score']}
- ๊ธฐ์ฌ ์ผ์: {sensors}

## ์ฌ๋ด ์ง์ ๋ฌธ์ (์ ์ฒด)
{knowledge}

์ ์ ๋ณด๋ฅผ ์ขํฉํด์ tier2/tier3/tier4๋ฅผ JSON์ผ๋ก ๋ฐํํ์ธ์."""

    # perf_counter is monotonic, so the measured latency cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    t0 = time.perf_counter()
    resp = client().chat.completions.create(
        model=SUBAGENT_MODEL,
        messages=[
            {"role": "system", "content": SINGLE_SYSTEM},
            {"role": "user", "content": user_prompt},
        ],
        response_format={
            "type": "json_schema",
            "json_schema": {"name": "combined", "schema": COMBINED_SCHEMA, "strict": True},
        },
    )
    elapsed = time.perf_counter() - t0

    content = resp.choices[0].message.content
    if content is None:
        # Fail with a readable message instead of json.loads' TypeError.
        raise ValueError("single-LLM response has no content to parse")
    parsed = json.loads(content)
    return parsed, elapsed, resp.usage
|
|
|
|
def run_multi_agent(alarm, tier1):
    """Run the multi-agent pipeline: cause, impact and response called in sequence.

    Token usage is captured by temporarily replacing
    ``chat.completions.create`` on the object returned by ``llm.client()``
    with a wrapper that records each response's ``usage``. The original
    method is restored even if one of the agents raises.

    Args:
        alarm: Alarm mapping passed through to every agent.
        tier1: Tier 1 result passed through to every agent.

    Returns:
        ``({"tier2": ..., "tier3": ..., "tier4": ...}, elapsed, (tokens_in,
        tokens_out))`` where ``elapsed`` is the duration of the three calls in
        seconds and the token counts are summed over all captured responses.
    """
    from agents import llm as llm_mod

    # NOTE(review): the capture only works if the agents use this same client
    # object — presumably client() returns a shared instance; confirm.
    cli = llm_mod.client()
    captured = []
    real_create = cli.chat.completions.create

    def patched(*args, **kwargs):
        r = real_create(*args, **kwargs)
        captured.append(r.usage)
        return r

    cli.chat.completions.create = patched
    # perf_counter is monotonic, so the measured latency cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    t0 = time.perf_counter()
    try:
        t2 = run_cause(alarm, tier1)
        t3 = run_impact(alarm, tier1, t2)
        t4 = run_response(alarm, tier1, t2, t3)
    finally:
        cli.chat.completions.create = real_create

    elapsed = time.perf_counter() - t0
    total_in = sum(u.prompt_tokens for u in captured)
    total_out = sum(u.completion_tokens for u in captured)
    return {"tier2": t2, "tier3": t3, "tier4": t4}, elapsed, (total_in, total_out)
|
|
|
|
def measure(data, valid_doc_ids):
    """Derive the per-run evaluation metrics from one tier2/tier3/tier4 result.

    Args:
        data: Mapping with ``tier2``, ``tier3`` and ``tier4`` entries.
        valid_doc_ids: Container of document IDs that count as valid citations.

    Returns:
        Dict with ``cite_acc`` (share of cited IDs found in ``valid_doc_ids``),
        ``cite_total``, ``causes`` (frozenset of cause names), ``n_imm``,
        ``n_lng``, ``yield_loss`` and ``pct_sum`` (sum of the causes' ``pct``).
    """
    tier2, tier3, tier4 = data["tier2"], data["tier3"], data["tier4"]
    cause_items = tier2.get("causes", [])

    # Cited IDs come from every cause's "citations" plus the tier4 "refs";
    # a ref is either a bare ID or a dict carrying the ID under "id".
    cited_ids = [doc for item in cause_items for doc in item.get("citations", [])]
    cited_ids += [
        ref["id"] if isinstance(ref, dict) else ref
        for ref in tier4.get("refs", [])
    ]

    n_cited = len(cited_ids)
    n_valid = len([doc for doc in cited_ids if doc in valid_doc_ids])
    # A run that cites nothing is scored as fully accurate.
    if n_cited:
        accuracy = n_valid / n_cited
    else:
        accuracy = 1.0

    cause_names = frozenset(item["name"] for item in cause_items)
    immediate_count = len(tier4.get("immediate", []))
    longterm_count = len(tier4.get("longterm", []))
    yield_loss = tier3.get("yield_loss", 0.0)
    pct_total = sum(item["pct"] for item in cause_items)

    return {
        "cite_acc": accuracy,
        "cite_total": n_cited,
        "causes": cause_names,
        "n_imm": immediate_count,
        "n_lng": longterm_count,
        "yield_loss": yield_loss,
        "pct_sum": pct_total,
    }
|
|
|
|
def jaccard(sets):
    """Return |intersection| / |union| taken over all the given sets.

    Returns 0.0 when ``sets`` is empty or when the union is empty.
    """
    if not sets:
        return 0.0

    everything, shared = set(), set(sets[0])
    for members in sets:
        everything |= members
        shared &= members

    if not everything:
        return 0.0
    return len(shared) / len(everything)
|
|
|
|
def _run_trials(header, run_once, alarm, tier1, valid_doc_ids):
    """Run one method N_RUNS times and return one result dict per run.

    ``run_once(alarm, tier1)`` must return ``(data, latency, usage)`` where
    ``usage`` is either a ``(tokens_in, tokens_out)`` tuple or an object with
    ``prompt_tokens`` / ``completion_tokens`` attributes.
    """
    print(header)
    runs = []
    for i in range(N_RUNS):
        # A failed run is recorded rather than raised: aggregate() counts it
        # against the method instead of aborting the whole experiment.
        try:
            data, lat, usage = run_once(alarm, tier1)
            m = measure(data, valid_doc_ids)
            if isinstance(usage, tuple):
                tin, tout = usage
            else:
                tin, tout = usage.prompt_tokens, usage.completion_tokens
            m.update({"latency": lat, "tokens_in": tin, "tokens_out": tout, "ok": True})
            runs.append(m)
            print(f" run {i+1}: {lat:.1f}s, in={tin}, out={tout}, cite_acc={m['cite_acc']:.2f}")
        except Exception as e:
            runs.append({"ok": False, "error": str(e)})
            print(f" run {i+1}: ERROR {e}")
    return runs


def main():
    """Run both methods on TARGET_ALARM and write the comparison report.

    Tier 1 detection is executed once and shared by both methods, so only the
    Tier 2/3/4 stages differ between them.

    Raises:
        ValueError: If no alarm in DEFAULT_ALARMS has the ID TARGET_ALARM.
    """
    # NOTE(review): non-ASCII text in the printed messages is reproduced exactly
    # as found; it looks like mis-decoded UTF-8 — confirm the file's encoding.
    alarm = next((a for a in DEFAULT_ALARMS if a["id"] == TARGET_ALARM), None)
    if alarm is None:
        # Without a default, next() would surface as a bare StopIteration.
        raise ValueError(f"TARGET_ALARM {TARGET_ALARM!r} not found in DEFAULT_ALARMS")
    tier1 = run_detection(alarm)
    valid_doc_ids = set(_knowledge_docs().keys())

    print(f"์คํ ๋์: {TARGET_ALARM} ({alarm['title']}), N={N_RUNS}")
    print(f"Tier 1 score: {tier1['score']}, ๊ธฐ์ฌ ์ผ์: {[f['name'] for f in tier1['features']]}\n")

    multi_runs = _run_trials("=== Multi-Agent ===", run_multi_agent, alarm, tier1, valid_doc_ids)
    single_runs = _run_trials("\n=== Single LLM ===", run_single_llm, alarm, tier1, valid_doc_ids)

    write_results(multi_runs, single_runs, alarm, tier1)
    print(f"\n--- ๊ฒฐ๊ณผ ์ ์ฅ: {OUT_DIR / 'results.md'} ---")
|
|
|
|
def aggregate(runs):
    """Summarise the per-run result dicts of one method.

    Args:
        runs: List of run dicts. Successful runs have ``ok`` truthy and carry
            the metric keys; failed runs only need ``ok`` falsy.

    Returns:
        Dict with ``n_ok`` and ``n_fail`` always present. ``schema_compliance``
        (share of successful runs) is present whenever ``runs`` is non-empty,
        and is 0.0 when every run failed. The remaining ``*_mean``,
        ``causes_jaccard`` and ``yield_loss_std`` keys are present only when
        at least one run succeeded, and are always floats.
    """
    ok = [r for r in runs if r.get("ok")]
    n_total = len(runs)
    if not ok:
        summary = {"n_ok": 0, "n_fail": n_total}
        if n_total:
            # All runs failed: that is 0% compliance, not "no data".
            summary["schema_compliance"] = 0.0
        return summary

    def mean_of(key):
        # statistics.mean returns an int for whole-number means of ints;
        # coerce so every aggregate is a float and formats consistently.
        return float(statistics.mean(r[key] for r in ok))

    if len(ok) >= 2:
        yield_loss_std = float(statistics.stdev(r["yield_loss"] for r in ok))
    else:
        # stdev needs at least two samples.
        yield_loss_std = 0.0

    return {
        "n_ok": len(ok),
        "n_fail": n_total - len(ok),
        "schema_compliance": len(ok) / n_total,
        "latency_mean": mean_of("latency"),
        "tokens_in_mean": mean_of("tokens_in"),
        "tokens_out_mean": mean_of("tokens_out"),
        "cost_mean": float(statistics.mean(
            r["tokens_in"] * PRICE_IN + r["tokens_out"] * PRICE_OUT for r in ok
        )),
        "cite_acc_mean": mean_of("cite_acc"),
        "causes_jaccard": float(jaccard([r["causes"] for r in ok])),
        "yield_loss_std": yield_loss_std,
        "n_imm_mean": mean_of("n_imm"),
        "n_lng_mean": mean_of("n_lng"),
        "pct_sum_mean": mean_of("pct_sum"),
    }
|
|
|
|
def write_results(multi_runs, single_runs, alarm, tier1):
    """Aggregate both methods' runs, draw the comparison plot and write results.md.

    Args:
        multi_runs: Per-run result dicts of the multi-agent method.
        single_runs: Per-run result dicts of the single-LLM method.
        alarm: Alarm mapping; ``title`` and ``lot_id`` appear in the report.
        tier1: Tier 1 result; ``score`` and feature names appear in the report.

    Side effects:
        Creates ``PLOTS_DIR`` if missing, saves the plot through
        ``_plot_compare`` and writes ``OUT_DIR / "results.md"`` as UTF-8.
    """
    M = aggregate(multi_runs)
    S = aggregate(single_runs)

    PLOTS_DIR.mkdir(exist_ok=True)
    _plot_compare(M, S)

    # NOTE(review): non-ASCII text in the report strings is reproduced exactly
    # as found; it looks like mis-decoded UTF-8 — confirm the file's encoding.
    lines = [
        "# D5. Multi-Agent vs Single LLM",
        "",
        f"๋์ผ ์๋ **{TARGET_ALARM} ({alarm['title']})** ์ ๋ํด ๋ ๋ฐฉ์์ N={N_RUNS}ํ์ฉ ์คํํ ๋น๊ต ๊ฒฐ๊ณผ์๋๋ค.",
        "",
        "## ์คํ ์ค์ ",
        "",
        f"- ๋ชจ๋ธ: {SUBAGENT_MODEL} (์์ชฝ ๋์ผ)",
        f"- ์๋: {TARGET_ALARM} ({alarm['title']}), lot {alarm['lot_id']}",
        f"- Tier 1 (์ด์ ํ์ง): {tier1['score']} / ๊ธฐ์ฌ ์ผ์ {', '.join(f['name'] for f in tier1['features'])}",
        "- Tier 1 ๊ฒฐ๊ณผ๋ ์์ชฝ ๋์ผ ์๋ ฅ, Tier 2/3/4๋ง ๋น๊ต",
        "- Multi-Agent: cause โ impact โ response ์์ฐจ ํธ์ถ (๊ฐ RAG ๊ฒ์ + ์ ๋ฌธํ๋ system prompt)",
        "- Single LLM: ์ ์ฒด ์ง์ ๋ฌธ์ + Tier 1์ ํ ๋ฒ์ ์๋ ฅ, Tier 2/3/4 ํตํฉ JSON ์ฐ์ถ",
        "",
        "## ์ ๋ ๋น๊ต",
        "",
        "| ์งํ | Multi-Agent | Single LLM | ์ฐจ์ด |",
        "|---|---|---|---|",
    ]

    def is_number(value):
        # bool is an int subclass but is never a metric value here.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    def row(label, key, fmt="{:.2f}", lower_better=False):
        """Format one table row; '-' cells when either side lacks the metric."""
        m, s = M.get(key), S.get(key)
        if m is None or s is None:
            return f"| {label} | - | - | - |"
        # Accept ints as well as floats: statistics.mean yields an int for
        # whole-number means of ints, which must still be formatted and diffed.
        if is_number(m) and is_number(s):
            # Relative difference of Multi-Agent against Single LLM, in percent.
            diff = (m - s) / s * 100 if s else 0
            arrow = "๐ป" if (diff < 0) == lower_better else "๐บ"
            return f"| {label} | {fmt.format(m)} | {fmt.format(s)} | {arrow} {diff:+.1f}% |"
        return f"| {label} | {m} | {s} | - |"

    lines.append(row("์คํค๋ง ๋ถํฉ๋ฅ ", "schema_compliance", "{:.0%}"))
    lines.append(row("Latency ํ๊ท (s)", "latency_mean", "{:.1f}s", lower_better=True))
    lines.append(row("Input tokens (avg)", "tokens_in_mean", "{:.0f}", lower_better=True))
    lines.append(row("Output tokens (avg)", "tokens_out_mean", "{:.0f}", lower_better=True))
    lines.append(row("Cost/run ($)", "cost_mean", "${:.4f}", lower_better=True))
    lines.append(row("Citation ์ ํ๋", "cite_acc_mean", "{:.0%}"))
    lines.append(row("์์ธ ์ผ๊ด์ฑ (Jaccard)", "causes_jaccard", "{:.2f}"))
    lines.append(row("Yield_loss ฯ", "yield_loss_std", "{:.2f}", lower_better=True))
    lines.append(row("์ฆ์ ์กฐ์น ์ (avg)", "n_imm_mean", "{:.1f}"))
    lines.append(row("์ค์ฅ๊ธฐ ์กฐ์น ์ (avg)", "n_lng_mean", "{:.1f}"))
    lines.append(row("์์ธ ๊ธฐ์ฌ๋ ํฉ (avg)", "pct_sum_mean", "{:.1f}"))

    # NOTE(review): the interpretation and conclusion below are fixed text and
    # are not derived from M or S — confirm they still match the measured numbers.
    lines += [
        "",
        "",
        "",
        "## ํด์",
        "",
        "- **ํ์ง**: citation ์ ํ๋, ์์ธ ์ผ๊ด์ฑ(Jaccard) ์ฐจ์ด๊ฐ Multi-Agent์ ๋ถ๋ฆฌยท์ ๋ฌธํ ํจ๊ณผ๋ฅผ ๋ณด์ฌ์ค",
        "- **๋น์ฉ**: Multi-Agent๊ฐ LLM์ 3ํ ํธ์ถํ๋ฏ๋ก ๋น์ฉยทlatency๊ฐ ํฌ์ง๋ง, ๊ฐ ํธ์ถ์ด ์งง์ system prompt + ์ข์ RAG context๋ผ ํจ์จ์ ",
        "- **trade-off**: Single LLM์ 1ํ ํธ์ถ๋ก ๋น ๋ฅด๋ ์๋ต ์ผ๊ด์ฑยท์ธ์ฉ ์ ํ๋์์ ์ด์",
        "",
        "## ๊ฒฐ๋ก ",
        "",
        "**Multi-Agent ์ฑํ**. ๋น์ฉ ์ฆ๊ฐ๋ ์๋๋น ์ฝ 2~3๋ฐฐ์ง๋ง ์ ๋๊ฐ์ด $0.01 ์์ค์ด๋ผ ๋ฌด์ ๊ฐ๋ฅํ๊ณ , "
        "๊ตฌ์กฐํยท์ผ๊ด์ฑยทํ์ฅ์ฑ(์์ด์ ํธ ์ถ๊ฐ) ๋ฉด์์ ์๋์ ์ผ๋ก ์ ๋ฆฌ. ๋์ ๋ถ๊ธฐยท์ฌ์๋๊ฐ ํ์ํด์ง๋ฉด "
        "LangGraph๋ก ์์ฐ์ค๋ฝ๊ฒ ํ์ฅ ๊ฐ๋ฅํ ๊ตฌ์กฐ.",
        "",
    ]

    (OUT_DIR / "results.md").write_text("\n".join(lines), encoding="utf-8")
|
|
|
|
def _plot_compare(M, S):
    """Save a two-panel bar chart of mean latency and mean cost per run.

    Args:
        M: Aggregate dict of the multi-agent method.
        S: Aggregate dict of the single-LLM method.

    A missing ``latency_mean`` or ``cost_mean`` is drawn as 0. The image is
    written to ``PLOTS_DIR / "cost_latency.png"``; the directory must exist.
    """
    methods = ["Multi-Agent", "Single LLM"]
    colors = ["#2C5AB8", "#9AA3B2"]
    # (aggregate key, y-axis label, panel title, bar label format)
    panels = [
        ("latency_mean", "Latency (s)", "Average Latency", "{:.1f}s"),
        ("cost_mean", "Cost per run ($)", "Average Cost", "${:.4f}"),
    ]

    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    try:
        for ax, (key, ylabel, title, label_fmt) in zip(axes, panels):
            values = [M.get(key, 0), S.get(key, 0)]
            ax.bar(methods, values, color=colors)
            ax.set_ylabel(ylabel)
            ax.set_title(title)
            # Print each bar's value just above it.
            for i, v in enumerate(values):
                ax.text(i, v, label_fmt.format(v), ha="center", va="bottom")

        fig.tight_layout()
        fig.savefig(PLOTS_DIR / "cost_latency.png", dpi=120)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
|
|
|
|
# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|