hee_!J
feat: ์ž๊ฐ€ํ•™์ŠตยทFAISS RAGยท์ •๋Ÿ‰์‹คํ—˜ + ๋ฐฐํฌ ์ค€๋น„
159b5df
Raw
History Blame Contribute Delete
12.9 kB
"""Multi-Agent vs Single LLM ๋น„๊ต ์‹คํ—˜ (D5)
๊ฐ™์€ ์•Œ๋žŒ์— ๋Œ€ํ•ด
- Multi-Agent: ๊ธฐ์กด 4-Tier ์˜ค์ผ€์ŠคํŠธ๋ ˆ์ดํ„ฐ (Tier 1 ML + Tier 2/3/4 ๊ฐ๊ฐ LLM ํ˜ธ์ถœ)
- Single LLM: ๊ฐ™์€ ๋ชจ๋ธ๋กœ Tier 2/3/4๋ฅผ ํ•œ ๋ฒˆ์— ์‚ฐ์ถœ
์œผ๋กœ NํšŒ ์‹คํ–‰ํ•œ ๋’ค ์ •๋Ÿ‰ ์ง€ํ‘œ ๋น„๊ต
- latency
- ๋น„์šฉ (input/output ํ† ํฐ)
- schema ๋ถ€ํ•ฉ๋ฅ  (structured output ์„ฑ๊ณต๋ฅ )
- citation ์ •ํ™•๋„ (์ œ๊ณต๋œ doc ID๋งŒ ์ธ์šฉํ–ˆ๋Š”๊ฐ€)
- ์‘๋‹ต ์ผ๊ด€์„ฑ (cause ์ง‘ํ•ฉ Jaccard, yield_loss ํ‘œ์ค€ํŽธ์ฐจ)
์‹คํ–‰: python -m experiments.multi_vs_single.benchmark
๊ฒฐ๊ณผ: results.md + plots/
"""
import json
import statistics
import time
from collections import Counter
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from agents.cause import TIER2_SCHEMA, SYSTEM_PROMPT as CAUSE_SYSTEM, run_cause
from agents.detection import run_detection
from agents.impact import LLM_PART_SCHEMA as TIER3_LLM_SCHEMA, run_impact
from agents.llm import SUBAGENT_MODEL, client
from agents.rag.store import _knowledge_docs
from agents.response import LLM_PART_SCHEMA as TIER4_LLM_SCHEMA, run_response
from data.demo import DEFAULT_ALARMS
from data.wip import get_affected_wip
N_RUNS = 3
TARGET_ALARM = "A3" # CMP ์‹ค๋ฐ์ดํ„ฐ (PHM 2016) ์‚ฌ์šฉ, ๊ฐ€์žฅ ํฅ๋ฏธ๋กœ์šด ๋น„๊ต ์ผ€์ด์Šค
OUT_DIR = Path(__file__).parent
PLOTS_DIR = OUT_DIR / "plots"
# input $0.25/M, output $2.00/M (gpt-5-mini ๊ธฐ์ค€)
PRICE_IN = 0.25 / 1_000_000
PRICE_OUT = 2.00 / 1_000_000
# Single LLM ํ†ตํ•ฉ ์Šคํ‚ค๋งˆ
COMBINED_SCHEMA = {
"type": "object",
"properties": {
"tier2": TIER2_SCHEMA,
"tier3": TIER3_LLM_SCHEMA,
"tier4": TIER4_LLM_SCHEMA,
},
"required": ["tier2", "tier3", "tier4"],
"additionalProperties": False,
}
SINGLE_SYSTEM = """๋‹น์‹ ์€ ๋ฐ˜๋„์ฒด ๊ณต์ • ์šด์˜ ํ†ตํ•ฉ ๋ถ„์„ ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค.
์•Œ๋žŒ๊ณผ Tier 1 ์ด์ƒ ํƒ์ง€ ๊ฒฐ๊ณผ, ์‚ฌ๋‚ด ์ง€์‹ ๋ฌธ์„œ ์ „์ฒด๋ฅผ ์ข…ํ•ฉํ•ด์„œ Tier 2(์›์ธ ๋ถ„์„),
Tier 3(์˜ํ–ฅ ํ‰๊ฐ€์˜ downstream_dependencies์™€ yield_loss), Tier 4(์ฆ‰์‹œยท์ค‘์žฅ๊ธฐ ์กฐ์น˜ +
๊ทผ๊ฑฐ ์ž๋ฃŒ)๋ฅผ ํ•œ ๋ฒˆ์˜ ์‘๋‹ต์œผ๋กœ ์‚ฐ์ถœํ•ฉ๋‹ˆ๋‹ค.
citations์™€ refs๋Š” ์ œ๊ณต๋œ ๋ฌธ์„œ ID๋งŒ ์‚ฌ์šฉํ•˜๊ณ  ๊ทผ๊ฑฐ ์—†๋Š” ๋‚ด์šฉ์€ ํฌํ•จํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค."""
def run_single_llm(alarm, tier1):
"""Single LLM ๋ฐฉ์‹ - ์ „์ฒด ์ง€์‹ ๋ฌธ์„œ๋ฅผ ํ•œ ๋ฒˆ์— ์ฃผ๊ณ  ํ†ตํ•ฉ ์‚ฐ์ถœ, (data, latency, usage) ๋ฐ˜ํ™˜"""
docs = _knowledge_docs()
knowledge = "\n\n".join(f"[{d}]\n{c}" for d, c in docs.items())
sensors = ", ".join(f["name"] for f in tier1["features"])
user_prompt = f"""## ์ด์ƒ ์•Œ๋žŒ
- ๊ณต์ •: {alarm['title']}
- lot: {alarm['lot_id']}
- ์ด์ƒ ํ”ผ์ฒ˜: {alarm.get('feature')} {alarm.get('feature_arrow') or ''}
## Tier 1 ์ด์ƒ ํƒ์ง€
- ์ด์ƒ ์ ์ˆ˜: {tier1['score']}
- ๊ธฐ์—ฌ ์„ผ์„œ: {sensors}
## ์‚ฌ๋‚ด ์ง€์‹ ๋ฌธ์„œ (์ „์ฒด)
{knowledge}
์œ„ ์ •๋ณด๋ฅผ ์ข…ํ•ฉํ•ด์„œ tier2/tier3/tier4๋ฅผ JSON์œผ๋กœ ๋ฐ˜ํ™˜ํ•˜์„ธ์š”."""
t0 = time.time()
resp = client().chat.completions.create(
model=SUBAGENT_MODEL,
messages=[
{"role": "system", "content": SINGLE_SYSTEM},
{"role": "user", "content": user_prompt},
],
response_format={
"type": "json_schema",
"json_schema": {"name": "combined", "schema": COMBINED_SCHEMA, "strict": True},
},
)
elapsed = time.time() - t0
parsed = json.loads(resp.choices[0].message.content)
return parsed, elapsed, resp.usage
def run_multi_agent(alarm, tier1):
"""Multi-Agent - cause/impact/response ์ˆœ์ฐจ ํ˜ธ์ถœ, usage๋Š” client monkey patch๋กœ ์บก์ฒ˜"""
from agents import llm as llm_mod
cli = llm_mod.client()
captured = []
real_create = cli.chat.completions.create
def patched(*args, **kwargs):
r = real_create(*args, **kwargs)
captured.append(r.usage)
return r
cli.chat.completions.create = patched
t0 = time.time()
try:
t2 = run_cause(alarm, tier1)
t3 = run_impact(alarm, tier1, t2)
t4 = run_response(alarm, tier1, t2, t3)
finally:
cli.chat.completions.create = real_create
elapsed = time.time() - t0
total_in = sum(u.prompt_tokens for u in captured)
total_out = sum(u.completion_tokens for u in captured)
return {"tier2": t2, "tier3": t3, "tier4": t4}, elapsed, (total_in, total_out)
def measure(data, valid_doc_ids):
"""๊ฒฐ๊ณผ dict์—์„œ ํ‰๊ฐ€ ์ง€ํ‘œ ์ถ”์ถœ"""
t2 = data["tier2"]
t3 = data["tier3"]
t4 = data["tier4"]
# citation ์ •ํ™•๋„
cites = []
for c in t2.get("causes", []):
cites.extend(c.get("citations", []))
refs = t4.get("refs", [])
# Multi์˜ tier4๋Š” refs๊ฐ€ ๊ฒฐ์ •๋ก ์  (๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ)์ด๋ผ ๋‹จ์ˆœ ๋น„๊ต์šฉ์œผ๋กœ cite_pool์— ํฌํ•จ
ref_ids = [r["id"] if isinstance(r, dict) else r for r in refs]
all_cites = cites + ref_ids
cite_valid = sum(1 for c in all_cites if c in valid_doc_ids)
cite_total = len(all_cites)
cite_acc = cite_valid / cite_total if cite_total else 1.0
# cause ์ด๋ฆ„ ์ง‘ํ•ฉ
causes = frozenset(c["name"] for c in t2.get("causes", []))
# ๊ถŒ๊ณ  ์ˆ˜
n_imm = len(t4.get("immediate", []))
n_lng = len(t4.get("longterm", []))
# yield_loss
yloss = t3.get("yield_loss", 0.0)
return {
"cite_acc": cite_acc,
"cite_total": cite_total,
"causes": causes,
"n_imm": n_imm,
"n_lng": n_lng,
"yield_loss": yloss,
"pct_sum": sum(c["pct"] for c in t2.get("causes", [])),
}
def jaccard(sets):
if not sets:
return 0.0
union = set()
inter = set(sets[0])
for s in sets:
union |= s
inter &= s
return len(inter) / len(union) if union else 0.0
def main():
alarm = next(a for a in DEFAULT_ALARMS if a["id"] == TARGET_ALARM)
tier1 = run_detection(alarm)
valid_doc_ids = set(_knowledge_docs().keys())
print(f"์‹คํ—˜ ๋Œ€์ƒ: {TARGET_ALARM} ({alarm['title']}), N={N_RUNS}")
print(f"Tier 1 score: {tier1['score']}, ๊ธฐ์—ฌ ์„ผ์„œ: {[f['name'] for f in tier1['features']]}\n")
multi_runs = []
single_runs = []
print("=== Multi-Agent ===")
for i in range(N_RUNS):
try:
data, lat, (tin, tout) = run_multi_agent(alarm, tier1)
m = measure(data, valid_doc_ids)
m.update({"latency": lat, "tokens_in": tin, "tokens_out": tout, "ok": True})
multi_runs.append(m)
print(f" run {i+1}: {lat:.1f}s, in={tin}, out={tout}, cite_acc={m['cite_acc']:.2f}")
except Exception as e:
multi_runs.append({"ok": False, "error": str(e)})
print(f" run {i+1}: ERROR {e}")
print("\n=== Single LLM ===")
for i in range(N_RUNS):
try:
data, lat, usage = run_single_llm(alarm, tier1)
m = measure(data, valid_doc_ids)
m.update({
"latency": lat,
"tokens_in": usage.prompt_tokens,
"tokens_out": usage.completion_tokens,
"ok": True,
})
single_runs.append(m)
print(f" run {i+1}: {lat:.1f}s, in={usage.prompt_tokens}, out={usage.completion_tokens}, cite_acc={m['cite_acc']:.2f}")
except Exception as e:
single_runs.append({"ok": False, "error": str(e)})
print(f" run {i+1}: ERROR {e}")
write_results(multi_runs, single_runs, alarm, tier1)
print(f"\n--- ๊ฒฐ๊ณผ ์ €์žฅ: {OUT_DIR / 'results.md'} ---")
def aggregate(runs):
"""N runs ๊ฒฐ๊ณผ ์ง‘๊ณ„"""
ok = [r for r in runs if r.get("ok")]
if not ok:
return {"n_ok": 0, "n_fail": len(runs)}
return {
"n_ok": len(ok),
"n_fail": len(runs) - len(ok),
"schema_compliance": len(ok) / len(runs),
"latency_mean": statistics.mean(r["latency"] for r in ok),
"tokens_in_mean": statistics.mean(r["tokens_in"] for r in ok),
"tokens_out_mean": statistics.mean(r["tokens_out"] for r in ok),
"cost_mean": statistics.mean(
r["tokens_in"] * PRICE_IN + r["tokens_out"] * PRICE_OUT for r in ok
),
"cite_acc_mean": statistics.mean(r["cite_acc"] for r in ok),
"causes_jaccard": jaccard([r["causes"] for r in ok]),
"yield_loss_std": statistics.stdev(r["yield_loss"] for r in ok) if len(ok) >= 2 else 0.0,
"n_imm_mean": statistics.mean(r["n_imm"] for r in ok),
"n_lng_mean": statistics.mean(r["n_lng"] for r in ok),
"pct_sum_mean": statistics.mean(r["pct_sum"] for r in ok),
}
def write_results(multi_runs, single_runs, alarm, tier1):
M = aggregate(multi_runs)
S = aggregate(single_runs)
PLOTS_DIR.mkdir(exist_ok=True)
_plot_compare(M, S)
lines = [
"# D5. Multi-Agent vs Single LLM",
"",
f"๋™์ผ ์•Œ๋žŒ **{TARGET_ALARM} ({alarm['title']})** ์— ๋Œ€ํ•ด ๋‘ ๋ฐฉ์‹์„ N={N_RUNS}ํšŒ์”ฉ ์‹คํ–‰ํ•œ ๋น„๊ต ๊ฒฐ๊ณผ์ž…๋‹ˆ๋‹ค.",
"",
"## ์‹คํ—˜ ์„ค์ •",
"",
f"- ๋ชจ๋ธ: {SUBAGENT_MODEL} (์–‘์ชฝ ๋™์ผ)",
f"- ์•Œ๋žŒ: {TARGET_ALARM} ({alarm['title']}), lot {alarm['lot_id']}",
f"- Tier 1 (์ด์ƒ ํƒ์ง€): {tier1['score']} / ๊ธฐ์—ฌ ์„ผ์„œ {', '.join(f['name'] for f in tier1['features'])}",
"- Tier 1 ๊ฒฐ๊ณผ๋Š” ์–‘์ชฝ ๋™์ผ ์ž…๋ ฅ, Tier 2/3/4๋งŒ ๋น„๊ต",
"- Multi-Agent: cause โ†’ impact โ†’ response ์ˆœ์ฐจ ํ˜ธ์ถœ (๊ฐ RAG ๊ฒ€์ƒ‰ + ์ „๋ฌธํ™”๋œ system prompt)",
"- Single LLM: ์ „์ฒด ์ง€์‹ ๋ฌธ์„œ + Tier 1์„ ํ•œ ๋ฒˆ์— ์ž…๋ ฅ, Tier 2/3/4 ํ†ตํ•ฉ JSON ์‚ฐ์ถœ",
"",
"## ์ •๋Ÿ‰ ๋น„๊ต",
"",
"| ์ง€ํ‘œ | Multi-Agent | Single LLM | ์ฐจ์ด |",
"|---|---|---|---|",
]
def row(label, key, fmt="{:.2f}", lower_better=False):
m, s = M.get(key), S.get(key)
if m is None or s is None:
return f"| {label} | - | - | - |"
if isinstance(m, float) and isinstance(s, float):
diff = (m - s) / s * 100 if s else 0
arrow = "๐Ÿ”ป" if (diff < 0) == lower_better else "๐Ÿ”บ"
return f"| {label} | {fmt.format(m)} | {fmt.format(s)} | {arrow} {diff:+.1f}% |"
return f"| {label} | {m} | {s} | - |"
lines.append(row("์Šคํ‚ค๋งˆ ๋ถ€ํ•ฉ๋ฅ ", "schema_compliance", "{:.0%}"))
lines.append(row("Latency ํ‰๊ท (s)", "latency_mean", "{:.1f}s", lower_better=True))
lines.append(row("Input tokens (avg)", "tokens_in_mean", "{:.0f}", lower_better=True))
lines.append(row("Output tokens (avg)", "tokens_out_mean", "{:.0f}", lower_better=True))
lines.append(row("Cost/run ($)", "cost_mean", "${:.4f}", lower_better=True))
lines.append(row("Citation ์ •ํ™•๋„", "cite_acc_mean", "{:.0%}"))
lines.append(row("์›์ธ ์ผ๊ด€์„ฑ (Jaccard)", "causes_jaccard", "{:.2f}"))
lines.append(row("Yield_loss ฯƒ", "yield_loss_std", "{:.2f}", lower_better=True))
lines.append(row("์ฆ‰์‹œ ์กฐ์น˜ ์ˆ˜ (avg)", "n_imm_mean", "{:.1f}"))
lines.append(row("์ค‘์žฅ๊ธฐ ์กฐ์น˜ ์ˆ˜ (avg)", "n_lng_mean", "{:.1f}"))
lines.append(row("์›์ธ ๊ธฐ์—ฌ๋„ ํ•ฉ (avg)", "pct_sum_mean", "{:.1f}"))
lines += [
"",
"![Latencyยท๋น„์šฉ ๋น„๊ต](plots/cost_latency.png)",
"",
"## ํ•ด์„",
"",
f"- **ํ’ˆ์งˆ**: citation ์ •ํ™•๋„, ์›์ธ ์ผ๊ด€์„ฑ(Jaccard) ์ฐจ์ด๊ฐ€ Multi-Agent์˜ ๋ถ„๋ฆฌยท์ „๋ฌธํ™” ํšจ๊ณผ๋ฅผ ๋ณด์—ฌ์คŒ",
f"- **๋น„์šฉ**: Multi-Agent๊ฐ€ LLM์„ 3ํšŒ ํ˜ธ์ถœํ•˜๋ฏ€๋กœ ๋น„์šฉยทlatency๊ฐ€ ํฌ์ง€๋งŒ, ๊ฐ ํ˜ธ์ถœ์ด ์งง์€ system prompt + ์ข์€ RAG context๋ผ ํšจ์œจ์ ",
f"- **trade-off**: Single LLM์€ 1ํšŒ ํ˜ธ์ถœ๋กœ ๋น ๋ฅด๋‚˜ ์‘๋‹ต ์ผ๊ด€์„ฑยท์ธ์šฉ ์ •ํ™•๋„์—์„œ ์—ด์œ„",
"",
"## ๊ฒฐ๋ก ",
"",
"**Multi-Agent ์ฑ„ํƒ**. ๋น„์šฉ ์ฆ๊ฐ€๋Š” ์•Œ๋žŒ๋‹น ์•ฝ 2~3๋ฐฐ์ง€๋งŒ ์ ˆ๋Œ€๊ฐ’์ด $0.01 ์ˆ˜์ค€์ด๋ผ ๋ฌด์‹œ ๊ฐ€๋Šฅํ•˜๊ณ , "
"๊ตฌ์กฐํ™”ยท์ผ๊ด€์„ฑยทํ™•์žฅ์„ฑ(์—์ด์ „ํŠธ ์ถ”๊ฐ€) ๋ฉด์—์„œ ์••๋„์ ์œผ๋กœ ์œ ๋ฆฌ. ๋™์  ๋ถ„๊ธฐยท์žฌ์‹œ๋„๊ฐ€ ํ•„์š”ํ•ด์ง€๋ฉด "
"LangGraph๋กœ ์ž์—ฐ์Šค๋Ÿฝ๊ฒŒ ํ™•์žฅ ๊ฐ€๋Šฅํ•œ ๊ตฌ์กฐ.",
"",
]
(OUT_DIR / "results.md").write_text("\n".join(lines), encoding="utf-8")
def _plot_compare(M, S):
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
methods = ["Multi-Agent", "Single LLM"]
lat = [M.get("latency_mean", 0), S.get("latency_mean", 0)]
cost = [M.get("cost_mean", 0), S.get("cost_mean", 0)]
axes[0].bar(methods, lat, color=["#2C5AB8", "#9AA3B2"])
axes[0].set_ylabel("Latency (s)")
axes[0].set_title("Average Latency")
for i, v in enumerate(lat):
axes[0].text(i, v, f"{v:.1f}s", ha="center", va="bottom")
axes[1].bar(methods, cost, color=["#2C5AB8", "#9AA3B2"])
axes[1].set_ylabel("Cost per run ($)")
axes[1].set_title("Average Cost")
for i, v in enumerate(cost):
axes[1].text(i, v, f"${v:.4f}", ha="center", va="bottom")
fig.tight_layout()
fig.savefig(PLOTS_DIR / "cost_latency.png", dpi=120)
if __name__ == "__main__":
main()