umyunsang committed on
Commit
201e800
·
verified ·
1 Parent(s): 671d971

feat: E2E 법률 4카테고리 시나리오 추가 (민사/형사/지식재산/판례)

Browse files
Files changed (1) hide show
  1. scripts/verify_e2e_tool_calling.py +210 -7
scripts/verify_e2e_tool_calling.py CHANGED
@@ -8,7 +8,7 @@ HuggingFace Space에 배포된 govon-runtime 서버에 대해
8
  GOVON_RUNTIME_URL=https://<space-url>.hf.space python3 scripts/verify_e2e_tool_calling.py
9
  GOVON_RUNTIME_URL=https://<space-url>.hf.space API_KEY=<key> python3 scripts/verify_e2e_tool_calling.py
10
 
11
- 5-Phase 검증 (13 시나리오):
12
  Phase 1: Infrastructure (hard gate)
13
  1. Health & Profile
14
  2. Base Model Generation
@@ -17,6 +17,10 @@ HuggingFace Space에 배포된 govon-runtime 서버에 대해
17
  4. Planner Produces Valid Plan
18
  5. Civil LoRA Draft Response
19
  6. Legal LoRA Evidence Augmentation (depends on 5)
 
 
 
 
20
  7. Task Type Classification
21
  Phase 3: data.go.kr API Tools (soft gate)
22
  8. External API Tool Invocation (4 sub-cases)
@@ -110,11 +114,13 @@ def _save_intermediate_results() -> None:
110
  },
111
  "scenarios": _results,
112
  }
 
113
  try:
114
- with open(RESULTS_PATH, "w", encoding="utf-8") as f:
115
  json.dump(output, f, ensure_ascii=False, indent=2)
116
- except Exception:
117
- pass
 
118
 
119
 
120
  _observed_tools: set[str] = set()
@@ -354,7 +360,7 @@ async def _call_agent_with_approval(
354
  "tool_args": {},
355
  }
356
 
357
- logger.info(f"[Agent] 요청: session={session_id}, query={query[:60]}...")
358
 
359
  # --- SSE 스트리밍 시도 ---
360
  try:
@@ -439,9 +445,16 @@ async def _call_agent_with_approval(
439
  meta["tool_args"] = awaiting["tool_args"]
440
 
441
  thread_id = awaiting.get("thread_id") or session_id
442
- logger.info(f" [Approval] planned_tools={meta['planned_tools']}")
 
 
 
 
 
443
  logger.info(
444
- f" [Approval] adapter_mode={meta['adapter_mode']}, tool_args={meta['tool_args']}"
 
 
445
  )
446
  logger.info(
447
  f" [Approval] {'승인' if approve else '거절'} 요청 → thread_id={thread_id}"
@@ -1089,6 +1102,186 @@ async def scenario6_legal_lora_evidence() -> dict:
1089
  )
1090
 
1091
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1092
  async def scenario7_task_type_classification() -> dict:
1093
  """Scenario 7: Task Type Classification (at least 2/3 correct)."""
1094
  test_cases = [
@@ -1685,6 +1878,16 @@ async def main() -> int:
1685
  for fn in phase2_scenarios:
1686
  await fn()
1687
 
 
 
 
 
 
 
 
 
 
 
1688
  # ===== Phase 3: data.go.kr API Tools (soft gate) =====
1689
  logger.info("\n[Phase 3] data.go.kr API Tools (soft gate)")
1690
  logger.info("-" * 40)
 
8
  GOVON_RUNTIME_URL=https://<space-url>.hf.space python3 scripts/verify_e2e_tool_calling.py
9
  GOVON_RUNTIME_URL=https://<space-url>.hf.space API_KEY=<key> python3 scripts/verify_e2e_tool_calling.py
10
 
11
+ 5-Phase 검증 (16 시나리오):
12
  Phase 1: Infrastructure (hard gate)
13
  1. Health & Profile
14
  2. Base Model Generation
 
17
  4. Planner Produces Valid Plan
18
  5. Civil LoRA Draft Response
19
  6. Legal LoRA Evidence Augmentation (depends on 5)
20
+ 6a. Legal LoRA — 민사법 (Civil Law)
21
+ 6b. Legal LoRA — 형사법 (Criminal Law)
22
+ 6c. Legal LoRA — 지식재산권 (IP)
23
+ 6d. Legal LoRA — 판례 해석 (Precedent)
24
  7. Task Type Classification
25
  Phase 3: data.go.kr API Tools (soft gate)
26
  8. External API Tool Invocation (4 sub-cases)
 
114
  },
115
  "scenarios": _results,
116
  }
117
+ tmp_path = f"{RESULTS_PATH}.tmp"
118
  try:
119
+ with open(tmp_path, "w", encoding="utf-8") as f:
120
  json.dump(output, f, ensure_ascii=False, indent=2)
121
+ os.replace(tmp_path, RESULTS_PATH)
122
+ except Exception as exc:
123
+ logger.warning("중간 결과 저장 실패: %s", exc)
124
 
125
 
126
  _observed_tools: set[str] = set()
 
360
  "tool_args": {},
361
  }
362
 
363
+ logger.info("[Agent] 요청: session=%s, query_len=%d", session_id, len(query))
364
 
365
  # --- SSE 스트리밍 시도 ---
366
  try:
 
445
  meta["tool_args"] = awaiting["tool_args"]
446
 
447
  thread_id = awaiting.get("thread_id") or session_id
448
+ logger.info(" [Approval] planned_tools=%s", meta["planned_tools"])
449
+ tool_arg_keys = (
450
+ sorted(meta["tool_args"].keys())
451
+ if isinstance(meta["tool_args"], dict)
452
+ else str(type(meta["tool_args"]).__name__)
453
+ )
454
  logger.info(
455
+ " [Approval] adapter_mode=%s, tool_arg_keys=%s",
456
+ meta["adapter_mode"],
457
+ tool_arg_keys,
458
  )
459
  logger.info(
460
  f" [Approval] {'승인' if approve else '거절'} 요청 → thread_id={thread_id}"
 
1102
  )
1103
 
1104
 
1105
# ---------------------------------------------------------------------------
# Regex patterns per Legal LoRA category
# ---------------------------------------------------------------------------
# Each list holds regular expressions searched in the model's reply; a reply
# "matches" a category when at least one of its patterns is found.

# Statute article citation such as "제 618 조" / "제618조". Shared by several
# categories, so on its own it does not discriminate between them.
_ARTICLE_CITATION_PATTERN = r"제\s*\d+\s*조"

# Civil law: the Civil Act, leases, contracts, damages, claims and debts.
CIVIL_LAW_PATTERNS: list[str] = [
    r"민법",
    _ARTICLE_CITATION_PATTERN,
    r"임대차",
    r"계약",
    r"손해배상",
    r"채권",
    r"채무",
]

# Criminal law: the Criminal Act, punishment, fines, imprisonment, and
# "...보호법" (protection acts such as the Personal Information Protection Act).
CRIMINAL_LAW_PATTERNS: list[str] = [
    r"형법",
    r"형사",
    r"처벌",
    r"벌금",
    r"징역",
    r"보호법",
    _ARTICLE_CITATION_PATTERN,
]

# Intellectual property: trademark / patent / copyright terms and infringement.
IP_PATTERNS: list[str] = [
    r"상표법",
    r"특허법",
    r"저작권",
    r"지식재산",
    _ARTICLE_CITATION_PATTERN,
    r"침해",
]

# Precedent: Supreme Court, case law, rulings, plus a case-number shape like
# "2015다12345".
# NOTE(review): the case-number pattern only accepts the "다"/"나" case codes;
# confirm whether other codes (e.g. criminal or administrative) should match.
PRECEDENT_PATTERNS: list[str] = [
    r"대법원",
    r"판례",
    r"판결",
    r"선고",
    r"\d{4}\s*[다나]\s*\d+",
]
1141
+
1142
+
1143
async def _legal_category_scenario(
    scenario_id: int,
    name: str,
    civil_query: str,
    legal_followup: str,
    patterns: list[str],
) -> dict:
    """Run one two-step Legal LoRA category scenario and record the outcome.

    Step 1 sends ``civil_query`` so that the session has a civil draft to
    build on. Step 2 sends ``legal_followup`` on the same session and
    inspects the reply.

    The scenario passes when the follow-up reply is longer than 30
    characters. Regex hits against ``patterns`` and the presence of
    ``append_evidence`` among the planned tools are reported only as
    assertions/warnings; they do not affect pass/fail.

    Args:
        scenario_id: Identifier used for the session id and the result record.
        name: Human-readable scenario name stored in the result record.
        civil_query: First request, used to create the session context.
        legal_followup: Second request asking for legal grounds to be added.
        patterns: Regular expressions searched in the follow-up reply.

    Returns:
        The value returned by ``_record`` for this scenario. Failures,
        including unexpected exceptions, are recorded as ``"failed"``
        rather than raised.
    """
    t0 = time.monotonic()
    session_id = _session_id(scenario_id)

    # The literal 2 passed to _record below is its third positional argument
    # (presumably the phase number — confirm against _record's signature).
    try:
        # Step 1: civil draft; its reply is discarded, only success matters.
        ok_civil, _, _, err_civil = await _call_agent_with_approval(
            query=civil_query,
            session_id=session_id,
        )
        if not ok_civil:
            elapsed = time.monotonic() - t0
            return _record(
                scenario_id,
                name,
                2,
                "failed",
                elapsed,
                error=f"civil 선행 실패: {err_civil}",
            )

        # Step 2: legal follow-up on the same session.
        ok, text, meta, err = await _call_agent_with_approval(
            query=legal_followup,
            session_id=session_id,
        )
        elapsed = time.monotonic() - t0

        if not ok:
            return _record(
                scenario_id,
                name,
                2,
                "failed",
                elapsed,
                error=err,
                detail={"meta": meta},
            )

        # Normalise a missing reply to "" so the regex search cannot raise
        # TypeError; an empty reply is then reported as "응답 텍스트 부족"
        # with full detail instead of an opaque exception message.
        reply = text or ""
        matched = [p for p in patterns if re.search(p, reply)]

        assertions: list[str] = []
        warnings: list[str] = []

        # `or []` also covers the key being present with a None value.
        planned = meta.get("planned_tools") or []
        if planned:
            _observed_tools.update(planned)

        if "append_evidence" in planned:
            assertions.append("append_evidence in planned_tools")
        else:
            warnings.append("append_evidence not in planned_tools")

        if matched:
            assertions.append(f"법령 패턴 발견: {matched[:3]}")
        else:
            warnings.append("법령 패턴 미발견")

        # Pass/fail depends on reply length only (soft checks are above).
        passed = len(reply) > 30
        return _record(
            scenario_id,
            name,
            2,
            "passed" if passed else "failed",
            elapsed,
            assertions=assertions,
            warnings=warnings,
            error=None if passed else "응답 텍스트 부족",
            detail={
                "text_preview": reply[:200],
                "matched_patterns": matched,
                "meta": meta,
            },
        )
    except Exception as exc:
        # Some exceptions stringify to "" (e.g. a bare timeout); fall back to
        # the class name so the recorded failure always names a cause.
        return _record(
            scenario_id,
            name,
            2,
            "failed",
            time.monotonic() - t0,
            error=str(exc) or type(exc).__name__,
        )
+
1240
+
1241
async def scenario6a_legal_civil_law() -> dict:
    """Scenario 6a: Legal LoRA — civil-law query.

    Runs the shared two-step legal scenario with id 61, a lease-contract
    draft request, and the ``CIVIL_LAW_PATTERNS`` regex list.
    """
    draft_request = "임대차 계약에서 임대인의 수선의무 범위와 임차인의 권리에 대해 답변을 작성해주세요"
    evidence_request = "위 답변에 관련 법령 조항을 인용하여 법적 근거를 보강해주세요"
    outcome = await _legal_category_scenario(
        scenario_id=61,
        name="Legal LoRA — 민사법 (Civil Law)",
        civil_query=draft_request,
        legal_followup=evidence_request,
        patterns=CIVIL_LAW_PATTERNS,
    )
    return outcome
1250
+
1251
+
1252
async def scenario6b_legal_criminal_law() -> dict:
    """Scenario 6b: Legal LoRA — criminal-law query.

    Runs the shared two-step legal scenario with id 62, a draft request about
    criminal penalties under the Personal Information Protection Act, and the
    ``CRIMINAL_LAW_PATTERNS`` regex list.
    """
    draft_request = "개인정보보호법 위반 시 형사처벌 기준과 관련 법률 조항에 대해 답변을 작성해주세요"
    evidence_request = "위 답변에 관련 법령 조항을 인용하여 법적 근거를 보강해주세요"
    outcome = await _legal_category_scenario(
        scenario_id=62,
        name="Legal LoRA — 형사법 (Criminal Law)",
        civil_query=draft_request,
        legal_followup=evidence_request,
        patterns=CRIMINAL_LAW_PATTERNS,
    )
    return outcome
1261
+
1262
+
1263
async def scenario6c_legal_ip() -> dict:
    """Scenario 6c: Legal LoRA — intellectual-property query.

    Runs the shared two-step legal scenario with id 63, a trademark
    infringement draft request, and the ``IP_PATTERNS`` regex list.
    """
    draft_request = "상표권 침해 판단 기준과 구제 방법에 대해 답변을 작성해주세요"
    evidence_request = "위 답변에 상표법 조항을 인용하여 법적 근거를 보강해주세요"
    outcome = await _legal_category_scenario(
        scenario_id=63,
        name="Legal LoRA — 지식재산권 (IP)",
        civil_query=draft_request,
        legal_followup=evidence_request,
        patterns=IP_PATTERNS,
    )
    return outcome
1272
+
1273
+
1274
async def scenario6d_legal_precedent() -> dict:
    """Scenario 6d: Legal LoRA — precedent-interpretation query.

    Runs the shared two-step legal scenario with id 64, an unfair-dismissal
    draft request, and the ``PRECEDENT_PATTERNS`` regex list.
    """
    draft_request = "근로계약 해지 시 부당해고 여부를 판단하는 기준에 대해 답변을 작성해주세요"
    evidence_request = "위 답변에 대법원 판례의 기준을 인용하여 법적 근거를 보강해주세요"
    outcome = await _legal_category_scenario(
        scenario_id=64,
        name="Legal LoRA — 판례 해석 (Precedent)",
        civil_query=draft_request,
        legal_followup=evidence_request,
        patterns=PRECEDENT_PATTERNS,
    )
    return outcome
1283
+
1284
+
1285
  async def scenario7_task_type_classification() -> dict:
1286
  """Scenario 7: Task Type Classification (at least 2/3 correct)."""
1287
  test_cases = [
 
1878
  for fn in phase2_scenarios:
1879
  await fn()
1880
 
1881
+ # Legal LoRA 카테고리별 테스트 (4개: 민사법, 형사법, 지식재산권, 판례)
1882
+ legal_scenarios = [
1883
+ scenario6a_legal_civil_law,
1884
+ scenario6b_legal_criminal_law,
1885
+ scenario6c_legal_ip,
1886
+ scenario6d_legal_precedent,
1887
+ ]
1888
+ for legal_fn in legal_scenarios:
1889
+ await legal_fn()
1890
+
1891
  # ===== Phase 3: data.go.kr API Tools (soft gate) =====
1892
  logger.info("\n[Phase 3] data.go.kr API Tools (soft gate)")
1893
  logger.info("-" * 40)