小形克宏 committed on
Commit
5ba08e6
·
1 Parent(s): f614727

Fix: replace Dataframe with Markdown to avoid Gradio bug

Browse files
Files changed (1) hide show
  1. app.py +127 -251
app.py CHANGED
@@ -1,9 +1,6 @@
1
  """
2
  StructEval-T Analyzer
3
  松尾研LLM講義2025 メインコンペ用 推論結果分析ツール
4
-
5
- inference.json と public_150.json をアップロードして、
6
- フォーマット別のパース成功率やエラーパターンを分析します。
7
  """
8
 
9
  import json
@@ -18,32 +15,25 @@ import gradio as gr
18
  import pandas as pd
19
 
20
  # ---------------------------------------------------------------------------
21
- # 1. Syntax Validators (フォーマット別パーサー)
22
  # ---------------------------------------------------------------------------
23
 
24
- def validate_json(text: str) -> tuple[bool, str]:
25
- """JSON構文を検証"""
26
  try:
27
  json.loads(text)
28
  return True, ""
29
  except json.JSONDecodeError as e:
30
- return False, f"JSONDecodeError: {e.msg} (line {e.lineno}, col {e.colno})"
31
-
32
 
33
- def validate_yaml(text: str) -> tuple[bool, str]:
34
- """YAML構文を検証"""
35
  try:
36
  import yaml
37
  yaml.safe_load(text)
38
  return True, ""
39
- except yaml.YAMLError as e:
40
- return False, f"YAMLError: {e}"
41
  except Exception as e:
42
- return False, f"Error: {e}"
43
-
44
 
45
- def validate_toml(text: str) -> tuple[bool, str]:
46
- """TOML構文を検証"""
47
  try:
48
  import tomllib
49
  tomllib.loads(text)
@@ -51,37 +41,29 @@ def validate_toml(text: str) -> tuple[bool, str]:
51
  except Exception as e:
52
  return False, f"TOMLError: {e}"
53
 
54
-
55
- def validate_xml(text: str) -> tuple[bool, str]:
56
- """XML構文を検証"""
57
  try:
58
  import xml.etree.ElementTree as ET
59
  ET.fromstring(text)
60
  return True, ""
61
- except ET.ParseError as e:
62
- return False, f"XMLParseError: {e}"
63
  except Exception as e:
64
- return False, f"Error: {e}"
65
-
66
 
67
- def validate_csv(text: str) -> tuple[bool, str]:
68
- """CSV構文を検証"""
69
  try:
70
  reader = csv.reader(io.StringIO(text))
71
  rows = list(reader)
72
  if len(rows) == 0:
73
  return False, "Empty CSV"
74
  if len(rows) == 1:
75
- return False, "CSV has only header, no data rows"
76
- # 列数の一貫性チェック
77
  col_counts = [len(row) for row in rows]
78
  if len(set(col_counts)) > 1:
79
- return False, f"Inconsistent column counts: {col_counts[:5]}"
80
  return True, ""
81
  except Exception as e:
82
  return False, f"CSVError: {e}"
83
 
84
-
85
  VALIDATORS = {
86
  "JSON": validate_json,
87
  "YAML": validate_yaml,
@@ -91,112 +73,63 @@ VALIDATORS = {
91
  }
92
 
93
  # ---------------------------------------------------------------------------
94
- # 2. Error Pattern Classifier (エラーパターン自動分類)
95
  # ---------------------------------------------------------------------------
96
 
97
- def classify_error_patterns(generation: str, output_type: str) -> list[str]:
98
- """出力テキストのエラーパターンを分類"""
99
  patterns = []
100
-
101
- # マークダウンブロックの混入
102
  if re.search(r"```\w*", generation):
103
  patterns.append("markdown_block")
104
-
105
- # 自然言語の混入(先頭部分)
106
  first_line = generation.strip().split("\n")[0] if generation.strip() else ""
107
- nl_indicators = [
108
- "here is", "here's", "below is", "the following",
109
- "sure", "certainly", "of course", "i'll",
110
- "let me", "note:", "output:",
111
- ]
112
  if any(ind in first_line.lower() for ind in nl_indicators):
113
  patterns.append("natural_language_prefix")
114
-
115
- # 末尾の自然言語混入
116
  last_lines = generation.strip().split("\n")[-3:] if generation.strip() else []
117
  last_text = " ".join(last_lines).lower()
118
- nl_suffix = ["note:", "explanation:", "this ", "the above", "please "]
119
- if any(ind in last_text for ind in nl_suffix):
120
  patterns.append("natural_language_suffix")
121
-
122
- # 途切れ(トランケーション)の検出
123
- stripped = generation.rstrip()
124
  if output_type == "JSON":
125
- open_count = generation.count("{") + generation.count("[")
126
- close_count = generation.count("}") + generation.count("]")
127
- if open_count > close_count:
128
  patterns.append("truncation")
129
  elif output_type == "XML":
130
  open_tags = len(re.findall(r"<[^/!?][^>]*>", generation))
131
  close_tags = len(re.findall(r"</[^>]+>", generation))
132
  if open_tags > close_tags + 1:
133
  patterns.append("truncation")
134
- elif output_type in ("YAML", "TOML", "CSV"):
135
- if stripped and stripped[-1] == "\\":
136
- patterns.append("truncation")
137
-
138
- # 空出力
139
  if not generation.strip():
140
  patterns.append("empty_output")
141
-
142
- # 別フォーマットの出力(JSONを要求されたのにXMLが出てくる等)
143
- format_indicators = {
144
- "JSON": (r"^\s*[\{\[]", None),
145
- "XML": (r"^\s*<", None),
146
- "YAML": (None, None),
147
- "TOML": (r"^\s*\[", None),
148
- "CSV": (None, None),
149
- }
150
- if output_type == "JSON" and re.match(r"^\s*<", generation.strip()):
151
- patterns.append("wrong_format")
152
- elif output_type == "XML" and re.match(r"^\s*[\{\[]", generation.strip()):
153
- patterns.append("wrong_format")
154
-
155
- # CoT思考過程の混入
156
- if re.search(r"<think>|</think>|<reasoning>|</reasoning>", generation):
157
  patterns.append("cot_leakage")
158
-
 
159
  return patterns if patterns else ["unknown"]
160
 
161
-
162
  # ---------------------------------------------------------------------------
163
- # 3. Core Analysis (コア分析ロジック)
164
  # ---------------------------------------------------------------------------
165
 
166
- def load_public_150(file_path: str) -> dict:
167
- """public_150.json を読み込み、task_id → 情報 の辞書を返す"""
168
  with open(file_path, "r", encoding="utf-8") as f:
169
  data = json.load(f)
170
  return {item["task_id"]: item for item in data}
171
 
172
-
173
- def analyze_single_inference(
174
- inference_data: list[dict],
175
- task_info: dict,
176
- ) -> pd.DataFrame:
177
- """1つのinference.jsonを分析してDataFrameを返す"""
178
  results = []
179
  for item in inference_data:
180
  task_id = item.get("task_id", "")
181
  generation = item.get("generation", "")
182
-
183
  info = task_info.get(task_id, {})
184
  output_type = info.get("output_type", "UNKNOWN")
185
  task_name = info.get("task_name", "UNKNOWN")
186
-
187
- # 構文検証
188
  validator = VALIDATORS.get(output_type)
189
  if validator:
190
  is_valid, error_msg = validator(generation)
191
  else:
192
  is_valid, error_msg = False, f"Unknown format: {output_type}"
193
-
194
- # エラーパターン分類
195
  if not is_valid:
196
  error_patterns = classify_error_patterns(generation, output_type)
197
  else:
198
  error_patterns = []
199
-
200
  results.append({
201
  "task_id": task_id,
202
  "task_name": task_name,
@@ -207,91 +140,48 @@ def analyze_single_inference(
207
  "generation_length": len(generation),
208
  "generation_preview": generation[:200],
209
  })
210
-
211
  return pd.DataFrame(results)
212
 
213
-
214
- def compute_summary(df: pd.DataFrame) -> dict:
215
- """分析結果のサマリーを計算"""
216
  total = len(df)
217
- valid = df["is_valid"].sum()
218
-
219
  summary = {
220
  "total_tasks": total,
221
- "parse_success": int(valid),
222
- "parse_fail": int(total - valid),
223
  "parse_rate": f"{valid / total * 100:.1f}%" if total > 0 else "N/A",
224
  }
225
-
226
- # フォーマット別
227
  format_stats = {}
228
  for fmt in ["JSON", "YAML", "TOML", "XML", "CSV"]:
229
  fmt_df = df[df["output_type"] == fmt]
230
  fmt_total = len(fmt_df)
231
- fmt_valid = fmt_df["is_valid"].sum()
232
  format_stats[fmt] = {
233
  "total": fmt_total,
234
- "success": int(fmt_valid),
235
- "fail": int(fmt_total - fmt_valid),
236
  "rate": f"{fmt_valid / fmt_total * 100:.1f}%" if fmt_total > 0 else "N/A",
237
  }
238
  summary["by_format"] = format_stats
239
-
240
- # エラーパターン集計
241
  all_patterns = []
242
  for patterns_str in df[df["is_valid"] == False]["error_patterns"]:
243
  if patterns_str:
244
  all_patterns.extend(patterns_str.split(","))
245
  summary["error_pattern_counts"] = dict(Counter(all_patterns).most_common())
246
-
247
  return summary
248
 
249
-
250
  # ---------------------------------------------------------------------------
251
- # 4. Multi-file Comparison (複数ファイル比較)
252
- # ---------------------------------------------------------------------------
253
-
254
- def compare_experiments(
255
- all_results: dict[str, pd.DataFrame],
256
- ) -> pd.DataFrame:
257
- """複数実験の結果を比較するDataFrameを返す"""
258
- rows = []
259
- for name, df in all_results.items():
260
- total = len(df)
261
- valid = df["is_valid"].sum()
262
- row = {
263
- "experiment": name,
264
- "total": total,
265
- "parse_success": int(valid),
266
- "parse_rate": f"{valid / total * 100:.1f}%" if total > 0 else "N/A",
267
- }
268
- for fmt in ["JSON", "YAML", "TOML", "XML", "CSV"]:
269
- fmt_df = df[df["output_type"] == fmt]
270
- fmt_total = len(fmt_df)
271
- fmt_valid = fmt_df["is_valid"].sum()
272
- row[f"{fmt}_rate"] = (
273
- f"{fmt_valid / fmt_total * 100:.1f}%"
274
- if fmt_total > 0
275
- else "N/A"
276
- )
277
- rows.append(row)
278
- return pd.DataFrame(rows)
279
-
280
-
281
- # ---------------------------------------------------------------------------
282
- # 5. Gradio Interface
283
  # ---------------------------------------------------------------------------
284
 
285
  def process_files(public_150_file, inference_files):
286
- """メイン処理:ファイルを受け取って分析結果を返す"""
287
  if public_150_file is None:
288
- return "❌ public_150.json をアップロードしてください", None, None, None
289
 
290
  if not inference_files:
291
- return "❌ inference.json を1つ以上アップロードしてください", None, None, None
292
 
293
  try:
294
- # Gradio 5ではfilepathモードで文字列パスが渡される
295
  pub_path = public_150_file if isinstance(public_150_file, str) else public_150_file.name
296
  task_info = load_public_150(pub_path)
297
 
@@ -303,13 +193,12 @@ def process_files(public_150_file, inference_files):
303
  filename = Path(inf_path).stem
304
  with open(inf_path, "r", encoding="utf-8") as f:
305
  inference_data = json.load(f)
306
-
307
  df = analyze_single_inference(inference_data, task_info)
308
  summary = compute_summary(df)
309
  all_results[filename] = df
310
  all_summaries[filename] = summary
311
 
312
- # --- 出力1: 全体サマリーテキスト ---
313
  summary_text = "## 📊 分析結果サマリー\n\n"
314
  for name, s in all_summaries.items():
315
  summary_text += f"### {name}\n"
@@ -323,127 +212,114 @@ def process_files(public_150_file, inference_files):
323
  summary_text += f" - {pattern}: {count}件\n"
324
  summary_text += "\n"
325
 
326
- # --- 出力2: 比較テーブル ---
327
- comparison_df = compare_experiments(all_results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
 
329
- # --- 出力3: エラー詳細(最初のファイルのみ) ---
330
  first_name = list(all_results.keys())[0]
331
  first_df = all_results[first_name]
332
- error_df = first_df[first_df["is_valid"] == False][
333
- ["task_id", "task_name", "output_type", "error_msg", "error_patterns", "generation_preview"]
334
- ]
335
 
336
- # --- 出力4: フォマット別パース成功率のCSV ---
337
- format_comparison_rows = []
338
- for name, df in all_results.items():
339
- row = {"experiment": name}
340
- for fmt in ["JSON", "YAML", "TOML", "XML", "CSV"]:
341
- fmt_df = df[df["output_type"] == fmt]
342
- fmt_total = len(fmt_df)
343
- fmt_valid = fmt_df["is_valid"].sum()
344
- row[fmt] = round(fmt_valid / fmt_total * 100, 1) if fmt_total > 0 else 0
345
- format_comparison_rows.append(row)
346
- format_df = pd.DataFrame(format_comparison_rows)
347
 
348
- return summary_text, comparison_df, error_df, format_df
349
 
350
  except Exception as e:
351
  error_trace = traceback.format_exc()
352
- return f"❌ エラーが発生しました:\n```\n{error_trace}\n```", None, None, None
353
-
354
 
355
  # ---------------------------------------------------------------------------
356
- # 6. Gradio App
357
  # ---------------------------------------------------------------------------
358
 
359
- def create_app():
360
- with gr.Blocks(
361
- title="StructEval-T Analyzer",
362
- theme=gr.themes.Soft(),
363
- ) as app:
364
- gr.Markdown(
365
- """
366
- # 🔍 StructEval-T Analyzer
367
- ### 松尾研LLM講義2025 メインコンペ用 推論結果分析ツール
368
-
369
- `inference.json` と `public_150.json` をアップロドすること
370
- モデル出力の構文的正確性(パース可能性)やエラーパターンを分析できます。
371
-
372
- **使い方:**
373
- 1. `public_150.json` をアップロード
374
- 2. 1つ以上の `inference.json` プロード(複数ファイル対応・実験比較可能)
375
- 3. 「分析開始」ボタンをクリック
376
- """
377
- )
378
-
379
- with gr.Row():
380
- public_file = gr.File(
381
- label="public_150.json",
382
- file_types=[".json"],
383
- type="filepath",
384
- )
385
- inference_files = gr.File(
386
- label="inference.json(複数可)",
387
- file_types=[".json"],
388
- file_count="multiple",
389
- type="filepath",
390
- )
391
-
392
- analyze_btn = gr.Button("🔬 分析開始", variant="primary", size="lg")
393
-
394
- with gr.Tabs():
395
- with gr.Tab("📊 サマリー"):
396
- summary_output = gr.Markdown()
397
-
398
- with gr.Tab("📈 実験比較"):
399
- comparison_table = gr.Dataframe(
400
- label="実験間のパース成功率比較",
401
- interactive=False,
402
- )
403
-
404
- with gr.Tab("❌ エラー詳細"):
405
- gr.Markdown("*最初にアップロードされたファイルのエラー一覧を表示*")
406
- error_table = gr.Dataframe(
407
- label="パース失敗タスク一覧",
408
- interactive=False,
409
- wrap=True,
410
- )
411
-
412
- with gr.Tab("📉 フォーマット別"):
413
- format_table = gr.Dataframe(
414
- label="フォーマット別パース成功率(%)",
415
- interactive=False,
416
- )
417
-
418
- analyze_btn.click(
419
- fn=process_files,
420
- inputs=[public_file, inference_files],
421
- outputs=[summary_output, comparison_table, error_table, format_table],
422
  )
423
-
424
- gr.Markdown(
425
- """
426
- ---
427
- **注意:** このツールは構文的な正確性(パース可能かどうか)のみを検証します。
428
- 運営側の採点基準である `raw_output_metric`(特定キーの存在チェック等)は
429
- `public_150.json` から削除されているため、完全なスコア再現はできません。
430
-
431
- **エラーパターンの凡例:**
432
- - `markdown_block`: マークダウンコードブロック(\\`\\`\\`json 等)の混入
433
- - `natural_language_prefix`: 先頭に自然言語("Here is..."等)が混入
434
- - `natural_language_suffix`: 末尾に自然言語("Note:"等)が混入
435
- - `truncation`: 出力の途切れ(閉じ括弧・タグの欠落)
436
- - `empty_output`: 空の出力
437
- - `wrong_format`: 要求と異なるフォーマットの出力
438
- - `cot_leakage`: 思考過程(\\<think\\>等)の混入
439
- - `unknown`: 上記に該当しない構文エラー
440
- """
441
  )
442
 
443
- return app
444
-
445
-
446
- demo = create_app()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
447
 
448
  if __name__ == "__main__":
449
- demo.launch()
 
1
  """
2
  StructEval-T Analyzer
3
  松尾研LLM講義2025 メインコンペ用 推論結果分析ツール
 
 
 
4
  """
5
 
6
  import json
 
15
  import pandas as pd
16
 
17
  # ---------------------------------------------------------------------------
18
+ # 1. Syntax Validators
19
  # ---------------------------------------------------------------------------
20
 
21
def validate_json(text):
    """Return ``(is_valid, error_message)`` for JSON syntax.

    ``error_message`` is empty on success; on failure it carries the
    decoder's message and the offending line number.
    """
    try:
        json.loads(text)
    except json.JSONDecodeError as e:
        return False, f"JSONDecodeError: {e.msg} (line {e.lineno})"
    return True, ""
 
27
 
28
def validate_yaml(text):
    """Return ``(is_valid, error_message)`` for YAML syntax.

    Any exception from the parser (including a missing PyYAML install)
    is reported as a validation failure rather than raised.
    """
    try:
        # Local import: the dependency is only touched when a YAML task
        # is actually validated.
        import yaml
        yaml.safe_load(text)
    except Exception as e:
        return False, f"YAMLError: {e}"
    return True, ""
 
35
 
36
+ def validate_toml(text):
 
37
  try:
38
  import tomllib
39
  tomllib.loads(text)
 
41
  except Exception as e:
42
  return False, f"TOMLError: {e}"
43
 
44
def validate_xml(text):
    """Return ``(is_valid, error_message)`` for XML well-formedness.

    Any parser exception is converted into a failure result rather
    than propagated to the caller.
    """
    try:
        # Local import keeps module load light; only needed for XML tasks.
        import xml.etree.ElementTree as ET
        ET.fromstring(text)
    except Exception as e:
        return False, f"XMLError: {e}"
    return True, ""
 
51
 
52
def validate_csv(text):
    """Return ``(is_valid, error_message)`` for CSV content.

    Valid means: at least a header plus one data row, and every row has
    the same number of columns.
    """
    try:
        rows = list(csv.reader(io.StringIO(text)))
    except Exception as e:
        return False, f"CSVError: {e}"
    if not rows:
        return False, "Empty CSV"
    if len(rows) == 1:
        # A lone row is assumed to be a header with no data beneath it.
        return False, "Only header"
    widths = {len(row) for row in rows}
    if len(widths) > 1:
        return False, f"Inconsistent cols: {widths}"
    return True, ""
66
 
 
67
  VALIDATORS = {
68
  "JSON": validate_json,
69
  "YAML": validate_yaml,
 
73
  }
74
 
75
  # ---------------------------------------------------------------------------
76
+ # 2. Error Pattern Classifier
77
  # ---------------------------------------------------------------------------
78
 
79
def classify_error_patterns(generation, output_type):
    """Heuristically tag why a model generation failed to parse.

    Returns a list of pattern labels (matching the legend shown in the
    UI); falls back to ``["unknown"]`` when no heuristic matches.
    """
    patterns = []

    # Markdown code fences (```json etc.) wrapped around the payload.
    if re.search(r"```\w*", generation):
        patterns.append("markdown_block")

    # Conversational preamble on the first line ("Here is ...").
    first_line = generation.strip().split("\n")[0] if generation.strip() else ""
    nl_indicators = ["here is", "here's", "below is", "sure", "certainly", "let me"]
    if any(ind in first_line.lower() for ind in nl_indicators):
        patterns.append("natural_language_prefix")

    # Trailing commentary within the last three lines ("Note: ...").
    last_lines = generation.strip().split("\n")[-3:] if generation.strip() else []
    last_text = " ".join(last_lines).lower()
    if any(ind in last_text for ind in ["note:", "explanation:", "this ", "the above"]):
        patterns.append("natural_language_suffix")

    # Truncation: unbalanced brackets (JSON) or open/close tags (XML).
    if output_type == "JSON":
        if generation.count("{") + generation.count("[") > generation.count("}") + generation.count("]"):
            patterns.append("truncation")
    elif output_type == "XML":
        open_tags = len(re.findall(r"<[^/!?][^>]*>", generation))
        close_tags = len(re.findall(r"</[^>]+>", generation))
        # +1 tolerates one self-closing/root tag in this crude count.
        if open_tags > close_tags + 1:
            patterns.append("truncation")

    if not generation.strip():
        patterns.append("empty_output")

    # Wrong format: XML-looking output for a JSON task and vice versa.
    # (Restored: the UI legend documents `wrong_format`, but the label
    # was no longer emitted after the simplification.)
    if output_type == "JSON" and re.match(r"^\s*<", generation.strip()):
        patterns.append("wrong_format")
    elif output_type == "XML" and re.match(r"^\s*[\{\[]", generation.strip()):
        patterns.append("wrong_format")

    # Leaked chain-of-thought / tool-call scaffolding.
    if re.search(r"<think>|</think>", generation):
        patterns.append("cot_leakage")
    if re.search(r"<tool_call>", generation):
        patterns.append("tool_call_leakage")
    return patterns if patterns else ["unknown"]
106
 
 
107
  # ---------------------------------------------------------------------------
108
+ # 3. Core Analysis
109
  # ---------------------------------------------------------------------------
110
 
111
def load_public_150(file_path):
    """Read ``public_150.json`` and return a ``task_id -> task info`` mapping."""
    raw = Path(file_path).read_text(encoding="utf-8")
    return {entry["task_id"]: entry for entry in json.loads(raw)}
115
 
116
+ def analyze_single_inference(inference_data, task_info):
 
 
 
 
 
117
  results = []
118
  for item in inference_data:
119
  task_id = item.get("task_id", "")
120
  generation = item.get("generation", "")
 
121
  info = task_info.get(task_id, {})
122
  output_type = info.get("output_type", "UNKNOWN")
123
  task_name = info.get("task_name", "UNKNOWN")
 
 
124
  validator = VALIDATORS.get(output_type)
125
  if validator:
126
  is_valid, error_msg = validator(generation)
127
  else:
128
  is_valid, error_msg = False, f"Unknown format: {output_type}"
 
 
129
  if not is_valid:
130
  error_patterns = classify_error_patterns(generation, output_type)
131
  else:
132
  error_patterns = []
 
133
  results.append({
134
  "task_id": task_id,
135
  "task_name": task_name,
 
140
  "generation_length": len(generation),
141
  "generation_preview": generation[:200],
142
  })
 
143
  return pd.DataFrame(results)
144
 
145
def compute_summary(df):
    """Aggregate a per-task analysis DataFrame into a summary dict.

    Returns overall counts/rate, per-format counts/rate under
    ``"by_format"``, and a frequency table of error patterns under
    ``"error_pattern_counts"``.
    """

    def pct(num, den):
        # Rate as a one-decimal percentage string; "N/A" when empty.
        return f"{num / den * 100:.1f}%" if den > 0 else "N/A"

    total = len(df)
    valid = int(df["is_valid"].sum())
    summary = {
        "total_tasks": total,
        "parse_success": valid,
        "parse_fail": total - valid,
        "parse_rate": pct(valid, total),
    }

    by_format = {}
    for fmt in ("JSON", "YAML", "TOML", "XML", "CSV"):
        sub = df[df["output_type"] == fmt]
        n = len(sub)
        ok = int(sub["is_valid"].sum())
        by_format[fmt] = {
            "total": n,
            "success": ok,
            "fail": n - ok,
            "rate": pct(ok, n),
        }
    summary["by_format"] = by_format

    # error_patterns is stored as a comma-joined string per failed row.
    all_patterns = []
    for joined in df[df["is_valid"] == False]["error_patterns"]:
        if joined:
            all_patterns.extend(joined.split(","))
    summary["error_pattern_counts"] = dict(Counter(all_patterns).most_common())
    return summary
172
 
 
173
  # ---------------------------------------------------------------------------
174
+ # 4. Main Processing
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  # ---------------------------------------------------------------------------
176
 
177
  def process_files(public_150_file, inference_files):
 
178
  if public_150_file is None:
179
+ return "❌ public_150.json をアップロードしてください", "", ""
180
 
181
  if not inference_files:
182
+ return "❌ inference.json を1つ以上アップロードしてください", "", ""
183
 
184
  try:
 
185
  pub_path = public_150_file if isinstance(public_150_file, str) else public_150_file.name
186
  task_info = load_public_150(pub_path)
187
 
 
193
  filename = Path(inf_path).stem
194
  with open(inf_path, "r", encoding="utf-8") as f:
195
  inference_data = json.load(f)
 
196
  df = analyze_single_inference(inference_data, task_info)
197
  summary = compute_summary(df)
198
  all_results[filename] = df
199
  all_summaries[filename] = summary
200
 
201
+ # --- Output 1: Summary ---
202
  summary_text = "## 📊 分析結果サマリー\n\n"
203
  for name, s in all_summaries.items():
204
  summary_text += f"### {name}\n"
 
212
  summary_text += f" - {pattern}: {count}件\n"
213
  summary_text += "\n"
214
 
215
+ # --- Output 2: Comparison table as markdown ---
216
+ comp_lines = ["## 📈 実験比較\n"]
217
+ comp_lines.append("| experiment | total | pass | rate | JSON | YAML | TOML | XML | CSV |")
218
+ comp_lines.append("|---|---|---|---|---|---|---|---|---|")
219
+ for name, df in all_results.items():
220
+ total = len(df)
221
+ valid = int(df["is_valid"].sum())
222
+ rate = f"{valid/total*100:.1f}%" if total > 0 else "N/A"
223
+ fmt_rates = {}
224
+ for fmt in ["JSON", "YAML", "TOML", "XML", "CSV"]:
225
+ fmt_df = df[df["output_type"] == fmt]
226
+ ft = len(fmt_df)
227
+ fv = int(fmt_df["is_valid"].sum())
228
+ fmt_rates[fmt] = f"{fv/ft*100:.1f}%" if ft > 0 else "N/A"
229
+ comp_lines.append(f"| {name} | {total} | {valid} | {rate} | {fmt_rates['JSON']} | {fmt_rates['YAML']} | {fmt_rates['TOML']} | {fmt_rates['XML']} | {fmt_rates['CSV']} |")
230
+ comparison_md = "\n".join(comp_lines)
231
 
232
+ # --- Output 3: Error details as markdown ---
233
  first_name = list(all_results.keys())[0]
234
  first_df = all_results[first_name]
235
+ error_df = first_df[first_df["is_valid"] == False]
 
 
236
 
237
+ error_lines = [f"## ❌ エラ詳細 ({first_name})\n"]
238
+ error_lines.append(f"パース失敗: {len(error_df)}件\n")
239
+ error_lines.append("| task_name | output_type | error_patterns | error_msg |")
240
+ error_lines.append("|---|---|---|---|")
241
+ for _, row in error_df.iterrows():
242
+ err_msg_short = str(row['error_msg'])[:60]
243
+ error_lines.append(f"| {row['task_name']} | {row['output_type']} | {row['error_patterns']} | {err_msg_short} |")
244
+ error_md = "\n".join(error_lines)
 
 
 
245
 
246
+ return summary_text, comparison_md, error_md
247
 
248
  except Exception as e:
249
  error_trace = traceback.format_exc()
250
+ return f"❌ エラー:\n```\n{error_trace}\n```", "", ""
 
251
 
252
  # ---------------------------------------------------------------------------
253
+ # 5. Gradio App - using only Markdown outputs to avoid Dataframe bugs
254
  # ---------------------------------------------------------------------------
255
 
256
# Build the Gradio UI. All result panes are Markdown components (the
# commit replaced gr.Dataframe outputs to work around a Gradio bug).
with gr.Blocks(
    title="StructEval-T Analyzer",
    theme=gr.themes.Soft(),
) as demo:
    gr.Markdown(
        """
        # 🔍 StructEval-T Analyzer
        ### 松尾研LLM講義2025 メインコンペ用 推論結果分析ツール

        `inference.json` と `public_150.json` をアップロードすることで、
        モデル出力の構文的正確性(パース可能性)やエラーパターンを分析できます。

        **使い方:**
        1. `public_150.json` をアップロード
        2. 1つ以上の `inference.json` をアップロード(複数ファイル対応・実験比較可能)
        3. 「分析開始」ボタンをクリック
        """
    )

    # File inputs: the reference task file and one or more inference dumps.
    with gr.Row():
        public_file = gr.File(
            label="public_150.json",
            file_types=[".json"],
            type="filepath",
        )
        inference_files = gr.File(
            label="inference.json(複数可)",  # fixed mojibake: was garbled bytes
            file_types=[".json"],
            file_count="multiple",
            type="filepath",
        )

    analyze_btn = gr.Button("🔬 分析開始", variant="primary", size="lg")

    # Three Markdown panes: summary, cross-experiment comparison, error details.
    with gr.Tabs():
        with gr.Tab("📊 サマリー"):
            summary_output = gr.Markdown()
        with gr.Tab("📈 実験比較"):
            comparison_output = gr.Markdown()
        with gr.Tab("❌ エラー詳細"):
            error_output = gr.Markdown()

    analyze_btn.click(
        fn=process_files,
        inputs=[public_file, inference_files],
        outputs=[summary_output, comparison_output, error_output],
    )

    gr.Markdown(
        """
        ---
        **注意:** このツールは構文的な正確性(パース可能かどうか)のみを検証します。
        運営側の採点基準である `raw_output_metric`(特定キーの存在チェック等)は
        `public_150.json` から削除されているため、完全なスコア再現はできません。

        **エラーパターンの凡例:**
        - `markdown_block`: マークダウンコードブロック(\\`\\`\\`json 等)の混入
        - `natural_language_prefix`: 先頭に自然言語("Here is..."等)が混入
        - `natural_language_suffix`: 末尾に自然言語("Note:"等)が混入
        - `truncation`: 出力の途切れ(閉じ括弧・タグの欠落)
        - `empty_output`: 空の出力
        - `wrong_format`: 要求と異なるフォーマットの出力
        - `cot_leakage`: 思考過程(\\<think\\>等)の混入
        - `tool_call_leakage`: ツールコール(\\<tool_call\\>等)の混入
        - `unknown`: 上記に該当しない構文エラー
        """
    )
323
 
324
if __name__ == "__main__":
    # NOTE(review): Gradio 5's launch() documents `ssr_mode`, not `ssr`;
    # the previous `ssr=False` keyword would raise a TypeError — confirm
    # against the pinned gradio version. Disabling SSR avoids rendering
    # issues on Spaces.
    demo.launch(ssr_mode=False)