Spaces:

ogwata
/

structeval-analyz

Runtime error

App Files Files Community

小形克宏 commited on Feb 20

Commit

2bd094f

1 Parent(s): 8fa637a

Initial commit: StructEval-T Analyzer

Browse files

Files changed (4) hide show

.gitignore +5 -0
README.md +57 -8
app.py +446 -0
requirements.txt +3 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+__pycache__/
+*.pyc
+.DS_Store
+*.jsonl
+flagged/

README.md CHANGED Viewed

@@ -1,14 +1,63 @@
 ---
-title: Structeval Analyz
-emoji: 🐠
-colorFrom: yellow
-colorTo: red
 sdk: gradio
-sdk_version: 6.6.0
 app_file: app.py
 pinned: false
-license: apache-2.0
-short_description: 松尾研Deep Learning応用講座2025最終課題のための分析器
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: StructEval-T Analyzer
+emoji: 🔍
+colorFrom: blue
+colorTo: indigo
 sdk: gradio
+sdk_version: "5.12.0"
 app_file: app.py
 pinned: false
+license: mit
 ---
+# 🔍 StructEval-T Analyzer
+松尾研LLM講義2025 メインコンペ用の推論結果分析ツールです。
+## 概要
+`inference.json` と `public_150.json` をアップロードすることで、モデル出力の構文的正確性（パース可能性）やエラーパターンを分析できます。
+## 機能
+### 📊 構文検証（Syntax Validation）
+各フォーマット（JSON, YAML, TOML, XML, CSV）ごとにPythonの標準パーサーで構文を検証します。
+### ❌ エラーパターン自動分類
+パースに失敗した出力に対して、以下のエラーパターンを自動検出します：
+| パターン | 説明 |
+|---------|------|
+| `markdown_block` | マークダウンコードブロック（\`\`\`json 等）の混入 |
+| `natural_language_prefix` | 先頭に自然言語（"Here is..."等）が混入 |
+| `natural_language_suffix` | 末尾に自然言語（"Note:"等）が混入 |
+| `truncation` | 出力の途切れ（閉じ括弧・タグの欠落） |
+| `empty_output` | 空の出力 |
+| `wrong_format` | 要求と異なるフォーマットの出力 |
+| `cot_leakage` | 思考過程（\<think\>等）の混入 |
+### 📈 複数実験の比較
+複数の `inference.json` をアップロードすることで、実験間のパース成功率を比較できます。
+## 使い方
+1. `public_150.json` をアップロード
+2. 1つ以上の `inference.json` をアップロード（複数ファイル対応）
+3. 「分析開始」ボタンをクリック
+## 注意事項
+- このツールは**構文的な正確性（パース可能かどうか）のみ**を検証します
+- 運営側の採点基準である `raw_output_metric`（特定キーの存在チェック等）は再現できません
+- スコアの完全な再現を目的としたものではなく、**エラーの傾向把握**に活用してください
+## ローカルでの実行
+```bash
+pip install gradio pandas pyyaml
+python app.py
+```
+## ライセンス
+MIT License

app.py ADDED Viewed

	@@ -0,0 +1,446 @@

+"""
+StructEval-T Analyzer
+松尾研LLM講義2025 メインコンペ用 推論結果分析ツール
+inference.json と public_150.json をアップロードして、
+フォーマット別のパース成功率やエラーパターンを分析します。
+"""
+import json
+import csv
+import io
+import re
+import traceback
+from collections import Counter, defaultdict
+from pathlib import Path
+import gradio as gr
+import pandas as pd
+# ---------------------------------------------------------------------------
+# 1. Syntax Validators (フォーマット別パーサー)
+# ---------------------------------------------------------------------------
+def validate_json(text: str) -> tuple[bool, str]:
+    """JSON構文を検証"""
+    try:
+        json.loads(text)
+        return True, ""
+    except json.JSONDecodeError as e:
+        return False, f"JSONDecodeError: {e.msg} (line {e.lineno}, col {e.colno})"
+def validate_yaml(text: str) -> tuple[bool, str]:
+    """YAML構文を検証"""
+    try:
+        import yaml
+        yaml.safe_load(text)
+        return True, ""
+    except yaml.YAMLError as e:
+        return False, f"YAMLError: {e}"
+    except Exception as e:
+        return False, f"Error: {e}"
+def validate_toml(text: str) -> tuple[bool, str]:
+    """TOML構文を検証"""
+    try:
+        import tomllib
+        tomllib.loads(text)
+        return True, ""
+    except Exception as e:
+        return False, f"TOMLError: {e}"
+def validate_xml(text: str) -> tuple[bool, str]:
+    """XML構文を検証"""
+    try:
+        import xml.etree.ElementTree as ET
+        ET.fromstring(text)
+        return True, ""
+    except ET.ParseError as e:
+        return False, f"XMLParseError: {e}"
+    except Exception as e:
+        return False, f"Error: {e}"
+def validate_csv(text: str) -> tuple[bool, str]:
+    """CSV構文を検証"""
+    try:
+        reader = csv.reader(io.StringIO(text))
+        rows = list(reader)
+        if len(rows) == 0:
+            return False, "Empty CSV"
+        if len(rows) == 1:
+            return False, "CSV has only header, no data rows"
+        # 列数の一貫性チェック
+        col_counts = [len(row) for row in rows]
+        if len(set(col_counts)) > 1:
+            return False, f"Inconsistent column counts: {col_counts[:5]}"
+        return True, ""
+    except Exception as e:
+        return False, f"CSVError: {e}"
+# Dispatch table: output-format label -> validator function.
+# analyze_single_inference looks validators up with VALIDATORS.get(output_type);
+# a label that is missing here is reported as "Unknown format".
+VALIDATORS = {
+    "JSON": validate_json,
+    "YAML": validate_yaml,
+    "TOML": validate_toml,
+    "XML": validate_xml,
+    "CSV": validate_csv,
+}
+# ---------------------------------------------------------------------------
+# 2. Error Pattern Classifier (エラーパターン自動分類)
+# ---------------------------------------------------------------------------
+def classify_error_patterns(generation: str, output_type: str) -> list[str]:
+    """出力テキストのエラーパターンを分類"""
+    patterns = []
+    # マークダウンブロックの混入
+    if re.search(r"```\w*", generation):
+        patterns.append("markdown_block")
+    # 自然言語の混入（先頭部分）
+    first_line = generation.strip().split("\n")[0] if generation.strip() else ""
+    nl_indicators = [
+        "here is", "here's", "below is", "the following",
+        "sure", "certainly", "of course", "i'll",
+        "let me", "note:", "output:",
+    ]
+    if any(ind in first_line.lower() for ind in nl_indicators):
+        patterns.append("natural_language_prefix")
+    # 末尾の自然言語混入
+    last_lines = generation.strip().split("\n")[-3:] if generation.strip() else []
+    last_text = " ".join(last_lines).lower()
+    nl_suffix = ["note:", "explanation:", "this ", "the above", "please "]
+    if any(ind in last_text for ind in nl_suffix):
+        patterns.append("natural_language_suffix")
+    # 途切れ（トランケーション）の検出
+    stripped = generation.rstrip()
+    if output_type == "JSON":
+        open_count = generation.count("{") + generation.count("[")
+        close_count = generation.count("}") + generation.count("]")
+        if open_count > close_count:
+            patterns.append("truncation")
+    elif output_type == "XML":
+        open_tags = len(re.findall(r"<[^/!?][^>]*>", generation))
+        close_tags = len(re.findall(r"</[^>]+>", generation))
+        if open_tags > close_tags + 1:
+            patterns.append("truncation")
+    elif output_type in ("YAML", "TOML", "CSV"):
+        if stripped and stripped[-1] == "\\":
+            patterns.append("truncation")
+    # 空出力
+    if not generation.strip():
+        patterns.append("empty_output")
+    # 別フォーマットの出力（JSONを要求されたのにXMLが出てくる等）
+    format_indicators = {
+        "JSON": (r"^\s*[\{\[]", None),
+        "XML": (r"^\s*<", None),
+        "YAML": (None, None),
+        "TOML": (r"^\s*\[", None),
+        "CSV": (None, None),
+    }
+    if output_type == "JSON" and re.match(r"^\s*<", generation.strip()):
+        patterns.append("wrong_format")
+    elif output_type == "XML" and re.match(r"^\s*[\{\[]", generation.strip()):
+        patterns.append("wrong_format")
+    # CoT思考過程の混入
+    if re.search(r"<think>|</think>|<reasoning>|</reasoning>", generation):
+        patterns.append("cot_leakage")
+    return patterns if patterns else ["unknown"]
+# ---------------------------------------------------------------------------
+# 3. Core Analysis (コア分析ロジック)
+# ---------------------------------------------------------------------------
+def load_public_150(file_path: str) -> dict:
+    """public_150.json を読み込み、task_id → 情報 の辞書を返す"""
+    with open(file_path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+    return {item["task_id"]: item for item in data}
+def analyze_single_inference(
+    inference_data: list[dict],
+    task_info: dict,
+) -> pd.DataFrame:
+    """1つのinference.jsonを分析してDataFrameを返す"""
+    results = []
+    for item in inference_data:
+        task_id = item.get("task_id", "")
+        generation = item.get("generation", "")
+        info = task_info.get(task_id, {})
+        output_type = info.get("output_type", "UNKNOWN")
+        task_name = info.get("task_name", "UNKNOWN")
+        # 構文検証
+        validator = VALIDATORS.get(output_type)
+        if validator:
+            is_valid, error_msg = validator(generation)
+        else:
+            is_valid, error_msg = False, f"Unknown format: {output_type}"
+        # エラーパターン分類
+        if not is_valid:
+            error_patterns = classify_error_patterns(generation, output_type)
+        else:
+            error_patterns = []
+        results.append({
+            "task_id": task_id,
+            "task_name": task_name,
+            "output_type": output_type,
+            "is_valid": is_valid,
+            "error_msg": error_msg,
+            "error_patterns": ",".join(error_patterns) if error_patterns else "",
+            "generation_length": len(generation),
+            "generation_preview": generation[:200],
+        })
+    return pd.DataFrame(results)
+def compute_summary(df: pd.DataFrame) -> dict:
+    """分析結果のサマリーを計算"""
+    total = len(df)
+    valid = df["is_valid"].sum()
+    summary = {
+        "total_tasks": total,
+        "parse_success": int(valid),
+        "parse_fail": int(total - valid),
+        "parse_rate": f"{valid / total * 100:.1f}%" if total > 0 else "N/A",
+    }
+    # フォーマット別
+    format_stats = {}
+    for fmt in ["JSON", "YAML", "TOML", "XML", "CSV"]:
+        fmt_df = df[df["output_type"] == fmt]
+        fmt_total = len(fmt_df)
+        fmt_valid = fmt_df["is_valid"].sum()
+        format_stats[fmt] = {
+            "total": fmt_total,
+            "success": int(fmt_valid),
+            "fail": int(fmt_total - fmt_valid),
+            "rate": f"{fmt_valid / fmt_total * 100:.1f}%" if fmt_total > 0 else "N/A",
+        }
+    summary["by_format"] = format_stats
+    # エラーパターン集計
+    all_patterns = []
+    for patterns_str in df[df["is_valid"] == False]["error_patterns"]:
+        if patterns_str:
+            all_patterns.extend(patterns_str.split(","))
+    summary["error_pattern_counts"] = dict(Counter(all_patterns).most_common())
+    return summary
+# ---------------------------------------------------------------------------
+# 4. Multi-file Comparison (複数ファイル比較)
+# ---------------------------------------------------------------------------
+def compare_experiments(
+    all_results: dict[str, pd.DataFrame],
+) -> pd.DataFrame:
+    """複数実験の結果を比較するDataFrameを返す"""
+    rows = []
+    for name, df in all_results.items():
+        total = len(df)
+        valid = df["is_valid"].sum()
+        row = {
+            "experiment": name,
+            "total": total,
+            "parse_success": int(valid),
+            "parse_rate": f"{valid / total * 100:.1f}%" if total > 0 else "N/A",
+        }
+        for fmt in ["JSON", "YAML", "TOML", "XML", "CSV"]:
+            fmt_df = df[df["output_type"] == fmt]
+            fmt_total = len(fmt_df)
+            fmt_valid = fmt_df["is_valid"].sum()
+            row[f"{fmt}_rate"] = (
+                f"{fmt_valid / fmt_total * 100:.1f}%"
+                if fmt_total > 0
+                else "N/A"
+            )
+        rows.append(row)
+    return pd.DataFrame(rows)
+# ---------------------------------------------------------------------------
+# 5. Gradio Interface
+# ---------------------------------------------------------------------------
+def process_files(public_150_file, inference_files):
+    """メイン処理：ファイルを受け取って分析結果を返す"""
+    if public_150_file is None:
+        return "❌ public_150.json をアップロードしてください", None, None, None, None
+    if not inference_files:
+        return "❌ inference.json を1つ以上アップロードしてくだ��い", None, None, None, None
+    try:
+        # public_150.json 読み込み
+        task_info = load_public_150(public_150_file.name)
+        all_results = {}
+        all_summaries = {}
+        for inf_file in inference_files:
+            filename = Path(inf_file.name).stem
+            with open(inf_file.name, "r", encoding="utf-8") as f:
+                inference_data = json.load(f)
+            df = analyze_single_inference(inference_data, task_info)
+            summary = compute_summary(df)
+            all_results[filename] = df
+            all_summaries[filename] = summary
+        # --- 出力1: 全体サマリーテキスト ---
+        summary_text = "## 📊 分析結果サマリー\n\n"
+        for name, s in all_summaries.items():
+            summary_text += f"### {name}\n"
+            summary_text += f"- パース成功: {s['parse_success']}/{s['total_tasks']} ({s['parse_rate']})\n"
+            summary_text += f"- フォーマット別:\n"
+            for fmt, fs in s["by_format"].items():
+                summary_text += f"  - {fmt}: {fs['success']}/{fs['total']} ({fs['rate']})\n"
+            if s["error_pattern_counts"]:
+                summary_text += f"- エラーパターン:\n"
+                for pattern, count in s["error_pattern_counts"].items():
+                    summary_text += f"  - {pattern}: {count}件\n"
+            summary_text += "\n"
+        # --- 出力2: 比較テーブル ---
+        comparison_df = compare_experiments(all_results)
+        # --- 出力3: エラー詳細（最初のファイルのみ） ---
+        first_name = list(all_results.keys())[0]
+        first_df = all_results[first_name]
+        error_df = first_df[first_df["is_valid"] == False][
+            ["task_id", "task_name", "output_type", "error_msg", "error_patterns", "generation_preview"]
+        ]
+        # --- 出力4: フォーマット別パース成功率のCSV ---
+        format_comparison_rows = []
+        for name, df in all_results.items():
+            row = {"experiment": name}
+            for fmt in ["JSON", "YAML", "TOML", "XML", "CSV"]:
+                fmt_df = df[df["output_type"] == fmt]
+                fmt_total = len(fmt_df)
+                fmt_valid = fmt_df["is_valid"].sum()
+                row[fmt] = round(fmt_valid / fmt_total * 100, 1) if fmt_total > 0 else 0
+            format_comparison_rows.append(row)
+        format_df = pd.DataFrame(format_comparison_rows)
+        return summary_text, comparison_df, error_df, format_df, None
+    except Exception as e:
+        error_trace = traceback.format_exc()
+        return f"❌ エラーが発生しました:\n```\n{error_trace}\n```", None, None, None, None
+# ---------------------------------------------------------------------------
+# 6. Gradio App
+# ---------------------------------------------------------------------------
+def create_app():
+    """Build and return the Gradio Blocks UI (it is not launched here).
+
+    Layout: an intro, two upload widgets, an analyse button, four result
+    tabs, and a closing legend. The button is wired to ``process_files``.
+    """
+    with gr.Blocks(
+        title="StructEval-T Analyzer",
+        theme=gr.themes.Soft(),
+    ) as app:
+        # Intro and usage instructions.
+        gr.Markdown(
+            """
+            # 🔍 StructEval-T Analyzer
+            ### 松尾研LLM講義2025 メインコンペ用 推論結果分析ツール
+            `inference.json` と `public_150.json` をアップロードすることで、
+            モデル出力の構文的正確性（パース可能性）やエラーパターンを分析できます。
+            **使い方:**
+            1. `public_150.json` をアップロード
+            2. 1つ以上の `inference.json` をアップロード（複数ファイル対応・実験比較可能）
+            3. 「分析開始」ボタンをクリック
+            """
+        )
+        # Upload widgets: one task-metadata file and any number of
+        # inference result files.
+        with gr.Row():
+            public_file = gr.File(
+                label="public_150.json",
+                file_types=[".json"],
+                type="filepath",
+            )
+            inference_files = gr.File(
+                label="inference.json（複数可）",
+                file_types=[".json"],
+                file_count="multiple",
+                type="filepath",
+            )
+        analyze_btn = gr.Button("🔬 分析開始", variant="primary", size="lg")
+        # One tab per value returned by process_files.
+        with gr.Tabs():
+            with gr.Tab("📊 サマリー"):
+                summary_output = gr.Markdown()
+            with gr.Tab("📈 実験比較"):
+                comparison_table = gr.Dataframe(
+                    label="実験間のパース成功率比較",
+                    interactive=False,
+                )
+            with gr.Tab("❌ エラー詳細"):
+                gr.Markdown("*最初にアップロードされたファイルのエラー一覧を表示*")
+                error_table = gr.Dataframe(
+                    label="パース失敗タスク一覧",
+                    interactive=False,
+                    wrap=True,
+                )
+            with gr.Tab("📉 フォーマット別"):
+                format_table = gr.Dataframe(
+                    label="フォーマット別パース成功率（%）",
+                    interactive=False,
+                )
+        # process_files returns five values; the fifth is always None and is
+        # routed to a throwaway gr.State().
+        analyze_btn.click(
+            fn=process_files,
+            inputs=[public_file, inference_files],
+            outputs=[summary_output, comparison_table, error_table, format_table, gr.State()],
+        )
+        # Closing caveat and the legend of error-pattern names.
+        gr.Markdown(
+            """
+            ---
+            **注意:** このツールは構文的な正確性（パース可能かどうか）のみを検証します。
+            運営側の採点基準である `raw_output_metric`（特定キーの存在チェック等）は
+            `public_150.json` から削除されているため、完全なスコア再現はできません。
+            **エラーパターンの凡例:**
+            - `markdown_block`: マークダウンコードブロック（\\`\\`\\`json 等）の混入
+            - `natural_language_prefix`: 先頭に自然言語（"Here is..."等）が混入
+            - `natural_language_suffix`: 末尾に自然言語（"Note:"等）が混入
+            - `truncation`: 出力の途切れ（閉じ括弧・タグの欠落）
+            - `empty_output`: 空の出力
+            - `wrong_format`: 要求と異なるフォーマットの出力
+            - `cot_leakage`: 思考過程（\\<think\\>等）の混入
+            - `unknown`: 上記に該当しない構文エラー
+            """
+        )
+    return app
+# Script entry point: build the UI and start the Gradio server.
+if __name__ == "__main__":
+    app = create_app()
+    app.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+gradio>=4.0.0
+pandas
+pyyaml