# NOTE: Hugging Face Spaces page residue ("Spaces / Sleeping") removed here —
# it was scrape artifact text, not part of the source file.
| """ | |
| Inference Comparator - 推論結果比較アプリ | |
| 問題文: public_150.json と 複数の推論結果: inference.json | |
| を比較するGradioアプリ。 | |
| フォーマット不正の行は黄色でハイライト表示する。 | |
| """ | |
| import gradio as gr | |
| import json | |
| import re | |
| import difflib | |
| import html | |
| from pathlib import Path | |
| from typing import Optional, Tuple, List, Dict | |
| import yaml | |
| import toml | |
| import xml.etree.ElementTree as ET | |
| import pandas as pd | |
| import io | |
| import base64 | |
# Constants.
PUBLIC_DEFAULT_PATH = Path(__file__).parent / "public_150.json"  # ground-truth task file, expected next to app.py
HIGHLIGHT_COLOR = "#fff3b0"  # yellow highlight for rows containing a format-invalid result
# Custom CSS/JS (loaded from external files).
CSS_PATH = Path(__file__).parent / "styles.css"
JS_PATH = Path(__file__).parent / "scripts.js"
def esc(x: str) -> str:
    """Return *x* HTML-escaped; any falsy value becomes the empty string."""
    safe = x if x else ""
    return html.escape(safe)
def count_tokens(text: str) -> int:
    """Rough token estimate: character count divided by 4.

    Rationale:
    - Returns len(text) // 4; for English text ~4 chars per token is a
      common approximation.
    - For Japanese text (~1-2 tokens per character) this underestimates.
    - Use a real tokenizer (e.g. tiktoken) when accuracy matters; this
      simplification avoids the extra dependency.
    """
    if not text:
        return 0
    return len(text) // 4
def extract_content(text: str, output_type: str) -> Tuple[Optional[str], str]:
    """Strip a surrounding Markdown code fence from *text* if one exists.

    Returns (content, source) where source is "fence" when a fenced block
    was found and "raw" otherwise. *output_type* is accepted for interface
    compatibility but is not consulted here.
    """
    stripped = text.strip()
    match = re.search(
        r'```(?:\w+)?\s*\n?(.*?)```',
        stripped,
        re.DOTALL | re.IGNORECASE,
    )
    if match is None:
        return stripped, "raw"
    return match.group(1).strip(), "fence"
def validate_format(text: str, output_type: str) -> Tuple[bool, str]:
    """Validate *text* against the format named by *output_type*.

    Code fences are removed first via extract_content. Returns (True, "")
    on success, or (False, message) when parsing fails or the format name
    is unknown. Supported formats: JSON, YAML, TOML, XML, CSV.
    """
    content, _ = extract_content(text, output_type)
    fmt = output_type.upper()
    try:
        if fmt == 'JSON':
            json.loads(content)
        elif fmt == 'YAML':
            yaml.safe_load(content)
        elif fmt == 'TOML':
            toml.loads(content)
        elif fmt == 'XML':
            ET.fromstring(content)
        elif fmt == 'CSV':
            # pandas happily parses an empty string, so reject it first.
            if not content.strip():
                raise ValueError("Empty CSV")
            pd.read_csv(io.StringIO(content))
        else:
            return False, f"Unknown format: {fmt}"
    except Exception as e:
        return False, str(e)
    return True, ""
def load_public_data() -> Dict[str, Dict]:
    """Load public_150.json and return a task_id -> metadata mapping.

    Each value holds 'task_name', 'output_type' and 'query'.
    Raises FileNotFoundError when the file is missing.
    """
    if not PUBLIC_DEFAULT_PATH.exists():
        raise FileNotFoundError(
            f"public_150.json not found at {PUBLIC_DEFAULT_PATH}. "
            "Please ensure it is placed in the same directory as app.py."
        )
    with open(PUBLIC_DEFAULT_PATH, 'r', encoding='utf-8') as f:
        data = json.load(f)
    index: Dict[str, Dict] = {}
    for item in data:
        index[str(item['task_id'])] = {
            'task_name': item.get('task_name', ''),
            'output_type': item.get('output_type', ''),
            'query': item.get('query', ''),
        }
    return index
def load_inference_file(file) -> Tuple[str, Dict[str, str], Optional[str]]:
    """Read one uploaded inference JSON file.

    Returns (filename, task_id -> generation map, error message or None).
    A None *file* yields ("", {}, None); parse/read failures yield an
    empty map plus a human-readable error string.
    """
    if file is None:
        return "", {}, None
    filename = Path(file.name).name
    try:
        with open(file.name, 'r', encoding='utf-8') as fh:
            payload = json.load(fh)
        # Build the map inside the try so malformed records (e.g. a
        # missing 'task_id') surface as a load error, not a crash.
        mapping: Dict[str, str] = {}
        for item in payload:
            mapping[str(item['task_id'])] = item.get('generation', '')
        return filename, mapping, None
    except json.JSONDecodeError as e:
        return filename, {}, f"JSONパースエラー: {e}"
    except Exception as e:
        return filename, {}, f"読込みエラー: {e}"
def generate_summary_html(
    public_index: Dict[str, Dict],
    inference_names: List[str],
    inference_maps: List[Dict[str, str]],
    task_name_filter: str = "",
    output_type_filter: str = "",
    sort_column: str = "",
    sort_order: str = "",
    invalid_only: bool = False,
) -> str:
    """Build the summary HTML table (with filtering and sorting).

    Args:
        public_index: task_id -> {'task_name', 'output_type', 'query'}.
        inference_names: display names of the uploaded inference files,
            aligned index-for-index with inference_maps.
        inference_maps: per-file task_id -> generated-text maps.
        task_name_filter: exact task_name to keep ("" = no filter).
        output_type_filter: exact output_type to keep ("" = no filter).
        sort_column: "query" or the stringified index of an inference
            column ("" = no sorting).
        sort_order: "昇順" (ascending) or "降順" (descending).
        invalid_only: when True, keep only rows with at least one
            format-invalid generation.

    Returns:
        A self-contained HTML string: the table plus a legend footer.
    """
    rows_data = []
    task_ids = sorted(public_index.keys())
    for tid in task_ids:
        meta = public_index[tid]
        task_name = meta['task_name'].strip()  # trim
        output_type = meta['output_type'].strip()  # trim
        query = meta.get('query', '')
        # Filtering (exact match against the trimmed values).
        if task_name_filter and task_name_filter != task_name:
            continue
        if output_type_filter and output_type_filter != output_type:
            continue
        # Collect each inference file's validation result for this task.
        inf_results = []
        any_invalid = False
        total_tokens = 0
        for inf_map in inference_maps:
            gen = inf_map.get(tid, "")
            is_valid, error_msg = validate_format(gen, output_type)
            if not is_valid:
                any_invalid = True
            tokens = count_tokens(gen)
            total_tokens += tokens
            # Keep the full generation text (used by the modal view).
            inf_results.append({
                'valid': is_valid,
                'tokens': tokens,
                'error': error_msg if not is_valid else '',
                'generation': gen,  # full text
            })
        # Token count of the query itself.
        query_tokens = count_tokens(query)
        rows_data.append({
            'tid': tid,
            'task_name': task_name,
            'output_type': output_type,
            'query': query,
            'query_tokens': query_tokens,
            'any_invalid': any_invalid,
            'inf_results': inf_results,
            'total_tokens': total_tokens,
        })
    # Optional filter: keep only rows containing a format-invalid result.
    if invalid_only:
        rows_data = [r for r in rows_data if r['any_invalid']]
    # Sorting.
    if sort_column and sort_order:
        reverse = (sort_order == "降順")  # "降順" = descending
        # Sort by query token count.
        if sort_column == "query":
            rows_data.sort(
                key=lambda x: x['query_tokens'],
                reverse=reverse
            )
        else:
            # Sort by the token count of one inference column.
            try:
                col_idx = int(sort_column)
                rows_data.sort(
                    key=lambda x: x['inf_results'][col_idx]['tokens']
                    if col_idx < len(x['inf_results']) else 0,
                    reverse=reverse
                )
            except (ValueError, IndexError):
                # Ignore an unparsable or out-of-range column selection.
                pass
    # HTML generation.
    rows = []
    for row_idx, row in enumerate(rows_data):
        # Yellow background marks rows with at least one invalid result.
        bg_color = HIGHLIGHT_COLOR if row['any_invalid'] else "#ffffff"
        tid_disp = esc(row['tid'])
        task_name_disp = esc(row['task_name'])
        output_type_disp = esc(row['output_type'])
        # Show the first 50 characters of the query (ellipsis if longer).
        query_raw = row['query']
        query_len = len(query_raw)
        query_preview = query_raw[:50] + ('...' if query_len > 50 else '')
        query_disp = esc(query_preview)
        # NOTE(review): query_full is currently unused — verify before removing.
        query_full = esc(query_raw[:200])  # up to 200 chars for a title attr
        # Base64-encode the query so it embeds safely into the JS call.
        query_b64 = base64.b64encode(
            query_raw.encode('utf-8')
        ).decode('ascii')
        row_html = f'<tr style="background-color: {bg_color};">'
        # Compact cell style (reduced padding).
        td_compact = 'padding: 2px 4px; white-space: nowrap; font-size: 13px;'
        row_html += f'<td style="{td_compact}">{tid_disp}</td>'
        row_html += f'<td style="{td_compact}">{task_name_disp}</td>'
        row_html += f'<td style="{td_compact}">{output_type_disp}</td>'
        # Query column (click opens a modal; shows token count).
        query_tokens = row['query_tokens']
        tid_b64 = base64.b64encode(
            row['tid'].encode('utf-8')
        ).decode('ascii')
        q_style = 'padding: 2px 4px; max-width: 280px; font-size: 13px;'
        q_style += ' cursor: pointer;'
        onclick = f"showQueryModal('{tid_b64}','{query_b64}')"
        row_html += f'<td style="{q_style}" onclick="{onclick}" '
        row_html += 'title="クリックでクエリ全文を表示">'
        row_html += f'{query_disp} ({query_tokens}tokens)</td>'
        for res in row['inf_results']:
            status = "✓" if res['valid'] else "✗"
            color = "#28a745" if res['valid'] else "#dc3545"
            inf_style = 'padding: 2px 4px; text-align: left;'
            inf_style += ' white-space: nowrap; font-size: 13px;'
            # Invalid cells become clickable (error-detail modal).
            if not res['valid']:
                inf_style += ' cursor: pointer;'
                # Base64-encode task_id, error message, and generation text.
                tid_b64 = base64.b64encode(
                    row['tid'].encode('utf-8')
                ).decode('ascii')
                err_b64 = base64.b64encode(
                    res['error'].encode('utf-8')
                ).decode('ascii')
                gen_b64 = base64.b64encode(
                    res['generation'].encode('utf-8')
                ).decode('ascii')
                # Invoke the custom modal function.
                onclick = (
                    f"showErrorModal('{tid_b64}','{err_b64}','{gen_b64}')"
                )
                row_html += f'<td style="{inf_style}" onclick="{onclick}" '
                row_html += 'title="クリックでエラー詳細を表示">'
            else:
                row_html += f'<td style="{inf_style}">'
            row_html += f'<span style="color: {color}; font-weight: bold;">'
            row_html += f'{status}</span> ({res["tokens"]}tokens)</td>'
        row_html += '</tr>'
        rows.append(row_html)
    # Table header.
    header_cols = ['task_id', 'task_name', 'output_type', 'query']
    header_cols.extend([name.strip() for name in inference_names])
    header_html = '<tr style="background-color: #f8f9fa; '
    header_html += 'position: sticky; top: 0; z-index: 1;">'
    for col in header_cols:
        header_html += '<th style="padding: 4px 6px; font-size: 13px; '
        header_html += 'border-bottom: 2px solid #dee2e6; white-space: nowrap;">'
        header_html += f'{esc(col)}</th>'
    header_html += '</tr>'
    # Full table (modals live in the page-level JS; no modal markup here).
    formats = ['JSON', 'YAML', 'TOML', 'XML', 'CSV']
    table_html = f'''
    <div style="overflow-x: auto;
                border: 1px solid #dee2e6; border-radius: 4px;">
      <table style="width: 100%; border-collapse: collapse;
                    font-size: 14px; table-layout: auto;">
        <thead>{header_html}</thead>
        <tbody>{''.join(rows)}</tbody>
      </table>
    </div>
    <div style="margin-top: 8px; padding: 8px; background-color: #f8f9fa;
                border-radius: 4px; font-size: 12px;">
      <b>表示件数:</b> {len(rows)} / {len(task_ids)} 件
      <br>
      <b>結果の見方:</b>
      <br>
      - <span style="background-color: {HIGHLIGHT_COLOR}; padding: 2px 8px;
        border-radius: 2px;">黄色</span>
      : 少なくとも1つの推論結果が指定フォーマット({', '.join(formats)})
      として解釈できない(出力フォーマットの不正が生じた)
      <br>
      - <b>query</b>カラムをクリック → クエリ全文をモーダル表示
      <br>
      - <b>✗</b>マークをクリック → フォーマットエラー詳細と推論結果をモーダル表示
      <br>
      - <b>tokens</b>: 文字数÷4の簡易近似(日本語は過小評価の可能性あり)
      <br>
    </div>
    '''
    return table_html
def generate_colored_diff(text_a: str, text_b: str) -> str:
    """Render a unified diff of the two texts as colour-coded HTML."""
    diff_lines = list(difflib.unified_diff(
        text_a.splitlines(keepends=True),
        text_b.splitlines(keepends=True),
        fromfile='A', tofile='B',
    ))
    if not diff_lines:
        return "<div style='padding: 10px; color: #666;'>差分なし</div>"
    rendered = []
    for raw in diff_lines:
        # quote=False escapes exactly &, < and >; newlines are dropped
        # because each diff line becomes its own <div>.
        escaped = html.escape(raw, quote=False).replace('\n', '')
        if raw.startswith(('+++', '---')):
            rendered.append(
                f'<div style="color: #666; font-weight: bold;">'
                f'{escaped}</div>'
            )
        elif raw.startswith('@@'):
            rendered.append(
                f'<div style="color: #0066cc; background-color: #e6f2ff; '
                f'padding: 2px 4px;">{escaped}</div>'
            )
        elif raw.startswith('+'):
            rendered.append(
                f'<div style="color: #28a745; background-color: #e6ffec; '
                f'padding: 2px 4px;">{escaped}</div>'
            )
        elif raw.startswith('-'):
            rendered.append(
                f'<div style="color: #dc3545; background-color: #ffebe9; '
                f'padding: 2px 4px;">{escaped}</div>'
            )
        else:
            rendered.append(f'<div style="padding: 2px 4px;">{escaped}</div>')
    return f'''
    <div style="font-family: monospace; font-size: 12px;
                max-height: 300px; overflow-y: auto;
                border: 1px solid #dee2e6; border-radius: 4px;
                padding: 8px;">
    {''.join(rendered)}
    </div>
    '''
| # Gradio UI | |
def check_file_count(files):
    """Enable the load button only when at least one file is selected."""
    has_files = files is not None and len(files) >= 1
    return gr.update(interactive=has_files)
def process_files(
    files,
    task_name_filter_val: str = "",
    output_type_filter_val: str = "",
    sort_column_val: str = "",
    sort_order_val: str = "",
    invalid_only_val: bool = False,
):
    """Handle a file-upload event: load inference files and refresh the UI.

    Preserves the caller's current filter/sort selections. Returns a
    12-tuple matching the outputs wired up on file_input.change():
    summary HTML, task dropdown update, inference A/B dropdown updates,
    task-name/output-type/sort-column filter updates, sort-order update,
    three State values (names, maps, public index), and the local
    evaluation HTML.
    """
    if files is None or len(files) < 1:
        empty_choices = gr.update(
            choices=[], value=None, allow_custom_value=True
        )
        no_file_msg = ("<p style='color: #666;'>「📝 結果概要」タブで推論"
                       "ファイルをアップロードしてください</p>")
        return (
            "<p style='color: red;'>⚠️ 最低1つのinferenceファイルを"
            "アップロードしてください</p>",
            empty_choices,  # task_dropdown
            empty_choices,  # inf_a
            empty_choices,  # inf_b
            empty_choices,  # task_name_filter
            empty_choices,  # output_type_filter
            empty_choices,  # sort_column
            gr.update(interactive=False),  # sort_order (disable)
            [], [], {},
            no_file_msg,  # eval_result_html
        )
    try:
        public_index = load_public_data()
    except FileNotFoundError as e:
        # public_150.json missing: show the error, reset everything.
        empty_choices = gr.update(
            choices=[], value=None, allow_custom_value=True
        )
        no_file_msg = ("<p style='color: #666;'>「📝 結果概要」タブで推論"
                       "ファイルをアップロードしてください</p>")
        return (
            f"<p style='color: red;'>⚠️ {str(e)}</p>",
            empty_choices, empty_choices, empty_choices,
            empty_choices, empty_choices, empty_choices,
            gr.update(interactive=False),
            [], [], {},
            no_file_msg,  # eval_result_html
        )
    # Load every uploaded file; collect per-file errors without aborting.
    inference_names = []
    inference_maps = []
    load_errors = []
    for file in files:
        name, inf_map, error = load_inference_file(file)
        if error:
            load_errors.append(f"<b>{esc(name)}</b>: {esc(error)}")
        else:
            inference_names.append(name)
            inference_maps.append(inf_map)
    if load_errors:
        error_html = (
            "<div style='color: red; padding: 10px; "
            "background-color: #ffe6e6; border-radius: 4px; "
            "margin-bottom: 10px;'><b>⚠️ 読込みエラー:</b><br>" +
            "<br>".join(load_errors) + "</div>"
        )
    else:
        error_html = ""
    if len(inference_maps) < 1:
        # Every file failed to load.
        empty_choices = gr.update(choices=[], value=None)
        no_file_msg = ("<p style='color: #666;'>「📝 結果概要」タブで推論"
                       "ファイルをアップロードしてください</p>")
        return (
            error_html + "<p style='color: red;'>⚠️ 有効なinference"
            "ファイルが必要です</p>",
            empty_choices, empty_choices, empty_choices,
            empty_choices, empty_choices, empty_choices,
            gr.update(interactive=False),  # sort_order (disable)
            [], [], {},
            no_file_msg,  # eval_result_html
        )
    summary_html = error_html + generate_summary_html(
        public_index, inference_names, inference_maps,
        task_name_filter=task_name_filter_val or "",
        output_type_filter=output_type_filter_val or "",
        sort_column=sort_column_val or "",
        sort_order=sort_order_val or "",
        invalid_only=invalid_only_val,
    )
    # task_id choices (task_name included so the dropdown is searchable).
    task_choices = []
    for tid in sorted(public_index.keys()):
        meta = public_index[tid]
        label = f"{tid[:20]}... | {meta['task_name']}"
        task_choices.append((label, tid))
    # Choices for the filter dropdowns.
    task_names = sorted(set(m['task_name'] for m in public_index.values()))
    output_types = sorted(set(m['output_type'] for m in public_index.values()))
    # Sort-column choices: query plus one entry per inference file.
    sort_cols = [("query", "query")]
    sort_cols.extend([(f"{i}: {name}", str(i))
                      for i, name in enumerate(inference_names)])
    # Defaults for the detail-comparison dropdowns (A=first, B=second
    # when available, otherwise first again).
    inf_a_default = inference_names[0] if inference_names else None
    inf_b_default = (inference_names[1] if len(inference_names) > 1
                     else inference_names[0] if inference_names else None)
    # Run the local evaluation on the freshly loaded data.
    eval_html = run_local_evaluation(
        inference_names, inference_maps, public_index
    )
    return (
        summary_html,
        gr.update(choices=task_choices,
                  value=task_choices[0][1] if task_choices else None),
        gr.update(choices=inference_names, value=inf_a_default),
        gr.update(choices=inference_names, value=inf_b_default),
        gr.update(choices=[("なし", "")] + [(n, n) for n in task_names],
                  value=""),
        gr.update(choices=[("なし", "")] + [(t, t) for t in output_types],
                  value=""),
        gr.update(choices=[("なし", "")] + sort_cols, value=""),
        gr.update(interactive=False),  # sort_order (disabled initially)
        inference_names,
        inference_maps,
        public_index,
        eval_html,  # eval_result_html
    )
def update_summary(
    task_name_filter: str,
    output_type_filter: str,
    sort_column: str,
    sort_order: str,
    invalid_only: bool,
    inference_names: List[str],
    inference_maps: List[Dict[str, str]],
    public_index: Dict[str, Dict],
):
    """Re-render the summary table when a filter/sort control changes.

    Returns an HTML string; shows a placeholder message until data has
    been loaded via process_files.
    """
    if not public_index:
        return "<p>データが読み込まれていません</p>"
    return generate_summary_html(
        public_index,
        inference_names,
        inference_maps,
        task_name_filter=task_name_filter or "",
        output_type_filter=output_type_filter or "",
        sort_column=sort_column or "",
        sort_order=sort_order or "",
        invalid_only=invalid_only,
    )
def update_detail(
    task_id: str,
    inf_a_name: str,
    inf_b_name: str,
    inference_names: List[str],
    inference_maps: List[Dict[str, str]],
    public_index: Dict[str, Dict],
):
    """Refresh the detail view: task metadata, both generations, and diff.

    Returns (meta_md, gen_a, gen_b, diff_html, label_a, label_b); all six
    are empty strings when any required selection is missing.
    """
    if not (task_id and inf_a_name and inf_b_name):
        return "", "", "", "", "", ""
    meta = public_index.get(task_id, {})
    query = meta.get('query', 'N/A')
    query_tokens = count_tokens(query)
    meta_md = f"""
### Task Information
- **task_id**: `{task_id}`
- **task_name**: {meta.get('task_name', 'N/A')}
- **output_type**: {meta.get('output_type', 'N/A')}
### Query ({query_tokens} tokens)
<pre style="white-space: pre-wrap; word-wrap: break-word; background-color: #f5f5f5; padding: 12px; border-radius: 6px; font-family: monospace; font-size: 13px; max-height: 400px; overflow-y: auto;">{html.escape(query)}</pre>
"""

    def _generation(name: str, fallback_idx: int) -> str:
        # Resolve the file's slot by name; fall back to a fixed index,
        # and return "" when that index has no loaded map.
        idx = (inference_names.index(name)
               if name in inference_names else fallback_idx)
        if idx < len(inference_maps):
            return inference_maps[idx].get(task_id, "")
        return ""

    gen_a = _generation(inf_a_name, 0)
    gen_b = _generation(inf_b_name, 1)
    label_a = f"### Inference A: {inf_a_name} ({count_tokens(gen_a)} tokens)"
    label_b = f"### Inference B: {inf_b_name} ({count_tokens(gen_b)} tokens)"
    diff_html = generate_colored_diff(gen_a, gen_b)
    return meta_md, gen_a, gen_b, diff_html, label_a, label_b
def evaluate_single_inference(
    inf_name: str,
    inf_map: Dict[str, str],
    public_index: Dict[str, Dict],
) -> Dict:
    """Evaluate one inference file against the public task index.

    For every task with a known output_type, checks whether the generated
    text parses as that format. Tasks with an empty output_type are
    skipped.

    Args:
        inf_name: display name of the inference file.
        inf_map: task_id -> generated text.
        public_index: task_id -> task metadata.

    Returns:
        Dict with keys 'name', 'stats' (per-format {total, valid}),
        'errors' (failure samples, generation truncated to 100 chars),
        'total_valid', 'total_count', and 'overall_rate' (percentage).
    """
    from collections import defaultdict
    stats = defaultdict(lambda: {"total": 0, "valid": 0})
    errors = []
    for tid, meta in public_index.items():
        target_fmt = meta.get('output_type', '').upper()
        generated_text = inf_map.get(tid, '')
        if not target_fmt:
            continue
        # Fix: the previous version also called extract_content() here and
        # discarded the result — validate_format() already strips code
        # fences internally, so that pass was dead code.
        is_valid, error_msg = validate_format(generated_text, target_fmt)
        stats[target_fmt]["total"] += 1
        if is_valid:
            stats[target_fmt]["valid"] += 1
        else:
            errors.append({
                "task_id": tid,
                "format": target_fmt,
                "error": error_msg,
                "generated_snippet": generated_text[:100]
            })
    # Aggregate overall statistics.
    total_valid = sum(d["valid"] for d in stats.values())
    total_count = sum(d["total"] for d in stats.values())
    overall_rate = (total_valid / total_count) * 100 if total_count > 0 else 0
    return {
        "name": inf_name,
        "stats": dict(stats),
        "errors": errors,
        "total_valid": total_valid,
        "total_count": total_count,
        "overall_rate": overall_rate,
    }
def generate_eval_report_html(eval_result: Dict) -> str:
    """Render one evaluation result (from evaluate_single_inference) as HTML.

    Produces a per-format table with progress bars, an overall judgment
    banner, and up to 10 error samples.
    """
    stats = eval_result["stats"]
    errors = eval_result["errors"]
    total_valid = eval_result["total_valid"]
    total_count = eval_result["total_count"]
    overall_rate = eval_result["overall_rate"]
    format_rows = []
    for fmt, data in sorted(stats.items()):
        rate = (data["valid"] / data["total"]) * 100 if data["total"] > 0 else 0
        # Progress-bar colour and status glyph by pass rate.
        if rate >= 80:
            bar_color = "#28a745"
            status = "✓"
            status_color = "#28a745"
        elif rate >= 60:
            bar_color = "#ffc107"
            status = "△"
            status_color = "#ffc107"
        else:
            bar_color = "#dc3545"
            status = "✗"
            status_color = "#dc3545"
        bar_width = int(rate)
        format_rows.append(f'''
        <tr>
          <td style="padding: 8px 12px; font-weight: bold;
                     font-family: monospace;">[{fmt}]</td>
          <td style="padding: 8px 12px; width: 300px;">
            <div style="background-color: #e9ecef;
                        border-radius: 4px; height: 24px;">
              <div style="background-color: {bar_color};
                          width: {bar_width}%; height: 100%;
                          border-radius: 4px;"></div>
            </div>
          </td>
          <td style="padding: 8px 12px; text-align: right;
                     font-family: monospace;">{rate:.1f}%</td>
          <td style="padding: 8px 12px; text-align: center;
                     font-family: monospace;">({data["valid"]:3}/{data["total"]:3})</td>
          <td style="padding: 8px 12px; text-align: center;
                     font-size: 18px; color: {status_color};">{status}</td>
        </tr>
        ''')
    # Overall judgment message.
    if overall_rate >= 90:
        judgment = "✅ 提出候補として検討可能"
        judgment_color = "#28a745"
    elif overall_rate >= 80:
        judgment = "△ 要検討。弱点フォーマットの改善を推奨"
        judgment_color = "#ffc107"
    else:
        judgment = "⚠️ 見直し推奨。フォーマットエラーが多い"
        judgment_color = "#dc3545"
    # Error samples.
    error_samples_html = ""
    if errors:
        error_samples = []
        for err in errors[:10]:  # show at most 10
            err_msg = err['error'][:100].replace('\n', ' ') \
                if err['error'] else 'Unknown error'
            error_samples.append(f'''
            <tr style="background-color: #fff3f3;">
              <td style="padding: 6px 10px;
                         font-family: monospace;">[{err['format']}]</td>
              <td style="padding: 6px 10px;
                         font-family: monospace; color: #666;">{err['task_id']}</td>
              <td style="padding: 6px 10px;
                         color: #dc3545; font-size: 12px;">{esc(err_msg)}...</td>
            </tr>
            ''')
        error_samples_html = f'''
        <div style="margin-top: 20px;">
          <h4 style="color: #dc3545;">⚠️ エラー(最大10件)</h4>
          <table style="width: 100%; border-collapse: collapse;
                        font-size: 13px; border: 1px solid #dee2e6;">
            <thead>
              <tr style="background-color: #f8f9fa;">
                <th style="padding: 8px; text-align: left;
                           border-bottom: 2px solid #dee2e6;">Format</th>
                <th style="padding: 8px; text-align: left;
                           border-bottom: 2px solid #dee2e6;">Task ID</th>
                <th style="padding: 8px; text-align: left;
                           border-bottom: 2px solid #dee2e6;">Error</th>
              </tr>
            </thead>
            <tbody>
              {''.join(error_samples)}
            </tbody>
          </table>
        </div>
        '''
    return f'''
    <table style="width: 100%; border-collapse: collapse; margin: 16px 0;">
      <thead>
        <tr style="background-color: #e9ecef;">
          <th style="padding: 10px; text-align: left;">Format</th>
          <th style="padding: 10px; text-align: left;">Progress</th>
          <th style="padding: 10px; text-align: right;">Rate</th>
          <th style="padding: 10px; text-align: center;">Count</th>
          <th style="padding: 10px; text-align: center;">Status</th>
        </tr>
      </thead>
      <tbody>
        {''.join(format_rows)}
      </tbody>
    </table>
    <div style="margin-top: 20px; padding: 16px;
                background-color: #ffffff; border-radius: 6px;
                border-left: 4px solid {judgment_color};">
      <h4 style="margin: 0 0 8px 0;">
        OVERALL: {overall_rate:.2f}% ({total_valid}/{total_count})</h4>
      <p style="margin: 0; color: {judgment_color};
                font-weight: bold;">{judgment}</p>
    </div>
    {error_samples_html}
    '''
def run_local_evaluation(
    inference_names: List[str],
    inference_maps: List[Dict[str, str]],
    public_index: Dict[str, Dict],
    sort_order: str = "",
) -> str:
    """Run the local evaluation and return the result HTML.

    Based on the logic of scripts/local_eval.py, but reuses the inference
    data already uploaded to the app. With multiple files the reports are
    shown accordion-style.

    sort_order: "スコア昇順" (score ascending), "スコア降順" (score
    descending), or "" (upload order).
    """
    if not inference_maps or not public_index:
        return ("<p style='color: #666;'>「📝 結果概要」タブで推論ファイル"
                "(inference.json)をアップロードしてください</p>")
    try:
        # Evaluate every uploaded file.
        all_results = []
        for i, inf_map in enumerate(inference_maps):
            # Fall back to a synthetic name if names/maps are misaligned.
            inf_name = inference_names[i] if i < len(inference_names) \
                else f"inference_{i}.json"
            result = evaluate_single_inference(
                inf_name, inf_map, public_index
            )
            all_results.append(result)
        # Sorting by overall score.
        if sort_order == "スコア降順":
            all_results.sort(key=lambda x: x["overall_rate"], reverse=True)
        elif sort_order == "スコア昇順":
            all_results.sort(key=lambda x: x["overall_rate"], reverse=False)
        # else: keep upload order (default)
        # Single file: simple flat report.
        if len(all_results) == 1:
            report_html = generate_eval_report_html(all_results[0])
            return f'''
            <div style="padding: 20px;
                        background-color: #f8f9fa; border-radius: 8px;">
              <h3 style="margin-top: 0;">
                🏆 評価レポート(コードフェンス自動除去版)</h3>
              <p style="color: #666; margin-bottom: 16px;">
                ファイル: <strong>{esc(all_results[0]["name"])}</strong></p>
              {report_html}
              <div style="margin-top: 24px; padding: 12px;
                          background-color: #fff3cd; border-radius: 6px;
                          border: 1px solid #ffc107;">
                <p style="margin: 0; font-size: 13px; color: #856404;">
                  <strong>※ あくまで参考値であり、
                  LBでの評価と相関があるとは限りません。</strong><br>
                  詳細な評価基準が公表されていないため、
                  フォーマットに沿っているかのみを判定しています。
                </p>
              </div>
            </div>
            '''
        # Multiple files: accordion display using HTML5 details/summary
        # (JavaScript does not run inside Gradio HTML components).
        accordion_items = []
        for i, res in enumerate(all_results):
            rate = res["overall_rate"]
            # Badge colour by score.
            if rate >= 90:
                badge_color = "#28a745"
                badge_text_color = "white"
            elif rate >= 80:
                badge_color = "#ffc107"
                badge_text_color = "black"
            else:
                badge_color = "#dc3545"
                badge_text_color = "white"
            report_html = generate_eval_report_html(res)
            # Open the first item by default.
            open_attr = "open" if i == 0 else ""
            accordion_items.append(f'''
            <details {open_attr} style="margin-bottom: 12px;
                     border: 1px solid #dee2e6; border-radius: 8px;
                     overflow: hidden;">
              <summary style="padding: 12px 16px;
                       background-color: #f8f9fa; cursor: pointer;
                       font-weight: bold; font-size: 14px;
                       display: flex; align-items: center;
                       justify-content: space-between;
                       list-style: none;">
                <span style="display: flex; align-items: center;">
                  <span style="margin-right: 12px;">📄</span>
                  {esc(res["name"])}
                </span>
                <span style="background-color: {badge_color};
                       color: {badge_text_color};
                       padding: 4px 12px; border-radius: 12px;
                       font-size: 13px; font-weight: bold;">
                  {rate:.1f}%
                </span>
              </summary>
              <div style="padding: 20px; background-color: #ffffff;">
                {report_html}
              </div>
            </details>
            ''')
        result_html = f'''
        <div style="padding: 20px;
                    background-color: #f8f9fa; border-radius: 8px;">
          <h3 style="margin-top: 0;">
            🏆 評価レポート(コードフェンス自動除去版)</h3>
          <p style="color: #666; margin-bottom: 16px;">
            {len(all_results)}件のファイルを評価しました。
            各ファイル名をクリックして結果を展開/折りたたみできます。
          </p>
          {''.join(accordion_items)}
          <div style="margin-top: 24px; padding: 12px;
                      background-color: #fff3cd; border-radius: 6px;
                      border: 1px solid #ffc107;">
            <p style="margin: 0; font-size: 13px; color: #856404;">
              <strong>※ あくまで参考値であり、
              LBでの評価と相関があるとは限りません。</strong><br>
              詳細な評価基準が公表されていないため、
              フォーマットに沿っているかのみを判定しています。
            </p>
          </div>
        </div>
        '''
        return result_html
    except json.JSONDecodeError as e:
        return f"<p style='color: red;'>⚠️ JSONパースエラー: {e}</p>"
    except Exception as e:
        return f"<p style='color: red;'>⚠️ エラー: {e}</p>"
def swap_inferences(inf_a: str, inf_b: str):
    """Exchange the Inference A / Inference B dropdown selections."""
    swapped = (inf_b, inf_a)
    return swapped
def load_custom_css() -> str:
    """Return the contents of styles.css, or "" when the file is absent."""
    if not CSS_PATH.exists():
        return ""
    return CSS_PATH.read_text(encoding='utf-8')
def get_custom_js() -> str:
    """Return the contents of scripts.js, or "" when the file is absent."""
    if not JS_PATH.exists():
        return ""
    return JS_PATH.read_text(encoding='utf-8')
def update_sort_order_visibility(sort_column: str):
    """Enable the sort-order radio only when a sort column is selected.

    An empty string or None disables it (truthiness covers both cases).
    """
    return gr.update(interactive=bool(sort_column))
def update_eval_results(
    sort_order: str,
    inference_names: List[str],
    inference_maps: List[Dict[str, str]],
    public_index: Dict[str, Dict],
) -> str:
    """Re-run the local evaluation with the selected display order.

    Thin wrapper so the sort dropdown's change event can pass the current
    State values straight into run_local_evaluation.
    """
    return run_local_evaluation(
        inference_names, inference_maps, public_index, sort_order
    )
| # アプリ構築 | |
| with gr.Blocks(title="Inference Comparator") as app: | |
| gr.Markdown(""" | |
| # 🔍 Inference Comparator | |
| テストデータ(public_150.json)への推論結果(inference.json)を比較するツール。 | |
| **使い方:** | |
| 1. 推論ファイル(inference.json)をアップロードする(1ファイル以上必要、複数ファイル選択可) | |
| 2. 一覧タブで、推論結果の概要を確認できる(黄色: フォーマット不正を含む) | |
| 3. 詳細比較タブで、複数の推論結果を個別比較できる(2ファイル以上必要) | |
| """) | |
| # State | |
| inference_names_state = gr.State([]) | |
| inference_maps_state = gr.State([]) | |
| public_index_state = gr.State({}) | |
| file_input = gr.File( | |
| label="📂 推論ファイル(inference.json)のアップロード", | |
| file_count="multiple", | |
| file_types=[".json"], | |
| ) | |
| with gr.Tabs(): | |
| with gr.TabItem("📝 結果概要"): | |
| with gr.Row(): | |
| task_name_filter = gr.Dropdown( | |
| label="🔍 task_name フィルター", | |
| choices=[], | |
| value="", | |
| allow_custom_value=True, | |
| interactive=True, | |
| ) | |
| output_type_filter = gr.Dropdown( | |
| label="🔍 output_type フィルター", | |
| choices=[], | |
| value="", | |
| allow_custom_value=True, | |
| interactive=True, | |
| ) | |
| invalid_only_checkbox = gr.Checkbox( | |
| label="⚠️ フォーマット不正のレコードのみ表示", | |
| value=False, | |
| interactive=True, | |
| ) | |
| with gr.Row(): | |
| sort_column = gr.Dropdown( | |
| label="📊 ソートカラム", | |
| choices=[], | |
| value="", | |
| allow_custom_value=True, | |
| interactive=True, | |
| ) | |
| sort_order = gr.Radio( | |
| label="ソート順", | |
| choices=["昇順", "降順"], | |
| value="降順", | |
| interactive=False, # 初期状態はdisable | |
| ) | |
| summary_html = gr.HTML( | |
| value="<p>ファイルをアップロードしてください</p>" | |
| ) | |
| with gr.TabItem("🔎 詳細比較"): | |
| # Task IDドロップダウン(検索機能付き) | |
| task_dropdown = gr.Dropdown( | |
| label="🔍 Task ID を選択(検索可能: ID / Task Name)", | |
| choices=[], | |
| interactive=True, | |
| allow_custom_value=False, | |
| filterable=True, # 検索機能を有効化 | |
| ) | |
| with gr.Row(): | |
| with gr.Column(scale=5): | |
| inf_a_dropdown = gr.Dropdown( | |
| label="Inference A", | |
| choices=[], | |
| interactive=True, | |
| ) | |
| with gr.Column(scale=0, min_width=60): | |
| swap_btn = gr.Button( | |
| "⇄", | |
| size="sm", | |
| ) | |
| with gr.Column(scale=5): | |
| inf_b_dropdown = gr.Dropdown( | |
| label="Inference B", | |
| choices=[], | |
| interactive=True, | |
| ) | |
| meta_md = gr.Markdown("Task情報がここに表示されます") | |
| with gr.Row(): | |
| with gr.Column(): | |
| label_a = gr.Markdown("### Inference A") | |
| text_a = gr.Code( | |
| label="", | |
| language="json", | |
| lines=15, | |
| interactive=False, | |
| ) | |
| with gr.Column(): | |
| label_b = gr.Markdown("### Inference B") | |
| text_b = gr.Code( | |
| label="", | |
| language="json", | |
| lines=15, | |
| interactive=False, | |
| ) | |
| gr.Markdown("### Diff (A → B)") | |
| diff_output = gr.HTML( | |
| value="<div style='color: #666;'>差分がここに表示されます</div>" | |
| ) | |
| with gr.TabItem("📊 ローカル評価"): | |
| gr.Markdown(""" | |
| ### ローカル簡易評価 | |
| 推論結果のフォーマット妥当性を検証し、簡易スコアを算出します。 | |
| `python scripts/local_eval.py` と同等の評価を行います。 | |
| **使い方**: 上部の「Inferenceファイルのアップロード」から推論ファイルをアップロードすると、 | |
| 自動的に評価結果が表示されます。 | |
| """) | |
| eval_sort_dropdown = gr.Dropdown( | |
| label="📊 表示順", | |
| choices=[ | |
| ("アップロード順", ""), | |
| ("スコア降順", "スコア降順"), | |
| ("スコア昇順", "スコア昇順"), | |
| ], | |
| value="", | |
| interactive=True, | |
| scale=1, | |
| ) | |
| eval_result_html = gr.HTML( | |
| value="<p style='color: #666;'>推論ファイルをアップロードすると評価結果が表示されます</p>" | |
| ) | |
| # イベントハンドラ - ファイル選択時に即座に処理(フィルター状態を維持) | |
| file_input.change( | |
| fn=process_files, | |
| inputs=[ | |
| file_input, | |
| task_name_filter, | |
| output_type_filter, | |
| sort_column, | |
| sort_order, | |
| invalid_only_checkbox, | |
| ], | |
| outputs=[ | |
| summary_html, | |
| task_dropdown, | |
| inf_a_dropdown, | |
| inf_b_dropdown, | |
| task_name_filter, | |
| output_type_filter, | |
| sort_column, | |
| sort_order, # 追加: ソート順の表示/非表示制御 | |
| inference_names_state, | |
| inference_maps_state, | |
| public_index_state, | |
| eval_result_html, # ローカル評価結果も更新 | |
| ] | |
| ) | |
| # フィルター・ソート変更時 | |
| filter_inputs = [ | |
| task_name_filter, | |
| output_type_filter, | |
| sort_column, | |
| sort_order, | |
| invalid_only_checkbox, | |
| inference_names_state, | |
| inference_maps_state, | |
| public_index_state, | |
| ] | |
| task_name_filter.change( | |
| fn=update_summary, | |
| inputs=filter_inputs, | |
| outputs=[summary_html], | |
| ) | |
| output_type_filter.change( | |
| fn=update_summary, | |
| inputs=filter_inputs, | |
| outputs=[summary_html], | |
| ) | |
| invalid_only_checkbox.change( | |
| fn=update_summary, | |
| inputs=filter_inputs, | |
| outputs=[summary_html], | |
| ) | |
| sort_column.change( | |
| fn=update_summary, | |
| inputs=filter_inputs, | |
| outputs=[summary_html], | |
| ) | |
| # ソートカラム変更時にソート順の表示/非表示を切り替え | |
| sort_column.change( | |
| fn=update_sort_order_visibility, | |
| inputs=[sort_column], | |
| outputs=[sort_order], | |
| ) | |
| sort_order.change( | |
| fn=update_summary, | |
| inputs=filter_inputs, | |
| outputs=[summary_html], | |
| ) | |
| # 詳細更新トリガー | |
| detail_inputs = [ | |
| task_dropdown, | |
| inf_a_dropdown, | |
| inf_b_dropdown, | |
| inference_names_state, | |
| inference_maps_state, | |
| public_index_state, | |
| ] | |
| detail_outputs = [meta_md, text_a, text_b, diff_output, label_a, label_b] | |
| task_dropdown.change( | |
| fn=update_detail, inputs=detail_inputs, outputs=detail_outputs | |
| ) | |
| inf_a_dropdown.change( | |
| fn=update_detail, inputs=detail_inputs, outputs=detail_outputs | |
| ) | |
| inf_b_dropdown.change( | |
| fn=update_detail, inputs=detail_inputs, outputs=detail_outputs | |
| ) | |
| # Inference A と B の入れ替えボタン | |
| swap_btn.click( | |
| fn=swap_inferences, | |
| inputs=[inf_a_dropdown, inf_b_dropdown], | |
| outputs=[inf_a_dropdown, inf_b_dropdown], | |
| ) | |
| # ローカル評価ソート変更時 | |
| eval_sort_dropdown.change( | |
| fn=update_eval_results, | |
| inputs=[ | |
| eval_sort_dropdown, | |
| inference_names_state, | |
| inference_maps_state, | |
| public_index_state, | |
| ], | |
| outputs=[eval_result_html], | |
| ) | |
| if __name__ == "__main__": | |
| # css, jsは Gradio 6.0 以降は launch() に指定 | |
| app.launch( | |
| theme=gr.themes.Soft(), | |
| css=load_custom_css(), | |
| js=get_custom_js(), | |
| ) | |