| | """ |
| | ML4SE Benchmark Viewer |
| | |
| | A web-based interface for browsing and inspecting individual datapoints |
| | from popular ML4SE benchmark datasets (REval, CRUXEval, HumanEval+, |
| | BigOBench, and others). |
| | """ |
| |
|
| | import ast as _ast |
| | import os |
| |
|
| | from flask import Flask, jsonify, render_template, request |
| | from pygments import highlight |
| | from pygments.formatters import HtmlFormatter |
| | from pygments.lexers import PythonLexer, get_lexer_by_name |
| |
|
# Application object; the @app.route decorators below register handlers on it.
app = Flask(__name__)
| |
|
| |
|
| | def _extract_test_classes(test_code: str, cls_name: str) -> list: |
| | """ |
| | Parse a ClassEval unittest module and return one dict per test class |
| | in definition order: {"name": ..., "code": ...}. |
| | |
| | Matches top-level classes whose names start with f"{cls_name}Test", |
| | which is the same pattern used by ClassFactory.create_test_classes(). |
| | Uses ast.parse only — no code execution, safe to call from the web server. |
| | """ |
| | try: |
| | tree = _ast.parse(test_code) |
| | except SyntaxError as e: |
| | print(f"Warning: SyntaxError parsing test code for {cls_name}: {e}") |
| | return [] |
| | lines = test_code.splitlines(keepends=True) |
| | prefix = f"{cls_name}Test" |
| | result = [] |
| | for node in tree.body: |
| | if isinstance(node, _ast.ClassDef) and node.name.startswith(prefix): |
| | start = node.lineno - 1 |
| | end = node.end_lineno |
| | result.append( |
| | { |
| | "name": node.name, |
| | "code": "".join(lines[start:end]), |
| | } |
| | ) |
| | return result |
| |
|
| |
|
| | def _code_offset(code: str) -> int: |
| | """Number of leading newlines that Pygments will strip.""" |
| | offset = 0 |
| | for ch in code: |
| | if ch == "\n": |
| | offset += 1 |
| | else: |
| | break |
| | return offset |
| |
|
| |
|
def highlight_code(code, highlight_lines=None, language="python"):
    """
    Render source code as syntax-highlighted HTML.

    Args:
        code: The source text to render.
        highlight_lines: Line numbers to emphasise, handed to the formatter
            as ``hl_lines``; a falsy value means no lines are emphasised.
        language: Lexer name; it is lower-cased and resolved with
            get_lexer_by_name. If that lookup raises, PythonLexer is used.

    Returns:
        HTML string produced by pygments.highlight, using a line-number
        table and the "source" CSS class.
    """
    emphasised = highlight_lines if highlight_lines else []
    html_formatter = HtmlFormatter(
        linenos="table",
        cssclass="source",
        hl_lines=emphasised,
        linenostart=1,
    )
    try:
        chosen_lexer = get_lexer_by_name(language.lower())
    except Exception:
        # Best-effort fallback: any lookup failure renders the code as Python.
        chosen_lexer = PythonLexer()
    return highlight(code, chosen_lexer, html_formatter)
| |
|
| |
|
def get_css():
    """Return the Pygments style definitions scoped to the ".source" selector."""
    formatter = HtmlFormatter()
    return formatter.get_style_defs(".source")
| |
|
| |
|
| | |
| | |
| | |
| |
|
| | from adapters import REGISTRY, _set_helpers, register_hf_datasets |
| |
|
| | |
# Pass the helper functions defined above to the adapters module.
_set_helpers(highlight_code, _code_offset, _extract_test_classes)

# NOTE(review): presumably fills REGISTRY with Hugging Face-backed datasets — confirm in adapters.
register_hf_datasets()
| |
|
| |
|
def _get_adapter(dataset_slug: str):
    """Look up *dataset_slug* in REGISTRY and return the adapter, or None if absent."""
    adapter = REGISTRY.get(dataset_slug)
    return adapter
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
@app.route("/")
def index():
    """Serve the main page (the index.html template) listing the benchmark problems."""
    page = render_template("index.html")
    return page
| |
|
| |
|
@app.route("/api/datasets")
def get_datasets():
    """Return the registered datasets as JSON for the UI dataset selector.

    Each entry carries the slug, display name, problem count, total count
    and ground-truth flag of one adapter. Entries are ordered by display
    name, compared case-insensitively.
    """
    entries = []
    for slug, adapter in REGISTRY.items():
        entries.append(
            {
                "slug": slug,
                "display_name": adapter.display_name,
                "problem_count": adapter.problem_count(),
                "total_count": adapter.total_count,
                "has_ground_truth": adapter.has_ground_truth,
            }
        )
    # list.sort is stable, so ties keep registry iteration order.
    entries.sort(key=lambda entry: entry["display_name"].lower())
    return jsonify(entries)
| |
|
| |
|
@app.route("/api/<dataset_slug>/problems")
def get_problems(dataset_slug):
    """Return a JSON list with one summary per problem of the dataset.

    Responds with a 404 JSON error when the slug is not registered.
    """
    adapter = _get_adapter(dataset_slug)
    if adapter is None:
        return jsonify({"error": f"Unknown dataset: {dataset_slug}"}), 404

    summaries = []
    for position in range(adapter.problem_count()):
        summaries.append(adapter.get_problem_summary(position))
    return jsonify(summaries)
| |
|
| |
|
@app.route("/api/<dataset_slug>/problem/<int:idx>")
def get_problem(idx, dataset_slug):
    """Return the detailed JSON record of one problem.

    Responds with 404 for an unknown dataset or an out-of-range index, and
    with 500 when building the detail raises KeyError, IndexError or
    ValueError.
    """
    adapter = _get_adapter(dataset_slug)
    if adapter is None:
        return jsonify({"error": f"Unknown dataset: {dataset_slug}"}), 404

    if idx < 0 or idx >= adapter.problem_count():
        return jsonify({"error": "Invalid problem index"}), 404

    try:
        detail = adapter.get_problem_detail(idx)
        payload = jsonify(detail)
    except (KeyError, IndexError, ValueError) as exc:
        return jsonify({"error": f"Internal error: {exc}"}), 500
    return payload
| |
|
| |
|
@app.route("/api/highlight_code")
def highlight_code_api():
    """Highlight the ``code`` query argument and return the HTML as JSON.

    The optional ``lines`` query argument is a comma-separated list of line
    numbers to emphasise; blank items are skipped and a non-integer item
    produces a 400 JSON error.
    """
    code = request.args.get("code", "")
    raw_lines = request.args.get("lines", "")

    # Stays None when no ``lines`` argument was supplied.
    selected = None
    if raw_lines:
        try:
            selected = [int(item) for item in raw_lines.split(",") if item.strip()]
        except ValueError:
            return jsonify({"error": "Invalid line numbers"}), 400

    return jsonify({"highlighted_code": highlight_code(code, selected)})
| |
|
| |
|
@app.route("/<dataset_slug>/problem/<int:idx>")
def problem_detail(idx, dataset_slug):
    """Render the problem.html page for a single problem.

    Responds with a 404 JSON error when the dataset is unknown or the index
    is out of range.
    """
    adapter = _get_adapter(dataset_slug)
    if adapter is None:
        return jsonify({"error": "Unknown dataset"}), 404

    if idx < 0 or idx >= adapter.problem_count():
        return jsonify({"error": "Problem not found"}), 404

    # Values handed to the template, evaluated in this order.
    template_context = {
        "idx": idx,
        "css": get_css(),
        "total_problems": adapter.problem_count(),
        "dataset_slug": dataset_slug,
        "dataset_name": adapter.display_name,
        "has_ground_truth": adapter.has_ground_truth,
        "has_tasks": adapter.has_tasks,
    }
    return render_template("problem.html", **template_context)
| |
|
| |
|
@app.route("/api/css")
def get_css_api():
    """Serve the syntax-highlighting stylesheet with a text/css content type."""
    stylesheet = get_css()
    headers = {"Content-Type": "text/css"}
    return stylesheet, 200, headers
| |
|
| |
|
@app.route("/api/<dataset_slug>/problem/<int:idx>/ground_truth/<int:input_idx>")
def get_ground_truth(idx, input_idx, dataset_slug):
    """Return ground truth execution data for one (problem, input) pair.

    Responses:
        404 JSON error when the dataset slug is unknown or idx is out of range.
        200 JSON {"status": "unavailable", ...} when the adapter has no
            ground truth.
        500 JSON error when the adapter lookup raises KeyError, IndexError
            or ValueError.
        Otherwise the adapter's result serialised as JSON.
    """
    adapter = _get_adapter(dataset_slug)
    if adapter is None:
        return jsonify({"error": f"Unknown dataset: {dataset_slug}"}), 404

    if not adapter.has_ground_truth:
        return jsonify({"status": "unavailable", "message": "Ground truth not available"}), 200

    if not (0 <= idx < adapter.problem_count()):
        return jsonify({"error": "Invalid problem index"}), 404

    # input_idx is not range-checked in this handler, so guard the adapter
    # call and answer with a JSON error instead of an unhandled exception.
    try:
        return jsonify(adapter.get_ground_truth(idx, input_idx))
    except (KeyError, IndexError, ValueError) as exc:
        # Keep the traceback in the server log; the client only gets the message.
        app.logger.exception(
            "Ground truth lookup failed for %s problem %s input %s",
            dataset_slug,
            idx,
            input_idx,
        )
        return jsonify({"error": f"Internal error: {exc}"}), 500
| |
|
| |
|
if __name__ == "__main__":
    # Debug mode is opt-in: only the value "true" (any letter case) in
    # FLASK_DEBUG enables it. PORT overrides the default port 7860.
    flask_debug = os.environ.get("FLASK_DEBUG", "false")
    listen_port = int(os.environ.get("PORT", "7860"))
    app.run(debug=flask_debug.lower() == "true", host="0.0.0.0", port=listen_port)
| |
|