Spaces:
Sleeping
Sleeping
| from datetime import datetime, timezone | |
| import json | |
| import os | |
| import uuid | |
| from fastapi import APIRouter, HTTPException | |
| from pydantic import BaseModel, Field, validator | |
| from src.envs import API, RESULTS_REPO, EVAL_RESULTS_PATH, TOKEN | |
# All submission endpoints in this module are mounted under the /api prefix.
router = APIRouter(prefix="/api", tags=["submission"])
# Names of the ten 0-5 subjective metrics collected per submission. The order
# here fixes the order in which they appear in the serialized results payload.
ALL_SUBJECTIVE_FIELDS = (
    "readability relevance explanation_clarity problem_identification "
    "actionability completeness specificity contextual_adequacy "
    "consistency brevity"
).split()
class ResultPayload(BaseModel):
    """One model's evaluation results as submitted to the leaderboard.

    Carries the model identity (Hub id + revision), the BLEU score, the ten
    0-5 subjective metrics, and the pass@k metrics. Validators enforce the
    monotonicity pass@1 <= pass@5 <= pass@10.
    """

    model: str = Field(..., description="Model id on the Hub (e.g. org/model)")
    revision: str = Field("main", description="Commit sha or branch (default: main)")
    bleu: float = Field(..., ge=0, description="BLEU score (0-100)")
    # 10 subjective metrics, each an integer score in [0, 5]
    readability: int = Field(..., ge=0, le=5)
    relevance: int = Field(..., ge=0, le=5)
    explanation_clarity: int = Field(..., ge=0, le=5)
    problem_identification: int = Field(..., ge=0, le=5)
    actionability: int = Field(..., ge=0, le=5)
    completeness: int = Field(..., ge=0, le=5)
    specificity: int = Field(..., ge=0, le=5)
    contextual_adequacy: int = Field(..., ge=0, le=5)
    consistency: int = Field(..., ge=0, le=5)
    brevity: int = Field(..., ge=0, le=5)
    pass_at_1: float = Field(..., ge=0, le=1)
    pass_at_5: float = Field(..., ge=0, le=1)
    pass_at_10: float = Field(..., ge=0, le=1)

    # BUG FIX: these cross-field checks had validator signatures but were
    # plain methods, so pydantic never invoked them. Registering them with
    # @validator (imported at the top of the file) makes them actually run.
    @validator("pass_at_5")
    def _p5_ge_p1(cls, v, values):
        # pass@k is monotone non-decreasing in k; reject inconsistent data.
        if "pass_at_1" in values and v < values["pass_at_1"]:
            raise ValueError("pass@5 must be >= pass@1")
        return v

    @validator("pass_at_10")
    def _p10_ge_p5(cls, v, values):
        # "pass_at_5" is absent from values when it failed its own validation.
        if "pass_at_5" in values and v < values["pass_at_5"]:
            raise ValueError("pass@10 must be >= pass@5")
        return v

    def multimetric(self) -> float:
        """Return the mean of the ten subjective metrics (0-5 scale)."""
        total = sum(getattr(self, f) for f in ALL_SUBJECTIVE_FIELDS)
        return float(total) / len(ALL_SUBJECTIVE_FIELDS)
# BUG FIX: no route decorator was attached, so this handler was never
# registered on the (otherwise unused) router and the endpoint did not exist.
# NOTE(review): path "/submit" is an assumption — confirm against clients.
@router.post("/submit")
async def submit_results(payload: ResultPayload):
    """Accept new evaluation results and push them to the results dataset.

    Serializes the payload to a uniquely-named local JSON file (in the format
    read_evals.py consumes), uploads it to RESULTS_REPO on the Hub, and always
    removes the local copy afterwards.

    Raises:
        HTTPException: 500 when the upload to the Hub fails.
    """
    # Shape expected by read_evals.py: a "config" header plus a flat
    # {"metric": {"score": ...}} mapping under "results".
    results_dict = {
        "config": {
            "model_dtype": "unknown",  # dtype is not part of the payload
            "model_name": payload.model,
            "model_sha": payload.revision,
        },
        "results": {},
    }
    # Primary metrics
    results_dict["results"]["bleu"] = {"score": payload.bleu}
    results_dict["results"]["multimetric"] = {"score": payload.multimetric()}
    # Subjective metrics
    for field in ALL_SUBJECTIVE_FIELDS:
        results_dict["results"][field] = {"score": getattr(payload, field)}
    # Pass@k metrics
    for k in (1, 5, 10):
        metric = f"pass_at_{k}"
        results_dict["results"][metric] = {"score": getattr(payload, metric)}
    # Unique filename: '/' is illegal in filenames, and timestamp + random
    # suffix prevent collisions between concurrent submissions.
    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    unique_id = uuid.uuid4().hex[:8]
    filename = f"results_{payload.model.replace('/', '_')}_{ts}_{unique_id}.json"
    local_path = os.path.join(EVAL_RESULTS_PATH, filename)
    with open(local_path, "w", encoding="utf-8") as fp:
        json.dump(results_dict, fp)
    try:
        API.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=filename,
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            commit_message=f"Add results for {payload.model}",
        )
    except Exception as e:
        # Surface upload failures to the client; chain the cause for logs.
        raise HTTPException(status_code=500, detail=f"Failed to upload results: {e}") from e
    finally:
        # Clean up the temp file whether or not the upload succeeded.
        if os.path.exists(local_path):
            os.remove(local_path)
    return {"status": "ok", "detail": "Results submitted."}