Spaces:
Sleeping
Sleeping
| from datetime import datetime, timezone | |
| import json | |
| import os | |
| import uuid | |
| from fastapi import APIRouter, HTTPException | |
| from pydantic import BaseModel, Field, validator | |
| from src.envs import API, RESULTS_REPO, EVAL_RESULTS_PATH, TOKEN | |
# All submission endpoints in this module are mounted under the /api prefix.
router = APIRouter(prefix="/api", tags=["submission"])
# Names of the ten 0-5 subjective metrics collected per submission. The order
# here fixes the order in which they appear in the serialized results payload.
ALL_SUBJECTIVE_FIELDS = (
    "readability relevance explanation_clarity problem_identification "
    "actionability completeness specificity contextual_adequacy "
    "consistency brevity"
).split()
class ResultPayload(BaseModel):
    """One model's evaluation results as submitted to the leaderboard.

    Carries the model identity (Hub id + revision), the BLEU score, the ten
    0-5 subjective metrics, and the pass@k metrics. Validators enforce the
    monotonicity pass@1 <= pass@5 <= pass@10.
    """

    model: str = Field(..., description="Model id on the Hub (e.g. org/model)")
    revision: str = Field("main", description="Commit sha or branch (default: main)")
    bleu: float = Field(..., ge=0, description="BLEU score (0-100)")
    # 10 subjective metrics, each an integer score in [0, 5]
    readability: int = Field(..., ge=0, le=5)
    relevance: int = Field(..., ge=0, le=5)
    explanation_clarity: int = Field(..., ge=0, le=5)
    problem_identification: int = Field(..., ge=0, le=5)
    actionability: int = Field(..., ge=0, le=5)
    completeness: int = Field(..., ge=0, le=5)
    specificity: int = Field(..., ge=0, le=5)
    contextual_adequacy: int = Field(..., ge=0, le=5)
    consistency: int = Field(..., ge=0, le=5)
    brevity: int = Field(..., ge=0, le=5)
    pass_at_1: float = Field(..., ge=0, le=1)
    pass_at_5: float = Field(..., ge=0, le=1)
    pass_at_10: float = Field(..., ge=0, le=1)

    # BUG FIX: these cross-field checks had validator signatures but were
    # plain methods, so pydantic never invoked them. Registering them with
    # @validator (imported at the top of the file) makes them actually run.
    @validator("pass_at_5")
    def _p5_ge_p1(cls, v, values):
        # pass@k is monotone non-decreasing in k; reject inconsistent data.
        if "pass_at_1" in values and v < values["pass_at_1"]:
            raise ValueError("pass@5 must be >= pass@1")
        return v

    @validator("pass_at_10")
    def _p10_ge_p5(cls, v, values):
        # "pass_at_5" is absent from values when it failed its own validation.
        if "pass_at_5" in values and v < values["pass_at_5"]:
            raise ValueError("pass@10 must be >= pass@5")
        return v

    def multimetric(self) -> float:
        """Return the mean of the ten subjective metrics (0-5 scale)."""
        total = sum(getattr(self, f) for f in ALL_SUBJECTIVE_FIELDS)
        return float(total) / len(ALL_SUBJECTIVE_FIELDS)
# BUG FIX: no route decorator was attached, so this handler was never
# registered on the (otherwise unused) router and the endpoint did not exist.
# NOTE(review): path "/submit" is an assumption — confirm against clients.
@router.post("/submit")
async def submit_results(payload: ResultPayload):
    """Accept new evaluation results and push them to the results dataset.

    Serializes the payload to a uniquely-named local JSON file (in the format
    read_evals.py consumes), uploads it to RESULTS_REPO on the Hub, and always
    removes the local copy afterwards.

    Raises:
        HTTPException: 500 when the upload to the Hub fails.
    """
    # Shape expected by read_evals.py: a "config" header plus a flat
    # {"metric": {"score": ...}} mapping under "results".
    results_dict = {
        "config": {
            "model_dtype": "unknown",  # dtype is not part of the payload
            "model_name": payload.model,
            "model_sha": payload.revision,
        },
        "results": {},
    }
    # Primary metrics
    results_dict["results"]["bleu"] = {"score": payload.bleu}
    results_dict["results"]["multimetric"] = {"score": payload.multimetric()}
    # Subjective metrics
    for field in ALL_SUBJECTIVE_FIELDS:
        results_dict["results"][field] = {"score": getattr(payload, field)}
    # Pass@k metrics
    for k in (1, 5, 10):
        metric = f"pass_at_{k}"
        results_dict["results"][metric] = {"score": getattr(payload, metric)}
    # Unique filename: '/' is illegal in filenames, and timestamp + random
    # suffix prevent collisions between concurrent submissions.
    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    unique_id = uuid.uuid4().hex[:8]
    filename = f"results_{payload.model.replace('/', '_')}_{ts}_{unique_id}.json"
    local_path = os.path.join(EVAL_RESULTS_PATH, filename)
    with open(local_path, "w", encoding="utf-8") as fp:
        json.dump(results_dict, fp)
    try:
        API.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=filename,
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            commit_message=f"Add results for {payload.model}",
        )
    except Exception as e:
        # Surface upload failures to the client; chain the cause for logs.
        raise HTTPException(status_code=500, detail=f"Failed to upload results: {e}") from e
    finally:
        # Clean up the temp file whether or not the upload succeeded.
        if os.path.exists(local_path):
            os.remove(local_path)
    return {"status": "ok", "detail": "Results submitted."}