# app.py — roc-hci TuringBench evaluation service (commit 7f8909f, verified)
import json, os, threading
import pandas as pd
from fastapi import FastAPI, Request, Response, Header
from huggingface_hub import HfApi, hf_hub_download, list_repo_files
import uvicorn
app = FastAPI()
API = HfApi()
# Hub access token; optional (None) so the app can still start without it.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Hidden ground-truth labels are injected via env; a missing PRIVATE_LABELS
# raises KeyError at startup, which is intentional — the app is useless without them.
SECRET = os.environ["PRIVATE_LABELS"]
LABELS = json.loads(SECRET)["who_is_human"]
# Shared secret for webhook authentication; if unset, the webhook skips the check.
WEBHOOK_SECRET = os.environ.get("WEBHOOK_SECRET")
SUBMISSIONS_REPO = "roc-hci/turing-bench-submissions"
RESULTS_REPO = "roc-hci/turing-bench-results"
def get_pending_submissions():
    """Return the set of submission IDs that have no result file yet.

    A submission is identified by the stem of its metadata file
    (``metadata/<id>.json`` in the submissions repo); a completed evaluation
    is recorded as ``results/<id>.json`` in the results repo.
    """
    # List all submission metadata files
    submission_files = [
        f for f in list_repo_files(SUBMISSIONS_REPO, repo_type="dataset", token=HF_TOKEN)
        if f.startswith("metadata/") and f.endswith(".json")
    ]
    # Only consider files under results/ — the old filter matched ANY .json
    # anywhere in the results repo, so an unrelated JSON file (e.g. a
    # top-level config.json) would be mistaken for a completed evaluation
    # and silently hide a pending submission.
    result_files = [
        f for f in list_repo_files(RESULTS_REPO, repo_type="dataset", token=HF_TOKEN)
        if f.startswith("results/") and f.endswith(".json")
    ]
    # removeprefix/removesuffix only strip at the edges; str.replace would
    # also mangle IDs that happen to contain ".json" or "metadata/" inside.
    submitted_ids = {f.removeprefix("metadata/").removesuffix(".json") for f in submission_files}
    evaluated_ids = {f.removeprefix("results/").removesuffix(".json") for f in result_files}
    return submitted_ids - evaluated_ids
def evaluate_submission(submission_id: str):
    """Score one submission and publish its result record to the results repo.

    Downloads the submission's metadata and predictions from the submissions
    repo, runs the scorer, and uploads ``results/<submission_id>.json``.
    """
    # Fetch the metadata record describing this submission.
    meta_file = hf_hub_download(
        repo_id=SUBMISSIONS_REPO,
        filename=f"metadata/{submission_id}.json",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    with open(meta_file) as fh:
        meta = json.load(fh)

    # Fetch the predictions file the metadata points at.
    preds_file = hf_hub_download(
        repo_id=SUBMISSIONS_REPO,
        filename=meta["predictions_file"],
        repo_type="dataset",
        token=HF_TOKEN,
    )

    # ---- Your evaluation logic goes here ----
    scores = run_evaluation(preds_file)
    # ------------------------------------------

    # Assemble the leaderboard record for this submission.
    record = {
        "model_name": meta["model_name"],
        "submitted_by": meta["submitted_by"],
        "submission_time": meta["submission_time"],
        "accuracy": scores["Accuracy"],
    }

    # Upload the result; its presence is what marks the submission as evaluated.
    API.upload_file(
        path_or_fileobj=json.dumps(record).encode(),
        path_in_repo=f"results/{submission_id}.json",
        repo_id=RESULTS_REPO,
        repo_type="dataset",
        token=HF_TOKEN,
    )
    print(f"Evaluated {submission_id}: {scores}")
def run_evaluation(predictions_path: str) -> dict:
    """Score a predictions CSV against the hidden ground-truth labels.

    Reads the ``who_is_human`` column, normalizes each entry to a stripped
    string, and computes exact-match accuracy over the label set.
    """
    frame = pd.read_csv(predictions_path)
    guesses = frame["who_is_human"].astype(str).str.strip().tolist()
    # zip truncates at the shorter sequence, but the denominator stays
    # len(LABELS), so missing predictions count as wrong.
    n_correct = sum(guess == truth for guess, truth in zip(guesses, LABELS))
    return {"Accuracy": n_correct / len(LABELS)}
@app.get("/")
def health():
return "OK"
@app.post("/webhook")
async def webhook(request: Request, x_webhook_secret: str = Header(None)):
if WEBHOOK_SECRET and x_webhook_secret != WEBHOOK_SECRET:
return Response(status_code=403)
payload = await request.json()
print("Webhook received:", json.dumps(payload, indent=2))
event = payload.get("event", {})
repo = payload.get("repo", {})
if event.get("action") == "update" and repo.get("name") == SUBMISSIONS_REPO:
threading.Thread(target=process_pending, daemon=True).start()
return "OK"
def process_pending():
    """Evaluate every submission that does not yet have a result file."""
    for sid in get_pending_submissions():
        try:
            evaluate_submission(sid)
        except Exception as exc:
            # Best-effort: one failing submission must not block the others.
            print(f"Error evaluating {sid}: {exc}")
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=7860)