import os
import json
import tempfile
import zipfile
import concurrent.futures

import pandas as pd

from src.utils import _inject, openai_call, deepseek_call, claude_call, split_json_objects, parse, wavg
from src.config import MAX_TOKENS, PRESET
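# Backend dispatch table. Judging from the call sites below, each backend
# callable is assumed to take (prompt, article, summary, temperature) and
# return a (raw_json_text, token_count) pair; the exact contract is defined
# in src.utils.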
BACKENDS_ALL = {"OpenAI": openai_call, "DeepSeek": deepseek_call, "Claude": claude_call}

# Module-level caches holding the most recent run's outputs.
last_summary = {}
last_eval_result = {}
def evaluate(article, summary, variant, active_back, temp,
             w_cov, w_align, w_hall, w_rel, w_bias, show_ev):
    weights = dict(coverage=w_cov, alignment=w_align, hallucination=w_hall,
                   relevance=w_rel, bias_toxicity=w_bias)
    # "ParallelX-TJ" runs both prompt variants side by side; anything else runs alone.
    run_variants = ["Twin-Lock", "Judge-Lock"] if variant == "ParallelX-TJ" else [variant]
    rows, feedback, tokens, raw_blobs = [], {}, {b: 0 for b in active_back}, {}
    # Load each prompt template; the context manager guarantees the handle is closed.
    PROMPTS = {}
    for tag, path in (("Twin-Lock", "prompts/twinlock.txt"),
                      ("Judge-Lock", "prompts/judgelock.txt")):
        with open(path, encoding="utf-8") as f:
            PROMPTS[tag] = f.read()
    with concurrent.futures.ThreadPoolExecutor(max_workers=6) as exe:
        futures = []
        for v in run_variants:
            prompt = PROMPTS[v]
            for b in active_back:
                # Submit the backend call itself so it executes on a worker
                # thread. Binding v/b/prompt as default arguments freezes
                # their current values; a plain closure would see the loop's
                # final values by the time the worker runs.
                futures.append(
                    exe.submit(
                        lambda v=v, b=b, p=prompt:
                            (v, b, *BACKENDS_ALL[b](p, article, summary, temp))
                    )
                )
        for fut in concurrent.futures.as_completed(futures):
            vtag, backend, raw, tok = fut.result()
            tokens[backend] += tok
            raw_blobs[f"{vtag}_{backend}"] = raw
            parsed = parse(raw)
            # Optionally strip per-claim evidence from the hallucination report.
            if not show_ev and "hallucination" in parsed:
                for c in parsed["hallucination"].get("claims_checked", []):
                    c.pop("evidence", None)
            rows.append({
                "Variant": vtag,
                "Model": backend,
                "coverage": parsed.get("coverage", {}).get("overall_score", 0),
                "alignment": parsed.get("alignment", {}).get("overall_score", 0),
                "hallucination": parsed.get("hallucination", {}).get("overall_score", 0),
                "relevance": parsed.get("relevance", {}).get("overall_score", 0),
                "bias_toxicity": parsed.get("bias_toxicity", {}).get("overall_score", 0),
                "Total": round(wavg(parsed, weights), 2),
            })
            feedback[f"{vtag} • {backend}"] = parsed
    df = pd.DataFrame(rows)
    avg = df.groupby("Variant")["Total"].mean().round(2).to_dict()
    if variant != "ParallelX-TJ":
        avg = {variant: avg.get(variant)}
    tmp = tempfile.mkdtemp()
    csv_path = os.path.join(tmp, "metrics.csv")
    df.to_csv(csv_path, index=False)
    zip_path = os.path.join(tmp, "raw_json.zip")
    with zipfile.ZipFile(zip_path, "w") as z:
        for tag, blob in raw_blobs.items():
            p = os.path.join(tmp, f"{tag}.json")
            with open(p, "w", encoding="utf-8") as f:
                f.write(blob)
            z.write(p, arcname=os.path.basename(p))
    last_eval_result.update({"scores": avg, "comments": feedback})
    return df, feedback, avg, tokens, csv_path, zip_path
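
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). The argument values below are
# hypothetical; in the Space itself they would come from the UI controls.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_article = "NASA launched the Artemis I mission in November 2022."
    sample_summary = "Artemis I launched in 2022."
    df, feedback, avg, tokens, csv_path, zip_path = evaluate(
        sample_article, sample_summary,
        variant="ParallelX-TJ",            # runs Twin-Lock and Judge-Lock together
        active_back=["OpenAI", "Claude"],  # any subset of BACKENDS_ALL keys
        temp=0.2,
        w_cov=1, w_align=1, w_hall=1, w_rel=1, w_bias=1,
        show_ev=False,
    )
    print(df)
    print("averages:", avg, "tokens:", tokens)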