import os
import json
import tempfile
import zipfile
import concurrent.futures
import pandas as pd
from src.utils import _inject, openai_call, deepseek_call, claude_call, split_json_objects, parse, wavg
from src.config import MAX_TOKENS, PRESET
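
# Registry of backend call functions: each takes (prompt, article, summary, temperature)
# and returns a (raw_response, token_count) pair.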
BACKENDS_ALL = {"OpenAI": openai_call, "DeepSeek": deepseek_call, "Claude": claude_call}
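# Module-level caches holding the most recent summary and evaluation result.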
last_summary = {}
last_eval_result = {}

def evaluate(article, summary, variant, active_back, temp,
             w_cov, w_align, w_hall, w_rel, w_bias, show_ev):
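    """Score `summary` against `article` with the selected prompt variant(s) across
    the active backends.

    Returns the per-model metric DataFrame, parsed feedback, per-variant average
    totals, token usage per backend, and paths to a metrics CSV and a zip of the
    raw model responses.
    """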
    weights = dict(coverage=w_cov, alignment=w_align, hallucination=w_hall,
                   relevance=w_rel, bias_toxicity=w_bias)
    run_variants = ["Twin-Lock", "Judge-Lock"] if variant == "ParallelX-TJ" else [variant]
    rows, feedback, tokens, raw_blobs = [], {}, {b: 0 for b in active_back}, {}
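    # Prompt templates are re-read from disk on every call.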
    PROMPTS = {
        "Twin-Lock": open("prompts/twinlock.txt", encoding="utf-8").read(),
        "Judge-Lock": open("prompts/judgelock.txt", encoding="utf-8").read(),
    }
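    # Fan out one request per (variant, backend) pair across a small thread pool.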
    with concurrent.futures.ThreadPoolExecutor(max_workers=6) as exe:
        futures = []
        for v in run_variants:
            prompt = PROMPTS[v]
            for b in active_back:
                # Bind v, b and prompt via default arguments so the backend call
                # executes inside the worker thread rather than at submit time.
                futures.append(
                    exe.submit(lambda v=v, b=b, p=prompt:
                               (v, b, *BACKENDS_ALL[b](p, article, summary, temp)))
                )
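        # Gather results as they complete: tally token usage, keep raw output, parse scores.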
        for fut in concurrent.futures.as_completed(futures):
            vtag, backend, raw, tok = fut.result()
            tokens[backend] += tok
            raw_blobs[f"{vtag}_{backend}"] = raw
            parsed = parse(raw)
            # Drop per-claim evidence from the hallucination section when it is not requested.
            if not show_ev and "hallucination" in parsed:
                for c in parsed["hallucination"].get("claims_checked", []):
                    c.pop("evidence", None)
            rows.append({
                "Variant": vtag,
                "Model": backend,
                "coverage": parsed.get("coverage", {}).get("overall_score", 0),
                "alignment": parsed.get("alignment", {}).get("overall_score", 0),
                "hallucination": parsed.get("hallucination", {}).get("overall_score", 0),
                "relevance": parsed.get("relevance", {}).get("overall_score", 0),
                "bias_toxicity": parsed.get("bias_toxicity", {}).get("overall_score", 0),
                "Total": round(wavg(parsed, weights), 2),
            })
            feedback[f"{vtag} • {backend}"] = parsed
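    # Average the weighted totals per variant; a single-variant run reports only that variant.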
    df = pd.DataFrame(rows)
    avg = df.groupby("Variant")["Total"].mean().round(2).to_dict()
    if variant != "ParallelX-TJ":
        avg = {variant: avg.get(variant)}
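    # Write downloadable artifacts: a metrics CSV plus a zip of the raw JSON responses.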
    tmp = tempfile.mkdtemp()
    csv_path = os.path.join(tmp, "metrics.csv")
    df.to_csv(csv_path, index=False)
    zip_path = os.path.join(tmp, "raw_json.zip")
    with zipfile.ZipFile(zip_path, "w") as z:
        for tag, blob in raw_blobs.items():
            p = os.path.join(tmp, f"{tag}.json")
            with open(p, "w", encoding="utf-8") as f:
                f.write(blob)
            z.write(p, arcname=os.path.basename(p))
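    # Cache the latest scores and comments for later retrieval.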
    last_eval_result.update({"scores": avg, "comments": feedback})
    return df, feedback, avg, tokens, csv_path, zip_path
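
# Example usage (a sketch; the input variables and weight values below are hypothetical,
# and backend credentials are assumed to be configured elsewhere):
# df, feedback, avg, tokens, csv_path, zip_path = evaluate(
#     article_text, summary_text, "ParallelX-TJ", ["OpenAI", "Claude"], 0.2,
#     w_cov=1.0, w_align=1.0, w_hall=1.0, w_rel=1.0, w_bias=1.0, show_ev=True)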