import os
import json
import tempfile
import zipfile
import concurrent.futures

import pandas as pd

from src.utils import _inject, openai_call, deepseek_call, claude_call, split_json_objects, parse, wavg
from src.config import MAX_TOKENS, PRESET
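# Backend dispatch table. Judging from the call sites below, each backend
# callable is assumed to take (prompt, article, summary, temperature) and
# return a (raw_json_text, token_count) pair; the exact contract is defined
# in src.utils.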
BACKENDS_ALL = {"OpenAI": openai_call, "DeepSeek": deepseek_call, "Claude": claude_call}

# Module-level caches holding the most recent run's outputs.
last_summary = {}
last_eval_result = {}
def evaluate(article, summary, variant, active_back, temp,
             w_cov, w_align, w_hall, w_rel, w_bias, show_ev):
    weights = dict(coverage=w_cov, alignment=w_align, hallucination=w_hall,
                   relevance=w_rel, bias_toxicity=w_bias)
    # "ParallelX-TJ" runs both prompt variants side by side; anything else runs alone.
    run_variants = ["Twin-Lock", "Judge-Lock"] if variant == "ParallelX-TJ" else [variant]
    rows, feedback, tokens, raw_blobs = [], {}, {b: 0 for b in active_back}, {}
    # Load each prompt template; the context manager guarantees the handle is closed.
    PROMPTS = {}
    for tag, path in (("Twin-Lock", "prompts/twinlock.txt"),
                      ("Judge-Lock", "prompts/judgelock.txt")):
        with open(path, encoding="utf-8") as f:
            PROMPTS[tag] = f.read()
    with concurrent.futures.ThreadPoolExecutor(max_workers=6) as exe:
        futures = []
        for v in run_variants:
            prompt = PROMPTS[v]
            for b in active_back:
                # Submit the backend call itself so it executes on a worker
                # thread. Binding v/b/prompt as default arguments freezes
                # their current values; a plain closure would see the loop's
                # final values by the time the worker runs.
                futures.append(
                    exe.submit(
                        lambda v=v, b=b, p=prompt:
                            (v, b, *BACKENDS_ALL[b](p, article, summary, temp))
                    )
                )
        for fut in concurrent.futures.as_completed(futures):
            vtag, backend, raw, tok = fut.result()
            tokens[backend] += tok
            raw_blobs[f"{vtag}_{backend}"] = raw
            parsed = parse(raw)
            # Optionally strip per-claim evidence from the hallucination report.
            if not show_ev and "hallucination" in parsed:
                for c in parsed["hallucination"].get("claims_checked", []):
                    c.pop("evidence", None)
            rows.append({
                "Variant": vtag,
                "Model": backend,
                "coverage": parsed.get("coverage", {}).get("overall_score", 0),
                "alignment": parsed.get("alignment", {}).get("overall_score", 0),
                "hallucination": parsed.get("hallucination", {}).get("overall_score", 0),
                "relevance": parsed.get("relevance", {}).get("overall_score", 0),
                "bias_toxicity": parsed.get("bias_toxicity", {}).get("overall_score", 0),
                "Total": round(wavg(parsed, weights), 2),
            })
            feedback[f"{vtag} • {backend}"] = parsed
    df = pd.DataFrame(rows)
    avg = df.groupby("Variant")["Total"].mean().round(2).to_dict()
    if variant != "ParallelX-TJ":
        avg = {variant: avg.get(variant)}
    tmp = tempfile.mkdtemp()
    csv_path = os.path.join(tmp, "metrics.csv")
    df.to_csv(csv_path, index=False)
    zip_path = os.path.join(tmp, "raw_json.zip")
    with zipfile.ZipFile(zip_path, "w") as z:
        for tag, blob in raw_blobs.items():
            p = os.path.join(tmp, f"{tag}.json")
            with open(p, "w", encoding="utf-8") as f:
                f.write(blob)
            z.write(p, arcname=os.path.basename(p))
    last_eval_result.update({"scores": avg, "comments": feedback})
    return df, feedback, avg, tokens, csv_path, zip_path
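
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). The argument values below are
# hypothetical; in the Space itself they would come from the UI controls.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_article = "NASA launched the Artemis I mission in November 2022."
    sample_summary = "Artemis I launched in 2022."
    df, feedback, avg, tokens, csv_path, zip_path = evaluate(
        sample_article, sample_summary,
        variant="ParallelX-TJ",            # runs Twin-Lock and Judge-Lock together
        active_back=["OpenAI", "Claude"],  # any subset of BACKENDS_ALL keys
        temp=0.2,
        w_cov=1, w_align=1, w_hall=1, w_rel=1, w_bias=1,
        show_ev=False,
    )
    print(df)
    print("averages:", avg, "tokens:", tokens)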