Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files- src/comparison.py +38 -0
- src/config.py +21 -0
src/comparison.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from src.api_clients import openai_client
|
| 3 |
+
|
| 4 |
+
# Module-level shared state. Nothing visible in this module writes to it, so
# it is presumably filled in place by the evaluation code elsewhere -- confirm
# against the writer.
last_eval_result = {}  # Shared state


def get_last_eval_data():
    """Return the shared evaluation result, or ``None`` when it is empty.

    Returns:
        The module-level ``last_eval_result`` dict itself (not a copy) when it
        holds at least one entry, otherwise ``None``.
    """
    # An empty dict is falsy, so "no result yet" is reported as None.
    return last_eval_result or None
|
| 8 |
+
|
| 9 |
+
def run_comparison(human_scores, human_comments, model_scores, model_comments):
    """Ask the chat model to compare human and model summary evaluations.

    The four arguments are interpolated into the prompt with f-string
    formatting, so any value with a usable string form is accepted.

    Args:
        human_scores: Scores given by the human reviewer.
        human_comments: Comments given by the human reviewer.
        model_scores: Scores produced by the model.
        model_comments: Comments produced by the model.

    Returns:
        The ``content`` of the first choice in the chat completion response.
    """
    # The prompt body is kept flush-left so that no source indentation leaks
    # into the text sent to the model.
    prompt = f"""Compare human and model summary evaluations.

Human Scores: {human_scores}
Model Scores: {model_scores}

Human Comments: {human_comments}
Model Comments: {model_comments}

Output key differences, strengths, and any mismatches."""
    res = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=800,
    )
    return res.choices[0].message.content
|
| 25 |
+
|
| 26 |
+
def import_model_metrics():
    """Return the last evaluation's scores and comments as six strings.

    The stored result is read through ``get_last_eval_data()``.

    Returns:
        A list of six empty strings when there is no stored result or it has
        no ``"scores"`` entry. Otherwise a 6-tuple: the ``coverage``,
        ``alignment``, ``hallucination``, ``relevance`` and ``bias_toxicity``
        scores, each converted with ``str`` (``""`` for a missing key),
        followed by the ``"comments"`` entry serialized as indented JSON.
    """
    data = get_last_eval_data()
    if not data or "scores" not in data:
        # Deliberately still a list (the populated path returns a tuple) so
        # existing callers keep seeing the same type on the empty path.
        return [""] * 6

    scores = data["scores"]
    return (
        str(scores.get("coverage", "")),
        str(scores.get("alignment", "")),
        str(scores.get("hallucination", "")),
        str(scores.get("relevance", "")),
        str(scores.get("bias_toxicity", "")),
        # A missing "comments" entry serializes to the two-character JSON
        # string '""', not to an empty Python string.
        json.dumps(data.get("comments", ""), indent=2),
    )
|
src/config.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Token limit keyed by backend name. Presumably the per-request maximum for
# each provider -- the consumer is not visible here, confirm at the call site.
MAX_TOKENS = {"OpenAI": 8000, "DeepSeek": 8000, "Claude": 4000}
|
| 2 |
+
|
| 3 |
+
# Named presets, each mapping the five evaluation metrics to a weight.
# NOTE(review): the "Twin-Lock" weights sum to 0.80 while "Judge-Lock" sums to
# 1.00 -- confirm whether Twin-Lock is normalized by its consumer or is
# missing a weight.
PRESET = {
    "Twin-Lock": dict(
        coverage=0.25,
        alignment=0.20,
        hallucination=0.15,
        relevance=0.15,
        bias_toxicity=0.05,
    ),
    "Judge-Lock": dict(
        coverage=0.35,
        alignment=0.15,
        hallucination=0.30,
        relevance=0.15,
        bias_toxicity=0.05,
    ),
}
|
| 7 |
+
|
| 8 |
+
# Stylesheet text; the selectors target Gradio's container and the element
# ids/classes used by the UI (variant/backend groups, metric sliders, buttons).
#
# The slider thumb/track rules are written once per vendor pseudo-element on
# purpose: a selector list is discarded as a whole when a browser does not
# recognize one of its selectors, so grouping ``::-webkit-*`` with ``::-moz-*``
# would make the rule apply in neither engine family.
CSS = """
body,.gradio-container{background:#f7f7f7!important;color:#1a1a1a!important}
textarea,textarea.gr-input{background:#f7f7f7!important;color:#1a1a1a!important}
textarea::placeholder,input::placeholder{color:#666!important}
input[type=radio]{accent-color:#000000}
input[type=checkbox]{accent-color:#000000}
#variant-group input[type=radio]{accent-color:#ffa500}
#backend-group input[type=checkbox]{accent-color:#0074d9}
.metric-slider input[type=range]::-webkit-slider-thumb{background:#21a366!important}
.metric-slider input[type=range]::-moz-range-thumb{background:#21a366!important}
.metric-slider input[type=range]::-webkit-slider-runnable-track{background:#cfe8db!important}
.metric-slider input[type=range]::-moz-range-track{background:#cfe8db!important}
#btn-twin,#btn-judge,#run-btn{background:#000000!important;color:#ffffff!important;border-radius:6px!important}
"""
|