Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files- src/api_clients.py +117 -0
- src/conversation.py +20 -0
- src/evaluation.py +82 -0
src/api_clients.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

import anthropic
import openai
from anthropic import Anthropic
from openai import OpenAI

# Debug: log library versions and proxy-related env vars so an HF Spaces
# misconfiguration is visible in the container logs at startup.
print(f"Anthropic version: {anthropic.__version__}")
print(f"OpenAI version: {openai.__version__}")
print(f"Proxy env vars before cleanup: HTTP_PROXY={os.getenv('HTTP_PROXY')}, HTTPS_PROXY={os.getenv('HTTPS_PROXY')}, NO_PROXY={os.getenv('NO_PROXY')}")

# Forcefully disable proxies for HF Spaces.  Remove the variables outright
# rather than setting them to "": httpx/requests treat an empty proxy URL
# differently from an absent one, and they also honor the lowercase
# spellings, which a plain HTTP_PROXY="" assignment leaves untouched.
for _proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "http_proxy", "https_proxy"):
    os.environ.pop(_proxy_var, None)
os.environ["NO_PROXY"] = "*"
| 16 |
+
|
| 17 |
+
# ✅ Patch any client constructors to ignore 'proxies'
|
| 18 |
+
def safe_init(client_cls):
    """Wrap ``client_cls.__init__`` to strip a 'proxies' kwarg injected by Spaces.

    Some HF Spaces environments inject ``proxies=...`` into SDK client
    constructors, which newer openai/anthropic clients reject with a
    TypeError.  This patches ``__init__`` in place to drop that kwarg.

    Improvements over the naive wrapper: ``functools.wraps`` preserves the
    original ``__init__`` metadata, and a marker attribute makes the patch
    idempotent so re-importing this module does not stack wrappers.

    Args:
        client_cls: The client class whose ``__init__`` is patched in place.
    """
    from functools import wraps

    orig_init = client_cls.__init__
    # Guard against double-patching if this module is imported more than once.
    if getattr(orig_init, "_proxies_stripped", False):
        return

    @wraps(orig_init)
    def wrapped_init(self, *args, **kwargs):
        if "proxies" in kwargs:
            print(f"[Patch] Stripped unexpected 'proxies' from {client_cls.__name__}")
            kwargs.pop("proxies", None)
        return orig_init(self, *args, **kwargs)

    wrapped_init._proxies_stripped = True
    client_cls.__init__ = wrapped_init
| 27 |
+
|
| 28 |
+
# Apply patch to both clients
# NOTE(review): runs at import time, so every later OpenAI()/Anthropic()
# construction in this process goes through the 'proxies'-stripping wrapper.
safe_init(OpenAI)
safe_init(Anthropic)
| 31 |
+
|
| 32 |
+
def init_clients():
    """Initialize API clients using HF Spaces Repository Secrets.

    Reads OPENAI_API_KEY, ANTHROPIC_API_KEY and DEEPSEEK_API_KEY from the
    environment and builds one client per provider.  DeepSeek is reached
    through its OpenAI-compatible endpoint.

    Returns:
        Tuple of ``(openai_client, anthropic_client, deepseek_client)``.

    Raises:
        ValueError: If any of the three API keys is missing.
        RuntimeError: If a client constructor fails; the original error is
            chained as ``__cause__``.
    """
    openai_key = os.getenv("OPENAI_API_KEY")
    anthropic_key = os.getenv("ANTHROPIC_API_KEY")
    deepseek_key = os.getenv("DEEPSEEK_API_KEY")

    # Fail fast with a clear message instead of an opaque auth error later.
    # Raised outside the try block so it is NOT re-wrapped into a generic
    # Exception (the original code swallowed its own ValueError's type).
    if not all([openai_key, anthropic_key, deepseek_key]):
        raise ValueError("Missing one or more API keys in HF Spaces Repository Secrets.")

    try:
        openai_client = OpenAI(api_key=openai_key)
        anthropic_client = Anthropic(api_key=anthropic_key)
        # DeepSeek exposes an OpenAI-compatible API, so reuse the OpenAI SDK.
        deepseek_client = OpenAI(
            api_key=deepseek_key,
            base_url="https://api.deepseek.com/v1"
        )
        return openai_client, anthropic_client, deepseek_client
    except Exception as e:
        # Chain the cause so the original traceback is preserved for debugging.
        raise RuntimeError(f"Failed to initialize API clients: {str(e)}") from e
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def gpt4_mini_backend(system_msg, user_prompt, temperature):
    """Call the GPT-4o Mini chat-completions API.

    Args:
        system_msg: System-role instruction for the judge model.
        user_prompt: User-role prompt (the filled evaluation template).
        temperature: Sampling temperature forwarded to the API.

    Returns:
        Tuple of ``(response_text, total_tokens)``.

    Raises:
        Exception: Prefixed "GPT-4o-mini error:", with the provider's
            original error chained as ``__cause__``.
    """
    # NOTE(review): init_clients() is re-run on every call; constructors are
    # lightweight, but caching the clients would avoid repeated env lookups.
    openai_client, _, _ = init_clients()
    try:
        r = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": user_prompt}
            ],
            temperature=temperature
        )
        return r.choices[0].message.content, r.usage.total_tokens
    except Exception as e:
        # raise ... from e keeps the provider traceback; the original
        # re-raise discarded it.
        raise Exception(f"GPT-4o-mini error: {str(e)}") from e
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def anthropic_backend(system_msg, user_prompt, temperature):
    """Call the Anthropic Claude Messages API.

    Args:
        system_msg: System prompt (Anthropic takes it as a top-level field,
            not as a message role).
        user_prompt: User-turn content.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        Tuple of ``(response_text, total_tokens)`` where total_tokens is
        input + output tokens (Anthropic reports them separately).

    Raises:
        Exception: Prefixed "Anthropic error:", original error chained.
    """
    _, anthropic_client, _ = init_clients()
    try:
        r = anthropic_client.messages.create(
            model="claude-3-5-sonnet-20241022",
            system=system_msg,
            messages=[{"role": "user", "content": user_prompt}],
            max_tokens=2000,
            temperature=temperature
        )
        text = r.content[0].text.strip()
        # Anthropic usage has no total_tokens field; sum the two counters
        # so all backends report a single comparable number.
        toks = r.usage.input_tokens + r.usage.output_tokens
        return text, toks
    except Exception as e:
        # Preserve the provider traceback via exception chaining.
        raise Exception(f"Anthropic error: {str(e)}") from e
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def deepseek_backend(system_msg, user_prompt, temperature):
    """Call the DeepSeek chat API via its OpenAI-compatible client.

    Args:
        system_msg: System-role instruction for the judge model.
        user_prompt: User-role prompt (the filled evaluation template).
        temperature: Sampling temperature forwarded to the API.

    Returns:
        Tuple of ``(response_text, total_tokens)``.

    Raises:
        Exception: Prefixed "DeepSeek error:", original error chained.
    """
    _, _, deepseek_client = init_clients()
    try:
        r = deepseek_client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": user_prompt}
            ],
            temperature=temperature
        )
        return r.choices[0].message.content, r.usage.total_tokens
    except Exception as e:
        # Preserve the provider traceback via exception chaining.
        raise Exception(f"DeepSeek error: {str(e)}") from e
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# ✅ Register backends
# Maps the UI-facing model label to its backend callable.  Every callable
# shares the signature (system_msg, user_prompt, temperature) and returns
# (response_text, total_tokens); src/evaluation.py dispatches through this
# dict by model name.
BACKENDS = {
    "GPT-4o Mini": gpt4_mini_backend,
    "Claude 3.5 Sonnet": anthropic_backend,
    "DeepSeek Chat": deepseek_backend
}
src/conversation.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.api_clients import init_clients


def structure_conversation(raw_text):
    """Normalize a raw dialogue into a 'HUMAN:' / 'AI:' transcript.

    Uses Claude at temperature 0.0 as a deterministic formatter so the
    downstream judge models all see the same turn-by-turn structure.

    Args:
        raw_text: Free-form conversation text supplied by the user.

    Returns:
        The reformatted transcript as a stripped string.

    Raises:
        Exception: Prefixed "Error in structuring conversation:", with the
            original error chained as ``__cause__``.
    """
    _, anthropic_client, _ = init_clients()
    formatter_prompt = (
        "Convert this dialogue into a turn-by-turn transcript where each line "
        "starts with 'HUMAN:' or 'AI:'. Do not add any other commentary.\n\n"
        + raw_text
    )
    try:
        resp = anthropic_client.messages.create(
            model="claude-3-5-sonnet-20241022",
            system="You are a conversation formatter.",
            messages=[{"role": "user", "content": formatter_prompt}],
            max_tokens=1000,
            temperature=0.0
        )
        return resp.content[0].text.strip()
    except Exception as e:
        # Chain the cause so the API's traceback is preserved; the original
        # re-raise dropped it.
        raise Exception(f"Error in structuring conversation: {str(e)}") from e
|
src/evaluation.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import json
|
| 3 |
+
import datetime
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from src.api_clients import BACKENDS
|
| 6 |
+
|
| 7 |
+
def split_json_objects(s):
    """Extract top-level JSON object substrings from ``s``.

    Scans the text tracking brace depth, and — unlike a naive brace
    counter — skips braces that occur inside JSON string literals
    (including escaped quotes), so input like ``{"s": "}"}`` is kept as
    one object instead of being split at the brace inside the string.
    Stray closing braces outside any object are ignored (depth never
    goes negative).

    Args:
        s: Arbitrary text that may contain zero or more JSON objects,
            possibly surrounded by prose.

    Returns:
        List of substrings, each spanning one balanced top-level object,
        in order of appearance.  Empty list if none are found.
    """
    objs = []
    depth = 0
    start = None
    in_string = False
    escaped = False
    for i, ch in enumerate(s):
        if depth > 0:
            # Only honor string literals once we are inside an object;
            # quotes in surrounding prose are irrelevant to splitting.
            if in_string:
                if escaped:
                    escaped = False
                elif ch == "\\":
                    escaped = True
                elif ch == '"':
                    in_string = False
                continue
            if ch == '"':
                in_string = True
                continue
        if ch == "{":
            if depth == 0:
                start = i
            depth += 1
        elif ch == "}" and depth > 0:
            depth -= 1
            if depth == 0 and start is not None:
                objs.append(s[start:i + 1])
                start = None
    return objs
| 19 |
+
|
| 20 |
+
def evaluate_with_judges(conversation, selected_models, variant, *weights_and_temp, prompt_template):
    """Run each selected judge model over a conversation and collect scores.

    Structures the raw conversation via Claude, sends the same rubric
    prompt to every backend in ``selected_models``, parses the first JSON
    object out of each reply, applies per-metric weights, writes a combined
    results file under /tmp, and returns all artifacts.

    Args:
        conversation: Raw dialogue text; must be non-empty after strip.
        selected_models: Iterable of keys into BACKENDS.
        variant: NOTE(review): accepted but never used in this body —
            confirm whether the caller still needs it.
        *weights_and_temp: Gradio-style flat varargs — all elements except
            the last are per-metric weights; the last is the temperature.
        prompt_template: Template containing a "{CONVERSATION}" placeholder.

    Returns:
        Tuple of (metrics DataFrame, parsed-per-model map, tokens map,
        pros map, cons map, summary map, path of the saved JSON file).

    Raises:
        ValueError: On empty input, or when a model returns no/invalid JSON.
    """
    # Split the flat varargs: everything but the last element is a weight.
    weights, temperature = list(weights_and_temp[:-1]), weights_and_temp[-1]
    if not conversation.strip():
        raise ValueError("Conversation input is empty.")
    # Local import to avoid a circular import with src.conversation.
    from src.conversation import structure_conversation
    structured = structure_conversation(conversation)
    system_msg = (
        "You are Judge-Care-Lock, a rigorous evaluator of AI-therapist dialogues.\n"
        "1. Use ONLY the transcript—quote it for every decision.\n"
        "2. Apply the multi-layer rubric exactly; do NOT invent scales.\n"
        "3. Return valid JSON matching the schema; no extra text."
    )
    user_prompt = prompt_template.replace("{CONVERSATION}", structured)

    # Per-model accumulators returned to the UI.
    metrics_rows = []
    comments_map = {}
    tokens_map = {}
    pros_map = {}
    cons_map = {}
    summary_map = {}

    for model_name in selected_models:
        # Dispatch through the backend registry; KeyError here means the UI
        # offered a model that was never registered.
        fn = BACKENDS[model_name]
        raw, toks = fn(system_msg, user_prompt, temperature)
        tokens_map[model_name] = toks

        # Strip markdown code fences (``` or ```json) the models often add.
        clean = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw, flags=re.MULTILINE).strip()
        objs = split_json_objects(clean)
        if not objs:
            raise ValueError(f"No valid JSON from {model_name}:\n{clean}")
        try:
            # Only the first JSON object is used; any extras are ignored.
            parsed = json.loads(objs[0])
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON from {model_name}: {str(e)}")

        row = {"Model": model_name}
        total_score = 0.0
        # NOTE(review): weights[idx] assumes the judge's "metrics" dict is
        # returned in the same order as the weight sliders, and that the
        # counts match — a short weights list raises IndexError, a long one
        # is silently truncated.  TODO confirm against the prompt schema.
        for idx, (m, data) in enumerate(parsed["metrics"].items()):
            score = data.get("score", 0.0)
            row[m] = score
            total_score += score * weights[idx]
        row["Total"] = round(total_score, 2)
        metrics_rows.append(row)
        comments_map[model_name] = parsed
        pros_map[model_name] = parsed.get("positive", [])
        cons_map[model_name] = parsed.get("negative", [])
        summary_map[model_name] = parsed.get("summary", "")

    # Persist the full result bundle with a timestamped name; /tmp is the
    # writable scratch area on HF Spaces.
    ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"/tmp/carelock_eval_{ts}.json"
    combined = {
        "metrics_table": metrics_rows,
        "parsed_per_model": comments_map,
        "tokens_per_model": tokens_map,
        "pros_per_model": pros_map,
        "cons_per_model": cons_map,
        "summary_per_model": summary_map
    }
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(combined, f, indent=2)

    return (pd.DataFrame(metrics_rows), comments_map, tokens_map,
            pros_map, cons_map, summary_map, filename)
|