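"""evaluator.py: Streamlit-based evaluator for therapeutic conversations.

Parses a transcript (JSON, TXT, or CSV), scores each utterance on a set of
therapeutic-quality metrics via the OpenAI Responses API and/or the Hugging
Face router chat-completions API, and renders the results in Streamlit.
"""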
import streamlit as st
import json
import pandas as pd
import requests
import io
import time
from typing import Dict, List
import openai
class ConversationEvaluator:
def __init__(self):
self.openai_client = None
self.hf_api_key = None
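        # OpenAI-compatible chat-completions endpoint on the HF Inference Providers router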
self.hf_api_url = "https://router.huggingface.co/v1/chat/completions"
self.metrics = [
"empathy", "clarity", "therapeutic_alliance",
"active_listening", "intervention_quality", "patient_engagement"
]
    def setup_openai(self, api_key: str):
        """Initialize the OpenAI client (openai>=1.x client object, matching
        the Responses API call in evaluate_with_openai)."""
        try:
            self.openai_client = openai.OpenAI(api_key=api_key)
            return True
        except Exception as e:
            st.error(f"OpenAI setup failed: {str(e)}")
            return False
def setup_huggingface(self, api_key: str):
"""Initialize Hugging Face API client"""
try:
self.hf_api_key = api_key
# Test the API connection with new chat completions format
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json"
}
test_payload = {
"messages": [
{
"role": "user",
"content": "Hello, this is a test message."
}
],
"model": "deepseek-ai/DeepSeek-V3-0324",
"stream": False
}
            test_response = requests.post(
                self.hf_api_url,
                headers=headers,
                json=test_payload,
                timeout=30  # fail fast instead of hanging the Streamlit session
            )
if test_response.status_code == 200:
return True
else:
st.error(f"Hugging Face API test failed: {test_response.status_code} - {test_response.text}")
return False
except Exception as e:
st.error(f"Hugging Face API setup failed: {str(e)}")
return False
    def parse_conversation(self, file_content: str, file_type: str) -> List[Dict]:
        """Parse a conversation file into a list of utterance dicts.

        Supported formats:
          - json: a list of {"speaker", "text", "timestamp"} objects, or a
            {speaker: [messages, ...]} mapping
          - txt:  one utterance per line, ideally "Speaker: Text"
          - csv:  columns "speaker", "text", and optionally "timestamp"
        """
utterances = []
if file_type == "json":
try:
data = json.loads(file_content)
if isinstance(data, list):
for i, item in enumerate(data):
utterances.append({
"speaker": item.get("speaker", "Unknown"),
"text": item.get("text", ""),
"timestamp": item.get("timestamp", i)
})
else:
# Handle nested JSON structure
for speaker, messages in data.items():
for i, message in enumerate(messages):
utterances.append({
"speaker": speaker,
"text": message,
"timestamp": i
})
except json.JSONDecodeError:
st.error("Invalid JSON format")
return []
elif file_type == "txt":
lines = file_content.split('\n')
for i, line in enumerate(lines):
if line.strip():
# Simple parsing: assume format "Speaker: Text"
if ':' in line:
speaker, text = line.split(':', 1)
utterances.append({
"speaker": speaker.strip(),
"text": text.strip(),
"timestamp": i
})
else:
utterances.append({
"speaker": "Unknown",
"text": line.strip(),
"timestamp": i
})
elif file_type == "csv":
try:
df = pd.read_csv(io.StringIO(file_content))
for _, row in df.iterrows():
utterances.append({
"speaker": row.get("speaker", "Unknown"),
"text": row.get("text", ""),
"timestamp": row.get("timestamp", len(utterances))
})
except Exception as e:
st.error(f"CSV parsing error: {str(e)}")
return []
return utterances
def evaluate_with_openai(self, utterance: str, speaker: str) -> Dict[str, float]:
"""Evaluate utterance using OpenAI"""
if not self.openai_client:
return {}
# Build metrics list based on what's available
metric_descriptions = {
'empathy': 'Empathy (1-10): How empathetic and understanding is the response?',
'clarity': 'Clarity (1-10): How clear and understandable is the communication?',
'therapeutic_alliance': 'Therapeutic Alliance (1-10): How well does it build rapport and trust?',
'active_listening': 'Active Listening (1-10): How well does it show engagement and attention?',
'intervention_quality': 'Intervention Quality (1-10): How effective is the therapeutic technique?',
'patient_engagement': 'Patient Engagement (1-10): How well does it encourage patient participation?'
}
# Filter metrics to only include selected ones
metrics_to_evaluate = [m for m in self.metrics if m in metric_descriptions]
if not metrics_to_evaluate:
return {}
# Build JSON template
json_template = {m: "X" for m in metrics_to_evaluate}
json_str_template = json.dumps(json_template).replace('"X"', 'X')
prompt = f"""
Evaluate this {speaker} utterance on a scale of 1-10 for each metric:
Utterance: "{utterance}"
Provide scores for:
"""
for metric in metrics_to_evaluate:
prompt += f"- {metric_descriptions.get(metric, metric)}\n"
prompt += f"\nRespond with only the scores in JSON format: {json_str_template}"
try:
response = self.openai_client.responses.create(
model="gpt-4o-mini",
input=prompt,
temperature=0.3
)
result = response.output_text.strip()
# Extract JSON from response
if "{" in result and "}" in result:
json_start = result.find("{")
json_end = result.rfind("}") + 1
json_str = result[json_start:json_end]
scores = json.loads(json_str)
# Filter to only return selected metrics
return {k: v for k, v in scores.items() if k in metrics_to_evaluate}
except Exception as e:
st.warning(f"OpenAI evaluation failed: {str(e)}")
return {}
def evaluate_with_huggingface(self, utterance: str) -> Dict[str, float]:
"""Evaluate utterance using Hugging Face Chat Completions API"""
if not self.hf_api_key:
return {}
# Build metrics list based on what's available
metric_descriptions = {
'empathy': 'Empathy: How empathetic and understanding is the response?',
'clarity': 'Clarity: How clear and understandable is the communication?',
'therapeutic_alliance': 'Therapeutic Alliance: How well does it build rapport and trust?',
'active_listening': 'Active Listening: How well does it show engagement and attention?',
'intervention_quality': 'Intervention Quality: How effective is the therapeutic technique?',
'patient_engagement': 'Patient Engagement: How well does it encourage patient participation?'
}
# Filter metrics to only include selected ones
metrics_to_evaluate = [m for m in self.metrics if m in metric_descriptions]
if not metrics_to_evaluate:
return {}
try:
headers = {
"Authorization": f"Bearer {self.hf_api_key}",
"Content-Type": "application/json"
}
# Build JSON template
json_template = {m: "X" for m in metrics_to_evaluate}
json_str_template = json.dumps(json_template).replace('"X"', 'X')
# Create a prompt for therapeutic evaluation
evaluation_prompt = f"""
Please evaluate this therapeutic utterance on a scale of 1-10 for each metric:
Utterance: "{utterance}"
Rate each of the following metrics from 1-10:
"""
for metric in metrics_to_evaluate:
evaluation_prompt += f"- {metric_descriptions.get(metric, metric)}\n"
evaluation_prompt += f"\nRespond with only the scores in JSON format: {json_str_template}"
payload = {
"messages": [
{
"role": "user",
"content": evaluation_prompt
}
],
"model": "deepseek-ai/DeepSeek-V3-0324", # Using DeepSeek V3 model
"stream": False,
"temperature": 0.3
}
            response = requests.post(
                self.hf_api_url,
                headers=headers,
                json=payload,
                timeout=60  # model inference can be slow; fail instead of hanging
            )
if response.status_code == 200:
result = response.json()
content = result['choices'][0]['message']['content']
# Extract JSON from response
try:
if "{" in content and "}" in content:
json_start = content.find("{")
json_end = content.rfind("}") + 1
json_str = content[json_start:json_end]
scores = json.loads(json_str)
# Filter to only return selected metrics
return {k: v for k, v in scores.items() if k in metrics_to_evaluate}
else:
# Fallback: return default scores if JSON parsing fails
return {m: 5.0 for m in metrics_to_evaluate}
except json.JSONDecodeError:
# Fallback scores if JSON parsing fails
return {m: 5.0 for m in metrics_to_evaluate}
else:
st.warning(f"Hugging Face API request failed: {response.status_code}")
return {}
except Exception as e:
st.warning(f"Hugging Face API evaluation failed: {str(e)}")
return {}
    def evaluate_conversation(self, utterances: List[Dict], use_openai: bool = True, use_hf: bool = True) -> List[Dict]:
        """Evaluate an entire conversation, one utterance at a time.

        Returns one dict per utterance: {"speaker", "text", "timestamp",
        "openai_scores", "huggingface_scores"}, where each score dict maps a
        metric name to a 1-10 rating (empty if that backend is disabled).
        """
results = []
progress_bar = st.progress(0)
status_text = st.empty()
for i, utterance in enumerate(utterances):
status_text.text(f"Evaluating utterance {i+1}/{len(utterances)}")
utterance_result = {
"speaker": utterance["speaker"],
"text": utterance["text"],
"timestamp": utterance["timestamp"],
"openai_scores": {},
"huggingface_scores": {}
}
# OpenAI evaluation
if use_openai and self.openai_client:
utterance_result["openai_scores"] = self.evaluate_with_openai(
utterance["text"], utterance["speaker"]
)
# Hugging Face evaluation
if use_hf and self.hf_api_key:
utterance_result["huggingface_scores"] = self.evaluate_with_huggingface(
utterance["text"]
)
results.append(utterance_result)
progress_bar.progress((i + 1) / len(utterances))
time.sleep(0.1) # Small delay for better UX
status_text.text("Evaluation complete!")
return results
# Helper functions
def create_radar_chart(scores: Dict[str, float], title: str):
"""Create radar chart for scores"""
    import plotly.graph_objects as go  # imported at point of use so the rest of the module works without plotly
categories = list(scores.keys())
values = list(scores.values())
fig = go.Figure()
fig.add_trace(go.Scatterpolar(
r=values,
theta=categories,
fill='toself',
name=title,
line_color='blue'
))
fig.update_layout(
polar=dict(
radialaxis=dict(
visible=True,
range=[0, 10]
)),
showlegend=True,
title=title,
font_size=12
)
return fig
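

# A small aggregation sketch (an addition, not referenced elsewhere in this
# file): computes the per-metric mean across utterances for one backend,
# producing the Dict[str, float] shape that create_radar_chart expects. The
# keys "openai_scores" / "huggingface_scores" match evaluate_conversation's
# output.
def average_scores(results: List[Dict], score_key: str) -> Dict[str, float]:
    """Average each metric across all utterances for one backend's scores."""
    score_dicts = [r[score_key] for r in results if r.get(score_key)]
    if not score_dicts:
        return {}
    # DataFrame aligns metrics by column name; the mean is computed per metric
    return pd.DataFrame(score_dicts).mean(numeric_only=True).to_dict()

# Example call site (hypothetical):
#   st.plotly_chart(create_radar_chart(average_scores(results, "openai_scores"),
#                                      "OpenAI Average Scores"))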
def display_utterance_results(results: List[Dict]):
"""Display utterance-level results"""
st.subheader("Utterance-Level Results")
for i, result in enumerate(results):
with st.expander(f"Utterance {i+1}: {result['speaker']} (Timestamp: {result['timestamp']})"):
st.write(f"**Text:** {result['text']}")
col1, col2 = st.columns(2)
with col1:
st.write("**OpenAI Scores:**")
if result['openai_scores']:
for metric, score in result['openai_scores'].items():
st.metric(metric.replace('_', ' ').title(), f"{score:.1f}/10")
else:
st.write("No OpenAI scores available")
with col2:
st.write("**Hugging Face Scores:**")
if result['huggingface_scores']:
for metric, score in result['huggingface_scores'].items():
st.metric(metric.replace('_', ' ').title(), f"{score:.1f}/10")
else:
st.write("No Hugging Face scores available")