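"""evaluator.py: Streamlit-based evaluator for therapeutic conversations.

Parses a transcript (JSON, TXT, or CSV), scores each utterance on a set of
therapeutic-quality metrics via the OpenAI Responses API and/or the Hugging
Face router chat-completions API, and renders the results in Streamlit.
"""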
import streamlit as st
import json
import pandas as pd
import requests
import io
import time
from typing import Dict, List
import openai
class ConversationEvaluator:
def __init__(self):
self.openai_client = None
self.hf_api_key = None
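        # OpenAI-compatible chat-completions endpoint on the HF Inference Providers router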
self.hf_api_url = "https://router.huggingface.co/v1/chat/completions"
self.metrics = [
"empathy", "clarity", "therapeutic_alliance",
"active_listening", "intervention_quality", "patient_engagement"
]
    def setup_openai(self, api_key: str):
        """Initialize the OpenAI client (openai>=1.x client object, matching
        the Responses API call in evaluate_with_openai)."""
        try:
            self.openai_client = openai.OpenAI(api_key=api_key)
            return True
        except Exception as e:
            st.error(f"OpenAI setup failed: {str(e)}")
            return False
def setup_huggingface(self, api_key: str):
"""Initialize Hugging Face API client"""
try:
self.hf_api_key = api_key
# Test the API connection with new chat completions format
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json"
}
test_payload = {
"messages": [
{
"role": "user",
"content": "Hello, this is a test message."
}
],
"model": "deepseek-ai/DeepSeek-V3-0324",
"stream": False
}
            test_response = requests.post(
                self.hf_api_url,
                headers=headers,
                json=test_payload,
                timeout=30  # fail fast instead of hanging the Streamlit session
            )
if test_response.status_code == 200:
return True
else:
st.error(f"Hugging Face API test failed: {test_response.status_code} - {test_response.text}")
return False
except Exception as e:
st.error(f"Hugging Face API setup failed: {str(e)}")
return False
    def parse_conversation(self, file_content: str, file_type: str) -> List[Dict]:
        """Parse a conversation file into a list of utterance dicts.

        Supported formats:
          - json: a list of {"speaker", "text", "timestamp"} objects, or a
            {speaker: [messages, ...]} mapping
          - txt:  one utterance per line, ideally "Speaker: Text"
          - csv:  columns "speaker", "text", and optionally "timestamp"
        """
utterances = []
if file_type == "json":
try:
data = json.loads(file_content)
if isinstance(data, list):
for i, item in enumerate(data):
utterances.append({
"speaker": item.get("speaker", "Unknown"),
"text": item.get("text", ""),
"timestamp": item.get("timestamp", i)
})
else:
# Handle nested JSON structure
for speaker, messages in data.items():
for i, message in enumerate(messages):
utterances.append({
"speaker": speaker,
"text": message,
"timestamp": i
})
except json.JSONDecodeError:
st.error("Invalid JSON format")
return []
elif file_type == "txt":
lines = file_content.split('\n')
for i, line in enumerate(lines):
if line.strip():
# Simple parsing: assume format "Speaker: Text"
if ':' in line:
speaker, text = line.split(':', 1)
utterances.append({
"speaker": speaker.strip(),
"text": text.strip(),
"timestamp": i
})
else:
utterances.append({
"speaker": "Unknown",
"text": line.strip(),
"timestamp": i
})
elif file_type == "csv":
try:
df = pd.read_csv(io.StringIO(file_content))
for _, row in df.iterrows():
utterances.append({
"speaker": row.get("speaker", "Unknown"),
"text": row.get("text", ""),
"timestamp": row.get("timestamp", len(utterances))
})
except Exception as e:
st.error(f"CSV parsing error: {str(e)}")
return []
return utterances
def evaluate_with_openai(self, utterance: str, speaker: str) -> Dict[str, float]:
"""Evaluate utterance using OpenAI"""
if not self.openai_client:
return {}
# Build metrics list based on what's available
metric_descriptions = {
'empathy': 'Empathy (1-10): How empathetic and understanding is the response?',
'clarity': 'Clarity (1-10): How clear and understandable is the communication?',
'therapeutic_alliance': 'Therapeutic Alliance (1-10): How well does it build rapport and trust?',
'active_listening': 'Active Listening (1-10): How well does it show engagement and attention?',
'intervention_quality': 'Intervention Quality (1-10): How effective is the therapeutic technique?',
'patient_engagement': 'Patient Engagement (1-10): How well does it encourage patient participation?'
}
# Filter metrics to only include selected ones
metrics_to_evaluate = [m for m in self.metrics if m in metric_descriptions]
if not metrics_to_evaluate:
return {}
# Build JSON template
json_template = {m: "X" for m in metrics_to_evaluate}
json_str_template = json.dumps(json_template).replace('"X"', 'X')
prompt = f"""
Evaluate this {speaker} utterance on a scale of 1-10 for each metric:
Utterance: "{utterance}"
Provide scores for:
"""
for metric in metrics_to_evaluate:
prompt += f"- {metric_descriptions.get(metric, metric)}\n"
prompt += f"\nRespond with only the scores in JSON format: {json_str_template}"
try:
response = self.openai_client.responses.create(
model="gpt-4o-mini",
input=prompt,
temperature=0.3
)
result = response.output_text.strip()
# Extract JSON from response
if "{" in result and "}" in result:
json_start = result.find("{")
json_end = result.rfind("}") + 1
json_str = result[json_start:json_end]
scores = json.loads(json_str)
# Filter to only return selected metrics
return {k: v for k, v in scores.items() if k in metrics_to_evaluate}
except Exception as e:
st.warning(f"OpenAI evaluation failed: {str(e)}")
return {}
def evaluate_with_huggingface(self, utterance: str) -> Dict[str, float]:
"""Evaluate utterance using Hugging Face Chat Completions API"""
if not self.hf_api_key:
return {}
# Build metrics list based on what's available
metric_descriptions = {
'empathy': 'Empathy: How empathetic and understanding is the response?',
'clarity': 'Clarity: How clear and understandable is the communication?',
'therapeutic_alliance': 'Therapeutic Alliance: How well does it build rapport and trust?',
'active_listening': 'Active Listening: How well does it show engagement and attention?',
'intervention_quality': 'Intervention Quality: How effective is the therapeutic technique?',
'patient_engagement': 'Patient Engagement: How well does it encourage patient participation?'
}
# Filter metrics to only include selected ones
metrics_to_evaluate = [m for m in self.metrics if m in metric_descriptions]
if not metrics_to_evaluate:
return {}
try:
headers = {
"Authorization": f"Bearer {self.hf_api_key}",
"Content-Type": "application/json"
}
# Build JSON template
json_template = {m: "X" for m in metrics_to_evaluate}
json_str_template = json.dumps(json_template).replace('"X"', 'X')
# Create a prompt for therapeutic evaluation
evaluation_prompt = f"""
Please evaluate this therapeutic utterance on a scale of 1-10 for each metric:
Utterance: "{utterance}"
Rate each of the following metrics from 1-10:
"""
for metric in metrics_to_evaluate:
evaluation_prompt += f"- {metric_descriptions.get(metric, metric)}\n"
evaluation_prompt += f"\nRespond with only the scores in JSON format: {json_str_template}"
payload = {
"messages": [
{
"role": "user",
"content": evaluation_prompt
}
],
"model": "deepseek-ai/DeepSeek-V3-0324", # Using DeepSeek V3 model
"stream": False,
"temperature": 0.3
}
            response = requests.post(
                self.hf_api_url,
                headers=headers,
                json=payload,
                timeout=60  # model inference can be slow; fail instead of hanging
            )
if response.status_code == 200:
result = response.json()
content = result['choices'][0]['message']['content']
# Extract JSON from response
try:
if "{" in content and "}" in content:
json_start = content.find("{")
json_end = content.rfind("}") + 1
json_str = content[json_start:json_end]
scores = json.loads(json_str)
# Filter to only return selected metrics
return {k: v for k, v in scores.items() if k in metrics_to_evaluate}
else:
# Fallback: return default scores if JSON parsing fails
return {m: 5.0 for m in metrics_to_evaluate}
except json.JSONDecodeError:
# Fallback scores if JSON parsing fails
return {m: 5.0 for m in metrics_to_evaluate}
else:
st.warning(f"Hugging Face API request failed: {response.status_code}")
return {}
except Exception as e:
st.warning(f"Hugging Face API evaluation failed: {str(e)}")
return {}
    def evaluate_conversation(self, utterances: List[Dict], use_openai: bool = True, use_hf: bool = True) -> List[Dict]:
        """Evaluate an entire conversation, one utterance at a time.

        Returns one dict per utterance: {"speaker", "text", "timestamp",
        "openai_scores", "huggingface_scores"}, where each score dict maps a
        metric name to a 1-10 rating (empty if that backend is disabled).
        """
results = []
progress_bar = st.progress(0)
status_text = st.empty()
for i, utterance in enumerate(utterances):
status_text.text(f"Evaluating utterance {i+1}/{len(utterances)}")
utterance_result = {
"speaker": utterance["speaker"],
"text": utterance["text"],
"timestamp": utterance["timestamp"],
"openai_scores": {},
"huggingface_scores": {}
}
# OpenAI evaluation
if use_openai and self.openai_client:
utterance_result["openai_scores"] = self.evaluate_with_openai(
utterance["text"], utterance["speaker"]
)
# Hugging Face evaluation
if use_hf and self.hf_api_key:
utterance_result["huggingface_scores"] = self.evaluate_with_huggingface(
utterance["text"]
)
results.append(utterance_result)
progress_bar.progress((i + 1) / len(utterances))
time.sleep(0.1) # Small delay for better UX
status_text.text("Evaluation complete!")
return results
# Helper functions
def create_radar_chart(scores: Dict[str, float], title: str):
"""Create radar chart for scores"""
    import plotly.graph_objects as go  # imported at point of use so the rest of the module works without plotly
categories = list(scores.keys())
values = list(scores.values())
fig = go.Figure()
fig.add_trace(go.Scatterpolar(
r=values,
theta=categories,
fill='toself',
name=title,
line_color='blue'
))
fig.update_layout(
polar=dict(
radialaxis=dict(
visible=True,
range=[0, 10]
)),
showlegend=True,
title=title,
font_size=12
)
return fig
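

# A small aggregation sketch (an addition, not referenced elsewhere in this
# file): computes the per-metric mean across utterances for one backend,
# producing the Dict[str, float] shape that create_radar_chart expects. The
# keys "openai_scores" / "huggingface_scores" match evaluate_conversation's
# output.
def average_scores(results: List[Dict], score_key: str) -> Dict[str, float]:
    """Average each metric across all utterances for one backend's scores."""
    score_dicts = [r[score_key] for r in results if r.get(score_key)]
    if not score_dicts:
        return {}
    # DataFrame aligns metrics by column name; the mean is computed per metric
    return pd.DataFrame(score_dicts).mean(numeric_only=True).to_dict()

# Example call site (hypothetical):
#   st.plotly_chart(create_radar_chart(average_scores(results, "openai_scores"),
#                                      "OpenAI Average Scores"))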
def display_utterance_results(results: List[Dict]):
"""Display utterance-level results"""
st.subheader("Utterance-Level Results")
for i, result in enumerate(results):
with st.expander(f"Utterance {i+1}: {result['speaker']} (Timestamp: {result['timestamp']})"):
st.write(f"**Text:** {result['text']}")
col1, col2 = st.columns(2)
with col1:
st.write("**OpenAI Scores:**")
if result['openai_scores']:
for metric, score in result['openai_scores'].items():
st.metric(metric.replace('_', ' ').title(), f"{score:.1f}/10")
else:
st.write("No OpenAI scores available")
with col2:
st.write("**Hugging Face Scores:**")
if result['huggingface_scores']:
for metric, score in result['huggingface_scores'].items():
st.metric(metric.replace('_', ' ').title(), f"{score:.1f}/10")
else:
st.write("No Hugging Face scores available")