import streamlit as st
import json
import pandas as pd
import requests
import io
import time
from typing import Dict, List
import openai


class ConversationEvaluator:
    def __init__(self):
        self.openai_client = None
        self.hf_api_key = None
        self.hf_api_url = "https://router.huggingface.co/v1/chat/completions"
        self.metrics = [
            "empathy",
            "clarity",
            "therapeutic_alliance",
            "active_listening",
            "intervention_quality",
            "patient_engagement"
        ]

    def setup_openai(self, api_key: str):
        """Initialize OpenAI client"""
        try:
            # Instantiate a client object so the Responses API call in
            # evaluate_with_openai works with openai>=1.x
            self.openai_client = openai.OpenAI(api_key=api_key)
            return True
        except Exception as e:
            st.error(f"OpenAI setup failed: {str(e)}")
            return False

    def setup_huggingface(self, api_key: str):
        """Initialize Hugging Face API client"""
        try:
            self.hf_api_key = api_key
            # Test the API connection with the chat completions format
            headers = {
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json"
            }
            test_payload = {
                "messages": [
                    {
                        "role": "user",
                        "content": "Hello, this is a test message."
                    }
                ],
                "model": "deepseek-ai/DeepSeek-V3-0324",
                "stream": False
            }
            test_response = requests.post(
                self.hf_api_url,
                headers=headers,
                json=test_payload
            )
            if test_response.status_code == 200:
                return True
            else:
                st.error(
                    f"Hugging Face API test failed: "
                    f"{test_response.status_code} - {test_response.text}"
                )
                return False
        except Exception as e:
            st.error(f"Hugging Face API setup failed: {str(e)}")
            return False

    def parse_conversation(self, file_content: str, file_type: str) -> List[Dict]:
        """Parse conversation file into structured format"""
        utterances = []

        if file_type == "json":
            try:
                data = json.loads(file_content)
                if isinstance(data, list):
                    for i, item in enumerate(data):
                        utterances.append({
                            "speaker": item.get("speaker", "Unknown"),
                            "text": item.get("text", ""),
                            "timestamp": item.get("timestamp", i)
                        })
                else:
                    # Handle nested JSON structure (speaker -> list of messages)
                    for speaker, messages in data.items():
                        for i, message in enumerate(messages):
                            utterances.append({
                                "speaker": speaker,
                                "text": message,
                                "timestamp": i
                            })
            except json.JSONDecodeError:
                st.error("Invalid JSON format")
                return []

        elif file_type == "txt":
            lines = file_content.split('\n')
            for i, line in enumerate(lines):
                if line.strip():
                    # Simple parsing: assume format "Speaker: Text"
                    if ':' in line:
                        speaker, text = line.split(':', 1)
                        utterances.append({
                            "speaker": speaker.strip(),
                            "text": text.strip(),
                            "timestamp": i
                        })
                    else:
                        utterances.append({
                            "speaker": "Unknown",
                            "text": line.strip(),
                            "timestamp": i
                        })

        elif file_type == "csv":
            try:
                df = pd.read_csv(io.StringIO(file_content))
                for _, row in df.iterrows():
                    utterances.append({
                        "speaker": row.get("speaker", "Unknown"),
                        "text": row.get("text", ""),
                        "timestamp": row.get("timestamp", len(utterances))
                    })
            except Exception as e:
                st.error(f"CSV parsing error: {str(e)}")
                return []

        return utterances

    def evaluate_with_openai(self, utterance: str, speaker: str) -> Dict[str, float]:
        """Evaluate utterance using OpenAI"""
        if not self.openai_client:
            return {}

        # Build metrics list based on what's available
        metric_descriptions = {
            'empathy': 'Empathy (1-10): How empathetic and understanding is the response?',
            'clarity': 'Clarity (1-10): How clear and understandable is the communication?',
            'therapeutic_alliance': 'Therapeutic Alliance (1-10): How well does it build rapport and trust?',
            'active_listening': 'Active Listening (1-10): How well does it show engagement and attention?',
            'intervention_quality': 'Intervention Quality (1-10): How effective is the therapeutic technique?',
            'patient_engagement': 'Patient Engagement (1-10): How well does it encourage patient participation?'
        }

        # Filter metrics to only include selected ones
        metrics_to_evaluate = [m for m in self.metrics if m in metric_descriptions]
        if not metrics_to_evaluate:
            return {}

        # Build JSON template
        json_template = {m: "X" for m in metrics_to_evaluate}
        json_str_template = json.dumps(json_template).replace('"X"', 'X')

        prompt = f"""
Evaluate this {speaker} utterance on a scale of 1-10 for each metric:

Utterance: "{utterance}"

Provide scores for:
"""
        for metric in metrics_to_evaluate:
            prompt += f"- {metric_descriptions.get(metric, metric)}\n"
        prompt += f"\nRespond with only the scores in JSON format: {json_str_template}"

        try:
            response = self.openai_client.responses.create(
                model="gpt-4o-mini",
                input=prompt,
                temperature=0.3
            )
            result = response.output_text.strip()

            # Extract JSON from response
            if "{" in result and "}" in result:
                json_start = result.find("{")
                json_end = result.rfind("}") + 1
                json_str = result[json_start:json_end]
                scores = json.loads(json_str)
                # Filter to only return selected metrics
                return {k: v for k, v in scores.items() if k in metrics_to_evaluate}
        except Exception as e:
            st.warning(f"OpenAI evaluation failed: {str(e)}")

        return {}

    def evaluate_with_huggingface(self, utterance: str) -> Dict[str, float]:
        """Evaluate utterance using Hugging Face Chat Completions API"""
        if not self.hf_api_key:
            return {}

        # Build metrics list based on what's available
        metric_descriptions = {
            'empathy': 'Empathy: How empathetic and understanding is the response?',
            'clarity': 'Clarity: How clear and understandable is the communication?',
            'therapeutic_alliance': 'Therapeutic Alliance: How well does it build rapport and trust?',
            'active_listening': 'Active Listening: How well does it show engagement and attention?',
            'intervention_quality': 'Intervention Quality: How effective is the therapeutic technique?',
            'patient_engagement': 'Patient Engagement: How well does it encourage patient participation?'
        }

        # Filter metrics to only include selected ones
        metrics_to_evaluate = [m for m in self.metrics if m in metric_descriptions]
        if not metrics_to_evaluate:
            return {}

        try:
            headers = {
                "Authorization": f"Bearer {self.hf_api_key}",
                "Content-Type": "application/json"
            }

            # Build JSON template
            json_template = {m: "X" for m in metrics_to_evaluate}
            json_str_template = json.dumps(json_template).replace('"X"', 'X')

            # Create a prompt for therapeutic evaluation
            evaluation_prompt = f"""
Please evaluate this therapeutic utterance on a scale of 1-10 for each metric:

Utterance: "{utterance}"

Rate each of the following metrics from 1-10:
"""
            for metric in metrics_to_evaluate:
                evaluation_prompt += f"- {metric_descriptions.get(metric, metric)}\n"
            evaluation_prompt += f"\nRespond with only the scores in JSON format: {json_str_template}"

            payload = {
                "messages": [
                    {
                        "role": "user",
                        "content": evaluation_prompt
                    }
                ],
                "model": "deepseek-ai/DeepSeek-V3-0324",  # Using DeepSeek V3 model
                "stream": False,
                "temperature": 0.3
            }

            response = requests.post(
                self.hf_api_url,
                headers=headers,
                json=payload
            )

            if response.status_code == 200:
                result = response.json()
                content = result['choices'][0]['message']['content']

                # Extract JSON from response
                try:
                    if "{" in content and "}" in content:
                        json_start = content.find("{")
                        json_end = content.rfind("}") + 1
                        json_str = content[json_start:json_end]
                        scores = json.loads(json_str)
                        # Filter to only return selected metrics
                        return {k: v for k, v in scores.items() if k in metrics_to_evaluate}
                    else:
                        # Fallback: return default scores if no JSON was found
                        return {m: 5.0 for m in metrics_to_evaluate}
                except json.JSONDecodeError:
                    # Fallback scores if JSON parsing fails
                    return {m: 5.0 for m in metrics_to_evaluate}
            else:
                st.warning(f"Hugging Face API request failed: {response.status_code}")
                return {}
        except Exception as e:
            st.warning(f"Hugging Face API evaluation failed: {str(e)}")
            return {}

    def evaluate_conversation(self, utterances: List[Dict], use_openai: bool = True,
                              use_hf: bool = True) -> List[Dict]:
        """Evaluate entire conversation"""
        results = []
        progress_bar = st.progress(0)
        status_text = st.empty()

        for i, utterance in enumerate(utterances):
            status_text.text(f"Evaluating utterance {i+1}/{len(utterances)}")

            utterance_result = {
                "speaker": utterance["speaker"],
                "text": utterance["text"],
                "timestamp": utterance["timestamp"],
                "openai_scores": {},
                "huggingface_scores": {}
            }

            # OpenAI evaluation
            if use_openai and self.openai_client:
                utterance_result["openai_scores"] = self.evaluate_with_openai(
                    utterance["text"], utterance["speaker"]
                )

            # Hugging Face evaluation
            if use_hf and self.hf_api_key:
                utterance_result["huggingface_scores"] = self.evaluate_with_huggingface(
                    utterance["text"]
                )

            results.append(utterance_result)
            progress_bar.progress((i + 1) / len(utterances))
            time.sleep(0.1)  # Small delay for better UX

        status_text.text("Evaluation complete!")
        return results


# Helper functions
def create_radar_chart(scores: Dict[str, float], title: str):
    """Create radar chart for scores"""
    import plotly.graph_objects as go

    categories = list(scores.keys())
    values = list(scores.values())

    fig = go.Figure()
    fig.add_trace(go.Scatterpolar(
        r=values,
        theta=categories,
        fill='toself',
        name=title,
        line_color='blue'
    ))
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 10]
            )),
        showlegend=True,
        title=title,
        font_size=12
    )
    return fig


def display_utterance_results(results: List[Dict]):
    """Display utterance-level results"""
    st.subheader("Utterance-Level Results")

    for i, result in enumerate(results):
        with st.expander(f"Utterance {i+1}: {result['speaker']} (Timestamp: {result['timestamp']})"):
            st.write(f"**Text:** {result['text']}")

            col1, col2 = st.columns(2)

            with col1:
                st.write("**OpenAI Scores:**")
                if result['openai_scores']:
                    for metric, score in result['openai_scores'].items():
                        st.metric(metric.replace('_', ' ').title(), f"{score:.1f}/10")
                else:
                    st.write("No OpenAI scores available")

            with col2:
                st.write("**Hugging Face Scores:**")
                if result['huggingface_scores']:
                    for metric, score in result['huggingface_scores'].items():
                        st.metric(metric.replace('_', ' ').title(), f"{score:.1f}/10")
                else:
                    st.write("No Hugging Face scores available")