import streamlit as st
import json
import pandas as pd
import requests
import io
import time
from typing import Dict, List
import openai

class ConversationEvaluator:
    def __init__(self):
        self.openai_client = None
        self.hf_api_key = None
        self.hf_api_url = "https://router.huggingface.co/v1/chat/completions"
        self.metrics = [
            "empathy", "clarity", "therapeutic_alliance", 
            "active_listening", "intervention_quality", "patient_engagement"
        ]
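        # Presumably replaced by the UI with the user's selected metric subset;
        # the evaluators below only score metrics present in this list.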

    def setup_openai(self, api_key: str):
        """Initialize the OpenAI client (openai>=1.x)"""
        try:
            # Instantiate a client rather than setting the legacy module-level
            # api_key; the Responses API used below is the 1.x-style interface.
            self.openai_client = openai.OpenAI(api_key=api_key)
            return True
        except Exception as e:
            st.error(f"OpenAI setup failed: {str(e)}")
            return False

    def setup_huggingface(self, api_key: str):
        """Initialize Hugging Face API client"""
        try:
            self.hf_api_key = api_key
            # Test the API connection with new chat completions format
            headers = {
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json"
            }
            test_payload = {
                "messages": [
                    {
                        "role": "user",
                        "content": "Hello, this is a test message."
                    }
                ],
                "model": "deepseek-ai/DeepSeek-V3-0324",
                "stream": False
            }
            test_response = requests.post(
                self.hf_api_url,
                headers=headers,
                json=test_payload,
                timeout=30  # avoid hanging the Streamlit app on a stalled request
            )
            if test_response.status_code == 200:
                return True
            else:
                st.error(f"Hugging Face API test failed: {test_response.status_code} - {test_response.text}")
                return False
        except Exception as e:
            st.error(f"Hugging Face API setup failed: {str(e)}")
            return False

    def parse_conversation(self, file_content: str, file_type: str) -> List[Dict]:
        """Parse conversation file into structured format"""
        utterances = []

        if file_type == "json":
            try:
                data = json.loads(file_content)
                if isinstance(data, list):
                    for i, item in enumerate(data):
                        utterances.append({
                            "speaker": item.get("speaker", "Unknown"),
                            "text": item.get("text", ""),
                            "timestamp": item.get("timestamp", i)
                        })
                else:
                    # Handle nested JSON structure
                    for speaker, messages in data.items():
                        for i, message in enumerate(messages):
                            utterances.append({
                                "speaker": speaker,
                                "text": message,
                                "timestamp": i
                            })
            except json.JSONDecodeError:
                st.error("Invalid JSON format")
                return []

        elif file_type == "txt":
            lines = file_content.split('\n')
            for i, line in enumerate(lines):
                if line.strip():
                    # Simple parsing: assume format "Speaker: Text"
                    if ':' in line:
                        speaker, text = line.split(':', 1)
                        utterances.append({
                            "speaker": speaker.strip(),
                            "text": text.strip(),
                            "timestamp": i
                        })
                    else:
                        utterances.append({
                            "speaker": "Unknown",
                            "text": line.strip(),
                            "timestamp": i
                        })

        elif file_type == "csv":
            try:
                df = pd.read_csv(io.StringIO(file_content))
                for _, row in df.iterrows():
                    utterances.append({
                        "speaker": row.get("speaker", "Unknown"),
                        "text": row.get("text", ""),
                        "timestamp": row.get("timestamp", len(utterances))
                    })
            except Exception as e:
                st.error(f"CSV parsing error: {str(e)}")
                return []

        return utterances

    def evaluate_with_openai(self, utterance: str, speaker: str) -> Dict[str, float]:
        """Evaluate utterance using OpenAI"""
        if not self.openai_client:
            return {}

        # Build metrics list based on what's available
        metric_descriptions = {
            'empathy': 'Empathy (1-10): How empathetic and understanding is the response?',
            'clarity': 'Clarity (1-10): How clear and understandable is the communication?',
            'therapeutic_alliance': 'Therapeutic Alliance (1-10): How well does it build rapport and trust?',
            'active_listening': 'Active Listening (1-10): How well does it show engagement and attention?',
            'intervention_quality': 'Intervention Quality (1-10): How effective is the therapeutic technique?',
            'patient_engagement': 'Patient Engagement (1-10): How well does it encourage patient participation?'
        }
        
        # Filter metrics to only include selected ones
        metrics_to_evaluate = [m for m in self.metrics if m in metric_descriptions]
        
        if not metrics_to_evaluate:
            return {}
        
        # Build a JSON answer template such as {"empathy": X, "clarity": X};
        # unquoting the placeholder tells the model to fill in bare numbers.
        json_template = {m: "X" for m in metrics_to_evaluate}
        json_str_template = json.dumps(json_template).replace('"X"', 'X')
        
        prompt = f"""
        Evaluate this {speaker} utterance on a scale of 1-10 for each metric:
        Utterance: "{utterance}"
        
        Provide scores for:
        """
        
        for metric in metrics_to_evaluate:
            prompt += f"- {metric_descriptions.get(metric, metric)}\n"
        
        prompt += f"\nRespond with only the scores in JSON format: {json_str_template}"
        
        try:
            response = self.openai_client.responses.create(
                model="gpt-4o-mini",
                input=prompt,
                temperature=0.3
            )

            result = response.output_text.strip()
            # Extract JSON from response
            if "{" in result and "}" in result:
                json_start = result.find("{")
                json_end = result.rfind("}") + 1
                json_str = result[json_start:json_end]
                scores = json.loads(json_str)
                # Coerce to float and keep only the selected metrics
                return {k: float(v) for k, v in scores.items() if k in metrics_to_evaluate}
        except Exception as e:
            st.warning(f"OpenAI evaluation failed: {str(e)}")

        return {}

    def evaluate_with_huggingface(self, utterance: str) -> Dict[str, float]:
        """Evaluate utterance using Hugging Face Chat Completions API"""
        if not self.hf_api_key:
            return {}

        # Build metrics list based on what's available
        metric_descriptions = {
            'empathy': 'Empathy: How empathetic and understanding is the response?',
            'clarity': 'Clarity: How clear and understandable is the communication?',
            'therapeutic_alliance': 'Therapeutic Alliance: How well does it build rapport and trust?',
            'active_listening': 'Active Listening: How well does it show engagement and attention?',
            'intervention_quality': 'Intervention Quality: How effective is the therapeutic technique?',
            'patient_engagement': 'Patient Engagement: How well does it encourage patient participation?'
        }
        
        # Filter metrics to only include selected ones
        metrics_to_evaluate = [m for m in self.metrics if m in metric_descriptions]
        
        if not metrics_to_evaluate:
            return {}

        try:
            headers = {
                "Authorization": f"Bearer {self.hf_api_key}",
                "Content-Type": "application/json"
            }
            
            # Build the same JSON answer template as in the OpenAI path
            json_template = {m: "X" for m in metrics_to_evaluate}
            json_str_template = json.dumps(json_template).replace('"X"', 'X')
            
            # Create a prompt for therapeutic evaluation
            evaluation_prompt = f"""
            Please evaluate this therapeutic utterance on a scale of 1-10 for each metric:
            
            Utterance: "{utterance}"
            
            Rate each of the following metrics from 1-10:
            """
            
            for metric in metrics_to_evaluate:
                evaluation_prompt += f"- {metric_descriptions.get(metric, metric)}\n"
            
            evaluation_prompt += f"\nRespond with only the scores in JSON format: {json_str_template}"
            
            payload = {
                "messages": [
                    {
                        "role": "user",
                        "content": evaluation_prompt
                    }
                ],
                "model": "deepseek-ai/DeepSeek-V3-0324",  # DeepSeek V3 via the HF router
                "stream": False,
                "temperature": 0.3
            }

            response = requests.post(
                self.hf_api_url,
                headers=headers,
                json=payload,
                timeout=60  # model inference can be slow; don't wait indefinitely
            )
            
            if response.status_code == 200:
                result = response.json()
                content = result['choices'][0]['message']['content']
                
                # Extract JSON from response
                try:
                    if "{" in content and "}" in content:
                        json_start = content.find("{")
                        json_end = content.rfind("}") + 1
                        json_str = content[json_start:json_end]
                        scores = json.loads(json_str)
                        # Coerce to float and keep only the selected metrics
                        return {k: float(v) for k, v in scores.items() if k in metrics_to_evaluate}
                    else:
                        # Fallback: return default scores if JSON parsing fails
                        return {m: 5.0 for m in metrics_to_evaluate}
                except json.JSONDecodeError:
                    # Fallback scores if JSON parsing fails
                    return {m: 5.0 for m in metrics_to_evaluate}
            else:
                st.warning(f"Hugging Face API request failed: {response.status_code}")
                return {}
        except Exception as e:
            st.warning(f"Hugging Face API evaluation failed: {str(e)}")
            return {}

    def evaluate_conversation(self, utterances: List[Dict], use_openai: bool = True, use_hf: bool = True) -> List[Dict]:
        """Evaluate entire conversation"""
        results = []

        progress_bar = st.progress(0)
        status_text = st.empty()

        for i, utterance in enumerate(utterances):
            status_text.text(f"Evaluating utterance {i+1}/{len(utterances)}")

            utterance_result = {
                "speaker": utterance["speaker"],
                "text": utterance["text"],
                "timestamp": utterance["timestamp"],
                "openai_scores": {},
                "huggingface_scores": {}
            }

            # OpenAI evaluation
            if use_openai and self.openai_client:
                utterance_result["openai_scores"] = self.evaluate_with_openai(
                    utterance["text"], utterance["speaker"]
                )

            # Hugging Face evaluation
            if use_hf and self.hf_api_key:
                utterance_result["huggingface_scores"] = self.evaluate_with_huggingface(
                    utterance["text"]
                )

            results.append(utterance_result)
            progress_bar.progress((i + 1) / len(utterances))
            time.sleep(0.1)  # Small delay for better UX

        status_text.text("Evaluation complete!")
        return results


# Helper functions
def create_radar_chart(scores: Dict[str, float], title: str):
    """Create radar chart for scores"""
    import plotly.graph_objects as go
    
    categories = list(scores.keys())
    values = list(scores.values())

    fig = go.Figure()

    fig.add_trace(go.Scatterpolar(
        r=values,
        theta=categories,
        fill='toself',
        name=title,
        line_color='blue'
    ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 10]
            )),
        showlegend=True,
        title=title,
        font_size=12
    )

    return fig

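# Usage sketch (hypothetical caller; assumes avg_scores is a Dict[str, float]
# of metric -> mean score aggregated from the evaluation results):
#
#   st.plotly_chart(create_radar_chart(avg_scores, "Average OpenAI Scores"))
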
def display_utterance_results(results: List[Dict]):
    """Display utterance-level results"""
    st.subheader("Utterance-Level Results")

    for i, result in enumerate(results):
        with st.expander(f"Utterance {i+1}: {result['speaker']} (Timestamp: {result['timestamp']})"):
            st.write(f"**Text:** {result['text']}")

            col1, col2 = st.columns(2)

            with col1:
                st.write("**OpenAI Scores:**")
                if result['openai_scores']:
                    for metric, score in result['openai_scores'].items():
                        st.metric(metric.replace('_', ' ').title(), f"{score:.1f}/10")
                else:
                    st.write("No OpenAI scores available")

            with col2:
                st.write("**Hugging Face Scores:**")
                if result['huggingface_scores']:
                    for metric, score in result['huggingface_scores'].items():
                        st.metric(metric.replace('_', ' ').title(), f"{score:.1f}/10")
                else:
                    st.write("No Hugging Face scores available")