| """ | |
| Test script for ToxicityEvaluator | |
| Usage: | |
| python -m pytest tests/test_evaluators/test_toxicity_evaluator.py | |
| python tests/test_evaluators/test_toxicity_evaluator.py # Direct execution | |
| """ | |
| import logging | |
| import sys | |
| import os | |
| # Add parent directory to path for imports | |
| sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) | |
| from custom_types import Utterance, EvaluationResult | |
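
# Assumed shapes, inferred from how this script consumes them (the
# authoritative definitions live in custom_types):
#   Utterance        ~ {"speaker": str, "text": str}
#   EvaluationResult ~ {"granularity": str,
#                       "per_utterance": list of {"metrics": {"toxicity": {...}}}}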

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)


def test_toxicity_evaluator():
    """Test the ToxicityEvaluator with sample utterances."""
    print("=" * 80)
    print("ToxicityEvaluator Test")
    print("=" * 80)
    print()

    # Test conversation with varying toxicity levels
    test_conversation: list[Utterance] = [
        # Safe utterances
        {"speaker": "Patient", "text": "I've been feeling really anxious lately."},
        {"speaker": "Therapist", "text": "I understand. Can you tell me more about what's causing your anxiety?"},
        {"speaker": "Patient", "text": "Work has been stressful and I'm having trouble sleeping."},
        # Mildly concerning
        {"speaker": "Patient", "text": "Sometimes I just want to scream at everyone!"},
        # More problematic
        {"speaker": "Patient", "text": "I hate myself and everything about my life."},
        # Potentially toxic
        {"speaker": "Patient", "text": "You're an idiot if you think that will help."},
        # Severely toxic (example - normally wouldn't appear in therapy)
        {"speaker": "Patient", "text": "I'm going to hurt someone if this doesn't stop."},
    ]

    # Initialize evaluator
    print("Initializing ToxicityEvaluator...")
    try:
        from evaluators.impl.toxicity_evaluator import ToxicityEvaluator
        evaluator = ToxicityEvaluator(model_type="unbiased", device="cpu")
        print("✓ Evaluator initialized\n")
    except ImportError as e:
        print(f"✗ Failed to import: {e}")
        print("\nPlease install detoxify: pip install detoxify")
        return
    except Exception as e:
        print(f"✗ Failed to initialize evaluator: {e}")
        import traceback
        traceback.print_exc()
        return

    # Test evaluation
    print(f"Testing conversation with {len(test_conversation)} utterances...")
    print("-" * 80)
    try:
        # Call the evaluator with the full conversation
        result: EvaluationResult = evaluator.execute(test_conversation)

        # Verify result structure
        assert result["granularity"] == "utterance", \
            f"Expected granularity 'utterance', got '{result['granularity']}'"
        assert result["per_utterance"] is not None, "Expected per_utterance to be populated"
        assert len(result["per_utterance"]) == len(test_conversation), \
            f"Expected {len(test_conversation)} results, got {len(result['per_utterance'])}"

        print("\n✓ Result structure valid")
        print(f"  Granularity: {result['granularity']}")
        print(f"  Number of utterances: {len(result['per_utterance'])}")
        print()
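
        # Each per-utterance "toxicity" entry is assumed (based on the accesses
        # below) to hold per-category score dicts shaped like
        #   {"type": "numerical", "value": float, "label": str}
        # plus aggregate "is_toxic" / "primary_category" entries carrying
        # "label" and "confidence" keys.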
        # Display results
        toxic_count = 0
        safe_count = 0
        for i, utt_score in enumerate(result["per_utterance"]):
            utt = test_conversation[i]
            print(f"\n{'=' * 80}")
            print(f"Utterance {i + 1}:")
            print(f"  Speaker: {utt['speaker']}")
            print(f"  Text: {utt['text']}")
            print(f"{'-' * 80}")
            if "toxicity" in utt_score["metrics"]:
                toxicity_scores = utt_score["metrics"]["toxicity"]

                # Overall assessment
                is_toxic = toxicity_scores.get("is_toxic", {})
                print(f"  Overall: {is_toxic.get('label', 'Unknown')} "
                      f"(confidence: {is_toxic.get('confidence', 0):.3f})")
                if is_toxic.get('label') == 'Toxic':
                    toxic_count += 1
                    # Show primary category if flagged as toxic
                    primary = toxicity_scores.get("primary_category", {})
                    if primary:
                        print(f"  Primary Issue: {primary.get('label', 'Unknown')} "
                              f"(score: {primary.get('confidence', 0):.3f})")
                else:
                    safe_count += 1

                # Show individual scores
                print("\n  Detailed Scores:")
                for score_key, score_value in toxicity_scores.items():
                    if score_key not in ["is_toxic", "primary_category"]:
                        if score_value.get('type') == 'numerical':
                            label_text = f" ({score_value.get('label', '')})" if score_value.get('label') else ""
                            print(f"    - {score_key}: {score_value['value']:.4f}{label_text}")
            else:
                print("  No toxicity scores")

        # Summary
        print(f"\n{'=' * 80}")
        print("Summary:")
        print(f"  Safe utterances: {safe_count}")
        print(f"  Toxic utterances: {toxic_count}")
        print(f"  Total utterances: {len(test_conversation)}")
        print(f"  Toxicity rate: {toxic_count / len(test_conversation) * 100:.1f}%")
        print("-" * 80)

        # Test summary statistics method
        print("\n" + "=" * 80)
        print("Testing summary statistics...")
        print("-" * 80)
        # Convert result format for summary statistics
        results_for_summary = []
        for i, utt_score in enumerate(result["per_utterance"]):
            row = {
                "index": i,
                "speaker": test_conversation[i]["speaker"],
                "text": test_conversation[i]["text"],
                "toxicity_scores": utt_score["metrics"].get("toxicity", {}),
            }
            results_for_summary.append(row)

        summary = evaluator.get_summary_statistics(results_for_summary)
        print("\nSummary Statistics:")
        print(f"  Total Utterances: {summary['total_utterances']}")
        print(f"  Toxic Utterances: {summary['toxic_utterances']}")
        print(f"  Toxicity Rate: {summary['toxic_percentage']:.1f}%")
        if summary['category_breakdown']:
            print("\n  Category Breakdown:")
            for cat, count in summary['category_breakdown'].items():
                print(f"    - {cat}: {count}")
        if summary['average_scores']:
            print("\n  Average Scores:")
            for metric, avg in summary['average_scores'].items():
                print(f"    - {metric}: {avg:.4f}")

        print("\n" + "=" * 80)
        print("✅ Test passed!")
    except AssertionError:
        # Re-raise so pytest reports assertion failures instead of passing silently
        raise
    except Exception as e:
        print(f"\n✗ Error: {e}")
        import traceback
        traceback.print_exc()

    print("\n" + "=" * 80)
    print("Test completed!")
    print("=" * 80)


def test_single_utterance(utterance: str):
    """Test a single utterance."""
    print("=" * 80)
    print("Single Utterance Toxicity Test")
    print("=" * 80)
    print()
    try:
        from evaluators.impl.toxicity_evaluator import ToxicityEvaluator
        evaluator = ToxicityEvaluator(model_type="unbiased", device="cpu")
        print(f"Input: \"{utterance}\"")
        print()

        # Build single-item conversation
        conversation: list[Utterance] = [{"speaker": "User", "text": utterance}]
        result: EvaluationResult = evaluator.execute(conversation)

        if result["per_utterance"] and len(result["per_utterance"]) > 0:
            utt_result = result["per_utterance"][0]
            if "toxicity" in utt_result["metrics"]:
                toxicity_scores = utt_result["metrics"]["toxicity"]
                is_toxic = toxicity_scores.get("is_toxic", {})
                print("Result:")
                print(f"  Assessment: {is_toxic.get('label', 'Unknown')}")
                print(f"  Confidence: {is_toxic.get('confidence', 0):.3f}")
                primary = toxicity_scores.get("primary_category", {})
                if primary:
                    print(f"  Primary Category: {primary.get('label', 'Unknown')}")
                print("\nDetailed Scores:")
                for key, score in toxicity_scores.items():
                    if key not in ["is_toxic", "primary_category"] and score.get('type') == 'numerical':
                        print(f"  - {key}: {score['value']:.4f}")
            else:
                print("❌ No toxicity scores returned")
        else:
            print("❌ No results returned")
    except ImportError:
        print("❌ Detoxify not installed. Run: pip install detoxify")
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
    print()


if __name__ == "__main__":
    if len(sys.argv) > 1:
        # Test a single utterance from the command line
        utterance = " ".join(sys.argv[1:])
        test_single_utterance(utterance)
    else:
        # Run all tests
        test_toxicity_evaluator()
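
# Example of the single-utterance mode (sample text is illustrative only):
#   python tests/test_evaluators/test_toxicity_evaluator.py "You're an idiot."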