| """ | |
| Response Verification System for Codette | |
| Validates and verifies responses across multiple perspectives | |
| """ | |
| import logging | |
| from typing import Dict, List, Any, Optional | |
| from datetime import datetime | |
| logger = logging.getLogger(__name__) | |

class ResponseVerifier:
    """Verifies responses for factuality, safety, and quality"""

    def __init__(self):
        """Initialize response verifier"""
        self.verification_history = []
        self.factuality_checks = {
            "has_claims": 0,
            "verified_claims": 0,
            "uncertain_claims": 0,
            "uncertain_count": 0
        }
        self.safety_flags = {
            "prompt_injection_risk": False,
            "harmful_content": False,
            "misinformation": False,
            "bias_detected": False
        }
        logger.info("ResponseVerifier initialized")

    def verify_response(self, response: str, context: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """
        Verify a response for safety and quality

        Args:
            response: Response text to verify
            context: Optional context information

        Returns:
            Verification result with status and metrics
        """
        try:
            verification_result = {
                "verified": True,
                "confidence": 0.85,
                "issues": [],
                "timestamp": datetime.now().isoformat()
            }

            # Check for safety issues
            safety_result = self._check_safety(response)
            if not safety_result["safe"]:
                verification_result["verified"] = False
                verification_result["issues"].extend(safety_result["issues"])
                verification_result["confidence"] -= 0.3

            # Check for factuality
            factuality_result = self._check_factuality(response)
            verification_result["factuality_score"] = factuality_result["score"]
            if factuality_result["issues"]:
                verification_result["issues"].extend(factuality_result["issues"])

            # Check for coherence
            coherence_result = self._check_coherence(response)
            verification_result["coherence_score"] = coherence_result["score"]

            # Ensure confidence is in valid range
            verification_result["confidence"] = min(1.0, max(0.0, verification_result["confidence"]))

            # Record verification
            self.verification_history.append(verification_result)

            return verification_result
        except Exception as e:
            logger.error(f"Error verifying response: {e}")
            return {
                "verified": False,
                "confidence": 0.0,
                "issues": [str(e)],
                "timestamp": datetime.now().isoformat()
            }

    def process_multi_perspective_response(self,
                                           responses: List[str],
                                           perspectives: List[str],
                                           consciousness_state: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """
        Process and verify responses from multiple perspectives

        Args:
            responses: List of responses from different perspectives
            perspectives: List of perspective names
            consciousness_state: Optional consciousness state context

        Returns:
            Processed response with verification
        """
        try:
            verified_insights = []
            uncertain_insights = []

            for response, perspective in zip(responses, perspectives):
                verification = self.verify_response(response)
                insight_obj = {
                    "text": response,
                    "mode": perspective.lower().replace(" ", "_"),
                    "confidence": verification["confidence"]
                }
                if verification["verified"] and verification["confidence"] > 0.7:
                    verified_insights.append(insight_obj)
                else:
                    uncertain_insights.append(insight_obj)

            # Calculate overall confidence
            all_confidences = [v["confidence"] for v in
                               verified_insights + uncertain_insights]
            overall_confidence = sum(all_confidences) / len(all_confidences) if all_confidences else 0.5

            return {
                "verified_insights": verified_insights,
                "uncertain_insights": uncertain_insights,
                "overall_confidence": overall_confidence,
                "timestamp": datetime.now().isoformat()
            }
        except Exception as e:
            logger.error(f"Error processing multi-perspective response: {e}")
            # Fallback: treat all responses as uncertain, using the same mode naming as above
            return {
                "verified_insights": [],
                "uncertain_insights": [{"text": r, "mode": p.lower().replace(" ", "_"), "confidence": 0.5}
                                       for r, p in zip(responses, perspectives)],
                "overall_confidence": 0.5,
                "timestamp": datetime.now().isoformat()
            }

    def _check_safety(self, response: str) -> Dict[str, Any]:
        """Check response for safety issues"""
        try:
            issues = []
            safe = True

            # Check for prompt injection patterns
            injection_patterns = [
                "ignore", "override", "execute", "system:",
                "root:", "admin:", "debug:", "<script>"
            ]
            for pattern in injection_patterns:
                if pattern.lower() in response.lower():
                    issues.append(f"Possible prompt injection: {pattern}")
                    safe = False

            # Check for harmful content
            harmful_words = [
                "kill", "bomb", "weapon", "destroy",
                "illegal", "violence", "hate"
            ]
            for word in harmful_words:
                if word.lower() in response.lower():
                    issues.append(f"Potentially harmful content: {word}")
                    safe = False

            # Check length (extremely long responses might be suspicious)
            if len(response) > 10000:
                issues.append("Response unusually long")
                safe = False

            return {
                "safe": safe,
                "issues": issues,
                "timestamp": datetime.now().isoformat()
            }
        except Exception as e:
            logger.error(f"Error checking safety: {e}")
            return {"safe": False, "issues": [str(e)]}

    def _check_factuality(self, response: str) -> Dict[str, Any]:
        """Check response for factuality"""
        try:
            score = 0.8  # Default score
            issues = []

            # Check for confident claims without hedging
            confident_markers = ["definitely", "absolutely", "certainly", "always"]
            hedging_markers = ["might", "could", "may", "possibly", "arguably"]
            confident_count = sum(1 for marker in confident_markers
                                  if marker in response.lower())
            hedging_count = sum(1 for marker in hedging_markers
                                if marker in response.lower())
            if confident_count > hedging_count and confident_count > 3:
                score -= 0.1
                issues.append("Over-confident language detected")

            # Check for excessive qualifiers
            qualifier_count = (response.lower().count("apparently")
                               + response.lower().count("allegedly")
                               + response.lower().count("reportedly"))
            if qualifier_count > 2:
                score -= 0.1
                issues.append("Excessive qualifiers detected")

            # Check for contradiction markers
            if " but " in response.lower() or " however, " in response.lower():
                # This is good - shows nuanced thinking
                score += 0.05

            # Ensure score is in valid range
            score = min(1.0, max(0.0, score))

            return {
                "score": score,
                "issues": issues,
                "timestamp": datetime.now().isoformat()
            }
        except Exception as e:
            logger.error(f"Error checking factuality: {e}")
            return {"score": 0.5, "issues": [str(e)]}

    def _check_coherence(self, response: str) -> Dict[str, Any]:
        """Check response for coherence"""
        try:
            score = 0.8  # Default score

            # Check for basic structure
            sentences = response.split(".")
            if len(sentences) < 2:
                score -= 0.2  # Single sentence might not be coherent enough

            # Check for paragraph coherence (average sentence length)
            words_per_sentence = len(response.split()) / max(len(sentences), 1)
            if words_per_sentence < 5:
                score -= 0.1  # Too choppy
            elif words_per_sentence > 30:
                score -= 0.1  # Too dense
            else:
                score += 0.05  # Good balance

            # Check for repeated words (indicates coherence or redundancy)
            words = response.lower().split()
            unique_ratio = len(set(words)) / max(len(words), 1)
            if unique_ratio < 0.6:
                score -= 0.1  # Too much repetition

            # Ensure score is in valid range
            score = min(1.0, max(0.0, score))

            return {
                "score": score,
                "metrics": {
                    "sentence_count": len(sentences),
                    "avg_sentence_length": words_per_sentence,
                    "unique_word_ratio": unique_ratio
                },
                "timestamp": datetime.now().isoformat()
            }
        except Exception as e:
            logger.error(f"Error checking coherence: {e}")
            return {"score": 0.5, "metrics": {}, "timestamp": datetime.now().isoformat()}

    def get_verification_stats(self) -> Dict[str, Any]:
        """Get verification statistics"""
        try:
            if not self.verification_history:
                return {
                    "total_verifications": 0,
                    "verified_count": 0,
                    "unverified_count": 0,
                    "average_confidence": 0.0,
                    "timestamp": datetime.now().isoformat()
                }

            verified_count = sum(1 for v in self.verification_history if v["verified"])
            unverified_count = len(self.verification_history) - verified_count
            avg_confidence = sum(v["confidence"] for v in self.verification_history) / len(self.verification_history)

            return {
                "total_verifications": len(self.verification_history),
                "verified_count": verified_count,
                "unverified_count": unverified_count,
                "verification_rate": verified_count / len(self.verification_history),
                "average_confidence": avg_confidence,
                "timestamp": datetime.now().isoformat()
            }
        except Exception as e:
            logger.error(f"Error getting verification stats: {e}")
            return {"error": str(e)}