Spaces:
Sleeping
Sleeping
File size: 6,647 Bytes
8ad9255 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
from typing import Dict, List
import re
from models.hate_speech_classifier import HateSpeechClassifier
from models.language_detector import detect_language
# Module-level classifier instance, created once when this module is imported.
# analyze_content() uses it for the custom-model, pretrained-model and
# keyword-based classification calls.
classifier = HateSpeechClassifier()
def highlight_keywords(text: str, keywords: List[str]) -> List[str]:
    """Return up to five sentences of ``text`` that contain a keyword.

    The text is split into sentences on runs of ``।``, ``.``, ``!`` and ``?``.
    For each keyword, in the order given, the first sentence containing it
    (case-insensitively) is selected. Each sentence is reported at most once,
    so several keywords hitting the same sentence cannot use up the five
    available slots with duplicates.

    Args:
        text: The text to search.
        keywords: Keywords to look for. Empty strings are ignored because
            they would trivially match every sentence.

    Returns:
        At most five whitespace-stripped sentences, in keyword order.
    """
    max_phrases = 5

    # Split and lowercase once up front; this work does not depend on the
    # keyword, so there is no reason to redo it inside the keyword loop.
    # Matching is done against the unstripped sentence, the stripped form is
    # what gets reported.
    sentences = [
        (sentence.lower(), sentence.strip())
        for sentence in re.split(r'[।.!?]+', text)
    ]

    highlighted: List[str] = []
    seen = set()
    for keyword in keywords:
        needle = keyword.lower()
        if not needle:
            continue
        for lowered, phrase in sentences:
            if needle in lowered:
                if phrase and phrase not in seen:
                    seen.add(phrase)
                    highlighted.append(phrase)
                # Only the first sentence containing the keyword is used.
                break
        if len(highlighted) == max_phrases:
            break

    return highlighted
def _ensemble_weights(has_hate_signal: bool) -> Dict:
    """Return the voting weight for each classification method.

    When keyword analysis reported pattern matches or hate keywords, the
    custom model and keyword analysis are weighted up and the pretrained
    model is weighted down; otherwise the two models are weighted equally.
    The keys match the ``weights_used`` section of the response.
    """
    if has_hate_signal:
        return {
            "custom_model": 0.5,
            "pretrained_model": 0.2,
            "keyword_analysis": 0.3,
        }
    return {
        "custom_model": 0.4,
        "pretrained_model": 0.4,
        "keyword_analysis": 0.2,
    }


def _weighted_vote(votes: List[Dict], has_patterns: bool) -> tuple:
    """Combine per-method votes into a ``(category, confidence)`` pair.

    Each vote is a dict with ``category``, ``confidence`` and ``weight``.
    A category's score is the sum of ``confidence * weight`` over the votes
    for it; the reported confidence is that score divided by the summed
    weight of all votes. With no votes the result is ``("neutral", 0.5)``.
    """
    if not votes:
        return "neutral", 0.5

    total_weight = sum(vote["weight"] for vote in votes)

    category_scores = {}
    for vote in votes:
        category = vote["category"]
        score = vote["confidence"] * vote["weight"]
        category_scores[category] = category_scores.get(category, 0) + score

    ranked = sorted(category_scores.items(), key=lambda item: item[1], reverse=True)
    top_category, top_score = ranked[0]

    if len(ranked) > 1:
        runner_up, runner_up_score = ranked[1]
        # Promote a close second-place "hate_speech" when the keyword
        # analysis also matched patterns. The runner-up is a different dict
        # key than the winner, so the winner cannot be "hate_speech" here.
        if (runner_up == "hate_speech"
                and (top_score - runner_up_score) < 0.15
                and has_patterns):
            return "hate_speech", runner_up_score / total_weight

    return top_category, top_score / total_weight


def _build_reasons(custom_result, pretrained_result, keyword_result,
                   has_patterns: bool) -> List[str]:
    """Build the human-readable list of reasons behind the ensemble result."""
    reasons = []

    if has_patterns:
        reasons.append("Detected hate speech patterns in text structure")

    if custom_result and custom_result["category"] == "hate_speech":
        reasons.append(f"Custom model detected {custom_result['category']} with {custom_result['confidence']:.2%} confidence")

    if pretrained_result:
        if pretrained_result.get("translated"):
            reasons.append(f"Pretrained model analyzed translated text and identified {pretrained_result['category']}")
        elif pretrained_result["category"] != "neutral":
            reasons.append(f"Pretrained model identified {pretrained_result['category']} patterns")

    if keyword_result and keyword_result.get("detected_keywords"):
        reasons.append(f"Found {len(keyword_result['detected_keywords'])} hate/offensive keywords")

    if not reasons:
        reasons = ["Classification based on content analysis"]
    return reasons


async def analyze_content(text: str) -> Dict:
    """Classify ``text`` with all three methods and combine the results.

    The language is detected first, then the custom model, the pretrained
    model and the keyword analysis are each run on the text. Their results
    are merged by weighted voting into an ensemble category and confidence.

    Args:
        text: The text to analyse.

    Returns:
        A dict with the sections ``ensemble``, ``custom_model``,
        ``pretrained_model`` and ``keyword_analysis``, plus
        ``highlighted_phrases``, ``detected_language`` and ``original_text``
        (the input, truncated to 200 characters followed by ``...`` when it
        is longer). A method that returned a falsy result is reported with
        ``available`` set to ``False`` and takes no part in the vote.
    """
    language = detect_language(text)

    custom_result = await classifier.classify_with_custom_model(text, language)
    # The language is passed along so the pretrained model can translate.
    pretrained_result = await classifier.classify_with_pretrained_model(text, language)
    # Normalise a missing keyword result to an empty dict so the lookups
    # below treat it the same way as the other two optional results instead
    # of raising on None.
    keyword_result = classifier.classify_with_keywords(text, language) or {}

    has_patterns = keyword_result.get("pattern_matches", 0) > 0
    has_hate_keywords = keyword_result.get("hate_count", 0) > 0
    weights = _ensemble_weights(has_patterns or has_hate_keywords)

    # Only methods that produced a result get a vote; order is kept as
    # custom, pretrained, keyword so ties rank the same way every time.
    candidates = (
        ("custom_model", custom_result),
        ("pretrained_model", pretrained_result),
        ("keyword_analysis", keyword_result),
    )
    votes = [
        {
            "category": result["category"],
            "confidence": result["confidence"],
            "weight": weights[method],
        }
        for method, result in candidates
        if result
    ]

    final_category, final_confidence = _weighted_vote(votes, has_patterns)
    reasons = _build_reasons(custom_result, pretrained_result, keyword_result, has_patterns)

    all_keywords = keyword_result.get("detected_keywords", [])
    highlighted_phrases = highlight_keywords(text, all_keywords) if all_keywords else []

    return {
        "ensemble": {
            "category": final_category,
            "confidence": float(final_confidence),
            "reasons": reasons,
            "weights_used": weights,
        },
        "custom_model": {
            "available": custom_result is not None,
            "category": custom_result["category"] if custom_result else None,
            "confidence": custom_result["confidence"] if custom_result else None,
            "method": custom_result.get("method") if custom_result else None,
            "raw_prediction": custom_result.get("raw_prediction") if custom_result else None,
        },
        "pretrained_model": {
            "available": pretrained_result is not None,
            "category": pretrained_result["category"] if pretrained_result else None,
            "confidence": pretrained_result["confidence"] if pretrained_result else None,
            "method": pretrained_result.get("method") if pretrained_result else None,
            "raw_labels": pretrained_result.get("raw_labels") if pretrained_result else None,
            "translated": pretrained_result.get("translated", False) if pretrained_result else False,
            "translated_text": pretrained_result.get("translated_text") if pretrained_result else None,
        },
        "keyword_analysis": {
            "available": bool(keyword_result),
            "category": keyword_result.get("category"),
            "confidence": keyword_result.get("confidence"),
            "method": keyword_result.get("method"),
            "detected_keywords": keyword_result.get("detected_keywords", []),
            "hate_count": keyword_result.get("hate_count", 0),
            "offensive_count": keyword_result.get("offensive_count", 0),
            "pattern_matches": keyword_result.get("pattern_matches", 0),
        },
        "highlighted_phrases": highlighted_phrases,
        "detected_language": language,
        "original_text": text[:200] + "..." if len(text) > 200 else text,
    }