Spaces:
Runtime error
Runtime error
Mustafa Öztürk committed on
Commit ·
7a29d91
1
Parent(s): 398cb92
Add int8 quantization and batch moderation endpoint
Browse files- app/api/endpoints.py +51 -1
- app/ml/model_loader.py +29 -0
- app/services/moderation_service.py +152 -92
app/api/endpoints.py
CHANGED
|
@@ -13,7 +13,7 @@ except ImportError:
|
|
| 13 |
psutil = None
|
| 14 |
|
| 15 |
from app.services.cache_manager import get_cache_counts, load_blacklist_to_ram
|
| 16 |
-
from app.services.moderation_service import run_moderation
|
| 17 |
|
| 18 |
router = APIRouter()
|
| 19 |
|
|
@@ -91,6 +91,12 @@ class ModerationInput(BaseModel):
|
|
| 91 |
platform_dil: Optional[str] = "tr"
|
| 92 |
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
@router.get("/vram-status")
|
| 95 |
def get_vram_status():
|
| 96 |
if not torch.cuda.is_available():
|
|
@@ -151,3 +157,47 @@ async def analyze(input_data: ModerationInput):
|
|
| 151 |
"latency_ms": latency_ms,
|
| 152 |
"performance": performance,
|
| 153 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
psutil = None
|
| 14 |
|
| 15 |
from app.services.cache_manager import get_cache_counts, load_blacklist_to_ram
|
| 16 |
+
from app.services.moderation_service import run_moderation, run_moderation_batch
|
| 17 |
|
| 18 |
router = APIRouter()
|
| 19 |
|
|
|
|
| 91 |
platform_dil: Optional[str] = "tr"
|
| 92 |
|
| 93 |
|
| 94 |
+
class ModerationBatchInput(BaseModel):
    # Request body for the batch moderation endpoint.

    # Texts to moderate; every entry must be a string.
    texts: list[str]
    # Platform language code; defaults to "tr".
    platform_dil: Optional[str] = "tr"
    # Requested inference batch size; defaults to 8.
    batch_size: Optional[int] = 8
|
| 98 |
+
|
| 99 |
+
|
| 100 |
@router.get("/vram-status")
|
| 101 |
def get_vram_status():
|
| 102 |
if not torch.cuda.is_available():
|
|
|
|
| 157 |
"latency_ms": latency_ms,
|
| 158 |
"performance": performance,
|
| 159 |
}
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
@router.post("/analyze-batch")
async def analyze_batch(input_data: ModerationBatchInput):
    """Moderate several texts in one request and return one verdict per text.

    Entries that are blank or whitespace-only are dropped before moderation,
    so ``count`` and ``results`` describe only the texts that were analysed.

    Args:
        input_data: Request body carrying ``texts``, ``platform_dil`` and
            ``batch_size``.

    Returns:
        A dict with ``count``, the effective ``batch_size``, ``latency_ms``,
        the ``performance`` metrics and the per-text ``results`` list.

    Raises:
        HTTPException: 400 when ``texts`` is empty or holds no non-blank text.
    """
    if not input_data.texts:
        raise HTTPException(status_code=400, detail="texts alanı boş olamaz")

    cleaned_texts = [t for t in input_data.texts if isinstance(t, str) and t.strip()]
    if not cleaned_texts:
        raise HTTPException(status_code=400, detail="Geçerli metin bulunamadı")

    # A missing, zero or negative batch size falls back to a usable value.
    batch_size = max(1, int(input_data.batch_size or 8))

    # perf_counter() is monotonic, so the measured latency cannot go negative
    # or jump when the system clock is adjusted (unlike time.time()).
    # NOTE(review): run_moderation_batch is called synchronously inside this
    # async handler; confirm whether it should be offloaded to a worker thread.
    start_time = time.perf_counter()
    batch_results = run_moderation_batch(
        cleaned_texts,
        input_data.platform_dil or "tr",
        batch_size=batch_size,
    )
    latency_ms = round((time.perf_counter() - start_time) * 1000, 2)

    performance = capture_process_metrics()
    performance["latency_ms"] = latency_ms

    # Each batch result is a 6-tuple; pair it with the text it belongs to.
    items = [
        {
            "text": original_text,
            "cleaned_text": cleaned,
            "decision": decision,
            "reason": reason,
            "risk_level": risk,
            "language": lang,
            "details": details,
        }
        for original_text, (decision, reason, risk, lang, cleaned, details) in zip(
            cleaned_texts, batch_results
        )
    ]

    return {
        "count": len(items),
        "batch_size": batch_size,
        "latency_ms": latency_ms,
        "performance": performance,
        "results": items,
    }
|
app/ml/model_loader.py
CHANGED
|
@@ -25,6 +25,17 @@ def load_system():
|
|
| 25 |
model_o = AutoModelForSequenceClassification.from_pretrained(TR_OFF_MODEL_PATH).to(torch_device)
|
| 26 |
model_o.eval()
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
try:
|
| 29 |
gibberish = pipeline(
|
| 30 |
"text-classification",
|
|
@@ -37,6 +48,24 @@ def load_system():
|
|
| 37 |
detox_en = Detoxify("original")
|
| 38 |
detox_multi = Detoxify("multilingual")
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
_STATE.update(
|
| 41 |
{
|
| 42 |
"T_O": tokenizer_o,
|
|
|
|
| 25 |
model_o = AutoModelForSequenceClassification.from_pretrained(TR_OFF_MODEL_PATH).to(torch_device)
|
| 26 |
model_o.eval()
|
| 27 |
|
| 28 |
+
if torch_device.type == "cpu":
|
| 29 |
+
try:
|
| 30 |
+
model_o = torch.quantization.quantize_dynamic(
|
| 31 |
+
model_o,
|
| 32 |
+
{torch.nn.Linear},
|
| 33 |
+
dtype=torch.qint8,
|
| 34 |
+
)
|
| 35 |
+
model_o.eval()
|
| 36 |
+
except Exception:
|
| 37 |
+
pass
|
| 38 |
+
|
| 39 |
try:
|
| 40 |
gibberish = pipeline(
|
| 41 |
"text-classification",
|
|
|
|
| 48 |
detox_en = Detoxify("original")
|
| 49 |
detox_multi = Detoxify("multilingual")
|
| 50 |
|
| 51 |
+
if torch_device.type == "cpu":
|
| 52 |
+
try:
|
| 53 |
+
detox_en.model = torch.quantization.quantize_dynamic(
|
| 54 |
+
detox_en.model,
|
| 55 |
+
{torch.nn.Linear},
|
| 56 |
+
dtype=torch.qint8,
|
| 57 |
+
)
|
| 58 |
+
except Exception:
|
| 59 |
+
pass
|
| 60 |
+
try:
|
| 61 |
+
detox_multi.model = torch.quantization.quantize_dynamic(
|
| 62 |
+
detox_multi.model,
|
| 63 |
+
{torch.nn.Linear},
|
| 64 |
+
dtype=torch.qint8,
|
| 65 |
+
)
|
| 66 |
+
except Exception:
|
| 67 |
+
pass
|
| 68 |
+
|
| 69 |
_STATE.update(
|
| 70 |
{
|
| 71 |
"T_O": tokenizer_o,
|
app/services/moderation_service.py
CHANGED
|
@@ -55,13 +55,117 @@ def calculate_verdict(profanity_hits, insult_hits, ai_scores):
|
|
| 55 |
}
|
| 56 |
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
def run_moderation(text: str, platform_dil: str = "tr"):
|
| 59 |
state = _ensure_runtime_ready()
|
| 60 |
|
| 61 |
temiz = clean_text_nfkc(text)
|
| 62 |
dil = "en" if platform_dil == "en" else "tr"
|
| 63 |
-
pure_text = re.sub(r"[^a-zA-ZçğıöşüÇĞİÖŞÜ0-9\s]", "", temiz).lower()
|
| 64 |
-
words_in_pure_text = set(pure_text.split())
|
| 65 |
|
| 66 |
if is_spam(temiz, dil):
|
| 67 |
return (
|
|
@@ -73,45 +177,11 @@ def run_moderation(text: str, platform_dil: str = "tr"):
|
|
| 73 |
{"action": "MONITOR", "detox": {}},
|
| 74 |
)
|
| 75 |
|
| 76 |
-
|
| 77 |
-
detected_profanity = []
|
| 78 |
-
detected_insult = []
|
| 79 |
-
|
| 80 |
-
for bad_word, category in active_cache.items():
|
| 81 |
-
is_hit = bad_word in words_in_pure_text or (len(bad_word) > 3 and bad_word in pure_text)
|
| 82 |
-
if is_hit:
|
| 83 |
-
if category == "profanity":
|
| 84 |
-
detected_profanity.append(bad_word)
|
| 85 |
-
else:
|
| 86 |
-
detected_insult.append(bad_word)
|
| 87 |
-
|
| 88 |
-
profanity_hits = sorted(set(detected_profanity))
|
| 89 |
-
insult_hits = sorted(set(detected_insult))
|
| 90 |
|
| 91 |
# Fast path: if blacklist catches profanity/insult, skip all ML inference.
|
| 92 |
if profanity_hits or insult_hits:
|
| 93 |
-
|
| 94 |
-
profanity_hits,
|
| 95 |
-
insult_hits,
|
| 96 |
-
{
|
| 97 |
-
"off_score": 0.0,
|
| 98 |
-
"detox_toxicity": 0.0,
|
| 99 |
-
},
|
| 100 |
-
)
|
| 101 |
-
action_map = {
|
| 102 |
-
"CRITICAL": "CENSOR",
|
| 103 |
-
"HIGH": "WARN",
|
| 104 |
-
"MEDIUM": "MONITOR",
|
| 105 |
-
"LOW": "MONITOR",
|
| 106 |
-
"NONE": "ALLOW",
|
| 107 |
-
}
|
| 108 |
-
detail = {
|
| 109 |
-
"hits": profanity_hits,
|
| 110 |
-
"insult_hits": insult_hits,
|
| 111 |
-
"action": action_map.get(verdict["risk_level"], "MONITOR"),
|
| 112 |
-
"fast_path": "blacklist_early_exit",
|
| 113 |
-
}
|
| 114 |
-
return verdict["decision"], verdict["reason"], verdict["risk_level"], dil, temiz, detail
|
| 115 |
|
| 116 |
if dil == "en":
|
| 117 |
if state["GB_PIPE"] is not None:
|
|
@@ -152,61 +222,51 @@ def run_moderation(text: str, platform_dil: str = "tr"):
|
|
| 152 |
"detox_toxicity": tox_score,
|
| 153 |
},
|
| 154 |
)
|
| 155 |
-
|
| 156 |
-
"CRITICAL": "CENSOR",
|
| 157 |
-
"HIGH": "WARN",
|
| 158 |
-
"MEDIUM": "MONITOR",
|
| 159 |
-
"LOW": "MONITOR",
|
| 160 |
-
"NONE": "ALLOW",
|
| 161 |
-
}
|
| 162 |
-
detail.update({"action": action_map.get(verdict["risk_level"], "MONITOR")})
|
| 163 |
return verdict["decision"], verdict["reason"], verdict["risk_level"], dil, temiz, detail
|
| 164 |
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
with torch.no_grad():
|
| 168 |
-
out_o = state["M_O"](**in_o)
|
| 169 |
-
p_o = torch.softmax(out_o.logits, dim=1)[0]
|
| 170 |
-
off_score = float(p_o[1].item()) if p_o.numel() > 1 else float(p_o.max().item())
|
| 171 |
|
| 172 |
-
# Only run Detoxify on uncertain content to reduce inference cost.
|
| 173 |
-
if off_score < 0.60:
|
| 174 |
-
raw_threat_res = state["D_MULTI"].predict(temiz)
|
| 175 |
-
else:
|
| 176 |
-
raw_threat_res = {
|
| 177 |
-
"toxicity": off_score,
|
| 178 |
-
"identity_attack": 0.0,
|
| 179 |
-
"threat": 0.0,
|
| 180 |
-
"insult": 0.0,
|
| 181 |
-
}
|
| 182 |
-
threat_res = {k: float(v) for k, v in raw_threat_res.items()}
|
| 183 |
-
threat = float(threat_res.get("threat", 0.0))
|
| 184 |
-
tox_score = float(threat_res.get("toxicity", 0.0))
|
| 185 |
-
ins_score = float(threat_res.get("insult", 0.0))
|
| 186 |
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
}
|
| 56 |
|
| 57 |
|
| 58 |
+
# Maps a verdict's risk level to the action reported in the result details.
# Callers fall back to "MONITOR" for any risk level that is not listed here.
ACTION_MAP = dict(
    CRITICAL="CENSOR",
    HIGH="WARN",
    MEDIUM="MONITOR",
    LOW="MONITOR",
    NONE="ALLOW",
)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def _extract_blacklist_hits(cleaned_text: str, dil: str):
    """Find blacklist entries that occur in ``cleaned_text``.

    The text is reduced to letters, digits and whitespace and lower-cased.
    An entry is a hit when it equals a whole word, or, for entries longer
    than three characters, when it appears anywhere as a substring.

    Args:
        cleaned_text: Text to scan.
        dil: Language code passed to ``get_blacklist_for_language``.

    Returns:
        A ``(profanity_hits, insult_hits)`` tuple of sorted, de-duplicated
        lists. Entries whose category is not ``"profanity"`` count as insults.
    """
    pure_text = re.sub(r"[^a-zA-ZçğıöşüÇĞİÖŞÜ0-9\s]", "", cleaned_text).lower()
    # str.lower() maps "İ" (U+0130) to "i" followed by U+0307 (combining dot
    # above). Without removing it, upper-case Turkish text such as "SİKTİR"
    # would never match an entry written with a plain "i". The regex above has
    # already removed every other combining mark, so this only undoes lower().
    pure_text = pure_text.replace("\u0307", "")
    words_in_pure_text = set(pure_text.split())

    active_cache = get_blacklist_for_language(dil)
    profanity_found = set()
    insult_found = set()

    for bad_word, category in active_cache.items():
        whole_word = bad_word in words_in_pure_text
        # Short entries must match a whole word to avoid substring false hits.
        embedded = len(bad_word) > 3 and bad_word in pure_text
        if not (whole_word or embedded):
            continue
        if category == "profanity":
            profanity_found.add(bad_word)
        else:
            insult_found.add(bad_word)

    return sorted(profanity_found), sorted(insult_found)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def _blacklist_early_result(profanity_hits, insult_hits, dil, cleaned_text):
    """Build the moderation result for a text caught by the blacklist alone.

    No model is run on this path, so both AI scores handed to
    ``calculate_verdict`` are zero and the verdict rests on the hits only.

    Returns:
        The 6-tuple ``(decision, reason, risk_level, dil, cleaned_text, detail)``.
    """
    zero_scores = {
        "off_score": 0.0,
        "detox_toxicity": 0.0,
    }
    verdict = calculate_verdict(profanity_hits, insult_hits, zero_scores)

    detail = dict(
        hits=profanity_hits,
        insult_hits=insult_hits,
        action=ACTION_MAP.get(verdict["risk_level"], "MONITOR"),
        fast_path="blacklist_early_exit",
    )
    return (
        verdict["decision"],
        verdict["reason"],
        verdict["risk_level"],
        dil,
        cleaned_text,
        detail,
    )
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def _tr_off_scores_batched(text_list, state, batch_size: int = 8):
    """Score texts with the ``state["M_O"]`` classifier, chunk by chunk.

    Args:
        text_list: Texts to score; an empty list returns ``[]`` immediately.
        state: Runtime dict providing ``"T_O"`` (tokenizer callable),
            ``"M_O"`` (model whose output exposes ``.logits``) and
            ``"TORCH_DEVICE"`` (device the input tensors are moved to).
        batch_size: Number of texts per forward pass; values below 1 are
            treated as 1.

    Returns:
        One float score per input text, in input order.
    """
    if not text_list:
        return []

    tokenizer = state["T_O"]
    model = state["M_O"]
    device = state["TORCH_DEVICE"]
    effective_batch = max(1, int(batch_size))

    scores = []
    for start in range(0, len(text_list), effective_batch):
        chunk = text_list[start : start + effective_batch]
        encoded = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, max_length=128)
        encoded = {key: tensor.to(device) for key, tensor in encoded.items()}
        with torch.no_grad():
            logits = model(**encoded).logits

        if logits.shape[1] > 1:
            # Multi-class head: column 1 is used as the score (presumably the
            # "offensive" class; confirm against the model's label mapping).
            chunk_scores = torch.softmax(logits, dim=1)[:, 1]
        else:
            # Single-logit head: softmax over one column is always 1.0, which
            # would score every text as 1.0. Sigmoid gives a usable score.
            chunk_scores = torch.sigmoid(logits[:, 0])

        scores.extend(float(s) for s in chunk_scores.detach().cpu().tolist())

    return scores
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def _tr_result_with_off_score(cleaned_text: str, profanity_hits, insult_hits, off_score: float, state, dil: str):
    """Turn a precomputed offensive score into the final moderation result.

    ``state["D_MULTI"].predict`` is consulted only when ``off_score`` is
    below 0.60. Otherwise a synthetic score dict is used whose ``toxicity``
    equals ``off_score`` and whose other entries are zero.

    Returns:
        The 6-tuple ``(decision, reason, risk_level, dil, cleaned_text, detail)``.
    """
    # Only run Detoxify on uncertain content to reduce inference cost.
    if off_score < 0.60:
        raw_scores = state["D_MULTI"].predict(cleaned_text)
    else:
        raw_scores = {
            "toxicity": off_score,
            "identity_attack": 0.0,
            "threat": 0.0,
            "insult": 0.0,
        }

    # Coerce every score to a built-in float.
    detox_scores = {}
    for label, value in raw_scores.items():
        detox_scores[label] = float(value)

    threat_score = float(detox_scores.get("threat", 0.0))
    toxicity_score = float(detox_scores.get("toxicity", 0.0))
    insult_score = float(detox_scores.get("insult", 0.0))

    detail = {
        "off_score": off_score,
        "toxicity": toxicity_score,
        "insult": insult_score,
        "threat": threat_score,
        "detox": detox_scores,
        "hits": profanity_hits,
        "insult_hits": insult_hits,
    }

    ai_scores = {
        "off_score": off_score,
        "detox_toxicity": toxicity_score,
    }
    verdict = calculate_verdict(profanity_hits, insult_hits, ai_scores)

    # Unknown risk levels fall back to the "MONITOR" action.
    detail["action"] = ACTION_MAP.get(verdict["risk_level"], "MONITOR")
    return (
        verdict["decision"],
        verdict["reason"],
        verdict["risk_level"],
        dil,
        cleaned_text,
        detail,
    )
|
| 162 |
+
|
| 163 |
+
|
| 164 |
def run_moderation(text: str, platform_dil: str = "tr"):
|
| 165 |
state = _ensure_runtime_ready()
|
| 166 |
|
| 167 |
temiz = clean_text_nfkc(text)
|
| 168 |
dil = "en" if platform_dil == "en" else "tr"
|
|
|
|
|
|
|
| 169 |
|
| 170 |
if is_spam(temiz, dil):
|
| 171 |
return (
|
|
|
|
| 177 |
{"action": "MONITOR", "detox": {}},
|
| 178 |
)
|
| 179 |
|
| 180 |
+
profanity_hits, insult_hits = _extract_blacklist_hits(temiz, dil)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
|
| 182 |
# Fast path: if blacklist catches profanity/insult, skip all ML inference.
|
| 183 |
if profanity_hits or insult_hits:
|
| 184 |
+
return _blacklist_early_result(profanity_hits, insult_hits, dil, temiz)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
if dil == "en":
|
| 187 |
if state["GB_PIPE"] is not None:
|
|
|
|
| 222 |
"detox_toxicity": tox_score,
|
| 223 |
},
|
| 224 |
)
|
| 225 |
+
detail.update({"action": ACTION_MAP.get(verdict["risk_level"], "MONITOR")})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
return verdict["decision"], verdict["reason"], verdict["risk_level"], dil, temiz, detail
|
| 227 |
|
| 228 |
+
off_score = _tr_off_scores_batched([temiz], state, batch_size=1)[0]
|
| 229 |
+
return _tr_result_with_off_score(temiz, profanity_hits, insult_hits, off_score, state, dil)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
+
def run_moderation_batch(texts, platform_dil: str = "tr", batch_size: int = 8):
    """Moderate many texts, batching the Turkish classifier forward passes.

    Each text is first checked for spam and then against the blacklist; both
    checks finish without running a model. Remaining English texts are passed
    one by one to ``run_moderation``. Remaining Turkish texts are collected
    and scored together by ``_tr_off_scores_batched``.

    Args:
        texts: Iterable of texts to moderate (any iterable is accepted).
        platform_dil: ``"en"`` selects English; any other value means ``"tr"``.
        batch_size: Texts per classifier forward pass; ``None`` means 8.

    Returns:
        A list aligned with ``texts`` of 6-tuples
        ``(decision, reason, risk_level, language, cleaned_text, detail)``.
    """
    state = _ensure_runtime_ready()
    dil = "en" if platform_dil == "en" else "tr"

    # Materialise once so generators and other non-sized iterables work too.
    texts = list(texts)
    if batch_size is None:
        # Avoid int(None) further down; fall back to the documented default.
        batch_size = 8

    results = [None] * len(texts)
    # (index, cleaned text, profanity hits, insult hits) awaiting model scoring.
    tr_pending = []

    for idx, text in enumerate(texts):
        temiz = clean_text_nfkc(text)

        if is_spam(temiz, dil):
            results[idx] = (
                "🗑️ SPAM/GİBBERİSH",
                "Anlamsız veya tekrarlı içerik.",
                "LOW",
                dil,
                temiz,
                {"action": "MONITOR", "detox": {}},
            )
            continue

        profanity_hits, insult_hits = _extract_blacklist_hits(temiz, dil)
        # Fast path: a blacklist hit skips all model inference.
        if profanity_hits or insult_hits:
            results[idx] = _blacklist_early_result(profanity_hits, insult_hits, dil, temiz)
            continue

        if dil == "en":
            # The English pipeline is not batched; delegate with the raw text.
            results[idx] = run_moderation(text, platform_dil="en")
            continue

        tr_pending.append((idx, temiz, profanity_hits, insult_hits))

    if tr_pending:
        pending_texts = [entry[1] for entry in tr_pending]
        off_scores = _tr_off_scores_batched(pending_texts, state, batch_size=batch_size)
        for (idx, temiz, profanity_hits, insult_hits), off_score in zip(tr_pending, off_scores):
            results[idx] = _tr_result_with_off_score(
                temiz, profanity_hits, insult_hits, off_score, state, dil
            )

    return results
|