AcharO's picture
feat(ha): Hausa/Zulu lexicons, caller=studylabs gate skip, budurwa precision fix — HA P=1.000
63255ea
"""JuaKazi rewrite service — core correction logic (no HTTP)."""
import time
from typing import Optional
from config import (
AIBRIDGE_ENABLED,
DEFAULT_REWRITE_CONFIDENCE,
REWRITE_CONFIDENCE_BY_SOURCE,
get_semantic_threshold,
)
from core.semantic_preservation import SemanticPreservationMetrics
from .bias_detection_client import LANG_CODE_MAP, AibridgeResult, detect_bias
from .disambiguator import disambiguate
from .ml_rewriter import ml_rewrite
from .rules_engine import apply_rules_on_spans, build_reason
from .schemas import RewriteResponse
# Module-level scorer instance, shared by every rewrite_text call to compute the
# composite semantic-preservation score between the original and rewritten text.
semantic_metrics = SemanticPreservationMetrics()
# Callers that already ran bias detection upstream — skip the Stage 0 external /detect gate.
# StudyLabs (and similar) detect bias themselves; we only correct when they hit
# POST /rewrite with one of these caller values. Membership is tested against the
# stripped, lower-cased caller string, so entries here must be lower-case.
_SKIP_EXTERNAL_BIAS_GATE_CALLERS = frozenset({"aibridge", "studylabs"})
def rewrite_text(
    id: str,
    text: str,
    lang: str,
    flags: Optional[list] = None,
    region_dialect: Optional[str] = None,
    caller: Optional[str] = None,
) -> tuple[RewriteResponse, dict]:
    """Run bias detection + correction for one text; return ``(response, audit_info)``.

    Stages, in order:

    * Stage 0 - optional external bias gate (``detect_bias``). Runs only when
      ``AIBRIDGE_ENABLED`` is set, the caller is not in
      ``_SKIP_EXTERNAL_BIAS_GATE_CALLERS`` and ``LANG_CODE_MAP`` has an entry
      for ``lang``. If the detector answers without error and reports no
      bias, the text is returned unchanged with source ``"aibridge_preserved"``.
      If the detector reports an error, the gate is ignored and correction
      proceeds as if it had not run.
    * Stage 1 - lexicon rules (``apply_rules_on_spans``). A rewrite whose
      composite preservation score is below the configured threshold is
      rolled back (source ``"preserved"``, no edits).
    * Stage 2.5 - LLM disambiguation, Swahili (``"sw"``) only, when rules
      matched but produced no ``"replace"``-severity edit.
    * ML fallback - only when no rule matched; also rolled back when it
      scores below the threshold.

    Args:
        id: Request identifier, echoed back in the response. (Shadows the
            builtin, but the name is part of the public signature.)
        text: Text to analyse and, where needed, rewrite.
        lang: Language code; drives the ``LANG_CODE_MAP`` lookup, the rules
            engine, the ``"sw"`` disambiguation check and the ML rewriter.
        flags: Passed through unchanged to ``apply_rules_on_spans``.
        region_dialect: Recorded in ``audit_info`` only (``"unknown"`` when
            falsy).
        caller: Integration partner name. ``"studylabs"`` or ``"aibridge"``
            (case/whitespace-insensitive) skips Stage 0 because the partner
            already detected bias; we only correct.

    Returns:
        ``(response, audit_info)``. ``audit_info`` always carries
        ``model_info``, ``latency_ms``, ``region_dialect`` and
        ``aibridge_error`` for logging.
    """
    # perf_counter is monotonic: the measured latency cannot go negative or
    # jump if the system wall clock is adjusted mid-request.
    t0 = time.perf_counter()

    # ---- Stage 0: optional external bias detection gate -------------------
    aibridge_result: Optional[AibridgeResult] = None
    caller_norm = (caller or "").strip().lower()
    skip_gate = caller_norm in _SKIP_EXTERNAL_BIAS_GATE_CALLERS
    ext_lang = LANG_CODE_MAP.get(lang) if (AIBRIDGE_ENABLED and not skip_gate) else None
    if ext_lang:
        aibridge_result = detect_bias(text, ext_lang)
        # Only trust a "no bias" verdict when the call itself succeeded;
        # on error we fall through to the normal correction pipeline.
        if aibridge_result.error is None and not aibridge_result.has_bias:
            latency_ms = int((time.perf_counter() - t0) * 1000)
            response = RewriteResponse(
                id=id,
                original_text=text,
                rewrite=text,
                edits=[],
                confidence=REWRITE_CONFIDENCE_BY_SOURCE["aibridge_preserved"],
                needs_review=False,
                source="aibridge_preserved",
                reason=build_reason("aibridge_preserved", [], []),
                semantic_score=None,
                skipped_context=None,
                has_bias_detected=False,
                aibridge_confidence=aibridge_result.confidence,
                aibridge_detected=False,
            )
            audit_info = {
                "model_info": {
                    "model": "aibridge-external",
                    "confidence": aibridge_result.confidence,
                    "message": aibridge_result.message,
                },
                "latency_ms": latency_ms,
                "region_dialect": region_dialect or "unknown",
                # Same key set as the normal exit path; the error is
                # necessarily None on this branch.
                "aibridge_error": None,
            }
            return response, audit_info

    # ---- Stage 1: lexicon rules -------------------------------------------
    rewritten, edits, matched_rules, skipped = apply_rules_on_spans(
        text, lang, flags=flags
    )
    source = "rules"
    ml_info = None
    semantic_score = None
    threshold = get_semantic_threshold()
    if rewritten != text:
        score = semantic_metrics.calculate_composite_preservation_score(text, rewritten)
        semantic_score = score["composite_score"]
        if semantic_score < threshold:
            # The rule rewrite drifted too far from the original: roll back.
            rewritten, edits, source, semantic_score = text, [], "preserved", 1.0

    # ---- Stage 2.5: LLM disambiguation for warn-only matches (SW) ----------
    # Fires when rules matched but produced no replace-severity edit.
    # Skipped after a semantic rollback: the edits were just discarded, so
    # there is nothing left to promote, and without this guard the result
    # could be relabelled "disambiguated" with no edits (plus a wasted LLM call).
    warn_only = (
        source != "preserved"
        and matched_rules > 0
        and not any(e.get("severity") == "replace" for e in edits)
    )
    if warn_only and lang == "sw":
        llm_result = disambiguate(text)
        if llm_result is True:
            # LLM confirmed bias — promote the warn edits to replace.
            for e in edits:
                if e.get("severity") == "warn":
                    e["severity"] = "replace"
                    e["reason"] = (e.get("reason") or "") + " [LLM confirmed]"
            source = "disambiguated"
        elif llm_result is False:
            # LLM says not bias — suppress the warn edits.
            edits = []
            rewritten = text
            source = "preserved"
        # Any other result (e.g. None) leaves the rule output untouched.

    # ---- ML fallback: only when no rule matched at all ---------------------
    if matched_rules == 0 and source != "preserved":
        ml_out = ml_rewrite(text, lang=lang, num_return_sequences=3)
        ml_score = semantic_metrics.calculate_composite_preservation_score(
            text, ml_out["best"]
        )
        if ml_score["composite_score"] < threshold:
            rewritten, source, semantic_score = text, "preserved", 1.0
        else:
            rewritten = ml_out["best"]
            source = "ml"
            semantic_score = ml_score["composite_score"]
            ml_info = ml_out
            edits.append({
                "from": text,
                "to": rewritten,
                "severity": "ml_fallback",
                "tags": "",
                "reason": "ML rewrite",
            })

    # ---- Assemble response + audit record ----------------------------------
    latency_ms = int((time.perf_counter() - t0) * 1000)
    confidence = REWRITE_CONFIDENCE_BY_SOURCE.get(source, DEFAULT_REWRITE_CONFIDENCE)
    # ML output and "nothing changed" outcomes both go to human review.
    needs_review = source == "ml" or not edits
    # The external verdict is only reported when the gate ran without error;
    # otherwise it stays None ("unknown") in the response.
    aibridge_ok = aibridge_result is not None and aibridge_result.error is None
    aibridge_detected = aibridge_result.has_bias if aibridge_ok else None
    reason = build_reason(source, edits, skipped, aibridge_detected=bool(aibridge_detected))
    has_bias_detected = any(e.get("severity") == "replace" for e in edits)
    response = RewriteResponse(
        id=id,
        original_text=text,
        rewrite=rewritten,
        edits=edits,
        confidence=confidence,
        needs_review=needs_review,
        source=source,
        reason=reason,
        semantic_score=semantic_score,
        skipped_context=skipped or None,
        has_bias_detected=has_bias_detected,
        aibridge_confidence=aibridge_result.confidence if aibridge_ok else None,
        aibridge_detected=aibridge_detected,
    )
    audit_info = {
        "model_info": ml_info or {"model": "rulepack-v0.3"},
        "latency_ms": latency_ms,
        "region_dialect": region_dialect or "unknown",
        "aibridge_error": aibridge_result.error if aibridge_result is not None else None,
    }
    return response, audit_info