Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import hashlib | |
| import json | |
| import logging | |
| import time | |
| from typing import Protocol | |
| from uuid import UUID, uuid4 | |
| from fastapi import HTTPException | |
| from sqlalchemy.orm import Session | |
| from ..config import settings | |
| from ..db.models import Analysis, Feature | |
| from ..schemas import ( | |
| AnalysisResult, | |
| AnalysisSource, | |
| AnalyzeTextRequest, | |
| DetectedFeature, | |
| Lang, | |
| RiskLevel, | |
| ) | |
| from .audio import temp_audio | |
| from .features import extract | |
| from .language import detect_language | |
# Module-level logger for the analysis pipeline. The channel name is a fixed
# string (not derived from __name__), so logging configuration keyed on
# "fraud.analyzer" applies regardless of where this module lives.
log = logging.getLogger("fraud.analyzer")
class ClassifierLike(Protocol):
    """Structural type for the ML classifier consumed by the analyzers.

    Any object exposing a compatible ``predict_proba`` method satisfies this
    protocol; no inheritance is required.
    """

    def predict_proba(self, text: str) -> float:
        """Return the classifier's score for *text* as a single float."""
        ...
def fuse(ml_proba: float, features: list[DetectedFeature]) -> float:
    """Blend the ML probability with the rule-based feature weights.

    The starting point is the weighted sum kept from the spec::

        score = 0.6 * ml_proba + 0.4 * min(1, sum(feature weights))

    At most one of three mutually exclusive corrections is then applied, so
    the result stays sensible when the two signals disagree:

    * ML confident, rules silent (``ml_proba >= 0.7`` and rule signal
      ``< 0.10``): the score is capped at 0.55. A confident ML verdict with
      no hand-crafted feature behind it is treated as likely overfit.
    * Rules confident, ML unconvinced (``ml_proba < 0.30`` and rule signal
      ``>= 0.50``): the score is floored at 0.45. Several lexicon hits are
      trusted even when the classifier has not seen the wording.
    * Both confident (``ml_proba >= 0.75`` and rule signal ``>= 0.35``): the
      score is floored at 0.75.

    NOTE(review): the 0.55 / 0.45 / 0.75 constants are meant to land on the
    MEDIUM/HIGH side of the configured risk thresholds; those thresholds are
    defined in settings, not here. Confirm they stay aligned.

    Args:
        ml_proba: Classifier probability for the text.
        features: Detected rule-based features; only ``weight`` is read.

    Returns:
        The fused score clamped to ``[0.0, 1.0]``.
    """
    rule_signal = min(1.0, sum(feat.weight for feat in features))
    blended = 0.6 * ml_proba + 0.4 * rule_signal

    # The three predicates cannot hold simultaneously (their rule_signal and
    # ml_proba ranges do not overlap), so independent ifs are equivalent to
    # an if/elif chain.
    ml_confident_alone = ml_proba >= 0.7 and rule_signal < 0.10
    rules_confident_alone = ml_proba < 0.30 and rule_signal >= 0.50
    both_confident = ml_proba >= 0.75 and rule_signal >= 0.35

    if ml_confident_alone:
        blended = min(blended, 0.55)
    if rules_confident_alone:
        blended = max(blended, 0.45)
    if both_confident:
        blended = max(blended, 0.75)

    return float(min(1.0, max(0.0, blended)))
def to_risk_level(score: float, lo: float | None = None, hi: float | None = None) -> RiskLevel:
    """Bucket a fused score into LOW / MEDIUM / HIGH.

    Args:
        score: The fused risk score.
        lo: Scores strictly below this are LOW. Falls back to
            ``settings.risk_low_threshold`` when ``None``.
        hi: Scores strictly below this (and not LOW) are MEDIUM. Falls back
            to ``settings.risk_high_threshold`` when ``None``.

    Returns:
        ``RiskLevel.LOW``, ``RiskLevel.MEDIUM`` or ``RiskLevel.HIGH``.
    """
    if lo is None:
        lo = settings.risk_low_threshold
    if hi is None:
        hi = settings.risk_high_threshold

    # The LOW check comes first, so it wins if the thresholds are ever
    # configured with lo > hi.
    if score < lo:
        level = RiskLevel.LOW
    elif score < hi:
        level = RiskLevel.MEDIUM
    else:
        level = RiskLevel.HIGH
    return level
def _preview(text: str, limit: int = 240) -> str:
    """Return a whitespace-stripped preview of *text*, at most *limit* chars.

    Text longer than *limit* is cut to ``limit - 1`` characters and a single
    ellipsis character is appended, so the result is exactly *limit* long.

    Args:
        text: The full text to preview.
        limit: Maximum length of the returned string. A non-positive limit
            yields an empty string.

    Returns:
        The stripped text, possibly truncated with a trailing ellipsis.
    """
    stripped = text.strip()
    if limit <= 0:
        # Without this guard, ``stripped[: limit - 1]`` would use a negative
        # index and return almost the whole text, far exceeding the limit.
        return ""
    if len(stripped) <= limit:
        return stripped
    return stripped[: limit - 1] + "…"
def _persist(
    db: Session,
    *,
    analysis_id: UUID,
    device_id: UUID,
    source: AnalysisSource,
    language: Lang,
    risk_score: float,
    risk_level: RiskLevel,
    detected: list[DetectedFeature],
    transcript_preview: str | None,
    text_hash: str | None,
    duration_ms: int,
) -> Analysis:
    """Store one analysis and its detected features, then return the row.

    The ``Analysis`` row is built from the keyword arguments (UUIDs are
    stored as strings, enums as their ``.value``). One ``Feature`` child is
    attached per detected feature, and the session is committed and the row
    refreshed before returning.

    Args:
        db: Session used for the insert; it is committed by this function.
        analysis_id: Primary key for the new row.
        device_id: Identifier of the requesting device.
        source: Origin of the analysed content.
        language: Language resolved for the content.
        risk_score: Fused risk score.
        risk_level: Bucketed risk level.
        detected: Rule-based features to store alongside the row.
        transcript_preview: Shortened text stored with the row, if any.
        text_hash: Hash of the analysed text, if any.
        duration_ms: Processing time in milliseconds.

    Returns:
        The refreshed ``Analysis`` instance.
    """
    record = Analysis(
        id=str(analysis_id),
        device_id=str(device_id),
        source=source.value,
        language=language,
        risk_score=risk_score,
        risk_level=risk_level.value,
        duration_ms=duration_ms,
        transcript_preview=transcript_preview,
        text_hash=text_hash,
    )

    for item in detected:
        # Evidence is serialised to JSON text; ensure_ascii=False keeps
        # non-ASCII evidence human-readable in storage.
        child = Feature(
            type=item.type.value,
            weight=item.weight,
            evidence=json.dumps(item.evidence, ensure_ascii=False),
        )
        record.features.append(child)

    db.add(record)
    db.commit()
    # Reload so attributes populated on the database side are available to
    # the caller (presumably e.g. created_at — verify against the model).
    db.refresh(record)
    return record
def _result_from_row(row: Analysis, detected: list[DetectedFeature], transcript: str | None) -> AnalysisResult:
    """Build the API response object from a persisted ``Analysis`` row.

    Stored primitives are converted back to their typed form (``UUID``,
    ``AnalysisSource``, ``RiskLevel``). The detected features and transcript
    are passed through from the caller rather than read from the row.

    Args:
        row: The persisted analysis.
        detected: Features detected during this analysis.
        transcript: Full analysed text to return to the client, if any.

    Returns:
        The populated ``AnalysisResult``.
    """
    result_id = UUID(row.id)
    origin = AnalysisSource(row.source)
    level = RiskLevel(row.risk_level)
    return AnalysisResult(
        id=result_id,
        created_at=row.created_at,
        source=origin,
        language=row.language,
        risk_score=row.risk_score,
        risk_level=level,
        detected_features=detected,
        transcript=transcript,
        duration_ms=row.duration_ms,
    )
def analyze_text_sync(
    payload: AnalyzeTextRequest,
    device_id: UUID,
    db: Session,
    clf: ClassifierLike | None,
) -> AnalysisResult:
    """Analyse a text message end to end and persist the outcome.

    Steps: resolve the language, extract rule-based features, obtain the ML
    probability (``0.0`` when no classifier is supplied), fuse both signals
    into a score, bucket it into a risk level, and store the analysis.

    Args:
        payload: Request carrying the text and an optional language hint.
        device_id: Identifier of the requesting device.
        db: Session used to persist the analysis.
        clf: Classifier to score the text, or ``None`` to rely on rules only.

    Returns:
        The persisted analysis as an ``AnalysisResult``, including the full
        input text as ``transcript``.
    """
    t0 = time.perf_counter()

    body = payload.text
    language = detect_language(body, payload.language_hint)
    features = extract(body, language)

    if clf is None:
        probability = 0.0
    else:
        probability = clf.predict_proba(body)

    risk_score = fuse(probability, features)
    risk_level = to_risk_level(risk_score)

    # Timing covers detection and scoring only; persistence is excluded.
    elapsed_ms = int((time.perf_counter() - t0) * 1000)

    stored = _persist(
        db,
        analysis_id=uuid4(),
        device_id=device_id,
        source=AnalysisSource.TEXT,
        language=language,
        risk_score=risk_score,
        risk_level=risk_level,
        detected=features,
        transcript_preview=_preview(body),
        text_hash=hashlib.sha256(body.encode("utf-8")).hexdigest(),
        duration_ms=elapsed_ms,
    )
    return _result_from_row(stored, features, transcript=body)
def analyze_audio_sync(
    audio_bytes: bytes,
    content_type: str | None,
    source: AnalysisSource,
    language_hint: Lang | None,
    device_id: UUID,
    db: Session,
    whisper,
    clf: ClassifierLike | None,
) -> AnalysisResult:
    """Transcribe an audio clip, analyse the transcript and persist the outcome.

    The audio is written to a temporary file (suffix chosen from the content
    type) and transcribed. The transcript then goes through the same
    feature-extraction, ML-scoring and fusion steps as plain text.

    Language resolution order: the caller's hint if it is ``"ru"`` or
    ``"kk"``, else the language reported by transcription if it is one of
    those, else ``detect_language`` on the transcript.

    Args:
        audio_bytes: Raw audio payload.
        content_type: MIME type of the payload, if known.
        source: Origin to record for this analysis.
        language_hint: Optional language supplied by the caller.
        device_id: Identifier of the requesting device.
        db: Session used to persist the analysis.
        whisper: Transcription model handle passed through to ``transcribe``.
        clf: Classifier to score the transcript, or ``None`` for rules only.

    Returns:
        The persisted analysis as an ``AnalysisResult`` with the full
        transcript.

    Raises:
        HTTPException: 400 with error code ``EMPTY_AUDIO`` when the
            transcript is empty or only whitespace.
    """
    # Imported at call time rather than at module import.
    from ..ml.whisper import transcribe

    t0 = time.perf_counter()

    # Only transcription needs the temporary file on disk.
    with temp_audio(audio_bytes, _suffix_for_mime(content_type)) as path:
        transcript, whisper_lang, _ = transcribe(whisper, path, language_hint)

    if not (transcript and transcript.strip()):
        raise HTTPException(
            status_code=400,
            detail={"error": "EMPTY_AUDIO", "message": "Whisper returned no segments"},
        )

    lang: Lang
    if language_hint in ("ru", "kk"):
        lang = language_hint
    elif whisper_lang in ("ru", "kk"):
        lang = whisper_lang
    else:
        lang = detect_language(transcript)

    features = extract(transcript, lang)

    if clf is None:
        probability = 0.0
    else:
        probability = clf.predict_proba(transcript)

    risk_score = fuse(probability, features)
    risk_level = to_risk_level(risk_score)

    # Timing covers transcription and scoring; persistence is excluded.
    elapsed_ms = int((time.perf_counter() - t0) * 1000)

    stored = _persist(
        db,
        analysis_id=uuid4(),
        device_id=device_id,
        source=source,
        language=lang,
        risk_score=risk_score,
        risk_level=risk_level,
        detected=features,
        transcript_preview=_preview(transcript),
        text_hash=hashlib.sha256(transcript.encode("utf-8")).hexdigest(),
        duration_ms=elapsed_ms,
    )
    return _result_from_row(stored, features, transcript=transcript)
# Maps a bare audio media type (lower-case, no parameters) to the file suffix
# used for the temporary audio file.
_MIME_SUFFIX = {
    "audio/mpeg": ".mp3",
    "audio/mp3": ".mp3",
    "audio/wav": ".wav",
    "audio/x-wav": ".wav",
    "audio/mp4": ".m4a",
    "audio/m4a": ".m4a",
    "audio/x-m4a": ".m4a",
    "audio/ogg": ".ogg",
    "audio/webm": ".webm",
    "audio/aac": ".aac",
}


def _suffix_for_mime(mime: str | None) -> str:
    """Return the file suffix for an audio content type.

    The value is normalised before lookup: anything after the first ``;``
    (media-type parameters such as ``codecs=opus``) is dropped, surrounding
    whitespace is stripped, and the type is lower-cased. Without this, a
    header like ``audio/webm;codecs=opus`` would miss the table and fall
    back to ``.bin``.

    Args:
        mime: Content type as received, possibly with parameters, or
            ``None``.

    Returns:
        The mapped suffix (including the leading dot), or ``".bin"`` when
        the type is missing or not in the table.
    """
    if not mime:
        return ".bin"
    media_type = mime.split(";", 1)[0].strip().lower()
    return _MIME_SUFFIX.get(media_type, ".bin")