# sentinel-api / app/api/endpoints.py
# Author: Mustafa Öztürk
# Commit 7a29d91: Add int8 quantization and batch moderation endpoint
import subprocess
import time
import os
from typing import Optional
import torch
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
try:
import psutil
except ImportError:
psutil = None
from app.services.cache_manager import get_cache_counts, load_blacklist_to_ram
from app.services.moderation_service import run_moderation, run_moderation_batch
# Router collecting the moderation/status endpoints defined in this module.
router = APIRouter()
@router.get("/system-status")
def system_status():
    """Report process RAM, system RAM and CPU load as a JSON dict.

    Raises:
        HTTPException: 500 when psutil could not be imported.
    """
    if psutil is None:
        raise HTTPException(status_code=500, detail="psutil kurulu değil")
    proc = psutil.Process(os.getpid())
    vm = psutil.virtual_memory()
    mb = 1024 * 1024
    payload = {
        "process_ram_mb": round(proc.memory_info().rss / mb, 1),
        "system_ram_total_mb": round(vm.total / mb, 1),
        "system_ram_used_mb": round(vm.used / mb, 1),
        "system_ram_percent": vm.percent,
        # NOTE: interval=1 blocks for one second while sampling CPU load.
        "cpu_percent": psutil.cpu_percent(interval=1),
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    # Printed logs are visible in Hugging Face Space logs and local terminal output.
    print(f"[system-status] {payload}")
    return payload
def get_gpu_info():
    """Return GPU load/VRAM stats for device 0, or None if unavailable.

    Tries `nvidia-smi` first; on any failure falls back to torch's CUDA
    allocator counters (utilization is unknown there), and finally to
    None when CUDA itself is not available.
    """
    query = [
        "nvidia-smi",
        "--query-gpu=utilization.gpu,memory.used,memory.total",
        "--format=csv,noheader,nounits",
    ]
    try:
        output = subprocess.check_output(query, encoding="utf-8", stderr=subprocess.STDOUT)
        # Only the first GPU's row is reported.
        first_row = output.strip().splitlines()[0]
        util, used, total = (field.strip() for field in first_row.split(",", maxsplit=2))
        return {
            "load": int(float(util)),
            "vram_used": int(float(used)),
            "vram_total": int(float(total)),
        }
    except Exception:
        # nvidia-smi missing or its output unparseable -- best-effort torch fallback.
        if not torch.cuda.is_available():
            return None
        mib = 1024 ** 2
        allocated_mb = torch.cuda.memory_allocated(0) / mib
        total_mb = torch.cuda.get_device_properties(0).total_memory / mib
        return {
            "load": None,  # utilization unknown without nvidia-smi
            "vram_used": int(round(allocated_mb)),
            "vram_total": int(round(total_mb)),
        }
def capture_process_metrics():
    """Snapshot CPU/RAM/GPU load for attaching to API responses.

    CPU/RAM fields are None when psutil is absent; GPU fields fall back
    to None/0 when get_gpu_info() reports nothing.
    """
    if psutil is not None:
        cpu = round(psutil.cpu_percent(interval=0.05), 1)
        ram = round(psutil.virtual_memory().percent, 1)
    else:
        cpu = None
        ram = None
    gpu = get_gpu_info()
    has_gpu = bool(gpu)
    return {
        "cpu": cpu,
        "ram_pct": ram,
        "gpu_load": gpu["load"] if has_gpu else None,
        "vram_used": gpu["vram_used"] if has_gpu else 0,
        "vram_total": gpu["vram_total"] if has_gpu else 0,
        "timestamp": time.strftime("%H:%M:%S"),
    }
class ModerationInput(BaseModel):
    """Request body for the /analyze endpoint: one text to moderate."""
    text: str
    # Language code forwarded to the moderation service; defaults to "tr".
    platform_dil: Optional[str] = "tr"
class ModerationBatchInput(BaseModel):
    """Request body for the /analyze-batch endpoint: several texts at once."""
    texts: list[str]
    # Language code forwarded to the moderation service; defaults to "tr".
    platform_dil: Optional[str] = "tr"
    # Model batch size; clamped to >= 1 by the endpoint, defaults to 8.
    batch_size: Optional[int] = 8
@router.get("/vram-status")
def get_vram_status():
    """Report CUDA device-0 memory usage (allocated/reserved/total, MB)."""
    if not torch.cuda.is_available():
        return {
            "cuda_available": False,
            "message": "CUDA aktif değil, GPU belleği ölçülemedi.",
        }
    mib = 1024 ** 2
    allocated_mb = torch.cuda.memory_allocated(0) / mib
    reserved_mb = torch.cuda.memory_reserved(0) / mib
    total_mb = torch.cuda.get_device_properties(0).total_memory / mib
    return {
        "cuda_available": True,
        "gpu_name": torch.cuda.get_device_name(0),
        "allocated_mb": round(allocated_mb, 2),
        "reserved_mb": round(reserved_mb, 2),
        "total_mb": round(total_mb, 2),
        # Rough headroom: total minus what the allocator has reserved.
        "free_estimate_mb": round(total_mb - reserved_mb, 2),
    }
@router.get("/refresh-cache")
def refresh_cache():
    """Reload the blacklist into RAM and report per-language entry counts."""
    load_blacklist_to_ram()
    tr_total, en_total = get_cache_counts()
    result = {"status": "success", "message": "Kara liste güncellendi."}
    result["tr_count"] = tr_total
    result["en_count"] = en_total
    return result
@router.post("/analyze")
async def analyze(input_data: ModerationInput):
    """Moderate one text; return the decision plus latency/system metrics.

    Raises:
        HTTPException: 400 when the text is missing or whitespace-only.
    """
    text = input_data.text
    if not text or not text.strip():
        raise HTTPException(status_code=400, detail="text alanı boş olamaz")
    language = input_data.platform_dil or "tr"
    started = time.time()
    decision, reason, risk, lang, cleaned, details = run_moderation(text, language)
    elapsed_ms = round((time.time() - started) * 1000, 2)
    perf = capture_process_metrics()
    perf["latency_ms"] = elapsed_ms
    return {
        "text": text,
        "cleaned_text": cleaned,
        "decision": decision,
        "reason": reason,
        "risk_level": risk,
        "language": lang,
        "details": details,
        "latency_ms": elapsed_ms,
        "performance": perf,
    }
@router.post("/analyze-batch")
async def analyze_batch(input_data: ModerationBatchInput):
    """Moderate several texts in one request.

    Non-string and whitespace-only entries are silently dropped before
    moderation; results are returned in the order of the surviving texts.

    Raises:
        HTTPException: 400 when the list is empty or no entry is usable.
    """
    if not input_data.texts:
        raise HTTPException(status_code=400, detail="texts alanı boş olamaz")
    valid_texts = [t for t in input_data.texts if isinstance(t, str) and t.strip()]
    if not valid_texts:
        raise HTTPException(status_code=400, detail="Geçerli metin bulunamadı")
    # Falsy batch_size (None or 0) falls back to 8; negatives clamp to 1.
    chunk = max(1, int(input_data.batch_size or 8))
    started = time.time()
    outcomes = run_moderation_batch(
        valid_texts,
        input_data.platform_dil or "tr",
        batch_size=chunk,
    )
    elapsed_ms = round((time.time() - started) * 1000, 2)
    perf = capture_process_metrics()
    perf["latency_ms"] = elapsed_ms
    items = [
        {
            "text": source_text,
            "cleaned_text": cleaned,
            "decision": decision,
            "reason": reason,
            "risk_level": risk,
            "language": lang,
            "details": details,
        }
        for source_text, (decision, reason, risk, lang, cleaned, details)
        in zip(valid_texts, outcomes)
    ]
    return {
        "count": len(items),
        "batch_size": chunk,
        "latency_ms": elapsed_ms,
        "performance": perf,
        "results": items,
    }