# sentinel-api / app/api/endpoints.py
# Author: Mustafa Öztürk
# Commit 7a29d91: Add int8 quantization and batch moderation endpoint
import subprocess
import time
import os
from typing import Optional
import torch
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
try:
import psutil
except ImportError:
psutil = None
from app.services.cache_manager import get_cache_counts, load_blacklist_to_ram
from app.services.moderation_service import run_moderation, run_moderation_batch
# Router collecting the moderation/status endpoints defined in this module.
router = APIRouter()
@router.get("/system-status")
def system_status():
    """Report process RAM, system RAM and CPU load as a JSON dict.

    Raises:
        HTTPException: 500 when psutil could not be imported.
    """
    if psutil is None:
        raise HTTPException(status_code=500, detail="psutil kurulu değil")
    proc = psutil.Process(os.getpid())
    vm = psutil.virtual_memory()
    mb = 1024 * 1024
    payload = {
        "process_ram_mb": round(proc.memory_info().rss / mb, 1),
        "system_ram_total_mb": round(vm.total / mb, 1),
        "system_ram_used_mb": round(vm.used / mb, 1),
        "system_ram_percent": vm.percent,
        # NOTE: interval=1 blocks for one second while sampling CPU load.
        "cpu_percent": psutil.cpu_percent(interval=1),
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    # Printed logs are visible in Hugging Face Space logs and local terminal output.
    print(f"[system-status] {payload}")
    return payload
def get_gpu_info():
    """Return GPU load/VRAM stats for device 0, or None if unavailable.

    Tries `nvidia-smi` first; on any failure falls back to torch's CUDA
    allocator counters (utilization is unknown there), and finally to
    None when CUDA itself is not available.
    """
    query = [
        "nvidia-smi",
        "--query-gpu=utilization.gpu,memory.used,memory.total",
        "--format=csv,noheader,nounits",
    ]
    try:
        output = subprocess.check_output(query, encoding="utf-8", stderr=subprocess.STDOUT)
        # Only the first GPU's row is reported.
        first_row = output.strip().splitlines()[0]
        util, used, total = (field.strip() for field in first_row.split(",", maxsplit=2))
        return {
            "load": int(float(util)),
            "vram_used": int(float(used)),
            "vram_total": int(float(total)),
        }
    except Exception:
        # nvidia-smi missing or its output unparseable -- best-effort torch fallback.
        if not torch.cuda.is_available():
            return None
        mib = 1024 ** 2
        allocated_mb = torch.cuda.memory_allocated(0) / mib
        total_mb = torch.cuda.get_device_properties(0).total_memory / mib
        return {
            "load": None,  # utilization unknown without nvidia-smi
            "vram_used": int(round(allocated_mb)),
            "vram_total": int(round(total_mb)),
        }
def capture_process_metrics():
    """Snapshot CPU/RAM/GPU load for attaching to API responses.

    CPU/RAM fields are None when psutil is absent; GPU fields fall back
    to None/0 when get_gpu_info() reports nothing.
    """
    if psutil is not None:
        cpu = round(psutil.cpu_percent(interval=0.05), 1)
        ram = round(psutil.virtual_memory().percent, 1)
    else:
        cpu = None
        ram = None
    gpu = get_gpu_info()
    has_gpu = bool(gpu)
    return {
        "cpu": cpu,
        "ram_pct": ram,
        "gpu_load": gpu["load"] if has_gpu else None,
        "vram_used": gpu["vram_used"] if has_gpu else 0,
        "vram_total": gpu["vram_total"] if has_gpu else 0,
        "timestamp": time.strftime("%H:%M:%S"),
    }
class ModerationInput(BaseModel):
    """Request body for the /analyze endpoint: one text to moderate."""
    text: str
    # Language code forwarded to the moderation service; defaults to "tr".
    platform_dil: Optional[str] = "tr"
class ModerationBatchInput(BaseModel):
    """Request body for the /analyze-batch endpoint: several texts at once."""
    texts: list[str]
    # Language code forwarded to the moderation service; defaults to "tr".
    platform_dil: Optional[str] = "tr"
    # Model batch size; clamped to >= 1 by the endpoint, defaults to 8.
    batch_size: Optional[int] = 8
@router.get("/vram-status")
def get_vram_status():
    """Report CUDA device-0 memory usage (allocated/reserved/total, MB)."""
    if not torch.cuda.is_available():
        return {
            "cuda_available": False,
            "message": "CUDA aktif değil, GPU belleği ölçülemedi.",
        }
    mib = 1024 ** 2
    allocated_mb = torch.cuda.memory_allocated(0) / mib
    reserved_mb = torch.cuda.memory_reserved(0) / mib
    total_mb = torch.cuda.get_device_properties(0).total_memory / mib
    return {
        "cuda_available": True,
        "gpu_name": torch.cuda.get_device_name(0),
        "allocated_mb": round(allocated_mb, 2),
        "reserved_mb": round(reserved_mb, 2),
        "total_mb": round(total_mb, 2),
        # Rough headroom: total minus what the allocator has reserved.
        "free_estimate_mb": round(total_mb - reserved_mb, 2),
    }
@router.get("/refresh-cache")
def refresh_cache():
    """Reload the blacklist into RAM and report per-language entry counts."""
    load_blacklist_to_ram()
    tr_total, en_total = get_cache_counts()
    result = {"status": "success", "message": "Kara liste güncellendi."}
    result["tr_count"] = tr_total
    result["en_count"] = en_total
    return result
@router.post("/analyze")
async def analyze(input_data: ModerationInput):
    """Moderate one text; return the decision plus latency/system metrics.

    Raises:
        HTTPException: 400 when the text is missing or whitespace-only.
    """
    text = input_data.text
    if not text or not text.strip():
        raise HTTPException(status_code=400, detail="text alanı boş olamaz")
    language = input_data.platform_dil or "tr"
    started = time.time()
    decision, reason, risk, lang, cleaned, details = run_moderation(text, language)
    elapsed_ms = round((time.time() - started) * 1000, 2)
    perf = capture_process_metrics()
    perf["latency_ms"] = elapsed_ms
    return {
        "text": text,
        "cleaned_text": cleaned,
        "decision": decision,
        "reason": reason,
        "risk_level": risk,
        "language": lang,
        "details": details,
        "latency_ms": elapsed_ms,
        "performance": perf,
    }
@router.post("/analyze-batch")
async def analyze_batch(input_data: ModerationBatchInput):
    """Moderate several texts in one request.

    Non-string and whitespace-only entries are silently dropped before
    moderation; results are returned in the order of the surviving texts.

    Raises:
        HTTPException: 400 when the list is empty or no entry is usable.
    """
    if not input_data.texts:
        raise HTTPException(status_code=400, detail="texts alanı boş olamaz")
    valid_texts = [t for t in input_data.texts if isinstance(t, str) and t.strip()]
    if not valid_texts:
        raise HTTPException(status_code=400, detail="Geçerli metin bulunamadı")
    # Falsy batch_size (None or 0) falls back to 8; negatives clamp to 1.
    chunk = max(1, int(input_data.batch_size or 8))
    started = time.time()
    outcomes = run_moderation_batch(
        valid_texts,
        input_data.platform_dil or "tr",
        batch_size=chunk,
    )
    elapsed_ms = round((time.time() - started) * 1000, 2)
    perf = capture_process_metrics()
    perf["latency_ms"] = elapsed_ms
    items = [
        {
            "text": source_text,
            "cleaned_text": cleaned,
            "decision": decision,
            "reason": reason,
            "risk_level": risk,
            "language": lang,
            "details": details,
        }
        for source_text, (decision, reason, risk, lang, cleaned, details)
        in zip(valid_texts, outcomes)
    ]
    return {
        "count": len(items),
        "batch_size": chunk,
        "latency_ms": elapsed_ms,
        "performance": perf,
        "results": items,
    }