Spaces:

FaiziRBLX
/

NousAPI

Running

App Files Files Community

FaiziRBLX committed on Apr 11

Commit

907d439

verified ·

1 Parent(s): a93f50a

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -72

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import torch
-import time
-import hashlib
-from collections import defaultdict
 from transformers import AutoTokenizer
 from fastapi import FastAPI, Request, HTTPException, Depends
 from fastapi.middleware.cors import CORSMiddleware
@@ -10,102 +10,116 @@ from slowapi import Limiter, _rate_limit_exceeded_handler
 from slowapi.util import get_remote_address
 from slowapi.errors import RateLimitExceeded
 from pydantic import BaseModel, Field
 from best import ModelConfig, IndonesianLLM, generate_text, _extract_thinking
-# ── Load model ──────────────────────────────────────────────────────────────
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-tokenizer = AutoTokenizer.from_pretrained("indolem/indobert-base-uncased")
-tokenizer.add_special_tokens({"additional_special_tokens": ["<cot>", "</cot>"]})
-checkpoint = torch.load("model.pt", map_location=device, weights_only=False)
-config = checkpoint['config']
-model = IndonesianLLM(config)
-state_dict = checkpoint['model_state_dict']
-if checkpoint.get('dtype') == 'fp16':
-    state_dict = {k: v.float() if v.dtype == torch.float16 else v
-                  for k, v in state_dict.items()}
-model.load_state_dict(state_dict)
 model.eval()
-model.to(device)
-# ── Rate Limiter (slowapi) ───────────────────────────────────────────────────
 limiter = Limiter(key_func=get_remote_address)
-# ── IP Blacklist (in-memory, reset saat restart) ────────────────────────────
-ip_blacklist: set = set()
-ip_request_count: dict = defaultdict(list)  # ip -> [timestamp, ...]
-BLACKLIST_THRESHOLD = 100   # request dalam window ini → blacklist
-BLACKLIST_WINDOW    = 60    # detik
-BLACKLIST_DURATION  = 3600  # banned 1 jam (simpan di set terpisah)
-ip_banned_until: dict = {}  # ip -> timestamp banned sampai kapan
-# ── FastAPI setup ───────────────────────────────────────────────────────────
 app = FastAPI(title="Indonesian LLM API")
 app.state.limiter = limiter
 app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
-# CORS — ganti origins sesuai domain kamu
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["https://nousai.netlify.app"],  # ganti! jangan "*" di production
     allow_methods=["POST", "GET"],
     allow_headers=["*"],
 )
-# Trusted hosts — tolak request dengan Host header aneh
-app.add_middleware(
-    TrustedHostMiddleware,
-    allowed_hosts=["yourdomain.com", "localhost", "127.0.0.1"]
-)
-# ── Middleware: DDoS / Flood Detection ──────────────────────────────────────
 @app.middleware("http")
 async def ddos_protection(request: Request, call_next):
-    ip = get_remote_address(request)
     now = time.time()
-    # Cek apakah IP sedang dibanned
     if ip in ip_banned_until:
         if now < ip_banned_until[ip]:
             remaining = int(ip_banned_until[ip] - now)
-            return HTTPException(
-                status_code=429,
-                detail=f"IP banned. Coba lagi dalam {remaining} detik."
-            )
         else:
-            # Ban sudah habis
             del ip_banned_until[ip]
             ip_request_count[ip] = []
-    # Catat timestamp request ini
     ip_request_count[ip].append(now)
-    # Bersihkan request yang sudah di luar window
-    ip_request_count[ip] = [
-        t for t in ip_request_count[ip]
-        if now - t < BLACKLIST_WINDOW
-    ]
-    # Jika terlalu banyak request → ban
     if len(ip_request_count[ip]) > BLACKLIST_THRESHOLD:
         ip_banned_until[ip] = now + BLACKLIST_DURATION
         ip_request_count[ip] = []
-        raise HTTPException(
-            status_code=429,
-            detail=f"Terlalu banyak request. IP dibanned selama {BLACKLIST_DURATION//60} menit."
-        )
-    response = await call_next(request)
-    return response
-# ── Request/Response Schema ─────────────────────────────────────────────────
 class ChatRequest(BaseModel):
-    message: str = Field(..., min_length=1, max_length=500)  # batasi panjang input
-    max_tokens: int = Field(default=200, ge=10, le=500)      # min 10, max 500
     temperature: float = Field(default=0.7, ge=0.1, le=1.5)
     show_thinking: bool = False
@@ -114,31 +128,30 @@ class ChatResponse(BaseModel):
     thinking: str | None = None
     processing_time_ms: int
-# ── API Key sederhana (opsional tapi direkomendasikan) ──────────────────────
-API_KEYS = {"kunci-rahasia-kamu-123"}  # ganti dengan key yang kuat
 def verify_api_key(request: Request):
     key = request.headers.get("X-API-Key")
     if not key or key not in API_KEYS:
-        raise HTTPException(status_code=401, detail="API key tidak valid.")
     return key
-# ── Endpoints ───────────────────────────────────────────────────────────────
 @app.get("/")
 def health():
     return {"status": "ok", "device": str(device)}
 @app.post("/chat", response_model=ChatResponse)
-@limiter.limit("20/minute")          # max 10 request per menit per IP
-@limiter.limit("100/hour")            # max 50 request per jam per IP
 async def chat(
     req: ChatRequest,
     request: Request,
-    _key: str = Depends(verify_api_key)   # hapus baris ini jika tidak pakai API key
 ):
-    start = time.time()
     prompt = f"{req.message} <cot>"
     full = generate_text(
         model=model, tokenizer=tokenizer, prompt=prompt,
         max_new_tokens=req.max_tokens, temperature=req.temperature,
@@ -147,10 +160,8 @@ async def chat(
     raw = full[len(prompt):].strip()
     thinking, answer = _extract_thinking(raw)
-    elapsed_ms = int((time.time() - start) * 1000)
     return ChatResponse(
         answer=answer if answer else "Maaf, saya tidak mengerti.",
         thinking=thinking if req.show_thinking else None,
-        processing_time_ms=elapsed_ms
     )

 import torch
+import os
+import logging
+import gc
 from transformers import AutoTokenizer
 from fastapi import FastAPI, Request, HTTPException, Depends
 from fastapi.middleware.cors import CORSMiddleware
 from slowapi.util import get_remote_address
 from slowapi.errors import RateLimitExceeded
 from pydantic import BaseModel, Field
+from collections import defaultdict
 from best import ModelConfig, IndonesianLLM, generate_text, _extract_thinking
+import time
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# ── Cek file ────────────────────────────────────────────
+logger.info(f"model.pt ada: {os.path.exists('model.pt')}")
+if os.path.exists('model.pt'):
+    logger.info(f"model.pt size: {os.path.getsize('model.pt') / 1e6:.1f} MB")
+else:
+    raise FileNotFoundError("model.pt tidak ditemukan! Upload dulu ke Space.")
+# ── Device ──────────────────────────────────────────────
+device = torch.device('cpu')  # HF Spaces free = CPU only
+logger.info(f"Device: {device}")
+# ── Tokenizer ───────────────────────────────────────────
+logger.info("Loading tokenizer...")
+tokenizer = AutoTokenizer.from_pretrained("indolem/indobert-base-uncased")
+tokenizer.add_special_tokens({"additional_special_tokens": ["<cot>", "</cot>"]})
+logger.info("Tokenizer OK")
+# ── Model ───────────────────────────────────────────────
+logger.info("Loading checkpoint...")
+try:
+    checkpoint = torch.load("model.pt", map_location='cpu', weights_only=False)
+    logger.info(f"Checkpoint keys: {list(checkpoint.keys())}")
+except Exception as e:
+    logger.error(f"GAGAL load checkpoint: {e}")
+    raise
+logger.info("Building model...")
+try:
+    config = checkpoint['config']
+    model = IndonesianLLM(config)
+    logger.info(f"Model params: {model.count_parameters():,}")
+except Exception as e:
+    logger.error(f"GAGAL buat model: {e}")
+    raise
+logger.info("Loading weights...")
+try:
+    state_dict = checkpoint['model_state_dict']
+    # Konversi fp16 → fp32 in-place (hemat RAM)
+    for k in list(state_dict.keys()):
+        if state_dict[k].dtype == torch.float16:
+            state_dict[k] = state_dict[k].float()
+    model.load_state_dict(state_dict)
+    logger.info("Weights OK")
+except Exception as e:
+    logger.error(f"GAGAL load weights: {e}")
+    raise
+# Bebaskan RAM
+del checkpoint, state_dict
+gc.collect()
+logger.info(f"RAM setelah cleanup: {torch.cuda.memory_allocated()/1e6:.1f} MB (GPU)" if torch.cuda.is_available() else "RAM cleanup done")
 model.eval()
+logger.info("Model siap!")
+# ── Rate limiter ─────────────────────────────────────────
 limiter = Limiter(key_func=get_remote_address)
+ip_request_count: dict = defaultdict(list)
+ip_banned_until: dict = {}
+BLACKLIST_THRESHOLD = 100
+BLACKLIST_WINDOW    = 60
+BLACKLIST_DURATION  = 3600
+# ── FastAPI ──────────────────────────────────────────────
 app = FastAPI(title="Indonesian LLM API")
 app.state.limiter = limiter
 app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["*"],  # ganti dengan domain kamu di production
     allow_methods=["POST", "GET"],
     allow_headers=["*"],
 )
 @app.middleware("http")
 async def ddos_protection(request: Request, call_next):
+    ip  = get_remote_address(request)
     now = time.time()
     if ip in ip_banned_until:
         if now < ip_banned_until[ip]:
             remaining = int(ip_banned_until[ip] - now)
+            raise HTTPException(429, f"IP banned. Coba lagi dalam {remaining}s.")
         else:
             del ip_banned_until[ip]
             ip_request_count[ip] = []
     ip_request_count[ip].append(now)
+    ip_request_count[ip] = [t for t in ip_request_count[ip] if now - t < BLACKLIST_WINDOW]
     if len(ip_request_count[ip]) > BLACKLIST_THRESHOLD:
         ip_banned_until[ip] = now + BLACKLIST_DURATION
         ip_request_count[ip] = []
+        raise HTTPException(429, f"Terlalu banyak request. Banned {BLACKLIST_DURATION//60} menit.")
+    return await call_next(request)
+# ── Schema ───────────────────────────────────────────────
 class ChatRequest(BaseModel):
+    message: str = Field(..., min_length=1, max_length=500)
+    max_tokens: int = Field(default=200, ge=10, le=500)
     temperature: float = Field(default=0.7, ge=0.1, le=1.5)
     show_thinking: bool = False
     thinking: str | None = None
     processing_time_ms: int
+API_KEYS = {"kunci-rahasia-kamu-123"}  # ← ganti!
 def verify_api_key(request: Request):
     key = request.headers.get("X-API-Key")
     if not key or key not in API_KEYS:
+        raise HTTPException(401, "API key tidak valid.")
     return key
+# ── Endpoints ─────────────────────────────────────────────
 @app.get("/")
 def health():
     return {"status": "ok", "device": str(device)}
 @app.post("/chat", response_model=ChatResponse)
+@limiter.limit("10/minute")
+@limiter.limit("50/hour")
 async def chat(
     req: ChatRequest,
     request: Request,
+    _key: str = Depends(verify_api_key)
 ):
+    start  = time.time()
     prompt = f"{req.message} <cot>"
     full = generate_text(
         model=model, tokenizer=tokenizer, prompt=prompt,
         max_new_tokens=req.max_tokens, temperature=req.temperature,
     raw = full[len(prompt):].strip()
     thinking, answer = _extract_thinking(raw)
     return ChatResponse(
         answer=answer if answer else "Maaf, saya tidak mengerti.",
         thinking=thinking if req.show_thinking else None,
+        processing_time_ms=int((time.time() - start) * 1000)
     )