# Hugging Face Space: sedtha/khmer-summarizer-api-mBART-LoRA
# Space status at capture time: Sleeping
# File size: 5,497 bytes

import threading
import warnings

import torch
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from peft import PeftModel
from pydantic import BaseModel
from transformers import (
    MBartForConditionalGeneration, MBart50Tokenizer,
    MT5ForConditionalGeneration, T5Tokenizer
)

# Suppress every Python warning process-wide (the blanket "ignore" filter also
# hides deprecation notices raised by the imported libraries).
warnings.filterwarnings("ignore")

# The ASGI application; title/description/version feed the generated API docs.
app = FastAPI(
    title="Khmer Summarization API",
    description="mBART-LoRA + mT5 in ONE API",
    version="1.1.0"
)

# ================= CORS =================
# Any origin may call the API. Credentialed requests are disabled because a
# wildcard origin must not be combined with allow_credentials=True (see the
# FastAPI CORS documentation), and no endpoint in this file reads cookies or
# authorization headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# ================= Device =================
# Run on the GPU when PyTorch reports one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# ================= Models Config =================
# Registry of the models this service can serve, keyed by the identifier that
# clients send. The "model" and "tokenizer" slots start as None and are filled
# in the first time the entry is loaded.
MODELS = dict(
    model1=dict(
        name="Khmer mBART + LoRA",
        type="mbart",
        repo="sedtha/mBart-50-large_LoRa_kh_sumerize",
        model=None,
        tokenizer=None,
    ),
    model2=dict(
        name="Khmer mT5",
        type="mt5",
        repo="angkor96/khmer-mT5-news-summarization",
        model=None,
        tokenizer=None,
    ),
)

# ================= Load Model =================
# Serializes first-time loads. FastAPI runs plain ``def`` endpoints in a thread
# pool, so two requests can reach load_model() at the same time; without the
# lock both would load the same weights into memory.
_MODEL_LOAD_LOCK = threading.Lock()


def load_model(key: str):
    """Return the ``(model, tokenizer)`` pair for *key*, loading it on first use.

    The loaded objects are cached in ``MODELS[key]`` so later calls return
    immediately. Loading moves the model to ``device`` and puts it in eval
    mode.

    Args:
        key: Identifier of an entry in ``MODELS``.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        KeyError: If *key* is not an entry in ``MODELS``.
        ValueError: If the entry's ``"type"`` is neither ``"mbart"`` nor
            ``"mt5"``.
    """
    info = MODELS[key]

    # Fast path: already loaded, no locking needed.
    if info["model"] is not None:
        return info["model"], info["tokenizer"]

    with _MODEL_LOAD_LOCK:
        # Re-check under the lock: another thread may have finished loading
        # while this one was waiting.
        if info["model"] is None:
            print(f"🔹 Loading {info['name']}...")

            if info["type"] == "mbart":
                tokenizer = MBart50Tokenizer.from_pretrained(
                    info["repo"],
                    src_lang="km_KH",
                    tgt_lang="km_KH",
                    cache_dir="./cache"
                )

                base_model = MBartForConditionalGeneration.from_pretrained(
                    "facebook/mbart-large-50",
                    cache_dir="./cache"
                )

                model = PeftModel.from_pretrained(
                    base_model,
                    info["repo"],
                    cache_dir="./cache"
                )

                # ✅ IMPORTANT: Merge LoRA weights
                model = model.merge_and_unload()

            elif info["type"] == "mt5":
                tokenizer = T5Tokenizer.from_pretrained(
                    info["repo"],
                    cache_dir="./cache"
                )

                model = MT5ForConditionalGeneration.from_pretrained(
                    info["repo"],
                    cache_dir="./cache"
                )

            else:
                # Fail clearly instead of hitting an unbound ``model`` below.
                raise ValueError(f"Unsupported model type: {info['type']!r}")

            model = model.to(device)
            model.eval()

            # Publish the tokenizer first: the lock-free fast path above keys
            # on info["model"], so it must never see a model without its
            # tokenizer.
            info["tokenizer"] = tokenizer
            info["model"] = model

            print(f"✅ Loaded {info['name']}")

    return info["model"], info["tokenizer"]

# ================= Request Schema =================
class SummarizeRequest(BaseModel):
    """JSON request body accepted by the ``/summarize`` and ``/compare`` endpoints."""

    # Text to summarize; the endpoints reject it when it is empty or
    # whitespace-only.
    text: str
    # Key of the MODELS entry to use; "/summarize" rejects unknown keys and
    # "/compare" does not read this field.
    model: str = "model2"

# ================= Summarization =================
@app.post("/summarize")
def summarize(req: SummarizeRequest):
    """Summarize ``req.text`` with the model selected by ``req.model``.

    The input is truncated to 1024 tokens and the summary is sampled with at
    most 150 new tokens. If the decoded summary contains the Khmer full stop
    "។", everything after its last occurrence is dropped.

    Returns:
        A dict with the model's display name under ``"model"`` and the
        stripped summary under ``"summary"``.

    Raises:
        HTTPException: 400 when the text is blank or the model key is unknown.
    """
    if not req.text.strip():
        raise HTTPException(status_code=400, detail="Text is empty")

    chosen_key = req.model
    if chosen_key not in MODELS:
        raise HTTPException(status_code=400, detail="Invalid model")

    net, tok = load_model(chosen_key)
    spec = MODELS[chosen_key]

    encoded = tok(req.text, return_tensors="pt", truncation=True, max_length=1024)
    encoded = encoded.to(device)

    # Sampling-based decoding with repetition controls.
    sampling_options = dict(
        max_new_tokens=150,
        do_sample=True,
        temperature=1.0,
        top_p=0.95,
        top_k=100,
        repetition_penalty=1.2,
        no_repeat_ngram_size=3,
    )

    # ✅ mBART needs the Khmer language token forced as the first output token
    if spec["type"] == "mbart":
        sampling_options["forced_bos_token_id"] = tok.lang_code_to_id["km_KH"]

    with torch.no_grad():
        generated = net.generate(**encoded, **sampling_options)

    text_out = tok.decode(generated[0], skip_special_tokens=True)

    # Khmer sentence cleanup: keep everything up to the last full stop.
    last_stop = text_out.rfind("។")
    if last_stop != -1:
        text_out = text_out[:last_stop + 1]

    return {"model": spec["name"], "summary": text_out.strip()}

# ================= Health =================
@app.get("/")
def root():
    """Return a fixed status message showing the service is reachable."""
    return dict(status="Khmer Summarization API is running 🚀")

@app.get("/health")
def health_check():
    """Report the compute device and which MODELS entries are already loaded."""
    loaded_flags = {}
    for key, info in MODELS.items():
        # An entry counts as loaded once its "model" slot has been filled.
        loaded_flags[key] = info["model"] is not None

    return {
        "status": "healthy",
        "device": str(device),
        "models_loaded": loaded_flags,
    }

# ================= Optional Compare Endpoint =================
@app.post("/compare")
def compare(req: SummarizeRequest):
    """Summarize ``req.text`` with every entry in MODELS and return all outputs.

    Each model receives the input truncated to 1024 tokens and generates at
    most 120 new tokens with its default decoding settings. ``req.model`` is
    not read.

    Returns:
        A dict mapping each model's display name to its decoded summary.

    Raises:
        HTTPException: 400 when the text is blank.
    """
    if not req.text.strip():
        raise HTTPException(status_code=400, detail="Text is empty")

    outputs = {}

    for key, spec in MODELS.items():
        net, tok = load_model(key)

        batch = tok(
            req.text, return_tensors="pt", truncation=True, max_length=1024
        ).to(device)

        options = {"max_new_tokens": 120}
        # mBART needs the Khmer language token forced as the first output token.
        if spec["type"] == "mbart":
            options["forced_bos_token_id"] = tok.lang_code_to_id["km_KH"]

        with torch.no_grad():
            generated = net.generate(**batch, **options)

        outputs[spec["name"]] = tok.decode(generated[0], skip_special_tokens=True)

    return outputs

# ================= Startup =================
@app.on_event("startup")
async def startup_event():
    """Print a short startup banner; no model is loaded here."""
    banner = (
        "🚀 Starting Khmer Summarization API...",
        f"Using device: {device}",
        "Models will load on first request (memory efficient)",
    )
    for line in banner:
        print(line)