kataklassifer / api /main.py
krislette's picture
Docker config for HF deployment
628ca9d
"""
FastAPI backend for Gairaigo Map.
Endpoints:
GET /health - liveness check
GET /languages - returns the 3 classifiable languages
POST /predict - classifies a katakana word
POST /emotion - detects emotion from plain text, returns music list + loanwords
Usage:
uvicorn main:app --reload --port 8000
"""
import re
import numpy as np
import joblib
from pathlib import Path
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, field_validator
from transformers import pipeline
# ── Paths ──────────────────────────────────────────────────────────────────────
# Directory that contains this module.
BASE_DIR = Path(__file__).parent

# Use the sibling "models" directory when it exists; otherwise fall back to the
# Docker layout, where /home/user/app is the working root.
_repo_models = BASE_DIR.parent / "models"
MODELS_DIR = _repo_models if _repo_models.exists() else Path("/home/user/app/models")

# Serialized classifier artifacts loaded at application startup.
MODEL_PATH, VECTORIZER_PATH, ENCODER_PATH = (
    MODELS_DIR / filename
    for filename in ("model.joblib", "vectorizer.joblib", "encoder.joblib")
)
# Matches strings made up solely of code points in the U+30A0–U+30FF block
# (the explicitly listed U+30FC, U+30FB, U+30FE and U+30FD fall inside that range).
KATAKANA_RE = re.compile(r"^[\u30A0-\u30FF\u30FC\u30FB\u30FE\u30FD]+$")


def is_katakana(text: str) -> bool:
    """Return True when *text*, ignoring surrounding whitespace, is all katakana.

    An empty or whitespace-only string yields False because the pattern
    requires at least one character.
    """
    candidate = text.strip()
    return KATAKANA_RE.match(candidate) is not None
# ── Language metadata (SVM classifier) ────────────────────────────────────────
# Display metadata for each language the classifier can predict: a two-letter
# country code, the country name and a hex colour string.
LANGUAGE_META = {
    "English": dict(iso2="GB", country="United Kingdom", color="#4a90d9"),
    "French": dict(iso2="FR", country="France", color="#e85d5d"),
    "German": dict(iso2="DE", country="Germany", color="#f0a500"),
}
# ── Emotion → Music playlist (multiple songs per emotion for variety) ──────────
# All video IDs verified via Wikipedia / official sources
#
# Maps a lower-case emotion label to the playlist returned by POST /emotion.
# Each entry is a dict with exactly the keys "title" and "video_id", which is
# the shape MusicEntry expects. Labels that are not keys of this table are
# replaced by "neutral" in detect_emotion(), so the "neutral" key must exist.
# NOTE(review): video_id values look like YouTube video IDs — confirm with the
# frontend that consumes them.
EMOTION_MUSIC: dict[str, list[dict]] = {
    "joy": [
        {"title": "ATARASHII GAKKO! - Que Sera Sera", "video_id": "0S1-b9xGQac"},
        {"title": "Wonderland x Showtime - Kyoufuu All Back", "video_id": "nq_x3D0_lgw"},
        {"title": "Sumika - Fiction", "video_id": "IKHGAuNaGuA"},
        {"title": "ATARASHII GAKKO! - OTONABLUE", "video_id": "l446hUqQ7GY"},
        {"title": "Wonderland x Showtime - SEIBAITAAAAAASU!", "video_id": "w0lpuKNZRQ0"},
        {"title": "Creepy Nuts - Bling-Bang-Bang-Born", "video_id": "mLW35YMzELE"},
        {"title": "Wonderland x Showtime - Taiyoukei Disco", "video_id": "oA6aCY4bMg4"},
        {"title": "Gen Hoshino - Koi", "video_id": "jhOVibLEDhA"},
        {"title": "Yumi Arai - Rouge no Dengon", "video_id": "MH-P4mXvDPE"},
    ],
    "sadness": [
        {"title": "ZONE - Kimi ga Kureta Mono", "video_id": "Of36Qh7WLSQ"},
        {"title": "YOSHIKI - Red Swan", "video_id": "r1XE8ON8fos"},
        {"title": "Galileo Galilei - Aoi Shiori", "video_id": "T3bxbVGWy5k"},
        {"title": "seven oops - Orange", "video_id": "nf-L5R8U-Q0"},
        {"title": "DAOKO x Kenshi Yonezu - Fireworks", "video_id": "-tKVN2mAKRI"},
        {"title": "Yorushika - Say It", "video_id": "F64yFFnZfkI"},
        {"title": "Kenshi Yonezu - Lemon", "video_id": "SX_ViT4Ra7k"},
        {"title": "Yoh Kamiyama - Irokousui", "video_id": "kQYLHjgUh_g"},
    ],
    "anger": [
        {"title": "Ado - Usseewa", "video_id": "Qp3b-RXtz4w"},
        {"title": "Neru - Lost One's Weeping", "video_id": "U1aS62Juz70"},
        {"title": "Hige Dandism - Cry Baby", "video_id": "O1bhZgkC4Gw"},
        {"title": "Minami - Crying for Rain", "video_id": "0YF8vecQWYs"},
        {"title": "Eve - Dramaturgy", "video_id": "jJzw1h5CR-I"},
        {"title": "Kenshi Yonezu - Kick Back", "video_id": "M2cckDmNLMI"},
    ],
    "fear": [
        {"title": "TK - Unravel", "video_id": "Fve_lHIPa-I"},
        {"title": "Nightcord at 25:00 x KAITO - Bakenohana", "video_id": "UFRIsspP9UE"},
        {"title": "ATARASHII GAKKO! - Tokyo Calling", "video_id": "pHMH408ltEM"},
        {"title": "Nightcord at 25:00 x KAITO - Heat Abnormal", "video_id": "ToqKNyZi2NQ"},
        {"title": "Yuzu - Hyori Ittai", "video_id": "eKoD2CRr_KA"},
        {"title": "Nightcord at 25:00 - Bug", "video_id": "2Ii7UBMxWVw"},
        {"title": "RADWIMPS - Nandemonaiya", "video_id": "n89SKAymNfA"},
        {"title": "sakanaction - Arukuaround", "video_id": "cADu9rtlZGQ"},
    ],
    "surprise": [
        {"title": "YOASOBI - Idol", "video_id": "ZRtdQ81jPUQ"},
        {"title": "Ado - Buriki no Dance", "video_id": "iL7uoLCbJoc"},
        {"title": "Ado - New Genesis", "video_id": "1FliVTcX8bQ"},
        {"title": "RADWIMPS - Grand Escape", "video_id": "epQGR34yiTY"},
    ],
    "disgust": [
        {"title": "Nightcord at 25:00 - Bocca della Verità", "video_id": "ZjNUJUgyoOw"},
        {"title": "Ado - Readymade", "video_id": "jg09lNupc1s"},
        {"title": "Eve - Literary Nonsense", "video_id": "OskXF3s0UT8"},
    ],
    "neutral": [
        {"title": "Vaundy - Odoriko", "video_id": "7HgJIAUtICU"},
        {"title": "Hanae - Kamisama Hajimemashita", "video_id": "gZaelu4lieE"},
        {"title": "Fuji Kaze - Matsuri", "video_id": "NwOvu-j_WjY"},
        {"title": "Tomofumi Tanizawa - Kimi ni Todoke", "video_id": "9o7tKXUjC6E"},
        {"title": "ATARASHII GAKKO! - Dounimo Tomaranai", "video_id": "59bnq4wlGx8"},
        {"title": "Yorushika - Just a Sunny Day for You", "video_id": "-VKIqrvVOpo"},
        {"title": "natori - Overdose", "video_id": "H08YWE4CIFQ"},
        {"title": "Mitchie M - Tokugawa Cup Noodle Kinshirei", "video_id": "jPXAgWkqbo4"},
        {"title": "Homecomings - Cakes", "video_id": "u1A53wFN9A0"},
    ],
}
# ── Emotion → curated loanwords ────────────────────────────────────────────────
# Maps a lower-case emotion label to the loanwords returned by POST /emotion.
# Each entry carries exactly the keys "katakana", "meaning", "language" and
# "iso2", which is the shape LoanwordResult expects. detect_emotion() indexes
# this table with the same label it uses for EMOTION_MUSIC, so both tables must
# share the same set of keys.
# The languages listed here (e.g. Portuguese, Korean, Chinese) go beyond the
# three entries of LANGUAGE_META; this table is independent of the classifier.
# Some words (e.g. テロル) intentionally or not appear under more than one emotion.
EMOTION_LOANWORDS: dict[str, list[dict]] = {
    "joy": [
        {"katakana": "カーニバル", "meaning": "carnival", "language": "English", "iso2": "GB"},
        {"katakana": "フェスティバル", "meaning": "festival", "language": "English", "iso2": "GB"},
        {"katakana": "ダンス", "meaning": "dance", "language": "English", "iso2": "GB"},
        {"katakana": "ショーロ", "meaning": "choro; chorinho; style of Brazilian popular music", "language": "Portuguese", "iso2": "PT"},
        {"katakana": "カステラ", "meaning": "castella (type of sponge cake)", "language": "Portuguese", "iso2": "PT"},
        {"katakana": "バレエ", "meaning": "ballet", "language": "French", "iso2": "FR"},
        {"katakana": "シャンソン", "meaning": "chanson; French song", "language": "French", "iso2": "FR"},
        {"katakana": "フェット", "meaning": "fête; festival; celebration", "language": "French", "iso2": "FR"},
    ],
    "sadness": [
        {"katakana": "ノスタルジー", "meaning": "nostalgia", "language": "French", "iso2": "FR"},
        {"katakana": "メランコリー", "meaning": "melancholy", "language": "French", "iso2": "FR"},
        {"katakana": "アデュー", "meaning": "adieu; goodbye", "language": "French", "iso2": "FR"},
        {"katakana": "ミンネ", "meaning": "love of a knight for a courtly lady (upon which he is unable to act)", "language": "German", "iso2": "DE"},
        {"katakana": "フロイライン", "meaning": "miss (German title for an unmarried woman)", "language": "German", "iso2": "DE"},
        {"katakana": "カッパ", "meaning": "raincoat", "language": "Portuguese", "iso2": "PT"},
        {"katakana": "ロンリー", "meaning": "lonely", "language": "English", "iso2": "GB"},
        {"katakana": "ブルース", "meaning": "blues (music genre)", "language": "English", "iso2": "GB"},
    ],
    "anger": [
        {"katakana": "ネリチャギ", "meaning": "axe kick; ax kick", "language": "Korean", "iso2": "KR"},
        {"katakana": "サンダ", "meaning": "sanda; sanshou; Chinese boxing; Chinese kickboxing", "language": "Chinese", "iso2": "CN"},
        {"katakana": "テロル", "meaning": "terror; terrorism", "language": "German", "iso2": "DE"},
        {"katakana": "ストライキ", "meaning": "strike (labor action)", "language": "English", "iso2": "GB"},
        {"katakana": "プロテスト", "meaning": "protest", "language": "English", "iso2": "GB"},
        {"katakana": "レジスタンス", "meaning": "resistance (movement)", "language": "French", "iso2": "FR"},
        {"katakana": "バトル", "meaning": "battle", "language": "English", "iso2": "GB"},
        {"katakana": "パワー", "meaning": "power", "language": "English", "iso2": "GB"},
    ],
    "fear": [
        {"katakana": "ノワール", "meaning": "black; dark", "language": "French", "iso2": "FR"},
        {"katakana": "エトランゼ", "meaning": "stranger; outsider; foreigner", "language": "French", "iso2": "FR"},
        {"katakana": "テロル", "meaning": "terror; terrorism", "language": "German", "iso2": "DE"},
        {"katakana": "デマゴギー", "meaning": "false rumor; false alarm; misinformation", "language": "German", "iso2": "DE"},
        {"katakana": "ゴースト", "meaning": "ghost", "language": "English", "iso2": "GB"},
        {"katakana": "ホラー", "meaning": "horror", "language": "English", "iso2": "GB"},
        {"katakana": "パニック", "meaning": "panic", "language": "English", "iso2": "GB"},
        {"katakana": "ミステリー", "meaning": "mystery", "language": "English", "iso2": "GB"},
    ],
    "surprise": [
        {"katakana": "ゲリラライブ", "meaning": "surprise concert", "language": "English", "iso2": "GB"},
        {"katakana": "スライハンド", "meaning": "sleight of hand (e.g. in magic tricks)", "language": "English", "iso2": "GB"},
        {"katakana": "マジック", "meaning": "magic", "language": "English", "iso2": "GB"},
        {"katakana": "イリュージョン", "meaning": "illusion", "language": "English", "iso2": "GB"},
        {"katakana": "サーカス", "meaning": "circus", "language": "English", "iso2": "GB"},
        {"katakana": "スペクタクル", "meaning": "spectacle", "language": "French", "iso2": "FR"},
        {"katakana": "ブリュット", "meaning": "brut; dry sparkling wine", "language": "French", "iso2": "FR"},
        {"katakana": "サプライズ", "meaning": "surprise", "language": "English", "iso2": "GB"},
    ],
    "disgust": [
        {"katakana": "チョウドウフ", "meaning": "stinky tofu; fermented tofu", "language": "Chinese", "iso2": "CN"},
        {"katakana": "シックハウス", "meaning": "sick building; building which causes people to feel unwell", "language": "English", "iso2": "GB"},
        {"katakana": "トキシック", "meaning": "toxic", "language": "English", "iso2": "GB"},
        {"katakana": "ダストシュート", "meaning": "garbage chute; trash chute", "language": "English", "iso2": "GB"},
        {"katakana": "ポイズン", "meaning": "poison", "language": "English", "iso2": "GB"},
        {"katakana": "スキャンダル", "meaning": "scandal", "language": "English", "iso2": "GB"},
        {"katakana": "ネガティブ", "meaning": "negative", "language": "English", "iso2": "GB"},
        # NOTE(review): ゴミ is usually regarded as a native Japanese word that is
        # merely written in katakana, not an English loanword — confirm the
        # "English"/"GB" attribution or replace this entry.
        {"katakana": "ゴミ", "meaning": "rubbish; trash; garbage", "language": "English", "iso2": "GB"},
    ],
    "neutral": [
        {"katakana": "アルバイター", "meaning": "part-time worker; part-timer", "language": "German", "iso2": "DE"},
        {"katakana": "ピンイン", "meaning": "Pinyin (Chinese romanization system)", "language": "Chinese", "iso2": "CN"},
        {"katakana": "スケジュール", "meaning": "schedule", "language": "English", "iso2": "GB"},
        {"katakana": "システム", "meaning": "system", "language": "English", "iso2": "GB"},
        {"katakana": "ドキュメント", "meaning": "document", "language": "English", "iso2": "GB"},
        {"katakana": "ネットワーク", "meaning": "network", "language": "English", "iso2": "GB"},
        {"katakana": "マネジメント", "meaning": "management", "language": "English", "iso2": "GB"},
        {"katakana": "スタンダード", "meaning": "standard", "language": "English", "iso2": "GB"},
    ],
}
# ── Startup ────────────────────────────────────────────────────────────────────
# Process-wide registry of loaded models. Filled by lifespan() at startup under
# the keys "model", "vectorizer", "encoder" and "emotion"; read by the routes.
artifacts: dict = {}


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load every model at startup and release them at shutdown.

    Raises:
        RuntimeError: if any of the three joblib artifact files is missing.
            The check runs before anything is loaded so the error names the
            first missing path.
    """
    for path in (MODEL_PATH, VECTORIZER_PATH, ENCODER_PATH):
        if not path.exists():
            raise RuntimeError(f"Model artifact not found: {path}")

    try:
        # NOTE: joblib.load unpickles its input and can execute arbitrary code;
        # these files must only ever come from a trusted build.
        artifacts["model"] = joblib.load(MODEL_PATH)
        artifacts["vectorizer"] = joblib.load(VECTORIZER_PATH)
        artifacts["encoder"] = joblib.load(ENCODER_PATH)
        print("✓ Model artifacts loaded")

        artifacts["emotion"] = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base",
            top_k=1,
        )
        print("✓ Emotion classifier loaded")

        yield
    finally:
        # Runs on normal shutdown, on an exception thrown into the generator and
        # on a load failure part-way through, so the registry is never left in a
        # half-populated state.
        artifacts.clear()
# ── App ────────────────────────────────────────────────────────────────────────
# Browser origins allowed by CORS: two localhost addresses on port 5173 and one
# hosted origin (presumably the deployed frontend — confirm).
_ALLOWED_ORIGINS = [
    "http://localhost:5173",
    "http://127.0.0.1:5173",
    "https://kotabi.vercel.app",
]

app = FastAPI(lifespan=lifespan, title="Gairaigo Map API", version="2.0.0")

# Only GET and POST are exposed by the routes, so only those methods are allowed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=_ALLOWED_ORIGINS,
    allow_methods=["GET", "POST"],
    allow_headers=["*"],
)
# ── Schemas ────────────────────────────────────────────────────────────────────
class PredictRequest(BaseModel):
    # Request body for POST /predict: a single katakana word.
    word: str

    @field_validator("word")
    @classmethod
    def must_be_katakana(cls, v: str) -> str:
        """Strip the word and reject empty or non-katakana input.

        Returns the stripped word, which is what the model stores.
        Raises ValueError with a user-facing message otherwise.
        """
        cleaned = v.strip()
        if cleaned == "":
            raise ValueError("Word must not be empty.")
        if is_katakana(cleaned):
            return cleaned
        raise ValueError("Input must be katakana (e.g. コーヒー).")
class LanguageResult(BaseModel):
    # One candidate language for a classified word, with its display metadata.
    # Instances are built in classify() from the encoder's classes and LANGUAGE_META.
    language: str  # class label from the encoder; also the key into LANGUAGE_META
    country: str  # LANGUAGE_META[language]["country"]
    iso2: str  # LANGUAGE_META[language]["iso2"]
    confidence: float  # softmax of the model's decision scores, rounded to 4 decimals
    color: str  # LANGUAGE_META[language]["color"]
class PredictResponse(BaseModel):
    # Response body for POST /predict, as assembled by classify().
    word: str  # the word that was classified
    prediction: LanguageResult  # first element of all_scores (highest confidence)
    all_scores: list[LanguageResult]  # every language, sorted by confidence, descending
class EmotionRequest(BaseModel):
    # Request body for POST /emotion: free text to run emotion detection on.
    text: str

    @field_validator("text")
    @classmethod
    def must_not_be_empty(cls, v: str) -> str:
        """Strip the text and reject input that is empty after stripping.

        Returns the stripped text, which is what the model stores.
        """
        stripped = v.strip()
        if stripped:
            return stripped
        raise ValueError("Text must not be empty.")
class MusicEntry(BaseModel):
    # One song in a playlist; built from the dicts stored in EMOTION_MUSIC.
    title: str  # display title as stored in EMOTION_MUSIC
    video_id: str  # video identifier as stored in EMOTION_MUSIC
class LoanwordResult(BaseModel):
    # One curated loanword; built from the dicts stored in EMOTION_LOANWORDS.
    katakana: str  # the word in katakana
    meaning: str  # English gloss
    language: str  # name of the language the word is attributed to
    iso2: str  # two-letter country code paired with that language in the table
class EmotionResponse(BaseModel):
    # Response body for POST /emotion, as assembled by detect_emotion().
    text: str  # the (stripped) request text, echoed back
    emotion: str  # lower-cased label; "neutral" when the label has no playlist
    music_list: list[MusicEntry]  # full playlist — frontend cycles through these
    loanwords: list[LoanwordResult]  # EMOTION_LOANWORDS entries for the same label
# ── Helpers ────────────────────────────────────────────────────────────────────
def softmax(scores: np.ndarray) -> np.ndarray:
    """Turn raw scores into positive weights that sum to 1.

    The maximum score is subtracted before exponentiating so that large
    inputs do not overflow; this shift does not change the result.
    Normalisation is over every element of *scores*.
    """
    peak = np.max(scores)
    weights = np.exp(scores - peak)
    total = weights.sum()
    return weights / total
def classify(word: str) -> PredictResponse:
    """Classify a katakana word and score every known language.

    The word is vectorized, passed through the model's ``decision_function``
    and the resulting scores are normalised with ``softmax``. The values are
    relative scores, not necessarily calibrated probabilities.

    Args:
        word: the katakana word to classify.

    Returns:
        A PredictResponse whose ``all_scores`` is ordered by descending
        confidence and whose ``prediction`` is its first element.

    Raises:
        KeyError: if the startup artifacts are not loaded, or if the encoder
            holds a class that has no entry in LANGUAGE_META.
    """
    model = artifacts["model"]
    vectorizer = artifacts["vectorizer"]
    encoder = artifacts["encoder"]

    features = vectorizer.transform([word])
    decision_scores = model.decision_function(features)[0]
    confidences = softmax(decision_scores)

    # Rank on the raw confidences, not on the values rounded for display:
    # two scores that differ only past the 4th decimal would otherwise tie and
    # the lower one could be reported as the prediction. The stable sort keeps
    # exact ties in encoder order.
    ranking = np.argsort(-confidences, kind="stable")

    all_scores = []
    for idx in ranking:
        language = str(encoder.classes_[idx])
        meta = LANGUAGE_META[language]
        all_scores.append(
            LanguageResult(
                language=language,
                country=meta["country"],
                iso2=meta["iso2"],
                confidence=round(float(confidences[idx]), 4),
                color=meta["color"],
            )
        )

    return PredictResponse(word=word, prediction=all_scores[0], all_scores=all_scores)
# ── Routes ─────────────────────────────────────────────────────────────────────
@app.get("/health", tags=["Meta"])
def health():
    # Liveness check: always reports "ok"; model_loaded is true once the
    # startup hook has put at least one entry into the artifacts registry.
    loaded = len(artifacts) > 0
    return {"status": "ok", "model_loaded": loaded}
@app.get("/languages", tags=["Meta"])
def get_languages():
    # Returns a shallow copy of LANGUAGE_META: language name -> display metadata.
    return dict(LANGUAGE_META)
@app.post("/predict", response_model=PredictResponse, tags=["Classification"])
def predict(body: PredictRequest):
    """
    Classify a katakana word.
    Returns the most likely source language together with the scores of every classifiable language.
    """
    try:
        return classify(body.word)
    except Exception as e:
        # Input validation already happened in PredictRequest, so anything that
        # reaches here is a server-side failure. Chain the cause so the original
        # traceback stays visible in the server logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.post("/emotion", response_model=EmotionResponse, tags=["Emotion"])
def detect_emotion(body: EmotionRequest):
    """
    Detect emotion from plain English text.
    Returns the detected emotion, a playlist of matching Japanese songs, and related loanwords.
    The frontend can cycle through music_list to let users skip to the next song.
    Text longer than the model's maximum input length is truncated before classification.
    """
    try:
        # truncation=True is forwarded to the tokenizer; without it, text longer
        # than the model's maximum sequence length makes the pipeline raise and
        # the request fail with a 500.
        result = artifacts["emotion"](body.text, truncation=True)
        # The pipeline was built with top_k=1 and is called with one string, so
        # the result is a one-element list holding a one-element list of
        # {"label", "score"} dicts.
        label: str = result[0][0]["label"].lower()
        # Labels without a playlist fall back to the neutral one.
        if label not in EMOTION_MUSIC:
            label = "neutral"
        music_list = [MusicEntry(**m) for m in EMOTION_MUSIC[label]]
        loanwords = [LoanwordResult(**w) for w in EMOTION_LOANWORDS[label]]
        return EmotionResponse(
            text=body.text,
            emotion=label,
            music_list=music_list,
            loanwords=loanwords,
        )
    except Exception as e:
        # Chain the cause so the original traceback stays visible in the logs.
        raise HTTPException(status_code=500, detail=str(e)) from e