File size: 2,830 Bytes
2fbba2c
 
 
 
 
 
 
 
 
 
 
98b1f2a
2fbba2c
 
 
 
 
 
 
 
 
 
 
98b1f2a
2fbba2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
814693e
 
 
 
 
2fbba2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import os
from contextlib import asynccontextmanager
from pathlib import Path
from typing import Any

import fasttext
import numpy as np
from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel, Field


# Location of the fastText binary model; the FASTTEXT_MODEL_PATH environment
# variable overrides the built-in default path.
MODEL_PATH = Path(
    os.environ.get("FASTTEXT_MODEL_PATH", "/app/models/cc.ug.300.bin")
)


# Request body accepted by POST /similarity: the two words to compare.
class SimilarityRequest(BaseModel):
    # Both fields are required (`...`) and must be at least one character long.
    # Whitespace-only values pass this check; they are rejected later by
    # normalize_word().
    word1: str = Field(..., min_length=1)
    word2: str = Field(..., min_length=1)


def load_fasttext_model() -> Any:
    """Load and return the fastText model stored at ``MODEL_PATH``.

    Returns:
        The object produced by ``fasttext.load_model``.

    Raises:
        RuntimeError: If nothing exists at ``MODEL_PATH``.
    """
    if MODEL_PATH.exists():
        return fasttext.load_model(str(MODEL_PATH))

    # Fail with an actionable message instead of letting fastText report a
    # less specific error for a missing file.
    message = (
        f"fastText model not found at {MODEL_PATH}. "
        "Set FASTTEXT_MODEL_PATH or download cc.ug.300.bin during the Docker build."
    )
    raise RuntimeError(message)


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the fastText model before the application starts serving.

    The model is stored on ``app.state.fasttext_model`` so request handlers
    can reach it. Nothing is done after the ``yield`` (no shutdown cleanup).
    """
    model = load_fasttext_model()
    app.state.fasttext_model = model
    yield


# Application instance. The `lifespan` handler loads the fastText model and
# attaches it to `app.state` before requests are served.
app = FastAPI(
    title="Uyghur Word Similarity API",
    description="Returns fastText cosine similarity multiplied by 100.",
    version="1.0.0",
    lifespan=lifespan,
)


def get_model() -> Any:
    """Return the fastText model stored on the application state.

    Returns:
        The object found at ``app.state.fasttext_model``.

    Raises:
        HTTPException: 503 if the attribute is missing or ``None``.
    """
    loaded = getattr(app.state, "fasttext_model", None)
    if loaded is not None:
        return loaded
    raise HTTPException(status_code=503, detail="fastText model is not loaded")


def normalize_word(word: str, field_name: str) -> str:
    """Strip surrounding whitespace from *word* and reject empty results.

    Args:
        word: Raw input word.
        field_name: Name of the request field, used in the error message.

    Returns:
        The word with leading and trailing whitespace removed.

    Raises:
        HTTPException: 400 if nothing remains after stripping.
    """
    stripped = word.strip()
    if stripped:
        return stripped
    raise HTTPException(status_code=400, detail=f"{field_name} must not be empty")


def cosine_similarity(v1: np.ndarray, v2: np.ndarray) -> float:
    """Return the cosine similarity of two vectors, bounded to [-1.0, 1.0].

    Args:
        v1: First vector.
        v2: Second vector, same length as *v1*.

    Returns:
        The cosine of the angle between the vectors as a Python float.

    Raises:
        HTTPException: 422 if either vector has zero norm or the norms are
            not finite, so no meaningful similarity exists.
    """
    # Work in float64 so low-precision inputs (e.g. float32 embeddings) do not
    # add avoidable rounding error to the dot product and norms.
    a = np.asarray(v1, dtype=np.float64)
    b = np.asarray(v2, dtype=np.float64)

    denominator = float(np.linalg.norm(a) * np.linalg.norm(b))
    # NaN/inf never compare equal to 0, so they need an explicit check;
    # otherwise a NaN would be returned as the similarity.
    if denominator == 0 or not np.isfinite(denominator):
        raise HTTPException(status_code=422, detail="Could not compute similarity")

    cosine = float(np.dot(a, b)) / denominator
    # Rounding can push the ratio marginally outside the mathematical range
    # (e.g. 1.0000000000000002 for identical vectors); clamp it back.
    return min(1.0, max(-1.0, cosine))


def similarity_score(word1: str, word2: str) -> float:
    """Return the cosine similarity of two words' vectors, multiplied by 100.

    Both words are whitespace-normalized first; the vectors come from the
    loaded model's ``get_word_vector``.

    Raises:
        HTTPException: Propagated from ``normalize_word``, ``get_model`` or
            ``cosine_similarity``.
    """
    # Validate both inputs before touching the model, so bad input is
    # reported even when the model is unavailable.
    first = normalize_word(word1, "word1")
    second = normalize_word(word2, "word2")

    embed = get_model().get_word_vector
    first_vector = embed(first)
    second_vector = embed(second)
    return cosine_similarity(first_vector, second_vector) * 100


@app.get("/")
def root():
    # Root endpoint: returns only a status flag.
    # Disabled (not returned) fields that were sketched here:
    #   "model": str(MODEL_PATH)
    #   "usage": {
    #       "GET": "/similarity?word1=سىزغۇچ&word2=نان",
    #       "POST": {"url": "/similarity", "body": {"word1": "سىزغۇچ", "word2": "نان"}},
    #   }
    payload = {"status": "ok"}
    return payload


@app.get("/health")
def health():
    # Reports whether a model object is currently attached to app.state.
    model = getattr(app.state, "fasttext_model", None)
    return {"status": "ok", "model_loaded": model is not None}


@app.get("/similarity", response_model=float)
def similarity_from_query(
    word1: str = Query(..., min_length=1),
    word2: str = Query(..., min_length=1),
):
    # GET variant: both words arrive as required query parameters.
    score = similarity_score(word1=word1, word2=word2)
    return score


@app.post("/similarity", response_model=float)
def similarity_from_body(payload: SimilarityRequest):
    # POST variant: both words arrive in the JSON request body.
    score = similarity_score(word1=payload.word1, word2=payload.word2)
    return score