# File size: 2,830 Bytes
# Commit hashes from the file viewer gutter: 2fbba2c, 98b1f2a, 814693e
import os
from contextlib import asynccontextmanager
from pathlib import Path
from typing import Any
import fasttext
import numpy as np
from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel, Field
# Filesystem location of the fastText binary model. The FASTTEXT_MODEL_PATH
# environment variable overrides the default path.
MODEL_PATH = Path(
    os.environ.get("FASTTEXT_MODEL_PATH", "/app/models/cc.ug.300.bin")
)
class SimilarityRequest(BaseModel):
    """Request body carrying the two words whose similarity is requested."""

    # Both fields are required (`...`) and must be at least one character
    # long. Whitespace-only values satisfy `min_length=1`; they are rejected
    # later by `normalize_word`.
    word1: str = Field(..., min_length=1)
    word2: str = Field(..., min_length=1)
def load_fasttext_model() -> Any:
    """Load and return the fastText model stored at ``MODEL_PATH``.

    Raises:
        RuntimeError: If nothing exists at ``MODEL_PATH``.
    """
    if MODEL_PATH.exists():
        # fasttext.load_model is given the path as a plain string.
        return fasttext.load_model(str(MODEL_PATH))

    message = (
        f"fastText model not found at {MODEL_PATH}. "
        "Set FASTTEXT_MODEL_PATH or download cc.ug.300.bin during the Docker build."
    )
    raise RuntimeError(message)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the fastText model before the application starts serving.

    The model is stored on ``app.state.fasttext_model``. Nothing runs after
    the ``yield``, so there is no shutdown-time cleanup.
    """
    model = load_fasttext_model()
    app.state.fasttext_model = model
    yield
# Application object. The `lifespan` handler loads the fastText model and
# stores it on `app.state.fasttext_model`.
app = FastAPI(
    title="Uyghur Word Similarity API",
    description="Returns fastText cosine similarity multiplied by 100.",
    version="1.0.0",
    lifespan=lifespan,
)
def get_model() -> Any:
    """Return the model stored on ``app.state.fasttext_model``.

    Raises:
        HTTPException: 503 when the attribute is missing or ``None``.
    """
    loaded = getattr(app.state, "fasttext_model", None)
    if loaded is not None:
        return loaded
    raise HTTPException(status_code=503, detail="fastText model is not loaded")
def normalize_word(word: str, field_name: str) -> str:
    """Strip surrounding whitespace from *word* and return the result.

    Args:
        word: Raw input value.
        field_name: Name used in the error message when the value is blank.

    Raises:
        HTTPException: 400 when nothing remains after stripping.
    """
    trimmed = word.strip()
    if trimmed:
        return trimmed
    raise HTTPException(status_code=400, detail=f"{field_name} must not be empty")
def cosine_similarity(v1: np.ndarray, v2: np.ndarray) -> float:
    """Return the cosine similarity of two vectors as a Python float.

    The computation is done in float64 regardless of the input dtype, and
    the result is clamped to ``[-1.0, 1.0]`` so that rounding error cannot
    produce a value outside the mathematically valid range (for example a
    score slightly above 1.0 for two identical vectors).

    Args:
        v1: First vector.
        v2: Second vector; expected to have the same length as ``v1``.

    Returns:
        The cosine of the angle between ``v1`` and ``v2``.

    Raises:
        HTTPException: 422 when either vector has zero norm, or when the
            norms are not finite (NaN or infinite components).
    """
    # Upcast so lower-precision inputs (e.g. float32) do not limit accuracy.
    a = np.asarray(v1, dtype=np.float64)
    b = np.asarray(v2, dtype=np.float64)

    denominator = float(np.linalg.norm(a) * np.linalg.norm(b))
    # A NaN denominator compares unequal to 0, so it needs its own check;
    # a non-finite result is not a meaningful similarity.
    if denominator == 0 or not np.isfinite(denominator):
        raise HTTPException(status_code=422, detail="Could not compute similarity")

    cosine = float(np.dot(a, b) / denominator)
    # Clamp away residual rounding error.
    return max(-1.0, min(1.0, cosine))
def similarity_score(word1: str, word2: str) -> float:
    """Return the cosine similarity of two words' vectors, multiplied by 100.

    Both words are whitespace-stripped and validated before the model is
    fetched, so a blank word is reported before a missing model.

    Raises:
        HTTPException: Propagated from ``normalize_word``, ``get_model`` or
            ``cosine_similarity``.
    """
    first = normalize_word(word1, "word1")
    second = normalize_word(word2, "word2")

    model = get_model()
    first_vector = model.get_word_vector(first)
    second_vector = model.get_word_vector(second)

    cosine = cosine_similarity(first_vector, second_vector)
    return cosine * 100
@app.get("/")
def root():
    """Return a minimal status payload for the API root."""
    payload = {"status": "ok"}
    # Disabled fields, kept for reference:
    #   "model": str(MODEL_PATH),
    #   "usage": {
    #       "GET": "/similarity?word1=سىزغۇچ&word2=نان",
    #       "POST": {"url": "/similarity", "body": {"word1": "سىزغۇچ", "word2": "نان"}},
    #   },
    return payload
@app.get("/health")
def health():
    """Report service status and whether a model is present on ``app.state``."""
    model_loaded = getattr(app.state, "fasttext_model", None) is not None
    return {"status": "ok", "model_loaded": model_loaded}
@app.get("/similarity", response_model=float)
def similarity_from_query(
    word1: str = Query(..., min_length=1),
    word2: str = Query(..., min_length=1),
):
    """Return the similarity score for two words given as query parameters."""
    score = similarity_score(word1, word2)
    return score
@app.post("/similarity", response_model=float)
def similarity_from_body(payload: SimilarityRequest):
    """Return the similarity score for the two words in the request body."""
    first, second = payload.word1, payload.word2
    return similarity_score(first, second)
|