Piyazon committed on
Commit ·
2fbba2c
1
Parent(s): 2a56b48
test
Browse files- .dockerignore +9 -0
- Dockerfile +18 -0
- README.md +28 -0
- app.py +102 -0
- requirements.txt +4 -0
.dockerignore
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
.pytest_cache/
|
| 5 |
+
.venv/
|
| 6 |
+
venv/
|
| 7 |
+
models/
|
| 8 |
+
*.bin
|
| 9 |
+
*.bin.gz
|
Dockerfile
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9

# Create the unprivileged runtime user (uid 1000) and give it ownership of
# /app, including the directory the model is extracted into.
RUN useradd -m -u 1000 user \
    && mkdir -p /app/models \
    && chown -R user:user /app
USER user
ENV PATH="/home/user/.local/bin:$PATH"
# app.py reads this variable to locate the extracted model file.
ENV FASTTEXT_MODEL_PATH="/app/models/cc.ug.300.bin"

WORKDIR /app

# Install Python dependencies before copying the rest of the sources.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Download the gzipped fastText model, extract it to /app/models/cc.ug.300.bin
# and delete the archive afterwards.
RUN python -c "import gzip, os, shutil, urllib.request; \
url='https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ug.300.bin.gz'; \
gz='/app/models/cc.ug.300.bin.gz'; \
out='/app/models/cc.ug.300.bin'; \
print('Downloading fastText model:', url, flush=True); \
urllib.request.urlretrieve(url, gz); \
print('Extracting fastText model to:', out, flush=True); \
f_in=gzip.open(gz, 'rb'); \
f_out=open(out, 'wb'); \
shutil.copyfileobj(f_in, f_out); \
f_in.close(); \
f_out.close(); \
os.remove(gz)"

COPY --chown=user . /app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -8,3 +8,31 @@ pinned: false
|
|
| 8 |
---
|
| 9 |
|
| 10 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
| 11 |
+
|
| 12 |
+
## API
|
| 13 |
+
|
| 14 |
+
The Docker build downloads the official fastText Uyghur Common Crawl binary model, extracts it to `/app/models/cc.ug.300.bin`, and loads it once when the FastAPI app starts.
|
| 15 |
+
|
| 16 |
+
Model source: https://fasttext.cc/docs/en/crawl-vectors
|
| 17 |
+
|
| 18 |
+
### Query request
|
| 19 |
+
|
| 20 |
+
```bash
|
| 21 |
+
curl --get "https://<your-space>.hf.space/similarity" \
|
| 22 |
+
--data-urlencode "word1=سىزغۇچ" \
|
| 23 |
+
--data-urlencode "word2=نان"
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
### JSON request
|
| 27 |
+
|
| 28 |
+
```bash
|
| 29 |
+
curl -X POST "https://<your-space>.hf.space/similarity" \
|
| 30 |
+
-H "Content-Type: application/json" \
|
| 31 |
+
-d '{"word1":"سىزغۇچ","word2":"نان"}'
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
Both endpoints return a JSON number (the fastText cosine similarity multiplied by 100), for example:
|
| 35 |
+
|
| 36 |
+
```json
|
| 37 |
+
42.123456
|
| 38 |
+
```
|
app.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from contextlib import asynccontextmanager
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Any
|
| 5 |
+
|
| 6 |
+
import fasttext
|
| 7 |
+
import numpy as np
|
| 8 |
+
from fastapi import FastAPI, HTTPException, Query
|
| 9 |
+
from pydantic import BaseModel, Field
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Filesystem location of the fastText binary model. Taken from the
# FASTTEXT_MODEL_PATH environment variable when it is set, otherwise the
# default path below is used.
MODEL_PATH = Path(os.getenv("FASTTEXT_MODEL_PATH", "/app/models/cc.ug.300.bin"))
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class SimilarityRequest(BaseModel):
    # Request body for POST /similarity: the two words to compare.
    # Both fields are required and must be at least one character long.
    word1: str = Field(..., min_length=1)
    word2: str = Field(..., min_length=1)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def load_fasttext_model() -> Any:
    """Load the fastText model stored at ``MODEL_PATH``.

    Returns:
        The object returned by ``fasttext.load_model``.

    Raises:
        RuntimeError: If ``MODEL_PATH`` does not point to an existing file.
    """
    # is_file() rather than exists(): a directory sitting at this path must
    # also produce the actionable message below instead of being handed to
    # the fastText loader.
    if not MODEL_PATH.is_file():
        raise RuntimeError(
            f"fastText model not found at {MODEL_PATH}. "
            "Set FASTTEXT_MODEL_PATH or download cc.ug.300.bin during the Docker build."
        )

    return fasttext.load_model(str(MODEL_PATH))
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan handler: store the loaded fastText model on ``app.state``.

    The model is loaded once before control is yielded; nothing is done
    after the ``yield``.
    """
    loaded_model = load_fasttext_model()
    app.state.fasttext_model = loaded_model
    yield
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# The ASGI application object; ``lifespan`` is registered so the fastText
# model is stored on ``app.state`` by that handler.
app = FastAPI(
    lifespan=lifespan,
    title="Uyghur Word Similarity API",
    description="Returns fastText cosine similarity multiplied by 100.",
    version="1.0.0",
)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def get_model() -> Any:
    """Return the fastText model stored on ``app.state``.

    Raises:
        HTTPException: Status 503 when ``app.state`` holds no model.
    """
    loaded = getattr(app.state, "fasttext_model", None)
    if loaded is not None:
        return loaded
    raise HTTPException(status_code=503, detail="fastText model is not loaded")
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def normalize_word(word: str, field_name: str) -> str:
    """Strip surrounding whitespace from *word* and reject an empty result.

    Args:
        word: Raw word as received from the client.
        field_name: Name used in the error message when the word is empty.

    Returns:
        The word without leading or trailing whitespace.

    Raises:
        HTTPException: Status 400 when nothing remains after stripping.
    """
    stripped = word.strip()
    if stripped:
        return stripped
    raise HTTPException(status_code=400, detail=f"{field_name} must not be empty")
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def cosine_similarity(v1: np.ndarray, v2: np.ndarray) -> float:
    """Return the cosine of the angle between *v1* and *v2*.

    Args:
        v1: First vector.
        v2: Second vector, same length as *v1*.

    Returns:
        A Python float in the closed interval [-1.0, 1.0].

    Raises:
        HTTPException: Status 422 when either vector has zero norm or the
            norms are not finite (NaN or infinite components).
    """
    # Compute in float64 so low-precision inputs (e.g. float32) lose as
    # little accuracy as possible in the dot product and the norms.
    a = np.asarray(v1, dtype=np.float64)
    b = np.asarray(v2, dtype=np.float64)

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # A zero norm has no direction. A non-finite norm would yield NaN, which
    # cannot be serialized as a JSON number.
    if denominator == 0 or not np.isfinite(denominator):
        raise HTTPException(status_code=422, detail="Could not compute similarity")

    cosine = float(np.dot(a, b) / denominator)
    # Floating-point rounding can push the ratio marginally outside [-1, 1].
    return max(-1.0, min(1.0, cosine))
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def similarity_score(word1: str, word2: str) -> float:
    """Return the cosine similarity of two words, multiplied by 100.

    Both words are normalized with ``normalize_word``, looked up with the
    model's ``get_word_vector`` and compared with ``cosine_similarity``.
    Exceptions raised by those helpers and by ``get_model`` propagate.
    """
    first = normalize_word(word1, "word1")
    second = normalize_word(word2, "word2")

    model = get_model()
    vectors = [model.get_word_vector(token) for token in (first, second)]
    return cosine_similarity(*vectors) * 100
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
@app.get("/")
def root():
    # Landing endpoint: reports the configured model path and shows one
    # example call for each of the two /similarity variants.
    usage = {
        "GET": "/similarity?word1=سىزغۇچ&word2=نان",
        "POST": {"url": "/similarity", "body": {"word1": "سىزغۇچ", "word2": "نان"}},
    }
    return {"status": "ok", "model": str(MODEL_PATH), "usage": usage}
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
@app.get("/health")
def health():
    # Health endpoint: "model_loaded" is True when app.state holds a model.
    model_loaded = getattr(app.state, "fasttext_model", None) is not None
    return {"status": "ok", "model_loaded": model_loaded}
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
@app.get("/similarity", response_model=float)
def similarity_from_query(
    word1: str = Query(..., min_length=1),
    word2: str = Query(..., min_length=1),
):
    # GET variant: both words arrive as required query parameters.
    score = similarity_score(word1, word2)
    return score
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
@app.post("/similarity", response_model=float)
def similarity_from_body(payload: SimilarityRequest):
    # POST variant: both words arrive in a JSON body validated by
    # SimilarityRequest.
    first, second = payload.word1, payload.word2
    return similarity_score(first, second)
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn[standard]
|
| 3 |
+
fasttext
|
| 4 |
+
numpy
|