Piyazon committed on
Commit
2fbba2c
·
1 Parent(s): 2a56b48
Files changed (5) hide show
  1. .dockerignore +9 -0
  2. Dockerfile +18 -0
  3. README.md +28 -0
  4. app.py +102 -0
  5. requirements.txt +4 -0
.dockerignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ __pycache__/
3
+ *.pyc
4
+ .pytest_cache/
5
+ .venv/
6
+ venv/
7
+ models/
8
+ *.bin
9
+ *.bin.gz
Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9

# Create the unprivileged runtime user (uid 1000) and hand it /app, including
# the directory the model is extracted into.
RUN useradd -m -u 1000 user && mkdir -p /app/models && chown -R user:user /app
USER user
ENV PATH="/home/user/.local/bin:$PATH"
ENV FASTTEXT_MODEL_PATH="/app/models/cc.ug.300.bin"

WORKDIR /app

# Download and extract the model BEFORE installing Python dependencies. This
# step only uses the standard library, so placing it first keeps its layer
# cached when requirements.txt changes and avoids re-downloading the model.
RUN python -c "import gzip, os, shutil, urllib.request; url='https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ug.300.bin.gz'; gz='/app/models/cc.ug.300.bin.gz'; out='/app/models/cc.ug.300.bin'; print('Downloading fastText model:', url, flush=True); urllib.request.urlretrieve(url, gz); print('Extracting fastText model to:', out, flush=True); f_in=gzip.open(gz, 'rb'); f_out=open(out, 'wb'); shutil.copyfileobj(f_in, f_out); f_in.close(); f_out.close(); os.remove(gz)"

# Dependencies are installed from requirements.txt alone so that source-only
# changes do not invalidate this layer either.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Application source goes last; it changes most often.
COPY --chown=user . /app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -8,3 +8,31 @@ pinned: false
8
  ---
9
 
10
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  ---
9
 
10
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
11
+
12
+ ## API
13
+
14
+ The Docker build downloads the official fastText Uyghur Common Crawl binary model, extracts it to `/app/models/cc.ug.300.bin`, and loads it once when the FastAPI app starts.
15
+
16
+ Model source: https://fasttext.cc/docs/en/crawl-vectors
17
+
18
+ ### Query request
19
+
20
+ ```bash
21
+ curl --get "https://<your-space>.hf.space/similarity" \
22
+ --data-urlencode "word1=سىزغۇچ" \
23
+ --data-urlencode "word2=نان"
24
+ ```
25
+
26
+ ### JSON request
27
+
28
+ ```bash
29
+ curl -X POST "https://<your-space>.hf.space/similarity" \
30
+ -H "Content-Type: application/json" \
31
+ -d '{"word1":"سىزغۇچ","word2":"نان"}'
32
+ ```
33
+
34
+ Both endpoints return the fastText cosine similarity multiplied by 100 as a bare JSON number, for example:
35
+
36
+ ```json
37
+ 42.123456
38
+ ```
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from contextlib import asynccontextmanager
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ import fasttext
7
+ import numpy as np
8
+ from fastapi import FastAPI, HTTPException, Query
9
+ from pydantic import BaseModel, Field
10
+
11
+
12
# Location of the fastText binary model. The FASTTEXT_MODEL_PATH environment
# variable overrides the built-in default when it is set.
_DEFAULT_MODEL_PATH = "/app/models/cc.ug.300.bin"
MODEL_PATH = Path(os.environ.get("FASTTEXT_MODEL_PATH", _DEFAULT_MODEL_PATH))
13
+
14
+
15
class SimilarityRequest(BaseModel):
    # JSON body accepted by the POST /similarity endpoint.
    # Both fields are required and must contain at least one character.
    word1: str = Field(default=..., min_length=1)
    word2: str = Field(default=..., min_length=1)
18
+
19
+
20
def load_fasttext_model() -> Any:
    """Load the fastText model stored at ``MODEL_PATH``.

    Returns:
        The object returned by ``fasttext.load_model``.

    Raises:
        RuntimeError: If nothing exists at ``MODEL_PATH``.
    """
    if MODEL_PATH.exists():
        return fasttext.load_model(str(MODEL_PATH))

    # Report the missing file together with a hint on how to provide it.
    message = (
        f"fastText model not found at {MODEL_PATH}. "
        "Set FASTTEXT_MODEL_PATH or download cc.ug.300.bin during the Docker build."
    )
    raise RuntimeError(message)
28
+
29
+
30
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan hook that loads the fastText model before the app serves requests.

    The loaded model is stored on ``app.state.fasttext_model``. No work is
    performed after the ``yield`` (i.e. on shutdown).
    """
    model = load_fasttext_model()
    app.state.fasttext_model = model
    yield
34
+
35
+
36
# FastAPI application instance; the lifespan hook loads the model at startup.
_APP_SETTINGS = {
    "title": "Uyghur Word Similarity API",
    "description": "Returns fastText cosine similarity multiplied by 100.",
    "version": "1.0.0",
}
app = FastAPI(lifespan=lifespan, **_APP_SETTINGS)
42
+
43
+
44
def get_model() -> Any:
    """Return the fastText model stored on ``app.state``.

    Returns:
        The value of ``app.state.fasttext_model``.

    Raises:
        HTTPException: 503 when the attribute is missing or ``None``.
    """
    loaded = getattr(app.state, "fasttext_model", None)
    if loaded is not None:
        return loaded
    raise HTTPException(status_code=503, detail="fastText model is not loaded")
49
+
50
+
51
def normalize_word(word: str, field_name: str) -> str:
    """Strip surrounding whitespace from *word* and reject blank results.

    Args:
        word: Raw word supplied by the client.
        field_name: Name of the request field, used in the error message.

    Returns:
        *word* without leading or trailing whitespace.

    Raises:
        HTTPException: 400 when nothing is left after stripping.
    """
    stripped = word.strip()
    if stripped:
        return stripped
    raise HTTPException(status_code=400, detail=f"{field_name} must not be empty")
56
+
57
+
58
def cosine_similarity(v1: np.ndarray, v2: np.ndarray) -> float:
    """Return the cosine similarity of two vectors, guaranteed to lie in [-1, 1].

    Args:
        v1: First vector.
        v2: Second vector, same length as *v1*.

    Returns:
        The cosine of the angle between the vectors as a Python ``float``.

    Raises:
        HTTPException: 422 when the similarity is undefined, i.e. either vector
            has zero norm or the computation yields a non-finite value.
    """
    # Work in float64 so low-precision input does not add avoidable rounding
    # error to the norms and the dot product.
    a = np.asarray(v1, dtype=np.float64)
    b = np.asarray(v2, dtype=np.float64)

    denominator = float(np.linalg.norm(a) * np.linalg.norm(b))
    # NaN/inf must be rejected here as well: they are not equal to zero, and a
    # non-finite result cannot be serialised as a JSON number.
    if denominator == 0 or not np.isfinite(denominator):
        raise HTTPException(status_code=422, detail="Could not compute similarity")

    value = float(np.dot(a, b)) / denominator
    if not np.isfinite(value):
        raise HTTPException(status_code=422, detail="Could not compute similarity")

    # Rounding can leave the quotient marginally outside the valid range.
    return max(-1.0, min(1.0, value))
63
+
64
+
65
def similarity_score(word1: str, word2: str) -> float:
    """Return the cosine similarity of two words' fastText vectors, times 100.

    Both words are normalized first, so invalid input is rejected before the
    model is looked up.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        ``cosine_similarity`` of the two word vectors multiplied by 100.
    """
    first = normalize_word(word1, "word1")
    second = normalize_word(word2, "word2")

    fasttext_model = get_model()
    vectors = (
        fasttext_model.get_word_vector(first),
        fasttext_model.get_word_vector(second),
    )
    return cosine_similarity(*vectors) * 100
73
+
74
+
75
@app.get("/")
def root():
    # Landing endpoint: reports the configured model path and shows example
    # calls for both the GET and the POST variant of /similarity.
    usage = {
        "GET": "/similarity?word1=سىزغۇچ&word2=نان",
        "POST": {"url": "/similarity", "body": {"word1": "سىزغۇچ", "word2": "نان"}},
    }
    return {"status": "ok", "model": str(MODEL_PATH), "usage": usage}
85
+
86
+
87
@app.get("/health")
def health():
    # Health probe: "model_loaded" is True once app.state holds a model object.
    model_loaded = getattr(app.state, "fasttext_model", None) is not None
    return {"status": "ok", "model_loaded": model_loaded}
90
+
91
+
92
@app.get("/similarity", response_model=float)
def similarity_from_query(
    word1: str = Query(default=..., min_length=1),
    word2: str = Query(default=..., min_length=1),
):
    # GET variant: both words arrive as required, non-empty query parameters.
    score = similarity_score(word1, word2)
    return score
98
+
99
+
100
@app.post("/similarity", response_model=float)
def similarity_from_body(payload: SimilarityRequest):
    # POST variant: both words arrive in a JSON body validated as SimilarityRequest.
    first, second = payload.word1, payload.word2
    return similarity_score(first, second)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ fasttext
4
+ numpy