Add application file
- Dockerfile +21 -0
- app.py +112 -0
- onnx_model/config.json +82 -0
- onnx_model/special_tokens_map.json +37 -0
- onnx_model/tokenizer.json +0 -0
- onnx_model/tokenizer_config.json +58 -0
- onnx_model/vocab.txt +0 -0
- requirements.txt +6 -0
Dockerfile
ADDED
@@ -0,0 +1,21 @@
+FROM python:3.10-slim
+
+WORKDIR /app
+
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV PIP_NO_CACHE_DIR=1
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    gcc \
+    g++ \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+
+COPY . .
+
+EXPOSE 7860
+
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
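The image can be exercised with a plain docker build -t meter-api . followed by docker run -p 7860:7860 meter-api (the image name is illustrative). Below is a minimal smoke-test sketch in Python, standard library only, assuming the container is reachable on localhost:7860:

import time
import urllib.request

def wait_for_health(url="http://localhost:7860/health", timeout=30.0):
    # Poll the /health endpoint until the API answers or the timeout expires.
    # The URL assumes the port mapping shown above; adjust if it differs.
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(url) as resp:
                if resp.status == 200:
                    return True
        except OSError:  # connection refused while the container starts up
            time.sleep(1)
    return False

if __name__ == "__main__":
    print("API up:", wait_for_health())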
app.py
ADDED
@@ -0,0 +1,112 @@
+from typing import List, Union
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from transformers import AutoTokenizer
+from optimum.onnxruntime import ORTModelForSequenceClassification
+import torch
+
+MODEL_DIR = "onnx_model"
+
+CLASS_LABELS = [
+    "الطويل", "البسيط", "الكامل", "الوافر", "الهزج",
+    "الرجز", "الرمل", "السريع", "المنسرح", "الخفيف",
+    "المضارع", "المقتضب", "المجتث", "المتقارب", "المحدث"
+]
+
+app = FastAPI(title="Arabic Poetry Meter Predictor (ONNX)")
+
+tokenizer = None
+model = None
+id2label = None
+
+
+class PredictRequest(BaseModel):
+    sentences: Union[str, List[str]]
+
+
+@app.on_event("startup")
+def startup_event():
+    global tokenizer, model, id2label
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
+    model = ORTModelForSequenceClassification.from_pretrained(MODEL_DIR)
+
+    config_labels = getattr(model.config, "id2label", None)
+    if isinstance(config_labels, dict) and config_labels:
+        fixed = {}
+        for k, v in config_labels.items():
+            try:
+                fixed[int(k)] = v
+            except Exception:
+                fixed[k] = v
+        id2label = fixed
+    else:
+        id2label = {i: label for i, label in enumerate(CLASS_LABELS)}
+
+
+def normalize_sentences(value: Union[str, List[str]]) -> List[str]:
+    if isinstance(value, str):
+        value = value.strip()
+        return [value] if value else []
+
+    if isinstance(value, list):
+        return [str(v).strip() for v in value if str(v).strip()]
+
+    return []
+
+
+@app.get("/")
+def root():
+    return {"message": "Arabic Poetry Meter Predictor API is running"}
+
+
+@app.get("/health")
+def health():
+    return {"status": "ok"}
+
+
+
+@app.get("/test")
+def test():
+    return {"arabic": "مرحبا هذا اختبار"}
+
+
+@app.post("/predict")
+def predict(req: PredictRequest):
+    if tokenizer is None or model is None:
+        raise HTTPException(status_code=500, detail="Model is not loaded")
+
+    sentences = normalize_sentences(req.sentences)
+    if not sentences:
+        raise HTTPException(status_code=400, detail="No poetry lines provided")
+
+    try:
+        inputs = tokenizer(
+            sentences,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=256,
+        )
+
+        outputs = model(**inputs)
+        probs = torch.softmax(outputs.logits, dim=-1)
+        top_scores, top_indices = torch.max(probs, dim=-1)
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Prediction failed: {str(e)}")
+
+    results = []
+    for line, score, idx in zip(sentences, top_scores.tolist(), top_indices.tolist()):
+        label = id2label.get(int(idx), f"LABEL_{idx}")
+        results.append({
+            "line": line,
+            "predictions": [
+                {
+                    "rank": 1,
+                    "label": label,
+                    "score": float(score),
+                }
+            ],
+        })
+
+    return results
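A hedged usage sketch for the /predict endpoint above, standard library only and assuming a local run on the default port. The sample verse is the famous opening hemistich of Imru' al-Qays's Mu'allaqa (classically scanned as الطويل), used purely as example input:

import json
import urllib.request

payload = {"sentences": ["قفا نبك من ذكرى حبيب ومنزل"]}
req = urllib.request.Request(
    "http://localhost:7860/predict",  # assumes the EXPOSE 7860 port mapping
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    results = json.load(resp)

# Each result carries the input line plus a single top-1 prediction.
print(results[0]["predictions"][0]["label"], results[0]["predictions"][0]["score"])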
onnx_model/config.json
ADDED
@@ -0,0 +1,82 @@
+{
+  "architectures": [
+    "BertForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "directionality": "bidi",
+  "dtype": "float32",
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "\u0627\u0644\u0628\u0633\u064a\u0637",
+    "1": "\u0627\u0644\u062e\u0641\u064a\u0641",
+    "2": "\u0627\u0644\u062f\u0648\u0628\u064a\u062a",
+    "3": "\u0627\u0644\u0631\u062c\u0632",
+    "4": "\u0627\u0644\u0631\u0645\u0644",
+    "5": "\u0627\u0644\u0633\u0631\u064a\u0639",
+    "6": "\u0627\u0644\u0633\u0644\u0633\u0644\u0629",
+    "7": "\u0627\u0644\u0637\u0648\u064a\u0644",
+    "8": "\u0627\u0644\u0643\u0627\u0645\u0644",
+    "9": "\u0627\u0644\u0645\u062a\u062f\u0627\u0631\u0643",
+    "10": "\u0627\u0644\u0645\u062a\u0642\u0627\u0631\u0628",
+    "11": "\u0627\u0644\u0645\u062c\u062a\u062b",
+    "12": "\u0627\u0644\u0645\u062f\u064a\u062f",
+    "13": "\u0627\u0644\u0645\u0636\u0627\u0631\u0639",
+    "14": "\u0627\u0644\u0645\u0642\u062a\u0636\u0628",
+    "15": "\u0627\u0644\u0645\u0646\u0633\u0631\u062d",
+    "16": "\u0627\u0644\u0645\u0648\u0627\u0644\u064a\u0627",
+    "17": "\u0627\u0644\u0647\u0632\u062c",
+    "18": "\u0627\u0644\u0648\u0627\u0641\u0631",
+    "19": "\u0634\u0639\u0631 \u0627\u0644\u062a\u0641\u0639\u064a\u0644\u0629",
+    "20": "\u0634\u0639\u0631 \u062d\u0631",
+    "21": "\u0639\u0627\u0645\u064a",
+    "22": "\u0645\u0648\u0634\u062d"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "\u0627\u0644\u0628\u0633\u064a\u0637": 0,
+    "\u0627\u0644\u062e\u0641\u064a\u0641": 1,
+    "\u0627\u0644\u062f\u0648\u0628\u064a\u062a": 2,
+    "\u0627\u0644\u0631\u062c\u0632": 3,
+    "\u0627\u0644\u0631\u0645\u0644": 4,
+    "\u0627\u0644\u0633\u0631\u064a\u0639": 5,
+    "\u0627\u0644\u0633\u0644\u0633\u0644\u0629": 6,
+    "\u0627\u0644\u0637\u0648\u064a\u0644": 7,
+    "\u0627\u0644\u0643\u0627\u0645\u0644": 8,
+    "\u0627\u0644\u0645\u062a\u062f\u0627\u0631\u0643": 9,
+    "\u0627\u0644\u0645\u062a\u0642\u0627\u0631\u0628": 10,
+    "\u0627\u0644\u0645\u062c\u062a\u062b": 11,
+    "\u0627\u0644\u0645\u062f\u064a\u062f": 12,
+    "\u0627\u0644\u0645\u0636\u0627\u0631\u0639": 13,
+    "\u0627\u0644\u0645\u0642\u062a\u0636\u0628": 14,
+    "\u0627\u0644\u0645\u0646\u0633\u0631\u062d": 15,
+    "\u0627\u0644\u0645\u0648\u0627\u0644\u064a\u0627": 16,
+    "\u0627\u0644\u0647\u0632\u062c": 17,
+    "\u0627\u0644\u0648\u0627\u0641\u0631": 18,
+    "\u0634\u0639\u0631 \u0627\u0644\u062a\u0641\u0639\u064a\u0644\u0629": 19,
+    "\u0634\u0639\u0631 \u062d\u0631": 20,
+    "\u0639\u0627\u0645\u064a": 21,
+    "\u0645\u0648\u0634\u062d": 22
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "transformers_version": "4.57.6",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 100000
+}
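Note that this config defines 23 labels, so the 15-entry CLASS_LABELS fallback in app.py never fires for this model. JSON object keys are always strings, which is why startup_event coerces them to int before indexing; a minimal sketch of that conversion against this file:

import json

# Mirrors the id2label normalization in app.py's startup_event.
with open("onnx_model/config.json", encoding="utf-8") as f:
    config = json.load(f)

id2label = {int(k): v for k, v in config["id2label"].items()}
print(len(id2label), id2label[7])  # 23 labels; index 7 is "الطويل"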
onnx_model/special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
+{
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
onnx_model/tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
onnx_model/tokenizer_config.json
ADDED
@@ -0,0 +1,58 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}
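A quick sanity-check sketch that these tokenizer files load as the BertTokenizer declared above and wrap input in [CLS]/[SEP], assuming the onnx_model directory from this commit is on disk:

from transformers import AutoTokenizer

# tokenizer_class in the config makes this resolve to a BertTokenizer.
tokenizer = AutoTokenizer.from_pretrained("onnx_model")
ids = tokenizer("مرحبا هذا اختبار")["input_ids"]
print(tokenizer.convert_ids_to_tokens(ids))  # starts with [CLS], ends with [SEP]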
onnx_model/vocab.txt
ADDED
The diff for this file is too large to render. See raw diff.
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+fastapi
+uvicorn
+transformers
+optimum[onnxruntime]
+torch
+sentencepiece