Spaces:

AyoubChLin
/

classifier-general

Sleeping

App Files Files Community

AyoubChLin committed about 1 month ago

Commit

2571402

1 Parent(s): 2d0ef3b

[REF] api documentation

Browse files

Files changed (12) hide show

.env.example +1 -0
README.md +7 -0
app/core/config.py +1 -0
app/routers/classification.py +1 -1
app/schemas/classification.py +8 -1
app/services/classifier_service.py +61 -3
app/services/label_service.py +3 -0
docs/reference/api.md +25 -1
docs/reference/configuration.md +2 -0
docs/tutorials/getting-started.md +1 -1
tests/test_classifier_service.py +115 -2
tests/test_routes.py +31 -1

.env.example CHANGED Viewed

@@ -8,5 +8,6 @@ UPLOAD_SUBDIR=uploads
 CLASSIFIER_MODEL=AyoubChLin/bert-base-uncased-zeroshot-nli
 ENABLE_MODEL_QUANTIZATION=true
 HUGGINGFACE_TOKEN=
 DEFAULT_LABELS_CSV=news,sport,finance,politics

 CLASSIFIER_MODEL=AyoubChLin/bert-base-uncased-zeroshot-nli
 ENABLE_MODEL_QUANTIZATION=true
 HUGGINGFACE_TOKEN=
+CLASSIFIER_ENTAILMENT_LABEL_ID=
 DEFAULT_LABELS_CSV=news,sport,finance,politics

README.md CHANGED Viewed

@@ -27,6 +27,12 @@ Refactored into a modular FastAPI backend with clear layers:
 - `POST /configlabel` -> returns labels array
 - `GET /labels` -> returns labels array
 Additional operational endpoints:
 - `GET /health/liveness`
 - `GET /health/readiness`
@@ -42,6 +48,7 @@ Key vars:
 - `CLASSIFIER_MODEL`
 - `ENABLE_MODEL_QUANTIZATION`
 - `HUGGINGFACE_TOKEN`
 - `DEFAULT_LABELS_CSV`
 ## Local Run

 - `POST /configlabel` -> returns labels array
 - `GET /labels` -> returns labels array
+`POST /configlabel` exact payload:
+- body accepts `{"labels":["label1","label2","label3"]}`
+- each label is trimmed, and empty values are removed
+- duplicates are kept if they are provided
+- returns the stored `string[]` labels
 Additional operational endpoints:
 - `GET /health/liveness`
 - `GET /health/readiness`
 - `CLASSIFIER_MODEL`
 - `ENABLE_MODEL_QUANTIZATION`
 - `HUGGINGFACE_TOKEN`
+- `CLASSIFIER_ENTAILMENT_LABEL_ID` (optional override when model config has no entailment label name)
 - `DEFAULT_LABELS_CSV`
 ## Local Run

app/core/config.py CHANGED Viewed

@@ -18,6 +18,7 @@ class Settings(BaseSettings):
     classifier_model: str = "AyoubChLin/bert-base-uncased-zeroshot-nli"
     enable_model_quantization: bool = True
     huggingface_token: str | None = None
     default_labels_csv: str = Field(default="news,sport,finance,politics")

     classifier_model: str = "AyoubChLin/bert-base-uncased-zeroshot-nli"
     enable_model_quantization: bool = True
     huggingface_token: str | None = None
+    classifier_entailment_label_id: int | None = None
     default_labels_csv: str = Field(default="news,sport,finance,politics")

app/routers/classification.py CHANGED Viewed

@@ -56,7 +56,7 @@ async def classify_uploaded_file(file: UploadFile = File(...)) -> dict:
 @router.post("/configlabel", response_model=list[str])
 async def configure_labels(payload: LabelUpdateInput) -> list[str]:
-    labels = label_service.set_labels_from_csv(payload.text)
     if not labels:
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="At least one label is required")
     return labels

 @router.post("/configlabel", response_model=list[str])
 async def configure_labels(payload: LabelUpdateInput) -> list[str]:
+    labels = label_service.set_labels(payload.get_normalized_labels())
     if not labels:
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="At least one label is required")
     return labels

app/schemas/classification.py CHANGED Viewed

@@ -10,7 +10,14 @@ class TextInput(BaseSchema):
 class LabelUpdateInput(BaseSchema):
-    text: str = Field(min_length=1, description="Comma-separated labels, e.g. 'news, sport, finance'")
 class ClassifierResponse(BaseSchema):

 class LabelUpdateInput(BaseSchema):
+    labels: list[str] = Field(
+        min_length=1,
+        description="Direct list of labels. Items are trimmed and empty values are removed.",
+        examples=[["news", "sport", "finance"]],
+    )
+    def get_normalized_labels(self) -> list[str]:
+        return [label.strip() for label in self.labels if isinstance(label, str) and label.strip()]
 class ClassifierResponse(BaseSchema):

app/services/classifier_service.py CHANGED Viewed

@@ -61,19 +61,77 @@ class ClassifierService:
         cleaned = [label.strip() for label in labels if isinstance(label, str) and label.strip()]
         return list(dict.fromkeys(cleaned))
     @staticmethod
     def _resolve_entailment_id(model: Any) -> int:
         label2id = getattr(model.config, "label2id", {}) or {}
         for label, label_id in label2id.items():
             if isinstance(label, str) and label.lower().startswith("entail"):
-                return int(label_id)
         id2label = getattr(model.config, "id2label", {}) or {}
         for label_id, label in id2label.items():
             if isinstance(label, str) and label.lower().startswith("entail"):
-                return int(label_id)
-        raise ClassificationError("Classifier model is missing an entailment label mapping")
     def classify(self, text: str, labels: list[str]) -> str:
         candidate_labels = self._normalize_labels(labels)

         cleaned = [label.strip() for label in labels if isinstance(label, str) and label.strip()]
         return list(dict.fromkeys(cleaned))
+    @staticmethod
+    def _parse_label_id(value: Any) -> int | None:
+        try:
+            return int(value)
+        except (TypeError, ValueError):
+            return None
+    @staticmethod
+    def _extract_task_specific_entailment_id(model: Any) -> int | None:
+        task_specific_params = getattr(model.config, "task_specific_params", {}) or {}
+        if not isinstance(task_specific_params, dict):
+            return None
+        zero_shot_params = task_specific_params.get("zero-shot-classification", {})
+        if not isinstance(zero_shot_params, dict):
+            return None
+        return ClassifierService._parse_label_id(zero_shot_params.get("entailment_id"))
+    @staticmethod
+    def _has_generic_label_names(model: Any) -> bool:
+        label2id = getattr(model.config, "label2id", {}) or {}
+        id2label = getattr(model.config, "id2label", {}) or {}
+        labels: list[str] = []
+        labels.extend(label for label in label2id.keys() if isinstance(label, str))
+        labels.extend(label for label in id2label.values() if isinstance(label, str))
+        if not labels:
+            return False
+        return all(label.lower().startswith("label_") for label in labels)
     @staticmethod
     def _resolve_entailment_id(model: Any) -> int:
         label2id = getattr(model.config, "label2id", {}) or {}
         for label, label_id in label2id.items():
             if isinstance(label, str) and label.lower().startswith("entail"):
+                parsed = ClassifierService._parse_label_id(label_id)
+                if parsed is not None:
+                    return parsed
         id2label = getattr(model.config, "id2label", {}) or {}
         for label_id, label in id2label.items():
             if isinstance(label, str) and label.lower().startswith("entail"):
+                parsed = ClassifierService._parse_label_id(label_id)
+                if parsed is not None:
+                    return parsed
+        task_specific_entailment_id = ClassifierService._extract_task_specific_entailment_id(model)
+        if task_specific_entailment_id is not None:
+            return task_specific_entailment_id
+        if settings.classifier_entailment_label_id is not None:
+            return settings.classifier_entailment_label_id
+        num_labels = ClassifierService._parse_label_id(getattr(model.config, "num_labels", None))
+        if num_labels == 3 and (
+            ClassifierService._has_generic_label_names(model) or (not label2id and not id2label)
+        ):
+            logger.warning(
+                "Falling back to entailment label id 2 because model config labels are generic or missing "
+                "and no explicit entailment mapping was found. Set CLASSIFIER_ENTAILMENT_LABEL_ID "
+                "to override this behavior."
+            )
+            return 2
+        raise ClassificationError(
+            "Classifier model is missing an entailment label mapping. "
+            "Set CLASSIFIER_ENTAILMENT_LABEL_ID in the environment when the model config "
+            "does not expose an entailment label."
+        )
     def classify(self, text: str, labels: list[str]) -> str:
         candidate_labels = self._normalize_labels(labels)

app/services/label_service.py CHANGED Viewed

@@ -14,5 +14,8 @@ class LabelService:
         labels = [label.strip() for label in labels_csv.split(",") if label.strip()]
         return self._config.set_labels(labels)
 label_service = LabelService()

         labels = [label.strip() for label in labels_csv.split(",") if label.strip()]
         return self._config.set_labels(labels)
+    def set_labels(self, labels: list[str]) -> list[str]:
+        return self._config.set_labels(labels)
 label_service = LabelService()

docs/reference/api.md CHANGED Viewed

@@ -18,9 +18,33 @@ Evidence:
 | POST | `/api/language` | `{text}` | `"<language>"` |
 | POST | `/api/transformer` | multipart `file` | `{filename, content}` |
 | POST | `/classify` | multipart `file` | `{label, language, type?}` |
-| POST | `/configlabel` | `{text: "csv,labels"}` | `string[]` |
 | GET | `/labels` | none | `string[]` |
 ## Validation and errors
 - `400` for input validation and extraction problems.
 - `502` for classifier/language inference failures.

 | POST | `/api/language` | `{text}` | `"<language>"` |
 | POST | `/api/transformer` | multipart `file` | `{filename, content}` |
 | POST | `/classify` | multipart `file` | `{label, language, type?}` |
+| POST | `/configlabel` | `{labels:["a","b"]}` | `string[]` |
 | GET | `/labels` | none | `string[]` |
+## `POST /configlabel` contract (exact)
+Request body:
+- JSON object with required `labels` array.
+- Example: `{"labels":["tech","health","legal"]}`
+Normalization behavior:
+- Trim whitespace for each label.
+- Remove empty entries.
+- Preserve order.
+- Keep duplicates as provided.
+Response:
+- `200` with `string[]`, the stored labels after normalization.
+- Example response: `["tech", "health", "legal"]`
+Errors:
+- `400` with detail `"At least one label is required"` when every provided label is empty after trimming.
+- `422` when `labels` is missing, is an empty array, or unknown fields are provided (schema validation error).
+Related state behavior:
+- Labels are process-local in memory and reset on restart.
+- `GET /labels` returns the same current in-memory list.
 ## Validation and errors
 - `400` for input validation and extraction problems.
 - `502` for classifier/language inference failures.

docs/reference/configuration.md CHANGED Viewed

@@ -29,6 +29,7 @@ Evidence:
 | `CLASSIFIER_MODEL` | `AyoubChLin/bert-base-uncased-zeroshot-nli` | Hugging Face model ID used for local zero-shot NLI classification |
 | `ENABLE_MODEL_QUANTIZATION` | `true` | enable dynamic INT8 quantization with automatic fallback |
 | `HUGGINGFACE_TOKEN` | empty | optional auth token for client init |
 ## Language detector settings
@@ -45,6 +46,7 @@ Evidence:
 ## Behavior notes
 - Labels are process-local in memory and reset on restart.
 - Upload directory is auto-created at app startup.
 Evidence:
 - `app/services/label_service.py`

 | `CLASSIFIER_MODEL` | `AyoubChLin/bert-base-uncased-zeroshot-nli` | Hugging Face model ID used for local zero-shot NLI classification |
 | `ENABLE_MODEL_QUANTIZATION` | `true` | enable dynamic INT8 quantization with automatic fallback |
 | `HUGGINGFACE_TOKEN` | empty | optional auth token for client init |
+| `CLASSIFIER_ENTAILMENT_LABEL_ID` | empty | optional integer override for entailment logit index when model config does not expose an `entailment` label |
 ## Language detector settings
 ## Behavior notes
 - Labels are process-local in memory and reset on restart.
 - Upload directory is auto-created at app startup.
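+- If neither `label2id` nor `id2label` includes an entailment label, the service checks the task-specific config (`task_specific_params["zero-shot-classification"]["entailment_id"]`), then `CLASSIFIER_ENTAILMENT_LABEL_ID`, and finally falls back to index `2` when the model has 3 labels and its label mappings are generic (`LABEL_*`) or missing.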
 Evidence:
 - `app/services/label_service.py`

docs/tutorials/getting-started.md CHANGED Viewed

@@ -60,7 +60,7 @@ Evidence:
 ```bash
 curl -s -X POST http://localhost:4002/configlabel \
   -H 'content-type: application/json' \
-  -d '{"text":"tech,health,legal"}'
 curl -s http://localhost:4002/labels
 ```

 ```bash
 curl -s -X POST http://localhost:4002/configlabel \
   -H 'content-type: application/json' \
+  -d '{"labels":["tech","health","legal"]}'
 curl -s http://localhost:4002/labels
 ```

tests/test_classifier_service.py CHANGED Viewed

@@ -15,9 +15,9 @@ class _FakeTokenizer:
 class _FakeInferenceModel:
-    def __init__(self, logits: torch.Tensor) -> None:
         self._logits = logits
-        self.config = SimpleNamespace(
             label2id={"CONTRADICTION": 0, "ENTAILMENT": 1},
             id2label={0: "CONTRADICTION", 1: "ENTAILMENT"},
         )
@@ -63,6 +63,119 @@ def test_classify_uses_runtime_candidate_labels(monkeypatch):
     assert predicted == "sport"
 def test_model_quantization_falls_back_to_non_quantized_model(monkeypatch):
     service = classifier_module.ClassifierService()
     fake_model = _FakeLoadModel()

 class _FakeInferenceModel:
+    def __init__(self, logits: torch.Tensor, config: SimpleNamespace | None = None) -> None:
         self._logits = logits
+        self.config = config or SimpleNamespace(
             label2id={"CONTRADICTION": 0, "ENTAILMENT": 1},
             id2label={0: "CONTRADICTION", 1: "ENTAILMENT"},
         )
     assert predicted == "sport"
+def test_classify_uses_task_specific_entailment_id_when_label_names_are_generic(monkeypatch):
+    service = classifier_module.ClassifierService()
+    tokenizer = _FakeTokenizer()
+    model = _FakeInferenceModel(
+        logits=torch.tensor(
+            [
+                [1.8, 0.3, 0.4],  # finance -> low entailment
+                [0.4, 0.7, 3.7],  # sport -> highest entailment
+            ]
+        ),
+        config=SimpleNamespace(
+            label2id={"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2},
+            id2label={0: "LABEL_0", 1: "LABEL_1", 2: "LABEL_2"},
+            task_specific_params={"zero-shot-classification": {"entailment_id": 2}},
+            num_labels=3,
+        ),
+    )
+    monkeypatch.setattr(service, "_load_model", lambda: (tokenizer, model))
+    monkeypatch.setattr(classifier_module.settings, "classifier_entailment_label_id", None)
+    predicted = service.classify(
+        "The story is mostly about football transfers.",
+        ["finance", "sport"],
+    )
+    assert predicted == "sport"
+def test_classify_uses_explicit_entailment_id_setting_when_mapping_is_missing(monkeypatch):
+    service = classifier_module.ClassifierService()
+    tokenizer = _FakeTokenizer()
+    model = _FakeInferenceModel(
+        logits=torch.tensor(
+            [
+                [2.0, 0.3],  # finance -> low entailment
+                [0.2, 3.4],  # sport -> highest entailment
+            ]
+        ),
+        config=SimpleNamespace(
+            label2id={"NEGATIVE": 0, "POSITIVE": 1},
+            id2label={0: "NEGATIVE", 1: "POSITIVE"},
+            num_labels=2,
+        ),
+    )
+    monkeypatch.setattr(service, "_load_model", lambda: (tokenizer, model))
+    monkeypatch.setattr(classifier_module.settings, "classifier_entailment_label_id", 1)
+    predicted = service.classify(
+        "The story is mostly about football transfers.",
+        ["finance", "sport"],
+    )
+    assert predicted == "sport"
+def test_classify_falls_back_to_mnli_entailment_index_for_generic_three_label_configs(monkeypatch):
+    service = classifier_module.ClassifierService()
+    tokenizer = _FakeTokenizer()
+    model = _FakeInferenceModel(
+        logits=torch.tensor(
+            [
+                [2.3, 0.6, 0.8],  # finance -> low entailment
+                [0.4, 0.8, 3.9],  # sport -> highest entailment
+            ]
+        ),
+        config=SimpleNamespace(
+            label2id={"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2},
+            id2label={0: "LABEL_0", 1: "LABEL_1", 2: "LABEL_2"},
+            num_labels=3,
+        ),
+    )
+    monkeypatch.setattr(service, "_load_model", lambda: (tokenizer, model))
+    monkeypatch.setattr(classifier_module.settings, "classifier_entailment_label_id", None)
+    predicted = service.classify(
+        "The story is mostly about football transfers.",
+        ["finance", "sport"],
+    )
+    assert predicted == "sport"
+def test_classify_falls_back_to_mnli_entailment_index_for_missing_label_mapping(monkeypatch):
+    service = classifier_module.ClassifierService()
+    tokenizer = _FakeTokenizer()
+    model = _FakeInferenceModel(
+        logits=torch.tensor(
+            [
+                [1.8, 0.4, 0.5],  # finance -> low entailment
+                [0.2, 0.5, 3.6],  # sport -> highest entailment
+            ]
+        ),
+        config=SimpleNamespace(
+            label2id={},
+            id2label={},
+            num_labels=3,
+        ),
+    )
+    monkeypatch.setattr(service, "_load_model", lambda: (tokenizer, model))
+    monkeypatch.setattr(classifier_module.settings, "classifier_entailment_label_id", None)
+    predicted = service.classify(
+        "The story is mostly about football transfers.",
+        ["finance", "sport"],
+    )
+    assert predicted == "sport"
 def test_model_quantization_falls_back_to_non_quantized_model(monkeypatch):
     service = classifier_module.ClassifierService()
     fake_model = _FakeLoadModel()

tests/test_routes.py CHANGED Viewed

@@ -27,7 +27,7 @@ def test_language_endpoint_contract(monkeypatch):
 def test_labels_config_roundtrip():
-    response = client.post("/configlabel", json={"text": "tech, health, legal"})
     assert response.status_code == 200
     assert response.json() == ["tech", "health", "legal"]
@@ -36,6 +36,36 @@ def test_labels_config_roundtrip():
     assert get_response.json() == ["tech", "health", "legal"]
 def test_transform_file_contract(monkeypatch):
     monkeypatch.setattr(classification_pipeline, "transform_file", lambda filename, path: "extracted content")

 def test_labels_config_roundtrip():
+    response = client.post("/configlabel", json={"labels": ["tech", "health", "legal"]})
     assert response.status_code == 200
     assert response.json() == ["tech", "health", "legal"]
     assert get_response.json() == ["tech", "health", "legal"]
+def test_labels_config_accepts_labels_list_payload():
+    response = client.post("/configlabel", json={"labels": ["tech", "health", "legal"]})
+    assert response.status_code == 200
+    assert response.json() == ["tech", "health", "legal"]
+def test_labels_config_rejects_empty_labels():
+    response = client.post("/configlabel", json={"labels": [" ", ""]})
+    assert response.status_code == 400
+    assert response.json() == {"detail": "At least one label is required"}
+def test_labels_config_rejects_missing_labels():
+    response = client.post("/configlabel", json={})
+    assert response.status_code == 422
+    assert "labels" in response.text
+def test_labels_config_rejects_text_field():
+    response = client.post("/configlabel", json={"text": "tech,health"})
+    assert response.status_code == 422
+    assert "extra_forbidden" in response.text
+def test_labels_config_rejects_texts_field():
+    response = client.post("/configlabel", json={"texts": ["tech,health"]})
+    assert response.status_code == 422
+    assert "extra_forbidden" in response.text
 def test_transform_file_contract(monkeypatch):
     monkeypatch.setattr(classification_pipeline, "transform_file", lambda filename, path: "extracted content")