Spaces:
Sleeping
Sleeping
Commit ·
50231a8
1
Parent(s): 3d9d878
feat: initial commit of Classifier General API with FastAPI
- Added Spacefile for deployment configuration.
- Created app structure with core, api, models, pipelines, routers, and services.
- Implemented classification and language detection pipelines.
- Integrated file extraction and storage services.
- Established API endpoints for classification, language detection, and file transformation.
- Added health check endpoints for service liveness and readiness.
- Configured Pydantic settings for environment-based configuration.
- Developed tests for route contracts and functionality.
- Included Docker Compose setup for local development and deployment.
- Documented architecture, decisions, and usage in the README and other markdown files.
- .dockerignore +9 -0
- .env.example +15 -0
- .gitignore +7 -0
- Dockerfile +19 -0
- README.md +54 -1
- Spacefile +10 -0
- app/__init__.py +1 -0
- app/api/__init__.py +1 -0
- app/api/router.py +7 -0
- app/core/__init__.py +1 -0
- app/core/config.py +37 -0
- app/core/exceptions.py +14 -0
- app/main.py +22 -0
- app/models/__init__.py +1 -0
- app/models/label_config.py +18 -0
- app/pipelines/__init__.py +1 -0
- app/pipelines/classification_pipeline.py +50 -0
- app/pipelines/text_pipeline.py +21 -0
- app/routers/__init__.py +1 -0
- app/routers/classification.py +67 -0
- app/routers/health.py +18 -0
- app/schemas/__init__.py +1 -0
- app/schemas/classification.py +36 -0
- app/services/__init__.py +1 -0
- app/services/classifier_service.py +83 -0
- app/services/extraction_service.py +70 -0
- app/services/file_storage_service.py +29 -0
- app/services/label_service.py +18 -0
- app/services/language_service.py +41 -0
- docker-compose.yml +27 -0
- docs/README.md +33 -0
- docs/explanation/architecture.md +129 -0
- docs/explanation/decisions.md +80 -0
- docs/how-to/deploy-with-docker-compose.md +35 -0
- docs/how-to/run-locally.md +64 -0
- docs/reference/api.md +39 -0
- docs/reference/configuration.md +53 -0
- docs/reference/runtime-state.md +48 -0
- docs/tutorials/getting-started.md +79 -0
- main.py +7 -0
- requirements.txt +16 -0
- tests/test_routes.py +61 -0
.dockerignore
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
*.pyo
|
| 5 |
+
*.pyd
|
| 6 |
+
.pytest_cache/
|
| 7 |
+
.env
|
| 8 |
+
aws/
|
| 9 |
+
awscliv2.zip
|
.env.example
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
APP_NAME=Classifier General API
|
| 2 |
+
ENVIRONMENT=development
|
| 3 |
+
DEBUG=false
|
| 4 |
+
|
| 5 |
+
STATIC_DIR=static
|
| 6 |
+
UPLOAD_SUBDIR=uploads
|
| 7 |
+
|
| 8 |
+
CLASSIFIER_SPACE=https://ayoubchlin-ayoubchlin-stable-bart-mnli-cnn.hf.space/
|
| 9 |
+
CLASSIFIER_API_NAME=/predict
|
| 10 |
+
HUGGINGFACE_TOKEN=
|
| 11 |
+
|
| 12 |
+
LANGUAGE_DETECTOR_URL=https://team-language-detector-languagedetector.hf.space/run/predict
|
| 13 |
+
REQUEST_TIMEOUT_SECONDS=30
|
| 14 |
+
|
| 15 |
+
DEFAULT_LABELS_CSV=news,sport,finance,politics
|
.gitignore
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.space
|
| 2 |
+
.env
|
| 3 |
+
__pycache__/
|
| 4 |
+
*.pyc
|
| 5 |
+
.pytest_cache/
|
| 6 |
+
static/uploads/
|
| 7 |
+
static/*
|
Dockerfile
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 4 |
+
PYTHONUNBUFFERED=1
|
| 5 |
+
|
| 6 |
+
WORKDIR /app
|
| 7 |
+
|
| 8 |
+
RUN apt-get update \
|
| 9 |
+
&& apt-get install -y --no-install-recommends tesseract-ocr curl \
|
| 10 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
+
|
| 12 |
+
COPY requirements.txt .
|
| 13 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 14 |
+
|
| 15 |
+
COPY . .
|
| 16 |
+
|
| 17 |
+
EXPOSE 4002
|
| 18 |
+
|
| 19 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "4002"]
|
README.md
CHANGED
|
@@ -9,4 +9,57 @@ license: mit
|
|
| 9 |
short_description: classifier-general
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
short_description: classifier-general
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# Classifier General API (Refactored)
|
| 13 |
+
|
| 14 |
+
Refactored into a modular FastAPI backend with clear layers:
|
| 15 |
+
- `app/routers`
|
| 16 |
+
- `app/services`
|
| 17 |
+
- `app/pipelines`
|
| 18 |
+
- `app/schemas`
|
| 19 |
+
- `app/models`
|
| 20 |
+
- `app/core`
|
| 21 |
+
|
| 22 |
+
## Preserved Endpoint Contract
|
| 23 |
+
- `POST /api/classifier` -> returns label string
|
| 24 |
+
- `POST /api/language` -> returns language string
|
| 25 |
+
- `POST /api/transformer` -> returns `{ filename, content }`
|
| 26 |
+
- `POST /classify` -> returns `{ label, language, type? }`
|
| 27 |
+
- `POST /configlabel` -> returns labels array
|
| 28 |
+
- `GET /labels` -> returns labels array
|
| 29 |
+
|
| 30 |
+
Additional operational endpoints:
|
| 31 |
+
- `GET /health/liveness`
|
| 32 |
+
- `GET /health/readiness`
|
| 33 |
+
- `GET /endpoint/`
|
| 34 |
+
|
| 35 |
+
## Environment
|
| 36 |
+
Copy and edit:
|
| 37 |
+
```bash
|
| 38 |
+
cp .env.example .env
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
Key vars:
|
| 42 |
+
- `CLASSIFIER_SPACE`
|
| 43 |
+
- `HUGGINGFACE_TOKEN`
|
| 44 |
+
- `LANGUAGE_DETECTOR_URL`
|
| 45 |
+
- `DEFAULT_LABELS_CSV`
|
| 46 |
+
|
| 47 |
+
## Local Run
|
| 48 |
+
```bash
|
| 49 |
+
pip install -r requirements.txt
|
| 50 |
+
uvicorn main:app --host 0.0.0.0 --port 4002 --reload
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
## Docker Run
|
| 54 |
+
```bash
|
| 55 |
+
docker compose up --build
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
## Tests
|
| 59 |
+
```bash
|
| 60 |
+
pytest -q
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
## Notes
|
| 64 |
+
- OCR requires `tesseract-ocr` (installed in Dockerfile).
|
| 65 |
+
- Supported extraction formats in this refactor: `.pdf`, `.docx`, `.xlsx`, image formats, and plain text files.
|
Spacefile
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Spacefile Docs: https://go.deta.dev/docs/spacefile/v0
|
| 2 |
+
v: 0
|
| 3 |
+
micros:
|
| 4 |
+
- name: classifier-general
|
| 5 |
+
src: .
|
| 6 |
+
engine: python3.9
|
| 7 |
+
primary: true
|
| 8 |
+
public: true
|
| 9 |
+
run: uvicorn main:app
|
| 10 |
+
dev: uvicorn main:app --reload
|
app/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Classifier General backend package."""
|
app/api/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""API router assembly."""
|
app/api/router.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter
|
| 2 |
+
|
| 3 |
+
from app.routers import classification, health
|
| 4 |
+
|
| 5 |
+
api_router = APIRouter()
|
| 6 |
+
api_router.include_router(health.router)
|
| 7 |
+
api_router.include_router(classification.router)
|
app/core/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Core settings and shared utilities."""
|
app/core/config.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from functools import lru_cache
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
from pydantic import Field
|
| 5 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class Settings(BaseSettings):
    """Application configuration loaded from the environment.

    Values come from process environment variables and from a local ``.env``
    file (UTF-8); keys that do not match a field are ignored.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # Application identity and runtime mode.
    app_name: str = "Classifier General API"
    environment: str = "development"
    debug: bool = False

    # Root of the served static files and the sub-folder that receives uploads.
    static_dir: Path = Path("static")
    upload_subdir: str = "uploads"

    # Remote classifier endpoint; the token is optional.
    classifier_space: str = "https://ayoubchlin-ayoubchlin-stable-bart-mnli-cnn.hf.space/"
    classifier_api_name: str = "/predict"
    huggingface_token: str | None = None

    # Remote language detector endpoint and the HTTP timeout used to call it.
    language_detector_url: str = "https://team-language-detector-languagedetector.hf.space/run/predict"
    request_timeout_seconds: float = 30.0

    # Comma-separated labels used to seed the in-memory label configuration.
    default_labels_csv: str = Field(default="news,sport,finance,politics")

    @property
    def upload_dir(self) -> Path:
        """Return the upload directory, i.e. ``static_dir / upload_subdir``."""
        uploads = self.static_dir / self.upload_subdir
        return uploads
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
@lru_cache
def get_settings() -> Settings:
    """Build the ``Settings`` object on first call and reuse it afterwards."""
    instance = Settings()
    return instance
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
settings = get_settings()
|
app/core/exceptions.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class ClassifierAPIError(Exception):
    """Common root for every domain error raised by this package.

    Callers that do not care about the precise failure can catch this single
    type; the concrete subclasses below stay siblings of one another so
    ``isinstance`` dispatch on them remains unambiguous.
    """


class ClassificationError(ClassifierAPIError):
    """The classifier could not be used or did not produce a usable label."""


class LanguageDetectionError(ClassifierAPIError):
    """The language detector request failed or returned an unusable response."""


class ExtractionError(ClassifierAPIError):
    """Text could not be extracted from an uploaded file."""


class ValidationError(ClassifierAPIError):
    """Caller-supplied text was missing or too short to process."""
|
app/main.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI
|
| 2 |
+
from fastapi.staticfiles import StaticFiles
|
| 3 |
+
|
| 4 |
+
from app.api.router import api_router
|
| 5 |
+
from app.core.config import settings
|
| 6 |
+
|
| 7 |
+
settings.static_dir.mkdir(parents=True, exist_ok=True)
|
| 8 |
+
settings.upload_dir.mkdir(parents=True, exist_ok=True)
|
| 9 |
+
|
| 10 |
+
app = FastAPI(title=settings.app_name, debug=settings.debug)
|
| 11 |
+
app.mount("/static", StaticFiles(directory=str(settings.static_dir)), name="static")
|
| 12 |
+
app.include_router(api_router)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@app.get("/endpoint/")
def list_endpoints() -> list[dict]:
    """List the registered HTTP endpoints and their methods.

    Returns:
        One ``{"endpoint": path, "methods": [...]}`` entry per route that
        answers at least one of GET, POST, PUT or DELETE. Methods are sorted.
    """
    reported_methods = {"GET", "POST", "PUT", "DELETE"}
    endpoints = []
    for route in app.routes:
        # Not every route object exposes ``methods``: the "/static" mount
        # registered on this app is a Mount, which has no such attribute.
        # Reading it defensively keeps this endpoint from failing on it.
        route_methods = getattr(route, "methods", None) or set()
        methods = sorted(route_methods & reported_methods)
        if methods:
            endpoints.append({"endpoint": route.path, "methods": methods})
    return endpoints
|
app/models/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Domain models."""
|
app/models/label_config.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass, field
|
| 2 |
+
from threading import Lock
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
@dataclass
class LabelConfig:
    """In-memory, lock-protected list of classification labels."""

    # Current labels; always read or replaced while holding ``_lock``.
    labels: list[str] = field(default_factory=list)
    # The lock is an implementation detail: excluding it from ``repr`` and
    # comparison lets two configs holding the same labels compare equal
    # (lock objects only ever compare equal to themselves).
    _lock: Lock = field(default_factory=Lock, repr=False, compare=False)

    def get_labels(self) -> list[str]:
        """Return a copy of the current labels so callers cannot mutate state."""
        with self._lock:
            return list(self.labels)

    def set_labels(self, labels: list[str]) -> list[str]:
        """Replace the stored labels and return a copy of the new list.

        Blank and whitespace-only entries are dropped; the remaining entries
        are stripped of surrounding whitespace.
        """
        # Clean outside the lock; only the swap itself needs protection.
        cleaned = [label.strip() for label in labels if label and label.strip()]
        with self._lock:
            self.labels = cleaned
            return list(self.labels)
|
app/pipelines/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""ML/OCR processing pipelines."""
|
app/pipelines/classification_pipeline.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
from app.core.exceptions import ClassificationError, ExtractionError, LanguageDetectionError, ValidationError
|
| 4 |
+
from app.pipelines.text_pipeline import preprocess_text
|
| 5 |
+
from app.services.classifier_service import classifier_service
|
| 6 |
+
from app.services.extraction_service import extraction_service
|
| 7 |
+
from app.services.label_service import label_service
|
| 8 |
+
from app.services.language_service import language_service
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class ClassificationPipeline:
    """Chains preprocessing, extraction, language detection and classification."""

    def classify_text(self, text: str) -> str:
        """Preprocess *text* and classify it against the configured labels."""
        return classifier_service.classify(preprocess_text(text), label_service.get_labels())

    def detect_language(self, text: str) -> str:
        """Preprocess *text* and return the detected language label."""
        return language_service.detect_language(preprocess_text(text))

    def transform_file(self, original_filename: str, file_path: Path) -> str:
        """Extract the text of a stored file.

        Raises:
            ExtractionError: if extraction yields nothing but whitespace.
        """
        extracted = extraction_service.extract_text(original_filename, file_path)
        if extracted and extracted.strip():
            return extracted
        raise ExtractionError("No text extracted from file")

    def classify_file(self, original_filename: str, file_path: Path) -> dict:
        """Extract, preprocess, then detect language and topic of a stored file.

        Returns:
            ``{"label", "language"}``, plus ``"type": "not english"`` when the
            detected language is anything other than ``"en"``.
        """
        cleaned = preprocess_text(self.transform_file(original_filename, file_path))

        detected = language_service.detect_language(cleaned)
        topic = classifier_service.classify(cleaned, label_service.get_labels())

        if detected == "en":
            return {"label": topic, "language": detected}
        return {"label": topic, "language": detected, "type": "not english"}
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
classification_pipeline = ClassificationPipeline()
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
__all__ = [
|
| 45 |
+
"classification_pipeline",
|
| 46 |
+
"ClassificationError",
|
| 47 |
+
"LanguageDetectionError",
|
| 48 |
+
"ExtractionError",
|
| 49 |
+
"ValidationError",
|
| 50 |
+
]
|
app/pipelines/text_pipeline.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
from app.core.exceptions import ValidationError
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
MIN_WORDS = 4


def preprocess_text(text: str) -> str:
    """Normalise free text before it is sent to the remote services.

    Steps: newlines become spaces, ``<...>`` tags are removed, characters
    other than word characters, whitespace and ``$ € % . , - /`` become
    spaces, separators between two digits (``3.50`` -> ``3 50``) become
    spaces, whitespace runs are collapsed, and the result is lower-cased
    and trimmed.

    Raises:
        ValidationError: if *text* is ``None`` or fewer than ``MIN_WORDS``
            words remain after cleaning.
    """
    if text is None:
        raise ValidationError("Text is required")

    cleaned = re.sub(r"<[^>]+>", "", text.replace("\n", " "))
    # ``\-`` makes the hyphen literal. The previous ``,-/`` was read as a
    # character range that happened to cover the same four characters.
    cleaned = re.sub(r"[^\w\s$€%.,\-/]|(?<=\d)[.,/](?=\d)", " ", cleaned)
    # Collapse whitespace last: the substitution above leaves runs of spaces
    # where symbols used to be.
    cleaned = re.sub(r"\s+", " ", cleaned).lower().strip()

    # ``split()`` without an argument never yields empty strings, so removed
    # symbols cannot be miscounted as words.
    if len(cleaned.split()) < MIN_WORDS:
        raise ValidationError(f"Text must contain at least {MIN_WORDS} words after preprocessing")

    return cleaned
|
app/routers/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""HTTP route modules."""
|
app/routers/classification.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, File, HTTPException, UploadFile, status
|
| 2 |
+
|
| 3 |
+
from app.core.exceptions import ClassificationError, ExtractionError, LanguageDetectionError, ValidationError
|
| 4 |
+
from app.pipelines.classification_pipeline import classification_pipeline
|
| 5 |
+
from app.schemas.classification import FileClassifyResponse, FileTransformResponse, LabelUpdateInput, TextInput
|
| 6 |
+
from app.services.file_storage_service import file_storage_service
|
| 7 |
+
from app.services.label_service import label_service
|
| 8 |
+
|
| 9 |
+
router = APIRouter(tags=["classification"])
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _handle_exception(exc: Exception) -> None:
    """Translate a pipeline failure into the matching ``HTTPException``.

    This function never returns normally; it always raises.

    Raises:
        HTTPException: 400 for ``ValidationError`` / ``ExtractionError``,
            502 for ``ClassificationError`` / ``LanguageDetectionError``,
            500 with a generic message for anything else.
    """
    if isinstance(exc, (ValidationError, ExtractionError)):
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(exc)) from exc
    if isinstance(exc, (ClassificationError, LanguageDetectionError)):
        raise HTTPException(status_code=status.HTTP_502_BAD_GATEWAY, detail=str(exc)) from exc

    # The client only receives a generic message, so record the real failure
    # here; otherwise unexpected errors would leave no trace at all.
    import logging

    logging.getLogger(__name__).error("Unexpected error while handling request", exc_info=exc)
    raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Unexpected error") from exc
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@router.post("/api/classifier", response_model=str)
async def classify_text(payload: TextInput) -> str:
    """Classify the submitted text and return the winning label.

    Pipeline failures are converted to HTTP errors by ``_handle_exception``.
    """
    # Imported locally so this handler does not depend on a file-level change.
    from fastapi.concurrency import run_in_threadpool

    try:
        # The pipeline calls a remote service synchronously; running it in a
        # worker thread keeps the event loop free for other requests.
        return await run_in_threadpool(classification_pipeline.classify_text, payload.text)
    except Exception as exc:
        _handle_exception(exc)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@router.post("/api/language", response_model=str)
async def detect_language(payload: TextInput) -> str:
    """Detect the language of the submitted text and return its label.

    Pipeline failures are converted to HTTP errors by ``_handle_exception``.
    """
    # Imported locally so this handler does not depend on a file-level change.
    from fastapi.concurrency import run_in_threadpool

    try:
        # The detector is called over blocking HTTP; run it off the event loop.
        return await run_in_threadpool(classification_pipeline.detect_language, payload.text)
    except Exception as exc:
        _handle_exception(exc)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
@router.post("/api/transformer", response_model=FileTransformResponse)
async def transform_file(file: UploadFile = File(...)) -> dict:
    """Store the uploaded file and return its extracted text.

    Returns:
        ``{"filename", "content"}``; the filename falls back to the stored
        file's name when the upload carries none.
    """
    # Imported locally so this handler does not depend on a file-level change.
    from fastapi.concurrency import run_in_threadpool

    try:
        # Writing the upload to disk and extracting text are both blocking,
        # so each step runs in a worker thread rather than on the event loop.
        saved_path = await run_in_threadpool(file_storage_service.save_upload, file)
        display_name = file.filename or saved_path.name
        content = await run_in_threadpool(
            classification_pipeline.transform_file, display_name, saved_path
        )
        return {"filename": display_name, "content": content}
    except Exception as exc:
        _handle_exception(exc)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@router.post("/classify", response_model=FileClassifyResponse)
async def classify_uploaded_file(file: UploadFile = File(...)) -> dict:
    """Store the uploaded file, then return its detected language and label.

    Pipeline failures are converted to HTTP errors by ``_handle_exception``.
    """
    # NOTE(review): the pipeline omits "type" for English documents, but the
    # response model gives it a ``None`` default, so it is presumably sent as
    # null. Add ``response_model_exclude_none=True`` if the key must be absent.

    # Imported locally so this handler does not depend on a file-level change.
    from fastapi.concurrency import run_in_threadpool

    try:
        # Disk writes, extraction and the remote calls are all blocking, so
        # they run in worker threads instead of on the event loop.
        saved_path = await run_in_threadpool(file_storage_service.save_upload, file)
        return await run_in_threadpool(
            classification_pipeline.classify_file, file.filename or saved_path.name, saved_path
        )
    except Exception as exc:
        _handle_exception(exc)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
@router.post("/configlabel", response_model=list[str])
async def configure_labels(payload: LabelUpdateInput) -> list[str]:
    """Replace the configured labels with the comma-separated list in the payload.

    Raises:
        HTTPException: 400 when the payload contains no non-blank label. The
            stored labels are left untouched in that case.
    """
    # Validate before touching shared state: storing first and rejecting
    # afterwards would answer 400 yet still wipe the current labels.
    if not any(part.strip() for part in payload.text.split(",")):
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="At least one label is required")
    return label_service.set_labels_from_csv(payload.text)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
@router.get("/labels", response_model=list[str])
async def get_labels() -> list[str]:
    """Return the labels currently configured for classification."""
    current = label_service.get_labels()
    return current
|
app/routers/health.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter
|
| 2 |
+
|
| 3 |
+
from app.services.label_service import label_service
|
| 4 |
+
|
| 5 |
+
router = APIRouter(tags=["health"])
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@router.get("/health/liveness")
def liveness() -> dict:
    """Report that the process is up and answering HTTP requests."""
    payload = {"status": "ok"}
    return payload
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@router.get("/health/readiness")
def readiness() -> dict:
    """Report readiness together with the number of configured labels."""
    # The external APIs this service depends on are not probed here.
    # NOTE(review): the status is always "ready"; ``labels_count`` is only
    # reported, so an empty label list does not mark the service unready.
    return {"status": "ready", "labels_count": len(label_service.get_labels())}
|
app/schemas/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic API schemas."""
|
app/schemas/classification.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, ConfigDict, Field
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class BaseSchema(BaseModel):
    """Base for all API schemas; unknown fields are rejected."""

    model_config = ConfigDict(extra="forbid")


class TextInput(BaseSchema):
    """Request body carrying the non-empty text to process."""

    text: str = Field(min_length=1)


class LabelUpdateInput(BaseSchema):
    """Request body carrying the new label set as one string."""

    text: str = Field(min_length=1, description="Comma-separated labels, e.g. 'news, sport, finance'")


class ClassifierResponse(BaseSchema):
    """A single classification label."""

    label: str


class LanguageResponse(BaseSchema):
    """A single language label."""

    language: str


class FileTransformResponse(BaseSchema):
    """Name of an uploaded file together with its extracted text."""

    filename: str
    content: str


class FileClassifyResponse(BaseSchema):
    """Label and language of an uploaded file, plus an optional type marker."""

    label: str
    language: str
    type: str | None = None


class LabelsResponse(BaseSchema):
    """A list of labels."""

    labels: list[str]
|
app/services/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Service layer modules."""
|
app/services/classifier_service.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Any
|
| 4 |
+
|
| 5 |
+
from gradio_client import Client
|
| 6 |
+
|
| 7 |
+
from app.core.config import settings
|
| 8 |
+
from app.core.exceptions import ClassificationError
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class ClassifierService:
|
| 12 |
+
def __init__(self) -> None:
|
| 13 |
+
self._client: Client | None = None
|
| 14 |
+
|
| 15 |
+
def _get_client(self) -> Client:
|
| 16 |
+
if self._client is not None:
|
| 17 |
+
return self._client
|
| 18 |
+
|
| 19 |
+
client_kwargs: dict[str, Any] = {}
|
| 20 |
+
if settings.huggingface_token:
|
| 21 |
+
client_kwargs["hf_token"] = settings.huggingface_token
|
| 22 |
+
|
| 23 |
+
try:
|
| 24 |
+
self._client = Client(settings.classifier_space, **client_kwargs)
|
| 25 |
+
except Exception as exc:
|
| 26 |
+
raise ClassificationError("Unable to initialize classifier client") from exc
|
| 27 |
+
|
| 28 |
+
return self._client
|
| 29 |
+
|
| 30 |
+
@staticmethod
|
| 31 |
+
def _extract_label(payload: Any) -> str | None:
|
| 32 |
+
if isinstance(payload, dict):
|
| 33 |
+
value = payload.get("label")
|
| 34 |
+
if isinstance(value, str) and value.strip():
|
| 35 |
+
return value.strip()
|
| 36 |
+
return None
|
| 37 |
+
|
| 38 |
+
if isinstance(payload, list):
|
| 39 |
+
for item in payload:
|
| 40 |
+
label = ClassifierService._extract_label(item)
|
| 41 |
+
if label:
|
| 42 |
+
return label
|
| 43 |
+
|
| 44 |
+
return None
|
| 45 |
+
|
| 46 |
+
def classify(self, text: str, labels: list[str]) -> str:
|
| 47 |
+
if not labels:
|
| 48 |
+
raise ClassificationError("No labels configured")
|
| 49 |
+
|
| 50 |
+
labels_text = ", ".join(labels)
|
| 51 |
+
|
| 52 |
+
try:
|
| 53 |
+
result = self._get_client().predict(
|
| 54 |
+
text,
|
| 55 |
+
labels_text,
|
| 56 |
+
api_name=settings.classifier_api_name,
|
| 57 |
+
)
|
| 58 |
+
except Exception as exc:
|
| 59 |
+
raise ClassificationError("Classifier request failed") from exc
|
| 60 |
+
|
| 61 |
+
if isinstance(result, str):
|
| 62 |
+
candidate_path = Path(result)
|
| 63 |
+
if candidate_path.exists():
|
| 64 |
+
try:
|
| 65 |
+
parsed = json.loads(candidate_path.read_text(encoding="utf-8"))
|
| 66 |
+
except Exception as exc:
|
| 67 |
+
raise ClassificationError("Classifier output file is not valid JSON") from exc
|
| 68 |
+
label = self._extract_label(parsed)
|
| 69 |
+
if label:
|
| 70 |
+
return label
|
| 71 |
+
|
| 72 |
+
stripped = result.strip()
|
| 73 |
+
if stripped:
|
| 74 |
+
return stripped
|
| 75 |
+
|
| 76 |
+
label = self._extract_label(result)
|
| 77 |
+
if label:
|
| 78 |
+
return label
|
| 79 |
+
|
| 80 |
+
raise ClassificationError("Classifier did not return a valid label")
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
classifier_service = ClassifierService()
|
app/services/extraction_service.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
import docx2txt
|
| 4 |
+
from openpyxl import load_workbook
|
| 5 |
+
from PIL import Image
|
| 6 |
+
from pypdf import PdfReader
|
| 7 |
+
import pytesseract
|
| 8 |
+
|
| 9 |
+
from app.core.exceptions import ExtractionError
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
DOC_EXTENSIONS = {".pdf", ".docx", ".xlsx"}
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff"}
TEXT_EXTENSIONS = {".txt", ".md", ".csv", ".json"}


class ExtractionService:
    """Extracts plain text from documents, images and text files."""

    @staticmethod
    def _extract_pdf(file_path: Path) -> str:
        """Return the text of every non-blank PDF page, joined by newlines."""
        reader = PdfReader(str(file_path))
        page_texts = (page.extract_text() or "" for page in reader.pages)
        return "\n".join(text for text in page_texts if text.strip())

    @staticmethod
    def _extract_docx(file_path: Path) -> str:
        """Return the text of a ``.docx`` file."""
        return docx2txt.process(str(file_path))

    @staticmethod
    def _extract_xlsx(file_path: Path) -> str:
        """Return one line per non-empty row, cells separated by spaces."""
        workbook = load_workbook(filename=str(file_path), read_only=True, data_only=True)
        try:
            chunks: list[str] = []
            for sheet in workbook.worksheets:
                for row in sheet.iter_rows(values_only=True):
                    row_values = [str(value).strip() for value in row if value is not None and str(value).strip()]
                    if row_values:
                        chunks.append(" ".join(row_values))
            return "\n".join(chunks)
        finally:
            # Close the workbook even when reading a sheet fails.
            workbook.close()

    @staticmethod
    def _extract_image(file_path: Path) -> str:
        """Return the text recognised in an image by OCR."""
        # The context manager releases the image's file handle after OCR.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)

    def extract_text(self, file_name: str, file_path: Path) -> str:
        """Extract text from *file_path*, choosing the method from *file_name*.

        The extension is taken from *file_name* (case-insensitive), not from
        the stored path.

        Raises:
            ExtractionError: for an unsupported extension, or when the
                underlying extractor fails for any reason.
        """
        extension = Path(file_name).suffix.lower()

        try:
            if extension == ".pdf":
                return self._extract_pdf(file_path)
            if extension == ".docx":
                return self._extract_docx(file_path)
            if extension == ".xlsx":
                return self._extract_xlsx(file_path)
            if extension in IMAGE_EXTENSIONS:
                return self._extract_image(file_path)
            if extension in TEXT_EXTENSIONS:
                return file_path.read_text(encoding="utf-8", errors="ignore")

            raise ExtractionError(f"Unsupported file extension: {extension or 'unknown'}")
        except ExtractionError:
            raise
        except Exception as exc:
            raise ExtractionError("Failed to extract text from file") from exc
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
extraction_service = ExtractionService()
|
app/services/file_storage_service.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
from uuid import uuid4
|
| 3 |
+
|
| 4 |
+
from fastapi import UploadFile
|
| 5 |
+
|
| 6 |
+
from app.core.config import settings
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class FileStorageService:
|
| 10 |
+
def __init__(self) -> None:
|
| 11 |
+
settings.static_dir.mkdir(parents=True, exist_ok=True)
|
| 12 |
+
settings.upload_dir.mkdir(parents=True, exist_ok=True)
|
| 13 |
+
|
| 14 |
+
def save_upload(self, upload_file: UploadFile) -> Path:
|
| 15 |
+
original_name = Path(upload_file.filename or "uploaded_file").name
|
| 16 |
+
safe_name = original_name.replace("/", "_")
|
| 17 |
+
target_path = settings.upload_dir / f"{uuid4().hex}_{safe_name}"
|
| 18 |
+
|
| 19 |
+
with target_path.open("wb") as destination:
|
| 20 |
+
while True:
|
| 21 |
+
chunk = upload_file.file.read(1024 * 1024)
|
| 22 |
+
if not chunk:
|
| 23 |
+
break
|
| 24 |
+
destination.write(chunk)
|
| 25 |
+
|
| 26 |
+
return target_path
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
file_storage_service = FileStorageService()
|
app/services/label_service.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.core.config import settings
|
| 2 |
+
from app.models.label_config import LabelConfig
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class LabelService:
    """Holds the active label list, seeded from ``settings.default_labels_csv``."""

    def __init__(self) -> None:
        self._config = LabelConfig(labels=self._split_csv(settings.default_labels_csv))

    @staticmethod
    def _split_csv(raw: str) -> list[str]:
        """Split a comma-separated string into stripped, non-blank entries."""
        stripped_parts = (part.strip() for part in raw.split(","))
        return [part for part in stripped_parts if part]

    def get_labels(self) -> list[str]:
        """Return the currently configured labels."""
        return self._config.get_labels()

    def set_labels_from_csv(self, labels_csv: str) -> list[str]:
        """Replace the labels with those parsed from *labels_csv* and return them."""
        return self._config.set_labels(self._split_csv(labels_csv))
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
label_service = LabelService()
|
app/services/language_service.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
|
| 3 |
+
from app.core.config import settings
|
| 4 |
+
from app.core.exceptions import LanguageDetectionError
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class LanguageService:
    """HTTP client for the remote language detector."""

    def __init__(self) -> None:
        # One session is reused for every request made by this service.
        self._session = requests.Session()

    @staticmethod
    def _extract_language(payload: object) -> str:
        """Pull the language label out of a decoded detector response.

        The response must be a dict whose ``"data"`` is a non-empty list; the
        first element is either the label itself or a dict with a ``"label"``.

        Raises:
            LanguageDetectionError: if the shape is wrong or the label is not
                a non-blank string.
        """
        data = payload.get("data") if isinstance(payload, dict) else None
        if not data or not isinstance(data, list):
            raise LanguageDetectionError("Language detector response missing 'data' field")

        first = data[0]
        label = first.get("label") if isinstance(first, dict) else first

        if not isinstance(label, str) or not label.strip():
            raise LanguageDetectionError("Language detector did not return a valid label")

        return label.strip()

    def detect_language(self, text: str) -> str:
        """Send *text* to the detector and return the detected language label.

        Raises:
            LanguageDetectionError: if the request fails, the body is not
                valid JSON, or the response carries no usable label.
        """
        try:
            response = self._session.post(
                settings.language_detector_url,
                json={"data": [text]},
                timeout=settings.request_timeout_seconds,
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            raise LanguageDetectionError("Language detection request failed") from exc

        # Decode in a separate ``try``: the JSON decode error raised by
        # ``requests`` is also a ``RequestException``, so sharing one ``try``
        # with the request would report bad JSON as a failed request.
        try:
            payload = response.json()
        except ValueError as exc:
            raise LanguageDetectionError("Language detector returned invalid JSON") from exc

        return self._extract_language(payload)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
language_service = LanguageService()
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
services:
|
| 2 |
+
classifier-api:
|
| 3 |
+
build:
|
| 4 |
+
context: .
|
| 5 |
+
dockerfile: Dockerfile
|
| 6 |
+
container_name: classifier-general-api
|
| 7 |
+
environment:
|
| 8 |
+
APP_NAME: ${APP_NAME:-Classifier General API}
|
| 9 |
+
ENVIRONMENT: ${ENVIRONMENT:-development}
|
| 10 |
+
DEBUG: ${DEBUG:-false}
|
| 11 |
+
STATIC_DIR: ${STATIC_DIR:-static}
|
| 12 |
+
UPLOAD_SUBDIR: ${UPLOAD_SUBDIR:-uploads}
|
| 13 |
+
CLASSIFIER_SPACE: ${CLASSIFIER_SPACE:-https://ayoubchlin-ayoubchlin-stable-bart-mnli-cnn.hf.space/}
|
| 14 |
+
CLASSIFIER_API_NAME: ${CLASSIFIER_API_NAME:-/predict}
|
| 15 |
+
HUGGINGFACE_TOKEN: ${HUGGINGFACE_TOKEN:-}
|
| 16 |
+
LANGUAGE_DETECTOR_URL: ${LANGUAGE_DETECTOR_URL:-https://team-language-detector-languagedetector.hf.space/run/predict}
|
| 17 |
+
REQUEST_TIMEOUT_SECONDS: ${REQUEST_TIMEOUT_SECONDS:-30}
|
| 18 |
+
DEFAULT_LABELS_CSV: ${DEFAULT_LABELS_CSV:-news,sport,finance,politics}
|
| 19 |
+
ports:
|
| 20 |
+
- "4002:4002"
|
| 21 |
+
volumes:
|
| 22 |
+
- ./static:/app/static
|
| 23 |
+
healthcheck:
|
| 24 |
+
test: ["CMD", "curl", "-f", "http://localhost:4002/health/liveness"]
|
| 25 |
+
interval: 10s
|
| 26 |
+
timeout: 4s
|
| 27 |
+
retries: 6
|
docs/README.md
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Classifier-General Documentation
|
| 2 |
+
|
| 3 |
+
This folder contains reverse-engineered documentation generated from repository evidence.
|
| 4 |
+
|
| 5 |
+
## Doc Map (Diataxis)
|
| 6 |
+
- Tutorial:
|
| 7 |
+
- `docs/tutorials/getting-started.md`
|
| 8 |
+
- How-to guides:
|
| 9 |
+
- `docs/how-to/run-locally.md`
|
| 10 |
+
- `docs/how-to/deploy-with-docker-compose.md`
|
| 11 |
+
- Reference:
|
| 12 |
+
- `docs/reference/configuration.md`
|
| 13 |
+
- `docs/reference/api.md`
|
| 14 |
+
- `docs/reference/runtime-state.md`
|
| 15 |
+
- Explanation:
|
| 16 |
+
- `docs/explanation/architecture.md`
|
| 17 |
+
- `docs/explanation/decisions.md`
|
| 18 |
+
|
| 19 |
+
## Scope
|
| 20 |
+
- Covered modules: classification routes, text preprocessing, extraction pipeline, remote classifier/language services, label config.
|
| 21 |
+
- This service has no relational database layer; runtime state is file system + in-memory labels.
|
| 22 |
+
|
| 23 |
+
## Evidence anchors
|
| 24 |
+
- `app/main.py`
|
| 25 |
+
- `app/api/router.py`
|
| 26 |
+
- `app/routers/*.py`
|
| 27 |
+
- `app/pipelines/*.py`
|
| 28 |
+
- `app/services/*.py`
|
| 29 |
+
- `app/core/*.py`
|
| 30 |
+
- `app/schemas/*.py`
|
| 31 |
+
- `docker-compose.yml`
|
| 32 |
+
- `Dockerfile`
|
| 33 |
+
- `tests/test_routes.py`
|
docs/explanation/architecture.md
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Architecture Explanation
|
| 2 |
+
|
| 3 |
+
## 1. Executive summary
|
| 4 |
+
`classifier-general` is a single FastAPI service that classifies text and files by combining local extraction/preprocessing with two remote AI endpoints (topic classifier and language detector).
|
| 5 |
+
|
| 6 |
+
Evidence:
|
| 7 |
+
- `app/main.py`
|
| 8 |
+
- `app/routers/classification.py`
|
| 9 |
+
- `app/services/classifier_service.py`
|
| 10 |
+
- `app/services/language_service.py`
|
| 11 |
+
|
| 12 |
+
## 2. Purpose and scope
|
| 13 |
+
### What exists
|
| 14 |
+
- Contract-compatible endpoints for classify/language/transform flows.
|
| 15 |
+
- Pipeline split into preprocess, extraction, classification orchestration.
|
| 16 |
+
- Configurable runtime through environment variables.
|
| 17 |
+
|
| 18 |
+
Evidence:
|
| 19 |
+
- `app/routers/classification.py`
|
| 20 |
+
- `app/pipelines/classification_pipeline.py`
|
| 21 |
+
- `app/core/config.py`
|
| 22 |
+
|
| 23 |
+
### How it works
|
| 24 |
+
- Router accepts JSON or multipart requests.
|
| 25 |
+
- Files are written to disk (`static/uploads`).
|
| 26 |
+
- Extraction service parses document/image/text into plain text.
|
| 27 |
+
- Text preprocessing enforces minimum quality.
|
| 28 |
+
- Pipeline calls language and classifier services.
|
| 29 |
+
|
| 30 |
+
Evidence:
|
| 31 |
+
- `app/services/file_storage_service.py`
|
| 32 |
+
- `app/services/extraction_service.py`
|
| 33 |
+
- `app/pipelines/text_pipeline.py`
|
| 34 |
+
- `app/pipelines/classification_pipeline.py`
|
| 35 |
+
|
| 36 |
+
### Why designed this way (inferred)
|
| 37 |
+
- Maintain old API contract while introducing modular services and safer config handling.
|
| 38 |
+
|
| 39 |
+
## 3. C4-style views
|
| 40 |
+
### Context view
|
| 41 |
+
Actors/systems:
|
| 42 |
+
- API client sending text/files.
|
| 43 |
+
- External classifier endpoint (`CLASSIFIER_SPACE`).
|
| 44 |
+
- External language detector endpoint (`LANGUAGE_DETECTOR_URL`).
|
| 45 |
+
- Local filesystem for uploaded files.
|
| 46 |
+
|
| 47 |
+
Evidence:
|
| 48 |
+
- `app/core/config.py`
|
| 49 |
+
- `app/services/classifier_service.py`
|
| 50 |
+
- `app/services/language_service.py`
|
| 51 |
+
|
| 52 |
+
### Container view
|
| 53 |
+
- One container/service (`classifier-api`) with FastAPI + OCR binary.
|
| 54 |
+
|
| 55 |
+
Evidence:
|
| 56 |
+
- `docker-compose.yml`
|
| 57 |
+
- `Dockerfile`
|
| 58 |
+
|
| 59 |
+
### Component view
|
| 60 |
+
- API routing: `app/routers/*`
|
| 61 |
+
- Orchestration pipelines: `app/pipelines/*`
|
| 62 |
+
- Integration services: `app/services/classifier_service.py`, `app/services/language_service.py`
|
| 63 |
+
- Extraction + storage services: `app/services/extraction_service.py`, `app/services/file_storage_service.py`
|
| 64 |
+
- Config/exceptions/schemas: `app/core/*`, `app/schemas/*`
|
| 65 |
+
|
| 66 |
+
### Code-level workflow: file classification
|
| 67 |
+
1. `POST /classify` receives file.
|
| 68 |
+
2. File saved to upload directory.
|
| 69 |
+
3. Text extracted by extension-specific handlers.
|
| 70 |
+
4. Text preprocessed (regex cleanup + min words).
|
| 71 |
+
5. Language detector called.
|
| 72 |
+
6. Classifier called with CSV labels converted to joined text.
|
| 73 |
+
7. Response returns `{label, language, type}`, where `type` is `"not english"` when the detected language is not `en` and `null` otherwise.
|
| 74 |
+
|
| 75 |
+
Evidence:
|
| 76 |
+
- `app/routers/classification.py`
|
| 77 |
+
- `app/services/file_storage_service.py`
|
| 78 |
+
- `app/services/extraction_service.py`
|
| 79 |
+
- `app/pipelines/text_pipeline.py`
|
| 80 |
+
- `app/pipelines/classification_pipeline.py`
|
| 81 |
+
|
| 82 |
+
## 4. Cross-cutting concerns
|
| 83 |
+
### Validation and error mapping
|
| 84 |
+
- Input schemas use strict `extra=forbid`.
|
| 85 |
+
- Error mapping explicitly separates validation/extraction (400) from upstream AI failures (502).
|
| 86 |
+
|
| 87 |
+
Evidence:
|
| 88 |
+
- `app/schemas/classification.py`
|
| 89 |
+
- `app/routers/classification.py`
|
| 90 |
+
|
| 91 |
+
### Configuration and secrets
|
| 92 |
+
- Runtime config sourced from env.
|
| 93 |
+
- The HF token is optional, and no secret is hardcoded in the current service code.
|
| 94 |
+
|
| 95 |
+
Evidence:
|
| 96 |
+
- `app/core/config.py`
|
| 97 |
+
- `app/services/classifier_service.py`
|
| 98 |
+
|
| 99 |
+
### Concurrency and mutable state
|
| 100 |
+
- Labels guarded by thread lock (`LabelConfig._lock`).
|
| 101 |
+
- State is still process-local; multi-instance deployments can diverge.
|
| 102 |
+
|
| 103 |
+
Evidence:
|
| 104 |
+
- `app/models/label_config.py`
|
| 105 |
+
- `app/services/label_service.py`
|
| 106 |
+
|
| 107 |
+
### Testing strategy
|
| 108 |
+
- Route contract tests monkeypatch pipeline methods for deterministic tests.
|
| 109 |
+
- Tests validate response shape and key endpoint behavior, not remote network calls.
|
| 110 |
+
|
| 111 |
+
Evidence:
|
| 112 |
+
- `tests/test_routes.py`
|
| 113 |
+
|
| 114 |
+
## 5. Risks, gaps, and technical debt
|
| 115 |
+
- External endpoint dependency introduces latency and runtime failure risk.
|
| 116 |
+
- No upload retention/cleanup process.
|
| 117 |
+
- Readiness check does not probe external AI services, only local label readiness.
|
| 118 |
+
- No authentication/authorization layer on API endpoints.
|
| 119 |
+
|
| 120 |
+
Evidence:
|
| 121 |
+
- `app/services/language_service.py`
|
| 122 |
+
- `app/services/classifier_service.py`
|
| 123 |
+
- `app/routers/health.py`
|
| 124 |
+
- `app/routers/classification.py`
|
| 125 |
+
|
| 126 |
+
## 6. Unknown or inferred
|
| 127 |
+
- Unknown: expected SLA and acceptable latency.
|
| 128 |
+
- Unknown: intended persistence/retention policy for uploaded files.
|
| 129 |
+
- Inferred: service is optimized for local/dev contract compatibility and integration testing.
|
docs/explanation/decisions.md
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Architecture Decisions (ADR-style)
|
| 2 |
+
|
| 3 |
+
## ADR-001: Use modular FastAPI layout for classifier backend
|
| 4 |
+
- Status: Accepted
|
| 5 |
+
- Type: Explicit
|
| 6 |
+
- Evidence:
|
| 7 |
+
- `app/main.py`
|
| 8 |
+
- `app/api/router.py`
|
| 9 |
+
- `app/routers/`
|
| 10 |
+
- `app/services/`
|
| 11 |
+
- `app/pipelines/`
|
| 12 |
+
- Rationale:
|
| 13 |
+
- Clear separation between transport, orchestration, and integrations.
|
| 14 |
+
|
| 15 |
+
## ADR-002: Preserve endpoint contracts from prior service behavior
|
| 16 |
+
- Status: Accepted
|
| 17 |
+
- Type: Explicit
|
| 18 |
+
- Evidence:
|
| 19 |
+
- `app/routers/classification.py`
|
| 20 |
+
- `tests/test_routes.py`
|
| 21 |
+
- Rationale:
|
| 22 |
+
- Keep clients functional while refactoring internals.
|
| 23 |
+
|
| 24 |
+
## ADR-003: Use remote HF/Gradio endpoint for classification
|
| 25 |
+
- Status: Accepted
|
| 26 |
+
- Type: Explicit
|
| 27 |
+
- Evidence:
|
| 28 |
+
- `app/core/config.py`
|
| 29 |
+
- `app/services/classifier_service.py`
|
| 30 |
+
- Rationale:
|
| 31 |
+
- Avoid shipping local model runtime in this service.
|
| 32 |
+
|
| 33 |
+
## ADR-004: Use remote language detector HTTP endpoint
|
| 34 |
+
- Status: Accepted
|
| 35 |
+
- Type: Explicit
|
| 36 |
+
- Evidence:
|
| 37 |
+
- `app/services/language_service.py`
|
| 38 |
+
- `app/core/config.py`
|
| 39 |
+
- Rationale:
|
| 40 |
+
- Decouple language detection model from this codebase.
|
| 41 |
+
|
| 42 |
+
## ADR-005: Keep labels in in-memory mutable config
|
| 43 |
+
- Status: Accepted (current), Needs review
|
| 44 |
+
- Type: Explicit
|
| 45 |
+
- Evidence:
|
| 46 |
+
- `app/models/label_config.py`
|
| 47 |
+
- `app/services/label_service.py`
|
| 48 |
+
- `app/routers/classification.py` (`/configlabel`, `/labels`)
|
| 49 |
+
- Rationale:
|
| 50 |
+
- Simple runtime tuning without DB migration.
|
| 51 |
+
- Tradeoff:
|
| 52 |
+
- No persistence across restarts, no cross-instance consistency.
|
| 53 |
+
|
| 54 |
+
## ADR-006: Store uploaded files on local filesystem under static mount
|
| 55 |
+
- Status: Accepted
|
| 56 |
+
- Type: Explicit
|
| 57 |
+
- Evidence:
|
| 58 |
+
- `app/services/file_storage_service.py`
|
| 59 |
+
- `app/main.py`
|
| 60 |
+
- `docker-compose.yml`
|
| 61 |
+
- Rationale:
|
| 62 |
+
- Enables document/image extraction workflow with minimal infrastructure.
|
| 63 |
+
|
| 64 |
+
## ADR-007: Map errors into contract-friendly HTTP statuses
|
| 65 |
+
- Status: Accepted
|
| 66 |
+
- Type: Explicit
|
| 67 |
+
- Evidence:
|
| 68 |
+
- `app/routers/classification.py`
|
| 69 |
+
- `app/core/exceptions.py`
|
| 70 |
+
- Rationale:
|
| 71 |
+
- Differentiate local validation issues (`400`) from upstream AI failures (`502`).
|
| 72 |
+
|
| 73 |
+
## ADR-008: No built-in auth layer for this API
|
| 74 |
+
- Status: Accepted (current), Needs review
|
| 75 |
+
- Type: Inferred
|
| 76 |
+
- Evidence:
|
| 77 |
+
- `app/routers/classification.py`
|
| 78 |
+
- absence of auth dependencies/middleware in `app/main.py`
|
| 79 |
+
- Rationale:
|
| 80 |
+
- Likely positioned as an internal or early-stage service.
|
docs/how-to/deploy-with-docker-compose.md
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Deploy With Docker Compose
|
| 2 |
+
|
| 3 |
+
## Topology
|
| 4 |
+
- Single container: `classifier-api`
|
| 5 |
+
- Volume mount: `./static:/app/static` for persisted uploaded files
|
| 6 |
+
- Healthcheck: `GET /health/liveness`
|
| 7 |
+
|
| 8 |
+
Evidence:
|
| 9 |
+
- `docker-compose.yml`
|
| 10 |
+
|
| 11 |
+
## Command
|
| 12 |
+
```bash
|
| 13 |
+
cd classifier-general
|
| 14 |
+
docker compose up -d --build
|
| 15 |
+
```
|
| 16 |
+
|
| 17 |
+
## Verify
|
| 18 |
+
```bash
|
| 19 |
+
docker compose ps
|
| 20 |
+
curl -s http://localhost:4002/health/liveness
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
## Production hardening gaps
|
| 24 |
+
- No reverse proxy/TLS config in this repo.
|
| 25 |
+
- External AI dependencies are hard network dependencies at runtime.
|
| 26 |
+
- No horizontal scaling coordination for in-memory labels (`/configlabel` mutates process-local state).
|
| 27 |
+
|
| 28 |
+
Evidence:
|
| 29 |
+
- `app/services/language_service.py`
|
| 30 |
+
- `app/services/classifier_service.py`
|
| 31 |
+
- `app/services/label_service.py`
|
| 32 |
+
|
| 33 |
+
## Unknown or inferred
|
| 34 |
+
- Unknown: expected deployment platform (only Docker artifacts are present).
|
| 35 |
+
- Inferred: this compose file targets local/dev usage.
|
docs/how-to/run-locally.md
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Run Locally (Dev Loop)
|
| 2 |
+
|
| 3 |
+
## 1. Install dependencies
|
| 4 |
+
```bash
|
| 5 |
+
cd classifier-general
|
| 6 |
+
python3 -m venv .venv
|
| 7 |
+
source .venv/bin/activate
|
| 8 |
+
pip install -r requirements.txt
|
| 9 |
+
```
|
| 10 |
+
|
| 11 |
+
Evidence:
|
| 12 |
+
- `requirements.txt`
|
| 13 |
+
|
| 14 |
+
## 2. Configure environment
|
| 15 |
+
```bash
|
| 16 |
+
cp .env.example .env
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
Critical settings:
|
| 20 |
+
- `CLASSIFIER_SPACE`
|
| 21 |
+
- `CLASSIFIER_API_NAME`
|
| 22 |
+
- `LANGUAGE_DETECTOR_URL`
|
| 23 |
+
- `DEFAULT_LABELS_CSV`
|
| 24 |
+
|
| 25 |
+
Evidence:
|
| 26 |
+
- `app/core/config.py`
|
| 27 |
+
- `.env.example`
|
| 28 |
+
|
| 29 |
+
## 3. Start server
|
| 30 |
+
```bash
|
| 31 |
+
uvicorn main:app --host 0.0.0.0 --port 4002 --reload
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
Evidence:
|
| 35 |
+
- `main.py`
|
| 36 |
+
- `app/main.py`
|
| 37 |
+
|
| 38 |
+
## 4. Test file-based endpoints
|
| 39 |
+
```bash
|
| 40 |
+
curl -s -X POST http://localhost:4002/api/transformer \
|
| 41 |
+
-F 'file=@/absolute/path/to/sample.pdf'
|
| 42 |
+
|
| 43 |
+
curl -s -X POST http://localhost:4002/classify \
|
| 44 |
+
-F 'file=@/absolute/path/to/sample.pdf'
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
Uploads are stored under `static/uploads` with random UUID prefixes.
|
| 48 |
+
|
| 49 |
+
Evidence:
|
| 50 |
+
- `app/services/file_storage_service.py`
|
| 51 |
+
- `app/services/extraction_service.py`
|
| 52 |
+
|
| 53 |
+
## Troubleshooting
|
| 54 |
+
- `400 Text must contain at least 4 words`:
|
| 55 |
+
- input failed preprocessing minimum-word rule.
|
| 56 |
+
- `502 Classifier request failed`:
|
| 57 |
+
- HF Space unreachable or incompatible response.
|
| 58 |
+
- OCR extraction quality is low:
|
| 59 |
+
- verify tesseract install and image quality.
|
| 60 |
+
|
| 61 |
+
Evidence:
|
| 62 |
+
- `app/pipelines/text_pipeline.py`
|
| 63 |
+
- `app/routers/classification.py`
|
| 64 |
+
- `Dockerfile`
|
docs/reference/api.md
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# API Reference
|
| 2 |
+
|
| 3 |
+
Base URL: `http://localhost:4002`
|
| 4 |
+
|
| 5 |
+
Evidence:
|
| 6 |
+
- `app/api/router.py`
|
| 7 |
+
- `app/routers/classification.py`
|
| 8 |
+
- `app/routers/health.py`
|
| 9 |
+
|
| 10 |
+
## Endpoints
|
| 11 |
+
|
| 12 |
+
| Method | Path | Request | Response |
|
| 13 |
+
|---|---|---|---|
|
| 14 |
+
| GET | `/health/liveness` | none | `{status}` |
|
| 15 |
+
| GET | `/health/readiness` | none | `{status, labels_count}` |
|
| 16 |
+
| GET | `/endpoint/` | none | list of routes/methods |
|
| 17 |
+
| POST | `/api/classifier` | `{text}` | `"<label>"` |
|
| 18 |
+
| POST | `/api/language` | `{text}` | `"<language>"` |
|
| 19 |
+
| POST | `/api/transformer` | multipart `file` | `{filename, content}` |
|
| 20 |
+
| POST | `/classify` | multipart `file` | `{label, language, type?}` |
|
| 21 |
+
| POST | `/configlabel` | `{text: "csv,labels"}` | `string[]` |
|
| 22 |
+
| GET | `/labels` | none | `string[]` |
|
| 23 |
+
|
| 24 |
+
## Validation and errors
|
| 25 |
+
- `400` for input validation and extraction problems.
|
| 26 |
+
- `502` for upstream classifier/language failures.
|
| 27 |
+
- `500` for unexpected failures.
|
| 28 |
+
|
| 29 |
+
Evidence:
|
| 30 |
+
- `app/routers/classification.py` (`_handle_exception`)
|
| 31 |
+
- `app/schemas/classification.py`
|
| 32 |
+
|
| 33 |
+
## Contract notes
|
| 34 |
+
- Contract returns plain string for `/api/classifier` and `/api/language` (not wrapped object).
|
| 35 |
+
- `/classify` always includes `type` in the response: it is `"not english"` when the language output is not `en`, and `null` otherwise.
|
| 36 |
+
|
| 37 |
+
Evidence:
|
| 38 |
+
- `app/routers/classification.py`
|
| 39 |
+
- `app/pipelines/classification_pipeline.py`
|
docs/reference/configuration.md
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration Reference
|
| 2 |
+
|
| 3 |
+
Configuration is managed with Pydantic Settings.
|
| 4 |
+
|
| 5 |
+
Evidence:
|
| 6 |
+
- `app/core/config.py`
|
| 7 |
+
- `.env.example`
|
| 8 |
+
- `docker-compose.yml`
|
| 9 |
+
|
| 10 |
+
## Application settings
|
| 11 |
+
|
| 12 |
+
| Variable | Default | Purpose |
|
| 13 |
+
|---|---|---|
|
| 14 |
+
| `APP_NAME` | `Classifier General API` | FastAPI title |
|
| 15 |
+
| `ENVIRONMENT` | `development` | environment label |
|
| 16 |
+
| `DEBUG` | `false` | debug mode |
|
| 17 |
+
|
| 18 |
+
## Filesystem settings
|
| 19 |
+
|
| 20 |
+
| Variable | Default | Purpose |
|
| 21 |
+
|---|---|---|
|
| 22 |
+
| `STATIC_DIR` | `static` | static root served at `/static` |
|
| 23 |
+
| `UPLOAD_SUBDIR` | `uploads` | upload directory under static root |
|
| 24 |
+
|
| 25 |
+
## Classifier integration settings
|
| 26 |
+
|
| 27 |
+
| Variable | Default | Purpose |
|
| 28 |
+
|---|---|---|
|
| 29 |
+
| `CLASSIFIER_SPACE` | `https://ayoubchlin-ayoubchlin-stable-bart-mnli-cnn.hf.space/` | Gradio/HF Space endpoint |
|
| 30 |
+
| `CLASSIFIER_API_NAME` | `/predict` | Gradio predict API name |
|
| 31 |
+
| `HUGGINGFACE_TOKEN` | empty | optional auth token for client init |
|
| 32 |
+
|
| 33 |
+
## Language detector settings
|
| 34 |
+
|
| 35 |
+
| Variable | Default | Purpose |
|
| 36 |
+
|---|---|---|
|
| 37 |
+
| `LANGUAGE_DETECTOR_URL` | `https://team-language-detector-languagedetector.hf.space/run/predict` | remote language endpoint |
|
| 38 |
+
| `REQUEST_TIMEOUT_SECONDS` | `30` | HTTP timeout for language requests |
|
| 39 |
+
|
| 40 |
+
## Label settings
|
| 41 |
+
|
| 42 |
+
| Variable | Default | Purpose |
|
| 43 |
+
|---|---|---|
|
| 44 |
+
| `DEFAULT_LABELS_CSV` | `news,sport,finance,politics` | initial in-memory labels |
|
| 45 |
+
|
| 46 |
+
## Behavior notes
|
| 47 |
+
- Labels are process-local in memory and reset on restart.
|
| 48 |
+
- Upload directory is auto-created at app startup.
|
| 49 |
+
|
| 50 |
+
Evidence:
|
| 51 |
+
- `app/services/label_service.py`
|
| 52 |
+
- `app/models/label_config.py`
|
| 53 |
+
- `app/main.py`
|
docs/reference/runtime-state.md
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Runtime State Reference
|
| 2 |
+
|
| 3 |
+
This service does not define a relational database. State exists in memory and filesystem.
|
| 4 |
+
|
| 5 |
+
Evidence:
|
| 6 |
+
- no `app/database/` package
|
| 7 |
+
- `app/models/label_config.py`
|
| 8 |
+
- `app/services/file_storage_service.py`
|
| 9 |
+
|
| 10 |
+
## In-memory state
|
| 11 |
+
|
| 12 |
+
| State | Location | Lifecycle |
|
| 13 |
+
|---|---|---|
|
| 14 |
+
| Active labels list | `LabelConfig.labels` | initialized at process start from `DEFAULT_LABELS_CSV`; mutable via `/configlabel`; reset on restart |
|
| 15 |
+
|
| 16 |
+
Evidence:
|
| 17 |
+
- `app/services/label_service.py`
|
| 18 |
+
- `app/models/label_config.py`
|
| 19 |
+
|
| 20 |
+
## Filesystem state
|
| 21 |
+
|
| 22 |
+
| State | Location | Lifecycle |
|
| 23 |
+
|---|---|---|
|
| 24 |
+
| Uploaded files | `STATIC_DIR/UPLOAD_SUBDIR` (default `static/uploads`) | created per upload; not automatically deleted by app |
|
| 25 |
+
| Static mount | `/static` route | served directly by FastAPI static files |
|
| 26 |
+
|
| 27 |
+
Evidence:
|
| 28 |
+
- `app/core/config.py`
|
| 29 |
+
- `app/main.py`
|
| 30 |
+
- `app/services/file_storage_service.py`
|
| 31 |
+
|
| 32 |
+
## External runtime dependencies
|
| 33 |
+
|
| 34 |
+
| Dependency | Usage |
|
| 35 |
+
|---|---|
|
| 36 |
+
| HF/Gradio classifier Space | text topic classification |
|
| 37 |
+
| Language detector endpoint | language inference |
|
| 38 |
+
| Tesseract binary | OCR extraction for images |
|
| 39 |
+
|
| 40 |
+
Evidence:
|
| 41 |
+
- `app/services/classifier_service.py`
|
| 42 |
+
- `app/services/language_service.py`
|
| 43 |
+
- `app/services/extraction_service.py`
|
| 44 |
+
- `Dockerfile`
|
| 45 |
+
|
| 46 |
+
## Unknown or inferred
|
| 47 |
+
- Unknown: long-term retention policy for uploaded files.
|
| 48 |
+
- Inferred: `static/uploads` can grow unbounded without cleanup process.
|
docs/tutorials/getting-started.md
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Getting Started
|
| 2 |
+
|
| 3 |
+
This tutorial runs the classifier API and validates endpoint contracts.
|
| 4 |
+
|
| 5 |
+
## Prerequisites
|
| 6 |
+
- Docker and Docker Compose
|
| 7 |
+
- Internet access for the external classifier/language services (not needed for the route contract tests, which monkeypatch the pipeline methods)
|
| 8 |
+
|
| 9 |
+
Evidence:
|
| 10 |
+
- `docker-compose.yml`
|
| 11 |
+
- `app/services/classifier_service.py`
|
| 12 |
+
- `app/services/language_service.py`
|
| 13 |
+
|
| 14 |
+
## 1. Prepare environment
|
| 15 |
+
```bash
|
| 16 |
+
cd classifier-general
|
| 17 |
+
cp .env.example .env
|
| 18 |
+
```
|
| 19 |
+
|
| 20 |
+
Evidence:
|
| 21 |
+
- `.env.example`
|
| 22 |
+
- `app/core/config.py`
|
| 23 |
+
|
| 24 |
+
## 2. Start API in Docker
|
| 25 |
+
```bash
|
| 26 |
+
docker compose up --build
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
Service:
|
| 30 |
+
- `classifier-api` exposed on `4002`
|
| 31 |
+
|
| 32 |
+
Evidence:
|
| 33 |
+
- `docker-compose.yml`
|
| 34 |
+
|
| 35 |
+
## 3. Check health
|
| 36 |
+
```bash
|
| 37 |
+
curl -s http://localhost:4002/health/liveness
|
| 38 |
+
curl -s http://localhost:4002/health/readiness
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
Expected:
|
| 42 |
+
- `{"status":"ok"}`
|
| 43 |
+
- `{"status":"ready","labels_count":<n>}`
|
| 44 |
+
|
| 45 |
+
Evidence:
|
| 46 |
+
- `app/routers/health.py`
|
| 47 |
+
|
| 48 |
+
## 4. Call text classification
|
| 49 |
+
```bash
|
| 50 |
+
curl -s -X POST http://localhost:4002/api/classifier \
|
| 51 |
+
-H 'content-type: application/json' \
|
| 52 |
+
-d '{"text":"This is a long enough sentence for classification."}'
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
Evidence:
|
| 56 |
+
- `app/routers/classification.py`
|
| 57 |
+
- `app/pipelines/classification_pipeline.py`
|
| 58 |
+
|
| 59 |
+
## 5. Update labels and verify
|
| 60 |
+
```bash
|
| 61 |
+
curl -s -X POST http://localhost:4002/configlabel \
|
| 62 |
+
-H 'content-type: application/json' \
|
| 63 |
+
-d '{"text":"tech,health,legal"}'
|
| 64 |
+
|
| 65 |
+
curl -s http://localhost:4002/labels
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
Evidence:
|
| 69 |
+
- `app/services/label_service.py`
|
| 70 |
+
- `app/models/label_config.py`
|
| 71 |
+
|
| 72 |
+
## 6. Run route contract tests
|
| 73 |
+
```bash
|
| 74 |
+
docker build -t classifier-general-refactor .
|
| 75 |
+
docker run --rm -w /app -e PYTHONPATH=/app classifier-general-refactor pytest -q
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
Evidence:
|
| 79 |
+
- `tests/test_routes.py`
|
main.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.main import app
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
if __name__ == "__main__":
    # uvicorn is imported only when this file is executed as a script.
    from uvicorn import run as serve

    # Serve the re-exported app on all interfaces, port 4002, with reload on.
    serve("main:app", host="0.0.0.0", port=4002, reload=True)
|
requirements.txt
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.115.8
|
| 2 |
+
uvicorn[standard]==0.34.0
|
| 3 |
+
pydantic==2.10.6
|
| 4 |
+
pydantic-settings==2.7.1
|
| 5 |
+
requests==2.32.3
|
| 6 |
+
gradio_client==1.7.0
|
| 7 |
+
python-multipart==0.0.20
|
| 8 |
+
|
| 9 |
+
pytesseract==0.3.13
|
| 10 |
+
Pillow==11.1.0
|
| 11 |
+
pypdf==5.4.0
|
| 12 |
+
docx2txt==0.8
|
| 13 |
+
openpyxl==3.1.5
|
| 14 |
+
|
| 15 |
+
pytest==8.3.4
|
| 16 |
+
httpx==0.28.1
|
tests/test_routes.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from io import BytesIO
|
| 2 |
+
|
| 3 |
+
from fastapi.testclient import TestClient
|
| 4 |
+
|
| 5 |
+
from app.main import app
|
| 6 |
+
from app.pipelines.classification_pipeline import classification_pipeline
|
| 7 |
+
|
| 8 |
+
client = TestClient(app)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def test_classifier_endpoint_contract(monkeypatch):
    """`POST /api/classifier` answers 200 with the bare label string."""

    def fake_classify(text):
        # Stand-in for the pipeline so the test never leaves the process.
        return "news"

    monkeypatch.setattr(classification_pipeline, "classify_text", fake_classify)

    request_body = {"text": "This is a long enough sentence for classification."}
    reply = client.post("/api/classifier", json=request_body)

    assert reply.status_code == 200
    assert reply.json() == "news"
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def test_language_endpoint_contract(monkeypatch):
    """`POST /api/language` answers 200 with the bare language string."""

    def fake_detect(text):
        # Stand-in for the pipeline's language detection.
        return "en"

    monkeypatch.setattr(classification_pipeline, "detect_language", fake_detect)

    request_body = {"text": "This is a language detection sample text."}
    reply = client.post("/api/language", json=request_body)

    assert reply.status_code == 200
    assert reply.json() == "en"
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def test_labels_config_roundtrip():
    """Labels posted to `/configlabel` are echoed back and served by `/labels`."""
    expected_labels = ["tech", "health", "legal"]

    # The CSV deliberately contains spaces after the commas.
    update_reply = client.post("/configlabel", json={"text": "tech, health, legal"})
    assert update_reply.status_code == 200
    assert update_reply.json() == expected_labels

    listing_reply = client.get("/labels")
    assert listing_reply.status_code == 200
    assert listing_reply.json() == expected_labels
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def test_transform_file_contract(monkeypatch):
    """`POST /api/transformer` answers 200 with the filename and extracted content."""

    def fake_transform(filename, path):
        # Stand-in for the pipeline's file extraction.
        return "extracted content"

    monkeypatch.setattr(classification_pipeline, "transform_file", fake_transform)

    upload = ("sample.txt", BytesIO(b"hello"), "text/plain")
    reply = client.post("/api/transformer", files={"file": upload})

    assert reply.status_code == 200
    assert reply.json()["filename"] == "sample.txt"
    assert reply.json()["content"] == "extracted content"
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def test_classify_file_contract(monkeypatch):
    """`POST /classify` answers 200 with label, language and a null `type`."""

    def fake_classify_file(filename, path):
        # Stand-in for the pipeline; note it supplies no "type" key.
        return {"label": "finance", "language": "en"}

    monkeypatch.setattr(classification_pipeline, "classify_file", fake_classify_file)

    upload = ("sample.txt", BytesIO(b"hello"), "text/plain")
    reply = client.post("/classify", files={"file": upload})

    assert reply.status_code == 200
    assert reply.json() == {"label": "finance", "language": "en", "type": None}
|