AyoubChLin committed on
Commit
50231a8
·
1 Parent(s): 3d9d878

feat: initial commit of Classifier General API with FastAPI

Browse files

- Added Spacefile for deployment configuration.
- Created app structure with core, api, models, pipelines, routers, and services.
- Implemented classification and language detection pipelines.
- Integrated file extraction and storage services.
- Established API endpoints for classification, language detection, and file transformation.
- Added health check endpoints for service liveness and readiness.
- Configured Pydantic settings for environment-based configuration.
- Developed tests for route contracts and functionality.
- Included Docker Compose setup for local development and deployment.
- Documented architecture, decisions, and usage in the README and other markdown files.

.dockerignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+ .pytest_cache/
7
+ .env
8
+ aws/
9
+ awscliv2.zip
.env.example ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ APP_NAME=Classifier General API
2
+ ENVIRONMENT=development
3
+ DEBUG=false
4
+
5
+ STATIC_DIR=static
6
+ UPLOAD_SUBDIR=uploads
7
+
8
+ CLASSIFIER_SPACE=https://ayoubchlin-ayoubchlin-stable-bart-mnli-cnn.hf.space/
9
+ CLASSIFIER_API_NAME=/predict
10
+ HUGGINGFACE_TOKEN=
11
+
12
+ LANGUAGE_DETECTOR_URL=https://team-language-detector-languagedetector.hf.space/run/predict
13
+ REQUEST_TIMEOUT_SECONDS=30
14
+
15
+ DEFAULT_LABELS_CSV=news,sport,finance,politics
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ .space
2
+ .env
3
+ __pycache__/
4
+ *.pyc
5
+ .pytest_cache/
6
+ static/uploads/
7
+ static/*
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ ENV PYTHONDONTWRITEBYTECODE=1 \
4
+ PYTHONUNBUFFERED=1
5
+
6
+ WORKDIR /app
7
+
8
+ RUN apt-get update \
9
+ && apt-get install -y --no-install-recommends tesseract-ocr curl \
10
+ && rm -rf /var/lib/apt/lists/*
11
+
12
+ COPY requirements.txt .
13
+ RUN pip install --no-cache-dir -r requirements.txt
14
+
15
+ COPY . .
16
+
17
+ EXPOSE 4002
18
+
19
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "4002"]
README.md CHANGED
@@ -9,4 +9,57 @@ license: mit
9
  short_description: classifier-general
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  short_description: classifier-general
10
  ---
11
 
12
+ # Classifier General API (Refactored)
13
+
14
+ Refactored into a modular FastAPI backend with clear layers:
15
+ - `app/routers`
16
+ - `app/services`
17
+ - `app/pipelines`
18
+ - `app/schemas`
19
+ - `app/models`
20
+ - `app/core`
21
+
22
+ ## Preserved Endpoint Contract
23
+ - `POST /api/classifier` -> returns label string
24
+ - `POST /api/language` -> returns language string
25
+ - `POST /api/transformer` -> returns `{ filename, content }`
26
+ - `POST /classify` -> returns `{ label, language, type? }`
27
+ - `POST /configlabel` -> returns labels array
28
+ - `GET /labels` -> returns labels array
29
+
30
+ Additional operational endpoints:
31
+ - `GET /health/liveness`
32
+ - `GET /health/readiness`
33
+ - `GET /endpoint/`
34
+
35
+ ## Environment
36
+ Copy and edit:
37
+ ```bash
38
+ cp .env.example .env
39
+ ```
40
+
41
+ Key vars:
42
+ - `CLASSIFIER_SPACE`
43
+ - `HUGGINGFACE_TOKEN`
44
+ - `LANGUAGE_DETECTOR_URL`
45
+ - `DEFAULT_LABELS_CSV`
46
+
47
+ ## Local Run
48
+ ```bash
49
+ pip install -r requirements.txt
50
+ uvicorn main:app --host 0.0.0.0 --port 4002 --reload
51
+ ```
52
+
53
+ ## Docker Run
54
+ ```bash
55
+ docker compose up --build
56
+ ```
57
+
58
+ ## Tests
59
+ ```bash
60
+ pytest -q
61
+ ```
62
+
63
+ ## Notes
64
+ - OCR requires `tesseract-ocr` (installed in Dockerfile).
65
+ - Supported extraction formats in this refactor: documents (`.pdf`, `.docx`, `.xlsx`), images via OCR (`.jpg`, `.jpeg`, `.png`, `.gif`, `.bmp`, `.tiff`), and plain text files (`.txt`, `.md`, `.csv`, `.json`).
Spacefile ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Spacefile Docs: https://go.deta.dev/docs/spacefile/v0
2
+ v: 0
3
+ micros:
4
+ - name: classifier-general
5
+ src: .
6
+ engine: python3.9
7
+ primary: true
8
+ public: true
9
+ run: uvicorn main:app
10
+ dev: uvicorn main:app --reload
app/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Classifier General backend package."""
app/api/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """API router assembly."""
app/api/router.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter
2
+
3
+ from app.routers import classification, health
4
+
5
+ api_router = APIRouter()
6
+ api_router.include_router(health.router)
7
+ api_router.include_router(classification.router)
app/core/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Core settings and shared utilities."""
app/core/config.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import lru_cache
2
+ from pathlib import Path
3
+
4
+ from pydantic import Field
5
+ from pydantic_settings import BaseSettings, SettingsConfigDict
6
+
7
+
8
class Settings(BaseSettings):
    # Runtime configuration. Values come from the process environment and an
    # optional ".env" file; keys that do not match a field are ignored.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # Application metadata.
    app_name: str = "Classifier General API"
    environment: str = "development"
    debug: bool = False

    # Local storage layout; uploads live in a sub-directory of the static dir.
    static_dir: Path = Path("static")
    upload_subdir: str = "uploads"

    # Remote classifier (Gradio space) connection details.
    classifier_space: str = "https://ayoubchlin-ayoubchlin-stable-bart-mnli-cnn.hf.space/"
    classifier_api_name: str = "/predict"
    huggingface_token: str | None = None

    # Remote language detector endpoint and the HTTP timeout applied to it.
    language_detector_url: str = "https://team-language-detector-languagedetector.hf.space/run/predict"
    request_timeout_seconds: float = 30.0

    # Comma-separated labels used until they are reconfigured at runtime.
    default_labels_csv: str = Field(default="news,sport,finance,politics")

    @property
    def upload_dir(self) -> Path:
        """Return the upload directory, i.e. ``static_dir`` joined with ``upload_subdir``."""
        return Path(self.static_dir, self.upload_subdir)
30
+
31
+
32
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Build the application settings once and return the cached instance afterwards."""
    loaded = Settings()
    return loaded
35
+
36
+
37
+ settings = get_settings()
app/core/exceptions.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class ClassificationError(Exception):
    """Raised when the classifier cannot be reached or yields no usable label."""
3
+
4
+
5
class LanguageDetectionError(Exception):
    """Raised when the language detector request fails or its response is unusable."""
7
+
8
+
9
class ExtractionError(Exception):
    """Raised when text cannot be extracted from an uploaded file."""
11
+
12
+
13
class ValidationError(Exception):
    """Raised when input text is missing or too short after preprocessing."""
app/main.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from fastapi.staticfiles import StaticFiles
3
+
4
+ from app.api.router import api_router
5
+ from app.core.config import settings
6
+
7
+ settings.static_dir.mkdir(parents=True, exist_ok=True)
8
+ settings.upload_dir.mkdir(parents=True, exist_ok=True)
9
+
10
+ app = FastAPI(title=settings.app_name, debug=settings.debug)
11
+ app.mount("/static", StaticFiles(directory=str(settings.static_dir)), name="static")
12
+ app.include_router(api_router)
13
+
14
+
15
@app.get("/endpoint/")
def list_endpoints() -> list[dict]:
    """List the registered HTTP endpoints and their methods.

    Returns:
        One ``{"endpoint": path, "methods": [...]}`` entry per route that
        accepts at least one of GET, POST, PUT or DELETE.
    """
    listed_methods = {"GET", "POST", "PUT", "DELETE"}
    endpoints = []
    for route in app.routes:
        # Not every route type exposes ``methods`` (the "/static" mount does
        # not), so read it defensively instead of raising AttributeError.
        route_methods = getattr(route, "methods", None) or set()
        methods = sorted(route_methods & listed_methods)
        if methods:
            endpoints.append({"endpoint": route.path, "methods": methods})
    return endpoints
app/models/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Domain models."""
app/models/label_config.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, field
2
+ from threading import Lock
3
+
4
+
5
@dataclass
class LabelConfig:
    """Mutable list of classification labels guarded by a lock."""

    labels: list[str] = field(default_factory=list)
    # The lock is an implementation detail: keep it out of the generated
    # __repr__ and __eq__ so two configs with equal labels compare equal.
    _lock: Lock = field(default_factory=Lock, repr=False, compare=False)

    def get_labels(self) -> list[str]:
        """Return a copy of the current labels."""
        with self._lock:
            return list(self.labels)

    def set_labels(self, labels: list[str]) -> list[str]:
        """Replace the labels with the stripped, non-empty entries of *labels*.

        Returns:
            A copy of the labels that were stored.
        """
        cleaned = [label.strip() for label in labels if label and label.strip()]
        with self._lock:
            self.labels = cleaned
            return list(self.labels)
app/pipelines/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """ML/OCR processing pipelines."""
app/pipelines/classification_pipeline.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ from app.core.exceptions import ClassificationError, ExtractionError, LanguageDetectionError, ValidationError
4
+ from app.pipelines.text_pipeline import preprocess_text
5
+ from app.services.classifier_service import classifier_service
6
+ from app.services.extraction_service import extraction_service
7
+ from app.services.label_service import label_service
8
+ from app.services.language_service import language_service
9
+
10
+
11
class ClassificationPipeline:
    """Orchestrates preprocessing, extraction, language detection and classification."""

    def classify_text(self, text: str) -> str:
        """Preprocess *text* and classify it against the configured labels."""
        cleaned = preprocess_text(text)
        return classifier_service.classify(cleaned, label_service.get_labels())

    def detect_language(self, text: str) -> str:
        """Preprocess *text* and return the language reported by the language service."""
        return language_service.detect_language(preprocess_text(text))

    def transform_file(self, original_filename: str, file_path: Path) -> str:
        """Extract the text of a stored file.

        Raises:
            ExtractionError: if extraction yields no non-whitespace text.
        """
        extracted = extraction_service.extract_text(original_filename, file_path)
        if extracted and extracted.strip():
            return extracted
        raise ExtractionError("No text extracted from file")

    def classify_file(self, original_filename: str, file_path: Path) -> dict:
        """Extract, preprocess and classify a stored file.

        Returns:
            ``{"label", "language"}``, plus ``"type": "not english"`` when the
            detected language is not ``"en"``.
        """
        cleaned = preprocess_text(self.transform_file(original_filename, file_path))

        detected = language_service.detect_language(cleaned)
        topic = classifier_service.classify(cleaned, label_service.get_labels())

        if detected == "en":
            return {"label": topic, "language": detected}
        return {"label": topic, "language": detected, "type": "not english"}
39
+
40
+
41
+ classification_pipeline = ClassificationPipeline()
42
+
43
+
44
+ __all__ = [
45
+ "classification_pipeline",
46
+ "ClassificationError",
47
+ "LanguageDetectionError",
48
+ "ExtractionError",
49
+ "ValidationError",
50
+ ]
app/pipelines/text_pipeline.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ from app.core.exceptions import ValidationError
4
+
5
+
6
MIN_WORDS = 4


def preprocess_text(text: str) -> str:
    """Normalize *text* for the remote models and enforce a minimum length.

    Newlines become spaces, HTML-like tags are removed, whitespace runs are
    collapsed, unsupported symbols and separators between digits are replaced
    by spaces, and the result is lower-cased and stripped.

    Raises:
        ValidationError: if *text* is None or has fewer than ``MIN_WORDS``
            words after cleaning.
    """
    if text is None:
        raise ValidationError("Text is required")

    cleaned = text.replace("\n", " ")
    cleaned = re.sub(r"<[^>]+>", "", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned)
    # NOTE: ",-/" inside the character class is a range covering , - . /
    cleaned = re.sub(r"[^\w\s$€%.,-/]|(?<=\d)[.,/](?=\d)", " ", cleaned).lower().strip()

    # Count real words: split() ignores the empty strings that consecutive
    # spaces (left behind by the symbol replacement) produce with split(" ").
    if len(cleaned.split()) < MIN_WORDS:
        raise ValidationError(f"Text must contain at least {MIN_WORDS} words after preprocessing")

    return cleaned
app/routers/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """HTTP route modules."""
app/routers/classification.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import APIRouter, File, HTTPException, UploadFile, status
from fastapi.concurrency import run_in_threadpool

from app.core.exceptions import ClassificationError, ExtractionError, LanguageDetectionError, ValidationError
from app.pipelines.classification_pipeline import classification_pipeline
from app.schemas.classification import FileClassifyResponse, FileTransformResponse, LabelUpdateInput, TextInput
from app.services.file_storage_service import file_storage_service
from app.services.label_service import label_service
8
+
9
+ router = APIRouter(tags=["classification"])
10
+
11
+
12
def _handle_exception(exc: Exception) -> None:
    """Translate a pipeline exception into an HTTPException; always raises.

    Validation and extraction errors map to 400, classifier and language
    detector errors to 502, and anything else to a generic 500.
    """
    if isinstance(exc, (ValidationError, ExtractionError)):
        code, detail = status.HTTP_400_BAD_REQUEST, str(exc)
    elif isinstance(exc, (ClassificationError, LanguageDetectionError)):
        code, detail = status.HTTP_502_BAD_GATEWAY, str(exc)
    else:
        # Do not leak internal error text for unexpected failures.
        code, detail = status.HTTP_500_INTERNAL_SERVER_ERROR, "Unexpected error"
    raise HTTPException(status_code=code, detail=detail) from exc
20
+
21
+
22
@router.post("/api/classifier", response_model=str)
async def classify_text(payload: TextInput) -> str:
    """Classify the submitted text and return the predicted label."""
    try:
        # The pipeline performs blocking network I/O; run it in the thread
        # pool so the event loop keeps serving other requests meanwhile.
        return await run_in_threadpool(classification_pipeline.classify_text, payload.text)
    except Exception as exc:
        _handle_exception(exc)
28
+
29
+
30
@router.post("/api/language", response_model=str)
async def detect_language(payload: TextInput) -> str:
    """Detect the language of the submitted text and return it."""
    try:
        # The pipeline performs a blocking HTTP call; run it in the thread
        # pool so the event loop is not stalled while waiting for it.
        return await run_in_threadpool(classification_pipeline.detect_language, payload.text)
    except Exception as exc:
        _handle_exception(exc)
36
+
37
+
38
@router.post("/api/transformer", response_model=FileTransformResponse)
async def transform_file(file: UploadFile = File(...)) -> dict:
    """Store the uploaded file and return its extracted text.

    Returns:
        ``{"filename": ..., "content": ...}``; the stored file name is used
        when the upload carries no filename.
    """
    try:
        # Disk writes and text extraction are blocking; keep them off the
        # event loop by running them in the thread pool.
        saved_path = await run_in_threadpool(file_storage_service.save_upload, file)
        display_name = file.filename or saved_path.name
        content = await run_in_threadpool(classification_pipeline.transform_file, display_name, saved_path)
        return {"filename": display_name, "content": content}
    except Exception as exc:
        _handle_exception(exc)
46
+
47
+
48
@router.post("/classify", response_model=FileClassifyResponse)
async def classify_uploaded_file(file: UploadFile = File(...)) -> dict:
    """Store the uploaded file, extract its text and classify it."""
    try:
        # Storage, extraction and the remote model calls are all blocking;
        # run them in the thread pool so the event loop stays responsive.
        saved_path = await run_in_threadpool(file_storage_service.save_upload, file)
        return await run_in_threadpool(
            classification_pipeline.classify_file,
            file.filename or saved_path.name,
            saved_path,
        )
    except Exception as exc:
        _handle_exception(exc)
55
+
56
+
57
@router.post("/configlabel", response_model=list[str])
async def configure_labels(payload: LabelUpdateInput) -> list[str]:
    """Replace the configured labels with the comma-separated labels in the payload.

    Raises:
        HTTPException: 400 when the payload contains no non-blank label.
    """
    # Validate BEFORE mutating: otherwise a payload such as "," would wipe the
    # existing labels and only then be rejected with a 400.
    if not any(part.strip() for part in payload.text.split(",")):
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="At least one label is required")
    return label_service.set_labels_from_csv(payload.text)
63
+
64
+
65
@router.get("/labels", response_model=list[str])
async def get_labels() -> list[str]:
    # Expose the labels currently held by the label service.
    current_labels = label_service.get_labels()
    return current_labels
app/routers/health.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter
2
+
3
+ from app.services.label_service import label_service
4
+
5
+ router = APIRouter(tags=["health"])
6
+
7
+
8
@router.get("/health/liveness")
def liveness() -> dict:
    # Static response: answering at all means the process is alive.
    return dict(status="ok")
11
+
12
+
13
@router.get("/health/readiness")
def readiness() -> dict:
    """Report readiness based on local state.

    External AI services are not probed; the service counts as ready once it
    has started and at least one label is configured.

    Raises:
        HTTPException: 503 when no labels are configured.
    """
    # Local import: this module does not import these names at file level.
    from fastapi import HTTPException, status

    labels = label_service.get_labels()
    if not labels:
        # Classification cannot work without labels, so do not claim readiness.
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="No labels configured",
        )
    return {"status": "ready", "labels_count": len(labels)}
app/schemas/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Pydantic API schemas."""
app/schemas/classification.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, ConfigDict, Field
2
+
3
+
4
class BaseSchema(BaseModel):
    # Shared base for API schemas: fields not declared on the model are rejected.
    model_config = ConfigDict(
        extra="forbid",
    )
6
+
7
+
8
class TextInput(BaseSchema):
    # Request body carrying the text to process; must not be empty.
    text: str = Field(..., min_length=1)
10
+
11
+
12
class LabelUpdateInput(BaseSchema):
    # Request body carrying the new label set as one comma-separated string.
    text: str = Field(
        ...,
        min_length=1,
        description="Comma-separated labels, e.g. 'news, sport, finance'",
    )
14
+
15
+
16
class ClassifierResponse(BaseSchema):
    # Response wrapper holding a single predicted label.
    label: str = Field(...)
18
+
19
+
20
class LanguageResponse(BaseSchema):
    # Response wrapper holding a single detected language.
    language: str = Field(...)
22
+
23
+
24
class FileTransformResponse(BaseSchema):
    # Response of the file-to-text endpoint: file name plus extracted text.
    filename: str = Field(...)
    content: str = Field(...)
27
+
28
+
29
class FileClassifyResponse(BaseSchema):
    # Response of the file classification endpoint; `type` is optional.
    label: str = Field(...)
    language: str = Field(...)
    type: str | None = Field(default=None)
33
+
34
+
35
class LabelsResponse(BaseSchema):
    # Response wrapper holding a list of labels.
    labels: list[str] = Field(...)
app/services/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Service layer modules."""
app/services/classifier_service.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Any
4
+
5
+ from gradio_client import Client
6
+
7
+ from app.core.config import settings
8
+ from app.core.exceptions import ClassificationError
9
+
10
+
11
class ClassifierService:
    """Wrapper around the remote Gradio classifier configured in settings."""

    def __init__(self) -> None:
        # Created lazily by _get_client() on first use.
        self._client: Client | None = None

    def _get_client(self) -> Client:
        """Return the cached Gradio client, creating it on the first call.

        Raises:
            ClassificationError: if the client cannot be constructed.
        """
        if self._client is not None:
            return self._client

        client_kwargs: dict[str, Any] = {}
        if settings.huggingface_token:
            client_kwargs["hf_token"] = settings.huggingface_token

        try:
            self._client = Client(settings.classifier_space, **client_kwargs)
        except Exception as exc:
            raise ClassificationError("Unable to initialize classifier client") from exc

        return self._client

    @staticmethod
    def _extract_label(payload: Any) -> str | None:
        """Return the first non-blank ``"label"`` string found in *payload*.

        A dict is inspected directly; a list is searched item by item
        (recursively). Any other payload yields ``None``.
        """
        if isinstance(payload, dict):
            value = payload.get("label")
            if isinstance(value, str) and value.strip():
                return value.strip()
            return None

        if isinstance(payload, list):
            for item in payload:
                label = ClassifierService._extract_label(item)
                if label:
                    return label

        return None

    @staticmethod
    def _is_regular_file(candidate: str) -> bool:
        """Tell whether *candidate* names an existing regular file.

        Directories do not count, and strings that cannot be a path at all
        (for example, too long) are treated as "not a file" instead of raising.
        """
        try:
            return Path(candidate).is_file()
        except (OSError, ValueError):
            return False

    def classify(self, text: str, labels: list[str]) -> str:
        """Classify *text* against *labels* using the remote classifier.

        Args:
            text: Text to classify.
            labels: Candidate labels; sent joined with ", ".

        Returns:
            The predicted label.

        Raises:
            ClassificationError: if no labels are given, the request fails,
                or no valid label can be read from the result.
        """
        if not labels:
            raise ClassificationError("No labels configured")

        labels_text = ", ".join(labels)

        try:
            result = self._get_client().predict(
                text,
                labels_text,
                api_name=settings.classifier_api_name,
            )
        except Exception as exc:
            raise ClassificationError("Classifier request failed") from exc

        if isinstance(result, str):
            # A string result is either the path of a JSON output file or the
            # label itself. Only regular files are read, so a label that
            # happens to match a directory name is still treated as a label.
            if self._is_regular_file(result):
                try:
                    parsed = json.loads(Path(result).read_text(encoding="utf-8"))
                except Exception as exc:
                    raise ClassificationError("Classifier output file is not valid JSON") from exc
                label = self._extract_label(parsed)
                if label:
                    return label
                # Never fall through and return the file path as the label.
                raise ClassificationError("Classifier did not return a valid label")

            stripped = result.strip()
            if stripped:
                return stripped

        label = self._extract_label(result)
        if label:
            return label

        raise ClassificationError("Classifier did not return a valid label")
81
+
82
+
83
+ classifier_service = ClassifierService()
app/services/extraction_service.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ import docx2txt
4
+ from openpyxl import load_workbook
5
+ from PIL import Image
6
+ from pypdf import PdfReader
7
+ import pytesseract
8
+
9
+ from app.core.exceptions import ExtractionError
10
+
11
+
12
+ DOC_EXTENSIONS = {".pdf", ".docx", ".xlsx"}
13
+ IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff"}
14
+ TEXT_EXTENSIONS = {".txt", ".md", ".csv", ".json"}
15
+
16
+
17
class ExtractionService:
    """Extracts plain text from stored files, chosen by file extension."""

    @staticmethod
    def _extract_pdf(file_path: Path) -> str:
        """Return the text of every non-blank PDF page, joined by newlines."""
        reader = PdfReader(str(file_path))
        page_texts = (page.extract_text() or "" for page in reader.pages)
        return "\n".join(text for text in page_texts if text.strip())

    @staticmethod
    def _extract_docx(file_path: Path) -> str:
        """Return the text of a .docx document."""
        return docx2txt.process(str(file_path))

    @staticmethod
    def _extract_xlsx(file_path: Path) -> str:
        """Return one line per non-empty row, with cell values separated by spaces."""
        workbook = load_workbook(filename=str(file_path), read_only=True, data_only=True)
        chunks: list[str] = []
        try:
            for sheet in workbook.worksheets:
                for row in sheet.iter_rows(values_only=True):
                    row_values = [str(value).strip() for value in row if value is not None and str(value).strip()]
                    if row_values:
                        chunks.append(" ".join(row_values))
        finally:
            # Close the workbook even when reading a sheet fails.
            workbook.close()
        return "\n".join(chunks)

    @staticmethod
    def _extract_image(file_path: Path) -> str:
        """Run OCR on an image file and return the recognized text."""
        # The context manager closes the image file once OCR is done.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)

    def extract_text(self, file_name: str, file_path: Path) -> str:
        """Extract text from *file_path*, dispatching on the extension of *file_name*.

        Raises:
            ExtractionError: if the extension is unsupported or extraction fails.
        """
        extension = Path(file_name).suffix.lower()
        document_handlers = {
            ".pdf": self._extract_pdf,
            ".docx": self._extract_docx,
            ".xlsx": self._extract_xlsx,
        }

        try:
            handler = document_handlers.get(extension)
            if handler is not None:
                return handler(file_path)

            if extension in IMAGE_EXTENSIONS:
                return self._extract_image(file_path)

            if extension in TEXT_EXTENSIONS:
                return file_path.read_text(encoding="utf-8", errors="ignore")

            raise ExtractionError(f"Unsupported file extension: {extension or 'unknown'}")
        except ExtractionError:
            raise
        except Exception as exc:
            raise ExtractionError("Failed to extract text from file") from exc
68
+
69
+
70
+ extraction_service = ExtractionService()
app/services/file_storage_service.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from uuid import uuid4
3
+
4
+ from fastapi import UploadFile
5
+
6
+ from app.core.config import settings
7
+
8
+
9
class FileStorageService:
    """Persists uploaded files in the configured upload directory."""

    def __init__(self) -> None:
        # Make sure both storage directories exist before any upload arrives.
        for directory in (settings.static_dir, settings.upload_dir):
            directory.mkdir(parents=True, exist_ok=True)

    def save_upload(self, upload_file: UploadFile) -> Path:
        """Write *upload_file* to disk and return the path of the stored copy.

        The stored name is a random hex prefix followed by the base name of
        the uploaded file name ("uploaded_file" when none is supplied).
        """
        base_name = Path(upload_file.filename or "uploaded_file").name.replace("/", "_")
        stored_path = settings.upload_dir / f"{uuid4().hex}_{base_name}"
        chunk_size = 1024 * 1024

        with stored_path.open("wb") as sink:
            # Copy in fixed-size chunks until the source is exhausted.
            while chunk := upload_file.file.read(chunk_size):
                sink.write(chunk)

        return stored_path
27
+
28
+
29
+ file_storage_service = FileStorageService()
app/services/label_service.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.core.config import settings
2
+ from app.models.label_config import LabelConfig
3
+
4
+
5
class LabelService:
    """Holds the active classification labels, seeded from settings."""

    def __init__(self) -> None:
        self._config = LabelConfig(labels=self._split_csv(settings.default_labels_csv))

    @staticmethod
    def _split_csv(raw: str) -> list[str]:
        """Split a comma-separated string into stripped, non-empty labels."""
        pieces = (piece.strip() for piece in raw.split(","))
        return [piece for piece in pieces if piece]

    def get_labels(self) -> list[str]:
        """Return the currently configured labels."""
        return self._config.get_labels()

    def set_labels_from_csv(self, labels_csv: str) -> list[str]:
        """Replace the labels with those parsed from *labels_csv* and return them."""
        return self._config.set_labels(self._split_csv(labels_csv))
16
+
17
+
18
+ label_service = LabelService()
app/services/language_service.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+
3
+ from app.core.config import settings
4
+ from app.core.exceptions import LanguageDetectionError
5
+
6
+
7
class LanguageService:
    """Client for the remote language detector endpoint."""

    def __init__(self) -> None:
        # One session is reused for all detector requests.
        self._session = requests.Session()

    def detect_language(self, text: str) -> str:
        """Return the language label reported by the detector for *text*.

        Raises:
            LanguageDetectionError: if the request fails, the body is not
                valid JSON, or the response carries no usable label.
        """
        try:
            response = self._session.post(
                settings.language_detector_url,
                json={"data": [text]},
                timeout=settings.request_timeout_seconds,
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            raise LanguageDetectionError("Language detection request failed") from exc

        # Decode separately: the JSON error raised by requests is also a
        # RequestException, so a shared try block would report it as a
        # request failure and never reach the "invalid JSON" branch.
        try:
            payload = response.json()
        except ValueError as exc:
            raise LanguageDetectionError("Language detector returned invalid JSON") from exc

        data = payload.get("data") if isinstance(payload, dict) else None
        if not data or not isinstance(data, list):
            raise LanguageDetectionError("Language detector response missing 'data' field")

        # The first entry is either a {"label": ...} dict or the label itself.
        first = data[0]
        label = first.get("label") if isinstance(first, dict) else first

        if not isinstance(label, str) or not label.strip():
            raise LanguageDetectionError("Language detector did not return a valid label")

        return label.strip()
39
+
40
+
41
+ language_service = LanguageService()
docker-compose.yml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ classifier-api:
3
+ build:
4
+ context: .
5
+ dockerfile: Dockerfile
6
+ container_name: classifier-general-api
7
+ environment:
8
+ APP_NAME: ${APP_NAME:-Classifier General API}
9
+ ENVIRONMENT: ${ENVIRONMENT:-development}
10
+ DEBUG: ${DEBUG:-false}
11
+ STATIC_DIR: ${STATIC_DIR:-static}
12
+ UPLOAD_SUBDIR: ${UPLOAD_SUBDIR:-uploads}
13
+ CLASSIFIER_SPACE: ${CLASSIFIER_SPACE:-https://ayoubchlin-ayoubchlin-stable-bart-mnli-cnn.hf.space/}
14
+ CLASSIFIER_API_NAME: ${CLASSIFIER_API_NAME:-/predict}
15
+ HUGGINGFACE_TOKEN: ${HUGGINGFACE_TOKEN:-}
16
+ LANGUAGE_DETECTOR_URL: ${LANGUAGE_DETECTOR_URL:-https://team-language-detector-languagedetector.hf.space/run/predict}
17
+ REQUEST_TIMEOUT_SECONDS: ${REQUEST_TIMEOUT_SECONDS:-30}
18
+ DEFAULT_LABELS_CSV: ${DEFAULT_LABELS_CSV:-news,sport,finance,politics}
19
+ ports:
20
+ - "4002:4002"
21
+ volumes:
22
+ - ./static:/app/static
23
+ healthcheck:
24
+ test: ["CMD", "curl", "-f", "http://localhost:4002/health/liveness"]
25
+ interval: 10s
26
+ timeout: 4s
27
+ retries: 6
docs/README.md ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Classifier-General Documentation
2
+
3
+ This folder contains reverse-engineered documentation generated from repository evidence.
4
+
5
+ ## Doc Map (Diataxis)
6
+ - Tutorial:
7
+ - `docs/tutorials/getting-started.md`
8
+ - How-to guides:
9
+ - `docs/how-to/run-locally.md`
10
+ - `docs/how-to/deploy-with-docker-compose.md`
11
+ - Reference:
12
+ - `docs/reference/configuration.md`
13
+ - `docs/reference/api.md`
14
+ - `docs/reference/runtime-state.md`
15
+ - Explanation:
16
+ - `docs/explanation/architecture.md`
17
+ - `docs/explanation/decisions.md`
18
+
19
+ ## Scope
20
+ - Covered modules: classification routes, text preprocessing, extraction pipeline, remote classifier/language services, label config.
21
+ - This service has no relational database layer; runtime state is file system + in-memory labels.
22
+
23
+ ## Evidence anchors
24
+ - `app/main.py`
25
+ - `app/api/router.py`
26
+ - `app/routers/*.py`
27
+ - `app/pipelines/*.py`
28
+ - `app/services/*.py`
29
+ - `app/core/*.py`
30
+ - `app/schemas/*.py`
31
+ - `docker-compose.yml`
32
+ - `Dockerfile`
33
+ - `tests/test_routes.py`
docs/explanation/architecture.md ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Architecture Explanation
2
+
3
+ ## 1. Executive summary
4
+ `classifier-general` is a single FastAPI service that classifies text and files by combining local extraction/preprocessing with two remote AI endpoints (topic classifier and language detector).
5
+
6
+ Evidence:
7
+ - `app/main.py`
8
+ - `app/routers/classification.py`
9
+ - `app/services/classifier_service.py`
10
+ - `app/services/language_service.py`
11
+
12
+ ## 2. Purpose and scope
13
+ ### What exists
14
+ - Contract-compatible endpoints for classify/language/transform flows.
15
+ - Pipeline split into preprocess, extraction, classification orchestration.
16
+ - Configurable runtime through environment variables.
17
+
18
+ Evidence:
19
+ - `app/routers/classification.py`
20
+ - `app/pipelines/classification_pipeline.py`
21
+ - `app/core/config.py`
22
+
23
+ ### How it works
24
+ - Router accepts JSON or multipart requests.
25
+ - Files are written to disk (`static/uploads`).
26
+ - Extraction service parses document/image/text into plain text.
27
+ - Text preprocessing strips markup and unsupported symbols, lower-cases the text, and rejects input with fewer than four words.
28
+ - Pipeline calls language and classifier services.
29
+
30
+ Evidence:
31
+ - `app/services/file_storage_service.py`
32
+ - `app/services/extraction_service.py`
33
+ - `app/pipelines/text_pipeline.py`
34
+ - `app/pipelines/classification_pipeline.py`
35
+
36
+ ### Why designed this way (inferred)
37
+ - Maintain old API contract while introducing modular services and safer config handling.
38
+
39
+ ## 3. C4-style views
40
+ ### Context view
41
+ Actors/systems:
42
+ - API client sending text/files.
43
+ - External classifier endpoint (`CLASSIFIER_SPACE`).
44
+ - External language detector endpoint (`LANGUAGE_DETECTOR_URL`).
45
+ - Local filesystem for uploaded files.
46
+
47
+ Evidence:
48
+ - `app/core/config.py`
49
+ - `app/services/classifier_service.py`
50
+ - `app/services/language_service.py`
51
+
52
+ ### Container view
53
+ - One container/service (`classifier-api`) with FastAPI + OCR binary.
54
+
55
+ Evidence:
56
+ - `docker-compose.yml`
57
+ - `Dockerfile`
58
+
59
+ ### Component view
60
+ - API routing: `app/routers/*`
61
+ - Orchestration pipelines: `app/pipelines/*`
62
+ - Integration services: `app/services/classifier_service.py`, `app/services/language_service.py`
63
+ - Extraction + storage services: `app/services/extraction_service.py`, `app/services/file_storage_service.py`
64
+ - Config/exceptions/schemas: `app/core/*`, `app/schemas/*`
65
+
66
+ ### Code-level workflow: file classification
67
+ 1. `POST /classify` receives file.
68
+ 2. File saved to upload directory.
69
+ 3. Text extracted by extension-specific handlers.
70
+ 4. Text preprocessed (regex cleanup + min words).
71
+ 5. Language detector called.
72
+ 6. Classifier called with the configured labels joined into a single comma-separated string.
73
+ 7. Response returns `{label, language}` plus `type=not english` when applicable.
74
+
75
+ Evidence:
76
+ - `app/routers/classification.py`
77
+ - `app/services/file_storage_service.py`
78
+ - `app/services/extraction_service.py`
79
+ - `app/pipelines/text_pipeline.py`
80
+ - `app/pipelines/classification_pipeline.py`
81
+
82
+ ## 4. Cross-cutting concerns
83
+ ### Validation and error mapping
84
+ - Input schemas use strict `extra=forbid`.
85
+ - Error mapping explicitly separates validation/extraction (400) from upstream AI failures (502).
86
+
87
+ Evidence:
88
+ - `app/schemas/classification.py`
89
+ - `app/routers/classification.py`
90
+
91
+ ### Configuration and secrets
92
+ - Runtime config sourced from env.
93
+ - The HF token is optional, and no secret is hardcoded in the current service code.
94
+
95
+ Evidence:
96
+ - `app/core/config.py`
97
+ - `app/services/classifier_service.py`
98
+
99
+ ### Concurrency and mutable state
100
+ - Labels guarded by thread lock (`LabelConfig._lock`).
101
+ - State is still process-local; multi-instance deployments can diverge.
102
+
103
+ Evidence:
104
+ - `app/models/label_config.py`
105
+ - `app/services/label_service.py`
106
+
107
+ ### Testing strategy
108
+ - Route contract tests monkeypatch pipeline methods for deterministic tests.
109
+ - Tests validate response shape and key endpoint behavior, not remote network calls.
110
+
111
+ Evidence:
112
+ - `tests/test_routes.py`
113
+
114
+ ## 5. Risks, gaps, and technical debt
115
+ - External endpoint dependency introduces latency and runtime failure risk.
116
+ - No upload retention/cleanup process.
117
+ - Readiness check does not probe external AI services, only local label readiness.
118
+ - No authentication/authorization layer on API endpoints.
119
+
120
+ Evidence:
121
+ - `app/services/language_service.py`
122
+ - `app/services/classifier_service.py`
123
+ - `app/routers/health.py`
124
+ - `app/routers/classification.py`
125
+
126
+ ## 6. Unknown or inferred
127
+ - Unknown: expected SLA and acceptable latency.
128
+ - Unknown: intended persistence/retention policy for uploaded files.
129
+ - Inferred: service is optimized for local/dev contract compatibility and integration testing.
docs/explanation/decisions.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Architecture Decisions (ADR-style)
2
+
3
+ ## ADR-001: Use modular FastAPI layout for classifier backend
4
+ - Status: Accepted
5
+ - Type: Explicit
6
+ - Evidence:
7
+ - `app/main.py`
8
+ - `app/api/router.py`
9
+ - `app/routers/`
10
+ - `app/services/`
11
+ - `app/pipelines/`
12
+ - Rationale:
13
+ - Clear separation between transport, orchestration, and integrations.
14
+
15
+ ## ADR-002: Preserve endpoint contracts from prior service behavior
16
+ - Status: Accepted
17
+ - Type: Explicit
18
+ - Evidence:
19
+ - `app/routers/classification.py`
20
+ - `tests/test_routes.py`
21
+ - Rationale:
22
+ - Keep clients functional while refactoring internals.
23
+
24
+ ## ADR-003: Use remote HF/Gradio endpoint for classification
25
+ - Status: Accepted
26
+ - Type: Explicit
27
+ - Evidence:
28
+ - `app/core/config.py`
29
+ - `app/services/classifier_service.py`
30
+ - Rationale:
31
+ - Avoid shipping local model runtime in this service.
32
+
33
+ ## ADR-004: Use remote language detector HTTP endpoint
34
+ - Status: Accepted
35
+ - Type: Explicit
36
+ - Evidence:
37
+ - `app/services/language_service.py`
38
+ - `app/core/config.py`
39
+ - Rationale:
40
+ - Decouple language detection model from this codebase.
41
+
42
+ ## ADR-005: Keep labels in in-memory mutable config
43
+ - Status: Accepted (current), Needs review
44
+ - Type: Explicit
45
+ - Evidence:
46
+ - `app/models/label_config.py`
47
+ - `app/services/label_service.py`
48
+ - `app/routers/classification.py` (`/configlabel`, `/labels`)
49
+ - Rationale:
50
+ - Simple runtime tuning without DB migration.
51
+ - Tradeoff:
52
+ - No persistence across restarts, no cross-instance consistency.
53
+
54
+ ## ADR-006: Store uploaded files on local filesystem under static mount
55
+ - Status: Accepted
56
+ - Type: Explicit
57
+ - Evidence:
58
+ - `app/services/file_storage_service.py`
59
+ - `app/main.py`
60
+ - `docker-compose.yml`
61
+ - Rationale:
62
+ - Enables document/image extraction workflow with minimal infrastructure.
63
+
64
+ ## ADR-007: Map errors into contract-friendly HTTP statuses
65
+ - Status: Accepted
66
+ - Type: Explicit
67
+ - Evidence:
68
+ - `app/routers/classification.py`
69
+ - `app/core/exceptions.py`
70
+ - Rationale:
71
+ - Differentiate local validation issues (`400`) from upstream AI failures (`502`).
72
+
73
+ ## ADR-008: No built-in auth layer for this API
74
+ - Status: Accepted (current), Needs review
75
+ - Type: Inferred
76
+ - Evidence:
77
+ - `app/routers/classification.py`
78
+ - absence of auth dependencies/middleware in `app/main.py`
79
+ - Rationale:
80
+ - Likely positioned as an internal or early-stage service.
docs/how-to/deploy-with-docker-compose.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Deploy With Docker Compose
2
+
3
+ ## Topology
4
+ - Single container: `classifier-api`
5
+ - Volume mount: `./static:/app/static` for persisted uploaded files
6
+ - Healthcheck: `GET /health/liveness`
7
+
8
+ Evidence:
9
+ - `docker-compose.yml`
10
+
11
+ ## Command
12
+ ```bash
13
+ cd classifier-general
14
+ docker compose up -d --build
15
+ ```
16
+
17
+ ## Verify
18
+ ```bash
19
+ docker compose ps
20
+ curl -s http://localhost:4002/health/liveness
21
+ ```
22
+
23
+ ## Production hardening gaps
24
+ - No reverse proxy/TLS config in this repo.
25
+ - External AI dependencies are hard network dependencies at runtime.
26
+ - No horizontal scaling coordination for in-memory labels (`/configlabel` mutates process-local state).
27
+
28
+ Evidence:
29
+ - `app/services/language_service.py`
30
+ - `app/services/classifier_service.py`
31
+ - `app/services/label_service.py`
32
+
33
+ ## Unknown or inferred
34
+ - Unknown: expected deployment platform (only Docker artifacts are present).
35
+ - Inferred: this compose file targets local/dev usage.
docs/how-to/run-locally.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Run Locally (Dev Loop)
2
+
3
+ ## 1. Install dependencies
4
+ ```bash
5
+ cd classifier-general
6
+ python3 -m venv .venv
7
+ source .venv/bin/activate
8
+ pip install -r requirements.txt
9
+ ```
10
+
11
+ Evidence:
12
+ - `requirements.txt`
13
+
14
+ ## 2. Configure environment
15
+ ```bash
16
+ cp .env.example .env
17
+ ```
18
+
19
+ Critical settings:
20
+ - `CLASSIFIER_SPACE`
21
+ - `CLASSIFIER_API_NAME`
22
+ - `LANGUAGE_DETECTOR_URL`
23
+ - `DEFAULT_LABELS_CSV`
24
+
25
+ Evidence:
26
+ - `app/core/config.py`
27
+ - `.env.example`
28
+
29
+ ## 3. Start server
30
+ ```bash
31
+ uvicorn main:app --host 0.0.0.0 --port 4002 --reload
32
+ ```
33
+
34
+ Evidence:
35
+ - `main.py`
36
+ - `app/main.py`
37
+
38
+ ## 4. Test file-based endpoints
39
+ ```bash
40
+ curl -s -X POST http://localhost:4002/api/transformer \
41
+ -F 'file=@/absolute/path/to/sample.pdf'
42
+
43
+ curl -s -X POST http://localhost:4002/classify \
44
+ -F 'file=@/absolute/path/to/sample.pdf'
45
+ ```
46
+
47
+ Uploads are stored under `static/uploads` with random UUID prefixes.
48
+
49
+ Evidence:
50
+ - `app/services/file_storage_service.py`
51
+ - `app/services/extraction_service.py`
52
+
53
+ ## Troubleshooting
54
+ - `400 Text must contain at least 4 words`:
55
+ - input failed preprocessing minimum-word rule.
56
+ - `502 Classifier request failed`:
57
+ - HF Space unreachable or incompatible response.
58
+ - OCR extraction quality is low:
59
+ - verify tesseract install and image quality.
60
+
61
+ Evidence:
62
+ - `app/pipelines/text_pipeline.py`
63
+ - `app/routers/classification.py`
64
+ - `Dockerfile`
docs/reference/api.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # API Reference
2
+
3
+ Base URL: `http://localhost:4002`
4
+
5
+ Evidence:
6
+ - `app/api/router.py`
7
+ - `app/routers/classification.py`
8
+ - `app/routers/health.py`
9
+
10
+ ## Endpoints
11
+
12
+ | Method | Path | Request | Response |
13
+ |---|---|---|---|
14
+ | GET | `/health/liveness` | none | `{status}` |
15
+ | GET | `/health/readiness` | none | `{status, labels_count}` |
16
+ | GET | `/endpoint/` | none | list of routes/methods |
17
+ | POST | `/api/classifier` | `{text}` | `"<label>"` |
18
+ | POST | `/api/language` | `{text}` | `"<language>"` |
19
+ | POST | `/api/transformer` | multipart `file` | `{filename, content}` |
20
+ | POST | `/classify` | multipart `file` | `{label, language, type?}` |
21
+ | POST | `/configlabel` | `{text: "csv,labels"}` | `string[]` |
22
+ | GET | `/labels` | none | `string[]` |
23
+
24
+ ## Validation and errors
25
+ - `400` for input validation and extraction problems.
26
+ - `502` for upstream classifier/language failures.
27
+ - `500` for unexpected failures.
28
+
29
+ Evidence:
30
+ - `app/routers/classification.py` (`_handle_exception`)
31
+ - `app/schemas/classification.py`
32
+
33
+ ## Contract notes
34
+ - Contract returns plain string for `/api/classifier` and `/api/language` (not wrapped object).
35
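+ - `/classify` sets `type` to `"not english"` when the language output is not `en`; otherwise `type` is still present in the response as `null` (the route contract test expects `"type": null` for English input).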
36
+
37
+ Evidence:
38
+ - `app/routers/classification.py`
39
+ - `app/pipelines/classification_pipeline.py`
docs/reference/configuration.md ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration Reference
2
+
3
+ Configuration is managed with Pydantic Settings.
4
+
5
+ Evidence:
6
+ - `app/core/config.py`
7
+ - `.env.example`
8
+ - `docker-compose.yml`
9
+
10
+ ## Application settings
11
+
12
+ | Variable | Default | Purpose |
13
+ |---|---|---|
14
+ | `APP_NAME` | `Classifier General API` | FastAPI title |
15
+ | `ENVIRONMENT` | `development` | environment label |
16
+ | `DEBUG` | `false` | debug mode |
17
+
18
+ ## Filesystem settings
19
+
20
+ | Variable | Default | Purpose |
21
+ |---|---|---|
22
+ | `STATIC_DIR` | `static` | static root served at `/static` |
23
+ | `UPLOAD_SUBDIR` | `uploads` | upload directory under static root |
24
+
25
+ ## Classifier integration settings
26
+
27
+ | Variable | Default | Purpose |
28
+ |---|---|---|
29
+ | `CLASSIFIER_SPACE` | `https://ayoubchlin-ayoubchlin-stable-bart-mnli-cnn.hf.space/` | Gradio/HF Space endpoint |
30
+ | `CLASSIFIER_API_NAME` | `/predict` | Gradio predict API name |
31
+ | `HUGGINGFACE_TOKEN` | empty | optional auth token for client init |
32
+
33
+ ## Language detector settings
34
+
35
+ | Variable | Default | Purpose |
36
+ |---|---|---|
37
+ | `LANGUAGE_DETECTOR_URL` | `https://team-language-detector-languagedetector.hf.space/run/predict` | remote language endpoint |
38
+ | `REQUEST_TIMEOUT_SECONDS` | `30` | HTTP timeout for language requests |
39
+
40
+ ## Label settings
41
+
42
+ | Variable | Default | Purpose |
43
+ |---|---|---|
44
+ | `DEFAULT_LABELS_CSV` | `news,sport,finance,politics` | initial in-memory labels |
45
+
46
+ ## Behavior notes
47
+ - Labels are process-local in memory and reset on restart.
48
+ - Upload directory is auto-created at app startup.
49
+
50
+ Evidence:
51
+ - `app/services/label_service.py`
52
+ - `app/models/label_config.py`
53
+ - `app/main.py`
docs/reference/runtime-state.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Runtime State Reference
2
+
3
+ This service does not define a relational database. State exists in process memory and on the local filesystem.
4
+
5
+ Evidence:
6
+ - no `app/database/` package
7
+ - `app/models/label_config.py`
8
+ - `app/services/file_storage_service.py`
9
+
10
+ ## In-memory state
11
+
12
+ | State | Location | Lifecycle |
13
+ |---|---|---|
14
+ | Active labels list | `LabelConfig.labels` | initialized at process start from `DEFAULT_LABELS_CSV`; mutable via `/configlabel`; reset on restart |
15
+
16
+ Evidence:
17
+ - `app/services/label_service.py`
18
+ - `app/models/label_config.py`
19
+
20
+ ## Filesystem state
21
+
22
+ | State | Location | Lifecycle |
23
+ |---|---|---|
24
+ | Uploaded files | `STATIC_DIR/UPLOAD_SUBDIR` (default `static/uploads`) | created per upload; not automatically deleted by app |
25
+ | Static mount | `/static` route | served directly by FastAPI static files |
26
+
27
+ Evidence:
28
+ - `app/core/config.py`
29
+ - `app/main.py`
30
+ - `app/services/file_storage_service.py`
31
+
32
+ ## External runtime dependencies
33
+
34
+ | Dependency | Usage |
35
+ |---|---|
36
+ | HF/Gradio classifier Space | text topic classification |
37
+ | Language detector endpoint | language inference |
38
+ | Tesseract binary | OCR extraction for images |
39
+
40
+ Evidence:
41
+ - `app/services/classifier_service.py`
42
+ - `app/services/language_service.py`
43
+ - `app/services/extraction_service.py`
44
+ - `Dockerfile`
45
+
46
+ ## Unknown or inferred
47
+ - Unknown: long-term retention policy for uploaded files.
48
+ - Inferred: `static/uploads` can grow unbounded without cleanup process.
docs/tutorials/getting-started.md ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Getting Started
2
+
3
+ This tutorial runs the classifier API and validates endpoint contracts.
4
+
5
+ ## Prerequisites
6
+ - Docker and Docker Compose
7
+ - Internet access to reach the external classifier/language services (not needed for the route contract tests, which monkeypatch the pipeline methods)
8
+
9
+ Evidence:
10
+ - `docker-compose.yml`
11
+ - `app/services/classifier_service.py`
12
+ - `app/services/language_service.py`
13
+
14
+ ## 1. Prepare environment
15
+ ```bash
16
+ cd classifier-general
17
+ cp .env.example .env
18
+ ```
19
+
20
+ Evidence:
21
+ - `.env.example`
22
+ - `app/core/config.py`
23
+
24
+ ## 2. Start API in Docker
25
+ ```bash
26
+ docker compose up --build
27
+ ```
28
+
29
+ Service:
30
+ - `classifier-api` exposed on `4002`
31
+
32
+ Evidence:
33
+ - `docker-compose.yml`
34
+
35
+ ## 3. Check health
36
+ ```bash
37
+ curl -s http://localhost:4002/health/liveness
38
+ curl -s http://localhost:4002/health/readiness
39
+ ```
40
+
41
+ Expected:
42
+ - `{"status":"ok"}`
43
+ - `{"status":"ready","labels_count":<n>}`
44
+
45
+ Evidence:
46
+ - `app/routers/health.py`
47
+
48
+ ## 4. Call text classification
49
+ ```bash
50
+ curl -s -X POST http://localhost:4002/api/classifier \
51
+ -H 'content-type: application/json' \
52
+ -d '{"text":"This is a long enough sentence for classification."}'
53
+ ```
54
+
55
+ Evidence:
56
+ - `app/routers/classification.py`
57
+ - `app/pipelines/classification_pipeline.py`
58
+
59
+ ## 5. Update labels and verify
60
+ ```bash
61
+ curl -s -X POST http://localhost:4002/configlabel \
62
+ -H 'content-type: application/json' \
63
+ -d '{"text":"tech,health,legal"}'
64
+
65
+ curl -s http://localhost:4002/labels
66
+ ```
67
+
68
+ Evidence:
69
+ - `app/services/label_service.py`
70
+ - `app/models/label_config.py`
71
+
72
+ ## 6. Run route contract tests
73
+ ```bash
74
+ docker build -t classifier-general-refactor .
75
+ docker run --rm -w /app -e PYTHONPATH=/app classifier-general-refactor pytest -q
76
+ ```
77
+
78
+ Evidence:
79
+ - `tests/test_routes.py`
main.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from app.main import app
2
+
3
+
4
+ if __name__ == "__main__":
5
+ import uvicorn
6
+
7
+ uvicorn.run("main:app", host="0.0.0.0", port=4002, reload=True)
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.115.8
2
+ uvicorn[standard]==0.34.0
3
+ pydantic==2.10.6
4
+ pydantic-settings==2.7.1
5
+ requests==2.32.3
6
+ gradio_client==1.7.0
7
+ python-multipart==0.0.20
8
+
9
+ pytesseract==0.3.13
10
+ Pillow==11.1.0
11
+ pypdf==5.4.0
12
+ docx2txt==0.8
13
+ openpyxl==3.1.5
14
+
15
+ pytest==8.3.4
16
+ httpx==0.28.1
tests/test_routes.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from io import BytesIO
2
+
3
+ from fastapi.testclient import TestClient
4
+
5
+ from app.main import app
6
+ from app.pipelines.classification_pipeline import classification_pipeline
7
+
8
+ client = TestClient(app)
9
+
10
+
11
+ def test_classifier_endpoint_contract(monkeypatch):
12
+ monkeypatch.setattr(classification_pipeline, "classify_text", lambda text: "news")
13
+
14
+ response = client.post("/api/classifier", json={"text": "This is a long enough sentence for classification."})
15
+
16
+ assert response.status_code == 200
17
+ assert response.json() == "news"
18
+
19
+
20
+ def test_language_endpoint_contract(monkeypatch):
21
+ monkeypatch.setattr(classification_pipeline, "detect_language", lambda text: "en")
22
+
23
+ response = client.post("/api/language", json={"text": "This is a language detection sample text."})
24
+
25
+ assert response.status_code == 200
26
+ assert response.json() == "en"
27
+
28
+
29
+ def test_labels_config_roundtrip():
30
+ response = client.post("/configlabel", json={"text": "tech, health, legal"})
31
+ assert response.status_code == 200
32
+ assert response.json() == ["tech", "health", "legal"]
33
+
34
+ get_response = client.get("/labels")
35
+ assert get_response.status_code == 200
36
+ assert get_response.json() == ["tech", "health", "legal"]
37
+
38
+
39
+ def test_transform_file_contract(monkeypatch):
40
+ monkeypatch.setattr(classification_pipeline, "transform_file", lambda filename, path: "extracted content")
41
+
42
+ files = {"file": ("sample.txt", BytesIO(b"hello"), "text/plain")}
43
+ response = client.post("/api/transformer", files=files)
44
+
45
+ assert response.status_code == 200
46
+ assert response.json()["filename"] == "sample.txt"
47
+ assert response.json()["content"] == "extracted content"
48
+
49
+
50
+ def test_classify_file_contract(monkeypatch):
51
+ monkeypatch.setattr(
52
+ classification_pipeline,
53
+ "classify_file",
54
+ lambda filename, path: {"label": "finance", "language": "en"},
55
+ )
56
+
57
+ files = {"file": ("sample.txt", BytesIO(b"hello"), "text/plain")}
58
+ response = client.post("/classify", files=files)
59
+
60
+ assert response.status_code == 200
61
+ assert response.json() == {"label": "finance", "language": "en", "type": None}