Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .dockerignore +43 -0
- Dockerfile +30 -0
- README.md +15 -10
- Toxic_TweetTagger.egg-info/PKG-INFO +46 -0
- Toxic_TweetTagger.egg-info/SOURCES.txt +27 -0
- Toxic_TweetTagger.egg-info/dependency_links.txt +1 -0
- Toxic_TweetTagger.egg-info/requires.txt +25 -0
- Toxic_TweetTagger.egg-info/top_level.txt +4 -0
- __init__.py +0 -0
- __pycache__/__init__.cpython-311.pyc +0 -0
- app/__pycache__/main.cpython-311.pyc +0 -0
- app/api/__pycache__/api_routes.cpython-311.pyc +0 -0
- app/api/__pycache__/dependencies.cpython-311.pyc +0 -0
- app/api/__pycache__/schemas.cpython-311.pyc +0 -0
- app/api/api_routes.py +39 -0
- app/api/dependencies.py +17 -0
- app/api/schemas.py +44 -0
- app/main.py +86 -0
- app/middleware/__init__.py +66 -0
- app/middleware/__pycache__/__init__.cpython-311.pyc +0 -0
- app/model/MLmodel +31 -0
- app/model/artifacts/booster.json +0 -0
- app/model/artifacts/metrics.json +1 -0
- app/model/artifacts/model.joblib +3 -0
- app/model/artifacts/vectorizer.joblib +3 -0
- app/model/conda.yaml +15 -0
- app/model/python_env.yaml +7 -0
- app/model/python_model.pkl +3 -0
- app/model/registered_model_meta +2 -0
- app/model/requirements.txt +8 -0
- app/monitoring/__init__.py +0 -0
- app/monitoring/__pycache__/__init__.cpython-311.pyc +0 -0
- app/monitoring/__pycache__/http_metrics.cpython-311.pyc +0 -0
- app/monitoring/__pycache__/service_metrics.cpython-311.pyc +0 -0
- app/monitoring/http_metrics.py +20 -0
- app/monitoring/service_metrics.py +62 -0
- app/requirements.txt +7 -0
- app/services/__pycache__/explainer.cpython-311.pyc +0 -0
- app/services/__pycache__/feedback.cpython-311.pyc +0 -0
- app/services/__pycache__/inference.cpython-311.pyc +0 -0
- app/services/explainer.py +55 -0
- app/services/feedback.py +38 -0
- app/services/inference.py +113 -0
- app/workers/__init__.py +144 -0
- app/workers/__pycache__/__init__.cpython-311.pyc +0 -0
- components/__init__.py +0 -0
- components/__pycache__/__init__.cpython-311.pyc +0 -0
- components/__pycache__/data_ingestion.cpython-311.pyc +0 -0
- components/__pycache__/data_preprocessing.cpython-311.pyc +0 -0
- components/__pycache__/data_validation.cpython-311.pyc +0 -0
.dockerignore
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Ignore Python cache
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*.so
|
| 5 |
+
|
| 6 |
+
# Ignore Jupyter notebooks (if not used)
|
| 7 |
+
*.ipynb
|
| 8 |
+
.ipynb_checkpoints/
|
| 9 |
+
|
| 10 |
+
# Ignore logs and temp files
|
| 11 |
+
*.log
|
| 12 |
+
logs/
|
| 13 |
+
*.tmp
|
| 14 |
+
*.DS_Store
|
| 15 |
+
|
| 16 |
+
# Ignore version control and dev files
|
| 17 |
+
.git/
|
| 18 |
+
.github/
|
| 19 |
+
.vscode/
|
| 20 |
+
*.env
|
| 21 |
+
.env*
|
| 22 |
+
.gitignore
|
| 23 |
+
|
| 24 |
+
# MLflow & DVC metadata (keep only if you need them at runtime)
|
| 25 |
+
.mlflow/
|
| 26 |
+
.dvc/
|
| 27 |
+
.dvcignore
|
| 28 |
+
|
| 29 |
+
# CI/CD config files
|
| 30 |
+
tox.ini
|
| 31 |
+
pytest.ini
|
| 32 |
+
setup.cfg
|
| 33 |
+
setup.py
|
| 34 |
+
requirements-dev.txt
|
| 35 |
+
|
| 36 |
+
# Ignore Docker build context bloat
|
| 37 |
+
*.tar
|
| 38 |
+
*.zip
|
| 39 |
+
*.gz
|
| 40 |
+
*.egg-info/
|
| 41 |
+
|
| 42 |
+
# Ignore Hugging Face cache
|
| 43 |
+
~/.cache/huggingface/
|
Dockerfile
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11.11-slim-bookworm
|
| 2 |
+
|
| 3 |
+
RUN apt-get update && \
|
| 4 |
+
apt-get install --no-install-recommends -y build-essential && \
|
| 5 |
+
rm -rf /var/lib/apt/lists/*
|
| 6 |
+
|
| 7 |
+
WORKDIR /app
|
| 8 |
+
|
| 9 |
+
COPY . /app
|
| 10 |
+
|
| 11 |
+
RUN pip install --no-cache-dir --upgrade pip && \
|
| 12 |
+
pip install --no-cache-dir -r app/requirements.txt -r app/model/requirements.txt
|
| 13 |
+
|
| 14 |
+
RUN mkdir -p /tmp/prometheus_metrics && \
|
| 15 |
+
chmod 777 /tmp/prometheus_metrics
|
| 16 |
+
|
| 17 |
+
COPY start.sh /start.sh
|
| 18 |
+
RUN chmod +x /start.sh
|
| 19 |
+
|
| 20 |
+
RUN useradd -m appuser
|
| 21 |
+
USER appuser
|
| 22 |
+
|
| 23 |
+
EXPOSE 7860
|
| 24 |
+
ENV HOST=0.0.0.0 \
|
| 25 |
+
PORT=7860 \
|
| 26 |
+
PYTHONUNBUFFERED=1 \
|
| 27 |
+
PROMETHEUS_MULTIPROC_DIR=/tmp/prometheus_metrics
|
| 28 |
+
|
| 29 |
+
ENTRYPOINT ["/start.sh"]
|
| 30 |
+
CMD ["gunicorn", "-k", "uvicorn.workers.UvicornWorker", "app.main:app", "--workers", "2", "--bind", "0.0.0.0:7860"]
|
README.md
CHANGED
|
@@ -1,10 +1,15 @@
|
|
| 1 |
-
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk: docker
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Toxic Tweet Tagger
|
| 3 |
+
emoji: 🤖
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
python_version: "3.11"
|
| 9 |
+
app_file: app.py
|
| 10 |
+
pinned: false
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Toxic Tweet Tagger
|
| 14 |
+
|
| 15 |
+
A machine learning app that detects toxic tweets and explains predictions using LIME.
|
Toxic_TweetTagger.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: Toxic-TweetTagger
|
| 3 |
+
Version: 0.1.0
|
| 4 |
+
Summary: End to end Hate-Tweet detection automation with MLOps implementation
|
| 5 |
+
Home-page: https://github.com/SubinoyBera/Toxic-TweetTagger
|
| 6 |
+
Author: Subinoy Bera
|
| 7 |
+
Author-email: subinoyberadgp@gmail.com
|
| 8 |
+
License: Apache-2.0
|
| 9 |
+
Classifier: Topic :: Engineering/Automation :: ML/MLOps
|
| 10 |
+
Classifier: Development Status :: 4 - Beta
|
| 11 |
+
Classifier: Intended Audience :: Developers
|
| 12 |
+
Classifier: License :: OSI Approved :: MIT License
|
| 13 |
+
Classifier: Programming Language :: Python :: 3.11
|
| 14 |
+
Classifier: Programming Language :: Python :: 3.12
|
| 15 |
+
Classifier: Operating System :: OS Independent
|
| 16 |
+
Requires-Python: >=3.11
|
| 17 |
+
Description-Content-Type: text/markdown
|
| 18 |
+
License-File: LICENSE
|
| 19 |
+
Requires-Dist: numpy==2.2.6
|
| 20 |
+
Requires-Dist: pandas==2.3.1
|
| 21 |
+
Requires-Dist: scipy==1.13.1
|
| 22 |
+
Requires-Dist: scikit-learn==1.7.0
|
| 23 |
+
Requires-Dist: xgboost==3.0.2
|
| 24 |
+
Requires-Dist: nltk==3.9.1
|
| 25 |
+
Requires-Dist: python-box==7.3.2
|
| 26 |
+
Requires-Dist: ensure==1.0.4
|
| 27 |
+
Requires-Dist: PyYAML==6.0.2
|
| 28 |
+
Requires-Dist: dvc==3.61.0
|
| 29 |
+
Requires-Dist: mlflow==2.22.1
|
| 30 |
+
Requires-Dist: dagshub==0.5.10
|
| 31 |
+
Requires-Dist: fastapi==0.116.1
|
| 32 |
+
Requires-Dist: pydantic==2.11.7
|
| 33 |
+
Requires-Dist: tqdm==4.67.1
|
| 34 |
+
Requires-Dist: requests==2.32.4
|
| 35 |
+
Requires-Dist: pytest==8.4.1
|
| 36 |
+
Requires-Dist: tox==4.11.3
|
| 37 |
+
Provides-Extra: testing
|
| 38 |
+
Requires-Dist: pytest>=8.0.0; extra == "testing"
|
| 39 |
+
Requires-Dist: black>=25.0.0; extra == "testing"
|
| 40 |
+
Requires-Dist: flake8>=6.0.0; extra == "testing"
|
| 41 |
+
Requires-Dist: mypy>=1.5.0; extra == "testing"
|
| 42 |
+
Requires-Dist: tox>=4.0.0; extra == "testing"
|
| 43 |
+
Dynamic: license-file
|
| 44 |
+
|
| 45 |
+
# Toxic-TweetTagger
|
| 46 |
+
Hate speech detection
|
Toxic_TweetTagger.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
LICENSE
|
| 2 |
+
README.md
|
| 3 |
+
setup.cfg
|
| 4 |
+
setup.py
|
| 5 |
+
src/Toxic_TweetTagger.egg-info/PKG-INFO
|
| 6 |
+
src/Toxic_TweetTagger.egg-info/SOURCES.txt
|
| 7 |
+
src/Toxic_TweetTagger.egg-info/dependency_links.txt
|
| 8 |
+
src/Toxic_TweetTagger.egg-info/requires.txt
|
| 9 |
+
src/Toxic_TweetTagger.egg-info/top_level.txt
|
| 10 |
+
src/components/__init__.py
|
| 11 |
+
src/components/data_ingestion.py
|
| 12 |
+
src/components/data_preprocessing.py
|
| 13 |
+
src/components/feature_engineering.py
|
| 14 |
+
src/components/model_evaluation.py
|
| 15 |
+
src/components/model_training.py
|
| 16 |
+
src/components/register_model.py
|
| 17 |
+
src/constant/__init__.py
|
| 18 |
+
src/constant/constants.py
|
| 19 |
+
src/core/__init__.py
|
| 20 |
+
src/core/config_entity.py
|
| 21 |
+
src/core/configuration.py
|
| 22 |
+
src/core/exception.py
|
| 23 |
+
src/core/logger.py
|
| 24 |
+
src/pipeline/__init__.py
|
| 25 |
+
src/pipeline/ml_pipeline.py
|
| 26 |
+
tests/test_app.py
|
| 27 |
+
tests/test_model.py
|
Toxic_TweetTagger.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
Toxic_TweetTagger.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
numpy==2.2.6
|
| 2 |
+
pandas==2.3.1
|
| 3 |
+
scipy==1.13.1
|
| 4 |
+
scikit-learn==1.7.0
|
| 5 |
+
xgboost==3.0.2
|
| 6 |
+
nltk==3.9.1
|
| 7 |
+
python-box==7.3.2
|
| 8 |
+
ensure==1.0.4
|
| 9 |
+
PyYAML==6.0.2
|
| 10 |
+
dvc==3.61.0
|
| 11 |
+
mlflow==2.22.1
|
| 12 |
+
dagshub==0.5.10
|
| 13 |
+
fastapi==0.116.1
|
| 14 |
+
pydantic==2.11.7
|
| 15 |
+
tqdm==4.67.1
|
| 16 |
+
requests==2.32.4
|
| 17 |
+
pytest==8.4.1
|
| 18 |
+
tox==4.11.3
|
| 19 |
+
|
| 20 |
+
[testing]
|
| 21 |
+
pytest>=8.0.0
|
| 22 |
+
black>=25.0.0
|
| 23 |
+
flake8>=6.0.0
|
| 24 |
+
mypy>=1.5.0
|
| 25 |
+
tox>=4.0.0
|
Toxic_TweetTagger.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
components
|
| 2 |
+
constant
|
| 3 |
+
core
|
| 4 |
+
pipeline
|
__init__.py
ADDED
|
File without changes
|
__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (152 Bytes). View file
|
|
|
app/__pycache__/main.cpython-311.pyc
ADDED
|
Binary file (5.04 kB). View file
|
|
|
app/api/__pycache__/api_routes.cpython-311.pyc
ADDED
|
Binary file (3.36 kB). View file
|
|
|
app/api/__pycache__/dependencies.cpython-311.pyc
ADDED
|
Binary file (1.18 kB). View file
|
|
|
app/api/__pycache__/schemas.cpython-311.pyc
ADDED
|
Binary file (3.97 kB). View file
|
|
|
app/api/api_routes.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, Request, Depends
|
| 2 |
+
from prometheus_client import generate_latest, CollectorRegistry, multiprocess, CONTENT_TYPE_LATEST
|
| 3 |
+
from fastapi.responses import Response, HTMLResponse
|
| 4 |
+
|
| 5 |
+
from src.app.services.inference import InferenceService
|
| 6 |
+
from src.app.services.explainer import ExplainerService
|
| 7 |
+
from src.app.services.feedback import FeedbackService
|
| 8 |
+
from src.app.api.dependencies import get_inference_service, get_explainer_service, get_feedback_service
|
| 9 |
+
from src.app.api.schemas import InferenceRequest, InferenceResponse, FeedbackRequest, ExplanationRequest
|
| 10 |
+
|
| 11 |
+
router = APIRouter()
|
| 12 |
+
|
| 13 |
+
@router.get("/health")
|
| 14 |
+
async def health_check():
|
| 15 |
+
return {"status": "ok"}
|
| 16 |
+
|
| 17 |
+
@router.post("/predict", response_model=InferenceResponse)
|
| 18 |
+
async def predict(request: Request, payload: InferenceRequest, service: InferenceService = Depends(get_inference_service)):
|
| 19 |
+
request_id = request.state.request_id
|
| 20 |
+
return service.predict(request_id, payload.input_tweet, payload.text)
|
| 21 |
+
|
| 22 |
+
@router.post("/explain")
|
| 23 |
+
async def explain(payload: ExplanationRequest, service: ExplainerService = Depends(get_explainer_service)):
|
| 24 |
+
return HTMLResponse(service.explain(payload.input_tweet))
|
| 25 |
+
|
| 26 |
+
@router.post("/submit_feedback")
|
| 27 |
+
async def submit_feedback(request: Request, payload: FeedbackRequest, service: FeedbackService = Depends(get_feedback_service)):
|
| 28 |
+
request_id = request.state.request_id
|
| 29 |
+
return service.submit_feedback(request_id, payload.predicted_label, payload.feedback_label)
|
| 30 |
+
|
| 31 |
+
@router.get("/metrics")
|
| 32 |
+
def metrics():
|
| 33 |
+
registry = CollectorRegistry()
|
| 34 |
+
multiprocess.MultiProcessCollector(registry)
|
| 35 |
+
return Response(
|
| 36 |
+
generate_latest(registry),
|
| 37 |
+
media_type=CONTENT_TYPE_LATEST,
|
| 38 |
+
headers={"Cache-Control": "no-cache"}
|
| 39 |
+
)
|
app/api/dependencies.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import Request
|
| 2 |
+
|
| 3 |
+
from src.app.services.inference import InferenceService
|
| 4 |
+
from src.app.services.explainer import ExplainerService
|
| 5 |
+
from src.app.services.feedback import FeedbackService
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def get_inference_service(request: Request) -> InferenceService:
|
| 9 |
+
return request.app.state.prediction_service
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def get_explainer_service(request: Request) -> ExplainerService:
|
| 13 |
+
return request.app.state.explainer_service
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def get_feedback_service(request: Request) -> FeedbackService:
|
| 17 |
+
return request.app.state.feedback_service
|
app/api/schemas.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Schema validation for the API requests and responses
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, Field
|
| 4 |
+
from typing import Annotated, Dict, Literal, Optional
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class InferenceRequest(BaseModel):
|
| 9 |
+
input_tweet: Annotated[str, Field(..., description="Input tweet or comment text for classification")]
|
| 10 |
+
text: Annotated[str, Field(..., description="Preprocessed text to be fed to the model for prediction")]
|
| 11 |
+
|
| 12 |
+
class ExplanationRequest(BaseModel):
|
| 13 |
+
input_tweet: Annotated[str, Field(..., description="Input tweet or comment text for generating explanation")]
|
| 14 |
+
|
| 15 |
+
class PredictionResult(BaseModel):
|
| 16 |
+
label: int
|
| 17 |
+
confidence: float = Field(..., ge=0.0, le=1.0, description="Prediction probability")
|
| 18 |
+
toxicity: Literal["strong", "high", "uncertain", "none"]
|
| 19 |
+
|
| 20 |
+
class ModelInfoSchema(BaseModel):
|
| 21 |
+
name: str = Field(..., description="Model name")
|
| 22 |
+
version: int = Field(..., description="Model version")
|
| 23 |
+
vectorizer: str = Field(..., description="Vectorizer class name")
|
| 24 |
+
|
| 25 |
+
class MetadataSchema(BaseModel):
|
| 26 |
+
latency: float = Field(..., ge=0, description="Response time in seconds")
|
| 27 |
+
usage: Dict[str, float]
|
| 28 |
+
model: ModelInfoSchema
|
| 29 |
+
streamable: bool = Field(default=False)
|
| 30 |
+
environment: Literal["Standard", "Beta", "Production"]
|
| 31 |
+
api_version: str
|
| 32 |
+
|
| 33 |
+
class InferenceResponse(BaseModel):
|
| 34 |
+
id: str
|
| 35 |
+
timestamp: datetime
|
| 36 |
+
object: Literal["text-classification"]
|
| 37 |
+
prediction: PredictionResult
|
| 38 |
+
warnings: Optional[dict] = None
|
| 39 |
+
metadata: MetadataSchema
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class FeedbackRequest(BaseModel):
|
| 43 |
+
predicted_label: Literal[0, 1]
|
| 44 |
+
feedback_label: Literal[0, 1]
|
app/main.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import yaml, json
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from fastapi import FastAPI
|
| 5 |
+
from contextlib import asynccontextmanager
|
| 6 |
+
|
| 7 |
+
import xgboost as xgb
|
| 8 |
+
from lime.lime_text import LimeTextExplainer
|
| 9 |
+
from src.core.constants import REGISTERED_MODELS_DIR
|
| 10 |
+
from src.app.workers import BufferedEventConsumerWorker
|
| 11 |
+
from src.app.services.inference import InferenceService
|
| 12 |
+
from src.app.services.feedback import FeedbackService
|
| 13 |
+
from src.app.services.explainer import ExplainerService
|
| 14 |
+
|
| 15 |
+
from src.utils import load_obj
|
| 16 |
+
from src.app.middleware import http_observability_middleware
|
| 17 |
+
from src.core.constants import DATABASE_NAME, PRODUCTION_COLLECTION_NAME, FEEDBACK_COLLECTION_NAME
|
| 18 |
+
from src.core.mongo_client import MongoDBClient
|
| 19 |
+
from src.core.logger import logging
|
| 20 |
+
from src.core.exception import AppException
|
| 21 |
+
|
| 22 |
+
from src.app.api.api_routes import router
|
| 23 |
+
|
| 24 |
+
@asynccontextmanager
|
| 25 |
+
async def lifespan(app: FastAPI):
|
| 26 |
+
# application state setup
|
| 27 |
+
try:
|
| 28 |
+
mongo_client = MongoDBClient()
|
| 29 |
+
# load model
|
| 30 |
+
xgb_booster = xgb.Booster()
|
| 31 |
+
xgb_booster.load_model(Path(REGISTERED_MODELS_DIR, "artifacts", "booster.json"))
|
| 32 |
+
xgb_booster.set_param({"nthread": 1})
|
| 33 |
+
# load vectorizer
|
| 34 |
+
vectorizer = load_obj(Path(REGISTERED_MODELS_DIR, "artifacts"), "vectorizer.joblib")
|
| 35 |
+
|
| 36 |
+
with open(Path(REGISTERED_MODELS_DIR, "artifacts/metrics.json"), 'r') as f:
|
| 37 |
+
metrics = json.load(f)
|
| 38 |
+
eval_threshold = metrics.get("threshold", 0.5)
|
| 39 |
+
|
| 40 |
+
# get model version
|
| 41 |
+
with open(Path("src/app/model/registered_model_meta"), 'r') as f:
|
| 42 |
+
model_metadata = yaml.safe_load(f)
|
| 43 |
+
if not model_metadata:
|
| 44 |
+
raise FileNotFoundError("Failed to load file having model metadata")
|
| 45 |
+
model_version = int(model_metadata.get("model_version", 0))
|
| 46 |
+
|
| 47 |
+
# initialize workers
|
| 48 |
+
prediction_event_consumer = BufferedEventConsumerWorker(mongo_client, DATABASE_NAME, PRODUCTION_COLLECTION_NAME)
|
| 49 |
+
feedback_event_consumer = BufferedEventConsumerWorker(mongo_client, DATABASE_NAME, FEEDBACK_COLLECTION_NAME)
|
| 50 |
+
|
| 51 |
+
# initialize services
|
| 52 |
+
app.state.prediction_service = InferenceService(xgb_booster, vectorizer, eval_threshold,
|
| 53 |
+
prediction_event_consumer, model_version)
|
| 54 |
+
|
| 55 |
+
app.state.feedback_service = FeedbackService(feedback_event_consumer)
|
| 56 |
+
|
| 57 |
+
lime_explainer = LimeTextExplainer(class_names=["hate", "non-hate"], bow=False)
|
| 58 |
+
app.state.explainer_service = ExplainerService(lime_explainer, xgb_booster, vectorizer)
|
| 59 |
+
|
| 60 |
+
logging.info("Infernce API app server started successfully")
|
| 61 |
+
|
| 62 |
+
except Exception as e:
|
| 63 |
+
logging.critical(f"Startup Failed: {e}", exc_info=True)
|
| 64 |
+
raise AppException(e, sys)
|
| 65 |
+
|
| 66 |
+
# run application
|
| 67 |
+
yield
|
| 68 |
+
|
| 69 |
+
# application shutdown
|
| 70 |
+
prediction_event_consumer.shutdown()
|
| 71 |
+
feedback_event_consumer.shutdown()
|
| 72 |
+
mongo_client.close_connection()
|
| 73 |
+
|
| 74 |
+
# Create FastAPI app
|
| 75 |
+
app = FastAPI(
|
| 76 |
+
title="Hate Speech Detection API",
|
| 77 |
+
version="2.0.0",
|
| 78 |
+
description="Production-grade ML inference API with monitoring and feedback system.",
|
| 79 |
+
lifespan=lifespan
|
| 80 |
+
)
|
| 81 |
+
|
| 82 |
+
# Register API routes
|
| 83 |
+
app.include_router(router, prefix="/api")
|
| 84 |
+
|
| 85 |
+
# Register middleware
|
| 86 |
+
app.middleware("http")(http_observability_middleware)
|
app/middleware/__init__.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import uuid
|
| 3 |
+
from fastapi import Request
|
| 4 |
+
from src.core.logger import logging
|
| 5 |
+
from src.app.monitoring.http_metrics import HTTP_REQUESTS_TOTAL, HTTP_REQUEST_DURATION_SECONDS, HTTP_REQUESTS_IN_PROGRESS
|
| 6 |
+
|
| 7 |
+
async def http_observability_middleware(request: Request, call_next):
|
| 8 |
+
# Skip Prometheus metrics endpoint
|
| 9 |
+
if request.url.path == "/api/metrics":
|
| 10 |
+
return await call_next(request)
|
| 11 |
+
|
| 12 |
+
request_id = request.headers.get("X-Request-ID")
|
| 13 |
+
if not request_id:
|
| 14 |
+
request_id = str(uuid.uuid4())
|
| 15 |
+
|
| 16 |
+
request.state.request_id = request_id
|
| 17 |
+
|
| 18 |
+
method = request.method
|
| 19 |
+
route = request.scope.get("route")
|
| 20 |
+
path = route.path if route else request.url.path
|
| 21 |
+
|
| 22 |
+
start_time = time.perf_counter()
|
| 23 |
+
|
| 24 |
+
logging.info(f"[{request_id}] Incoming request {method} {path}")
|
| 25 |
+
|
| 26 |
+
HTTP_REQUESTS_IN_PROGRESS.inc()
|
| 27 |
+
|
| 28 |
+
try:
|
| 29 |
+
response = await call_next(request)
|
| 30 |
+
status_code = response.status_code
|
| 31 |
+
|
| 32 |
+
except Exception:
|
| 33 |
+
duration = time.perf_counter() - start_time
|
| 34 |
+
|
| 35 |
+
HTTP_REQUESTS_TOTAL.labels(
|
| 36 |
+
method=method,
|
| 37 |
+
path=path,
|
| 38 |
+
status="500",
|
| 39 |
+
).inc()
|
| 40 |
+
|
| 41 |
+
HTTP_REQUEST_DURATION_SECONDS.labels(
|
| 42 |
+
method=method,
|
| 43 |
+
path=path,
|
| 44 |
+
).observe(duration)
|
| 45 |
+
|
| 46 |
+
HTTP_REQUESTS_IN_PROGRESS.dec()
|
| 47 |
+
raise
|
| 48 |
+
|
| 49 |
+
duration = time.perf_counter() - start_time
|
| 50 |
+
|
| 51 |
+
HTTP_REQUESTS_TOTAL.labels(
|
| 52 |
+
method=method,
|
| 53 |
+
path=path,
|
| 54 |
+
status=str(status_code),
|
| 55 |
+
).inc()
|
| 56 |
+
|
| 57 |
+
HTTP_REQUEST_DURATION_SECONDS.labels(
|
| 58 |
+
method=method,
|
| 59 |
+
path=path,
|
| 60 |
+
).observe(duration)
|
| 61 |
+
|
| 62 |
+
HTTP_REQUESTS_IN_PROGRESS.dec()
|
| 63 |
+
|
| 64 |
+
response.headers["X-Request-ID"] = request_id
|
| 65 |
+
|
| 66 |
+
return response
|
app/middleware/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (2.7 kB). View file
|
|
|
app/model/MLmodel
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
artifact_path: XGB-v4
|
| 2 |
+
flavors:
|
| 3 |
+
python_function:
|
| 4 |
+
artifacts:
|
| 5 |
+
booster:
|
| 6 |
+
path: artifacts\booster.json
|
| 7 |
+
uri: D:\My Projects\Toxic Tweet-Tagger\test_models\booster.json
|
| 8 |
+
metrics:
|
| 9 |
+
path: artifacts\metrics.json
|
| 10 |
+
uri: D:\My Projects\Toxic Tweet-Tagger\test_models\metrics.json
|
| 11 |
+
model:
|
| 12 |
+
path: artifacts\model.joblib
|
| 13 |
+
uri: D:\My Projects\Toxic Tweet-Tagger\test_models\model.joblib
|
| 14 |
+
vectorizer:
|
| 15 |
+
path: artifacts\vectorizer.joblib
|
| 16 |
+
uri: D:\My Projects\Toxic Tweet-Tagger\test_models\vectorizer.joblib
|
| 17 |
+
cloudpickle_version: 3.1.1
|
| 18 |
+
code: null
|
| 19 |
+
env:
|
| 20 |
+
conda: conda.yaml
|
| 21 |
+
virtualenv: python_env.yaml
|
| 22 |
+
loader_module: mlflow.pyfunc.model
|
| 23 |
+
python_model: python_model.pkl
|
| 24 |
+
python_version: 3.11.5
|
| 25 |
+
streamable: false
|
| 26 |
+
mlflow_version: 2.22.1
|
| 27 |
+
model_size_bytes: 4509986
|
| 28 |
+
model_uuid: a2c7dc9359894fab899c13a88f9f9a4c
|
| 29 |
+
prompts: null
|
| 30 |
+
run_id: 9b6349ac22e04cf0a22257952299e056
|
| 31 |
+
utc_time_created: '2026-02-17 03:36:56.916972'
|
app/model/artifacts/booster.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app/model/artifacts/metrics.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"threshold": 0.48, "accuracy": 0.80601184024014, "precision": 0.7922348788705278, "recall": 0.8272049585392411, "f1 score": 0.8093423478795329, "roc_auc": 0.8852078424195188}
|
app/model/artifacts/model.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:944104aefb800d70fccf6e65e545f3b3fbbbd79382e461ba06f98992f2e686fe
|
| 3 |
+
size 1240274
|
app/model/artifacts/vectorizer.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1fea91a7b6fb18543f7a7e2717e44135985df11a339efbc30b410dbad8d962fe
|
| 3 |
+
size 116783
|
app/model/conda.yaml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
channels:
|
| 2 |
+
- conda-forge
|
| 3 |
+
dependencies:
|
| 4 |
+
- python=3.11.5
|
| 5 |
+
- pip<=25.1
|
| 6 |
+
- pip:
|
| 7 |
+
- mlflow==2.22.1
|
| 8 |
+
- cloudpickle==3.1.1
|
| 9 |
+
- numpy==2.2.6
|
| 10 |
+
- pandas==2.3.1
|
| 11 |
+
- psutil==7.0.0
|
| 12 |
+
- scikit-learn==1.6.1
|
| 13 |
+
- scipy==1.13.1
|
| 14 |
+
- xgboost==3.0.2
|
| 15 |
+
name: mlflow-env
|
app/model/python_env.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
python: 3.11.5
|
| 2 |
+
build_dependencies:
|
| 3 |
+
- pip==25.1
|
| 4 |
+
- setuptools==80.9.0
|
| 5 |
+
- wheel==0.45.1
|
| 6 |
+
dependencies:
|
| 7 |
+
- -r requirements.txt
|
app/model/python_model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a6472cb967ef181289c33621e9dbfd2437a53589d1ca75de4307ba20a0bae7ef
|
| 3 |
+
size 1378921
|
app/model/registered_model_meta
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_name: ToxicTagger-Models
|
| 2 |
+
model_version: '26'
|
app/model/requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
mlflow==2.22.1
|
| 2 |
+
cloudpickle==3.1.1
|
| 3 |
+
numpy==2.2.6
|
| 4 |
+
pandas==2.3.1
|
| 5 |
+
psutil==7.0.0
|
| 6 |
+
scikit-learn==1.6.1
|
| 7 |
+
scipy==1.13.1
|
| 8 |
+
xgboost==3.0.2
|
app/monitoring/__init__.py
ADDED
|
File without changes
|
app/monitoring/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (173 Bytes). View file
|
|
|
app/monitoring/__pycache__/http_metrics.cpython-311.pyc
ADDED
|
Binary file (931 Bytes). View file
|
|
|
app/monitoring/__pycache__/service_metrics.cpython-311.pyc
ADDED
|
Binary file (1.96 kB). View file
|
|
|
app/monitoring/http_metrics.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from prometheus_client import Counter, Histogram, Gauge
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
HTTP_REQUESTS_TOTAL = Counter(
|
| 5 |
+
"http_requests_total",
|
| 6 |
+
"Total number of HTTP requests handled by the inference service",
|
| 7 |
+
["method", "path", "status"],
|
| 8 |
+
)
|
| 9 |
+
|
| 10 |
+
HTTP_REQUEST_DURATION_SECONDS = Histogram(
|
| 11 |
+
"http_request_duration_seconds",
|
| 12 |
+
"HTTP request latency in seconds",
|
| 13 |
+
["method", "path"],
|
| 14 |
+
buckets=(0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0),
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
HTTP_REQUESTS_IN_PROGRESS = Gauge(
|
| 18 |
+
"http_requests_in_progress",
|
| 19 |
+
"Number of HTTP requests currently being processed",
|
| 20 |
+
)
|
app/monitoring/service_metrics.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from prometheus_client import Counter, Histogram
|
| 2 |
+
|
| 3 |
+
# Requests successfully served
|
| 4 |
+
PREDICTION_REQUEST_SUCCESS = Counter(
|
| 5 |
+
"predict_requests_success_total",
|
| 6 |
+
"Total successful prediction responses"
|
| 7 |
+
)
|
| 8 |
+
|
| 9 |
+
# Requests failed
|
| 10 |
+
PREDICTION_REQUEST_FAILED = Counter(
|
| 11 |
+
"predict_requests_failed_total",
|
| 12 |
+
"Total failed prediction requests"
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
# Prediction class distribution (hate / non-hate)
|
| 16 |
+
PREDICTION_CLASS = Counter(
|
| 17 |
+
"prediction_class_total",
|
| 18 |
+
"Count of predicted classes",
|
| 19 |
+
["class_label"] # label dimension
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
# Prediction label confidence
|
| 23 |
+
PREDICTION_CONFIDENCE = Histogram(
|
| 24 |
+
"prediction_confidence",
|
| 25 |
+
"Confidence distribution by predicted class",
|
| 26 |
+
["class_label"],
|
| 27 |
+
buckets=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
# Inference responce time
|
| 31 |
+
INFERENCE_LATENCY = Histogram(
|
| 32 |
+
"model_inference_seconds",
|
| 33 |
+
"Model inference time in seconds",
|
| 34 |
+
buckets=[0.005, 0.01, 0.02, 0.03, 0.05, 0.1]
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
EXPLAINER_REQUEST_SUCCESS = Counter(
|
| 38 |
+
"explainer_requests_success_total",
|
| 39 |
+
"Total successful explain requests"
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
EXPLAINER_REQUEST_FAILED = Counter(
|
| 43 |
+
"explainer_requests_failed_total",
|
| 44 |
+
"Total failed explain requests"
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
# Feedback counter
|
| 48 |
+
FEEDBACK_REQUEST_SUCCESS = Counter(
|
| 49 |
+
"feedback_subissions_success_total",
|
| 50 |
+
"Total successful feedback submissions"
|
| 51 |
+
)
|
| 52 |
+
|
| 53 |
+
USER_PREDICTION_FEEDBACK = Counter(
|
| 54 |
+
"user_prediction_feedback_total",
|
| 55 |
+
"User feedback indicating whether the model prediction was correct",
|
| 56 |
+
["feedback"]
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
FEEDBACK_REQUEST_FAILED = Counter(
|
| 60 |
+
"feedback_submissions_failed_total",
|
| 61 |
+
"Total failed feedback submissions"
|
| 62 |
+
)
|
app/requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.116.1
|
| 2 |
+
uvicorn==0.35.0
|
| 3 |
+
joblib==1.5.1
|
| 4 |
+
PyYAML==6.0.2
|
| 5 |
+
lime==0.2.0.1
|
| 6 |
+
gunicorn==23.0.0
|
| 7 |
+
prometheus-client==0.23.1
|
app/services/__pycache__/explainer.cpython-311.pyc
ADDED
|
Binary file (3.12 kB). View file
|
|
|
app/services/__pycache__/feedback.cpython-311.pyc
ADDED
|
Binary file (2.4 kB). View file
|
|
|
app/services/__pycache__/inference.cpython-311.pyc
ADDED
|
Binary file (5.56 kB). View file
|
|
|
app/services/explainer.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import numpy as np
|
| 3 |
+
from src.core.logger import logging
|
| 4 |
+
from src.core.exception import AppException
|
| 5 |
+
from src.app.monitoring.service_metrics import EXPLAINER_REQUEST_SUCCESS, EXPLAINER_REQUEST_FAILED
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class ExplainerService:
    """LIME-based explanation service for the toxicity classifier.

    Wraps a LimeTextExplainer together with the trained XGBoost booster and
    its text vectorizer, and renders per-input explanations as HTML.
    """

    def __init__(self, explainer, model_booster, vectorizer):
        """
        Args:
            explainer: A LimeTextExplainer instance.
            model_booster: The trained XGBoost booster used for scoring.
            vectorizer: Transforms raw text into the model's feature space.
        """
        self.explainer = explainer
        self.booster = model_booster
        self.vectorizer = vectorizer

    def _get_prediction(self, text) -> np.ndarray:
        """Return class-probability rows in the two-column shape LIME expects."""
        features = self.vectorizer.transform(text)
        scores = self.booster.inplace_predict(features)
        # A 1-D output holds P(class=1) only; expand to [[P(0), P(1)], ...].
        if len(scores.shape) == 1:
            scores = np.vstack([1 - scores, scores]).T
        return scores

    def explain(self, text: str):
        """Generate a model explanation for the given text.

        Returns:
            HTML content of the explanation which is rendered in the UI.

        Raises:
            AppException: if LIME or the model fails while explaining.
        """
        try:
            result = self.explainer.explain_instance(
                text,
                self._get_prediction,
                num_features=10,
                num_samples=20
            )
            html = result.as_html()
            EXPLAINER_REQUEST_SUCCESS.inc()
            return html
        except Exception as e:
            EXPLAINER_REQUEST_FAILED.inc()
            logging.error(f"Explainer service failed: {e}", exc_info=True)
            raise AppException(e, sys)
|
app/services/feedback.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
from datetime import datetime, timezone
|
| 3 |
+
from src.core.logger import logging
|
| 4 |
+
from src.core.exception import AppException
|
| 5 |
+
from src.app.monitoring.service_metrics import FEEDBACK_REQUEST_SUCCESS, USER_PREDICTION_FEEDBACK, FEEDBACK_REQUEST_FAILED
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class FeedbackService:
    """Accepts user feedback on predictions and queues it for async persistence."""

    def __init__(self, feedback_event_consumer):
        # Background worker that buffers feedback records for batched DB writes.
        self.event_consumer_worker = feedback_event_consumer

    def submit_feedback(self, request_id, pred_label, feedback_label):
        """Record a single feedback submission.

        Args:
            request_id: Identifier of the original prediction request.
            pred_label: The label the model predicted.
            feedback_label: 1 when the user marked the prediction correct,
                anything else counts as incorrect.

        Returns:
            A status dict confirming the feedback was recorded.

        Raises:
            AppException: if queueing or metric updates fail.
        """
        try:
            record = {
                "request_id": request_id,
                # NOTE(review): key is "time_stamp" here but prediction records
                # use "timestamp" — confirm downstream consumers before unifying.
                "time_stamp": datetime.now(timezone.utc).isoformat(),
                "predicted_label": pred_label,
                "feedback_label": feedback_label
            }

            self.event_consumer_worker.add_event(record)
            FEEDBACK_REQUEST_SUCCESS.inc()

            outcome = "correct" if feedback_label == 1 else "incorrect"
            USER_PREDICTION_FEEDBACK.labels(feedback=outcome).inc()

            return {
                "status": "success",
                "message": "Feedback recorded successfully",
            }

        except Exception as e:
            logging.exception(f"Failed to submit feedback for request_id: {request_id} : {e}")
            FEEDBACK_REQUEST_FAILED.inc()
            raise AppException(e, sys)
|
app/services/inference.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time, sys
|
| 2 |
+
from datetime import datetime, timezone
|
| 3 |
+
from src.core.logger import logging
|
| 4 |
+
from src.core.exception import AppException
|
| 5 |
+
from src.app.monitoring.service_metrics import (PREDICTION_REQUEST_SUCCESS, PREDICTION_REQUEST_FAILED,
|
| 6 |
+
PREDICTION_CLASS, INFERENCE_LATENCY, PREDICTION_CONFIDENCE)
|
| 7 |
+
|
| 8 |
+
class InferenceService:
    """Runs toxicity classification, records monitoring metrics, and queues
    prediction events for asynchronous persistence."""

    def __init__(self, model_booster, vectorizer, eval_threshold,
                 prediction_event_consumer, model_version):
        self.booster = model_booster
        self.vectorizer = vectorizer
        self.threshold = eval_threshold
        self.event_consumer_worker = prediction_event_consumer
        self.model_version = model_version

    def predict(self, request_id: str, input_tweet: str, text: str):
        """Classify a single preprocessed text and build the API response.

        Args:
            request_id: Unique identifier for this request.
            input_tweet: The original (raw) comment, stored in the event record.
            text: The preprocessed text fed to the vectorizer/model.

        Returns:
            A response dict with the prediction, optional low-confidence
            warning, and request metadata.

        Raises:
            AppException: if vectorization or model scoring fails.
            RuntimeError: if the model yields no prediction.
        """
        try:
            timestamp = datetime.now(timezone.utc).isoformat()
            start_time = time.perf_counter()

            features = self.vectorizer.transform([text])
            prob = self.booster.inplace_predict(features)  # P(class=1)
            pred = (prob > self.threshold).astype(int)

        except Exception as e:
            logging.exception(e)
            PREDICTION_REQUEST_FAILED.inc()
            raise AppException(e, sys)

        if prob is None or len(pred) == 0:
            logging.error("No prediction made by the model")
            PREDICTION_REQUEST_FAILED.inc()
            raise RuntimeError("No prediction made by the model")

        # Bucket the raw score into a coarse toxicity tier around the threshold.
        score = prob[0]
        if score > 0.70:
            toxicity = "strong"
        elif score > self.threshold + 0.05:
            toxicity = "high"
        elif score > self.threshold - 0.03:
            toxicity = "uncertain"
        else:
            toxicity = "none"

        # Confidence of the *predicted* class; margin measures distance from 0.5.
        confidence = float(score if pred[0] == 1 else 1 - score)
        confidence_margin = abs(2 * float(score) - 1)

        warnings = None
        if confidence_margin < 0.10:
            message=f"Prediction is close to model decision boundary. Confidence Margin: {round(confidence_margin, 4)}. Manual review is recommended!"
            warnings = {
                "code": "LOW_CONFIDENCE_MARGIN",
                "message": message
            }

        # Prepare the record to insert into database
        prediction_record = {
            "request_id": request_id,
            "timestamp": timestamp,
            "comment": input_tweet,
            "prediction": int(pred[0]),
            "confidence": round(confidence, 4)
        }

        # Hand the record to the batch writer for async insertion into MongoDB.
        self.event_consumer_worker.add_event(prediction_record)

        response_time = round(time.perf_counter() - start_time, 4)

        PREDICTION_REQUEST_SUCCESS.inc()
        INFERENCE_LATENCY.observe(response_time)          # latency histogram
        PREDICTION_CLASS.labels(class_label=str(pred[0])).inc()   # class distribution
        PREDICTION_CONFIDENCE.labels(class_label=str(pred[0])).observe(confidence)

        return {
            "id": request_id,
            "timestamp": timestamp,
            "object": "text-classification",
            "prediction": {
                "label": int(pred[0]),
                "confidence": round(confidence, 4),
                "toxicity": toxicity,
            },
            "warnings": warnings,
            "metadata": {
                "latency": response_time,
                "usage": {
                    "word_count": len(text.split()),
                    "total_characters": len(text)
                },
                "model": {
                    "name": "XGB-Classifier-Booster",
                    "version": self.model_version,
                    "vectorizer": str(type(self.vectorizer).__name__),
                },
                "streamable": False,
                "environment": "Production",
                "api_version": "v-2.0"
            }
        }
|
app/workers/__init__.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This module contains the implementation of the BufferedEventConsumerWorker class, which is responsible for consuming events from a buffer queue and writing them to a MongoDB collection in batches.
|
| 2 |
+
|
| 3 |
+
import time
|
| 4 |
+
import threading
|
| 5 |
+
from typing import Any
|
| 6 |
+
from queue import Queue, Empty, Full
|
| 7 |
+
from src.core.logger import logging
|
| 8 |
+
|
| 9 |
+
class BufferedEventConsumerWorker:
    # Single daemon thread consumes events from an in-memory queue and writes
    # them to MongoDB in batches, flushing on size or time. A `None` record is
    # the shutdown sentinel.
    def __init__(self, mongo_client, database_name, collection_name, queue_maxsize: int = 1200,
                 max_batch_size: int = 1000, flush_interval: int = 30, mongo_timeout: int = 5):
        """
        Initializes the buffered event consumer worker and starts its thread.

        Args:
            mongo_client (MongoClient): The MongoDB client.
            database_name (str): The name of the MongoDB database.
            collection_name (str): The name of the MongoDB collection.
            queue_maxsize (int, optional): The maximum size of the queue. Defaults to 1200.
            max_batch_size (int, optional): The maximum batch size for write operations. Defaults to 1000.
            flush_interval (int, optional): The interval (in seconds) at which to flush the queue. Defaults to 30.
            mongo_timeout (int, optional): The timeout (in seconds) for write operations. Defaults to 5.
                NOTE(review): stored but not used anywhere in this class — confirm
                whether the client applies it or it can be removed.
        """
        self.queue: Queue[dict[str, Any] | None] = Queue(queue_maxsize)
        self.max_batch_size = max_batch_size
        self.flush_interval = flush_interval
        self.mongo_timeout = mongo_timeout

        # Signals the worker loop to stop; checked at the top of each iteration.
        self.shutdown_event: threading.Event = threading.Event()

        self.client = mongo_client
        self.database_name = database_name
        self.collection_name = collection_name

        # Daemon thread so an unclean process exit is not blocked by the worker.
        self.worker: threading.Thread = threading.Thread(
            target=self._writer_worker,
            daemon=True
        )
        self.worker.start()

    def add_event(self, record: dict) -> None:
        """
        Adds a record to the buffer queue.

        Drops the record (with a warning) if the queue stays full for 0.3s —
        producers are never blocked indefinitely.

        Args:
            record (dict): The record to add to the buffer queue.
        """
        try:
            self.queue.put(record, timeout=0.3)
        except Full:
            logging.warning(f"Failed to add record in buffer queue: Queue is full")

    def shutdown(self) -> None:
        """
        Gracefully shut down the worker thread.
        Ensures final flush before exit.
        """
        self.shutdown_event.set()

        # Wake up worker if it's blocked on queue.get()
        self.queue.put(None)
        # Wait the main thread until worker fully exits.
        self.worker.join()


    # INTERNAL WORKER
    def _writer_worker(self) -> None:
        """
        The worker thread responsible for consuming records from the buffer queue and writing them to MongoDB.

        It runs indefinitely until the shutdown event is set, at which point it will drain the queue quickly and exit.
        The worker thread tries to flush the queue at regular intervals, or when the batch size reaches the maximum threshold.
        If the queue is empty, it will wait indefinitely for new records to arrive. If a timeout is reached, it will flush the queue and reset the timer.
        """
        batch = []
        # Wall-clock time the current batch started accumulating; None while empty.
        first_record_time = None

        while not self.shutdown_event.is_set():
            try:
                if first_record_time is None:
                    # No records yet → wait indefinitely
                    record = self.queue.get()
                else:
                    # Batch in progress: only wait until its flush deadline.
                    elapsed = time.time() - first_record_time
                    remaining = max(self.flush_interval - elapsed, 0)
                    record = self.queue.get(timeout=remaining)

                # Shutdown sentinel — exit the loop; final flush happens below.
                if record is None:
                    break

                batch.append(record)

                if first_record_time is None:
                    first_record_time = time.time()

                # Drain quickly if batch growing
                while len(batch) < self.max_batch_size:
                    try:
                        record = self.queue.get_nowait()
                        # NOTE: a sentinel seen here only breaks the drain loop;
                        # the outer `while` then exits via shutdown_event (set
                        # before the sentinel is enqueued in shutdown()).
                        if record is None:
                            break
                        batch.append(record)
                    except Empty:
                        break

            except Empty:
                # Timeout reached
                pass

            # Flush conditions
            if batch and (
                len(batch) >= self.max_batch_size or
                (first_record_time and
                 time.time() - first_record_time >= self.flush_interval)
            ):
                self._flush(batch)
                batch.clear()
                first_record_time = None

        # Final flush on shutdown
        if batch:
            self._flush(batch)
        logging.info("BufferedEventConsumer worker stopped cleanly.")

    # Database flush
    def _flush(self, batch_records: list):
        """
        Writes batch records to MongoDB with basic failure handling.

        Failures are logged and swallowed (best-effort delivery): the records
        in a failed batch are lost, since the caller clears the batch afterwards.

        Args:
            batch_records (list): The list of records to flush to the database.

        Raises:
            Exception: If an error occurs while flushing the batch records to the database.
        """
        try:
            self.client.insert_docs(self.collection_name,
                                    self.database_name,
                                    batch_records
                                    )
            logging.info(f"Flushed {len(batch_records)} records to MongoDB")

        except Exception as e:
            logging.error(f"BufferedBatchWriter failed to flush: {e}", exc_info=True)
|
app/workers/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (7.08 kB). View file
|
|
|
components/__init__.py
ADDED
|
File without changes
|
components/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (163 Bytes). View file
|
|
|
components/__pycache__/data_ingestion.cpython-311.pyc
ADDED
|
Binary file (5 kB). View file
|
|
|
components/__pycache__/data_preprocessing.cpython-311.pyc
ADDED
|
Binary file (9.21 kB). View file
|
|
|
components/__pycache__/data_validation.cpython-311.pyc
ADDED
|
Binary file (13.9 kB). View file
|
|
|