Spaces:
Sleeping
Sleeping
MOHAN799S committed on
Commit ·
8da2d54
1
Parent(s): 53e8064
Deploy CivicConnect AI Engine — BERT + BLIP + EasyOCR + Whisper API
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff
- .dockerignore +55 -0
- .gitattributes +2 -0
- .gitignore +51 -0
- Dockerfile +67 -0
- classification/artifacts/bert_model/config.json +51 -0
- classification/artifacts/indic_test.csv +0 -0
- classification/artifacts/indic_tokenizer.pkl +3 -0
- classification/artifacts/indic_train.csv +0 -0
- classification/artifacts/indic_val.csv +0 -0
- classification/artifacts/indicbert_model/config.json +54 -0
- classification/artifacts/indicbert_model/tokenizer.json +3 -0
- classification/artifacts/indicbert_model/tokenizer_config.json +24 -0
- classification/artifacts/label_encoder.pkl +3 -0
- classification/artifacts/label_map.pkl +3 -0
- classification/artifacts/test.csv +523 -0
- classification/artifacts/tokenizer.pkl +3 -0
- classification/artifacts/train.csv +0 -0
- classification/artifacts/val.csv +523 -0
- classification/bert_classify.py +164 -0
- classification/bert_model.py +417 -0
- classification/classification/artifacts/label_encoder.pkl +3 -0
- classification/classification/artifacts/label_map.pkl +3 -0
- classification/indic_bert_classify.py +142 -0
- classification/indic_bert_model.py +299 -0
- classification/indic_train.csv +0 -0
- classification/train.csv +0 -0
- gfas/__init__.py +9 -0
- gfas/disparity_analysis.py +156 -0
- gfas/fairness_audit.py +111 -0
- gfas/fairness_metrics.py +80 -0
- gfas/gfas_engine.py +81 -0
- gfas/report_generator.py +93 -0
- main.py +707 -0
- multi_modal/audio_to_text.py +463 -0
- multi_modal/image_to_text.py +346 -0
- requirements.txt +50 -0
- sentiment_analysis/artifacts/indic_urgency_model/config.json +46 -0
- sentiment_analysis/artifacts/indic_urgency_model/label_encoder.pkl +3 -0
- sentiment_analysis/artifacts/indic_urgency_model/label_map.pkl +3 -0
- sentiment_analysis/artifacts/indic_urgency_model/tokenizer.json +3 -0
- sentiment_analysis/artifacts/indic_urgency_model/tokenizer_config.json +24 -0
- sentiment_analysis/artifacts/urgency_bert_model/config.json +43 -0
- sentiment_analysis/artifacts/urgency_bert_model/label_encoder.pkl +3 -0
- sentiment_analysis/artifacts/urgency_bert_model/label_map.pkl +3 -0
- sentiment_analysis/artifacts/urgency_bert_model/tokenizer.json +0 -0
- sentiment_analysis/artifacts/urgency_bert_model/tokenizer_config.json +14 -0
- sentiment_analysis/bert_model.py +268 -0
- sentiment_analysis/bert_predict.py +82 -0
- sentiment_analysis/indic_bert_model.py +260 -0
- sentiment_analysis/indic_bert_predict.py +89 -0
.dockerignore
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ── Version control ───────────────────────────────────────
|
| 2 |
+
.git
|
| 3 |
+
.gitignore
|
| 4 |
+
|
| 5 |
+
# ── Python cache ──────────────────────────────────────────
|
| 6 |
+
__pycache__
|
| 7 |
+
*.pyc
|
| 8 |
+
*.pyo
|
| 9 |
+
*.pyd
|
| 10 |
+
.Python
|
| 11 |
+
*.egg-info
|
| 12 |
+
dist/
|
| 13 |
+
build/
|
| 14 |
+
.eggs/
|
| 15 |
+
|
| 16 |
+
# ── Virtual environments ──────────────────────────────────
|
| 17 |
+
.venv
|
| 18 |
+
venv/
|
| 19 |
+
env/
|
| 20 |
+
ENV/
|
| 21 |
+
|
| 22 |
+
# ── Local secrets / config ────────────────────────────────
|
| 23 |
+
.env
|
| 24 |
+
.env.*
|
| 25 |
+
!.env.example
|
| 26 |
+
|
| 27 |
+
# ── Test / dev artefacts ──────────────────────────────────
|
| 28 |
+
tests/
|
| 29 |
+
*.test.py
|
| 30 |
+
pytest.ini
|
| 31 |
+
.pytest_cache/
|
| 32 |
+
.coverage
|
| 33 |
+
htmlcov/
|
| 34 |
+
|
| 35 |
+
# ── Jupyter notebooks ─────────────────────────────────────
|
| 36 |
+
*.ipynb
|
| 37 |
+
.ipynb_checkpoints/
|
| 38 |
+
|
| 39 |
+
# ── OS junk ───────────────────────────────────────────────
|
| 40 |
+
.DS_Store
|
| 41 |
+
Thumbs.db
|
| 42 |
+
|
| 43 |
+
# ── Docs / CI (not needed at runtime) ─────────────────────
|
| 44 |
+
docs/
|
| 45 |
+
*.md
|
| 46 |
+
!README.md
|
| 47 |
+
.github/
|
| 48 |
+
|
| 49 |
+
# ── Large local model checkpoints (downloaded at runtime) ─
|
| 50 |
+
# Comment these out if you bundle models into the image.
|
| 51 |
+
models/
|
| 52 |
+
*.bin
|
| 53 |
+
*.safetensors
|
| 54 |
+
*.pt
|
| 55 |
+
*.ckpt
|
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
classification/artifacts/indicbert_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
sentiment_analysis/artifacts/indic_urgency_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# In PowerShell inside civicconnect-ai-engine folder:
|
| 2 |
+
@"
|
| 3 |
+
__pycache__/
|
| 4 |
+
*.pyc
|
| 5 |
+
*.pyo
|
| 6 |
+
.env
|
| 7 |
+
venv/
|
| 8 |
+
.venv/
|
| 9 |
+
*.log
|
| 10 |
+
classification/artifacts/results/
|
| 11 |
+
classification/artifacts/indic_results/
|
| 12 |
+
sentiment_analysis/artifacts/results/
|
| 13 |
+
sentiment_analysis/artifacts/indic_results/
|
| 14 |
+
"@ | Out-File -FilePath .gitignore -Encoding utf8
|
| 15 |
+
|
| 16 |
+
# Python
|
| 17 |
+
__pycache__/
|
| 18 |
+
*.py[cod]
|
| 19 |
+
*.egg-info/
|
| 20 |
+
dist/
|
| 21 |
+
build/
|
| 22 |
+
.eggs/
|
| 23 |
+
|
| 24 |
+
# Environments
|
| 25 |
+
.venv/
|
| 26 |
+
venv/
|
| 27 |
+
env/
|
| 28 |
+
|
| 29 |
+
# Secrets
|
| 30 |
+
.env
|
| 31 |
+
*.key
|
| 32 |
+
|
| 33 |
+
# Models (large binaries — use HF Hub or Git LFS)
|
| 34 |
+
models/
|
| 35 |
+
*.bin
|
| 36 |
+
*.safetensors
|
| 37 |
+
*.pt
|
| 38 |
+
*.ckpt
|
| 39 |
+
|
| 40 |
+
# OS
|
| 41 |
+
.DS_Store
|
| 42 |
+
Thumbs.db
|
| 43 |
+
|
| 44 |
+
# IDE
|
| 45 |
+
.vscode/
|
| 46 |
+
.idea/
|
| 47 |
+
|
| 48 |
+
# Test artefacts
|
| 49 |
+
.pytest_cache/
|
| 50 |
+
.coverage
|
| 51 |
+
htmlcov/
|
Dockerfile
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =========================================================
|
| 2 |
+
# CivicConnect — Flask AI Engine
|
| 3 |
+
# Deploy target: Hugging Face Spaces (Docker SDK)
|
| 4 |
+
# =========================================================
|
| 5 |
+
|
| 6 |
+
FROM python:3.11-slim
|
| 7 |
+
|
| 8 |
+
# ── System dependencies ───────────────────────────────────
|
| 9 |
+
# ffmpeg → pydub audio decode (webm/ogg/mp3 → wav)
|
| 10 |
+
# libsndfile1 → soundfile (WAV/FLAC fallback)
|
| 11 |
+
# libgl1-mesa-glx + libglib2.0-0 → EasyOCR / OpenCV headless
|
| 12 |
+
# libgomp1 → PyTorch multi-threaded CPU ops
|
| 13 |
+
# git → HF model downloads via git-lfs if needed
|
| 14 |
+
# curl → health-check probes / HF API calls
|
| 15 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 16 |
+
ffmpeg \
|
| 17 |
+
libsndfile1 \
|
| 18 |
+
libgl1-mesa-glx \
|
| 19 |
+
libglib2.0-0 \
|
| 20 |
+
libgomp1 \
|
| 21 |
+
git \
|
| 22 |
+
curl \
|
| 23 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 24 |
+
|
| 25 |
+
# ── Create non-root user (HF Spaces requirement) ─────────
|
| 26 |
+
RUN useradd -m -u 1000 appuser
|
| 27 |
+
|
| 28 |
+
# ── Set working directory ─────────────────────────────────
|
| 29 |
+
WORKDIR /app
|
| 30 |
+
|
| 31 |
+
# ── Copy requirements first (layer cache) ────────────────
|
| 32 |
+
COPY requirements.txt .
|
| 33 |
+
|
| 34 |
+
# ── Install Python dependencies ───────────────────────────
|
| 35 |
+
RUN pip install --no-cache-dir --upgrade pip \
|
| 36 |
+
&& pip install --no-cache-dir -r requirements.txt
|
| 37 |
+
|
| 38 |
+
# ── Copy application source ───────────────────────────────
|
| 39 |
+
COPY --chown=appuser:appuser . .
|
| 40 |
+
|
| 41 |
+
# ── Environment defaults (overridden by HF Secrets) ──────
|
| 42 |
+
ENV PORT=7860
|
| 43 |
+
ENV PYTHONUNBUFFERED=1
|
| 44 |
+
ENV HF_HOME=/app/.cache/huggingface
|
| 45 |
+
|
| 46 |
+
# ── Switch to non-root user ───────────────────────────────
|
| 47 |
+
USER appuser
|
| 48 |
+
|
| 49 |
+
# ── Expose port ───────────────────────────────────────────
|
| 50 |
+
EXPOSE 7860
|
| 51 |
+
|
| 52 |
+
# ── Healthcheck ───────────────────────────────────────────
|
| 53 |
+
HEALTHCHECK --interval=60s --timeout=10s --start-period=120s --retries=3 \
|
| 54 |
+
CMD curl -f http://localhost:7860/ || exit 1
|
| 55 |
+
|
| 56 |
+
# ── Start server ──────────────────────────────────────────
|
| 57 |
+
# 1 worker only — models are loaded once at startup (global state).
|
| 58 |
+
# 600s timeout handles audio+image (Whisper large-v3 ≈ 2-3 min on CPU).
|
| 59 |
+
CMD ["gunicorn", \
|
| 60 |
+
"--bind", "0.0.0.0:7860", \
|
| 61 |
+
"--workers", "1", \
|
| 62 |
+
"--timeout", "600", \
|
| 63 |
+
"--keep-alive", "5", \
|
| 64 |
+
"--log-level", "info", \
|
| 65 |
+
"--access-logfile", "-", \
|
| 66 |
+
"--error-logfile", "-", \
|
| 67 |
+
"main:app"]
|
classification/artifacts/bert_model/config.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_cross_attention": false,
|
| 3 |
+
"architectures": [
|
| 4 |
+
"BertForSequenceClassification"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"bos_token_id": null,
|
| 8 |
+
"classifier_dropout": null,
|
| 9 |
+
"dtype": "float32",
|
| 10 |
+
"eos_token_id": null,
|
| 11 |
+
"gradient_checkpointing": false,
|
| 12 |
+
"hidden_act": "gelu",
|
| 13 |
+
"hidden_dropout_prob": 0.1,
|
| 14 |
+
"hidden_size": 768,
|
| 15 |
+
"id2label": {
|
| 16 |
+
"0": "LABEL_0",
|
| 17 |
+
"1": "LABEL_1",
|
| 18 |
+
"2": "LABEL_2",
|
| 19 |
+
"3": "LABEL_3",
|
| 20 |
+
"4": "LABEL_4",
|
| 21 |
+
"5": "LABEL_5",
|
| 22 |
+
"6": "LABEL_6",
|
| 23 |
+
"7": "LABEL_7"
|
| 24 |
+
},
|
| 25 |
+
"initializer_range": 0.02,
|
| 26 |
+
"intermediate_size": 3072,
|
| 27 |
+
"is_decoder": false,
|
| 28 |
+
"label2id": {
|
| 29 |
+
"LABEL_0": 0,
|
| 30 |
+
"LABEL_1": 1,
|
| 31 |
+
"LABEL_2": 2,
|
| 32 |
+
"LABEL_3": 3,
|
| 33 |
+
"LABEL_4": 4,
|
| 34 |
+
"LABEL_5": 5,
|
| 35 |
+
"LABEL_6": 6,
|
| 36 |
+
"LABEL_7": 7
|
| 37 |
+
},
|
| 38 |
+
"layer_norm_eps": 1e-12,
|
| 39 |
+
"max_position_embeddings": 512,
|
| 40 |
+
"model_type": "bert",
|
| 41 |
+
"num_attention_heads": 12,
|
| 42 |
+
"num_hidden_layers": 12,
|
| 43 |
+
"pad_token_id": 0,
|
| 44 |
+
"position_embedding_type": "absolute",
|
| 45 |
+
"problem_type": "single_label_classification",
|
| 46 |
+
"tie_word_embeddings": true,
|
| 47 |
+
"transformers_version": "5.0.0",
|
| 48 |
+
"type_vocab_size": 2,
|
| 49 |
+
"use_cache": false,
|
| 50 |
+
"vocab_size": 30522
|
| 51 |
+
}
|
classification/artifacts/indic_test.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
classification/artifacts/indic_tokenizer.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fa2396c5b53605359d466d67fc892aaca020711ae8ac7b7ab2fd9336d82c428c
|
| 3 |
+
size 14979445
|
classification/artifacts/indic_train.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
classification/artifacts/indic_val.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
classification/artifacts/indicbert_model/config.json
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"AlbertForSequenceClassification"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0,
|
| 6 |
+
"bos_token_id": 2,
|
| 7 |
+
"classifier_dropout_prob": 0.1,
|
| 8 |
+
"down_scale_factor": 1,
|
| 9 |
+
"dtype": "float32",
|
| 10 |
+
"embedding_size": 128,
|
| 11 |
+
"eos_token_id": 3,
|
| 12 |
+
"gap_size": 0,
|
| 13 |
+
"hidden_act": "gelu",
|
| 14 |
+
"hidden_dropout_prob": 0,
|
| 15 |
+
"hidden_size": 768,
|
| 16 |
+
"id2label": {
|
| 17 |
+
"0": "LABEL_0",
|
| 18 |
+
"1": "LABEL_1",
|
| 19 |
+
"2": "LABEL_2",
|
| 20 |
+
"3": "LABEL_3",
|
| 21 |
+
"4": "LABEL_4",
|
| 22 |
+
"5": "LABEL_5",
|
| 23 |
+
"6": "LABEL_6",
|
| 24 |
+
"7": "LABEL_7"
|
| 25 |
+
},
|
| 26 |
+
"initializer_range": 0.02,
|
| 27 |
+
"inner_group_num": 1,
|
| 28 |
+
"intermediate_size": 3072,
|
| 29 |
+
"label2id": {
|
| 30 |
+
"LABEL_0": 0,
|
| 31 |
+
"LABEL_1": 1,
|
| 32 |
+
"LABEL_2": 2,
|
| 33 |
+
"LABEL_3": 3,
|
| 34 |
+
"LABEL_4": 4,
|
| 35 |
+
"LABEL_5": 5,
|
| 36 |
+
"LABEL_6": 6,
|
| 37 |
+
"LABEL_7": 7
|
| 38 |
+
},
|
| 39 |
+
"layer_norm_eps": 1e-12,
|
| 40 |
+
"max_position_embeddings": 512,
|
| 41 |
+
"model_type": "albert",
|
| 42 |
+
"net_structure_type": 0,
|
| 43 |
+
"num_attention_heads": 12,
|
| 44 |
+
"num_hidden_groups": 1,
|
| 45 |
+
"num_hidden_layers": 12,
|
| 46 |
+
"num_memory_blocks": 0,
|
| 47 |
+
"pad_token_id": 0,
|
| 48 |
+
"problem_type": "single_label_classification",
|
| 49 |
+
"tie_word_embeddings": true,
|
| 50 |
+
"transformers_version": "5.1.0",
|
| 51 |
+
"type_vocab_size": 2,
|
| 52 |
+
"use_cache": false,
|
| 53 |
+
"vocab_size": 200000
|
| 54 |
+
}
|
classification/artifacts/indicbert_model/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d34df3ca6a5769c1f8ae24a1e64517f3c37a934fd221d9a2ae2c5164d5e21be5
|
| 3 |
+
size 14969520
|
classification/artifacts/indicbert_model/tokenizer_config.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": true,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": "[CLS]",
|
| 5 |
+
"cls_token": "[CLS]",
|
| 6 |
+
"do_lower_case": true,
|
| 7 |
+
"eos_token": "[SEP]",
|
| 8 |
+
"extra_special_tokens": [
|
| 9 |
+
"<pad>",
|
| 10 |
+
"[CLS]",
|
| 11 |
+
"[SEP]",
|
| 12 |
+
"[MASK]"
|
| 13 |
+
],
|
| 14 |
+
"is_local": false,
|
| 15 |
+
"keep_accents": false,
|
| 16 |
+
"mask_token": "[MASK]",
|
| 17 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 18 |
+
"pad_token": "<pad>",
|
| 19 |
+
"sep_token": "[SEP]",
|
| 20 |
+
"tokenizer_class": "AlbertTokenizer",
|
| 21 |
+
"trim_offsets": true,
|
| 22 |
+
"unk_id": 1,
|
| 23 |
+
"unk_token": "<unk>"
|
| 24 |
+
}
|
classification/artifacts/label_encoder.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6b0be0d88eed1838fba777af266556aea55e435b970076684d2ad1c8c9b3fb0b
|
| 3 |
+
size 342
|
classification/artifacts/label_map.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d8e10c5614e117fd9ccab4af3fa62c0e4c44d23195847586d4d1ddb47f4a00cc
|
| 3 |
+
size 321
|
classification/artifacts/test.csv
ADDED
|
@@ -0,0 +1,523 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
text,label,label_id
|
| 2 |
+
Transformer emits burning smell,Electricity,0
|
| 3 |
+
Power failures interrupt electric signaling systems causing traffic confusion,Electricity,0
|
| 4 |
+
Water main repair causing disruption,Water,7
|
| 5 |
+
Stray dogs near playgrounds scaring children,Stray Animals,6
|
| 6 |
+
Buses running without first aid kits,Public Transport,3
|
| 7 |
+
Electric poles damaged near highway,Electricity,0
|
| 8 |
+
No information boards at bus stops,Public Transport,3
|
| 9 |
+
Low voltage causing fan malfunction,Electricity,0
|
| 10 |
+
Stray animals damage public seating and benches,Stray Animals,6
|
| 11 |
+
Pigs damaging road surfaces,Stray Animals,6
|
| 12 |
+
Overall public transport problem in locality,Public Transport,3
|
| 13 |
+
Stray dogs barking loudly at night,Stray Animals,6
|
| 14 |
+
The ongoing noise from the water pump is making it difficult for families to rest and relax at home,Water,7
|
| 15 |
+
Street dogs biting pedestrians,Stray Animals,6
|
| 16 |
+
Household waste overflow leads to foul odor spreading into nearby streets,Garbage,1
|
| 17 |
+
Overflowing dustbins near temple,Garbage,1
|
| 18 |
+
Roads are left uneven after utility maintenance work,Roads,4
|
| 19 |
+
Waste not collected from high-rise apartments,Garbage,1
|
| 20 |
+
Air pollution impacts children outdoor play,Pollution,2
|
| 21 |
+
Irregular bus timings causing inconvenience,Public Transport,3
|
| 22 |
+
Delayed trains disrupt workforce movement across city zones,Public Transport,3
|
| 23 |
+
Dust from unpaved road affecting residents,Pollution,2
|
| 24 |
+
Unreliable water supply affects small businesses and shops,Water,7
|
| 25 |
+
Bus shelters without proper signage,Public Transport,3
|
| 26 |
+
Industrial noise pollution disrupts normal daily activities,Pollution,2
|
| 27 |
+
The water pump noise persists for long durations causing mental stress,Water,7
|
| 28 |
+
Road near temple congested during festivals,Roads,4
|
| 29 |
+
Inadequate fleet capacity increases wait times,Public Transport,3
|
| 30 |
+
Accumulated waste near street corner,Garbage,1
|
| 31 |
+
Sanitation workers absent on weekends,Sanitation,5
|
| 32 |
+
Stray animals damage underground cables and water pipelines,Stray Animals,6
|
| 33 |
+
Road drainage damaged,Roads,4
|
| 34 |
+
Garbage has not been cleared from the market area for several days,Garbage,1
|
| 35 |
+
Water contamination affects public health outcomes,Water,7
|
| 36 |
+
Road markings missing on newly constructed road,Roads,4
|
| 37 |
+
Industrial waste contaminating groundwater,Pollution,2
|
| 38 |
+
Water supply pressure varies widely between floors,Water,7
|
| 39 |
+
Plastic waste dumped near playground,Garbage,1
|
| 40 |
+
Electrical outages disable automated road safety equipment creating hazardous conditions,Electricity,0
|
| 41 |
+
Garbage accumulation problem,Garbage,1
|
| 42 |
+
Pollutants from fuel combustion linger in enclosed transport zones,Pollution,2
|
| 43 |
+
Road construction debris narrows lanes and slows vehicles,Roads,4
|
| 44 |
+
Noise pollution increases mental fatigue,Pollution,2
|
| 45 |
+
Power outages interrupt data driven traffic optimization,Electricity,0
|
| 46 |
+
Public transport breakdowns increase dependency on informal transit options,Public Transport,3
|
| 47 |
+
Uncontrolled dumping of waste is leading to severe soil contamination,Pollution,2
|
| 48 |
+
Industrial fumes causing bad odor in colony,Pollution,2
|
| 49 |
+
Water supply interrupted by municipal work,Water,7
|
| 50 |
+
Garbage collection staff do not cover this street regularly,Garbage,1
|
| 51 |
+
Residents complain of constant disturbance caused by water pump vibrations and sound,Water,7
|
| 52 |
+
Poor road lighting affects night time driving safety,Roads,4
|
| 53 |
+
Smoke from factories causing health issues,Pollution,2
|
| 54 |
+
Street clogged due to construction,Roads,4
|
| 55 |
+
Stray goats eating plants in public gardens,Stray Animals,6
|
| 56 |
+
Dogs blocking roads and footpaths,Stray Animals,6
|
| 57 |
+
The water pump generates ongoing noise that affects mental calmness of residents,Water,7
|
| 58 |
+
Electric supply interruptions disrupt essential household chores,Electricity,0
|
| 59 |
+
Stop start driving patterns significantly increase emission output,Pollution,2
|
| 60 |
+
Streetlights not operational,Electricity,0
|
| 61 |
+
Street corners clogged with wet waste,Sanitation,5
|
| 62 |
+
Inefficient driving patterns raise overall emission output,Pollution,2
|
| 63 |
+
Drinking water quality has deteriorated significantly over the past few months,Water,7
|
| 64 |
+
Unsegregated waste decomposition is polluting the air and surrounding land,Pollution,2
|
| 65 |
+
Odor from chemical treatment plant near houses,Pollution,2
|
| 66 |
+
Noise pollution disrupts residential sleep cycles,Pollution,2
|
| 67 |
+
Stray goats blocking sidewalks,Stray Animals,6
|
| 68 |
+
Road bottlenecks from incomplete construction trap vehicles in narrow corridors,Roads,4
|
| 69 |
+
Stray cattle damage public gardens and green belts,Stray Animals,6
|
| 70 |
+
Overhead tank pump malfunctioning,Water,7
|
| 71 |
+
Potholes near bus stop causing delays,Roads,4
|
| 72 |
+
Stray animals causing fear among women residents,Stray Animals,6
|
| 73 |
+
Waste collection vehicles arrive at irregular times causing inconvenience,Sanitation,5
|
| 74 |
+
Roads are left damaged after cable laying work,Roads,4
|
| 75 |
+
Open burning of waste releases toxic smoke affecting nearby households,Pollution,2
|
| 76 |
+
Poor road finishing is leading to early deterioration,Roads,4
|
| 77 |
+
No proper lighting inside buses at night,Public Transport,3
|
| 78 |
+
Aggressive stray dogs attacking children,Stray Animals,6
|
| 79 |
+
Drain water mixing with rainwater,Sanitation,5
|
| 80 |
+
Street corners full of waste,Sanitation,5
|
| 81 |
+
No drinking water facility in our street,Water,7
|
| 82 |
+
Stray dogs roaming in parks causing fear,Stray Animals,6
|
| 83 |
+
Nighttime operation of the water pump is leading to frequent sleep interruptions for nearby residents,Water,7
|
| 84 |
+
Traffic signals not visible,Roads,4
|
| 85 |
+
Traffic signals not synchronized,Roads,4
|
| 86 |
+
The water pump produces nonstop noise causing stress,Water,7
|
| 87 |
+
Low water pressure in commercial buildings,Water,7
|
| 88 |
+
Roadside drainage overflowing,Roads,4
|
| 89 |
+
Garbage left near drainage channels contributes to blockages during rainfall,Garbage,1
|
| 90 |
+
Persistent congestion contributes to chronic air pollution exposure,Pollution,2
|
| 91 |
+
Public water tap is broken,Water,7
|
| 92 |
+
Overcharging by bus conductors,Public Transport,3
|
| 93 |
+
Waterlogging on damaged roads makes them impassable,Roads,4
|
| 94 |
+
Power supply resumes late,Electricity,0
|
| 95 |
+
Animal presence affects pedestrian safety,Stray Animals,6
|
| 96 |
+
Roadside damage narrows lanes and slows traffic,Roads,4
|
| 97 |
+
Electric failures disrupt smart road analytics causing unmanaged flow,Electricity,0
|
| 98 |
+
Lack of buses in early morning hours,Public Transport,3
|
| 99 |
+
Low water pressure in offices,Water,7
|
| 100 |
+
Animal movement causes repeated traffic interruptions,Stray Animals,6
|
| 101 |
+
Fuse blowing repeatedly in our neighborhood,Electricity,0
|
| 102 |
+
Pollution due to illegal dumping,Pollution,2
|
| 103 |
+
Residents report difficulty sleeping due to loud water pump operation,Water,7
|
| 104 |
+
Garbage pile near market area,Garbage,1
|
| 105 |
+
Road construction debris increases airborne particulate matter,Pollution,2
|
| 106 |
+
Stray animals block footpaths forcing pedestrians onto roads,Stray Animals,6
|
| 107 |
+
Road signage failures cause navigation confusion,Roads,4
|
| 108 |
+
Water pipeline damage reduces distribution efficiency,Water,7
|
| 109 |
+
Electric poles obstruct roads and pedestrian pathways,Electricity,0
|
| 110 |
+
Low voltage affecting fans and lights,Electricity,0
|
| 111 |
+
Waste disposal points are not properly sanitized,Sanitation,5
|
| 112 |
+
No shelter for injured stray animals,Stray Animals,6
|
| 113 |
+
Power outage affecting schools,Electricity,0
|
| 114 |
+
Stray animals biting delivery workers,Stray Animals,6
|
| 115 |
+
Sanitation workers not equipped with tools,Sanitation,5
|
| 116 |
+
Electric wires spark during strong winds,Electricity,0
|
| 117 |
+
Irregular tanker delivery schedules increase uncertainty,Water,7
|
| 118 |
+
Road network imbalance shifts traffic to residential streets,Roads,4
|
| 119 |
+
Delayed water pipeline repairs increase hardship,Water,7
|
| 120 |
+
Electric failures shut down borewell motors affecting residential water access,Electricity,0
|
| 121 |
+
Frequent power cuts in monsoon,Electricity,0
|
| 122 |
+
Garbage bins overflow during weekends due to lack of timely pickup,Garbage,1
|
| 123 |
+
Pollution from roadside burning affects nearby shops,Pollution,2
|
| 124 |
+
Street clogged due to parked vehicles,Roads,4
|
| 125 |
+
Power outages cause loss of productivity for freelancers,Electricity,0
|
| 126 |
+
Open waste dumping encourages animal congregation near residences,Garbage,1
|
| 127 |
+
Pipeline repair delayed due to traffic,Water,7
|
| 128 |
+
No animal control measures implemented,Stray Animals,6
|
| 129 |
+
Stray animals causing accidents on roads,Stray Animals,6
|
| 130 |
+
Garbage overflowing near house,Garbage,1
|
| 131 |
+
Public toilets without privacy,Sanitation,5
|
| 132 |
+
Improper waste handling causes frequent odor problems,Sanitation,5
|
| 133 |
+
Water supply stopped without notice,Water,7
|
| 134 |
+
Bus stops not visible at night,Public Transport,3
|
| 135 |
+
Dust pollution due to construction work,Pollution,2
|
| 136 |
+
Stagnant water near market street,Sanitation,5
|
| 137 |
+
Bus routes not covering industrial corridors,Public Transport,3
|
| 138 |
+
Uneven road height is causing frequent vehicle underbody damage,Roads,4
|
| 139 |
+
Garbage collection lacks proper supervision,Garbage,1
|
| 140 |
+
Noise pollution from night time construction disrupts sleep,Pollution,2
|
| 141 |
+
Streetlights not repaired after accident,Roads,4
|
| 142 |
+
Drain water on footpath,Sanitation,5
|
| 143 |
+
Garbage disposal issue,Garbage,1
|
| 144 |
+
Water supply irregular in community area,Water,7
|
| 145 |
+
Transformer overloaded during peak load,Electricity,0
|
| 146 |
+
Transport hubs suffer from sanitation and crowd management issues,Public Transport,3
|
| 147 |
+
Road shoulder eroded near river bank,Roads,4
|
| 148 |
+
No buses for late-night travel,Public Transport,3
|
| 149 |
+
Sanitation services decline during public holidays,Sanitation,5
|
| 150 |
+
Residents report disturbance from water pump operation,Water,7
|
| 151 |
+
Stray animals disrupt public spaces frequently,Stray Animals,6
|
| 152 |
+
Dirty streets near bus stand,Sanitation,5
|
| 153 |
+
Street handpump dry for several days,Water,7
|
| 154 |
+
Power instability affects electric vehicle infrastructure availability,Electricity,0
|
| 155 |
+
Waste accumulation is contaminating nearby stormwater drains,Pollution,2
|
| 156 |
+
Stray animals causing dirt accumulation near markets,Stray Animals,6
|
| 157 |
+
Air pollution from cement factory chimney,Pollution,2
|
| 158 |
+
Water from tap has suspended particles,Water,7
|
| 159 |
+
Stray animals remain untreated for diseases spreading infection,Stray Animals,6
|
| 160 |
+
Drivers refusing service in rain,Public Transport,3
|
| 161 |
+
Public sanitation issues affect overall quality of life,Sanitation,5
|
| 162 |
+
Transformer emits smoke,Electricity,0
|
| 163 |
+
Waste dumped illegally,Garbage,1
|
| 164 |
+
Road capacity limitations force vehicles into dense clusters,Roads,4
|
| 165 |
+
Sewage smell near bus stand,Sanitation,5
|
| 166 |
+
Air pollution from diesel generators in colony,Pollution,2
|
| 167 |
+
Industrial pollution damages surrounding ecosystems,Pollution,2
|
| 168 |
+
Road safety is compromised due to roaming animals,Stray Animals,6
|
| 169 |
+
Dust from demolition site near playground,Pollution,2
|
| 170 |
+
Streetlights not working near bus stop,Electricity,0
|
| 171 |
+
Noise pollution from train operations,Pollution,2
|
| 172 |
+
Garbage collection staff leave waste behind after partial pickup,Garbage,1
|
| 173 |
+
Blocked drains causing flooding in colony,Sanitation,5
|
| 174 |
+
Polluted drainage water seeps into residential plots,Pollution,2
|
| 175 |
+
Mud road causing dust problem,Roads,4
|
| 176 |
+
Road repair materials used are of very poor quality,Roads,4
|
| 177 |
+
Dust from quarry affecting local residents,Pollution,2
|
| 178 |
+
Garbage collection stopped,Garbage,1
|
| 179 |
+
Footpath uneven and unsafe,Roads,4
|
| 180 |
+
Pipeline under construction causing water shortage,Water,7
|
| 181 |
+
Stalling engines emit excessive smoke degrading air quality,Pollution,2
|
| 182 |
+
Garbage heaps are obstructing traffic and pedestrian movement,Garbage,1
|
| 183 |
+
Lack of regular street sweeping leads to dust and waste accumulation,Sanitation,5
|
| 184 |
+
Electric system faults disrupt coordination of transport infrastructure,Electricity,0
|
| 185 |
+
Persistent congestion sustains unhealthy air quality levels,Pollution,2
|
| 186 |
+
Slow moving traffic produces higher emissions per distance traveled,Pollution,2
|
| 187 |
+
Pipeline damage causing water supply disruption,Water,7
|
| 188 |
+
Rainwater flooding water storage area,Water,7
|
| 189 |
+
Water tankers are irregular and insufficient to meet residential requirements,Water,7
|
| 190 |
+
Garbage not cleared regularly,Garbage,1
|
| 191 |
+
Stray cattle wander into drainage channels causing blockages,Stray Animals,6
|
| 192 |
+
Vehicles forced closer together increase localized emission density,Pollution,2
|
| 193 |
+
Polluted waste disposal attracts stray animals,Pollution,2
|
| 194 |
+
Unvaccinated stray dogs increase the risk of rabies in the neighborhood,Stray Animals,6
|
| 195 |
+
Overhead wires hanging low,Electricity,0
|
| 196 |
+
Frequent power interruptions near major intersections cause traffic buildup and prolonged vehicle idling affecting air quality,Electricity,0
|
| 197 |
+
Water cuts affecting hospital area,Water,7
|
| 198 |
+
Dust from open construction affecting local residents,Pollution,2
|
| 199 |
+
Electricity outages disrupt water pumping operations,Electricity,0
|
| 200 |
+
Uneven road surface near hospital entrance,Roads,4
|
| 201 |
+
Congestion related emissions worsen air quality in commercial districts,Pollution,2
|
| 202 |
+
Noise pollution near hospital affecting patients,Pollution,2
|
| 203 |
+
Garbage not removed from community center,Garbage,1
|
| 204 |
+
Power instability disrupts electric vehicle adoption in urban transport corridors,Electricity,0
|
| 205 |
+
Persistent humming from the water pump is causing stress and discomfort for residents living close to the facility,Water,7
|
| 206 |
+
Road near bus stand full of potholes,Roads,4
|
| 207 |
+
Stray dogs attack pets in apartment complexes,Stray Animals,6
|
| 208 |
+
Water from taps has sand particles,Water,7
|
| 209 |
+
Environmental hazards in residential zone,Pollution,2
|
| 210 |
+
Unsafe travel conditions for women,Public Transport,3
|
| 211 |
+
Garbage accumulation is increasing environmental risk,Garbage,1
|
| 212 |
+
Dirty water from taps during rainy season,Water,7
|
| 213 |
+
Stray animals creating mess near water bodies,Stray Animals,6
|
| 214 |
+
Sanitation systems fail leading to sewage water backing up onto roads,Sanitation,5
|
| 215 |
+
Street lighting outages increase accident risk on poorly visible roads,Electricity,0
|
| 216 |
+
Residents are affected by water pump sound levels exceeding acceptable limits,Water,7
|
| 217 |
+
Stray cattle obstruct parking areas in residential zones,Stray Animals,6
|
| 218 |
+
Long waiting time for buses,Public Transport,3
|
| 219 |
+
Bus stops not cleaned after festivals,Public Transport,3
|
| 220 |
+
Water leaking under road surface,Water,7
|
| 221 |
+
Buses without GPS updates,Public Transport,3
|
| 222 |
+
Streetlights not working on Elm Road,Roads,4
|
| 223 |
+
Water shortage impacts daily cleaning and sanitation,Water,7
|
| 224 |
+
Stray animals creating mess in community areas,Stray Animals,6
|
| 225 |
+
Buses overcrowded during festivals,Public Transport,3
|
| 226 |
+
Pollution affecting children and elderly,Pollution,2
|
| 227 |
+
Public toilets lack regular cleaning schedules,Sanitation,5
|
| 228 |
+
Damaged roads increase travel uncertainty,Roads,4
|
| 229 |
+
Improper road grading leads to water accumulation,Roads,4
|
| 230 |
+
Sewage water leaking onto roads,Sanitation,5
|
| 231 |
+
Pollution from waste burning spreads toxic particles,Pollution,2
|
| 232 |
+
Garbage remains scattered near commercial complexes for days,Garbage,1
|
| 233 |
+
Bus routes not covering industrial areas,Public Transport,3
|
| 234 |
+
Waste disposal areas are not properly fenced,Sanitation,5
|
| 235 |
+
Water main burst near market,Water,7
|
| 236 |
+
Road surface wear increases particulate release from tires,Roads,4
|
| 237 |
+
Air pollution from diesel generators,Pollution,2
|
| 238 |
+
Potholes near park causing accidents,Roads,4
|
| 239 |
+
Leaking overhead tanks result in continuous water wastage,Water,7
|
| 240 |
+
Public sanitation facilities are insufficient in crowded areas,Sanitation,5
|
| 241 |
+
Residents face daily inconvenience due to uncontrolled noise from the water pump,Water,7
|
| 242 |
+
Drain smell causing illness,Sanitation,5
|
| 243 |
+
Cats multiplying rapidly in neighborhood,Stray Animals,6
|
| 244 |
+
Stray cats creating hygiene problems in markets,Stray Animals,6
|
| 245 |
+
Road issue near temple,Roads,4
|
| 246 |
+
Uncollected garbage provides feeding grounds for roaming animals,Garbage,1
|
| 247 |
+
Frequent power cuts affecting shops,Electricity,0
|
| 248 |
+
Damaged road surfaces slow traffic causing inefficient fuel usage,Roads,4
|
| 249 |
+
Improper disposal of waste is contaminating nearby open areas,Sanitation,5
|
| 250 |
+
The water pump produces constant noise that disrupts daily household activities,Water,7
|
| 251 |
+
Residents experience repeated water pump disturbance,Water,7
|
| 252 |
+
Voltage surges damage electronic devices unexpectedly,Electricity,0
|
| 253 |
+
Industrial noise pollution affects quality of life,Pollution,2
|
| 254 |
+
Garbage mismanagement amplifies sanitation maintenance burden,Garbage,1
|
| 255 |
+
Electricity outages increase fire safety risks,Electricity,0
|
| 256 |
+
Sewage discharge causing river pollution,Pollution,2
|
| 257 |
+
Accumulated waste blocks sanitation channels leading to stagnant wastewater,Garbage,1
|
| 258 |
+
Garbage is scattered due to stray animals tearing open trash bags,Garbage,1
|
| 259 |
+
Oil spill in water body,Pollution,2
|
| 260 |
+
Uncollected waste decomposes and flows into sewage lines worsening sanitation blockages,Garbage,1
|
| 261 |
+
Sanitation workers lack adequate training,Sanitation,5
|
| 262 |
+
Water supply irregular in new housing society,Water,7
|
| 263 |
+
Traffic congestion due to narrow road,Roads,4
|
| 264 |
+
Generator noise late at night,Pollution,2
|
| 265 |
+
Unattended garbage heaps are spoiling the appearance of the locality,Garbage,1
|
| 266 |
+
Solid waste blocking drainage system,Pollution,2
|
| 267 |
+
Improper waste handling is creating sanitation problems,Garbage,1
|
| 268 |
+
Stray animals causing injuries to pedestrians,Stray Animals,6
|
| 269 |
+
Stray dogs attacking postal workers,Stray Animals,6
|
| 270 |
+
Buses running without permits,Public Transport,3
|
| 271 |
+
Electric supply irregular,Electricity,0
|
| 272 |
+
Monkey problem near market area,Stray Animals,6
|
| 273 |
+
Waste degradation is impacting environmental and public health,Pollution,2
|
| 274 |
+
Factory emissions affecting children with asthma,Pollution,2
|
| 275 |
+
Low speed traffic generates higher emission per distance,Pollution,2
|
| 276 |
+
Sanitation services are delayed without prior notice,Sanitation,5
|
| 277 |
+
Garbage accumulation causes repeated public complaints across departments,Garbage,1
|
| 278 |
+
Uncollected household waste attracting flies,Garbage,1
|
| 279 |
+
Stray cats damaging community gardens,Stray Animals,6
|
| 280 |
+
Road construction stopped halfway,Roads,4
|
| 281 |
+
Illegal dumping of construction debris,Garbage,1
|
| 282 |
+
Transformer oil leakage near road,Electricity,0
|
| 283 |
+
Garbage from nearby construction sites is dumped illegally,Garbage,1
|
| 284 |
+
Waste contamination is impacting soil and water quality,Pollution,2
|
| 285 |
+
Road damage worsens during monsoon due to poor drainage integration,Roads,4
|
| 286 |
+
Poor water quality leads to foul taste and odor,Water,7
|
| 287 |
+
Public garbage bins are overflowing and spilling waste onto the roads,Garbage,1
|
| 288 |
+
No coordination with train schedules,Public Transport,3
|
| 289 |
+
Electricity issues affect remote work productivity,Electricity,0
|
| 290 |
+
Electric poles with loose wires,Electricity,0
|
| 291 |
+
Water supply resumes late morning,Water,7
|
| 292 |
+
Garbage heaps block proper drainage,Garbage,1
|
| 293 |
+
Public sanitation infrastructure is inadequate for urban demands,Sanitation,5
|
| 294 |
+
Road repair delays extend traffic disruption periods,Roads,4
|
| 295 |
+
Roads are breaking repeatedly after each monsoon season,Roads,4
|
| 296 |
+
Drivers rude to passengers,Public Transport,3
|
| 297 |
+
Water pollution spreading diseases,Pollution,2
|
| 298 |
+
Sudden electricity cuts without prior notice are affecting daily work from home activities,Electricity,0
|
| 299 |
+
Supply water appears muddy after pipeline repair work in the locality,Water,7
|
| 300 |
+
Garbage dumping spots are unmanaged and poorly maintained,Garbage,1
|
| 301 |
+
Old buses with faulty engines,Public Transport,3
|
| 302 |
+
Industrial pollution increases cancer risks,Pollution,2
|
| 303 |
+
Discolored water flows from taps after long supply gaps,Water,7
|
| 304 |
+
Route diversions lead to unpredictable commute durations,Public Transport,3
|
| 305 |
+
Sanitation workers not performing evening duty,Sanitation,5
|
| 306 |
+
Uneven roads cause discomfort for passengers,Roads,4
|
| 307 |
+
Stray dogs chase vehicles during nighttime hours,Stray Animals,6
|
| 308 |
+
Stray cats entering school premises,Stray Animals,6
|
| 309 |
+
Noise pollution from traffic affecting students,Pollution,2
|
| 310 |
+
Sanitation facilities near residential areas are neglected,Sanitation,5
|
| 311 |
+
Stray cats entering restaurants,Stray Animals,6
|
| 312 |
+
Electricity supply schedules are not followed consistently,Electricity,0
|
| 313 |
+
Water supply cut without prior notice,Water,7
|
| 314 |
+
Noise from sports stadium affecting neighborhood,Pollution,2
|
| 315 |
+
Fuse keeps blowing in kitchen,Electricity,0
|
| 316 |
+
Air pollution from industrial chimneys,Pollution,2
|
| 317 |
+
Stray animals roam freely due to ineffective monitoring,Stray Animals,6
|
| 318 |
+
Stray dogs fighting with other animals in streets,Stray Animals,6
|
| 319 |
+
No electricity supply in office,Electricity,0
|
| 320 |
+
Heavy vehicles damaging residential roads,Roads,4
|
| 321 |
+
Household garbage is not collected on holidays leading to buildup,Garbage,1
|
| 322 |
+
The water pump operates loudly affecting quality of life,Water,7
|
| 323 |
+
Dog attacks reported in locality,Stray Animals,6
|
| 324 |
+
Open dumping near street causing pollution,Garbage,1
|
| 325 |
+
Leaking sewage lines pollute nearby water bodies,Sanitation,5
|
| 326 |
+
Frequent engine problems,Public Transport,3
|
| 327 |
+
Drain cleaning not done,Sanitation,5
|
| 328 |
+
Broken road near school,Roads,4
|
| 329 |
+
Stray animals scavenging from open garbage,Stray Animals,6
|
| 330 |
+
Road markings faded causing confusion for drivers,Roads,4
|
| 331 |
+
Road near temple full of potholes,Roads,4
|
| 332 |
+
Blocked drains causing stagnant water,Sanitation,5
|
| 333 |
+
Animals causing traffic jams,Stray Animals,6
|
| 334 |
+
Waste decomposition emits pollutants into residential areas,Pollution,2
|
| 335 |
+
Transformer noise disturbing residents,Electricity,0
|
| 336 |
+
Overflowing sewage near street,Sanitation,5
|
| 337 |
+
Poor road quality impacts overall city image,Roads,4
|
| 338 |
+
Water main leakage near central road,Water,7
|
| 339 |
+
No water connection for new house,Water,7
|
| 340 |
+
Sanitation workers do not segregate waste properly,Sanitation,5
|
| 341 |
+
Road bottlenecks caused by poor design delay movement,Roads,4
|
| 342 |
+
Streetlights off during night,Electricity,0
|
| 343 |
+
The water pump operates loudly and disrupts household peace,Water,7
|
| 344 |
+
Narrow roads reduce bus movement efficiency,Public Transport,3
|
| 345 |
+
Sewage smell unbearable,Sanitation,5
|
| 346 |
+
Water cuts extended for more than 24 hours,Water,7
|
| 347 |
+
Electric wires near playground unsafe,Electricity,0
|
| 348 |
+
Stray animals damaging parked vehicles,Stray Animals,6
|
| 349 |
+
Garbage collection is irregular during rainy seasons,Garbage,1
|
| 350 |
+
Garbage collection does not cover all households equally,Sanitation,5
|
| 351 |
+
Dust from road repair work affecting houses,Pollution,2
|
| 352 |
+
Temporary road fixes fail within weeks of implementation,Roads,4
|
| 353 |
+
Waste collection vehicles are insufficient for this locality,Garbage,1
|
| 354 |
+
Pollution caused by heavy trucks affects nearby residential colonies,Pollution,2
|
| 355 |
+
Sanitation workers not maintaining cleanliness,Sanitation,5
|
| 356 |
+
Road repair materials blocking lanes,Roads,4
|
| 357 |
+
Residents experience sleep issues due to water pump noise,Water,7
|
| 358 |
+
Garbage burning issue,Garbage,1
|
| 359 |
+
Voltage drops affecting office equipment,Electricity,0
|
| 360 |
+
Sanitation system collapse impacts drinking water safety,Sanitation,5
|
| 361 |
+
Water supply lines are poorly mapped leading to frequent damage,Water,7
|
| 362 |
+
Fuse keeps tripping in rainy season,Electricity,0
|
| 363 |
+
Streetlight flickering at night,Electricity,0
|
| 364 |
+
Overflowing dustbins near market,Garbage,1
|
| 365 |
+
Noise from generators disturbing neighborhood,Pollution,2
|
| 366 |
+
Fuse box damaged,Electricity,0
|
| 367 |
+
Poor water pressure affects bathroom usage severely,Water,7
|
| 368 |
+
Cattle blocking traffic movement,Stray Animals,6
|
| 369 |
+
Lack of seating at bus stops,Public Transport,3
|
| 370 |
+
Low water pressure in government buildings,Water,7
|
| 371 |
+
Dust from demolition site causing allergies,Pollution,2
|
| 372 |
+
Dense vehicle clusters elevate local emission concentration,Pollution,2
|
| 373 |
+
Road width constraints force stop start movement,Roads,4
|
| 374 |
+
Open burning of plastic waste,Pollution,2
|
| 375 |
+
Stray dogs occupy bus stops causing inconvenience to commuters,Stray Animals,6
|
| 376 |
+
Road markings are missing due to worn out surfaces,Roads,4
|
| 377 |
+
Road work causing traffic jam,Roads,4
|
| 378 |
+
Drivers ignoring pedestrian crossings,Public Transport,3
|
| 379 |
+
Water supply is inconsistent across different times of day,Water,7
|
| 380 |
+
No electricity in entire colony,Electricity,0
|
| 381 |
+
Waste burning polluting surroundings,Pollution,2
|
| 382 |
+
Waste collection points are poorly located causing inconvenience,Sanitation,5
|
| 383 |
+
Stray dogs bark loudly during late night hours,Stray Animals,6
|
| 384 |
+
Stray dogs forming packs near bus depot,Stray Animals,6
|
| 385 |
+
Waste related pollution is impacting daily life,Pollution,2
|
| 386 |
+
Bus not stopping at designated bus stop,Public Transport,3
|
| 387 |
+
Street drains filled with plastic waste,Sanitation,5
|
| 388 |
+
Buses overcrowded during weekends,Public Transport,3
|
| 389 |
+
Sanitation failures impact school environments,Sanitation,5
|
| 390 |
+
Industrial smoke causing respiratory issues,Pollution,2
|
| 391 |
+
Broken road surfaces near residential areas are making daily commuting unsafe for motorists,Roads,4
|
| 392 |
+
Water leakage near school entrance,Water,7
|
| 393 |
+
Drivers not assisting differently-abled passengers,Public Transport,3
|
| 394 |
+
Public sanitation services are underfunded and understaffed,Sanitation,5
|
| 395 |
+
Streetlights off on major roads,Electricity,0
|
| 396 |
+
Sewage water stagnant near temple,Sanitation,5
|
| 397 |
+
Stray cows wandering in residential streets,Stray Animals,6
|
| 398 |
+
Stray dogs lack vaccination leading to health hazards,Stray Animals,6
|
| 399 |
+
Garbage truck skipped area,Garbage,1
|
| 400 |
+
Sewage water stagnant near residential block,Sanitation,5
|
| 401 |
+
Improper waste treatment is increasing pollution levels in surrounding neighborhoods,Pollution,2
|
| 402 |
+
No proper drainage in colony,Sanitation,5
|
| 403 |
+
Sanitation workers not using safety equipment,Sanitation,5
|
| 404 |
+
No monitoring of stray animals in colony,Stray Animals,6
|
| 405 |
+
Water pipelines are damaged during unrelated construction work,Water,7
|
| 406 |
+
Bus breakdowns happening frequently,Public Transport,3
|
| 407 |
+
Street littered with garbage and sewage,Sanitation,5
|
| 408 |
+
Overloaded transformers frequently trip causing blackouts,Electricity,0
|
| 409 |
+
Transport inefficiency impacts economic productivity,Public Transport,3
|
| 410 |
+
Garbage collection delays are a recurring issue,Garbage,1
|
| 411 |
+
Pipeline blockage causing low water supply,Water,7
|
| 412 |
+
Electricity department response time is unsatisfactory,Electricity,0
|
| 413 |
+
Power supply disrupted without notice,Electricity,0
|
| 414 |
+
Unannounced water shutdowns cause inconvenience to residents,Water,7
|
| 415 |
+
Traffic congestion due to road narrowing,Roads,4
|
| 416 |
+
Transformer failure affecting area,Electricity,0
|
| 417 |
+
Garbage piles near manholes worsen sewage backflow issues,Garbage,1
|
| 418 |
+
Chemical smell spreading in residential area,Pollution,2
|
| 419 |
+
Roads are not resurfaced regularly,Roads,4
|
| 420 |
+
The water pump generates loud operational noise that disrupts sleep and rest patterns,Water,7
|
| 421 |
+
Garbage piles remain after festival events,Garbage,1
|
| 422 |
+
Stray dogs roaming near hospitals,Stray Animals,6
|
| 423 |
+
Voltage drop prevents proper functioning of appliances,Electricity,0
|
| 424 |
+
Noise pollution near hospital area,Pollution,2
|
| 425 |
+
Water cuts affecting commercial complexes,Water,7
|
| 426 |
+
Water complaint pending for long time,Water,7
|
| 427 |
+
Stray dogs roaming near hospitals causing fear,Stray Animals,6
|
| 428 |
+
No response to stray animal complaints,Stray Animals,6
|
| 429 |
+
Power interruptions affect automated road management infrastructure,Electricity,0
|
| 430 |
+
Electric poles block proper road widening projects,Electricity,0
|
| 431 |
+
Roadside garbage obstructing traffic,Roads,4
|
| 432 |
+
Road surface uneven after monsoon,Roads,4
|
| 433 |
+
Tap water contains visible particles making it unsafe for consumption,Water,7
|
| 434 |
+
Lack of proper information for routes,Public Transport,3
|
| 435 |
+
Air pollution from generators affects indoor air quality,Pollution,2
|
| 436 |
+
Voltage fluctuations causing hazards,Electricity,0
|
| 437 |
+
Industrial waste dumping has degraded soil quality severely,Pollution,2
|
| 438 |
+
Animals damage sanitation pipelines searching for food,Stray Animals,6
|
| 439 |
+
Pollution control norms are not enforced on local industries,Pollution,2
|
| 440 |
+
Pollution from stone crushing units spreads fine dust,Pollution,2
|
| 441 |
+
Public toilets without maintenance,Sanitation,5
|
| 442 |
+
Mechanical noise from the water pump is becoming unbearable during nighttime hours for residents,Water,7
|
| 443 |
+
Water pollution due to domestic waste dumping,Pollution,2
|
| 444 |
+
Street corners dirty due to uncollected waste,Sanitation,5
|
| 445 |
+
Electric issues interrupt automated toll and traffic flow systems,Electricity,0
|
| 446 |
+
Waste collection timing clashes with peak activity hours,Sanitation,5
|
| 447 |
+
Road surface uneven after rains,Roads,4
|
| 448 |
+
Odor from sewage line leaks in residential area,Pollution,2
|
| 449 |
+
Roadside garbage obstructing lanes,Roads,4
|
| 450 |
+
Poorly maintained valves cause water wastage,Water,7
|
| 451 |
+
Power cut without information,Electricity,0
|
| 452 |
+
Public sanitation infrastructure lacks maintenance,Sanitation,5
|
| 453 |
+
Garbage collection delayed during holidays,Garbage,1
|
| 454 |
+
Noise pollution from construction machinery in mornings,Pollution,2
|
| 455 |
+
Stagnant water near bus stop,Sanitation,5
|
| 456 |
+
Industrial fumes affecting neighborhood air quality,Pollution,2
|
| 457 |
+
Water meter shows unusual consumption,Water,7
|
| 458 |
+
Road edges damaged by waterlogging,Roads,4
|
| 459 |
+
Road repair work causing extended traffic jams,Roads,4
|
| 460 |
+
Traffic bottlenecks delay public transport vehicles,Public Transport,3
|
| 461 |
+
Sanitation workers not collecting waste on time,Sanitation,5
|
| 462 |
+
Water leakage contributes to road deterioration,Water,7
|
| 463 |
+
Sanitation services do not cover all localities equally,Sanitation,5
|
| 464 |
+
Water flow is insufficient for basic hygiene needs,Water,7
|
| 465 |
+
Electric instability affects charging dependent transit reducing efficiency,Electricity,0
|
| 466 |
+
Water pump vibrations are clearly audible inside houses and are causing continuous disturbance to residents,Water,7
|
| 467 |
+
Noise pollution from construction near offices,Pollution,2
|
| 468 |
+
Improper disposal of waste is impacting daily life,Garbage,1
|
| 469 |
+
Street littered with wet and dry waste,Sanitation,5
|
| 470 |
+
Public toilets lacking handwashing facilities,Sanitation,5
|
| 471 |
+
Bus stop information boards missing,Public Transport,3
|
| 472 |
+
Broken roads pose risks to senior citizens,Roads,4
|
| 473 |
+
Persistent water pump noise affects indoor comfort,Water,7
|
| 474 |
+
Garbage bin broken,Garbage,1
|
| 475 |
+
Garbage piles are becoming permanent fixtures,Garbage,1
|
| 476 |
+
Loud water pump vibrations are causing discomfort and anxiety among residents living close to it,Water,7
|
| 477 |
+
Water supply resumes with air bursts damaging pipelines,Water,7
|
| 478 |
+
Road surface damaged by heavy rainfall,Roads,4
|
| 479 |
+
Dirty water from community taps,Water,7
|
| 480 |
+
Residents complain about excessive water pump sound disturbing peaceful living,Water,7
|
| 481 |
+
Sanitation workers absent in evening rounds,Sanitation,5
|
| 482 |
+
Electrical load failures impact road tunnel ventilation systems,Electricity,0
|
| 483 |
+
Water pressure drops drastically during peak usage hours every day,Water,7
|
| 484 |
+
Stray cows grazing near road construction,Stray Animals,6
|
| 485 |
+
Garbage bin missing,Garbage,1
|
| 486 |
+
Uncollected waste causing bad smell,Pollution,2
|
| 487 |
+
Animal herds slow down traffic significantly,Stray Animals,6
|
| 488 |
+
Road repair material spilling on lanes,Roads,4
|
| 489 |
+
Public toilets without proper maintenance,Sanitation,5
|
| 490 |
+
Low voltage in offices,Electricity,0
|
| 491 |
+
Water supply disrupted due to pipeline cleaning,Water,7
|
| 492 |
+
Garbage problem near shops,Garbage,1
|
| 493 |
+
The water pump produces continuous noise impacting residential peace,Water,7
|
| 494 |
+
Dirty water from taps after rain,Water,7
|
| 495 |
+
Water supply disrupted due to power outage,Water,7
|
| 496 |
+
Animals roaming near hospital area,Stray Animals,6
|
| 497 |
+
Damaged roads are affecting delivery services,Roads,4
|
| 498 |
+
Power interruptions affect electric mobility adoption increasing combustion usage,Electricity,0
|
| 499 |
+
Unstable power supply affects industrial equipment performance,Electricity,0
|
| 500 |
+
The water pump generates excessive sound that disrupts normal household activities,Water,7
|
| 501 |
+
Electric wires pass dangerously close to trees,Electricity,0
|
| 502 |
+
Air pollution from industrial boilers,Pollution,2
|
| 503 |
+
Drain water breeding mosquitoes,Sanitation,5
|
| 504 |
+
No schedule boards at bus stops,Public Transport,3
|
| 505 |
+
Stray dogs causing accidents at intersections,Stray Animals,6
|
| 506 |
+
Air pollution from burning crop residue,Pollution,2
|
| 507 |
+
Pollution spreads due to lack of integrated urban planning,Pollution,2
|
| 508 |
+
Electric infrastructure planning ignores future demand growth,Electricity,0
|
| 509 |
+
Residents are affected by loud water pump sounds daily,Water,7
|
| 510 |
+
Accumulated waste near street lights,Garbage,1
|
| 511 |
+
Waste disposal areas emit strong foul odors,Sanitation,5
|
| 512 |
+
Water infrastructure cannot handle peak demand loads,Water,7
|
| 513 |
+
Uncollected wet waste causing odor,Garbage,1
|
| 514 |
+
Drainage blockage causing muddy water on road,Roads,4
|
| 515 |
+
Dirty streets causing mosquito nuisance,Sanitation,5
|
| 516 |
+
Street corners dirty with garbage,Sanitation,5
|
| 517 |
+
Waste collection staff do not collect garbage from interior lanes,Garbage,1
|
| 518 |
+
Overflowing bins causing insect breeding,Garbage,1
|
| 519 |
+
Increased fuel combustion worsens environmental air conditions,Pollution,2
|
| 520 |
+
Voltage spikes damaging ACs,Electricity,0
|
| 521 |
+
No CCTV in buses for safety,Public Transport,3
|
| 522 |
+
Dust from stone crushing units affecting schools,Pollution,2
|
| 523 |
+
Electric cable broken,Electricity,0
|
classification/artifacts/tokenizer.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1c3bb446109f57871636dcbaf11730f886c37cbab2e72deb065ba0619617fefa
|
| 3 |
+
size 851995
|
classification/artifacts/train.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
classification/artifacts/val.csv
ADDED
|
@@ -0,0 +1,523 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
text,label,label_id
|
| 2 |
+
Poorly designed road curves increase accident risk,Roads,4
|
| 3 |
+
Prolonged power failures during summer are making living conditions unbearable,Electricity,0
|
| 4 |
+
Dogs sleeping on busy roads,Stray Animals,6
|
| 5 |
+
Stray animals making loud noise at night,Stray Animals,6
|
| 6 |
+
No water in entire colony,Water,7
|
| 7 |
+
Decomposing organic waste is creating environmental pollution beyond sanitation issues,Pollution,2
|
| 8 |
+
Open dumping of waste is causing soil and air pollution in nearby residential areas,Pollution,2
|
| 9 |
+
Road repair work incomplete,Roads,4
|
| 10 |
+
Bus terminals poorly lit at night,Public Transport,3
|
| 11 |
+
Road surfaces are not designed for heavy vehicle load,Roads,4
|
| 12 |
+
Frequent power interruptions affect electric road signage reliability,Electricity,0
|
| 13 |
+
Pollution from vehicle exhaust accumulates in narrow streets,Pollution,2
|
| 14 |
+
Open dumping of garbage in streets,Pollution,2
|
| 15 |
+
Odor from sewage treatment plant,Pollution,2
|
| 16 |
+
Garbage is accumulating near parks and playgrounds,Garbage,1
|
| 17 |
+
Frequent accidents at junction of Oak and Pine,Roads,4
|
| 18 |
+
Street water valves leaking after rain,Water,7
|
| 19 |
+
Constant water pump noise interferes with peaceful residential living conditions,Water,7
|
| 20 |
+
Stray goats damaging roadside plants,Stray Animals,6
|
| 21 |
+
Voltage fluctuations affecting lights,Electricity,0
|
| 22 |
+
Sewage overflow near transport hubs causes commuter distress,Sanitation,5
|
| 23 |
+
Water cuts affecting schools and offices,Water,7
|
| 24 |
+
Road widening work delayed,Roads,4
|
| 25 |
+
No shelters at remote bus stops,Public Transport,3
|
| 26 |
+
Sanitation failures contaminate nearby water sources,Sanitation,5
|
| 27 |
+
No vaccination for stray animals in colony,Stray Animals,6
|
| 28 |
+
Electric wires near trees causing hazard,Electricity,0
|
| 29 |
+
Streetlights not functioning near school,Electricity,0
|
| 30 |
+
Potholes near school entrance,Roads,4
|
| 31 |
+
Water main repair causing water shortage,Water,7
|
| 32 |
+
Residents face regular sleep disruption due to excessive water pump noise,Water,7
|
| 33 |
+
Garbage dumping continues despite warning notices,Sanitation,5
|
| 34 |
+
Polluted river water emits foul odor affecting nearby areas,Pollution,2
|
| 35 |
+
Stray cattle block traffic lanes during peak hours,Stray Animals,6
|
| 36 |
+
Bus stops not properly sheltered,Public Transport,3
|
| 37 |
+
Wastewater stagnation seeps into water storage areas,Sanitation,5
|
| 38 |
+
Stray cattle graze on roadside greenery damaging landscaping,Stray Animals,6
|
| 39 |
+
Stray animals leaving waste near houses,Stray Animals,6
|
| 40 |
+
Stray dogs attacking school children,Stray Animals,6
|
| 41 |
+
Garbage collection vehicles skip this area frequently,Garbage,1
|
| 42 |
+
Power supply interruptions affect hospitals and clinics nearby,Electricity,0
|
| 43 |
+
Water pump sound pollution is affecting the quality of life of people living in surrounding apartments,Water,7
|
| 44 |
+
Air quality worsens due to inefficient traffic flow patterns,Pollution,2
|
| 45 |
+
Stray animals wander into construction sites creating hazards,Stray Animals,6
|
| 46 |
+
Sanitation leaks affect nearby commercial areas,Sanitation,5
|
| 47 |
+
Sanitation issues worsen waterborne disease risks,Sanitation,5
|
| 48 |
+
Blocked drains causing water stagnation,Sanitation,5
|
| 49 |
+
Road curvature issues force slow driving in high traffic zones,Roads,4
|
| 50 |
+
Poorly designed road layouts force vehicles to idle longer increasing travel delays,Roads,4
|
| 51 |
+
Improper compaction during construction is weakening road strength,Roads,4
|
| 52 |
+
Overhead tank water not sufficient,Water,7
|
| 53 |
+
Uncovered garbage piles pose serious health risks to residents,Sanitation,5
|
| 54 |
+
Road conditions are worsening despite recent repairs,Roads,4
|
| 55 |
+
Roadside encroachments reduce effective driving space,Roads,4
|
| 56 |
+
Overflowing septic tanks near street,Sanitation,5
|
| 57 |
+
Garbage remains uncollected after scheduled pickup times,Garbage,1
|
| 58 |
+
Electric supply does not meet modern appliance requirements,Electricity,0
|
| 59 |
+
Water pipeline damaged during excavation,Water,7
|
| 60 |
+
Street drains not cleaned for weeks,Sanitation,5
|
| 61 |
+
Overhead tank not refilled after maintenance,Water,7
|
| 62 |
+
Dust from cement plant affecting residential area,Pollution,2
|
| 63 |
+
Open burning of leaves and trash creating smoke,Pollution,2
|
| 64 |
+
Inadequate water supply is affecting sanitation and hygiene in homes,Water,7
|
| 65 |
+
Garbage piles emit strong odor and attract rodents due to delayed removal,Garbage,1
|
| 66 |
+
Electric lines spark during rains creating fire hazards,Electricity,0
|
| 67 |
+
Residents are unable to maintain quiet living conditions due to water pump noise,Water,7
|
| 68 |
+
Road damaged due to heavy rain,Roads,4
|
| 69 |
+
No water in community tank,Water,7
|
| 70 |
+
Stray animals obstructing emergency vehicles,Stray Animals,6
|
| 71 |
+
Garbage attracting mosquitoes and flies,Pollution,2
|
| 72 |
+
Road surfaces peel off creating dangerous driving conditions,Roads,4
|
| 73 |
+
Stray cattle feed on roadside waste creating health issues,Stray Animals,6
|
| 74 |
+
Persistent water pump noise impacts mental well-being of residents,Water,7
|
| 75 |
+
Bus lane blocked by parked trucks,Roads,4
|
| 76 |
+
Cracked pavement reduces driving efficiency and increases fuel usage,Roads,4
|
| 77 |
+
Poor connectivity to rural areas,Public Transport,3
|
| 78 |
+
Road cracks widening after rains,Roads,4
|
| 79 |
+
Buses not stopping at proper stops,Public Transport,3
|
| 80 |
+
No water supply in Sarpavaram since morning,Water,7
|
| 81 |
+
Road shoulder eroding,Roads,4
|
| 82 |
+
Garbage collection frequency is inadequate,Garbage,1
|
| 83 |
+
No electricity in entire street,Electricity,0
|
| 84 |
+
No dedicated buses for women,Public Transport,3
|
| 85 |
+
Garbage collection systems are failing in this zone,Garbage,1
|
| 86 |
+
Frequent breakdowns during peak hours,Public Transport,3
|
| 87 |
+
Electric supply interruptions impact food storage safety,Electricity,0
|
| 88 |
+
Water pump breakdown near park,Water,7
|
| 89 |
+
Insufficient water supply is creating severe inconvenience for large families,Water,7
|
| 90 |
+
The water pump generates loud operational sounds disrupting rest,Water,7
|
| 91 |
+
Road near bridge damaged,Roads,4
|
| 92 |
+
Power outages prevent effective road signal synchronization during peak hours,Electricity,0
|
| 93 |
+
Damaged roads increase vehicle maintenance costs,Roads,4
|
| 94 |
+
Road near school has potholes,Roads,4
|
| 95 |
+
Residents complain about loud water pump operation,Water,7
|
| 96 |
+
Road construction debris narrows traffic lanes,Roads,4
|
| 97 |
+
Overhead tank pump not working properly,Water,7
|
| 98 |
+
Stray cattle rest under streetlights blocking visibility,Stray Animals,6
|
| 99 |
+
Smoke from roadside burning affecting nearby homes,Pollution,2
|
| 100 |
+
Smoke from tire burning polluting air,Pollution,2
|
| 101 |
+
Stray dogs in residential streets making noise,Stray Animals,6
|
| 102 |
+
Residents are troubled by water pump sound,Water,7
|
| 103 |
+
Electric supply disruptions affect safety lighting reducing night time traffic efficiency,Electricity,0
|
| 104 |
+
Stray dogs chasing joggers in parks,Stray Animals,6
|
| 105 |
+
Power instability affects hospital infrastructure impacting sanitation and water use,Electricity,0
|
| 106 |
+
Residents experience continuous irritation due to loud water pump vibrations,Water,7
|
| 107 |
+
Power cuts during night,Electricity,0
|
| 108 |
+
Low voltage in hospital affecting equipment,Electricity,0
|
| 109 |
+
Dust from demolition affecting local market,Pollution,2
|
| 110 |
+
Electric infrastructure repairs are delayed unnecessarily,Electricity,0
|
| 111 |
+
Metro station cleanliness issues,Public Transport,3
|
| 112 |
+
Water cuts affecting residents for multiple days,Water,7
|
| 113 |
+
Damaged traffic signs causing confusion,Roads,4
|
| 114 |
+
Garbage is scattered by animals because collection is irregular,Garbage,1
|
| 115 |
+
Unclean seats and floors in buses,Public Transport,3
|
| 116 |
+
DJ sound creating public nuisance,Pollution,2
|
| 117 |
+
Stray dogs enter school premises causing panic among students,Stray Animals,6
|
| 118 |
+
Waste bins not available in market area,Garbage,1
|
| 119 |
+
Industrial effluents polluting pond water,Pollution,2
|
| 120 |
+
Overflowing trash bins are spreading foul smells but the primary issue is improper garbage collection,Garbage,1
|
| 121 |
+
Garbage disposal points are unmanaged and constantly overflowing,Garbage,1
|
| 122 |
+
Water supply does not meet basic daily consumption needs,Water,7
|
| 123 |
+
Water supply irregular after pipeline repair,Water,7
|
| 124 |
+
Garbage not collected for over a week,Garbage,1
|
| 125 |
+
Stray animals gather near roadside eateries creating mess,Stray Animals,6
|
| 126 |
+
Smoke from vehicles affecting morning walkers,Pollution,2
|
| 127 |
+
Open drain without cover,Sanitation,5
|
| 128 |
+
Garbage scattered near community hall,Garbage,1
|
| 129 |
+
Frequent voltage drop during evenings,Electricity,0
|
| 130 |
+
Waste pollution poses long term health risks,Pollution,2
|
| 131 |
+
Persistent water pump noise disrupts residents,Water,7
|
| 132 |
+
No buses connecting new residential areas,Public Transport,3
|
| 133 |
+
Waste buildup near water bodies contaminates local supply sources indirectly,Garbage,1
|
| 134 |
+
Road repair work causing inconvenience,Roads,4
|
| 135 |
+
Residents are disturbed by loud water pump sound,Water,7
|
| 136 |
+
Traffic slowdowns intensify pollution concentration near residences,Pollution,2
|
| 137 |
+
Electric meter malfunctioning,Electricity,0
|
| 138 |
+
Residents report water pump sound disturbance,Water,7
|
| 139 |
+
Public toilets without proper lighting,Sanitation,5
|
| 140 |
+
Stray dogs making loud noise at night,Stray Animals,6
|
| 141 |
+
Potholes causing accidents,Roads,4
|
| 142 |
+
Stray animals causing traffic accidents,Stray Animals,6
|
| 143 |
+
Damaged pavement causing accidents,Roads,4
|
| 144 |
+
Vehicle idling near intersections raises particulate concentration levels,Pollution,2
|
| 145 |
+
Poor road connectivity affects transit access,Public Transport,3
|
| 146 |
+
Waste management issues are worsening over time,Garbage,1
|
| 147 |
+
Noise pollution increases stress levels among residents,Pollution,2
|
| 148 |
+
Residents experience irritation due to water pump noise,Water,7
|
| 149 |
+
Industrial dust settles on homes causing cleanliness issues,Pollution,2
|
| 150 |
+
Lack of awareness leads to mixing of wet and dry waste,Sanitation,5
|
| 151 |
+
Open garbage near playground causing health hazard,Garbage,1
|
| 152 |
+
Open dumping of waste is increasing environmental pollution,Garbage,1
|
| 153 |
+
High voltage surges in colony,Electricity,0
|
| 154 |
+
Pollution from unregulated industries harms environment,Pollution,2
|
| 155 |
+
Contaminated water supply is increasing dependency on bottled water,Water,7
|
| 156 |
+
Stray dogs fight over food causing injuries and noise,Stray Animals,6
|
| 157 |
+
No water for drinking and cooking,Water,7
|
| 158 |
+
Garbage disposal methods are outdated and ineffective,Garbage,1
|
| 159 |
+
Water pollution from river effluents,Pollution,2
|
| 160 |
+
Drain water stagnation,Sanitation,5
|
| 161 |
+
Power instability disrupts functioning of smart road systems,Electricity,0
|
| 162 |
+
Dogs barking at night causing disturbance,Stray Animals,6
|
| 163 |
+
Sanitation blockages worsen during monsoon season causing flooding,Sanitation,5
|
| 164 |
+
Damaged footpath causing inconvenience to pedestrians,Roads,4
|
| 165 |
+
Poor air quality near traffic junction,Pollution,2
|
| 166 |
+
Garbage pile near school,Garbage,1
|
| 167 |
+
Waste dumping near school gate,Garbage,1
|
| 168 |
+
Sewage water leaking onto streets,Sanitation,5
|
| 169 |
+
Electric poles not properly grounded,Electricity,0
|
| 170 |
+
Pollution from vehicle congestion impacts air quality daily,Pollution,2
|
| 171 |
+
Stray dogs roam in packs increasing attack risks,Stray Animals,6
|
| 172 |
+
No security cameras at bus stations,Public Transport,3
|
| 173 |
+
The water pump produces disturbing sounds that interfere with peaceful living conditions in the locality,Water,7
|
| 174 |
+
The water pump emits ongoing mechanical noise causing irritation,Water,7
|
| 175 |
+
Electric meter not updating readings,Electricity,0
|
| 176 |
+
Smoke from garbage burning near school,Pollution,2
|
| 177 |
+
Buses without proper lighting at night,Public Transport,3
|
| 178 |
+
Monkeys entering homes frequently,Stray Animals,6
|
| 179 |
+
Waste collection disrupted after festival,Garbage,1
|
| 180 |
+
The water pump generates continuous sound that penetrates walls and disturbs indoor peace,Water,7
|
| 181 |
+
Water supply restoration takes excessively long after repairs,Water,7
|
| 182 |
+
Potholes causing tire punctures in colony roads,Roads,4
|
| 183 |
+
Stray animals causing hygiene issues in alleys,Stray Animals,6
|
| 184 |
+
Odor from open sewage causing discomfort,Pollution,2
|
| 185 |
+
Voltage fluctuations during evening hours,Electricity,0
|
| 186 |
+
Electricity failures affect street lighting at night,Electricity,0
|
| 187 |
+
Residents complain that water pump sound disrupts rest and relaxation,Water,7
|
| 188 |
+
Dirty streets due to irregular cleaning,Sanitation,5
|
| 189 |
+
Cats creating noise at night,Stray Animals,6
|
| 190 |
+
Low pressure water supply prevents proper cleaning,Water,7
|
| 191 |
+
Blocked drainage causing flooding near park,Sanitation,5
|
| 192 |
+
Road near bridge uneven and dangerous,Roads,4
|
| 193 |
+
Garbage dumping near markets creates unhygienic conditions,Sanitation,5
|
| 194 |
+
Persistent water pump noise causes frustration and mental strain among residents,Water,7
|
| 195 |
+
Waste decomposition is releasing strong pollutants into the air,Pollution,2
|
| 196 |
+
Drain cleaning vehicle not coming,Sanitation,5
|
| 197 |
+
Oil spills polluting street drains,Pollution,2
|
| 198 |
+
No ramps in buses for wheelchairs,Public Transport,3
|
| 199 |
+
Traffic signs missing on busy roads,Roads,4
|
| 200 |
+
Stray animals fighting each other in streets,Stray Animals,6
|
| 201 |
+
Stray cats entering residential buildings,Stray Animals,6
|
| 202 |
+
Dirty streets near market area,Sanitation,5
|
| 203 |
+
Overhead wires sagging dangerously,Electricity,0
|
| 204 |
+
Stray cows wandering near schools,Stray Animals,6
|
| 205 |
+
Water quality test failed,Water,7
|
| 206 |
+
Excessive noise from generators violates permissible sound levels,Pollution,2
|
| 207 |
+
Supply water contains excessive chlorine smell,Water,7
|
| 208 |
+
Pollution from waste incineration spreads toxins,Pollution,2
|
| 209 |
+
Garbage bins are broken and unusable forcing people to dump waste outside,Garbage,1
|
| 210 |
+
Public sanitation facilities lack proper water supply,Sanitation,5
|
| 211 |
+
Inconsistent water flow damages household water storage systems,Water,7
|
| 212 |
+
Unreliable power disables water purification infrastructure intermittently,Electricity,0
|
| 213 |
+
Power cuts affecting businesses,Electricity,0
|
| 214 |
+
No night bus services available,Public Transport,3
|
| 215 |
+
Odor from sewage backup in street,Pollution,2
|
| 216 |
+
Traffic congestion near junction,Roads,4
|
| 217 |
+
Drainage problem near shops,Sanitation,5
|
| 218 |
+
Long term waste accumulation is degrading environmental quality,Pollution,2
|
| 219 |
+
Residents express concern over long-term exposure to water pump noise pollution,Water,7
|
| 220 |
+
Roadside erosion releases dust affecting respiratory health,Pollution,2
|
| 221 |
+
Dust from construction debris affecting children,Pollution,2
|
| 222 |
+
Uneven pavements disrupt public transport schedules,Roads,4
|
| 223 |
+
Streetlights off on main road,Electricity,0
|
| 224 |
+
Road surfaces have sunk creating deep depressions,Roads,4
|
| 225 |
+
Air pollution from brick kiln operations,Pollution,2
|
| 226 |
+
Old buses causing discomfort to passengers,Public Transport,3
|
| 227 |
+
Overhead tank valve is malfunctioning,Water,7
|
| 228 |
+
Streetlights not working in residential area,Electricity,0
|
| 229 |
+
Power cuts during peak hours,Electricity,0
|
| 230 |
+
Electric line damaged due to rain,Electricity,0
|
| 231 |
+
Bus drivers not following traffic rules,Public Transport,3
|
| 232 |
+
Drinking water tanker arrives late,Water,7
|
| 233 |
+
Stray animals leaving waste on streets,Stray Animals,6
|
| 234 |
+
Stray cattle sit on speed breakers causing visibility issues,Stray Animals,6
|
| 235 |
+
Poor sanitation maintenance increases mosquito breeding,Sanitation,5
|
| 236 |
+
Street littered with plastic bags,Garbage,1
|
| 237 |
+
Garbage trucks not covering all streets,Garbage,1
|
| 238 |
+
No emergency numbers displayed in buses,Public Transport,3
|
| 239 |
+
Airborne particulate matter rises due to prolonged vehicle idling in traffic heavy corridors,Pollution,2
|
| 240 |
+
Irregular water supply forces residents to rely on unsafe storage methods,Water,7
|
| 241 |
+
Stray dogs disrupt morning walks in residential colonies,Stray Animals,6
|
| 242 |
+
Voltage drops affect illuminated road signage clarity at night,Electricity,0
|
| 243 |
+
Garbage trucks create spillage during transportation,Sanitation,5
|
| 244 |
+
Electric wires exposed near playground,Electricity,0
|
| 245 |
+
Garbage remains uncleared despite municipal schedules,Garbage,1
|
| 246 |
+
Electricity department does not provide outage updates,Electricity,0
|
| 247 |
+
Road near market has cracks,Roads,4
|
| 248 |
+
Damaged road shoulders reduce usable driving space,Roads,4
|
| 249 |
+
Pollution levels rise due to unmanaged waste decay,Pollution,2
|
| 250 |
+
Plastic waste mixed with organic waste,Garbage,1
|
| 251 |
+
Stray dogs forming packs near temples,Stray Animals,6
|
| 252 |
+
Streetlight poles broken,Electricity,0
|
| 253 |
+
Road surface slippery due to oil spillage,Roads,4
|
| 254 |
+
Garbage bins are not sufficient for waste volume,Garbage,1
|
| 255 |
+
Garbage disposal points are poorly managed and constantly overflowing,Garbage,1
|
| 256 |
+
Stray cows blocking traffic on highways,Stray Animals,6
|
| 257 |
+
Burning leaves and trash releases harmful pollutants into the air,Pollution,2
|
| 258 |
+
Blocked drains causing flooding near park,Sanitation,5
|
| 259 |
+
Power failures affect monitoring of traffic density resulting in unmanaged congestion,Electricity,0
|
| 260 |
+
Long queues for public transport tickets,Public Transport,3
|
| 261 |
+
Buses without functional horn or lights,Public Transport,3
|
| 262 |
+
Uncontrolled pollution in urban area,Pollution,2
|
| 263 |
+
Auto drivers misbehaving with passengers,Public Transport,3
|
| 264 |
+
Electricity outages affect elevator operations in apartments,Electricity,0
|
| 265 |
+
Electric infrastructure maintenance is irregular and insufficient,Electricity,0
|
| 266 |
+
Fuse boxes not maintained,Electricity,0
|
| 267 |
+
No asphalt layer on road,Roads,4
|
| 268 |
+
Water supply interruptions affect hospitals and schools,Water,7
|
| 269 |
+
Bus staff not enforcing safety measures,Public Transport,3
|
| 270 |
+
Water leakage causes erosion around building foundations,Water,7
|
| 271 |
+
No shelter homes for injured strays,Stray Animals,6
|
| 272 |
+
Dirty drains causing mosquito nuisance,Sanitation,5
|
| 273 |
+
Frequent power cuts during peak hours,Electricity,0
|
| 274 |
+
Narrow road design leads to chronic congestion during working hours,Roads,4
|
| 275 |
+
Drain clogged for days,Sanitation,5
|
| 276 |
+
Stray animals defecate near homes causing hygiene problems,Stray Animals,6
|
| 277 |
+
No segregation of wet and dry waste,Garbage,1
|
| 278 |
+
Power cuts occur without any prior announcements,Electricity,0
|
| 279 |
+
Poor maintenance of public transport vehicles,Public Transport,3
|
| 280 |
+
Drain water flowing continuously,Sanitation,5
|
| 281 |
+
Road surfaces have lost structural integrity,Roads,4
|
| 282 |
+
Water contamination in handpump near residential block,Water,7
|
| 283 |
+
Power cuts affecting hospitals,Electricity,0
|
| 284 |
+
Garbage from nearby markets is dumped irresponsibly in residential zones,Garbage,1
|
| 285 |
+
Odor pollution from garbage dumping site,Pollution,2
|
| 286 |
+
Buses not stopping at requested locations,Public Transport,3
|
| 287 |
+
Water supply irregular in residential colony,Water,7
|
| 288 |
+
Environmental pollution is increasing due to lack of proper waste treatment,Pollution,2
|
| 289 |
+
Electricity department fails to upgrade outdated infrastructure,Electricity,0
|
| 290 |
+
Street littered with paper and plastic waste,Garbage,1
|
| 291 |
+
Noise from night-time clubs disturbing residents,Pollution,2
|
| 292 |
+
Open defecation near residential area,Sanitation,5
|
| 293 |
+
Electric failures affect emergency response systems across major road corridors,Electricity,0
|
| 294 |
+
Open garbage near hospital creating health hazard,Garbage,1
|
| 295 |
+
Water pollution due to sewage leakage,Pollution,2
|
| 296 |
+
Public dustbin overflowing,Garbage,1
|
| 297 |
+
Multiple pollution sources in area,Pollution,2
|
| 298 |
+
Stray cats spreading diseases in markets,Stray Animals,6
|
| 299 |
+
Road shoulders are damaged making pedestrian movement unsafe,Roads,4
|
| 300 |
+
Transformer making noise,Electricity,0
|
| 301 |
+
Bus drivers refusing to stop at requested locations,Public Transport,3
|
| 302 |
+
No proper animal control in residential areas,Stray Animals,6
|
| 303 |
+
Smoke from crematorium affecting local area,Pollution,2
|
| 304 |
+
Stray dogs near railway station scaring passengers,Stray Animals,6
|
| 305 |
+
Garbage dumping near drains causes sewage overflow during rainfall,Garbage,1
|
| 306 |
+
Drain overflow creating traffic issue,Sanitation,5
|
| 307 |
+
Dirty drains causing waterlogging during monsoon,Sanitation,5
|
| 308 |
+
Garbage not collected from high-rise buildings,Garbage,1
|
| 309 |
+
Garbage collection irregular in park area,Garbage,1
|
| 310 |
+
Supply water contains excessive sediment affecting water filters,Water,7
|
| 311 |
+
Sanitation backflow damages road surfaces,Sanitation,5
|
| 312 |
+
Stray animals blocking sidewalks,Stray Animals,6
|
| 313 |
+
Water supply stops abruptly without any official communication,Water,7
|
| 314 |
+
Stray animals damage parked vehicles while searching for food,Stray Animals,6
|
| 315 |
+
Overflowing bins near bus stops create hygiene and commuter discomfort,Garbage,1
|
| 316 |
+
Buses not running on weekends,Public Transport,3
|
| 317 |
+
Bus drivers ignoring signals,Public Transport,3
|
| 318 |
+
Incomplete road projects create traffic bottlenecks,Roads,4
|
| 319 |
+
Electricity outages occur frequently during weekends,Electricity,0
|
| 320 |
+
Traffic signal timing not optimized,Roads,4
|
| 321 |
+
Buses not following GPS routes,Public Transport,3
|
| 322 |
+
Stray animals sleeping on pavements,Stray Animals,6
|
| 323 |
+
Drain blockage causing flooding,Sanitation,5
|
| 324 |
+
Electricity supply issues disrupt online education,Electricity,0
|
| 325 |
+
Polluted soil contains hazardous chemicals,Pollution,2
|
| 326 |
+
The water pump noise remains constant without breaks causing ongoing stress to nearby households,Water,7
|
| 327 |
+
Water overflow near house,Water,7
|
| 328 |
+
Garbage collection vehicles do not arrive on time,Garbage,1
|
| 329 |
+
Trapped traffic emits concentrated pollutants impacting nearby pedestrians,Pollution,2
|
| 330 |
+
Road surface uneven near playground,Roads,4
|
| 331 |
+
Poor road quality is forcing vehicles to take long detours,Roads,4
|
| 332 |
+
Long term waste dumping is degrading environmental health in the locality,Pollution,2
|
| 333 |
+
Stray cats entering homes and damaging property,Stray Animals,6
|
| 334 |
+
Traffic signals malfunctioning,Roads,4
|
| 335 |
+
Odor from sewage backup in residential area,Pollution,2
|
| 336 |
+
Stray dogs gather near food waste sites,Stray Animals,6
|
| 337 |
+
Detour related fuel burn increases atmospheric contamination,Pollution,2
|
| 338 |
+
No announcements for stops for visually impaired,Public Transport,3
|
| 339 |
+
Garbage disposal practices need urgent improvement,Garbage,1
|
| 340 |
+
Uncollected household garbage near shops,Garbage,1
|
| 341 |
+
Pollution from heavy vehicles affects residential zones,Pollution,2
|
| 342 |
+
Noise from metro construction disturbing residents,Pollution,2
|
| 343 |
+
Waste bins are not covered allowing animals to scatter garbage,Garbage,1
|
| 344 |
+
Garbage is not collected daily leading to foul smells and health concerns,Sanitation,5
|
| 345 |
+
Loose electrical connections cause repeated outages,Electricity,0
|
| 346 |
+
Electric shock from pole,Electricity,0
|
| 347 |
+
Smoke from roadside eateries causing health issues,Pollution,2
|
| 348 |
+
Road repair complaint ignored,Roads,4
|
| 349 |
+
Broken roads increase travel stress and fatigue,Roads,4
|
| 350 |
+
Waste disposal practices increase environmental hazards,Sanitation,5
|
| 351 |
+
Decaying waste releases gases that significantly degrade air quality,Pollution,2
|
| 352 |
+
Electric outage since yesterday,Electricity,0
|
| 353 |
+
Public sanitation services lack accountability,Sanitation,5
|
| 354 |
+
Drivers not following assigned routes,Public Transport,3
|
| 355 |
+
Buses overcrowded with standing passengers,Public Transport,3
|
| 356 |
+
Water supply does not meet the needs of growing population in the area,Water,7
|
| 357 |
+
Frequent fare disputes in buses,Public Transport,3
|
| 358 |
+
Stray animals causing sanitation issues,Stray Animals,6
|
| 359 |
+
Accumulated waste attracts insects and rodents,Sanitation,5
|
| 360 |
+
Sewage water flowing on road,Sanitation,5
|
| 361 |
+
Air pollution from construction dust near hospital,Pollution,2
|
| 362 |
+
Drainage overflow in street,Sanitation,5
|
| 363 |
+
Sanitation workers do not report for duty regularly,Sanitation,5
|
| 364 |
+
Water pressure drops affect sanitation and hygiene practices,Water,7
|
| 365 |
+
Overflowing water tanks cause wastage due to faulty valves,Water,7
|
| 366 |
+
Animal interference increases sanitation workload,Stray Animals,6
|
| 367 |
+
Overflowing drains near school,Sanitation,5
|
| 368 |
+
Garbage is piling up near bus stops and public areas,Garbage,1
|
| 369 |
+
Open drains near houses,Sanitation,5
|
| 370 |
+
Power outages occur daily during peak usage hours without explanation,Electricity,0
|
| 371 |
+
Sewage stagnation creates unhygienic living conditions,Sanitation,5
|
| 372 |
+
Odor from chemical treatment plant near road,Pollution,2
|
| 373 |
+
Stray goats eating flowers in gardens,Stray Animals,6
|
| 374 |
+
Odor from garbage dump near residential area,Pollution,2
|
| 375 |
+
Animals crossing roads increase collision risk,Stray Animals,6
|
| 376 |
+
Speed bumps not visible at night,Roads,4
|
| 377 |
+
Odor from sewage near commercial complex,Pollution,2
|
| 378 |
+
Air pollution aggravates asthma and respiratory conditions,Pollution,2
|
| 379 |
+
Drain overflow near hospital,Sanitation,5
|
| 380 |
+
Water management inefficiency impacts urban resilience,Water,7
|
| 381 |
+
Dumped waste is polluting nearby agricultural land,Pollution,2
|
| 382 |
+
Street taps dry after pipeline maintenance,Water,7
|
| 383 |
+
Organic waste decomposition is polluting the air and attracting disease carrying insects,Pollution,2
|
| 384 |
+
Lack of municipal response to stray animal complaints is concerning,Stray Animals,6
|
| 385 |
+
Water meters show abnormal readings despite limited water usage,Water,7
|
| 386 |
+
Uneven road surface,Roads,4
|
| 387 |
+
Waste disposal sites attract stray animals,Sanitation,5
|
| 388 |
+
Industrial pollution affects nearby residential quality of life,Pollution,2
|
| 389 |
+
Electric lines are exposed and unsafe in public areas,Electricity,0
|
| 390 |
+
Public toilets lacking maintenance schedule,Sanitation,5
|
| 391 |
+
Ongoing water pump vibrations are creating a persistent nuisance for families living nearby,Water,7
|
| 392 |
+
Water tankers charge high prices due to municipal shortages,Water,7
|
| 393 |
+
Dogs entering houses frequently,Stray Animals,6
|
| 394 |
+
Water pipeline leakage near main road,Water,7
|
| 395 |
+
Waste segregation is not practiced consistently,Garbage,1
|
| 396 |
+
Street corners full of mixed garbage,Garbage,1
|
| 397 |
+
Streetlight poles corroded,Electricity,0
|
| 398 |
+
Buses not following scheduled intervals,Public Transport,3
|
| 399 |
+
Power failures disrupt home medical equipment usage,Electricity,0
|
| 400 |
+
Road surfaces are damaged by heavy construction vehicle movement,Roads,4
|
| 401 |
+
Air pollution from coal transport trucks,Pollution,2
|
| 402 |
+
Garbage collection trucks not available,Garbage,1
|
| 403 |
+
Stray animals knocking over dustbins,Stray Animals,6
|
| 404 |
+
Construction debris dumped illegally,Pollution,2
|
| 405 |
+
Pollution caused by waste decay is affecting nearby residential comfort,Pollution,2
|
| 406 |
+
Damaged culvert causing road erosion,Roads,4
|
| 407 |
+
Waste breakdown affects surrounding environmental conditions,Pollution,2
|
| 408 |
+
Stray goats wandering on highways,Stray Animals,6
|
| 409 |
+
Garbage is often burned causing air pollution,Sanitation,5
|
| 410 |
+
Power outage affecting local shops,Electricity,0
|
| 411 |
+
Sanitation neglect increases long term infrastructure damage,Sanitation,5
|
| 412 |
+
Street light flickering,Electricity,0
|
| 413 |
+
Poor road connectivity increases travel time and logistical inefficiency,Roads,4
|
| 414 |
+
Water contamination from untreated sewage,Pollution,2
|
| 415 |
+
Garbage piles draw stray dogs increasing public safety concerns,Garbage,1
|
| 416 |
+
Water pump noise disturbing residents,Water,7
|
| 417 |
+
Stray animals disrupt peaceful living in residential areas,Stray Animals,6
|
| 418 |
+
Garbage collection delayed during rainy season,Garbage,1
|
| 419 |
+
Residents complain about persistent water pump noise,Water,7
|
| 420 |
+
Road maintenance work is delayed for months without explanation,Roads,4
|
| 421 |
+
Stray goats entering parks,Stray Animals,6
|
| 422 |
+
Road surface cracks widening,Roads,4
|
| 423 |
+
Bus stations without toilets or drinking water,Public Transport,3
|
| 424 |
+
Traffic signals not functioning at major intersection,Roads,4
|
| 425 |
+
Road network gaps increase dependency on longer travel routes,Roads,4
|
| 426 |
+
Electric poles damaged due to storm,Electricity,0
|
| 427 |
+
Power cuts disrupting businesses,Electricity,0
|
| 428 |
+
Stray animals lack proper shelters leading to street occupation,Stray Animals,6
|
| 429 |
+
Stray cattle wander into marketplaces creating safety hazards,Stray Animals,6
|
| 430 |
+
Road near bus stop damaged,Roads,4
|
| 431 |
+
Improper waste management is causing resident dissatisfaction,Garbage,1
|
| 432 |
+
Garbage remains scattered after market hours,Garbage,1
|
| 433 |
+
Industrial wastewater discharge pollutes groundwater sources,Pollution,2
|
| 434 |
+
Garbage accumulation creates breeding grounds for pests,Sanitation,5
|
| 435 |
+
Road full of potholes near bus stand,Roads,4
|
| 436 |
+
Waste management practices do not meet basic standards,Sanitation,5
|
| 437 |
+
Odor from poultry market affecting houses,Pollution,2
|
| 438 |
+
Overhead tank overflowing constantly,Water,7
|
| 439 |
+
Sewage pipe damaged,Sanitation,5
|
| 440 |
+
Smoke from cooking chimneys in dense areas,Pollution,2
|
| 441 |
+
Dirty drains causing street flooding,Sanitation,5
|
| 442 |
+
Water tank cleaning required,Water,7
|
| 443 |
+
Garbage lying near drain,Garbage,1
|
| 444 |
+
Drivers not issuing proper receipts,Public Transport,3
|
| 445 |
+
Water tanker delivery inconsistent,Water,7
|
| 446 |
+
Suspended dust particles from road surfaces are contributing to respiratory discomfort,Pollution,2
|
| 447 |
+
Lack of buses in suburban areas,Public Transport,3
|
| 448 |
+
Stray dogs guard territories aggressively near houses,Stray Animals,6
|
| 449 |
+
Stagnant water near public park,Sanitation,5
|
| 450 |
+
Road near market slippery after rain,Roads,4
|
| 451 |
+
Sanitation workers not patrolling community areas,Sanitation,5
|
| 452 |
+
Water pipes frequently clog causing supply interruptions,Water,7
|
| 453 |
+
No electricity in residential block,Electricity,0
|
| 454 |
+
Odor from leather tanning unit near river,Pollution,2
|
| 455 |
+
Noise pollution from nearby nightclub,Pollution,2
|
| 456 |
+
Water mains corrode causing supply issues,Water,7
|
| 457 |
+
Drain water entering homes,Sanitation,5
|
| 458 |
+
Water contamination reported in residential area,Water,7
|
| 459 |
+
Water supply pressure is too weak to reach upper floors of buildings,Water,7
|
| 460 |
+
Voltage drop in colony during night,Electricity,0
|
| 461 |
+
Burning of garbage creating toxic smoke,Pollution,2
|
| 462 |
+
Garbage pile near street corner attracting rats,Garbage,1
|
| 463 |
+
Road shoulder erosion limits usable space leading to congestion,Roads,4
|
| 464 |
+
Damaged roads near hospitals are affecting ambulance movement,Roads,4
|
| 465 |
+
Potholes near shopping complex,Roads,4
|
| 466 |
+
Road surface dust contributes to respiratory discomfort,Roads,4
|
| 467 |
+
Water infrastructure repairs lack proper supervision,Water,7
|
| 468 |
+
No proper waiting areas at bus terminals,Public Transport,3
|
| 469 |
+
Garbage scattered by stray animals,Garbage,1
|
| 470 |
+
Garbage not segregated in bins,Garbage,1
|
| 471 |
+
Water tanker service not available on time,Water,7
|
| 472 |
+
Dust from open construction sites affecting market,Pollution,2
|
| 473 |
+
Air pollution increases hospital visits for breathing issues,Pollution,2
|
| 474 |
+
Public toilets locked or inaccessible,Sanitation,5
|
| 475 |
+
Overflowing sewage near commercial area,Sanitation,5
|
| 476 |
+
Electric outages halt automated water distribution scheduling systems,Electricity,0
|
| 477 |
+
The water pump emits a harsh mechanical sound that causes constant irritation to residents,Water,7
|
| 478 |
+
Bad road near hospital,Roads,4
|
| 479 |
+
Pollution affecting quality of life,Pollution,2
|
| 480 |
+
Water supply timing not communicated,Water,7
|
| 481 |
+
Water pipelines lack proper insulation and protection,Water,7
|
| 482 |
+
Excessive vehicle emissions in this area have significantly reduced air quality levels,Pollution,2
|
| 483 |
+
Stray animals cause night time disturbances near homes,Stray Animals,6
|
| 484 |
+
Bus drivers not obeying traffic signals,Public Transport,3
|
| 485 |
+
Sanitation services are poorly monitored,Sanitation,5
|
| 486 |
+
No separate buses for students,Public Transport,3
|
| 487 |
+
Temporary road repairs wash away during rains,Roads,4
|
| 488 |
+
Noise pollution from factories disturbs nearby residents,Pollution,2
|
| 489 |
+
Stray animals affect commuter comfort,Stray Animals,6
|
| 490 |
+
Damaged roads force vehicles to take longer detours increasing fuel consumption,Roads,4
|
| 491 |
+
Garbage has been left unattended near public places,Garbage,1
|
| 492 |
+
Sewage leaks enter drainage and road systems,Sanitation,5
|
| 493 |
+
Improper garbage disposal affects nearby residential buildings,Sanitation,5
|
| 494 |
+
Garbage heaps are visible across multiple streets,Garbage,1
|
| 495 |
+
Electric supply disruptions force manual traffic handling causing congestion,Electricity,0
|
| 496 |
+
Electric infrastructure failures disrupt adaptive traffic control systems,Electricity,0
|
| 497 |
+
Sanitation complaints are not addressed promptly by authorities,Sanitation,5
|
| 498 |
+
Garbage dumped in public spaces is affecting cleanliness,Garbage,1
|
| 499 |
+
Smoke from tire burning in industrial area,Pollution,2
|
| 500 |
+
Damaged guardrail causing accident,Roads,4
|
| 501 |
+
Low water pressure in newly built area,Water,7
|
| 502 |
+
Irregular water supply during festival season,Water,7
|
| 503 |
+
Unregulated tanker water sources raise safety concerns,Water,7
|
| 504 |
+
Water from taps has unusual odor,Water,7
|
| 505 |
+
Accumulated waste near bus stops,Garbage,1
|
| 506 |
+
Street water valves leaking,Water,7
|
| 507 |
+
Voltage spikes affecting ACs,Electricity,0
|
| 508 |
+
Smoke from burning tires on roadside,Pollution,2
|
| 509 |
+
Stray goats wandering in playgrounds,Stray Animals,6
|
| 510 |
+
Stray cattle block roads during peak hours disrupting traffic flow,Stray Animals,6
|
| 511 |
+
Unfriendly behavior from bus staff,Public Transport,3
|
| 512 |
+
Water supply infrastructure expansion has not kept pace with growth,Water,7
|
| 513 |
+
The loud humming of the water pump causes discomfort throughout the day and night,Water,7
|
| 514 |
+
Road construction debris restricts lane capacity,Roads,4
|
| 515 |
+
Streetlight malfunction causing darkness,Electricity,0
|
| 516 |
+
Water supply disrupted due to civic work,Water,7
|
| 517 |
+
Dirty water flowing from street taps,Water,7
|
| 518 |
+
Pollution from heavy machinery continues throughout night,Pollution,2
|
| 519 |
+
Stray animals causing traffic delays,Stray Animals,6
|
| 520 |
+
Industrial waste discharged into river,Pollution,2
|
| 521 |
+
Electric poles with broken cross arms,Electricity,0
|
| 522 |
+
Improper waste dumping worsens underground sanitation congestion,Garbage,1
|
| 523 |
+
Water supply schedules change without notification,Water,7
|
classification/bert_classify.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# =========================================================
# BERT MODEL — CATEGORY CLASSIFICATION (ENGLISH)
# =========================================================
# Inference-only module: loads the artifacts produced by
# classification/bert_model.py once at import time and exposes predict().

import os
import re
import torch
import pickle
from transformers import BertForSequenceClassification

# ── Path config ───────────────────────────────────────────
# Anchored to this file (not the CWD) so imports work from any directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
MODEL_DIR = os.path.join(ARTIFACT_DIR, "bert_model")
MAX_LENGTH = 128  # FIX: was 100 — aligned with IG explainer and indic module

# ── Load artifacts (once, at import time) ─────────────────
# NOTE(review): pickle.load executes arbitrary code on load — acceptable
# only because these artifacts are produced by our own training script
# (bert_model.py), never by untrusted input.
with open(os.path.join(ARTIFACT_DIR, "tokenizer.pkl"), "rb") as f:
    tokenizer = pickle.load(f)

with open(os.path.join(ARTIFACT_DIR, "label_encoder.pkl"), "rb") as f:
    label_encoder = pickle.load(f)

# local_files_only=True: never hit the network — the fine-tuned weights
# must already exist on disk.
model = BertForSequenceClassification.from_pretrained(
    MODEL_DIR, local_files_only=True
)
model.eval()  # inference only — disables dropout
# ── Edge-case constants ───────────────────────────────────
# Bare category names — a message consisting of ONLY a label word carries
# no actionable grievance content and is rejected by validate_input().
LABEL_WORDS = {
    "water", "electricity", "roads", "garbage",
    "sanitation", "pollution", "transport", "animals",
}

# Exact-match (lower-cased) phrases that are greetings, acknowledgements
# or test noise — not grievances. NOTE: entries shorter than 10 characters
# (the emoji, "ok", "hi", ...) are already rejected by the length gate in
# validate_input() before this set is ever consulted; they are kept here
# for completeness.
NON_GRIEVANCE_PHRASES = {
    "hello", "hi", "hi there", "hey", "hey there",
    "good morning", "good afternoon", "good evening", "good day",
    "greetings", "namaste", "how are you", "how are you doing",
    "hope you are doing well", "hope everything is fine",
    "just checking in", "nice to meet you", "long time no see",
    "good weather", "nice weather", "weather is nice", "weather is good",
    "it is a sunny day", "it is raining today", "pleasant weather",
    "cool weather today", "hot weather today", "cold weather today",
    "it is a good day", "everything is fine", "all good", "no issues",
    "no problem", "things are okay", "everything looks good",
    "nothing to complain", "all services are working",
    "thank you", "thanks", "thanks a lot", "thank you very much",
    "appreciate it", "appreciate your help", "great work", "good job",
    "well done", "excellent service", "for your information",
    "just informing", "sharing information", "today is a holiday",
    "office opens at 10 am", "school reopens next week",
    "meeting scheduled tomorrow", "okay", "ok", "alright", "fine",
    "cool", "great", "nice", "regards", "best regards", "with regards",
    "kind regards", "thank you and regards", "thank you very much sir",
    "test", "testing", "demo", "sample text", "random text",
    "🙂", "👍", "🙏", "😂", "🔥", "!!!", "???",
}
# ── Text cleaning ─────────────────────────────────────────
def clean_text(text: str) -> str:
    """Normalise raw grievance text for the BERT model.

    Strips HTML-like tags and collapses whitespace runs. Non-ASCII
    characters are deliberately preserved: this module is routed English
    text only, but stripping non-ASCII here would silently corrupt any
    mis-routed Indic input.
    """
    without_tags = re.sub(r"<.*?>", " ", str(text))
    collapsed = re.sub(r"\s+", " ", without_tags)
    return collapsed.strip()
# ── Input validation ──────────────────────────────────────
def validate_input(text: str):
    """Rule-based pre-filter applied before any model inference.

    Returns a short rejection-reason string, or ``None`` when the text
    passes every gate and should be sent to the classifier.
    """
    if not text or not text.strip():
        return "empty_text"

    normalised = text.strip().lower()

    # Gates are ordered cheapest-first and evaluated lazily, so an early
    # rejection never reaches the later set lookups.
    gates = (
        ("too_short", lambda: len(normalised) < 10),
        ("too_few_words", lambda: len(normalised.split()) < 3),
        ("label_only", lambda: normalised in LABEL_WORDS),
        ("non_grievance_text", lambda: normalised in NON_GRIEVANCE_PHRASES),
    )
    for reason, tripped in gates:
        if tripped():
            return reason
    return None
# ── Predict ───────────────────────────────────────────────
def predict(
    text: str,
    input_ids=None,       # O3: pre-tokenised tensor from main.py
    attention_mask=None,  # O3: pre-tokenised tensor from main.py
) -> dict:
    """
    Predict grievance category for English text.

    Args:
        text           : Raw input string (always required for validation).
        input_ids      : Optional pre-tokenised tensor (1, seq_len).
                         When provided by main.py the internal tokenisation
                         step is skipped — eliminates duplicate tokenisation.
        attention_mask : Required when input_ids is provided.

    Returns:
        dict with keys: status, reason, category, confidence, class_index.
        FIX: the success path now also carries "reason": None so every
        return path shares one schema — callers can read result["reason"]
        without risking a KeyError.
    """
    # 1. Rule-based validation (always on raw text)
    reason = validate_input(text)
    if reason:
        return {
            "status": "failed",
            "reason": reason,
            "category": None,
            "confidence": 0.0,
            "class_index": None,
        }

    # 2. Clean text for model consumption
    cleaned = clean_text(text)

    # 3. O3: use pre-tokenised tensors if supplied; otherwise tokenise now.
    #    padding=False — single-string inference needs no padding;
    #    avoids [PAD] tokens appearing in IG attributions.
    if input_ids is None:
        enc = tokenizer(
            cleaned,
            return_tensors="pt",
            truncation=True,
            padding=False,
            max_length=MAX_LENGTH,
        )
        input_ids = enc["input_ids"]
        attention_mask = enc["attention_mask"]

    # 4. Forward pass — no autograd needed for inference
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    probs = torch.softmax(outputs.logits, dim=1)
    conf, pred = torch.max(probs, dim=1)
    confidence = conf.item()
    predicted_index = pred.item()

    # 5. Confidence gate — anything below 0.30 is routed to "Other"
    if confidence < 0.30:
        return {
            "status": "success",
            "reason": "low_confidence",
            "category": "Other",
            "confidence": round(confidence, 4),
            "class_index": predicted_index,
        }

    label = label_encoder.inverse_transform([predicted_index])[0]

    return {
        "status": "success",
        "reason": None,  # FIX: key present on every path for a uniform schema
        "category": label,
        "confidence": round(confidence, 4),
        "class_index": predicted_index,
    }
def get_model_and_tokenizer():
    """Return the module-level (model, tokenizer) pair for callers that
    need direct access to the loaded artifacts (e.g. main.py — confirm)."""
    pair = (model, tokenizer)
    return pair
classification/bert_model.py
ADDED
|
@@ -0,0 +1,417 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# =========================================================
# BERT PREPROCESSING + TRAINING + ARTIFACT GENERATION
# =========================================================
# FIX: the original file contained THREE partially-overlapping copies of
# this script concatenated together (one duplicate header was even fused
# onto a pickle.dump(label_map, f) line), so a single run redefined every
# object and called trainer.train() twice on the same model. This is the
# deduplicated single-pass version; the artifact paths, metric names and
# printed output of one clean training run are preserved.

import os
import pickle
import re

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    matthews_corrcoef,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    Trainer,
    TrainingArguments,
)

# ---------------------------------------------------------
# PATH CONFIG (anchored to this file, not the CWD)
# ---------------------------------------------------------
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(BASE_DIR, "train.csv")

ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
MODEL_DIR = os.path.join(ARTIFACT_DIR, "bert_model")

MAX_LENGTH = 100  # NOTE(review): inference module uses 128 — confirm intended
EPOCHS = 3
BATCH_SIZE = 16
LEARNING_RATE = 2e-5

os.makedirs(ARTIFACT_DIR, exist_ok=True)


# ---------------------------------------------------------
# 1. LOAD DATA
# ---------------------------------------------------------
print(f"📄 Loading dataset from: {DATA_PATH}")

df = pd.read_csv(DATA_PATH)
df = df[["text", "label"]]
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)


# ---------------------------------------------------------
# 2. CLEAN TEXT (BERT SAFE)
# ---------------------------------------------------------
def clean_text(text):
    """Strip HTML-like tags and non-ASCII chars, collapse whitespace."""
    text = str(text)
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"[^\x00-\x7F]+", " ", text)  # English-only training corpus
    text = re.sub(r"\s+", " ", text).strip()
    return text


df["text"] = df["text"].apply(clean_text)


# ---------------------------------------------------------
# 3. LABEL ENCODING
# ---------------------------------------------------------
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["label"])

label_map = dict(zip(label_encoder.classes_,
                     label_encoder.transform(label_encoder.classes_)))

# Save label artifacts (consumed at inference time by bert_classify.py)
with open(os.path.join(ARTIFACT_DIR, "label_encoder.pkl"), "wb") as f:
    pickle.dump(label_encoder, f)

with open(os.path.join(ARTIFACT_DIR, "label_map.pkl"), "wb") as f:
    pickle.dump(label_map, f)

NUM_LABELS = len(label_map)
print(f"✅ Number of classes: {NUM_LABELS}")


# ---------------------------------------------------------
# 4. TRAIN / VAL / TEST SPLIT (70 / 15 / 15, stratified)
# ---------------------------------------------------------
train_df, temp_df = train_test_split(
    df,
    test_size=0.30,
    stratify=df["label_id"],
    random_state=42
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df["label_id"],
    random_state=42
)

# Save processed splits for reproducibility / downstream audits
train_df.to_csv(os.path.join(ARTIFACT_DIR, "train.csv"), index=False)
val_df.to_csv(os.path.join(ARTIFACT_DIR, "val.csv"), index=False)
test_df.to_csv(os.path.join(ARTIFACT_DIR, "test.csv"), index=False)


# ---------------------------------------------------------
# 5. TOKENIZER
# ---------------------------------------------------------
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

with open(os.path.join(ARTIFACT_DIR, "tokenizer.pkl"), "wb") as f:
    pickle.dump(tokenizer, f)


# ---------------------------------------------------------
# 6. TORCH DATASET
# ---------------------------------------------------------
class GrievanceDataset(Dataset):
    """Eagerly tokenises all texts; serves (encoding, label) items."""

    def __init__(self, texts, labels):
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH
        )
        self.labels = list(labels)

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = GrievanceDataset(train_df["text"], train_df["label_id"])
val_dataset = GrievanceDataset(val_df["text"], val_df["label_id"])
test_dataset = GrievanceDataset(test_df["text"], test_df["label_id"])


# ---------------------------------------------------------
# 7. MODEL
# ---------------------------------------------------------
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=NUM_LABELS
)


# ---------------------------------------------------------
# 8. METRICS
# ---------------------------------------------------------
def compute_metrics(eval_pred):
    """Hugging Face Trainer hook: (logits, labels) -> metric dict."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    return {
        "accuracy": accuracy_score(labels, preds),
        "balanced_accuracy": balanced_accuracy_score(labels, preds),
        "f1_weighted": f1_score(labels, preds, average="weighted"),
        "mcc": matthews_corrcoef(labels, preds)
    }


# ---------------------------------------------------------
# 9. TRAINING
# ---------------------------------------------------------
training_args = TrainingArguments(
    output_dir=os.path.join(ARTIFACT_DIR, "results"),
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_steps=100,
    save_strategy="no",   # final model is saved explicitly below
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


# ---------------------------------------------------------
# 10. FINAL TEST EVALUATION
# ---------------------------------------------------------
predictions = trainer.predict(test_dataset)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

print("\n===== FINAL TEST METRICS =====")
print(f"Accuracy : {accuracy_score(y_true, y_pred):.4f}")
print(f"Balanced Accuracy : {balanced_accuracy_score(y_true, y_pred):.4f}")
print(f"Weighted F1 : {f1_score(y_true, y_pred, average='weighted'):.4f}")
print(f"MCC : {matthews_corrcoef(y_true, y_pred):.4f}")


# ---------------------------------------------------------
# 11. SAVE TRAINED MODEL
# ---------------------------------------------------------
model.save_pretrained(MODEL_DIR)

print("\n✅ PREPROCESSING + TRAINING + ARTIFACT GENERATION COMPLETED")
classification/classification/artifacts/label_encoder.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6b0be0d88eed1838fba777af266556aea55e435b970076684d2ad1c8c9b3fb0b
|
| 3 |
+
size 342
|
classification/classification/artifacts/label_map.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d8e10c5614e117fd9ccab4af3fa62c0e4c44d23195847586d4d1ddb47f4a00cc
|
| 3 |
+
size 321
|
classification/indic_bert_classify.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# =========================================================
# INDICBERT MODEL — CATEGORY CLASSIFICATION (HINDI + TELUGU)
# =========================================================
# Inference-only module: loads the fine-tuned IndicBERT artifacts once at
# import time and exposes predict().

import os
import re
import torch
import pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ── Path config ───────────────────────────────────────────
# Anchored to this file (not the CWD) so imports work from any directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
MODEL_DIR = os.path.join(ARTIFACT_DIR, "indicbert_model")
MAX_LENGTH = 128  # aligned with the English bert_classify module

# ── Load artifacts (once, at import time) ─────────────────
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)

# NOTE(review): pickle.load executes arbitrary code on load — acceptable
# only because this artifact is produced by our own training pipeline.
with open(os.path.join(ARTIFACT_DIR, "label_encoder.pkl"), "rb") as f:
    label_encoder = pickle.load(f)

# local_files_only=True: never hit the network — weights must be on disk.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_DIR, local_files_only=True
)
model.eval()  # inference only — disables dropout
# ── Edge-case constants ───────────────────────────────────
# Bare category names in English, Hindi and Telugu — a message that is
# ONLY a label word carries no actionable grievance content and is
# rejected by validate_input().
LABEL_WORDS = {
    "water", "electricity", "roads", "garbage",
    "sanitation", "pollution", "transport", "animals",
    "पानी", "बिजली", "सड़क", "कचरा",
    "నీరు", "విద్యుత్", "రోడ్డు", "చెత్త",
}

# Exact-match (lower-cased) greetings/acknowledgements in the three
# supported languages — not grievances.
NON_GRIEVANCE_PHRASES = {
    "hello", "hi", "good morning", "good evening",
    "thank you", "thanks", "all good", "no issues", "test", "demo",
    "नमस्ते", "धन्यवाद", "सब ठीक है", "कोई समस्या नहीं",
    "నమస్తే", "ధన్యవాదాలు", "అన్నీ బాగున్నాయి", "సమస్య లేదు",
}
# ── Text cleaning (Indic-safe) ────────────────────────────
def clean_text(text: str) -> str:
    """Normalise text while preserving Devanagari, Telugu and ASCII.

    HTML-like tags are stripped first; then every character outside the
    Hindi (U+0900–U+097F), Telugu (U+0C00–U+0C7F) and printable-ASCII
    (U+0020–U+007F) ranges is replaced with a space, and whitespace runs
    are collapsed.
    """
    without_tags = re.sub(r"<.*?>", " ", str(text))
    allowed_only = re.sub(
        r"[^\u0900-\u097F\u0C00-\u0C7F\u0020-\u007F]", " ", without_tags
    )
    return re.sub(r"\s+", " ", allowed_only).strip()
# ── Input validation ──────────────────────────────────────
def validate_input(text: str):
    """Rule-based pre-filter applied before any model inference.

    Returns a short rejection-reason string, or ``None`` when the text
    passes every gate (thresholds: 5 chars / 2 words — looser than the
    English module's 10 / 3).
    """
    if not text or not text.strip():
        return "empty_text"

    normalised = text.strip().lower()

    # Gates are ordered cheapest-first and evaluated lazily, so an early
    # rejection never reaches the later set lookups.
    gates = (
        ("too_short", lambda: len(normalised) < 5),
        ("too_few_words", lambda: len(normalised.split()) < 2),
        ("label_only", lambda: normalised in LABEL_WORDS),
        ("non_grievance_text", lambda: normalised in NON_GRIEVANCE_PHRASES),
    )
    for reason, tripped in gates:
        if tripped():
            return reason
    return None
| 70 |
+
# ── Predict ───────────────────────────────────────────────
|
| 71 |
+
def predict(
    text: str,
    input_ids=None,          # optional pre-tokenised tensor supplied by main.py (O3 path)
    attention_mask=None,     # must accompany input_ids when provided
) -> dict:
    """
    Predict the grievance category for Hindi / Telugu text.

    Args:
        text            : Raw input string (always required — validation
                          runs on the raw text even when tensors are given).
        input_ids       : Optional pre-tokenised tensor of shape (1, seq_len).
        attention_mask  : Required when input_ids is provided.

    Returns a dict with keys: status, category, confidence, class_index
    (plus "reason" on validation failure or low confidence).

    NOTE(review): when input_ids is supplied, the pre-tokenised tensors are
    assumed to come from the same tokenizer and MAX_LENGTH as this module —
    confirm in main.py. In that path `cleaned` is computed but unused.
    """
    # 1. Rule-based validation (cheap rejection before any model work).
    reason = validate_input(text)
    if reason:
        return {
            "status": "failed",
            "reason": reason,
            "category": None,
            "confidence": 0.0,
            "class_index": None,
        }

    # 2. Clean text (Indic-safe normalisation).
    cleaned = clean_text(text)

    # 3. Use pre-tokenised tensors if supplied; otherwise tokenise now.
    #    padding=False is fine for a single example; truncation caps at
    #    MAX_LENGTH tokens.
    if input_ids is None:
        enc = tokenizer(
            cleaned,
            return_tensors="pt",
            truncation=True,
            padding=False,
            max_length=MAX_LENGTH,
        )
        input_ids = enc["input_ids"]
        attention_mask = enc["attention_mask"]

    # 4. Forward pass — no_grad avoids building the autograd graph.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Softmax over the class dimension; keep the top class and its probability.
    probs = torch.softmax(outputs.logits, dim=1)
    conf, pred = torch.max(probs, dim=1)
    confidence = conf.item()
    predicted_index = pred.item()

    # 5. Confidence gate: below 0.30 we still report status "success" (the
    #    pipeline continues) but bucket the grievance as "Other" and expose
    #    the reason so callers can surface the uncertainty.
    if confidence < 0.30:
        return {
            "status": "success",
            "reason": "low_confidence",
            "category": "Other",
            "confidence": round(confidence, 4),
            "class_index": predicted_index,
        }

    # Map the class index back to its human-readable label.
    label = label_encoder.inverse_transform([predicted_index])[0]

    return {
        "status": "success",
        "category": label,
        "confidence": round(confidence, 4),
        "class_index": predicted_index,
    }
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def get_model_and_tokenizer():
    """Expose the module-level model and tokenizer (e.g. for warm-up or
    pre-tokenisation in main.py)."""
    return model, tokenizer
|
classification/indic_bert_model.py
ADDED
|
@@ -0,0 +1,299 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =========================================================
|
| 2 |
+
# INDICBERT PREPROCESSING + TRAINING + ARTIFACT GENERATION
|
| 3 |
+
# Hindi + Telugu Grievance Classification
|
| 4 |
+
# =========================================================
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
import pickle
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import numpy as np
|
| 11 |
+
import torch
|
| 12 |
+
|
| 13 |
+
from sklearn.model_selection import train_test_split
|
| 14 |
+
from sklearn.preprocessing import LabelEncoder
|
| 15 |
+
from sklearn.metrics import (
|
| 16 |
+
accuracy_score,
|
| 17 |
+
f1_score,
|
| 18 |
+
balanced_accuracy_score,
|
| 19 |
+
matthews_corrcoef
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
from transformers import (
|
| 23 |
+
AutoTokenizer,
|
| 24 |
+
AutoModelForSequenceClassification,
|
| 25 |
+
Trainer,
|
| 26 |
+
TrainingArguments
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
from torch.utils.data import Dataset
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# =========================================================
|
| 33 |
+
# CONFIG
|
| 34 |
+
# =========================================================
|
| 35 |
+
|
| 36 |
+
# Paths are resolved relative to this file so the script is CWD-independent.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(BASE_DIR, "indic_train.csv")

ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
MODEL_DIR = os.path.join(ARTIFACT_DIR, "indicbert_model")

# Hyperparameters for fine-tuning.
MAX_LENGTH = 128
EPOCHS = 4
BATCH_SIZE = 16
LEARNING_RATE = 2e-5

# Base checkpoint pulled from the Hugging Face hub.
MODEL_NAME = "ai4bharat/indic-bert"

os.makedirs(ARTIFACT_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

print(f"📄 Loading dataset from: {DATA_PATH}")
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# =========================================================
|
| 56 |
+
# LOAD DATA
|
| 57 |
+
# =========================================================
|
| 58 |
+
|
| 59 |
+
df = pd.read_csv(DATA_PATH)

# Keep only the two columns the pipeline needs.
df = df[['text', 'label']]

# Drop unusable rows before encoding / splitting.
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# =========================================================
|
| 68 |
+
# CLEAN TEXT (KEEP HINDI & TELUGU SAFE)
|
| 69 |
+
# =========================================================
|
| 70 |
+
|
| 71 |
+
def clean_text(text):
    """Clean one text cell: drop HTML, keep Hindi/Telugu/ASCII, squash spaces."""
    raw = str(text)
    without_html = re.sub(r"<.*?>", " ", raw)
    # Anything outside Devanagari, Telugu or printable ASCII is noise here.
    kept = re.sub(r"[^\u0900-\u097F\u0C00-\u0C7F\u0020-\u007F]", " ", without_html)
    collapsed = re.sub(r"\s+", " ", kept)
    return collapsed.strip()
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
df["text"] = df["text"].apply(clean_text)  # normalise every row before tokenisation
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# =========================================================
|
| 90 |
+
# LABEL ENCODING
|
| 91 |
+
# =========================================================
|
| 92 |
+
|
| 93 |
+
label_encoder = LabelEncoder()

# Numeric target column; LabelEncoder assigns ids over lexicographically
# sorted class names.
df["label_id"] = label_encoder.fit_transform(df["label"])

# Human-readable {label → id} mapping, useful at inference/debug time.
label_map = dict(zip(
    label_encoder.classes_,
    label_encoder.transform(label_encoder.classes_)
))


# Persist label artifacts so inference can invert predictions.
with open(os.path.join(ARTIFACT_DIR, "label_encoder.pkl"), "wb") as f:
    pickle.dump(label_encoder, f)

with open(os.path.join(ARTIFACT_DIR, "label_map.pkl"), "wb") as f:
    pickle.dump(label_map, f)


NUM_LABELS = len(label_map)

print(f"✅ Number of classes: {NUM_LABELS}")
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
# =========================================================
|
| 117 |
+
# TRAIN / VAL / TEST SPLIT
|
| 118 |
+
# =========================================================
|
| 119 |
+
|
| 120 |
+
# 70 / 15 / 15 split; stratifying on label_id preserves class balance in
# every split. Fixed random_state makes the split reproducible.
train_df, temp_df = train_test_split(
    df,
    test_size=0.30,
    stratify=df["label_id"],
    random_state=42
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df["label_id"],
    random_state=42
)


# Persist the exact splits used for this run.
train_df.to_csv(os.path.join(ARTIFACT_DIR, "indic_train.csv"), index=False)
val_df.to_csv(os.path.join(ARTIFACT_DIR, "indic_val.csv"), index=False)
test_df.to_csv(os.path.join(ARTIFACT_DIR, "indic_test.csv"), index=False)
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
# =========================================================
|
| 142 |
+
# TOKENIZER
|
| 143 |
+
# =========================================================
|
| 144 |
+
|
| 145 |
+
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NOTE(review): pickling a HF tokenizer is fragile across transformers
# versions; tokenizer.save_pretrained(MODEL_DIR) at the end of this script
# is the durable copy — confirm any consumer still needs this .pkl.
with open(os.path.join(ARTIFACT_DIR, "indic_tokenizer.pkl"), "wb") as f:
    pickle.dump(tokenizer, f)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
# =========================================================
|
| 152 |
+
# DATASET CLASS
|
| 153 |
+
# =========================================================
|
| 154 |
+
|
| 155 |
+
class GrievanceDataset(Dataset):
    """Torch Dataset over pre-tokenised grievance texts.

    The whole split is tokenised eagerly in __init__ (padding=True pads every
    example to the longest sequence in the split, capped by truncation at
    MAX_LENGTH); __getitem__ then just wraps one row in tensors.
    Relies on the module-level `tokenizer` and MAX_LENGTH defined above.
    """

    def __init__(self, texts, labels):
        # texts/labels are pandas Series; list() detaches them from the frame.
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH
        )

        self.labels = list(labels)


    def __getitem__(self, idx):
        # One tensor per encoding field (input_ids, attention_mask, ...).
        item = {
            key: torch.tensor(val[idx])
            for key, val in self.encodings.items()
        }

        # HF Trainer expects the target under the "labels" key.
        item["labels"] = torch.tensor(self.labels[idx])

        return item


    def __len__(self):

        return len(self.labels)
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
# Materialise one (eagerly tokenised) dataset per split.
train_dataset = GrievanceDataset(
    train_df["text"],
    train_df["label_id"]
)

val_dataset = GrievanceDataset(
    val_df["text"],
    val_df["label_id"]
)

test_dataset = GrievanceDataset(
    test_df["text"],
    test_df["label_id"]
)
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
# =========================================================
|
| 204 |
+
# MODEL
|
| 205 |
+
# =========================================================
|
| 206 |
+
|
| 207 |
+
# Pretrained IndicBERT encoder with a freshly initialised classification
# head sized to our label set.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS
)
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
# =========================================================
|
| 214 |
+
# METRICS
|
| 215 |
+
# =========================================================
|
| 216 |
+
|
| 217 |
+
def compute_metrics(eval_pred):
    """Trainer hook: turn (logits, labels) into the metrics dict logged per eval."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    metrics = {
        "accuracy": accuracy_score(labels, predictions),
        "balanced_accuracy": balanced_accuracy_score(labels, predictions),
        "f1_weighted": f1_score(labels, predictions, average="weighted"),
        "mcc": matthews_corrcoef(labels, predictions),
    }
    return metrics
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
# =========================================================
|
| 237 |
+
# TRAINING
|
| 238 |
+
# =========================================================
|
| 239 |
+
|
| 240 |
+
training_args = TrainingArguments(
    output_dir=f"{ARTIFACT_DIR}/indic_results",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_steps=100,
    save_strategy="no",   # no intermediate checkpoints; final weights saved explicitly below
    report_to="none"      # disable W&B / tensorboard reporting
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)



print("\n🚀 Training IndicBERT Model...\n")

trainer.train()
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
# =========================================================
|
| 269 |
+
# FINAL TEST EVALUATION
|
| 270 |
+
# =========================================================
|
| 271 |
+
|
| 272 |
+
# Held-out evaluation: run the trained model over the test split once.
predictions = trainer.predict(test_dataset)

y_true = predictions.label_ids

y_pred = np.argmax(predictions.predictions, axis=1)


print("\n===== FINAL TEST METRICS =====")

print(f"Accuracy : {accuracy_score(y_true, y_pred):.4f}")

print(f"Balanced Accuracy : {balanced_accuracy_score(y_true, y_pred):.4f}")

print(f"Weighted F1 : {f1_score(y_true, y_pred, average='weighted'):.4f}")

print(f"MCC : {matthews_corrcoef(y_true, y_pred):.4f}")
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
# =========================================================
|
| 291 |
+
# SAVE MODEL
|
| 292 |
+
# =========================================================
|
| 293 |
+
|
| 294 |
+
# Persist the fine-tuned weights and tokenizer side by side so inference
# can load both from MODEL_DIR with local_files_only=True.
model.save_pretrained(MODEL_DIR)

tokenizer.save_pretrained(MODEL_DIR)


print("\n✅ INDICBERT TRAINING COMPLETED SUCCESSFULLY")
|
classification/indic_train.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
classification/train.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gfas/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =========================================================
|
| 2 |
+
# gfas/__init__.py
|
| 3 |
+
# Public surface of the GFAS package.
|
| 4 |
+
# main.py only needs to import `audit` from here.
|
| 5 |
+
# =========================================================
|
| 6 |
+
|
| 7 |
+
from .fairness_audit import audit
|
| 8 |
+
|
| 9 |
+
__all__ = ["audit"]
|
gfas/disparity_analysis.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =========================================================
|
| 2 |
+
# gfas/disparity_analysis.py
|
| 3 |
+
# Per-group metric computation and per-dimension disparity
|
| 4 |
+
# analysis (gaps, flags, breakdown table).
|
| 5 |
+
# =========================================================
|
| 6 |
+
import statistics
|
| 7 |
+
|
| 8 |
+
from .fairness_metrics import (
|
| 9 |
+
URGENCY_POSITIVE,
|
| 10 |
+
FAIRNESS_DIMENSIONS,
|
| 11 |
+
PARITY_FLAG_THRESHOLD,
|
| 12 |
+
PRIORITY_FLAG_THRESHOLD,
|
| 13 |
+
TPR_FLAG_THRESHOLD,
|
| 14 |
+
gap_to_score,
|
| 15 |
+
score_label,
|
| 16 |
+
severity,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# ── Internal helpers ──────────────────────────────────────
|
| 21 |
+
|
| 22 |
+
def _gap(values: list) -> float | None:
|
| 23 |
+
"""Max – min over a list, ignoring Nones. Returns None if fewer than 2 clean values."""
|
| 24 |
+
clean = [v for v in values if v is not None]
|
| 25 |
+
return round(max(clean) - min(clean), 4) if len(clean) >= 2 else None
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# ── Public API ────────────────────────────────────────────
|
| 29 |
+
|
| 30 |
+
def compute_group_metrics(items: list) -> dict:
    """
    Compute per-group fairness metrics for a single bucket of grievance records.

    Returned keys: count, resolution_rate, statistical_parity,
    equal_opportunity_tpr (None when the bucket has no truly-urgent cases),
    mean_priority_score.
    """
    total = len(items)

    # Share of records predicted high/critical within this group.
    predicted_positive = [r for r in items if r["predicted_urgency"] in URGENCY_POSITIVE]
    statistical_parity = round(len(predicted_positive) / total, 4)

    # True-positive rate restricted to records whose ground truth is high/critical.
    actually_urgent = [r for r in items if r["true_urgency"] in URGENCY_POSITIVE]
    if actually_urgent:
        caught = sum(1 for r in actually_urgent if r["predicted_urgency"] in URGENCY_POSITIVE)
        equal_opportunity_tpr = round(caught / len(actually_urgent), 4)
    else:
        equal_opportunity_tpr = None

    priority_scores = [r["priority_score"] for r in items]
    mean_priority_score = round(statistics.mean(priority_scores), 4)

    resolved = sum(1 for r in items if r.get("status", "") == "resolved")

    return {
        "count": total,
        "resolution_rate": round(resolved / total, 4),
        "statistical_parity": statistical_parity,
        "equal_opportunity_tpr": equal_opportunity_tpr,
        "mean_priority_score": mean_priority_score,
    }
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def analyse_dimension(dimension: str, group_metrics: dict) -> dict:
    """
    Given a dimension name and its {group → metrics} dict, compute:
      - gap values across groups (spread of each per-group metric)
      - fairness score, label, severity (score = worst sub-metric score)
      - flagged groups (parity more than 10 points below the dimension mean)
      - breakdown table (one row per group, sorted ascending)
      - fairness_flags list (one entry per threshold-exceeding gap)
    """
    # Collect each metric across groups; TPR may be None for groups with
    # no truly-urgent cases, so filter those out.
    parity_vals = [v["statistical_parity"] for v in group_metrics.values()]
    priority_vals = [v["mean_priority_score"] for v in group_metrics.values()]
    tpr_vals = [v["equal_opportunity_tpr"] for v in group_metrics.values()
                if v["equal_opportunity_tpr"] is not None]
    res_vals = [v["resolution_rate"] for v in group_metrics.values()]

    sp_gap = _gap(parity_vals)
    tpr_gap = _gap(tpr_vals)
    pri_gap = _gap(priority_vals)
    res_gap = _gap(res_vals)

    # Worst (minimum) of the three sub-scores drives the dimension score.
    # NOTE(review): gap_to_score never returns None, so the `is not None`
    # filter is vestigial — harmless, kept for safety.
    sub_scores = [s for s in [gap_to_score(sp_gap), gap_to_score(tpr_gap), gap_to_score(pri_gap)]
                  if s is not None]
    fairness_score = min(sub_scores) if sub_scores else 100

    avg_parity = round(statistics.mean(parity_vals), 4) if parity_vals else 0
    avg_resolution = round(statistics.mean(res_vals), 4) if res_vals else 0

    # Groups lagging the dimension's mean urgency rate by >10 points.
    flagged_groups = [
        g for g, m in group_metrics.items()
        if m["statistical_parity"] < avg_parity - 0.10
    ]

    # NOTE(review): "resolutionRate" below is populated from
    # statistical_parity (urgency rate), not resolution_rate — presumably a
    # frontend naming artifact; confirm against the dashboard consumer.
    breakdown = sorted(
        [
            {
                dimension: group,
                "resolutionRate": round(m["statistical_parity"] * 100, 2),
                "total": m["count"],
                "statisticalParity": m["statistical_parity"],
                "tpr": m["equal_opportunity_tpr"],
                "meanPriorityScore": m["mean_priority_score"],
                "isFlagged": group in flagged_groups,
            }
            for group, m in group_metrics.items()
        ],
        key=lambda x: x["resolutionRate"],
    )

    # One flag per metric whose cross-group gap exceeds its threshold.
    fairness_flags = []
    if sp_gap is not None and sp_gap > PARITY_FLAG_THRESHOLD:
        fairness_flags.append({
            "metric": "statistical_parity",
            "gap": sp_gap,
            "label": f"Urgency-rate gap of {sp_gap * 100:.1f}% across {dimension} groups",
            "interpretation": "Some groups are significantly more (or less) likely to have their grievances classified as high/critical urgency.",
        })
    if pri_gap is not None and pri_gap > PRIORITY_FLAG_THRESHOLD:
        fairness_flags.append({
            "metric": "mean_priority_score",
            "gap": pri_gap,
            "label": f"Priority-score gap of {pri_gap:.3f} across {dimension} groups",
            "interpretation": "Some groups receive systematically higher or lower priority scores, affecting response speed.",
        })
    if tpr_gap is not None and tpr_gap > TPR_FLAG_THRESHOLD:
        fairness_flags.append({
            "metric": "equal_opportunity_tpr",
            "gap": tpr_gap,
            "label": f"Detection-rate gap of {tpr_gap * 100:.1f}% for truly urgent cases across {dimension} groups",
            "interpretation": "The model misses urgent cases at different rates across groups.",
        })

    return {
        "fairnessScore": fairness_score,
        "fairnessLabel": score_label(fairness_score),
        "severity": severity(fairness_score),
        "groups_found": sorted(group_metrics.keys()),
        "average": round(avg_parity * 100, 2),
        "average_resolution": round(avg_resolution * 100, 2),
        "breakdown": breakdown,
        "flagged": flagged_groups,
        "group_metrics": group_metrics,
        "disparity_summary": {
            "statistical_parity_gap": sp_gap,
            "equal_opportunity_tpr_gap": tpr_gap,
            "mean_priority_score_gap": pri_gap,
            "resolution_rate_gap": res_gap,
            "statistical_parity_gap_label": f"{round(sp_gap * 100, 1)}% urgency-rate spread" if sp_gap is not None else None,
            "equal_opportunity_tpr_gap_label": f"{round(tpr_gap * 100, 1)}% detection-rate gap" if tpr_gap is not None else None,
            "mean_priority_score_gap_label": f"{round(pri_gap, 3)} priority-score spread" if pri_gap is not None else None,
            "resolution_rate_gap_label": f"{round(res_gap * 100, 1)}% resolution-rate gap" if res_gap is not None else None,
        },
        "fairness_flags": fairness_flags,
        "flags_raised": len(fairness_flags),
    }
|
gfas/fairness_audit.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =========================================================
|
| 2 |
+
# gfas/fairness_audit.py
|
| 3 |
+
# Input validation and the single callable that main.py
|
| 4 |
+
# imports to power the POST /fairness-audit route.
|
| 5 |
+
# =========================================================
|
| 6 |
+
|
| 7 |
+
from .fairness_metrics import (
|
| 8 |
+
VALID_AREAS,
|
| 9 |
+
VALID_CATEGORIES,
|
| 10 |
+
VALID_LANGUAGES,
|
| 11 |
+
VALID_URGENCY,
|
| 12 |
+
)
|
| 13 |
+
from .gfas_engine import run_fairness_audit
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# ── Record-level validation ───────────────────────────────
|
| 17 |
+
|
| 18 |
+
def _validate_record(idx: int, r) -> tuple[dict | None, dict | None]:
    """
    Normalise and validate a single raw grievance dict.

    Returns (record, None) on success, (None, skip_entry) on failure.
    Unknown area/category are hard rejections; unknown language/urgency
    values are soft-corrected to defaults instead.
    """
    if not isinstance(r, dict):
        return None, {"index": idx, "error": "Not a JSON object"}

    try:
        # Lower-case / trim every enum-like field so lookups are uniform.
        area = str(r.get("area", "")).strip().lower()
        category = str(r.get("category", "")).strip().lower()
        language = str(r.get("language", "english")).strip().lower()
        pred = str(r.get("predicted_urgency", "medium")).strip().lower()
        # NOTE(review): true_urgency defaults to the prediction, which makes
        # TPR trivially perfect for records lacking ground truth — confirm
        # this is intended for the equal-opportunity metric.
        true_urg = str(r.get("true_urgency", pred)).strip().lower()
        score = float(r.get("priority_score", 0))
        status = str(r.get("status", "pending")).strip().lower()
    except Exception as e:
        # e.g. priority_score not coercible to float
        return None, {"index": idx, "error": f"Field parse error: {e}"}

    if area not in VALID_AREAS:
        return None, {"index": idx, "error": f"area not in VALID_AREAS: '{area}'"}
    if category not in VALID_CATEGORIES:
        return None, {"index": idx, "error": f"category not in VALID_CATEGORIES: '{category}'"}

    # Soft-correct out-of-vocabulary enum values
    if language not in VALID_LANGUAGES:
        language = "english"
    if pred not in VALID_URGENCY:
        pred = "medium"
    if true_urg not in VALID_URGENCY:
        true_urg = pred

    return {
        "area": area,
        "category": category,
        "language": language,
        "predicted_urgency": pred,
        "true_urgency": true_urg,
        "priority_score": score,
        "status": status,
    }, None
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# ── Public callable used by the route ────────────────────
|
| 63 |
+
|
| 64 |
+
def audit(raw_grievances: list) -> tuple[dict | None, dict | None, int]:
    """
    Validate *raw_grievances* and run the full GFAS pipeline.

    Returns:
        (result_dict, None, 200)  — success
        (None, error_dict, 4xx)   — validation failure
    """
    if not isinstance(raw_grievances, list) or not raw_grievances:
        return None, {
            "status": "failed",
            "message": "'grievances' must be a non-empty list.",
        }, 422

    # Partition the payload into usable records and per-index skip reasons.
    validated, skipped = [], []
    for idx, r in enumerate(raw_grievances):
        record, err = _validate_record(idx, r)
        if record:
            validated.append(record)
        else:
            skipped.append(err)

    # Fairness metrics are cross-group comparisons; a single record has
    # nothing to compare against.
    if len(validated) < 2:
        return None, {
            "status": "failed",
            "message": (
                f"Only {len(validated)} valid record(s) after validation "
                f"({len(skipped)} skipped). Need at least 2 records across different "
                f"groups to compute fairness metrics."
            ),
            "skipped": skipped[:10],   # cap detail to keep the response small
            "skipped_count": len(skipped),
            "received_count": len(raw_grievances),
        }, 422

    audit_result = run_fairness_audit(validated)

    result = {
        "status": "success",
        "fairness_audit": audit_result,
        "meta": {
            "received": len(raw_grievances),
            "valid": len(validated),
            "skipped": len(skipped),
            "skipped_details": skipped[:5] if skipped else [],
        },
    }
    return result, None, 200
|
gfas/fairness_metrics.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =========================================================
|
| 2 |
+
# gfas/fairness_metrics.py
|
| 3 |
+
# Constants and primitive scoring helpers used across GFAS.
|
| 4 |
+
# =========================================================
|
| 5 |
+
|
| 6 |
+
# ── Urgency label sets ────────────────────────────────────
|
| 7 |
+
VALID_URGENCY = {"low", "medium", "high", "critical"}
|
| 8 |
+
URGENCY_POSITIVE = {"high", "critical"}
|
| 9 |
+
|
| 10 |
+
# ── Domain allow-lists ────────────────────────────────────
|
| 11 |
+
VALID_CATEGORIES = {
|
| 12 |
+
"electricity", "garbage", "pollution", "public transport",
|
| 13 |
+
"roads", "sanitation", "stray animals", "water"
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
VALID_LANGUAGES = {"telugu", "english", "hindi"}
|
| 17 |
+
|
| 18 |
+
VALID_AREAS = {
|
| 19 |
+
# Zone 1
|
| 20 |
+
"suryaraopeta", "jagannaickpur", "raja rao peta", "bhanugudi",
|
| 21 |
+
"old town", "rajah street", "main road",
|
| 22 |
+
# Zone 2
|
| 23 |
+
"gandhi nagar", "ashok nagar", "nethaji nagar",
|
| 24 |
+
"srinivasa nagar", "tngo colony", "shankar vilas",
|
| 25 |
+
"collector's colony",
|
| 26 |
+
# Zone 3
|
| 27 |
+
"new town", "bank colony", "drivers colony",
|
| 28 |
+
"fci colony", "burma colony", "dwaraka nagar",
|
| 29 |
+
"ayodhya nagar",
|
| 30 |
+
# Zone 4
|
| 31 |
+
"kakinada port area", "kakinada industrial area",
|
| 32 |
+
"fishing harbour", "dairy farm", "auto nagar",
|
| 33 |
+
"kaleswara rao nagar",
|
| 34 |
+
# Zone 5
|
| 35 |
+
"ramanayyapeta", "rama rao peta", "kondayya palem",
|
| 36 |
+
"ganganapalle", "gudari gunta", "indrapalem",
|
| 37 |
+
# Zone 6
|
| 38 |
+
"sarpavaram", "uppada", "kaikavolu",
|
| 39 |
+
"kothuru", "thammavaram", "thimmapuram",
|
| 40 |
+
# Zone 7
|
| 41 |
+
"vivekananda street", "jr ntr road",
|
| 42 |
+
"jntu kakinada area", "govt general hospital area",
|
| 43 |
+
"apsp camp",
|
| 44 |
+
# Other
|
| 45 |
+
"kakinada beach road", "kakinada bazar",
|
| 46 |
+
"anjaneya nagar",
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
# ── Audit dimensions ──────────────────────────────────────
|
| 50 |
+
FAIRNESS_DIMENSIONS = ["area", "category", "language"]
|
| 51 |
+
|
| 52 |
+
# ── Flag thresholds ───────────────────────────────────────
|
| 53 |
+
PARITY_FLAG_THRESHOLD = 0.20
|
| 54 |
+
PRIORITY_FLAG_THRESHOLD = 0.20
|
| 55 |
+
TPR_FLAG_THRESHOLD = 0.20
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# ── Primitive scorers ─────────────────────────────────────
|
| 59 |
+
|
| 60 |
+
def gap_to_score(gap) -> int:
    """Map a disparity gap in [0, 1] to a 0–100 fairness score (higher = fairer).

    A missing gap (None) is treated as perfectly fair. The score falls by
    2 points per percentage point of gap, clamped to [0, 100], so any gap
    of 0.5 or more scores 0.
    """
    if gap is None:
        return 100
    raw = round(100 - float(gap) * 200)
    if raw < 0:
        return 0
    if raw > 100:
        return 100
    return raw
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def score_label(score: int) -> str:
    """Human-readable verdict for a 0–100 fairness score."""
    if score >= 80:
        return "Fair"
    return "Moderate" if score >= 60 else "Biased"
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def severity(score: int) -> str:
    """Severity bucket for a fairness score: ok / warning / critical."""
    for floor, level in ((80, "ok"), (60, "warning")):
        if score >= floor:
            return level
    return "critical"
|
gfas/gfas_engine.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =========================================================
|
| 2 |
+
# gfas/gfas_engine.py
|
| 3 |
+
# Top-level orchestrator: feeds validated records through
|
| 4 |
+
# disparity analysis and report generation to produce the
|
| 5 |
+
# final fairness audit payload.
|
| 6 |
+
# =========================================================
|
| 7 |
+
import statistics
|
| 8 |
+
from collections import defaultdict
|
| 9 |
+
|
| 10 |
+
from .fairness_metrics import FAIRNESS_DIMENSIONS, score_label, severity
|
| 11 |
+
from .disparity_analysis import compute_group_metrics, analyse_dimension
|
| 12 |
+
from .report_generator import build_alerts, build_recommendations
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def run_fairness_audit(validated_grievances: list) -> dict:
    """
    Main entry point for GFAS.

    Args:
        validated_grievances: List of dicts already normalised by fairness_audit.py.
            Required keys: area, category, language, predicted_urgency,
            true_urgency, priority_score, status.

    Returns:
        Full fairness audit payload ready for JSON serialisation.

    NOTE(review): an empty input list would make ``statistics.mean`` raise
    StatisticsError below — assumes the caller guarantees at least one
    record; confirm against fairness_audit.py.
    """
    dimension_results: dict = {}

    for dimension in FAIRNESS_DIMENSIONS:
        # ── Bucket records by group value ──────────────────────────────────────
        # e.g. for "language": {"hindi": [...], "english": [...]}
        buckets: dict = defaultdict(list)
        for r in validated_grievances:
            buckets[r[dimension]].append(r)

        # ── Per-group metrics ──────────────────────────────────────────────────
        group_metrics = {group: compute_group_metrics(items) for group, items in buckets.items()}

        # ── Dimension-level disparity analysis ─────────────────────────────────
        dimension_results[dimension] = analyse_dimension(dimension, group_metrics)

    # ── Overall score (mean of dimension scores) ───────────────────────────────
    dim_scores = [dimension_results[d]["fairnessScore"] for d in FAIRNESS_DIMENSIONS]
    overall_score = round(statistics.mean(dim_scores), 2)

    # ── Alerts + recommendations ───────────────────────────────────────────────
    alerts = build_alerts(dimension_results)
    recommendations = build_recommendations(dimension_results)

    # ── Summary block ──────────────────────────────────────────────────────────
    # Disparity index = worst statistical-parity gap across all dimensions
    # (None when no dimension produced a gap).
    sp_gaps = [
        dimension_results[d]["disparity_summary"]["statistical_parity_gap"]
        for d in FAIRNESS_DIMENSIONS
    ]
    disparity_index = (
        round(max(v for v in sp_gaps if v is not None), 4)
        if any(v is not None for v in sp_gaps)
        else None
    )

    return {
        "overallFairnessScore": overall_score,
        "fairnessLabel": score_label(overall_score),
        "severity": severity(overall_score),
        "area": dimension_results["area"],
        "category": dimension_results["category"],
        "language": dimension_results["language"],
        "summary": {
            "totalGrievances": len(validated_grievances),
            # mean of bools works because True == 1 in Python.
            "avgResolutionRate": round(
                statistics.mean([r.get("status", "") == "resolved" for r in validated_grievances]) * 100, 2
            ),
            "disparityIndex": disparity_index,
            "dimensionsAudited": FAIRNESS_DIMENSIONS,
            "flagsRaised": sum(dimension_results[d]["flags_raised"] for d in FAIRNESS_DIMENSIONS),
        },
        "alerts": alerts,
        "recommendations": recommendations,
        # Duplicated snake_case keys kept for backward compatibility with
        # existing consumers of this payload.
        "dimensions_audited": FAIRNESS_DIMENSIONS,
        "total_grievances": len(validated_grievances),
        "results": dimension_results,
    }
|
gfas/report_generator.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =========================================================
|
| 2 |
+
# gfas/report_generator.py
|
| 3 |
+
# Builds human-readable alerts and actionable recommendations
|
| 4 |
+
# from per-dimension disparity analysis results.
|
| 5 |
+
# =========================================================
|
| 6 |
+
|
| 7 |
+
from .fairness_metrics import FAIRNESS_DIMENSIONS
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# ── Copy templates ────────────────────────────────────────
|
| 11 |
+
|
| 12 |
+
# Per-dimension recommendation headlines used by build_recommendations.
_TITLE_MAP = {
    "area": "Improve urgency detection in under-served areas",
    "category": "Address priority-score gap across grievance categories",
    "language": "Ensure equitable urgency classification by submission language",
}
|
| 17 |
+
|
| 18 |
+
def _desc(dimension: str, dr: dict) -> str:
|
| 19 |
+
flagged_str = ", ".join(dr["flagged"][:2]) or "N/A"
|
| 20 |
+
if dimension == "area":
|
| 21 |
+
gap_label = dr["disparity_summary"]["statistical_parity_gap_label"] or "unknown"
|
| 22 |
+
return (
|
| 23 |
+
f"Areas {flagged_str} show urgency-rate gaps of {gap_label}. "
|
| 24 |
+
"Assign dedicated officers and increase patrol frequency in these localities."
|
| 25 |
+
)
|
| 26 |
+
if dimension == "category":
|
| 27 |
+
return (
|
| 28 |
+
f"Categories with low parity scores indicate the model under-prioritises certain complaint types. "
|
| 29 |
+
f"Retrain or re-weight the urgency classifier for {flagged_str}."
|
| 30 |
+
)
|
| 31 |
+
# language
|
| 32 |
+
return (
|
| 33 |
+
f"Grievances in {flagged_str} receive lower urgency scores. "
|
| 34 |
+
"Deploy multilingual reviewers or a translation-aware pre-processing step before classification."
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ── Public API ────────────────────────────────────────────
|
| 39 |
+
|
| 40 |
+
def build_alerts(dimension_results: dict) -> list[dict]:
    """Build one alert dict per dimension whose fairness score is below 80.

    Scores below 60 use the stronger "Significant ... Immediate review"
    wording; 60–79 use "Moderate ... Monitor" wording.
    """
    alerts: list = []
    for dim in FAIRNESS_DIMENSIONS:
        result = dimension_results[dim]
        score = result["fairnessScore"]
        if score >= 80:
            # Dimension is fair enough — no alert for it.
            continue

        # Name up to three affected groups, summarising any remainder.
        affected = ""
        flagged = result["flagged"]
        if flagged:
            shown = flagged[:3]
            hidden = len(flagged) - 3
            affected = f" Affected {dim}s: {', '.join(shown)}"
            if hidden > 0:
                affected += f" +{hidden} more"
            affected += "."

        detail = "; ".join(flag["label"] for flag in result["fairness_flags"])
        is_critical = score < 60
        prefix = "Significant" if is_critical else "Moderate"
        call_to_action = "Immediate review recommended." if is_critical else "Monitor resolution trends."

        alerts.append({
            "severity": result["severity"],
            "message": f"{prefix} {dim} fairness disparity ({detail}).{affected} {call_to_action}",
            "dimension": dim,
        })

    return alerts
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def build_recommendations(dimension_results: dict) -> list[dict]:
    """Return actionable recommendations for dimensions scoring below 80.

    Recommendations are ordered worst score first; scores below 60 get
    "high" priority, the rest "medium".
    """
    below_threshold = [
        d for d in FAIRNESS_DIMENSIONS if dimension_results[d]["fairnessScore"] < 80
    ]
    below_threshold.sort(key=lambda d: dimension_results[d]["fairnessScore"])

    recommendations: list = []
    for dim in below_threshold:
        result = dimension_results[dim]
        # "" (no flagged groups) collapses to None for the JSON payload.
        top_groups = ", ".join(result["flagged"][:2]) or None
        recommendations.append({
            "priority": "high" if result["fairnessScore"] < 60 else "medium",
            "title": _TITLE_MAP[dim],
            "description": _desc(dim, result),
            "dimension": dim,
            "affectedArea": top_groups,
        })

    return recommendations
|
main.py
ADDED
|
@@ -0,0 +1,707 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =========================================================
|
| 2 |
+
# FLASK API — MULTILINGUAL GRIEVANCE + XPE + GFAS
|
| 3 |
+
# INTEGRATED GRADIENTS ONLY (PRODUCTION VERSION)
|
| 4 |
+
# Hugging Face Spaces — Production Deployment
|
| 5 |
+
# Multimodal: text / audio / image(evidence) support
|
| 6 |
+
# =========================================================
|
| 7 |
+
from flask import Flask, request, jsonify
|
| 8 |
+
import re
|
| 9 |
+
import io
|
| 10 |
+
import traceback
|
| 11 |
+
import logging
|
| 12 |
+
import math
|
| 13 |
+
import os
|
| 14 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 15 |
+
from datetime import datetime, timezone
|
| 16 |
+
|
| 17 |
+
# ── Silence noisy loggers ────────────────────────────────
|
| 18 |
+
logging.getLogger("prophet").setLevel(logging.ERROR)
|
| 19 |
+
logging.getLogger("cmdstanpy").setLevel(logging.ERROR)
|
| 20 |
+
logging.basicConfig(level=logging.INFO)
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
from prophet import Prophet
|
| 24 |
+
import pandas as pd
|
| 25 |
+
|
| 26 |
+
# ── EXIF extraction ──────────────────────────────────────
|
| 27 |
+
from PIL import Image
|
| 28 |
+
import piexif
|
| 29 |
+
|
| 30 |
+
# =========================================================
|
| 31 |
+
# CATEGORY PREDICTION
|
| 32 |
+
# =========================================================
|
| 33 |
+
from classification.bert_classify import (
|
| 34 |
+
predict as predict_category_en,
|
| 35 |
+
get_model_and_tokenizer as get_cat_en,
|
| 36 |
+
)
|
| 37 |
+
from classification.indic_bert_classify import (
|
| 38 |
+
predict as predict_category_indic,
|
| 39 |
+
get_model_and_tokenizer as get_cat_indic,
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
# =========================================================
|
| 43 |
+
# URGENCY PREDICTION
|
| 44 |
+
# =========================================================
|
| 45 |
+
from sentiment_analysis.bert_predict import (
|
| 46 |
+
predict_urgency as predict_urgency_en,
|
| 47 |
+
get_model_and_tokenizer as get_urg_en,
|
| 48 |
+
)
|
| 49 |
+
from sentiment_analysis.indic_bert_predict import (
|
| 50 |
+
predict as predict_urgency_indic,
|
| 51 |
+
get_model_and_tokenizer as get_urg_indic,
|
| 52 |
+
)
|
| 53 |
+
|
| 54 |
+
# =========================================================
|
| 55 |
+
# MULTIMODAL
|
| 56 |
+
# =========================================================
|
| 57 |
+
from multi_modal.audio_to_text import transcribe_audio
|
| 58 |
+
from multi_modal.image_to_text import extract_text_from_image
|
| 59 |
+
|
| 60 |
+
# =========================================================
|
| 61 |
+
# XPE MODULES
|
| 62 |
+
# =========================================================
|
| 63 |
+
from xpe.priority_engine import compute_priority_score
|
| 64 |
+
from xpe.integrated_gradients_explainer import IntegratedGradientsExplainer
|
| 65 |
+
from xpe.hybrid_explainer import generate_final_reason
|
| 66 |
+
|
| 67 |
+
# =========================================================
|
| 68 |
+
# GFAS — Grievance Fairness Audit System
|
| 69 |
+
# =========================================================
|
| 70 |
+
from gfas import audit as gfas_audit
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
# =========================================================
|
| 74 |
+
# COMPILED REGEX — MULTILINGUAL (EN + HI + TE)
|
| 75 |
+
# =========================================================
|
| 76 |
+
_RE_HINDI = re.compile(r'[\u0900-\u097F]')
|
| 77 |
+
_RE_TELUGU = re.compile(r'[\u0C00-\u0C7F]')
|
| 78 |
+
|
| 79 |
+
_SMALL_TALK_PATTERNS = re.compile(
|
| 80 |
+
r"""
|
| 81 |
+
^(hi|hello|hey|dear|sir|madam)\b
|
| 82 |
+
| good\s+(morning|evening|afternoon|night)
|
| 83 |
+
| how\s+(are\s+you|is\s+it\s+going)
|
| 84 |
+
| what'?s\s+up
|
| 85 |
+
| hope\s+you\s+are\s+doing\s+well
|
| 86 |
+
| \b(thank(s|\s+you)|okay|ok|great|nice|good\s+job)\b
|
| 87 |
+
| \b(namaste|namaskar|dhanyavaad|shukriya|theek\s+hai|accha|acha|haan|helo)\b
|
| 88 |
+
| \b(namaskaram|dhanyavadalu|bayapadu|ela\s+unnaru|mee\s+seva)\b
|
| 89 |
+
""",
|
| 90 |
+
re.VERBOSE | re.IGNORECASE,
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
_GRIEVANCE_PATTERNS = re.compile(
|
| 94 |
+
r"""
|
| 95 |
+
\b(problem|issue|complain(t)?|grievance|concern
|
| 96 |
+
|inconvenience|harassment|injustice|negligence|misconduct)\b
|
| 97 |
+
| \b(not\s+working|stopped\s+working|not\s+responding|no\s+response
|
| 98 |
+
|no\s+action|fail(ed|ure)?|malfunction(ing)?|defective
|
| 99 |
+
|service\s+down|interrupted|disconnected|outage
|
| 100 |
+
|not\s+restored|not\s+repaired|not\s+fixed|not\s+resolved
|
| 101 |
+
|not\s+completed?|not\s+done|not\s+processed?
|
| 102 |
+
|not\s+functioning|non[-\s]functional)\b
|
| 103 |
+
| \b(delay(ed)?|pending|not\s+received|not\s+delivered
|
| 104 |
+
|still\s+waiting|no\s+update|no\s+resolution
|
| 105 |
+
|no\s+acknowledgm[e]?nt|not\s+credited|not\s+sanctioned
|
| 106 |
+
|not\s+approved|not\s+collected|not\s+cleared
|
| 107 |
+
|overdue|lapsed|under\s+process|under\s+review|awaiting)\b
|
| 108 |
+
| \b(refund|charg(ed|ing)|overcharged|overbilled
|
| 109 |
+
|extra\s+charge|double\s+charg(ed|e)
|
| 110 |
+
|charged\s+twice|billed\s+twice|debited\s+twice
|
| 111 |
+
|wrong\s+bill|wrong\s+amount|incorrect\s+(amount|bill)
|
| 112 |
+
|excess\s+(charge|amount|fee)
|
| 113 |
+
|payment\s+fail(ed|ure)?|transaction\s+fail(ed|ure)?
|
| 114 |
+
|unauthorized\s+transaction|debited|deducted
|
| 115 |
+
|not\s+refunded|duplicate\s+(charge|bill|payment)
|
| 116 |
+
|invoice)\b
|
| 117 |
+
| \bbill\b
|
| 118 |
+
| \b(pothole|waterlogging|no\s+water|water\s+supply
|
| 119 |
+
|power\s+(cut|outage|failure)|electricity\s+(cut|failure|issue)
|
| 120 |
+
|sewage|drainage|garbage|waste\s+collection
|
| 121 |
+
|road\s+(damage|broken|condition|repair)
|
| 122 |
+
|streetlight|footpath
|
| 123 |
+
|no\s+(electricity|water|gas|internet|signal|network)
|
| 124 |
+
|supply\s+(not|stopped|disrupted))\b
|
| 125 |
+
| \b(certificate|ration\s+card|pension|scholarship|subsidy
|
| 126 |
+
|license|passport
|
| 127 |
+
|application\s+(rejected|pending|delayed|not\s+processed)
|
| 128 |
+
|not\s+issued|not\s+granted|denied|rejected|withheld)\b
|
| 129 |
+
| \b(rude|misbehav(ed|iour)|bribe|corruption
|
| 130 |
+
|demanding\s+(money|bribe)|not\s+attending|irresponsible)\b
|
| 131 |
+
| \b(wrong|missing|damaged?|broken|poor\s+service|substandard
|
| 132 |
+
|bad\s+service|very\s+bad|worst\s+service|defect(ive)?)\b
|
| 133 |
+
| \b(unsatisfied|unhappy|disappointed|frustrated|harassed|ignored
|
| 134 |
+
|cheated|deceived|exploited|victimized)\b
|
| 135 |
+
| \b(cancel(l?(ed|ation))?|legal\s+action|escalate[d]?
|
| 136 |
+
|complaint\s+against|take\s+action|file\s+(a\s+)?complaint
|
| 137 |
+
|report\s+(this|the)|seeking\s+(help|redressal|justice)
|
| 138 |
+
|urgent(ly)?|immediately)\b
|
| 139 |
+
| \b(fraud|scam|error|mistake|violation|irregularity|malpractice)\b
|
| 140 |
+
| (समस्या|शिकायत|परेशानी|दिक्कत|नहीं\s*मिला|नहीं\s*आया
|
| 141 |
+
|वापसी|रिफंड|विलंब|देरी|धोखा|गलत|टूटा|खराब
|
| 142 |
+
|बंद\s*हो\s*गया|काम\s*नहीं|जवाब\s*नहीं|कार्रवाई\s*नहीं
|
| 143 |
+
|नाराज|परेशान|निराश|कानूनी\s*कार्रवाई|भुगतान\s*विफल
|
| 144 |
+
|दो\s*बार\s*काटा|दो\s*बार|काटा|बिजली|पानी|सड़क
|
| 145 |
+
|भ्रष्टाचार|रिश्वत|जमा\s*नहीं|जारी\s*नहीं
|
| 146 |
+
|अनधिकृत|अतिरिक्त\s*शुल्क|बिल)
|
| 147 |
+
| \b(samasya|shikayat|pareshani|dikkat|nahi\s+mila|vapasi
|
| 148 |
+
|vilamba|deri|dhokha|galat|tuta|kharab|kaam\s+nahi
|
| 149 |
+
|jawab\s+nahi|naraaz|nirash|kanuni|bhrashtachar
|
| 150 |
+
|do\s+baar|bijli|paani|sadak|jamaa\s+nahi)\b
|
| 151 |
+
| (సమస్య|ఫిర్యాదు|ఇబ్బంది|రాలేదు|పని\s*చేయడం\s*లేదు
|
| 152 |
+
|తిరిగి\s*చెల్లింపు|ఆలస్యం|మోసం|తప్పు|పాడైంది
|
| 153 |
+
|సేవ\s*లేదు|జవాబు\s*లేదు|చర్య\s*లేదు
|
| 154 |
+
|చెల్లింపు\s*విఫలమైంది|నిరాశ|వేధింపు|రద్దు
|
| 155 |
+
|బిల్లు|విద్యుత్|నీరు|రోడ్డు|రెండుసార్లు|వసూలు
|
| 156 |
+
|జమకట్టలేదు|లంచం|అవినీతి)
|
| 157 |
+
| \b(firyadu|ibbandi|raaledu|pani\s+cheyyatledu
|
| 158 |
+
|tirigichellinpu|aalasyam|mosam|tappu|paadaindi
|
| 159 |
+
|seva\s+ledu|nirasha|vedhimpu|raddu|rendu\s+sarlu
|
| 160 |
+
|vasulu|lantham|avineeti)\b
|
| 161 |
+
""",
|
| 162 |
+
re.VERBOSE | re.IGNORECASE,
|
| 163 |
+
)
|
| 164 |
+
|
| 165 |
+
_RE_JUNK = re.compile(r'^[\d\W_]+$')
|
| 166 |
+
MIN_TEXT_LENGTH = 8
|
| 167 |
+
|
| 168 |
+
UTC = timezone.utc
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
# =========================================================
|
| 172 |
+
# KAKINADA GEO HELPERS
|
| 173 |
+
# =========================================================
|
| 174 |
+
|
| 175 |
+
def _dms_to_decimal(dms, ref: str) -> float:
|
| 176 |
+
degrees = dms[0][0] / dms[0][1]
|
| 177 |
+
minutes = dms[1][0] / dms[1][1]
|
| 178 |
+
seconds = dms[2][0] / dms[2][1]
|
| 179 |
+
decimal = degrees + minutes / 60 + seconds / 3600
|
| 180 |
+
if ref in ("S", "W"):
|
| 181 |
+
decimal = -decimal
|
| 182 |
+
return decimal
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def extract_gps_from_image(image_bytes: bytes) -> tuple | None:
    """Extract (latitude, longitude) in decimal degrees from an image's EXIF GPS tags.

    Returns None when the image has no EXIF block, no GPS IFD, an incomplete
    set of GPS tags, or cannot be parsed at all — callers treat any failure
    as "no location available".
    """
    try:
        img = Image.open(io.BytesIO(image_bytes))
        exif_bytes = img.info.get("exif")
        if not exif_bytes:
            return None
        exif_data = piexif.load(exif_bytes)
        gps_data = exif_data.get("GPS", {})
        if not gps_data:
            return None
        lat_dms = gps_data.get(piexif.GPSIFD.GPSLatitude)
        lat_ref = gps_data.get(piexif.GPSIFD.GPSLatitudeRef)
        lon_dms = gps_data.get(piexif.GPSIFD.GPSLongitude)
        lon_ref = gps_data.get(piexif.GPSIFD.GPSLongitudeRef)
        # All four tags are required to compute a coordinate pair.
        if not (lat_dms and lat_ref and lon_dms and lon_ref):
            return None
        # piexif may return the hemisphere refs as bytes (e.g. b"N") or str.
        lat = _dms_to_decimal(lat_dms, lat_ref.decode() if isinstance(lat_ref, bytes) else lat_ref)
        lon = _dms_to_decimal(lon_dms, lon_ref.decode() if isinstance(lon_ref, bytes) else lon_ref)
        return lat, lon
    except Exception:
        # Deliberate best-effort: any malformed image/EXIF yields "no GPS".
        return None
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def is_kakinada(lat: float, lon: float) -> bool:
    """Return True when (lat, lon) lies inside the approximate Kakinada bounding box.

    Inputs may be numbers or numeric strings; anything non-numeric (including
    None) returns False.
    """
    try:
        latitude, longitude = float(lat), float(lon)
    except (TypeError, ValueError):
        return False
    # Bounding box (inclusive): lat 16.85–17.10, lon 82.15–82.35.
    within_lat = 16.85 <= latitude <= 17.10
    within_lon = 82.15 <= longitude <= 82.35
    return within_lat and within_lon
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def check_image_location(image_bytes: bytes) -> str:
    """Classify an uploaded image's GPS metadata.

    Returns one of: "no_gps" (no usable EXIF coordinates), "valid"
    (inside Kakinada jurisdiction), or "invalid" (outside it).
    """
    coords = extract_gps_from_image(image_bytes)
    if coords is None:
        return "no_gps"
    latitude, longitude = coords
    if is_kakinada(latitude, longitude):
        return "valid"
    return "invalid"
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
# =========================================================
|
| 226 |
+
# LANGUAGE DETECTION
|
| 227 |
+
# =========================================================
|
| 228 |
+
def detect_language(text: str) -> str:
    """Detect language by Unicode script: Devanagari → "hindi",
    Telugu script → "telugu", anything else → "english".

    Hindi wins when both scripts are present (checked first).
    """
    if _RE_HINDI.search(text):
        return "hindi"
    return "telugu" if _RE_TELUGU.search(text) else "english"
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
# =========================================================
|
| 237 |
+
# INPUT VALIDATION
|
| 238 |
+
# =========================================================
|
| 239 |
+
_VALIDATION_MESSAGES = {
|
| 240 |
+
"too_short": "Text is too short. Please provide at least 8 characters describing your issue.",
|
| 241 |
+
"junk_input": "Input contains only numbers or special characters. Please describe your grievance in words.",
|
| 242 |
+
"small_talk": "This looks like a greeting or small talk. Please describe the issue you are facing.",
|
| 243 |
+
"no_grievance": (
|
| 244 |
+
"No grievance signal detected. Please describe your problem clearly — "
|
| 245 |
+
"e.g. 'My electricity bill was charged twice' or 'Water supply disrupted for 3 days'."
|
| 246 |
+
),
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
def validate_text(text) -> tuple:
    """Validate raw grievance text before classification.

    Returns:
        (True, None) when the text is usable, otherwise (False, reason_code)
        where reason_code is a key of _VALIDATION_MESSAGES
        ("too_short" or "junk_input").
    """
    if not isinstance(text, str):
        return False, "too_short"
    stripped = text.strip()
    # Use the shared MIN_TEXT_LENGTH constant (8) so this check agrees with
    # the user-facing "at least 8 characters" message; the previous
    # hard-coded `< 5` contradicted both.
    if len(stripped) < MIN_TEXT_LENGTH:
        return False, "too_short"
    if _RE_JUNK.fullmatch(stripped.lower()):
        return False, "junk_input"
    return True, None
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
# =========================================================
|
| 262 |
+
# INITIALIZE APP
|
| 263 |
+
# =========================================================
|
| 264 |
+
app = Flask(__name__)
|
| 265 |
+
|
| 266 |
+
# ── Hugging Face Spaces: disable debug, allow large uploads ──────────────────
|
| 267 |
+
app.config["MAX_CONTENT_LENGTH"] = int(os.environ.get("MAX_UPLOAD_MB", "32")) * 1024 * 1024
|
| 268 |
+
|
| 269 |
+
# =========================================================
|
| 270 |
+
# LOAD MODELS (once at startup)
|
| 271 |
+
# =========================================================
|
| 272 |
+
logger.info("🔄 Loading models...")
|
| 273 |
+
cat_model_en, cat_tok_en = get_cat_en()
|
| 274 |
+
cat_model_indic, cat_tok_indic = get_cat_indic()
|
| 275 |
+
urg_model_en, urg_tok_en = get_urg_en()
|
| 276 |
+
urg_model_indic, urg_tok_indic = get_urg_indic()
|
| 277 |
+
logger.info("✅ Models loaded.")
|
| 278 |
+
|
| 279 |
+
# =========================================================
|
| 280 |
+
# INITIALIZE IG EXPLAINERS (once at startup)
|
| 281 |
+
# =========================================================
|
| 282 |
+
logger.info("🔄 Initializing Integrated Gradients explainers...")
|
| 283 |
+
category_explainer_en = IntegratedGradientsExplainer(cat_model_en, cat_tok_en)
|
| 284 |
+
category_explainer_indic = IntegratedGradientsExplainer(cat_model_indic, cat_tok_indic)
|
| 285 |
+
urgency_explainer_en = IntegratedGradientsExplainer(urg_model_en, urg_tok_en)
|
| 286 |
+
urgency_explainer_indic = IntegratedGradientsExplainer(urg_model_indic, urg_tok_indic)
|
| 287 |
+
logger.info("✅ Integrated Gradients ready.")
|
| 288 |
+
|
| 289 |
+
_RESOURCES = {
|
| 290 |
+
"english": {
|
| 291 |
+
"cat_fn": predict_category_en,
|
| 292 |
+
"urg_fn": predict_urgency_en,
|
| 293 |
+
"cat_exp": category_explainer_en,
|
| 294 |
+
"urg_exp": urgency_explainer_en,
|
| 295 |
+
}
|
| 296 |
+
}
|
| 297 |
+
_RESOURCES_INDIC = {
|
| 298 |
+
"cat_fn": predict_category_indic,
|
| 299 |
+
"urg_fn": predict_urgency_indic,
|
| 300 |
+
"cat_exp": category_explainer_indic,
|
| 301 |
+
"urg_exp": urgency_explainer_indic,
|
| 302 |
+
}
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
def _get_resources(language: str) -> dict:
    """Return the model/explainer bundle for *language*.

    "english" maps to the English bundle; every other language (hindi,
    telugu, or unknown) falls back to the shared Indic bundle.
    """
    return _RESOURCES.get(language, _RESOURCES_INDIC)
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
# =========================================================
|
| 310 |
+
# HOTSPOT FORECAST CONSTANTS
|
| 311 |
+
# =========================================================
|
| 312 |
+
VALID_LABELS = [
|
| 313 |
+
"electricity", "garbage", "pollution",
|
| 314 |
+
"public transport", "roads",
|
| 315 |
+
"sanitation", "stray animals", "water",
|
| 316 |
+
]
|
| 317 |
+
|
| 318 |
+
_PROPHET_MAX_WORKERS = int(os.environ.get("PROPHET_MAX_WORKERS", "4"))
|
| 319 |
+
|
| 320 |
+
RISK_LEVEL_THRESHOLDS = [
|
| 321 |
+
(75, "Critical"),
|
| 322 |
+
(50, "High"),
|
| 323 |
+
(25, "Medium"),
|
| 324 |
+
(0, "Low"),
|
| 325 |
+
]
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
def _risk_to_level(score_0_100: float) -> str:
    """Map a 0–100 risk score onto its named level using the threshold table.

    RISK_LEVEL_THRESHOLDS is ordered highest-first, so the first matching
    entry wins; "Low" is the fallback for anything below every threshold.
    """
    return next(
        (label for threshold, label in RISK_LEVEL_THRESHOLDS if score_0_100 >= threshold),
        "Low",
    )
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
def _fit_and_forecast(area: str, category: str, group_df, horizon: int) -> dict | None:
    """Fit a Prophet model for one (area, category) series and score its hotspot risk.

    Args:
        area/category: identifiers echoed back into the result.
        group_df: pandas DataFrame with at least columns "ds" (date),
            "y" (daily complaint count) and "priorityScore".
            NOTE(review): assumed schema — confirm against the
            /hotspot-forecast request handler that builds it.
        horizon: number of future days to forecast.

    Returns:
        A result dict with risk score/level/growth/confidence, or None when
        the series has fewer than 2 distinct dates (Prophet cannot fit).
    """
    if group_df["ds"].nunique() < 2:
        return None

    ts = group_df[["ds", "y"]].sort_values("ds")
    # Daily/weekly seasonality disabled — series are too short/sparse for them.
    model = Prophet(weekly_seasonality=False, daily_seasonality=False)
    model.fit(ts)

    future = model.make_future_dataframe(periods=horizon)
    forecast = model.predict(future)

    # Growth: forecast-window mean vs mean of the last 3 observations.
    recent_avg = ts.tail(3)["y"].mean()
    forecast_avg = forecast.tail(horizon)["yhat"].mean()

    if recent_avg == 0:
        growth = 0.0  # avoid division by zero on a flat/empty recent window
    else:
        raw_growth = ((forecast_avg - recent_avg) / recent_avg) * 100
        # Clamp extreme Prophet extrapolations to ±500%.
        growth = max(-500.0, min(500.0, raw_growth))

    avg_priority = float(group_df["priorityScore"].mean())
    # Weighted blend of growth, priority, and recent volume, squashed through
    # a sigmoid onto 0–100. Weights are heuristic tuning constants.
    raw_risk = 0.5 * (growth / 100) + 0.3 * avg_priority + 0.2 * (recent_avg / 5)
    risk_score_100 = round(100 / (1 + math.exp(-raw_risk)), 2)

    # Confidence: narrower Prophet uncertainty band relative to the forecast
    # magnitude → higher confidence (1e-9 guards a zero mean).
    horizon_fc = forecast.tail(horizon)
    yhat_range = (horizon_fc["yhat_upper"] - horizon_fc["yhat_lower"]).mean()
    yhat_mean = horizon_fc["yhat"].abs().mean()
    confidence = round(1.0 - min(1.0, yhat_range / (yhat_mean + 1e-9)), 4)

    level = _risk_to_level(risk_score_100)

    return {
        "area": area,
        "category": category,
        "riskScore": risk_score_100,
        "level": level,
        "growthPercent": round(float(growth), 2),
        "forecastHorizonDays": horizon,
        "confidenceScore": confidence,
        # Underscore-prefixed fields are diagnostic extras, not public API.
        "_recentAvg": round(float(recent_avg), 2),
        "_forecastAvg": round(float(forecast_avg), 2),
    }
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
# =========================================================
|
| 380 |
+
# HEALTH CHECK
|
| 381 |
+
# =========================================================
|
| 382 |
+
@app.route("/", methods=["GET"])
def health():
    """Root endpoint: service banner plus a short catalogue of the API routes."""
    return jsonify({
        "status": "ok",
        "version": os.environ.get("APP_VERSION", "1.0.0"),
        "message": "Multilingual Grievance API (EN / HI / TE) with IG + GFAS — running",
        "endpoints": {
            "POST /predict": "Classify a single grievance — text / audio / image (multipart/form-data).",
            "POST /fairness-audit": "GFAS audit over N grievance records.",
            "POST /hotspot-forecast": "Prophet-based hotspot forecasting.",
        },
    })
|
| 394 |
+
|
| 395 |
+
|
| 396 |
+
@app.route("/health", methods=["GET"])
def health_check():
    """Dedicated health probe for HF Spaces liveness checks.

    Returns a minimal 200 payload so the probe stays cheap (no model access).
    """
    return jsonify({"status": "ok"}), 200
|
| 400 |
+
|
| 401 |
+
|
| 402 |
+
# =========================================================
|
| 403 |
+
# POST /predict
|
| 404 |
+
# =========================================================
|
| 405 |
+
@app.route("/predict", methods=["POST"])
def predict_grievance():
    """Classify one grievance supplied as text, audio, image, or text/audio + image.

    Accepts either a JSON body ({"text": ..., "explain": ...}) or
    multipart/form-data with optional "text", "audio", and "image" parts.
    Pipeline: input → (OCR / ASR if needed) → validation → language detection →
    category + urgency classification → priority scoring → optional token-level
    explanation.  Returns JSON; 400 on missing input, 403 on bad image GPS,
    422 on invalid text, 500 on unexpected errors.
    """
    try:
        content_type = request.content_type or ""

        # ── Input extraction: JSON bodies carry text only; multipart may also
        #    carry audio/image file parts. ─────────────────────────────────────
        if "application/json" in content_type:
            data = request.get_json(silent=True) or {}
            text_input = data.get("text", "").strip()
            explain_flag = bool(data.get("explain", False))
            has_text = bool(text_input)
            has_audio = False
            has_image = False
            image_bytes = None
            audio_file = None
        else:
            text_input = request.form.get("text", "").strip()
            # Form values are strings — accept common truthy spellings.
            explain_raw = request.form.get("explain", "false").strip().lower()
            explain_flag = explain_raw in ("true", "1", "yes")
            has_text = bool(text_input)
            has_audio = "audio" in request.files
            has_image = "image" in request.files
            image_bytes = request.files["image"].read() if has_image else None
            audio_file = request.files["audio"] if has_audio else None

        logger.info(
            "[predict] content_type=%s has_text=%s has_audio=%s has_image=%s",
            content_type[:40], has_text, has_audio, has_image,
        )

        if not has_text and not has_audio and not has_image:
            return jsonify({
                "status": "failed",
                "code": "missing_input",
                "message": "Please provide at least one of: 'text', 'audio', or 'image'.",
            }), 400

        # ── Mode A — IMAGE ONLY ────────────────────────────────────────────────
        # Image is the grievance itself: GPS must be valid, then OCR the text.
        if has_image and not has_text and not has_audio:
            location_status = check_image_location(image_bytes)
            if location_status in ("invalid", "no_gps"):
                return jsonify({
                    "status": "failed",
                    "code": "location_invalid",
                    "message": "Request rejected. Image location is outside Kakinada jurisdiction or contains no GPS metadata.",
                    "location": "invalid",
                }), 403
            grievance_text = extract_text_from_image(image_bytes)
            input_mode = "image"
            location_field = None

        # ── Mode B — AUDIO ONLY ────────────────────────────────────────────────
        elif has_audio and not has_text and not has_image:
            grievance_text = transcribe_audio(audio_file)
            input_mode = "audio"
            location_field = None

        # ── Mode C — TEXT ONLY ─────────────────────────────────────────────────
        elif has_text and not has_image and not has_audio:
            grievance_text = text_input
            input_mode = "text"
            location_field = None

        # ── Mode D — TEXT + IMAGE (evidence) ──────────────────────────────────
        # Image is supporting evidence only: report GPS validity, don't reject.
        elif has_text and has_image and not has_audio:
            grievance_text = text_input
            input_mode = "text+image"
            loc_status = check_image_location(image_bytes)
            location_field = "valid" if loc_status == "valid" else "invalid"

        # ── Mode E — AUDIO + IMAGE (evidence) ─────────────────────────────────
        elif has_audio and has_image and not has_text:
            grievance_text = transcribe_audio(audio_file)
            input_mode = "audio+image"
            loc_status = check_image_location(image_bytes)
            location_field = "valid" if loc_status == "valid" else "invalid"

        else:
            # Unsupported combination (e.g. text + audio together).
            return jsonify({
                "status": "failed",
                "code": "missing_input",
                "message": "Please provide at least one of: 'text', 'audio', or 'image'.",
            }), 400

        # Reject empty / garbage text (e.g. failed OCR or ASR hallucination).
        # NOTE(review): _VALIDATION_MESSAGES is assumed to contain every code
        # validate_text can return — confirm in its definition.
        is_valid, error_code = validate_text(grievance_text)
        if not is_valid:
            return jsonify({
                "status": "failed",
                "code": error_code,
                "message": _VALIDATION_MESSAGES[error_code],
            }), 422

        # Language routing: resources (models + explainers) are per-language.
        language = detect_language(grievance_text)
        res = _get_resources(language)

        category_result = res["cat_fn"](grievance_text)
        category = category_result["category"]
        category_conf = category_result["confidence"]
        category_index = category_result.get("class_index", 0)

        urgency_result = res["urg_fn"](grievance_text)
        urgency = urgency_result["urgency"]
        urgency_conf = urgency_result["confidence"]
        urgency_index = urgency_result.get("class_index", 0)

        priority_result = compute_priority_score(category, urgency, urgency_conf)
        priority_score = priority_result["score"]
        priority_band = priority_result["band"]

        # Token-level attributions are expensive — only computed on request.
        category_tokens: list = []
        urgency_tokens: list = []
        if explain_flag:
            category_tokens = res["cat_exp"].explain(grievance_text, category_index)
            urgency_tokens = res["urg_exp"].explain(grievance_text, urgency_index)

        explanation = generate_final_reason(
            grievance_text, category, urgency, priority_score,
            category_tokens, urgency_tokens,
        )

        response_body = {
            "status": "success",
            "input_mode": input_mode,
            "text": grievance_text,
            "language": language,
            "category": category,
            "category_confidence": category_conf,
            "urgency": urgency,
            "urgency_confidence": urgency_conf,
            "priority_score": priority_score,
            "priority_band": priority_band,
            "explanation": {
                "category_tokens": category_tokens,
                "urgency_tokens": urgency_tokens,
                "category_decision": explanation["category_decision"],
                "urgency_decision": explanation["urgency_decision"],
                "priority_summary": explanation["priority_summary"],
                "final_reason": explanation["final_reason"],
            },
        }

        # "location" appears only for the evidence modes (D / E).
        if location_field is not None:
            response_body["location"] = location_field

        return jsonify(response_body)

    except Exception as e:
        logger.exception("[predict] Unhandled exception")
        return jsonify({
            "status": "failed",
            "code": "internal_error",
            "message": str(e),
            "trace": traceback.format_exc(),
        }), 500
|
| 558 |
+
|
| 559 |
+
|
| 560 |
+
# =========================================================
|
| 561 |
+
# POST /fairness-audit
|
| 562 |
+
# =========================================================
|
| 563 |
+
@app.route("/fairness-audit", methods=["POST"])
def fairness_audit():
    """Run a GFAS fairness audit over the posted grievance records."""
    try:
        body = request.get_json(silent=True)
        if not body:
            return jsonify({"status": "failed", "message": "Invalid JSON body."}), 400

        result, error, status_code = gfas_audit(body.get("grievances"))

        # gfas_audit returns either a result or an error payload, plus the
        # HTTP status to use for it.
        payload = error if error else result
        return jsonify(payload), status_code

    except Exception as e:
        logger.exception("[fairness-audit] Unhandled exception")
        failure = {
            "status": "failed",
            "error": str(e),
            "trace": traceback.format_exc(),
        }
        return jsonify(failure), 500
|
| 583 |
+
|
| 584 |
+
|
| 585 |
+
# =========================================================
|
| 586 |
+
# POST /hotspot-forecast
|
| 587 |
+
# =========================================================
|
| 588 |
+
@app.route("/hotspot-forecast", methods=["POST"])
def hotspot_forecast():
    """Forecast grievance hotspots per (area, category) group using Prophet.

    Expects JSON: {"grievances": [...], "horizon_days": int, "top_n": int,
    "source_window_days": int}.  Each grievance record must carry "area",
    "category", "createdAt", and "priorityScore" fields (KeyError otherwise —
    caught by the outer handler and returned as a 500).
    Returns the top-N groups ranked by riskScore, plus per-group error metadata.
    """
    try:
        data = request.get_json(force=True)
        grievances = data.get("grievances", [])
        horizon = int(data.get("horizon_days", 1))
        top_n = int(data.get("top_n", 10))
        source_window = int(data.get("source_window_days", 45))
        generated_at = datetime.now(UTC).isoformat()

        if not grievances:
            return jsonify({"status": "failed", "message": "No grievances supplied"}), 422

        df = pd.DataFrame(grievances)
        if df.empty:
            return jsonify({
                "status": "success",
                "generated_at": generated_at,
                "top_hotspots": [],
            })

        # Normalize group keys so "Gandhi Nagar" and " gandhi nagar " collapse.
        df["area"] = df["area"].astype(str).str.lower().str.strip()
        df["category"] = df["category"].astype(str).str.lower().str.strip()
        # Parse timestamps; unparseable rows become NaT and are dropped below.
        # tz_convert(None) strips the timezone — Prophet requires naive "ds".
        df["ds"] = pd.to_datetime(df["createdAt"], errors="coerce", utc=True).dt.tz_convert(None)
        df = df.dropna(subset=["ds"])
        df["y"] = 1
        # Only forecast known categories.
        df = df[df["category"].isin(VALID_LABELS)]

        if df.empty:
            return jsonify({
                "status": "success",
                "generated_at": generated_at,
                "top_hotspots": [],
            })

        # Collapse to daily counts per (area, category); keep mean priority.
        df = df.groupby(["area", "category", "ds"]).agg(
            {"y": "sum", "priorityScore": "mean"}
        ).reset_index()

        groups = list(df.groupby(["area", "category"]))
        hotspots = []
        errors = []

        # Prophet fits are independent per group — run them in parallel.
        with ThreadPoolExecutor(max_workers=_PROPHET_MAX_WORKERS) as executor:
            futures = {
                executor.submit(_fit_and_forecast, area, cat, gdf, horizon): (area, cat)
                for (area, cat), gdf in groups
            }
            for future in as_completed(futures):
                area, category = futures[future]
                try:
                    result = future.result()
                    # _fit_and_forecast returns None for groups it cannot fit.
                    if result is None:
                        continue

                    # Move the private _recentAvg/_forecastAvg fields into a
                    # nested snapshot object for the response.
                    result["flaskSnapshot"] = {
                        "recentAvg": result.pop("_recentAvg"),
                        "forecastAvg": result.pop("_forecastAvg"),
                        "sourceWindowDays": source_window,
                        "forecastHorizonDays": horizon,
                        "generatedAt": generated_at,
                    }
                    result["sourceWindowDays"] = source_window
                    hotspots.append(result)

                except Exception as e:
                    # A single failed group should not fail the whole request.
                    errors.append({"area": area, "category": category, "error": str(e)})
                    logger.warning("[hotspot] Prophet failed for %s/%s: %s", area, category, e)

        ranked = sorted(hotspots, key=lambda x: x["riskScore"], reverse=True)

        return jsonify({
            "status": "success",
            "generated_at": generated_at,
            "top_hotspots": ranked[:top_n],
            "meta": {
                "groups_evaluated": len(groups),
                "forecasts_computed": len(hotspots),
                "error_count": len(errors),
                "errors": errors,
                "source_window_days": source_window,
                "horizon_days": horizon,
            },
        })

    except Exception as e:
        logger.exception("[hotspot-forecast] Unhandled exception")
        return jsonify({"status": "failed", "message": str(e)}), 500
|
| 676 |
+
|
| 677 |
+
|
| 678 |
+
# =========================================================
|
| 679 |
+
# GLOBAL ERROR HANDLERS
|
| 680 |
+
# =========================================================
|
| 681 |
+
@app.errorhandler(413)
def request_entity_too_large(e):
    """Translate Flask's 413 (body over MAX_CONTENT_LENGTH) into JSON."""
    limit_mb = app.config['MAX_CONTENT_LENGTH'] // (1024*1024)
    body = {
        "status": "failed",
        "code": "payload_too_large",
        "message": f"Upload exceeds the {limit_mb} MB limit.",
    }
    return jsonify(body), 413
|
| 688 |
+
|
| 689 |
+
|
| 690 |
+
@app.errorhandler(404)
def not_found(e):
    """JSON body for unknown routes instead of Flask's HTML 404 page."""
    body = {"status": "failed", "code": "not_found", "message": "Endpoint not found."}
    return jsonify(body), 404
|
| 693 |
+
|
| 694 |
+
|
| 695 |
+
@app.errorhandler(405)
def method_not_allowed(e):
    """JSON body for wrong-verb requests instead of Flask's HTML 405 page."""
    body = {"status": "failed", "code": "method_not_allowed", "message": "HTTP method not allowed."}
    return jsonify(body), 405
|
| 698 |
+
|
| 699 |
+
|
| 700 |
+
# =========================================================
|
| 701 |
+
# RUN SERVER — Hugging Face Spaces uses port 7860
|
| 702 |
+
# =========================================================
|
| 703 |
+
if __name__ == "__main__":
    # HF Spaces expects the app on port 7860; PORT env var overrides for local runs.
    port = int(os.environ.get("PORT", 7860))
    # Debug mode only when FLASK_DEBUG=true — keep off in production.
    debug = os.environ.get("FLASK_DEBUG", "false").lower() == "true"
    logger.info("🚀 Starting Multilingual Grievance API on port %d (debug=%s)", port, debug)
    # threaded=True lets the built-in dev server handle concurrent requests.
    app.run(host="0.0.0.0", port=port, debug=debug, threaded=True)
|
multi_modal/audio_to_text.py
ADDED
|
@@ -0,0 +1,463 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =========================================================
|
| 2 |
+
# multi_modal/audio_to_text.py
|
| 3 |
+
#
|
| 4 |
+
# Converts an uploaded audio file to text using Whisper.
|
| 5 |
+
#
|
| 6 |
+
# Supports: WAV, MP3, OGG, FLAC, M4A, WEBM (mobile browsers)
|
| 7 |
+
# Languages: Telugu / Hindi / English (forced, no random scripts)
|
| 8 |
+
#
|
| 9 |
+
# FIXES vs previous version:
|
| 10 |
+
# 1. Hallucination detection — Georgian/Chinese/Arabic output
|
| 11 |
+
# (ვვვვ... etc.) is detected and discarded, returns ""
|
| 12 |
+
# 2. Language forcing — tries TE → HI → EN in order instead
|
| 13 |
+
# of pure auto-detect which picks random scripts
|
| 14 |
+
# 3. Valid script check — only accepts Latin, Telugu,
|
| 15 |
+
# Devanagari output. Anything else = hallucination.
|
| 16 |
+
# 4. 500 error fix — empty/invalid transcription now safely
|
| 17 |
+
# returns "" instead of passing garbage to BERT classifier
|
| 18 |
+
# =========================================================
|
| 19 |
+
|
| 20 |
+
import os
|
| 21 |
+
import tempfile
|
| 22 |
+
import unicodedata
|
| 23 |
+
import torch
|
| 24 |
+
import numpy as np
|
| 25 |
+
from transformers import pipeline
|
| 26 |
+
|
| 27 |
+
# ── Environment ────────────────────────────────────────────────────────────────
# AUDIO_BACKEND selects local Whisper inference vs the HF Inference API.
_AUDIO_BACKEND = os.environ.get("AUDIO_BACKEND", "local")   # "local" | "hf_api"
_HF_TOKEN = os.environ.get("HF_TOKEN", "")

# ── Model selection ────────────────────────────────────────────────────────────
# Overridable via WHISPER_MODEL; whisper-small is the CPU-friendly default.
MODEL_ID = os.environ.get("WHISPER_MODEL", "openai/whisper-small")
_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ── Valid Unicode scripts for EN / HI / TE ────────────────────────────────────
# Whisper hallucinates Georgian (ვ), Chinese (的), Arabic (ال) on bad audio.
# Only these script prefixes (from unicodedata.name) are accepted as real output.
_VALID_SCRIPTS = {
    "LATIN",         # English
    "DEVANAGARI",    # Hindi
    "TELUGU",        # Telugu
    "COMMON",        # punctuation, digits, spaces
}

# Languages tried in order.
# EN first — fastest for English audio (most common).
# Only these 3 are permitted — no other language accepted.
_LANGUAGE_ORDER = ["en", "te", "hi"]
_ALLOWED_LANGUAGES = {"en", "te", "hi"}

# Expected dominant script per forced language.
# If we force "te" but get back Devanagari-heavy text, it is wrong.
# If we force "hi" but get back Telugu-heavy text, it is wrong.
# This prevents Telugu audio from being accepted as Hindi.
_LANG_EXPECTED_SCRIPT = {
    "en": {"LATIN"},
    "te": {"TELUGU"},
    "hi": {"DEVANAGARI"},
}
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# ── Load Whisper ONCE at import time ──────────────────────────────────────────
|
| 63 |
+
if _AUDIO_BACKEND == "local":
    # Import-time load: the ASR pipeline is built once and reused for every
    # request, avoiding a multi-second model load per call.
    print(f"🔄 Loading Whisper '{MODEL_ID}' on {_DEVICE}…")
    _ASR_PIPELINE = pipeline(
        task = "automatic-speech-recognition",
        model = MODEL_ID,
        device = _DEVICE,
    )
    print(f"✅ Whisper '{MODEL_ID}' loaded.")
else:
    # HF Inference API backend — no local model; transcription goes through
    # _transcribe_via_hf_api instead.
    _ASR_PIPELINE = None
    print(f"ℹ️ Whisper skipped — using HF API backend.")
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 77 |
+
# HALLUCINATION DETECTION
|
| 78 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 79 |
+
def _is_valid_transcription(text: str) -> bool:
    """
    Decide whether *text* looks like genuine transcribed speech.

    Three independent hallucination filters are applied in order:
      1. Script check   — most characters must be Latin / Devanagari /
                          Telugu / Common, else Whisper produced a wrong
                          script (Georgian, Chinese, ...).
      2. Word checks    — a word repeated 5+ times in a row, or very low
                          vocabulary diversity, indicates a looping model.
      3. Char check     — a 4-char probe repeating far too often catches
                          sub-word loops like "वावावावा...".

    Returns True only when every filter passes.
    """
    if not text or len(text.strip()) < 3:
        return False

    chars = [c for c in text if not c.isspace()]
    if not chars:
        return False

    # Check 1: script validation — count characters whose Unicode name
    # starts with an accepted script prefix.
    valid_count = 0
    for c in chars:
        try:
            char_name = unicodedata.name(c, "")
            script = char_name.split()[0] if char_name else "UNKNOWN"
            if script in _VALID_SCRIPTS:
                valid_count += 1
        except Exception:
            pass

    ratio = valid_count / len(chars)
    if ratio < 0.60:
        print(f"[audio_to_text] WARNING script hallucination "
              f"(valid_ratio={ratio:.2f}) discarding: {text[:60]!r}")
        return False

    # Check 2: repetition detection
    # "apne apne apne apne apne apne..." = Whisper looping hallucination
    words = text.strip().split()
    if len(words) >= 6:
        lowered = [w.lower() for w in words]

        # Longest run of one word repeated back-to-back.
        max_repeat = 1
        cur_repeat = 1
        for prev_word, cur_word in zip(lowered, lowered[1:]):
            cur_repeat = cur_repeat + 1 if cur_word == prev_word else 1
            if cur_repeat > max_repeat:
                max_repeat = cur_repeat
        if max_repeat >= 5:
            print(f"[audio_to_text] WARNING repetition hallucination "
                  f"(word repeats {max_repeat}x) discarding: {text[:60]!r}")
            return False

        # Low vocabulary diversity = looping hallucination.
        # Real speech always has more variety — threshold: <0.15 for longer texts.
        unique_ratio = len(set(lowered)) / len(words)
        if unique_ratio < 0.15 and len(words) > 15:
            print(f"[audio_to_text] WARNING low-diversity hallucination "
                  f"(unique_ratio={unique_ratio:.2f}) discarding: {text[:60]!r}")
            return False

    # Check 3: character-level repetition — catches sub-word loops not
    # visible at word granularity.
    if len(text) > 20:
        probe = text[8:12]
        rep_count = text.count(probe)
        if rep_count > len(text) // 8:  # appears more than once per 8 chars = looping
            print(f"[audio_to_text] WARNING char-level repetition "
                  f"(probe {probe!r} repeats {rep_count}x) discarding: {text[:60]!r}")
            return False

    return True
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 154 |
+
# PUBLIC API
|
| 155 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 156 |
+
def transcribe_audio(audio_file) -> str:
    """
    Transcribe an uploaded audio file to text.

    Parameters
    ----------
    audio_file : werkzeug.datastructures.FileStorage
        File from Flask request.files["audio"].
        Accepts WAV, MP3, OGG, FLAC, M4A, WEBM.

    Returns
    -------
    str
        Transcribed text in EN / HI / TE.
        Returns "" on failure or hallucination — never raises.
    """
    # Remote path only when explicitly configured AND a token is present;
    # otherwise fall back to the locally loaded Whisper pipeline.
    use_remote = _AUDIO_BACKEND == "hf_api" and bool(_HF_TOKEN)
    if use_remote:
        return _transcribe_via_hf_api(audio_file)
    return _transcribe_local(audio_file)
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 178 |
+
# LOCAL PATH
|
| 179 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 180 |
+
def _transcribe_local(audio_file) -> str:
    """Transcribe an uploaded audio file with the local Whisper pipeline.

    Decodes the upload to 16 kHz mono float32, rejects silent/too-short
    audio, then forces each language in _LANGUAGE_ORDER in turn and keeps
    the first output that passes hallucination + script validation.

    Returns "" on any failure — never raises.

    Fix vs previous revision: the unused `detected_lang` assignment was
    removed.  Its expression `result.get("chunks", [{}])[0]` raised
    IndexError whenever the pipeline returned an *empty* chunks list
    (the `[{}]` default is not used when the key exists), which the broad
    except swallowed — silently discarding a valid transcription attempt.
    """
    try:
        audio_bytes = audio_file.read()
        if not audio_bytes:
            print("[audio_to_text] ⚠️ Empty audio file.")
            return ""

        # NOTE(review): _get_suffix is defined elsewhere in this module —
        # assumed to return an extension like ".wav"; confirm there.
        suffix = _get_suffix(audio_file)

        # Write to temp file — pydub needs a file path on disk
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            tmp.write(audio_bytes)
            tmp_path = tmp.name

        try:
            audio_array, sample_rate = _load_audio(tmp_path, suffix)
        finally:
            # Always remove the temp file, even when decoding fails.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

        if audio_array is None:
            print("[audio_to_text] ❌ Could not decode audio — is ffmpeg installed?")
            return ""

        # ── Audio quality diagnostics ──────────────────────────────────────────
        duration_sec = len(audio_array) / 16_000
        rms = float(np.sqrt(np.mean(audio_array ** 2)))
        peak = float(np.max(np.abs(audio_array)))
        print(f"[audio_to_text] 🔍 duration={duration_sec:.1f}s | rms={rms:.4f} | peak={peak:.4f}")

        # Reject silent or extremely quiet audio — Whisper hallucinates on silence
        if rms < 0.001:
            print("[audio_to_text] ❌ Audio is silent (rms<0.001) — nothing to transcribe")
            return ""
        if duration_sec < 0.5:
            print(f"[audio_to_text] ❌ Audio too short ({duration_sec:.2f}s) — minimum 0.5s")
            return ""

        # ── Try EN → TE → HI — never pure auto-detect ─────────────────────────
        # language=None causes Whisper to hallucinate Georgian/Chinese on bad audio.
        # Forcing each language and validating the output is far more reliable.
        #
        # IMPORTANT: the pipeline mutates the input dict internally on the first
        # call, so subsequent calls receive a broken dict. Fix: rebuild it fresh
        # for every language attempt using a copy of the original numpy array.
        audio_array_copy = audio_array.copy()

        for lang in _LANGUAGE_ORDER:
            try:
                # Fresh dict every iteration — never reuse across pipeline calls
                audio_input = {"raw": audio_array_copy.copy(), "sampling_rate": 16_000}
                result = _ASR_PIPELINE(
                    audio_input,
                    generate_kwargs={
                        "language": lang,
                        "task": "transcribe",
                        # temperature and compression_ratio_threshold cause a
                        # 'logprobs' bug in some transformers versions — removed.
                        # Hallucination is handled by our own validator instead.
                    },
                    return_timestamps=False,
                )
                text = result.get("text", "").strip()

                if not text:
                    print(f"[audio_to_text] ↩️ lang={lang} -> empty, trying next")
                    continue

                if _is_valid_transcription(text):
                    # Extra check: does the output script match the forced language?
                    # Whisper-small often outputs Hindi (Devanagari) when forced to TE.
                    # Reject if dominant script does not match expected script for lang.
                    expected_scripts = _LANG_EXPECTED_SCRIPT.get(lang, None)
                    if expected_scripts and lang != "en":
                        chars = [c for c in text if not c.isspace()]
                        script_counts = {}
                        for c in chars:
                            try:
                                sc = unicodedata.name(c, "").split()[0]
                                script_counts[sc] = script_counts.get(sc, 0) + 1
                            except Exception:
                                pass
                        dominant = max(script_counts, key=script_counts.get) if script_counts else "UNKNOWN"
                        if dominant not in expected_scripts and dominant not in ("COMMON", "LATIN"):
                            print(f"[audio_to_text] script mismatch: forced {lang} but got {dominant} — trying next")
                            continue

                    print(f"[audio_to_text] OK lang={lang} | "
                          f"{len(text)} chars: {text[:100]}")
                    return text
                else:
                    print(f"[audio_to_text] lang={lang} hallucinated — trying next")
                    continue

            except Exception as e:
                print(f"[audio_to_text] ❌ lang={lang} error: {e}")
                continue

        print("[audio_to_text] ❌ All language attempts failed — returning empty")
        return ""

    except Exception as e:
        print(f"[audio_to_text] ❌ Transcription failed: {e}")
        return ""
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
def _load_audio(file_path: str, suffix: str):
    """
    Load audio file as float32 numpy array at 16 kHz mono.

    Strategy:
    1. pydub — handles MP3, OGG, WEBM, M4A, WAV, FLAC (needs ffmpeg)
    2. soundfile fallback — WAV and FLAC only (no ffmpeg needed)

    Returns (audio_array, 16000) or (None, None) on failure.
    """
    # ── pydub (primary) ────────────────────────────────────────────────────────
    try:
        from pydub import AudioSegment

        # Map container extensions to the format name ffmpeg expects
        # (m4a files are decoded as mp4 containers).
        fmt = suffix.lstrip(".").lower()
        fmt_map = {"m4a": "mp4", "webm": "webm", "ogg": "ogg"}
        fmt = fmt_map.get(fmt, fmt)

        audio_seg = AudioSegment.from_file(file_path, format=fmt)
        audio_seg = audio_seg.set_channels(1).set_frame_rate(16_000)
        samples = np.array(audio_seg.get_array_of_samples(), dtype=np.float32)

        # Normalize based on actual sample width — pydub can return int16 OR int32
        # depending on source format. Always normalize to float32 [-1.0, 1.0]
        sample_width = audio_seg.sample_width  # bytes per sample: 1=8bit, 2=16bit, 4=32bit
        max_val = float(2 ** (8 * sample_width - 1))
        samples = samples / max_val
        # Safety clamp — should already be in range but guard against edge cases
        samples = np.clip(samples, -1.0, 1.0)

        print(f"[audio_to_text] pydub decoded: sample_width={sample_width}B "
              f"max_val={max_val:.0f} post_rms={float(np.sqrt(np.mean(samples**2))):.4f}")

        return samples, 16_000

    except ImportError:
        print("[audio_to_text] pydub not installed — falling back to soundfile")
        print("    pip install pydub + install ffmpeg")
    except Exception as e:
        # Typical cause: ffmpeg missing or an unsupported container.
        print(f"[audio_to_text] pydub failed ({e}) — trying soundfile")

    # ── soundfile (fallback — WAV/FLAC only) ───────────────────────────────────
    try:
        import soundfile as sf
        audio_array, sample_rate = sf.read(file_path, dtype="float32")

        # Down-mix multi-channel audio to mono.
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=1)

        # NOTE(review): _resample is defined elsewhere in this module —
        # assumed to resample to the target rate; confirm its definition.
        if sample_rate != 16_000:
            audio_array = _resample(audio_array, sample_rate, 16_000)

        return audio_array, 16_000

    except Exception as e:
        print(f"[audio_to_text] soundfile failed: {e}")
        return None, None
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 354 |
+
# HF API PATH — production / HF Spaces
|
| 355 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 356 |
+
def _transcribe_via_hf_api(audio_file) -> str:
    """
    Production path — HuggingFace Inference API (whisper-large-v3 on HF GPU).
    Set AUDIO_BACKEND=hf_api and HF_TOKEN=hf_xxx in HF Space Secrets.

    Why large-v3 via API instead of loading locally:
    - large-v3 = 3GB — too large to load on free HF Spaces
    - HF API runs it on GPU — faster than local CPU anyway (~15-30s vs 3min)
    - Free tier: 1000 requests/day — enough for a civic portal

    large-v3 auto-detect is accurate enough for EN/TE/HI — no need for
    the 3-attempt language loop used in local path.

    Returns the transcription, or "" on any failure (never raises).
    """
    import requests
    import time

    api_url = "https://api-inference.huggingface.co/models/openai/whisper-large-v3"

    def _post(payload: bytes):
        # Single place that builds the request — used for both the first
        # attempt and the cold-start retry (previously duplicated inline).
        return requests.post(
            api_url,
            headers={"Authorization": f"Bearer {_HF_TOKEN}"},
            data=payload,
            timeout=120,  # HF free tier can queue up to 60s before processing
        )

    try:
        audio_bytes = audio_file.read()
        if not audio_bytes:
            return ""

        print(f"[audio_to_text] HF API: sending {len(audio_bytes)} bytes to whisper-large-v3...")

        # First attempt: auto-detect language (large-v3 is accurate enough)
        res = _post(audio_bytes)

        # Handle model loading (HF cold start) — 503 means "warming up"
        if res.status_code == 503:
            print("[audio_to_text] HF API: model loading — waiting 20s...")
            time.sleep(20)
            res = _post(audio_bytes)

        if res.ok:
            data = res.json()
            # HF API returns {"text": "..."} or [{"generated_text": "..."}]
            if isinstance(data, dict):
                text = data.get("text", "").strip()
            elif isinstance(data, list) and data:
                text = data[0].get("generated_text", "").strip()
            else:
                text = ""

            if _is_valid_transcription(text):
                print(f"[audio_to_text] HF API OK: {len(text)} chars: {text[:100]}")
                return text
            print(f"[audio_to_text] HF API hallucination discarded: {text[:60]!r}")
            return ""

        print(f"[audio_to_text] HF API error {res.status_code}: {res.text[:300]}")
        return ""

    except requests.exceptions.Timeout:
        print("[audio_to_text] HF API timeout — model may be overloaded")
        return ""
    except Exception as e:
        print(f"[audio_to_text] HF API exception: {e}")
        return ""
|
| 424 |
+
|
| 425 |
+
|
| 426 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 427 |
+
# HELPERS
|
| 428 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 429 |
+
def _get_suffix(audio_file) -> str:
|
| 430 |
+
"""Determine file extension from FileStorage. Defaults to .webm."""
|
| 431 |
+
filename = getattr(audio_file, "filename", "") or ""
|
| 432 |
+
mime = getattr(audio_file, "mimetype", "") or ""
|
| 433 |
+
|
| 434 |
+
_MIME_TO_EXT = {
|
| 435 |
+
"audio/wav": ".wav", "audio/x-wav": ".wav", "audio/wave": ".wav",
|
| 436 |
+
"audio/mpeg": ".mp3", "audio/mp3": ".mp3",
|
| 437 |
+
"audio/ogg": ".ogg",
|
| 438 |
+
"audio/flac": ".flac", "audio/x-flac": ".flac",
|
| 439 |
+
"audio/mp4": ".m4a", "audio/x-m4a": ".m4a",
|
| 440 |
+
"audio/webm": ".webm", "video/webm": ".webm",
|
| 441 |
+
}
|
| 442 |
+
|
| 443 |
+
if "." in filename:
|
| 444 |
+
return "." + filename.rsplit(".", 1)[-1].lower()
|
| 445 |
+
|
| 446 |
+
# Default to .webm — Chrome/Edge MediaRecorder always sends webm
|
| 447 |
+
return _MIME_TO_EXT.get(mime.lower(), ".webm")
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
def _resample(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
|
| 451 |
+
"""Resample audio array from orig_sr to target_sr."""
|
| 452 |
+
try:
|
| 453 |
+
from scipy.signal import resample_poly
|
| 454 |
+
from math import gcd
|
| 455 |
+
g = gcd(orig_sr, target_sr)
|
| 456 |
+
return resample_poly(audio, target_sr // g, orig_sr // g).astype(np.float32)
|
| 457 |
+
except ImportError:
|
| 458 |
+
target_length = int(len(audio) * target_sr / orig_sr)
|
| 459 |
+
return np.interp(
|
| 460 |
+
np.linspace(0, len(audio) - 1, target_length),
|
| 461 |
+
np.arange(len(audio)),
|
| 462 |
+
audio,
|
| 463 |
+
).astype(np.float32)
|
multi_modal/image_to_text.py
ADDED
|
@@ -0,0 +1,346 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =========================================================
|
| 2 |
+
# multi_modal/image_to_text.py
|
| 3 |
+
#
|
| 4 |
+
# Converts an uploaded image into grievance text for BERT.
|
| 5 |
+
#
|
| 6 |
+
# Labels: Electricity | Garbage | Pollution | Public Transport |
|
| 7 |
+
# Roads | Sanitation | Stray Animals | Water
|
| 8 |
+
#
|
| 9 |
+
# OUTPUT RULE:
|
| 10 |
+
# Only raw observed text from OCR + BLIP caption.
|
| 11 |
+
# No predefined phrases, no templates, no appended context.
|
| 12 |
+
# BERT classifier must receive unbiased descriptive text.
|
| 13 |
+
#
|
| 14 |
+
# PIPELINE:
|
| 15 |
+
# Step 1 — Preprocess (sharpen, contrast, resize)
|
| 16 |
+
# Step 2 — EasyOCR (visible text in EN/HI/TE)
|
| 17 |
+
# Step 3 — BLIP-base, 5 civic prompts, best-of-5 by keyword score
|
| 18 |
+
# Step 4 — Clean fusion: OCR + caption, no added words
|
| 19 |
+
#
|
| 20 |
+
# FOR RENDER:
|
| 21 |
+
# Set IMAGE_BACKEND=hf_api + HF_TOKEN=hf_xxx in .env
|
| 22 |
+
# =========================================================
|
| 23 |
+
|
| 24 |
+
import io
|
| 25 |
+
import os
|
| 26 |
+
import re
|
| 27 |
+
import torch
|
| 28 |
+
import numpy as np
|
| 29 |
+
from PIL import Image, ImageFilter, ImageEnhance
|
| 30 |
+
|
| 31 |
+
# ── Environment ────────────────────────────────────────────────────────────────
|
| 32 |
+
_BACKEND = os.environ.get("IMAGE_BACKEND", "local")
|
| 33 |
+
_HF_TOKEN = os.environ.get("HF_TOKEN", "")
|
| 34 |
+
_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 35 |
+
|
| 36 |
+
# ── Lazy model handles ─────────────────────────────────────────────────────────
|
| 37 |
+
_ocr_reader = None
|
| 38 |
+
_blip_processor = None
|
| 39 |
+
_blip_model = None
|
| 40 |
+
|
| 41 |
+
# ── Five civic prompts for BLIP ────────────────────────────────────────────────
|
| 42 |
+
# Steers BLIP toward civic observation language.
|
| 43 |
+
# Best-scoring caption across all 5 is selected.
|
| 44 |
+
_CIVIC_PROMPTS = [
|
| 45 |
+
"a civic grievance showing",
|
| 46 |
+
"a public infrastructure problem showing",
|
| 47 |
+
"a sanitation or garbage problem showing",
|
| 48 |
+
"a water or drainage problem showing",
|
| 49 |
+
"a road or footpath damage showing",
|
| 50 |
+
]
|
| 51 |
+
|
| 52 |
+
# ── Civic keyword set — for scoring captions only, never appended to output ───
|
| 53 |
+
_CIVIC_KEYWORDS = {
|
| 54 |
+
"garbage", "waste", "trash", "litter", "dumped", "overflowing", "filth",
|
| 55 |
+
"sewage", "drain", "clog", "smell", "foul", "unhygienic", "sanitation",
|
| 56 |
+
"pothole", "road", "damaged", "broken", "crack", "footpath", "pavement",
|
| 57 |
+
"accident", "vehicle", "commuter", "traffic",
|
| 58 |
+
"water", "flood", "waterlog", "overflow", "leak", "pipe", "supply",
|
| 59 |
+
"stagnant", "puddle", "inundated",
|
| 60 |
+
"electricity", "wire", "pole", "streetlight", "cable", "spark", "fallen",
|
| 61 |
+
"pollution", "smoke", "dust", "emission", "contamination",
|
| 62 |
+
"animal", "stray", "dog", "cattle", "menace",
|
| 63 |
+
"transport", "bus", "auto", "road", "signal",
|
| 64 |
+
"hazard", "risk", "danger", "health", "resident", "street", "public",
|
| 65 |
+
"municipal", "colony", "area", "locality", "civic", "problem",
|
| 66 |
+
"issue", "blocked", "accumulated", "piled", "scattered",
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 71 |
+
# PUBLIC API
|
| 72 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 73 |
+
def extract_text_from_image(image_bytes: bytes) -> str:
    """
    Turn raw uploaded image bytes into clean, unbiased grievance text.

    The result contains only what was observed in the image (OCR text and/or
    a BLIP caption) — no templates or predefined phrases are ever added, so
    the downstream BERT classifier receives neutral input.

    Parameters
    ----------
    image_bytes : bytes
        Raw bytes from Flask request.files["image"].read()

    Returns
    -------
    str
        Observed description, e.g.:
        "garbage dumped on the road near residential area"
        "pothole on the main road"
        "water supply pipeline broken leaking on street"
        "stray dogs near garbage pile"
        Empty string on total failure (never raises).
    """
    if not image_bytes:
        return ""

    use_remote = _BACKEND == "hf_api" and bool(_HF_TOKEN)
    return _extract_via_hf_api(image_bytes) if use_remote else _extract_local(image_bytes)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 105 |
+
# STEP 1 — PREPROCESSING
|
| 106 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 107 |
+
def _preprocess_image(image_bytes: bytes) -> Image.Image:
    """
    Normalize an uploaded photo for best BLIP + OCR accuracy:
    - decode to RGB
    - cap the longest edge at 1024 px (aspect ratio preserved)
    - unsharp-mask to recover blurry phone shots
    - boost contrast by 20% so civic features stand out
    """
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

    width, height = image.size
    longest = max(width, height)
    if longest > 1024:
        ratio = 1024 / longest
        image = image.resize((int(width * ratio), int(height * ratio)), Image.LANCZOS)

    image = image.filter(ImageFilter.UnsharpMask(radius=1.5, percent=120, threshold=3))
    return ImageEnhance.Contrast(image).enhance(1.2)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 128 |
+
# STEP 2 — OCR
|
| 129 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 130 |
+
def _load_ocr():
    """
    Lazily initialize the shared EasyOCR reader (EN + HI + TE).

    Caches the reader in the module-global `_ocr_reader`; the sentinel
    string "unavailable" marks a permanent load failure so we never retry.
    Returns the reader, or None when OCR is unavailable.
    """
    global _ocr_reader
    if _ocr_reader is None:
        try:
            import easyocr
            print("🔄 Loading EasyOCR (EN + HI + TE)…")
            _ocr_reader = easyocr.Reader(
                ["en", "hi", "te"],
                gpu=torch.cuda.is_available(),
                verbose=False,
            )
            print("✅ EasyOCR loaded.")
        except ImportError:
            print("⚠️ easyocr not installed — run: pip install easyocr")
            _ocr_reader = "unavailable"
        except Exception as e:
            print(f"⚠️ EasyOCR load error: {e}")
            _ocr_reader = "unavailable"
    return None if _ocr_reader == "unavailable" else _ocr_reader
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def _run_ocr(img: Image.Image) -> str:
    """Extract visible text from image. Returns '' if nothing meaningful."""
    try:
        ocr = _load_ocr()
        if ocr is None:
            return ""
        # paragraph=True merges nearby fragments into readable sentences
        fragments = ocr.readtext(np.array(img), detail=0, paragraph=True)
        joined = " ".join(fragments).strip()
        # Fewer than 6 chars is noise, not a complaint
        if len(joined) >= 6:
            return joined
        return ""
    except Exception as e:
        print(f"[image_to_text] OCR error: {e}")
        return ""
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 168 |
+
# STEP 3 — BLIP MULTI-PROMPT CAPTIONING
|
| 169 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 170 |
+
def _load_blip():
    """
    Lazily load the BLIP-base processor + model onto `_DEVICE`.

    Caches both in module globals; the sentinel string "unavailable" on
    `_blip_model` marks a permanent failure so we never retry the download.
    Returns (processor, model), or (None, None) when captioning is unavailable.
    """
    global _blip_processor, _blip_model
    if _blip_model is None:
        try:
            from transformers import BlipProcessor, BlipForConditionalGeneration
            print("🔄 Loading BLIP-base captioning model (~450 MB)…")
            _blip_processor = BlipProcessor.from_pretrained(
                "Salesforce/blip-image-captioning-base"
            )
            _blip_model = BlipForConditionalGeneration.from_pretrained(
                "Salesforce/blip-image-captioning-base"
            ).to(_DEVICE)
            _blip_model.eval()
            print("✅ BLIP-base loaded.")
        except Exception as e:
            print(f"⚠️ BLIP load error: {e}")
            _blip_model = "unavailable"
    if _blip_model == "unavailable":
        return (None, None)
    return (_blip_processor, _blip_model)
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def _score_caption(caption: str) -> int:
    """Count civic keywords in caption. Used for selection only — never added to output."""
    tokens = set(re.findall(r'\b\w+\b', caption.lower()))
    return sum(1 for token in tokens if token in _CIVIC_KEYWORDS)
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def _run_blip_multi_prompt(img: Image.Image) -> str:
    """
    Run BLIP with 5 civic prompts.
    Returns the caption with the highest civic keyword score.
    Raw caption only — no extra words added.

    Each prompt in _CIVIC_PROMPTS is used as a conditional-captioning prefix;
    the candidate with the most civic keywords (see _score_caption) wins.
    If every prompted attempt is skipped or fails, one unconditional
    (prompt-free) caption is generated as a fallback.
    Returns "" only when BLIP is unavailable or every generation failed.
    """
    processor, model = _load_blip()
    if model is None:
        return ""

    best_caption = ""
    best_score = -1  # -1 so even a zero-keyword caption is kept as a candidate

    for prompt in _CIVIC_PROMPTS:
        try:
            inputs = processor(
                img,
                text = prompt,
                return_tensors = "pt",
            ).to(_DEVICE)

            # Beam search (5 beams) + no_repeat_ngram_size to avoid the
            # repetitive phrases BLIP tends to produce with greedy decoding
            with torch.no_grad():
                output = model.generate(
                    **inputs,
                    max_new_tokens = 60,
                    num_beams = 5,
                    early_stopping = True,
                    no_repeat_ngram_size = 3,
                )

            caption = processor.decode(output[0], skip_special_tokens=True).strip()

            # Skip if model just echoed the prompt with no new content
            if len(caption) <= len(prompt) + 5:
                continue

            score = _score_caption(caption)
            if score > best_score:
                best_score = score
                best_caption = caption

        except Exception as e:
            # One failed prompt must not abort the remaining attempts
            print(f"[image_to_text] Prompt failed: {e}")
            continue

    # Unconditional fallback — no prompt prefix, so no echo check needed
    if not best_caption:
        try:
            inputs = processor(img, return_tensors="pt").to(_DEVICE)
            with torch.no_grad():
                output = model.generate(
                    **inputs,
                    max_new_tokens = 60,
                    num_beams = 5,
                    no_repeat_ngram_size = 3,
                )
            best_caption = processor.decode(output[0], skip_special_tokens=True).strip()
        except Exception as e:
            print(f"[image_to_text] Unconditional fallback failed: {e}")

    return best_caption
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 261 |
+
# STEP 4 — CLEAN FUSION (no predefined phrases added)
|
| 262 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 263 |
+
def _is_redundant(text_a: str, text_b: str) -> bool:
|
| 264 |
+
"""True if text_a is >60% word-overlap with text_b (already covered)."""
|
| 265 |
+
words_a = set(text_a.lower().split())
|
| 266 |
+
words_b = set(text_b.lower().split())
|
| 267 |
+
if not words_a:
|
| 268 |
+
return True
|
| 269 |
+
return len(words_a & words_b) / len(words_a) > 0.6
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
def _fuse(ocr_text: str, caption: str) -> str:
|
| 273 |
+
"""
|
| 274 |
+
Combine OCR + caption into a single clean string.
|
| 275 |
+
|
| 276 |
+
Rules (no predefined text added at any point):
|
| 277 |
+
OCR > 20 chars → OCR is primary (it's the actual complaint text)
|
| 278 |
+
Caption appended only if it adds new information
|
| 279 |
+
OCR short/none → Caption is the output, as-is from BLIP
|
| 280 |
+
Both empty → return ""
|
| 281 |
+
"""
|
| 282 |
+
ocr = ocr_text.strip()
|
| 283 |
+
caption = caption.strip()
|
| 284 |
+
|
| 285 |
+
if len(ocr) > 20:
|
| 286 |
+
# OCR has the real complaint text — use it directly
|
| 287 |
+
if caption and not _is_redundant(caption, ocr):
|
| 288 |
+
return f"{ocr}. {caption}"
|
| 289 |
+
return ocr
|
| 290 |
+
|
| 291 |
+
if caption:
|
| 292 |
+
# Pure photo — BLIP caption is the output, nothing added
|
| 293 |
+
if ocr and not _is_redundant(ocr, caption):
|
| 294 |
+
return f"{caption}. {ocr}"
|
| 295 |
+
return caption
|
| 296 |
+
|
| 297 |
+
return ocr or caption
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 301 |
+
# LOCAL PIPELINE
|
| 302 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 303 |
+
def _extract_local(image_bytes: bytes) -> str:
    """Full local pipeline: preprocess → OCR → BLIP multi-prompt → fuse. Never raises."""
    try:
        image = _preprocess_image(image_bytes)
        ocr_text = _run_ocr(image)
        caption = _run_blip_multi_prompt(image)

        if ocr_text:
            print(f"[image_to_text] OCR: {ocr_text[:100]}")
        if caption:
            print(f"[image_to_text] Caption (score={_score_caption(caption)}): {caption[:100]}")

        fused = _fuse(ocr_text, caption)
        print(f"[image_to_text] ✅ Output: {fused[:160]}")
        return fused

    except Exception as e:
        print(f"[image_to_text] ❌ Pipeline failed: {e}")
        return ""
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 324 |
+
# HF API PATH — Render / production
|
| 325 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 326 |
+
def _extract_via_hf_api(image_bytes: bytes) -> str:
    """
    Production path — HuggingFace Inference API (BLIP-large on HF GPU).
    Raw caption returned as-is. No predefined text added.

    Fixes: handle the dict-shaped API response (consistent with the audio
    HF handler) and log non-ok HTTP statuses instead of silently dropping them.
    Returns "" on any failure — never raises.
    """
    import requests
    try:
        res = requests.post(
            "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large",
            headers={"Authorization": f"Bearer {_HF_TOKEN}"},
            data=image_bytes,
            timeout=30,
        )
        if res.ok:
            data = res.json()
            # HF API may return [{"generated_text": "..."}] or {"generated_text": "..."}
            if isinstance(data, list) and data:
                caption = data[0].get("generated_text", "").strip()
            elif isinstance(data, dict):
                caption = data.get("generated_text", "").strip()
            else:
                caption = ""
            print(f"[image_to_text] ✅ HF API output: {caption[:160]}")
            return caption

        # Surface the failure (rate limit, cold start, bad token) for debugging
        print(f"[image_to_text] HF API error {res.status_code}: {res.text[:200]}")
        return ""
    except Exception as e:
        print(f"[image_to_text] HF API error: {e}")
        return ""
|
requirements.txt
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ── Core ML / DL ──────────────────────────────────────────────
|
| 2 |
+
torch
|
| 3 |
+
transformers==5.2.0
|
| 4 |
+
tokenizers==0.22.2
|
| 5 |
+
accelerate>=1.1.0
|
| 6 |
+
safetensors>=0.4.3
|
| 7 |
+
huggingface-hub>=1.3.0
|
| 8 |
+
|
| 9 |
+
# ── Audio ──────────────────────────────────────────────────────
|
| 10 |
+
pydub
|
| 11 |
+
soundfile
|
| 12 |
+
scipy
|
| 13 |
+
|
| 14 |
+
# ── Image ──────────────────────────────────────────────────────
|
| 15 |
+
Pillow
|
| 16 |
+
easyocr
|
| 17 |
+
opencv-python-headless
|
| 18 |
+
|
| 19 |
+
# ── NLP / Text ─────────────────────────────────────────────────
|
| 20 |
+
sentencepiece
|
| 21 |
+
tiktoken
|
| 22 |
+
protobuf>=5.28.0
|
| 23 |
+
regex
|
| 24 |
+
nltk
|
| 25 |
+
indic-nlp-library
|
| 26 |
+
stopwordsiso
|
| 27 |
+
|
| 28 |
+
# ── Explainability ─────────────────────────────────────────────
|
| 29 |
+
captum
|
| 30 |
+
shap>=0.44
|
| 31 |
+
|
| 32 |
+
# ── Forecasting ────────────────────────────────────────────────
|
| 33 |
+
prophet
|
| 34 |
+
|
| 35 |
+
# ── Data / ML ──────────────────────────────────────────────────
|
| 36 |
+
pandas
|
| 37 |
+
numpy
|
| 38 |
+
scikit-learn
|
| 39 |
+
matplotlib
|
| 40 |
+
seaborn
|
| 41 |
+
|
| 42 |
+
# ── Backend ────────────────────────────────────────────────────
|
| 43 |
+
# Flask only — MongoDB + Cloudinary are handled by Express/Node
|
| 44 |
+
flask
|
| 45 |
+
flask-cors
|
| 46 |
+
gunicorn
|
| 47 |
+
werkzeug
requests==2.32.3
python-dotenv==1.0.1
|
sentiment_analysis/artifacts/indic_urgency_model/config.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"AlbertForSequenceClassification"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0,
|
| 6 |
+
"bos_token_id": 2,
|
| 7 |
+
"classifier_dropout_prob": 0.1,
|
| 8 |
+
"down_scale_factor": 1,
|
| 9 |
+
"dtype": "float32",
|
| 10 |
+
"embedding_size": 128,
|
| 11 |
+
"eos_token_id": 3,
|
| 12 |
+
"gap_size": 0,
|
| 13 |
+
"hidden_act": "gelu",
|
| 14 |
+
"hidden_dropout_prob": 0,
|
| 15 |
+
"hidden_size": 768,
|
| 16 |
+
"id2label": {
|
| 17 |
+
"0": "LABEL_0",
|
| 18 |
+
"1": "LABEL_1",
|
| 19 |
+
"2": "LABEL_2",
|
| 20 |
+
"3": "LABEL_3"
|
| 21 |
+
},
|
| 22 |
+
"initializer_range": 0.02,
|
| 23 |
+
"inner_group_num": 1,
|
| 24 |
+
"intermediate_size": 3072,
|
| 25 |
+
"label2id": {
|
| 26 |
+
"LABEL_0": 0,
|
| 27 |
+
"LABEL_1": 1,
|
| 28 |
+
"LABEL_2": 2,
|
| 29 |
+
"LABEL_3": 3
|
| 30 |
+
},
|
| 31 |
+
"layer_norm_eps": 1e-12,
|
| 32 |
+
"max_position_embeddings": 512,
|
| 33 |
+
"model_type": "albert",
|
| 34 |
+
"net_structure_type": 0,
|
| 35 |
+
"num_attention_heads": 12,
|
| 36 |
+
"num_hidden_groups": 1,
|
| 37 |
+
"num_hidden_layers": 12,
|
| 38 |
+
"num_memory_blocks": 0,
|
| 39 |
+
"pad_token_id": 0,
|
| 40 |
+
"problem_type": "single_label_classification",
|
| 41 |
+
"tie_word_embeddings": true,
|
| 42 |
+
"transformers_version": "5.1.0",
|
| 43 |
+
"type_vocab_size": 2,
|
| 44 |
+
"use_cache": false,
|
| 45 |
+
"vocab_size": 200000
|
| 46 |
+
}
|
sentiment_analysis/artifacts/indic_urgency_model/label_encoder.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:693a9eaa2ea5336e580da8eb27a85d30bb0e2100184c6a491f5d60f5df14abf7
|
| 3 |
+
size 275
|
sentiment_analysis/artifacts/indic_urgency_model/label_map.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d80d3aacf4a3ee5cabfa43c871d944fce68e1e4602d2154417d1c4ca3899edf7
|
| 3 |
+
size 194
|
sentiment_analysis/artifacts/indic_urgency_model/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5eb8a6dd044987f78c003f910440efb162c76d3b780ff2c0026c19158fac2df
|
| 3 |
+
size 14969267
|
sentiment_analysis/artifacts/indic_urgency_model/tokenizer_config.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": true,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": "[CLS]",
|
| 5 |
+
"cls_token": "[CLS]",
|
| 6 |
+
"do_lower_case": true,
|
| 7 |
+
"eos_token": "[SEP]",
|
| 8 |
+
"extra_special_tokens": [
|
| 9 |
+
"<pad>",
|
| 10 |
+
"[CLS]",
|
| 11 |
+
"[SEP]",
|
| 12 |
+
"[MASK]"
|
| 13 |
+
],
|
| 14 |
+
"is_local": false,
|
| 15 |
+
"keep_accents": false,
|
| 16 |
+
"mask_token": "[MASK]",
|
| 17 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 18 |
+
"pad_token": "<pad>",
|
| 19 |
+
"sep_token": "[SEP]",
|
| 20 |
+
"tokenizer_class": "AlbertTokenizer",
|
| 21 |
+
"trim_offsets": true,
|
| 22 |
+
"unk_id": 1,
|
| 23 |
+
"unk_token": "<unk>"
|
| 24 |
+
}
|
sentiment_analysis/artifacts/urgency_bert_model/config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_cross_attention": false,
|
| 3 |
+
"architectures": [
|
| 4 |
+
"BertForSequenceClassification"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"bos_token_id": null,
|
| 8 |
+
"classifier_dropout": null,
|
| 9 |
+
"dtype": "float32",
|
| 10 |
+
"eos_token_id": null,
|
| 11 |
+
"gradient_checkpointing": false,
|
| 12 |
+
"hidden_act": "gelu",
|
| 13 |
+
"hidden_dropout_prob": 0.1,
|
| 14 |
+
"hidden_size": 768,
|
| 15 |
+
"id2label": {
|
| 16 |
+
"0": "LABEL_0",
|
| 17 |
+
"1": "LABEL_1",
|
| 18 |
+
"2": "LABEL_2",
|
| 19 |
+
"3": "LABEL_3"
|
| 20 |
+
},
|
| 21 |
+
"initializer_range": 0.02,
|
| 22 |
+
"intermediate_size": 3072,
|
| 23 |
+
"is_decoder": false,
|
| 24 |
+
"label2id": {
|
| 25 |
+
"LABEL_0": 0,
|
| 26 |
+
"LABEL_1": 1,
|
| 27 |
+
"LABEL_2": 2,
|
| 28 |
+
"LABEL_3": 3
|
| 29 |
+
},
|
| 30 |
+
"layer_norm_eps": 1e-12,
|
| 31 |
+
"max_position_embeddings": 512,
|
| 32 |
+
"model_type": "bert",
|
| 33 |
+
"num_attention_heads": 12,
|
| 34 |
+
"num_hidden_layers": 12,
|
| 35 |
+
"pad_token_id": 0,
|
| 36 |
+
"position_embedding_type": "absolute",
|
| 37 |
+
"problem_type": "single_label_classification",
|
| 38 |
+
"tie_word_embeddings": true,
|
| 39 |
+
"transformers_version": "5.1.0",
|
| 40 |
+
"type_vocab_size": 2,
|
| 41 |
+
"use_cache": false,
|
| 42 |
+
"vocab_size": 30522
|
| 43 |
+
}
|
sentiment_analysis/artifacts/urgency_bert_model/label_encoder.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:693a9eaa2ea5336e580da8eb27a85d30bb0e2100184c6a491f5d60f5df14abf7
|
| 3 |
+
size 275
|
sentiment_analysis/artifacts/urgency_bert_model/label_map.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d80d3aacf4a3ee5cabfa43c871d944fce68e1e4602d2154417d1c4ca3899edf7
|
| 3 |
+
size 194
|
sentiment_analysis/artifacts/urgency_bert_model/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sentiment_analysis/artifacts/urgency_bert_model/tokenizer_config.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"backend": "tokenizers",
|
| 3 |
+
"cls_token": "[CLS]",
|
| 4 |
+
"do_lower_case": true,
|
| 5 |
+
"is_local": false,
|
| 6 |
+
"mask_token": "[MASK]",
|
| 7 |
+
"model_max_length": 512,
|
| 8 |
+
"pad_token": "[PAD]",
|
| 9 |
+
"sep_token": "[SEP]",
|
| 10 |
+
"strip_accents": null,
|
| 11 |
+
"tokenize_chinese_chars": true,
|
| 12 |
+
"tokenizer_class": "BertTokenizer",
|
| 13 |
+
"unk_token": "[UNK]"
|
| 14 |
+
}
|
sentiment_analysis/bert_model.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =========================================================
|
| 2 |
+
# BERT URGENCY MODEL TRAINING
|
| 3 |
+
# File: bert_model.py
|
| 4 |
+
# Purpose: Train urgency prediction (Low, Medium, High, Critical)
|
| 5 |
+
# =========================================================
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import re
|
| 9 |
+
import pickle
|
| 10 |
+
import pandas as pd
|
| 11 |
+
import numpy as np
|
| 12 |
+
import torch
|
| 13 |
+
|
| 14 |
+
from sklearn.model_selection import train_test_split
|
| 15 |
+
from sklearn.preprocessing import LabelEncoder
|
| 16 |
+
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, matthews_corrcoef
|
| 17 |
+
|
| 18 |
+
from transformers import (
|
| 19 |
+
BertTokenizer,
|
| 20 |
+
BertForSequenceClassification,
|
| 21 |
+
Trainer,
|
| 22 |
+
TrainingArguments
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
from torch.utils.data import Dataset
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# =========================================================
|
| 29 |
+
# PATH CONFIGURATION
|
| 30 |
+
# =========================================================
|
| 31 |
+
|
| 32 |
+
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 33 |
+
|
| 34 |
+
DATA_PATH = os.path.join(BASE_DIR, "urgency_train.csv")
|
| 35 |
+
|
| 36 |
+
ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
|
| 37 |
+
|
| 38 |
+
MODEL_DIR = os.path.join(ARTIFACT_DIR, "urgency_bert_model")
|
| 39 |
+
|
| 40 |
+
os.makedirs(MODEL_DIR, exist_ok=True)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# =========================================================
|
| 44 |
+
# PARAMETERS
|
| 45 |
+
# =========================================================
|
| 46 |
+
|
| 47 |
+
MAX_LENGTH = 128
|
| 48 |
+
EPOCHS = 4
|
| 49 |
+
BATCH_SIZE = 16
|
| 50 |
+
LEARNING_RATE = 2e-5
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# =========================================================
|
| 54 |
+
# LOAD DATASET
|
| 55 |
+
# =========================================================
|
| 56 |
+
|
| 57 |
+
# =========================================================
# LOAD DATASET
# =========================================================

print(f"\nLoading dataset from: {DATA_PATH}")

# Keep only the two columns the model needs, then drop rows that are
# unusable (missing values) or redundant (exact duplicates).
df = pd.read_csv(DATA_PATH)
df = df[["text", "urgency"]]
df = df.dropna()
df = df.drop_duplicates()
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
# =========================================================
|
| 69 |
+
# CLEAN TEXT
|
| 70 |
+
# =========================================================
|
| 71 |
+
|
| 72 |
+
def clean_text(text):
    """Normalise a grievance string: strip HTML tags, collapse whitespace."""
    without_tags = re.sub(r"<.*?>", " ", str(text))
    return re.sub(r"\s+", " ", without_tags).strip()
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
df["text"] = df["text"].apply(clean_text)


# =========================================================
# LABEL ENCODING
# =========================================================

label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["urgency"])

# Class-name -> integer-id mapping, persisted for use at inference time.
label_map = {
    cls: idx
    for cls, idx in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}

# Persist label artifacts next to the model weights.
for artifact_name, artifact in (
    ("label_encoder.pkl", label_encoder),
    ("label_map.pkl", label_map),
):
    with open(os.path.join(MODEL_DIR, artifact_name), "wb") as f:
        pickle.dump(artifact, f)

NUM_LABELS = len(label_map)
print("Classes:", label_map)


# =========================================================
# SPLIT DATA (70% train / 15% val / 15% test, stratified)
# =========================================================

train_df, temp_df = train_test_split(
    df, test_size=0.30, stratify=df["label_id"], random_state=42
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.50, stratify=temp_df["label_id"], random_state=42
)


# =========================================================
# TOKENIZER
# =========================================================

# Save the tokenizer alongside the model so inference loads a matched pair.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.save_pretrained(MODEL_DIR)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
# =========================================================
|
| 145 |
+
# DATASET CLASS
|
| 146 |
+
# =========================================================
|
| 147 |
+
|
| 148 |
+
class UrgencyDataset(Dataset):
    """Torch dataset wrapping tokenised grievance texts and urgency ids."""

    def __init__(self, texts, labels):
        # Tokenise the whole split up front; padding=True pads every
        # sequence to the longest one in the split (capped at MAX_LENGTH).
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH,
        )
        self.labels = list(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {
            name: torch.tensor(values[idx])
            for name, values in self.encodings.items()
        }
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
train_dataset = UrgencyDataset(train_df["text"], train_df["label_id"])
val_dataset = UrgencyDataset(val_df["text"], val_df["label_id"])
test_dataset = UrgencyDataset(test_df["text"], test_df["label_id"])


# =========================================================
# LOAD MODEL
# =========================================================

# Pretrained BERT encoder with a freshly initialised classification head
# sized to the number of urgency classes.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=NUM_LABELS,
)
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
# =========================================================
|
| 194 |
+
# METRICS
|
| 195 |
+
# =========================================================
|
| 196 |
+
|
| 197 |
+
def compute_metrics(eval_pred):
    """Trainer metric hook: accuracy, balanced accuracy, weighted F1, MCC."""
    logits, gold = eval_pred
    predicted = np.asarray(logits).argmax(axis=1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "balanced_accuracy": balanced_accuracy_score(gold, predicted),
        "f1_weighted": f1_score(gold, predicted, average="weighted"),
        "mcc": matthews_corrcoef(gold, predicted),
    }
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
# =========================================================
|
| 216 |
+
# TRAINING CONFIG
|
| 217 |
+
# =========================================================
|
| 218 |
+
|
| 219 |
+
# =========================================================
# TRAINING CONFIG
# =========================================================

# Collect hyper-parameters first, then build the HF config object.
trainer_kwargs = dict(
    output_dir=os.path.join(ARTIFACT_DIR, "results"),
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_steps=100,
    save_strategy="no",  # final weights are saved explicitly below
    report_to="none",    # disable wandb / tensorboard reporting
)
training_args = TrainingArguments(**trainer_kwargs)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

print("\nTraining urgency BERT model...")
trainer.train()


# =========================================================
# FINAL TEST EVALUATION
# =========================================================

predictions = trainer.predict(test_dataset)
y_true = predictions.label_ids
y_pred = predictions.predictions.argmax(axis=1)

print("\nFINAL TEST RESULTS")
print("Accuracy:", accuracy_score(y_true, y_pred))
print("F1:", f1_score(y_true, y_pred, average="weighted"))


# =========================================================
# SAVE MODEL
# =========================================================

trainer.save_model(MODEL_DIR)
print("\nUrgency BERT model saved successfully.")
|
| 267 |
+
|
| 268 |
+
|
sentiment_analysis/bert_predict.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =========================================================
|
| 2 |
+
# BERT URGENCY PREDICTION — ENGLISH
|
| 3 |
+
# =========================================================
|
| 4 |
+
|
| 5 |
+
import os
import pickle
import re

import torch
from transformers import BertTokenizer, BertForSequenceClassification
|
| 9 |
+
|
| 10 |
+
# ── Load artifacts ────────────────────────────────────────
BASE_DIR = os.path.dirname(__file__)
MODEL_DIR = os.path.join(BASE_DIR, "artifacts", "urgency_bert_model")

tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
model = BertForSequenceClassification.from_pretrained(MODEL_DIR)

# Fix: load the pickle via a context manager so the file handle is closed
# deterministically (pickle.load(open(...)) leaked it).
# NOTE: unpickling executes arbitrary code — only load trusted, locally
# produced training artifacts here.
with open(os.path.join(MODEL_DIR, "label_encoder.pkl"), "rb") as f:
    label_encoder = pickle.load(f)

model.eval()  # inference mode: disables dropout

MAX_LENGTH = 128  # must match the training-time tokeniser setting
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# ── Predict ───────────────────────────────────────────────
def predict_urgency(
    text: str,
    input_ids=None,        # O3: pre-tokenised tensor from main.py
    attention_mask=None,   # O3: pre-tokenised tensor from main.py
) -> dict:
    """
    Predict urgency level for English grievance text.

    Args:
        text           : Raw input string.
        input_ids      : Optional pre-tokenised tensor (1, seq_len).
        attention_mask : Required when input_ids is provided.

    Returns dict with keys: urgency, confidence, class_index.
    """
    # O3: use pre-tokenised tensors if supplied; otherwise tokenise now.
    if input_ids is None:
        # Fix: mirror the training-time preprocessing (clean_text in
        # bert_model.py) so inference sees the same normalisation the
        # model was trained on: strip HTML tags, collapse whitespace.
        cleaned = re.sub(r"<.*?>", " ", str(text))
        cleaned = re.sub(r"\s+", " ", cleaned).strip()
        enc = tokenizer(
            cleaned,
            return_tensors="pt",
            truncation=True,
            padding=False,
            max_length=MAX_LENGTH,
        )
        input_ids = enc["input_ids"]
        attention_mask = enc["attention_mask"]

    with torch.no_grad():  # inference only — no gradient bookkeeping
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    probs = torch.softmax(outputs.logits, dim=1)
    conf, pred = torch.max(probs, dim=1)

    predicted_index = pred.item()
    urgency = label_encoder.inverse_transform([predicted_index])[0]

    return {
        "urgency": urgency,
        "confidence": round(conf.item(), 4),
        "class_index": predicted_index,
    }
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def get_model_and_tokenizer():
    """Expose the loaded (model, tokenizer) pair for reuse by callers."""
    return model, tokenizer
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# ── Standalone test ───────────────────────────────────────
if __name__ == "__main__":
    print("\nBERT Urgency Prediction Test")
    while True:
        user_text = input("\nEnter grievance (or 'exit'): ")
        if user_text.lower() == "exit":
            break
        print(predict_urgency(user_text))
|
sentiment_analysis/indic_bert_model.py
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =========================================================
|
| 2 |
+
# INDICBERT URGENCY MODEL TRAINING
|
| 3 |
+
# File: indic_model.py
|
| 4 |
+
# Supports: Hindi + Telugu urgency prediction
|
| 5 |
+
# Labels: Low, Medium, High, Critical
|
| 6 |
+
# =========================================================
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import re
|
| 10 |
+
import pickle
|
| 11 |
+
import pandas as pd
|
| 12 |
+
import numpy as np
|
| 13 |
+
import torch
|
| 14 |
+
|
| 15 |
+
from sklearn.model_selection import train_test_split
|
| 16 |
+
from sklearn.preprocessing import LabelEncoder
|
| 17 |
+
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, matthews_corrcoef
|
| 18 |
+
|
| 19 |
+
from transformers import (
|
| 20 |
+
AutoTokenizer,
|
| 21 |
+
AutoModelForSequenceClassification,
|
| 22 |
+
Trainer,
|
| 23 |
+
TrainingArguments
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
from torch.utils.data import Dataset
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# =========================================================
|
| 30 |
+
# PATH CONFIG
|
| 31 |
+
# =========================================================
|
| 32 |
+
|
| 33 |
+
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 34 |
+
|
| 35 |
+
DATA_PATH = os.path.join(BASE_DIR, "urgency_indic.csv")
|
| 36 |
+
|
| 37 |
+
ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
|
| 38 |
+
|
| 39 |
+
MODEL_DIR = os.path.join(ARTIFACT_DIR, "indic_urgency_model")
|
| 40 |
+
|
| 41 |
+
os.makedirs(MODEL_DIR, exist_ok=True)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# =========================================================
|
| 45 |
+
# PARAMETERS
|
| 46 |
+
# =========================================================
|
| 47 |
+
|
| 48 |
+
MODEL_NAME = "ai4bharat/indic-bert"
|
| 49 |
+
|
| 50 |
+
MAX_LENGTH = 128
|
| 51 |
+
EPOCHS = 4
|
| 52 |
+
BATCH_SIZE = 16
|
| 53 |
+
LEARNING_RATE = 2e-5
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
# =========================================================
|
| 57 |
+
# LOAD DATASET
|
| 58 |
+
# =========================================================
|
| 59 |
+
|
| 60 |
+
print(f"\nLoading Indic urgency dataset from: {DATA_PATH}")

# Keep only the columns the model needs; drop missing and duplicate rows.
df = pd.read_csv(DATA_PATH)
df = df[["text", "urgency"]]
df = df.dropna()
df = df.drop_duplicates()
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
# =========================================================
|
| 72 |
+
# CLEAN TEXT
|
| 73 |
+
# =========================================================
|
| 74 |
+
|
| 75 |
+
def clean_text(text):
    """Normalise a grievance string: strip HTML tags, collapse whitespace."""
    without_tags = re.sub(r"<.*?>", " ", str(text))
    return re.sub(r"\s+", " ", without_tags).strip()
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
df["text"] = df["text"].apply(clean_text)


# =========================================================
# LABEL ENCODING
# =========================================================

label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["urgency"])

# Class-name -> integer-id mapping, persisted for use at inference time.
label_map = {
    cls: idx
    for cls, idx in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}

# Persist label artifacts next to the model weights.
for artifact_name, artifact in (
    ("label_encoder.pkl", label_encoder),
    ("label_map.pkl", label_map),
):
    with open(os.path.join(MODEL_DIR, artifact_name), "wb") as f:
        pickle.dump(artifact, f)

NUM_LABELS = len(label_map)
print("Classes:", label_map)


# =========================================================
# TRAIN / VAL / TEST SPLIT (70/15/15, stratified)
# =========================================================

train_df, temp_df = train_test_split(
    df, test_size=0.30, stratify=df["label_id"], random_state=42
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.50, stratify=temp_df["label_id"], random_state=42
)


# =========================================================
# TOKENIZER
# =========================================================

# Save the tokenizer alongside the model so inference loads a matched pair.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.save_pretrained(MODEL_DIR)
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
# =========================================================
|
| 148 |
+
# DATASET CLASS
|
| 149 |
+
# =========================================================
|
| 150 |
+
|
| 151 |
+
class IndicUrgencyDataset(Dataset):
    """Torch dataset wrapping tokenised Indic grievance texts and urgency ids."""

    def __init__(self, texts, labels):
        # Tokenise the whole split up front; padding=True pads every
        # sequence to the longest one in the split (capped at MAX_LENGTH).
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH,
        )
        self.labels = list(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {
            name: torch.tensor(values[idx])
            for name, values in self.encodings.items()
        }
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
train_dataset = IndicUrgencyDataset(train_df["text"], train_df["label_id"])
val_dataset = IndicUrgencyDataset(val_df["text"], val_df["label_id"])
test_dataset = IndicUrgencyDataset(test_df["text"], test_df["label_id"])


# =========================================================
# MODEL
# =========================================================

# Pretrained IndicBERT encoder with a freshly initialised classification
# head sized to the number of urgency classes.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
# =========================================================
|
| 204 |
+
# METRICS
|
| 205 |
+
# =========================================================
|
| 206 |
+
|
| 207 |
+
def compute_metrics(eval_pred):
    """Trainer metric hook: accuracy, balanced accuracy, weighted F1, MCC."""
    logits, gold = eval_pred
    predicted = np.asarray(logits).argmax(axis=1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "balanced_accuracy": balanced_accuracy_score(gold, predicted),
        "f1_weighted": f1_score(gold, predicted, average="weighted"),
        "mcc": matthews_corrcoef(gold, predicted),
    }
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
# =========================================================
|
| 226 |
+
# TRAINING CONFIG
|
| 227 |
+
# =========================================================
|
| 228 |
+
# =========================================================
# TRAINING CONFIG
# =========================================================
training_args = TrainingArguments(
    # Fix: build the output dir with os.path.join, consistent with the
    # path handling everywhere else in this file (was an f-string with a
    # hard-coded "/" separator).
    output_dir=os.path.join(ARTIFACT_DIR, "indic_results"),
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_steps=100,
    save_strategy="no",  # final weights are saved explicitly below
    report_to="none",    # disable wandb / tensorboard reporting
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

print("\nTraining IndicBERT urgency model...")
trainer.train()


# =========================================================
# SAVE MODEL
# =========================================================

trainer.save_model(MODEL_DIR)
print("\nIndicBERT urgency model saved successfully.")
|
sentiment_analysis/indic_bert_predict.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =========================================================
|
| 2 |
+
# INDICBERT URGENCY PREDICTION — HINDI + TELUGU
|
| 3 |
+
# =========================================================
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import re
|
| 7 |
+
import torch
|
| 8 |
+
import pickle
|
| 9 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 10 |
+
|
| 11 |
+
# ── Load artifacts ────────────────────────────────────────
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_DIR = os.path.join(BASE_DIR, "artifacts", "indic_urgency_model")

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model.eval()  # inference mode: disables dropout

# Label decoder persisted at training time (see indic_bert_model.py).
with open(os.path.join(MODEL_DIR, "label_encoder.pkl"), "rb") as fh:
    label_encoder = pickle.load(fh)

MAX_LENGTH = 128  # must match the training-time tokeniser setting
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# ── Text cleaning ─────────────────────────────────────────
def clean_text(text: str) -> str:
    """Strip HTML tags and collapse whitespace (training-time parity)."""
    without_tags = re.sub(r"<.*?>", " ", str(text))
    return re.sub(r"\s+", " ", without_tags).strip()
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# ── Predict ───────────────────────────────────────────────
def predict(
    text: str,
    input_ids=None,        # O3: pre-tokenised tensor from main.py
    attention_mask=None,   # O3: pre-tokenised tensor from main.py
) -> dict:
    """
    Predict urgency level for Hindi / Telugu grievance text.

    Args:
        text           : Raw input string.
        input_ids      : Optional pre-tokenised tensor (1, seq_len).
        attention_mask : Required when input_ids is provided.

    Returns dict with keys: urgency, confidence, class_index.
    """
    # O3: use pre-tokenised tensors if supplied; otherwise tokenise now.
    if input_ids is None:
        encoded = tokenizer(
            clean_text(text),
            return_tensors="pt",
            truncation=True,
            padding=False,
            max_length=MAX_LENGTH,
        )
        input_ids = encoded["input_ids"]
        attention_mask = encoded["attention_mask"]

    with torch.no_grad():  # inference only — no gradient bookkeeping
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    probabilities = torch.softmax(logits, dim=1)
    top_prob, top_idx = torch.max(probabilities, dim=1)
    class_index = top_idx.item()

    return {
        "urgency": label_encoder.inverse_transform([class_index])[0],
        "confidence": round(top_prob.item(), 4),
        "class_index": class_index,
    }
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def get_model_and_tokenizer():
    """Expose the loaded (model, tokenizer) pair for reuse by callers."""
    return model, tokenizer
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# ── Standalone test ───────────────────────────────────────
if __name__ == "__main__":
    while True:
        user_text = input("\nEnter Hindi/Telugu grievance (or 'exit'): ")
        if user_text.lower() == "exit":
            break
        print(predict(user_text))
|