MOHAN799S committed on
Commit
8da2d54
·
1 Parent(s): 53e8064

Deploy CivicConnect AI Engine — BERT + BLIP + EasyOCR + Whisper API

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .dockerignore +55 -0
  2. .gitattributes +2 -0
  3. .gitignore +51 -0
  4. Dockerfile +67 -0
  5. classification/artifacts/bert_model/config.json +51 -0
  6. classification/artifacts/indic_test.csv +0 -0
  7. classification/artifacts/indic_tokenizer.pkl +3 -0
  8. classification/artifacts/indic_train.csv +0 -0
  9. classification/artifacts/indic_val.csv +0 -0
  10. classification/artifacts/indicbert_model/config.json +54 -0
  11. classification/artifacts/indicbert_model/tokenizer.json +3 -0
  12. classification/artifacts/indicbert_model/tokenizer_config.json +24 -0
  13. classification/artifacts/label_encoder.pkl +3 -0
  14. classification/artifacts/label_map.pkl +3 -0
  15. classification/artifacts/test.csv +523 -0
  16. classification/artifacts/tokenizer.pkl +3 -0
  17. classification/artifacts/train.csv +0 -0
  18. classification/artifacts/val.csv +523 -0
  19. classification/bert_classify.py +164 -0
  20. classification/bert_model.py +417 -0
  21. classification/classification/artifacts/label_encoder.pkl +3 -0
  22. classification/classification/artifacts/label_map.pkl +3 -0
  23. classification/indic_bert_classify.py +142 -0
  24. classification/indic_bert_model.py +299 -0
  25. classification/indic_train.csv +0 -0
  26. classification/train.csv +0 -0
  27. gfas/__init__.py +9 -0
  28. gfas/disparity_analysis.py +156 -0
  29. gfas/fairness_audit.py +111 -0
  30. gfas/fairness_metrics.py +80 -0
  31. gfas/gfas_engine.py +81 -0
  32. gfas/report_generator.py +93 -0
  33. main.py +707 -0
  34. multi_modal/audio_to_text.py +463 -0
  35. multi_modal/image_to_text.py +346 -0
  36. requirements.txt +50 -0
  37. sentiment_analysis/artifacts/indic_urgency_model/config.json +46 -0
  38. sentiment_analysis/artifacts/indic_urgency_model/label_encoder.pkl +3 -0
  39. sentiment_analysis/artifacts/indic_urgency_model/label_map.pkl +3 -0
  40. sentiment_analysis/artifacts/indic_urgency_model/tokenizer.json +3 -0
  41. sentiment_analysis/artifacts/indic_urgency_model/tokenizer_config.json +24 -0
  42. sentiment_analysis/artifacts/urgency_bert_model/config.json +43 -0
  43. sentiment_analysis/artifacts/urgency_bert_model/label_encoder.pkl +3 -0
  44. sentiment_analysis/artifacts/urgency_bert_model/label_map.pkl +3 -0
  45. sentiment_analysis/artifacts/urgency_bert_model/tokenizer.json +0 -0
  46. sentiment_analysis/artifacts/urgency_bert_model/tokenizer_config.json +14 -0
  47. sentiment_analysis/bert_model.py +268 -0
  48. sentiment_analysis/bert_predict.py +82 -0
  49. sentiment_analysis/indic_bert_model.py +260 -0
  50. sentiment_analysis/indic_bert_predict.py +89 -0
.dockerignore ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ── Version control ───────────────────────────────────────
2
+ .git
3
+ .gitignore
4
+
5
+ # ── Python cache ──────────────────────────────────────────
6
+ __pycache__
7
+ *.pyc
8
+ *.pyo
9
+ *.pyd
10
+ .Python
11
+ *.egg-info
12
+ dist/
13
+ build/
14
+ .eggs/
15
+
16
+ # ── Virtual environments ──────────────────────────────────
17
+ .venv
18
+ venv/
19
+ env/
20
+ ENV/
21
+
22
+ # ── Local secrets / config ────────────────────────────────
23
+ .env
24
+ .env.*
25
+ !.env.example
26
+
27
+ # ── Test / dev artefacts ──────────────────────────────────
28
+ tests/
29
+ *.test.py
30
+ pytest.ini
31
+ .pytest_cache/
32
+ .coverage
33
+ htmlcov/
34
+
35
+ # ── Jupyter notebooks ─────────────────────────────────────
36
+ *.ipynb
37
+ .ipynb_checkpoints/
38
+
39
+ # ── OS junk ───────────────────────────────────────────────
40
+ .DS_Store
41
+ Thumbs.db
42
+
43
+ # ── Docs / CI (not needed at runtime) ─────────────────────
44
+ docs/
45
+ *.md
46
+ !README.md
47
+ .github/
48
+
49
+ # ── Large local model checkpoints (downloaded at runtime) ─
50
+ # Comment these out if you bundle models into the image.
51
+ models/
52
+ *.bin
53
+ *.safetensors
54
+ *.pt
55
+ *.ckpt
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ classification/artifacts/indicbert_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ sentiment_analysis/artifacts/indic_urgency_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Patterns below were originally generated via a PowerShell here-string; the stray shell wrapper lines are commented out so git does not treat them as ignore patterns.
2
+ # @"   (stray PowerShell here-string delimiter — commented out; not a valid ignore pattern)
3
+ __pycache__/
4
+ *.pyc
5
+ *.pyo
6
+ .env
7
+ venv/
8
+ .venv/
9
+ *.log
10
+ classification/artifacts/results/
11
+ classification/artifacts/indic_results/
12
+ sentiment_analysis/artifacts/results/
13
+ sentiment_analysis/artifacts/indic_results/
14
+ # "@ | Out-File -FilePath .gitignore -Encoding utf8   (stray shell command — commented out)
15
+
16
+ # Python
17
+ __pycache__/
18
+ *.py[cod]
19
+ *.egg-info/
20
+ dist/
21
+ build/
22
+ .eggs/
23
+
24
+ # Environments
25
+ .venv/
26
+ venv/
27
+ env/
28
+
29
+ # Secrets
30
+ .env
31
+ *.key
32
+
33
+ # Models (large binaries — use HF Hub or Git LFS)
34
+ models/
35
+ *.bin
36
+ *.safetensors
37
+ *.pt
38
+ *.ckpt
39
+
40
+ # OS
41
+ .DS_Store
42
+ Thumbs.db
43
+
44
+ # IDE
45
+ .vscode/
46
+ .idea/
47
+
48
+ # Test artefacts
49
+ .pytest_cache/
50
+ .coverage
51
+ htmlcov/
Dockerfile ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =========================================================
2
+ # CivicConnect — Flask AI Engine
3
+ # Deploy target: Hugging Face Spaces (Docker SDK)
4
+ # =========================================================
5
+
6
+ FROM python:3.11-slim
7
+
8
+ # ── System dependencies ───────────────────────────────────
9
+ # ffmpeg → pydub audio decode (webm/ogg/mp3 → wav)
10
+ # libsndfile1 → soundfile (WAV/FLAC fallback)
11
+ # OpenGL runtime + libglib2.0-0 → EasyOCR / OpenCV headless
12
+ # libgomp1 → PyTorch multi-threaded CPU ops
13
+ # git → HF model downloads via git-lfs if needed
14
+ # curl → health-check probes / HF API calls
15
+ RUN apt-get update && apt-get install -y --no-install-recommends \
16
+ ffmpeg \
17
+ libsndfile1 \
18
+ libgl1 \
19
+ libglib2.0-0 \
20
+ libgomp1 \
21
+ git \
22
+ curl \
23
+ && rm -rf /var/lib/apt/lists/*
24
+
25
+ # ── Create non-root user (HF Spaces requirement) ─────────
26
+ RUN useradd -m -u 1000 appuser
27
+
28
+ # ── Set working directory ─────────────────────────────────
29
+ WORKDIR /app
30
+
31
+ # ── Copy requirements first (layer cache) ────────────────
32
+ COPY requirements.txt .
33
+
34
+ # ── Install Python dependencies ───────────────────────────
35
+ RUN pip install --no-cache-dir --upgrade pip \
36
+ && pip install --no-cache-dir -r requirements.txt
37
+
38
+ # ── Copy application source ───────────────────────────────
39
+ COPY --chown=appuser:appuser . .
40
+
41
+ # ── Environment defaults (overridden by HF Secrets) ──────
42
+ ENV PORT=7860
43
+ ENV PYTHONUNBUFFERED=1
44
+ ENV HF_HOME=/app/.cache/huggingface
45
+
46
+ # ── Switch to non-root user ───────────────────────────────
47
+ USER appuser
48
+
49
+ # ── Expose port ───────────────────────────────────────────
50
+ EXPOSE 7860
51
+
52
+ # ── Healthcheck ───────────────────────────────────────────
53
+ HEALTHCHECK --interval=60s --timeout=10s --start-period=120s --retries=3 \
54
+ CMD curl -f http://localhost:7860/ || exit 1
55
+
56
+ # ── Start server ──────────────────────────────────────────
57
+ # 1 worker only — models are loaded once at startup (global state).
58
+ # 600s timeout handles audio+image (Whisper large-v3 ≈ 2-3 min on CPU).
59
+ CMD ["gunicorn", \
60
+ "--bind", "0.0.0.0:7860", \
61
+ "--workers", "1", \
62
+ "--timeout", "600", \
63
+ "--keep-alive", "5", \
64
+ "--log-level", "info", \
65
+ "--access-logfile", "-", \
66
+ "--error-logfile", "-", \
67
+ "main:app"]
classification/artifacts/bert_model/config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": null,
8
+ "classifier_dropout": null,
9
+ "dtype": "float32",
10
+ "eos_token_id": null,
11
+ "gradient_checkpointing": false,
12
+ "hidden_act": "gelu",
13
+ "hidden_dropout_prob": 0.1,
14
+ "hidden_size": 768,
15
+ "id2label": {
16
+ "0": "LABEL_0",
17
+ "1": "LABEL_1",
18
+ "2": "LABEL_2",
19
+ "3": "LABEL_3",
20
+ "4": "LABEL_4",
21
+ "5": "LABEL_5",
22
+ "6": "LABEL_6",
23
+ "7": "LABEL_7"
24
+ },
25
+ "initializer_range": 0.02,
26
+ "intermediate_size": 3072,
27
+ "is_decoder": false,
28
+ "label2id": {
29
+ "LABEL_0": 0,
30
+ "LABEL_1": 1,
31
+ "LABEL_2": 2,
32
+ "LABEL_3": 3,
33
+ "LABEL_4": 4,
34
+ "LABEL_5": 5,
35
+ "LABEL_6": 6,
36
+ "LABEL_7": 7
37
+ },
38
+ "layer_norm_eps": 1e-12,
39
+ "max_position_embeddings": 512,
40
+ "model_type": "bert",
41
+ "num_attention_heads": 12,
42
+ "num_hidden_layers": 12,
43
+ "pad_token_id": 0,
44
+ "position_embedding_type": "absolute",
45
+ "problem_type": "single_label_classification",
46
+ "tie_word_embeddings": true,
47
+ "transformers_version": "5.0.0",
48
+ "type_vocab_size": 2,
49
+ "use_cache": false,
50
+ "vocab_size": 30522
51
+ }
classification/artifacts/indic_test.csv ADDED
The diff for this file is too large to render. See raw diff
 
classification/artifacts/indic_tokenizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa2396c5b53605359d466d67fc892aaca020711ae8ac7b7ab2fd9336d82c428c
3
+ size 14979445
classification/artifacts/indic_train.csv ADDED
The diff for this file is too large to render. See raw diff
 
classification/artifacts/indic_val.csv ADDED
The diff for this file is too large to render. See raw diff
 
classification/artifacts/indicbert_model/config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "AlbertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0,
6
+ "bos_token_id": 2,
7
+ "classifier_dropout_prob": 0.1,
8
+ "down_scale_factor": 1,
9
+ "dtype": "float32",
10
+ "embedding_size": 128,
11
+ "eos_token_id": 3,
12
+ "gap_size": 0,
13
+ "hidden_act": "gelu",
14
+ "hidden_dropout_prob": 0,
15
+ "hidden_size": 768,
16
+ "id2label": {
17
+ "0": "LABEL_0",
18
+ "1": "LABEL_1",
19
+ "2": "LABEL_2",
20
+ "3": "LABEL_3",
21
+ "4": "LABEL_4",
22
+ "5": "LABEL_5",
23
+ "6": "LABEL_6",
24
+ "7": "LABEL_7"
25
+ },
26
+ "initializer_range": 0.02,
27
+ "inner_group_num": 1,
28
+ "intermediate_size": 3072,
29
+ "label2id": {
30
+ "LABEL_0": 0,
31
+ "LABEL_1": 1,
32
+ "LABEL_2": 2,
33
+ "LABEL_3": 3,
34
+ "LABEL_4": 4,
35
+ "LABEL_5": 5,
36
+ "LABEL_6": 6,
37
+ "LABEL_7": 7
38
+ },
39
+ "layer_norm_eps": 1e-12,
40
+ "max_position_embeddings": 512,
41
+ "model_type": "albert",
42
+ "net_structure_type": 0,
43
+ "num_attention_heads": 12,
44
+ "num_hidden_groups": 1,
45
+ "num_hidden_layers": 12,
46
+ "num_memory_blocks": 0,
47
+ "pad_token_id": 0,
48
+ "problem_type": "single_label_classification",
49
+ "tie_word_embeddings": true,
50
+ "transformers_version": "5.1.0",
51
+ "type_vocab_size": 2,
52
+ "use_cache": false,
53
+ "vocab_size": 200000
54
+ }
classification/artifacts/indicbert_model/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d34df3ca6a5769c1f8ae24a1e64517f3c37a934fd221d9a2ae2c5164d5e21be5
3
+ size 14969520
classification/artifacts/indicbert_model/tokenizer_config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": true,
3
+ "backend": "tokenizers",
4
+ "bos_token": "[CLS]",
5
+ "cls_token": "[CLS]",
6
+ "do_lower_case": true,
7
+ "eos_token": "[SEP]",
8
+ "extra_special_tokens": [
9
+ "<pad>",
10
+ "[CLS]",
11
+ "[SEP]",
12
+ "[MASK]"
13
+ ],
14
+ "is_local": false,
15
+ "keep_accents": false,
16
+ "mask_token": "[MASK]",
17
+ "model_max_length": 1000000000000000019884624838656,
18
+ "pad_token": "<pad>",
19
+ "sep_token": "[SEP]",
20
+ "tokenizer_class": "AlbertTokenizer",
21
+ "trim_offsets": true,
22
+ "unk_id": 1,
23
+ "unk_token": "<unk>"
24
+ }
classification/artifacts/label_encoder.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b0be0d88eed1838fba777af266556aea55e435b970076684d2ad1c8c9b3fb0b
3
+ size 342
classification/artifacts/label_map.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8e10c5614e117fd9ccab4af3fa62c0e4c44d23195847586d4d1ddb47f4a00cc
3
+ size 321
classification/artifacts/test.csv ADDED
@@ -0,0 +1,523 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ text,label,label_id
2
+ Transformer emits burning smell,Electricity,0
3
+ Power failures interrupt electric signaling systems causing traffic confusion,Electricity,0
4
+ Water main repair causing disruption,Water,7
5
+ Stray dogs near playgrounds scaring children,Stray Animals,6
6
+ Buses running without first aid kits,Public Transport,3
7
+ Electric poles damaged near highway,Electricity,0
8
+ No information boards at bus stops,Public Transport,3
9
+ Low voltage causing fan malfunction,Electricity,0
10
+ Stray animals damage public seating and benches,Stray Animals,6
11
+ Pigs damaging road surfaces,Stray Animals,6
12
+ Overall public transport problem in locality,Public Transport,3
13
+ Stray dogs barking loudly at night,Stray Animals,6
14
+ The ongoing noise from the water pump is making it difficult for families to rest and relax at home,Water,7
15
+ Street dogs biting pedestrians,Stray Animals,6
16
+ Household waste overflow leads to foul odor spreading into nearby streets,Garbage,1
17
+ Overflowing dustbins near temple,Garbage,1
18
+ Roads are left uneven after utility maintenance work,Roads,4
19
+ Waste not collected from high-rise apartments,Garbage,1
20
+ Air pollution impacts children outdoor play,Pollution,2
21
+ Irregular bus timings causing inconvenience,Public Transport,3
22
+ Delayed trains disrupt workforce movement across city zones,Public Transport,3
23
+ Dust from unpaved road affecting residents,Pollution,2
24
+ Unreliable water supply affects small businesses and shops,Water,7
25
+ Bus shelters without proper signage,Public Transport,3
26
+ Industrial noise pollution disrupts normal daily activities,Pollution,2
27
+ The water pump noise persists for long durations causing mental stress,Water,7
28
+ Road near temple congested during festivals,Roads,4
29
+ Inadequate fleet capacity increases wait times,Public Transport,3
30
+ Accumulated waste near street corner,Garbage,1
31
+ Sanitation workers absent on weekends,Sanitation,5
32
+ Stray animals damage underground cables and water pipelines,Stray Animals,6
33
+ Road drainage damaged,Roads,4
34
+ Garbage has not been cleared from the market area for several days,Garbage,1
35
+ Water contamination affects public health outcomes,Water,7
36
+ Road markings missing on newly constructed road,Roads,4
37
+ Industrial waste contaminating groundwater,Pollution,2
38
+ Water supply pressure varies widely between floors,Water,7
39
+ Plastic waste dumped near playground,Garbage,1
40
+ Electrical outages disable automated road safety equipment creating hazardous conditions,Electricity,0
41
+ Garbage accumulation problem,Garbage,1
42
+ Pollutants from fuel combustion linger in enclosed transport zones,Pollution,2
43
+ Road construction debris narrows lanes and slows vehicles,Roads,4
44
+ Noise pollution increases mental fatigue,Pollution,2
45
+ Power outages interrupt data driven traffic optimization,Electricity,0
46
+ Public transport breakdowns increase dependency on informal transit options,Public Transport,3
47
+ Uncontrolled dumping of waste is leading to severe soil contamination,Pollution,2
48
+ Industrial fumes causing bad odor in colony,Pollution,2
49
+ Water supply interrupted by municipal work,Water,7
50
+ Garbage collection staff do not cover this street regularly,Garbage,1
51
+ Residents complain of constant disturbance caused by water pump vibrations and sound,Water,7
52
+ Poor road lighting affects night time driving safety,Roads,4
53
+ Smoke from factories causing health issues,Pollution,2
54
+ Street clogged due to construction,Roads,4
55
+ Stray goats eating plants in public gardens,Stray Animals,6
56
+ Dogs blocking roads and footpaths,Stray Animals,6
57
+ The water pump generates ongoing noise that affects mental calmness of residents,Water,7
58
+ Electric supply interruptions disrupt essential household chores,Electricity,0
59
+ Stop start driving patterns significantly increase emission output,Pollution,2
60
+ Streetlights not operational,Electricity,0
61
+ Street corners clogged with wet waste,Sanitation,5
62
+ Inefficient driving patterns raise overall emission output,Pollution,2
63
+ Drinking water quality has deteriorated significantly over the past few months,Water,7
64
+ Unsegregated waste decomposition is polluting the air and surrounding land,Pollution,2
65
+ Odor from chemical treatment plant near houses,Pollution,2
66
+ Noise pollution disrupts residential sleep cycles,Pollution,2
67
+ Stray goats blocking sidewalks,Stray Animals,6
68
+ Road bottlenecks from incomplete construction trap vehicles in narrow corridors,Roads,4
69
+ Stray cattle damage public gardens and green belts,Stray Animals,6
70
+ Overhead tank pump malfunctioning,Water,7
71
+ Potholes near bus stop causing delays,Roads,4
72
+ Stray animals causing fear among women residents,Stray Animals,6
73
+ Waste collection vehicles arrive at irregular times causing inconvenience,Sanitation,5
74
+ Roads are left damaged after cable laying work,Roads,4
75
+ Open burning of waste releases toxic smoke affecting nearby households,Pollution,2
76
+ Poor road finishing is leading to early deterioration,Roads,4
77
+ No proper lighting inside buses at night,Public Transport,3
78
+ Aggressive stray dogs attacking children,Stray Animals,6
79
+ Drain water mixing with rainwater,Sanitation,5
80
+ Street corners full of waste,Sanitation,5
81
+ No drinking water facility in our street,Water,7
82
+ Stray dogs roaming in parks causing fear,Stray Animals,6
83
+ Nighttime operation of the water pump is leading to frequent sleep interruptions for nearby residents,Water,7
84
+ Traffic signals not visible,Roads,4
85
+ Traffic signals not synchronized,Roads,4
86
+ The water pump produces nonstop noise causing stress,Water,7
87
+ Low water pressure in commercial buildings,Water,7
88
+ Roadside drainage overflowing,Roads,4
89
+ Garbage left near drainage channels contributes to blockages during rainfall,Garbage,1
90
+ Persistent congestion contributes to chronic air pollution exposure,Pollution,2
91
+ Public water tap is broken,Water,7
92
+ Overcharging by bus conductors,Public Transport,3
93
+ Waterlogging on damaged roads makes them impassable,Roads,4
94
+ Power supply resumes late,Electricity,0
95
+ Animal presence affects pedestrian safety,Stray Animals,6
96
+ Roadside damage narrows lanes and slows traffic,Roads,4
97
+ Electric failures disrupt smart road analytics causing unmanaged flow,Electricity,0
98
+ Lack of buses in early morning hours,Public Transport,3
99
+ Low water pressure in offices,Water,7
100
+ Animal movement causes repeated traffic interruptions,Stray Animals,6
101
+ Fuse blowing repeatedly in our neighborhood,Electricity,0
102
+ Pollution due to illegal dumping,Pollution,2
103
+ Residents report difficulty sleeping due to loud water pump operation,Water,7
104
+ Garbage pile near market area,Garbage,1
105
+ Road construction debris increases airborne particulate matter,Pollution,2
106
+ Stray animals block footpaths forcing pedestrians onto roads,Stray Animals,6
107
+ Road signage failures cause navigation confusion,Roads,4
108
+ Water pipeline damage reduces distribution efficiency,Water,7
109
+ Electric poles obstruct roads and pedestrian pathways,Electricity,0
110
+ Low voltage affecting fans and lights,Electricity,0
111
+ Waste disposal points are not properly sanitized,Sanitation,5
112
+ No shelter for injured stray animals,Stray Animals,6
113
+ Power outage affecting schools,Electricity,0
114
+ Stray animals biting delivery workers,Stray Animals,6
115
+ Sanitation workers not equipped with tools,Sanitation,5
116
+ Electric wires spark during strong winds,Electricity,0
117
+ Irregular tanker delivery schedules increase uncertainty,Water,7
118
+ Road network imbalance shifts traffic to residential streets,Roads,4
119
+ Delayed water pipeline repairs increase hardship,Water,7
120
+ Electric failures shut down borewell motors affecting residential water access,Electricity,0
121
+ Frequent power cuts in monsoon,Electricity,0
122
+ Garbage bins overflow during weekends due to lack of timely pickup,Garbage,1
123
+ Pollution from roadside burning affects nearby shops,Pollution,2
124
+ Street clogged due to parked vehicles,Roads,4
125
+ Power outages cause loss of productivity for freelancers,Electricity,0
126
+ Open waste dumping encourages animal congregation near residences,Garbage,1
127
+ Pipeline repair delayed due to traffic,Water,7
128
+ No animal control measures implemented,Stray Animals,6
129
+ Stray animals causing accidents on roads,Stray Animals,6
130
+ Garbage overflowing near house,Garbage,1
131
+ Public toilets without privacy,Sanitation,5
132
+ Improper waste handling causes frequent odor problems,Sanitation,5
133
+ Water supply stopped without notice,Water,7
134
+ Bus stops not visible at night,Public Transport,3
135
+ Dust pollution due to construction work,Pollution,2
136
+ Stagnant water near market street,Sanitation,5
137
+ Bus routes not covering industrial corridors,Public Transport,3
138
+ Uneven road height is causing frequent vehicle underbody damage,Roads,4
139
+ Garbage collection lacks proper supervision,Garbage,1
140
+ Noise pollution from night time construction disrupts sleep,Pollution,2
141
+ Streetlights not repaired after accident,Roads,4
142
+ Drain water on footpath,Sanitation,5
143
+ Garbage disposal issue,Garbage,1
144
+ Water supply irregular in community area,Water,7
145
+ Transformer overloaded during peak load,Electricity,0
146
+ Transport hubs suffer from sanitation and crowd management issues,Public Transport,3
147
+ Road shoulder eroded near river bank,Roads,4
148
+ No buses for late-night travel,Public Transport,3
149
+ Sanitation services decline during public holidays,Sanitation,5
150
+ Residents report disturbance from water pump operation,Water,7
151
+ Stray animals disrupt public spaces frequently,Stray Animals,6
152
+ Dirty streets near bus stand,Sanitation,5
153
+ Street handpump dry for several days,Water,7
154
+ Power instability affects electric vehicle infrastructure availability,Electricity,0
155
+ Waste accumulation is contaminating nearby stormwater drains,Pollution,2
156
+ Stray animals causing dirt accumulation near markets,Stray Animals,6
157
+ Air pollution from cement factory chimney,Pollution,2
158
+ Water from tap has suspended particles,Water,7
159
+ Stray animals remain untreated for diseases spreading infection,Stray Animals,6
160
+ Drivers refusing service in rain,Public Transport,3
161
+ Public sanitation issues affect overall quality of life,Sanitation,5
162
+ Transformer emits smoke,Electricity,0
163
+ Waste dumped illegally,Garbage,1
164
+ Road capacity limitations force vehicles into dense clusters,Roads,4
165
+ Sewage smell near bus stand,Sanitation,5
166
+ Air pollution from diesel generators in colony,Pollution,2
167
+ Industrial pollution damages surrounding ecosystems,Pollution,2
168
+ Road safety is compromised due to roaming animals,Stray Animals,6
169
+ Dust from demolition site near playground,Pollution,2
170
+ Streetlights not working near bus stop,Electricity,0
171
+ Noise pollution from train operations,Pollution,2
172
+ Garbage collection staff leave waste behind after partial pickup,Garbage,1
173
+ Blocked drains causing flooding in colony,Sanitation,5
174
+ Polluted drainage water seeps into residential plots,Pollution,2
175
+ Mud road causing dust problem,Roads,4
176
+ Road repair materials used are of very poor quality,Roads,4
177
+ Dust from quarry affecting local residents,Pollution,2
178
+ Garbage collection stopped,Garbage,1
179
+ Footpath uneven and unsafe,Roads,4
180
+ Pipeline under construction causing water shortage,Water,7
181
+ Stalling engines emit excessive smoke degrading air quality,Pollution,2
182
+ Garbage heaps are obstructing traffic and pedestrian movement,Garbage,1
183
+ Lack of regular street sweeping leads to dust and waste accumulation,Sanitation,5
184
+ Electric system faults disrupt coordination of transport infrastructure,Electricity,0
185
+ Persistent congestion sustains unhealthy air quality levels,Pollution,2
186
+ Slow moving traffic produces higher emissions per distance traveled,Pollution,2
187
+ Pipeline damage causing water supply disruption,Water,7
188
+ Rainwater flooding water storage area,Water,7
189
+ Water tankers are irregular and insufficient to meet residential requirements,Water,7
190
+ Garbage not cleared regularly,Garbage,1
191
+ Stray cattle wander into drainage channels causing blockages,Stray Animals,6
192
+ Vehicles forced closer together increase localized emission density,Pollution,2
193
+ Polluted waste disposal attracts stray animals,Pollution,2
194
+ Unvaccinated stray dogs increase the risk of rabies in the neighborhood,Stray Animals,6
195
+ Overhead wires hanging low,Electricity,0
196
+ Frequent power interruptions near major intersections cause traffic buildup and prolonged vehicle idling affecting air quality,Electricity,0
197
+ Water cuts affecting hospital area,Water,7
198
+ Dust from open construction affecting local residents,Pollution,2
199
+ Electricity outages disrupt water pumping operations,Electricity,0
200
+ Uneven road surface near hospital entrance,Roads,4
201
+ Congestion related emissions worsen air quality in commercial districts,Pollution,2
202
+ Noise pollution near hospital affecting patients,Pollution,2
203
+ Garbage not removed from community center,Garbage,1
204
+ Power instability disrupts electric vehicle adoption in urban transport corridors,Electricity,0
205
+ Persistent humming from the water pump is causing stress and discomfort for residents living close to the facility,Water,7
206
+ Road near bus stand full of potholes,Roads,4
207
+ Stray dogs attack pets in apartment complexes,Stray Animals,6
208
+ Water from taps has sand particles,Water,7
209
+ Environmental hazards in residential zone,Pollution,2
210
+ Unsafe travel conditions for women,Public Transport,3
211
+ Garbage accumulation is increasing environmental risk,Garbage,1
212
+ Dirty water from taps during rainy season,Water,7
213
+ Stray animals creating mess near water bodies,Stray Animals,6
214
+ Sanitation systems fail leading to sewage water backing up onto roads,Sanitation,5
215
+ Street lighting outages increase accident risk on poorly visible roads,Electricity,0
216
+ Residents are affected by water pump sound levels exceeding acceptable limits,Water,7
217
+ Stray cattle obstruct parking areas in residential zones,Stray Animals,6
218
+ Long waiting time for buses,Public Transport,3
219
+ Bus stops not cleaned after festivals,Public Transport,3
220
+ Water leaking under road surface,Water,7
221
+ Buses without GPS updates,Public Transport,3
222
+ Streetlights not working on Elm Road,Roads,4
223
+ Water shortage impacts daily cleaning and sanitation,Water,7
224
+ Stray animals creating mess in community areas,Stray Animals,6
225
+ Buses overcrowded during festivals,Public Transport,3
226
+ Pollution affecting children and elderly,Pollution,2
227
+ Public toilets lack regular cleaning schedules,Sanitation,5
228
+ Damaged roads increase travel uncertainty,Roads,4
229
+ Improper road grading leads to water accumulation,Roads,4
230
+ Sewage water leaking onto roads,Sanitation,5
231
+ Pollution from waste burning spreads toxic particles,Pollution,2
232
+ Garbage remains scattered near commercial complexes for days,Garbage,1
233
+ Bus routes not covering industrial areas,Public Transport,3
234
+ Waste disposal areas are not properly fenced,Sanitation,5
235
+ Water main burst near market,Water,7
236
+ Road surface wear increases particulate release from tires,Roads,4
237
+ Air pollution from diesel generators,Pollution,2
238
+ Potholes near park causing accidents,Roads,4
239
+ Leaking overhead tanks result in continuous water wastage,Water,7
240
+ Public sanitation facilities are insufficient in crowded areas,Sanitation,5
241
+ Residents face daily inconvenience due to uncontrolled noise from the water pump,Water,7
242
+ Drain smell causing illness,Sanitation,5
243
+ Cats multiplying rapidly in neighborhood,Stray Animals,6
244
+ Stray cats creating hygiene problems in markets,Stray Animals,6
245
+ Road issue near temple,Roads,4
246
+ Uncollected garbage provides feeding grounds for roaming animals,Garbage,1
247
+ Frequent power cuts affecting shops,Electricity,0
248
+ Damaged road surfaces slow traffic causing inefficient fuel usage,Roads,4
249
+ Improper disposal of waste is contaminating nearby open areas,Sanitation,5
250
+ The water pump produces constant noise that disrupts daily household activities,Water,7
251
+ Residents experience repeated water pump disturbance,Water,7
252
+ Voltage surges damage electronic devices unexpectedly,Electricity,0
253
+ Industrial noise pollution affects quality of life,Pollution,2
254
+ Garbage mismanagement amplifies sanitation maintenance burden,Garbage,1
255
+ Electricity outages increase fire safety risks,Electricity,0
256
+ Sewage discharge causing river pollution,Pollution,2
257
+ Accumulated waste blocks sanitation channels leading to stagnant wastewater,Garbage,1
258
+ Garbage is scattered due to stray animals tearing open trash bags,Garbage,1
259
+ Oil spill in water body,Pollution,2
260
+ Uncollected waste decomposes and flows into sewage lines worsening sanitation blockages,Garbage,1
261
+ Sanitation workers lack adequate training,Sanitation,5
262
+ Water supply irregular in new housing society,Water,7
263
+ Traffic congestion due to narrow road,Roads,4
264
+ Generator noise late at night,Pollution,2
265
+ Unattended garbage heaps are spoiling the appearance of the locality,Garbage,1
266
+ Solid waste blocking drainage system,Pollution,2
267
+ Improper waste handling is creating sanitation problems,Garbage,1
268
+ Stray animals causing injuries to pedestrians,Stray Animals,6
269
+ Stray dogs attacking postal workers,Stray Animals,6
270
+ Buses running without permits,Public Transport,3
271
+ Electric supply irregular,Electricity,0
272
+ Monkey problem near market area,Stray Animals,6
273
+ Waste degradation is impacting environmental and public health,Pollution,2
274
+ Factory emissions affecting children with asthma,Pollution,2
275
+ Low speed traffic generates higher emission per distance,Pollution,2
276
+ Sanitation services are delayed without prior notice,Sanitation,5
277
+ Garbage accumulation causes repeated public complaints across departments,Garbage,1
278
+ Uncollected household waste attracting flies,Garbage,1
279
+ Stray cats damaging community gardens,Stray Animals,6
280
+ Road construction stopped halfway,Roads,4
281
+ Illegal dumping of construction debris,Garbage,1
282
+ Transformer oil leakage near road,Electricity,0
283
+ Garbage from nearby construction sites is dumped illegally,Garbage,1
284
+ Waste contamination is impacting soil and water quality,Pollution,2
285
+ Road damage worsens during monsoon due to poor drainage integration,Roads,4
286
+ Poor water quality leads to foul taste and odor,Water,7
287
+ Public garbage bins are overflowing and spilling waste onto the roads,Garbage,1
288
+ No coordination with train schedules,Public Transport,3
289
+ Electricity issues affect remote work productivity,Electricity,0
290
+ Electric poles with loose wires,Electricity,0
291
+ Water supply resumes late morning,Water,7
292
+ Garbage heaps block proper drainage,Garbage,1
293
+ Public sanitation infrastructure is inadequate for urban demands,Sanitation,5
294
+ Road repair delays extend traffic disruption periods,Roads,4
295
+ Roads are breaking repeatedly after each monsoon season,Roads,4
296
+ Drivers rude to passengers,Public Transport,3
297
+ Water pollution spreading diseases,Pollution,2
298
+ Sudden electricity cuts without prior notice are affecting daily work from home activities,Electricity,0
299
+ Supply water appears muddy after pipeline repair work in the locality,Water,7
300
+ Garbage dumping spots are unmanaged and poorly maintained,Garbage,1
301
+ Old buses with faulty engines,Public Transport,3
302
+ Industrial pollution increases cancer risks,Pollution,2
303
+ Discolored water flows from taps after long supply gaps,Water,7
304
+ Route diversions lead to unpredictable commute durations,Public Transport,3
305
+ Sanitation workers not performing evening duty,Sanitation,5
306
+ Uneven roads cause discomfort for passengers,Roads,4
307
+ Stray dogs chase vehicles during nighttime hours,Stray Animals,6
308
+ Stray cats entering school premises,Stray Animals,6
309
+ Noise pollution from traffic affecting students,Pollution,2
310
+ Sanitation facilities near residential areas are neglected,Sanitation,5
311
+ Stray cats entering restaurants,Stray Animals,6
312
+ Electricity supply schedules are not followed consistently,Electricity,0
313
+ Water supply cut without prior notice,Water,7
314
+ Noise from sports stadium affecting neighborhood,Pollution,2
315
+ Fuse keeps blowing in kitchen,Electricity,0
316
+ Air pollution from industrial chimneys,Pollution,2
317
+ Stray animals roam freely due to ineffective monitoring,Stray Animals,6
318
+ Stray dogs fighting with other animals in streets,Stray Animals,6
319
+ No electricity supply in office,Electricity,0
320
+ Heavy vehicles damaging residential roads,Roads,4
321
+ Household garbage is not collected on holidays leading to buildup,Garbage,1
322
+ The water pump operates loudly affecting quality of life,Water,7
323
+ Dog attacks reported in locality,Stray Animals,6
324
+ Open dumping near street causing pollution,Garbage,1
325
+ Leaking sewage lines pollute nearby water bodies,Sanitation,5
326
+ Frequent engine problems,Public Transport,3
327
+ Drain cleaning not done,Sanitation,5
328
+ Broken road near school,Roads,4
329
+ Stray animals scavenging from open garbage,Stray Animals,6
330
+ Road markings faded causing confusion for drivers,Roads,4
331
+ Road near temple full of potholes,Roads,4
332
+ Blocked drains causing stagnant water,Sanitation,5
333
+ Animals causing traffic jams,Stray Animals,6
334
+ Waste decomposition emits pollutants into residential areas,Pollution,2
335
+ Transformer noise disturbing residents,Electricity,0
336
+ Overflowing sewage near street,Sanitation,5
337
+ Poor road quality impacts overall city image,Roads,4
338
+ Water main leakage near central road,Water,7
339
+ No water connection for new house,Water,7
340
+ Sanitation workers do not segregate waste properly,Sanitation,5
341
+ Road bottlenecks caused by poor design delay movement,Roads,4
342
+ Streetlights off during night,Electricity,0
343
+ The water pump operates loudly and disrupts household peace,Water,7
344
+ Narrow roads reduce bus movement efficiency,Public Transport,3
345
+ Sewage smell unbearable,Sanitation,5
346
+ Water cuts extended for more than 24 hours,Water,7
347
+ Electric wires near playground unsafe,Electricity,0
348
+ Stray animals damaging parked vehicles,Stray Animals,6
349
+ Garbage collection is irregular during rainy seasons,Garbage,1
350
+ Garbage collection does not cover all households equally,Sanitation,5
351
+ Dust from road repair work affecting houses,Pollution,2
352
+ Temporary road fixes fail within weeks of implementation,Roads,4
353
+ Waste collection vehicles are insufficient for this locality,Garbage,1
354
+ Pollution caused by heavy trucks affects nearby residential colonies,Pollution,2
355
+ Sanitation workers not maintaining cleanliness,Sanitation,5
356
+ Road repair materials blocking lanes,Roads,4
357
+ Residents experience sleep issues due to water pump noise,Water,7
358
+ Garbage burning issue,Garbage,1
359
+ Voltage drops affecting office equipment,Electricity,0
360
+ Sanitation system collapse impacts drinking water safety,Sanitation,5
361
+ Water supply lines are poorly mapped leading to frequent damage,Water,7
362
+ Fuse keeps tripping in rainy season,Electricity,0
363
+ Streetlight flickering at night,Electricity,0
364
+ Overflowing dustbins near market,Garbage,1
365
+ Noise from generators disturbing neighborhood,Pollution,2
366
+ Fuse box damaged,Electricity,0
367
+ Poor water pressure affects bathroom usage severely,Water,7
368
+ Cattle blocking traffic movement,Stray Animals,6
369
+ Lack of seating at bus stops,Public Transport,3
370
+ Low water pressure in government buildings,Water,7
371
+ Dust from demolition site causing allergies,Pollution,2
372
+ Dense vehicle clusters elevate local emission concentration,Pollution,2
373
+ Road width constraints force stop start movement,Roads,4
374
+ Open burning of plastic waste,Pollution,2
375
+ Stray dogs occupy bus stops causing inconvenience to commuters,Stray Animals,6
376
+ Road markings are missing due to worn out surfaces,Roads,4
377
+ Road work causing traffic jam,Roads,4
378
+ Drivers ignoring pedestrian crossings,Public Transport,3
379
+ Water supply is inconsistent across different times of day,Water,7
380
+ No electricity in entire colony,Electricity,0
381
+ Waste burning polluting surroundings,Pollution,2
382
+ Waste collection points are poorly located causing inconvenience,Sanitation,5
383
+ Stray dogs bark loudly during late night hours,Stray Animals,6
384
+ Stray dogs forming packs near bus depot,Stray Animals,6
385
+ Waste related pollution is impacting daily life,Pollution,2
386
+ Bus not stopping at designated bus stop,Public Transport,3
387
+ Street drains filled with plastic waste,Sanitation,5
388
+ Buses overcrowded during weekends,Public Transport,3
389
+ Sanitation failures impact school environments,Sanitation,5
390
+ Industrial smoke causing respiratory issues,Pollution,2
391
+ Broken road surfaces near residential areas are making daily commuting unsafe for motorists,Roads,4
392
+ Water leakage near school entrance,Water,7
393
+ Drivers not assisting differently-abled passengers,Public Transport,3
394
+ Public sanitation services are underfunded and understaffed,Sanitation,5
395
+ Streetlights off on major roads,Electricity,0
396
+ Sewage water stagnant near temple,Sanitation,5
397
+ Stray cows wandering in residential streets,Stray Animals,6
398
+ Stray dogs lack vaccination leading to health hazards,Stray Animals,6
399
+ Garbage truck skipped area,Garbage,1
400
+ Sewage water stagnant near residential block,Sanitation,5
401
+ Improper waste treatment is increasing pollution levels in surrounding neighborhoods,Pollution,2
402
+ No proper drainage in colony,Sanitation,5
403
+ Sanitation workers not using safety equipment,Sanitation,5
404
+ No monitoring of stray animals in colony,Stray Animals,6
405
+ Water pipelines are damaged during unrelated construction work,Water,7
406
+ Bus breakdowns happening frequently,Public Transport,3
407
+ Street littered with garbage and sewage,Sanitation,5
408
+ Overloaded transformers frequently trip causing blackouts,Electricity,0
409
+ Transport inefficiency impacts economic productivity,Public Transport,3
410
+ Garbage collection delays are a recurring issue,Garbage,1
411
+ Pipeline blockage causing low water supply,Water,7
412
+ Electricity department response time is unsatisfactory,Electricity,0
413
+ Power supply disrupted without notice,Electricity,0
414
+ Unannounced water shutdowns cause inconvenience to residents,Water,7
415
+ Traffic congestion due to road narrowing,Roads,4
416
+ Transformer failure affecting area,Electricity,0
417
+ Garbage piles near manholes worsen sewage backflow issues,Garbage,1
418
+ Chemical smell spreading in residential area,Pollution,2
419
+ Roads are not resurfaced regularly,Roads,4
420
+ The water pump generates loud operational noise that disrupts sleep and rest patterns,Water,7
421
+ Garbage piles remain after festival events,Garbage,1
422
+ Stray dogs roaming near hospitals,Stray Animals,6
423
+ Voltage drop prevents proper functioning of appliances,Electricity,0
424
+ Noise pollution near hospital area,Pollution,2
425
+ Water cuts affecting commercial complexes,Water,7
426
+ Water complaint pending for long time,Water,7
427
+ Stray dogs roaming near hospitals causing fear,Stray Animals,6
428
+ No response to stray animal complaints,Stray Animals,6
429
+ Power interruptions affect automated road management infrastructure,Electricity,0
430
+ Electric poles block proper road widening projects,Electricity,0
431
+ Roadside garbage obstructing traffic,Roads,4
432
+ Road surface uneven after monsoon,Roads,4
433
+ Tap water contains visible particles making it unsafe for consumption,Water,7
434
+ Lack of proper information for routes,Public Transport,3
435
+ Air pollution from generators affects indoor air quality,Pollution,2
436
+ Voltage fluctuations causing hazards,Electricity,0
437
+ Industrial waste dumping has degraded soil quality severely,Pollution,2
438
+ Animals damage sanitation pipelines searching for food,Stray Animals,6
439
+ Pollution control norms are not enforced on local industries,Pollution,2
440
+ Pollution from stone crushing units spreads fine dust,Pollution,2
441
+ Public toilets without maintenance,Sanitation,5
442
+ Mechanical noise from the water pump is becoming unbearable during nighttime hours for residents,Water,7
443
+ Water pollution due to domestic waste dumping,Pollution,2
444
+ Street corners dirty due to uncollected waste,Sanitation,5
445
+ Electric issues interrupt automated toll and traffic flow systems,Electricity,0
446
+ Waste collection timing clashes with peak activity hours,Sanitation,5
447
+ Road surface uneven after rains,Roads,4
448
+ Odor from sewage line leaks in residential area,Pollution,2
449
+ Roadside garbage obstructing lanes,Roads,4
450
+ Poorly maintained valves cause water wastage,Water,7
451
+ Power cut without information,Electricity,0
452
+ Public sanitation infrastructure lacks maintenance,Sanitation,5
453
+ Garbage collection delayed during holidays,Garbage,1
454
+ Noise pollution from construction machinery in mornings,Pollution,2
455
+ Stagnant water near bus stop,Sanitation,5
456
+ Industrial fumes affecting neighborhood air quality,Pollution,2
457
+ Water meter shows unusual consumption,Water,7
458
+ Road edges damaged by waterlogging,Roads,4
459
+ Road repair work causing extended traffic jams,Roads,4
460
+ Traffic bottlenecks delay public transport vehicles,Public Transport,3
461
+ Sanitation workers not collecting waste on time,Sanitation,5
462
+ Water leakage contributes to road deterioration,Water,7
463
+ Sanitation services do not cover all localities equally,Sanitation,5
464
+ Water flow is insufficient for basic hygiene needs,Water,7
465
+ Electric instability affects charging dependent transit reducing efficiency,Electricity,0
466
+ Water pump vibrations are clearly audible inside houses and are causing continuous disturbance to residents,Water,7
467
+ Noise pollution from construction near offices,Pollution,2
468
+ Improper disposal of waste is impacting daily life,Garbage,1
469
+ Street littered with wet and dry waste,Sanitation,5
470
+ Public toilets lacking handwashing facilities,Sanitation,5
471
+ Bus stop information boards missing,Public Transport,3
472
+ Broken roads pose risks to senior citizens,Roads,4
473
+ Persistent water pump noise affects indoor comfort,Water,7
474
+ Garbage bin broken,Garbage,1
475
+ Garbage piles are becoming permanent fixtures,Garbage,1
476
+ Loud water pump vibrations are causing discomfort and anxiety among residents living close to it,Water,7
477
+ Water supply resumes with air bursts damaging pipelines,Water,7
478
+ Road surface damaged by heavy rainfall,Roads,4
479
+ Dirty water from community taps,Water,7
480
+ Residents complain about excessive water pump sound disturbing peaceful living,Water,7
481
+ Sanitation workers absent in evening rounds,Sanitation,5
482
+ Electrical load failures impact road tunnel ventilation systems,Electricity,0
483
+ Water pressure drops drastically during peak usage hours every day,Water,7
484
+ Stray cows grazing near road construction,Stray Animals,6
485
+ Garbage bin missing,Garbage,1
486
+ Uncollected waste causing bad smell,Pollution,2
487
+ Animal herds slow down traffic significantly,Stray Animals,6
488
+ Road repair material spilling on lanes,Roads,4
489
+ Public toilets without proper maintenance,Sanitation,5
490
+ Low voltage in offices,Electricity,0
491
+ Water supply disrupted due to pipeline cleaning,Water,7
492
+ Garbage problem near shops,Garbage,1
493
+ The water pump produces continuous noise impacting residential peace,Water,7
494
+ Dirty water from taps after rain,Water,7
495
+ Water supply disrupted due to power outage,Water,7
496
+ Animals roaming near hospital area,Stray Animals,6
497
+ Damaged roads are affecting delivery services,Roads,4
498
+ Power interruptions affect electric mobility adoption increasing combustion usage,Electricity,0
499
+ Unstable power supply affects industrial equipment performance,Electricity,0
500
+ The water pump generates excessive sound that disrupts normal household activities,Water,7
501
+ Electric wires pass dangerously close to trees,Electricity,0
502
+ Air pollution from industrial boilers,Pollution,2
503
+ Drain water breeding mosquitoes,Sanitation,5
504
+ No schedule boards at bus stops,Public Transport,3
505
+ Stray dogs causing accidents at intersections,Stray Animals,6
506
+ Air pollution from burning crop residue,Pollution,2
507
+ Pollution spreads due to lack of integrated urban planning,Pollution,2
508
+ Electric infrastructure planning ignores future demand growth,Electricity,0
509
+ Residents are affected by loud water pump sounds daily,Water,7
510
+ Accumulated waste near street lights,Garbage,1
511
+ Waste disposal areas emit strong foul odors,Sanitation,5
512
+ Water infrastructure cannot handle peak demand loads,Water,7
513
+ Uncollected wet waste causing odor,Garbage,1
514
+ Drainage blockage causing muddy water on road,Roads,4
515
+ Dirty streets causing mosquito nuisance,Sanitation,5
516
+ Street corners dirty with garbage,Sanitation,5
517
+ Waste collection staff do not collect garbage from interior lanes,Garbage,1
518
+ Overflowing bins causing insect breeding,Garbage,1
519
+ Increased fuel combustion worsens environmental air conditions,Pollution,2
520
+ Voltage spikes damaging ACs,Electricity,0
521
+ No CCTV in buses for safety,Public Transport,3
522
+ Dust from stone crushing units affecting schools,Pollution,2
523
+ Electric cable broken,Electricity,0
classification/artifacts/tokenizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c3bb446109f57871636dcbaf11730f886c37cbab2e72deb065ba0619617fefa
3
+ size 851995
classification/artifacts/train.csv ADDED
The diff for this file is too large to render. See raw diff
 
classification/artifacts/val.csv ADDED
@@ -0,0 +1,523 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ text,label,label_id
2
+ Poorly designed road curves increase accident risk,Roads,4
3
+ Prolonged power failures during summer are making living conditions unbearable,Electricity,0
4
+ Dogs sleeping on busy roads,Stray Animals,6
5
+ Stray animals making loud noise at night,Stray Animals,6
6
+ No water in entire colony,Water,7
7
+ Decomposing organic waste is creating environmental pollution beyond sanitation issues,Pollution,2
8
+ Open dumping of waste is causing soil and air pollution in nearby residential areas,Pollution,2
9
+ Road repair work incomplete,Roads,4
10
+ Bus terminals poorly lit at night,Public Transport,3
11
+ Road surfaces are not designed for heavy vehicle load,Roads,4
12
+ Frequent power interruptions affect electric road signage reliability,Electricity,0
13
+ Pollution from vehicle exhaust accumulates in narrow streets,Pollution,2
14
+ Open dumping of garbage in streets,Pollution,2
15
+ Odor from sewage treatment plant,Pollution,2
16
+ Garbage is accumulating near parks and playgrounds,Garbage,1
17
+ Frequent accidents at junction of Oak and Pine,Roads,4
18
+ Street water valves leaking after rain,Water,7
19
+ Constant water pump noise interferes with peaceful residential living conditions,Water,7
20
+ Stray goats damaging roadside plants,Stray Animals,6
21
+ Voltage fluctuations affecting lights,Electricity,0
22
+ Sewage overflow near transport hubs causes commuter distress,Sanitation,5
23
+ Water cuts affecting schools and offices,Water,7
24
+ Road widening work delayed,Roads,4
25
+ No shelters at remote bus stops,Public Transport,3
26
+ Sanitation failures contaminate nearby water sources,Sanitation,5
27
+ No vaccination for stray animals in colony,Stray Animals,6
28
+ Electric wires near trees causing hazard,Electricity,0
29
+ Streetlights not functioning near school,Electricity,0
30
+ Potholes near school entrance,Roads,4
31
+ Water main repair causing water shortage,Water,7
32
+ Residents face regular sleep disruption due to excessive water pump noise,Water,7
33
+ Garbage dumping continues despite warning notices,Sanitation,5
34
+ Polluted river water emits foul odor affecting nearby areas,Pollution,2
35
+ Stray cattle block traffic lanes during peak hours,Stray Animals,6
36
+ Bus stops not properly sheltered,Public Transport,3
37
+ Wastewater stagnation seeps into water storage areas,Sanitation,5
38
+ Stray cattle graze on roadside greenery damaging landscaping,Stray Animals,6
39
+ Stray animals leaving waste near houses,Stray Animals,6
40
+ Stray dogs attacking school children,Stray Animals,6
41
+ Garbage collection vehicles skip this area frequently,Garbage,1
42
+ Power supply interruptions affect hospitals and clinics nearby,Electricity,0
43
+ Water pump sound pollution is affecting the quality of life of people living in surrounding apartments,Water,7
44
+ Air quality worsens due to inefficient traffic flow patterns,Pollution,2
45
+ Stray animals wander into construction sites creating hazards,Stray Animals,6
46
+ Sanitation leaks affect nearby commercial areas,Sanitation,5
47
+ Sanitation issues worsen waterborne disease risks,Sanitation,5
48
+ Blocked drains causing water stagnation,Sanitation,5
49
+ Road curvature issues force slow driving in high traffic zones,Roads,4
50
+ Poorly designed road layouts force vehicles to idle longer increasing travel delays,Roads,4
51
+ Improper compaction during construction is weakening road strength,Roads,4
52
+ Overhead tank water not sufficient,Water,7
53
+ Uncovered garbage piles pose serious health risks to residents,Sanitation,5
54
+ Road conditions are worsening despite recent repairs,Roads,4
55
+ Roadside encroachments reduce effective driving space,Roads,4
56
+ Overflowing septic tanks near street,Sanitation,5
57
+ Garbage remains uncollected after scheduled pickup times,Garbage,1
58
+ Electric supply does not meet modern appliance requirements,Electricity,0
59
+ Water pipeline damaged during excavation,Water,7
60
+ Street drains not cleaned for weeks,Sanitation,5
61
+ Overhead tank not refilled after maintenance,Water,7
62
+ Dust from cement plant affecting residential area,Pollution,2
63
+ Open burning of leaves and trash creating smoke,Pollution,2
64
+ Inadequate water supply is affecting sanitation and hygiene in homes,Water,7
65
+ Garbage piles emit strong odor and attract rodents due to delayed removal,Garbage,1
66
+ Electric lines spark during rains creating fire hazards,Electricity,0
67
+ Residents are unable to maintain quiet living conditions due to water pump noise,Water,7
68
+ Road damaged due to heavy rain,Roads,4
69
+ No water in community tank,Water,7
70
+ Stray animals obstructing emergency vehicles,Stray Animals,6
71
+ Garbage attracting mosquitoes and flies,Pollution,2
72
+ Road surfaces peel off creating dangerous driving conditions,Roads,4
73
+ Stray cattle feed on roadside waste creating health issues,Stray Animals,6
74
+ Persistent water pump noise impacts mental well-being of residents,Water,7
75
+ Bus lane blocked by parked trucks,Roads,4
76
+ Cracked pavement reduces driving efficiency and increases fuel usage,Roads,4
77
+ Poor connectivity to rural areas,Public Transport,3
78
+ Road cracks widening after rains,Roads,4
79
+ Buses not stopping at proper stops,Public Transport,3
80
+ No water supply in Sarpavaram since morning,Water,7
81
+ Road shoulder eroding,Roads,4
82
+ Garbage collection frequency is inadequate,Garbage,1
83
+ No electricity in entire street,Electricity,0
84
+ No dedicated buses for women,Public Transport,3
85
+ Garbage collection systems are failing in this zone,Garbage,1
86
+ Frequent breakdowns during peak hours,Public Transport,3
87
+ Electric supply interruptions impact food storage safety,Electricity,0
88
+ Water pump breakdown near park,Water,7
89
+ Insufficient water supply is creating severe inconvenience for large families,Water,7
90
+ The water pump generates loud operational sounds disrupting rest,Water,7
91
+ Road near bridge damaged,Roads,4
92
+ Power outages prevent effective road signal synchronization during peak hours,Electricity,0
93
+ Damaged roads increase vehicle maintenance costs,Roads,4
94
+ Road near school has potholes,Roads,4
95
+ Residents complain about loud water pump operation,Water,7
96
+ Road construction debris narrows traffic lanes,Roads,4
97
+ Overhead tank pump not working properly,Water,7
98
+ Stray cattle rest under streetlights blocking visibility,Stray Animals,6
99
+ Smoke from roadside burning affecting nearby homes,Pollution,2
100
+ Smoke from tire burning polluting air,Pollution,2
101
+ Stray dogs in residential streets making noise,Stray Animals,6
102
+ Residents are troubled by water pump sound,Water,7
103
+ Electric supply disruptions affect safety lighting reducing night time traffic efficiency,Electricity,0
104
+ Stray dogs chasing joggers in parks,Stray Animals,6
105
+ Power instability affects hospital infrastructure impacting sanitation and water use,Electricity,0
106
+ Residents experience continuous irritation due to loud water pump vibrations,Water,7
107
+ Power cuts during night,Electricity,0
108
+ Low voltage in hospital affecting equipment,Electricity,0
109
+ Dust from demolition affecting local market,Pollution,2
110
+ Electric infrastructure repairs are delayed unnecessarily,Electricity,0
111
+ Metro station cleanliness issues,Public Transport,3
112
+ Water cuts affecting residents for multiple days,Water,7
113
+ Damaged traffic signs causing confusion,Roads,4
114
+ Garbage is scattered by animals because collection is irregular,Garbage,1
115
+ Unclean seats and floors in buses,Public Transport,3
116
+ DJ sound creating public nuisance,Pollution,2
117
+ Stray dogs enter school premises causing panic among students,Stray Animals,6
118
+ Waste bins not available in market area,Garbage,1
119
+ Industrial effluents polluting pond water,Pollution,2
120
+ Overflowing trash bins are spreading foul smells but the primary issue is improper garbage collection,Garbage,1
121
+ Garbage disposal points are unmanaged and constantly overflowing,Garbage,1
122
+ Water supply does not meet basic daily consumption needs,Water,7
123
+ Water supply irregular after pipeline repair,Water,7
124
+ Garbage not collected for over a week,Garbage,1
125
+ Stray animals gather near roadside eateries creating mess,Stray Animals,6
126
+ Smoke from vehicles affecting morning walkers,Pollution,2
127
+ Open drain without cover,Sanitation,5
128
+ Garbage scattered near community hall,Garbage,1
129
+ Frequent voltage drop during evenings,Electricity,0
130
+ Waste pollution poses long term health risks,Pollution,2
131
+ Persistent water pump noise disrupts residents,Water,7
132
+ No buses connecting new residential areas,Public Transport,3
133
+ Waste buildup near water bodies contaminates local supply sources indirectly,Garbage,1
134
+ Road repair work causing inconvenience,Roads,4
135
+ Residents are disturbed by loud water pump sound,Water,7
136
+ Traffic slowdowns intensify pollution concentration near residences,Pollution,2
137
+ Electric meter malfunctioning,Electricity,0
138
+ Residents report water pump sound disturbance,Water,7
139
+ Public toilets without proper lighting,Sanitation,5
140
+ Stray dogs making loud noise at night,Stray Animals,6
141
+ Potholes causing accidents,Roads,4
142
+ Stray animals causing traffic accidents,Stray Animals,6
143
+ Damaged pavement causing accidents,Roads,4
144
+ Vehicle idling near intersections raises particulate concentration levels,Pollution,2
145
+ Poor road connectivity affects transit access,Public Transport,3
146
+ Waste management issues are worsening over time,Garbage,1
147
+ Noise pollution increases stress levels among residents,Pollution,2
148
+ Residents experience irritation due to water pump noise,Water,7
149
+ Industrial dust settles on homes causing cleanliness issues,Pollution,2
150
+ Lack of awareness leads to mixing of wet and dry waste,Sanitation,5
151
+ Open garbage near playground causing health hazard,Garbage,1
152
+ Open dumping of waste is increasing environmental pollution,Garbage,1
153
+ High voltage surges in colony,Electricity,0
154
+ Pollution from unregulated industries harms environment,Pollution,2
155
+ Contaminated water supply is increasing dependency on bottled water,Water,7
156
+ Stray dogs fight over food causing injuries and noise,Stray Animals,6
157
+ No water for drinking and cooking,Water,7
158
+ Garbage disposal methods are outdated and ineffective,Garbage,1
159
+ Water pollution from river effluents,Pollution,2
160
+ Drain water stagnation,Sanitation,5
161
+ Power instability disrupts functioning of smart road systems,Electricity,0
162
+ Dogs barking at night causing disturbance,Stray Animals,6
163
+ Sanitation blockages worsen during monsoon season causing flooding,Sanitation,5
164
+ Damaged footpath causing inconvenience to pedestrians,Roads,4
165
+ Poor air quality near traffic junction,Pollution,2
166
+ Garbage pile near school,Garbage,1
167
+ Waste dumping near school gate,Garbage,1
168
+ Sewage water leaking onto streets,Sanitation,5
169
+ Electric poles not properly grounded,Electricity,0
170
+ Pollution from vehicle congestion impacts air quality daily,Pollution,2
171
+ Stray dogs roam in packs increasing attack risks,Stray Animals,6
172
+ No security cameras at bus stations,Public Transport,3
173
+ The water pump produces disturbing sounds that interfere with peaceful living conditions in the locality,Water,7
174
+ The water pump emits ongoing mechanical noise causing irritation,Water,7
175
+ Electric meter not updating readings,Electricity,0
176
+ Smoke from garbage burning near school,Pollution,2
177
+ Buses without proper lighting at night,Public Transport,3
178
+ Monkeys entering homes frequently,Stray Animals,6
179
+ Waste collection disrupted after festival,Garbage,1
180
+ The water pump generates continuous sound that penetrates walls and disturbs indoor peace,Water,7
181
+ Water supply restoration takes excessively long after repairs,Water,7
182
+ Potholes causing tire punctures in colony roads,Roads,4
183
+ Stray animals causing hygiene issues in alleys,Stray Animals,6
184
+ Odor from open sewage causing discomfort,Pollution,2
185
+ Voltage fluctuations during evening hours,Electricity,0
186
+ Electricity failures affect street lighting at night,Electricity,0
187
+ Residents complain that water pump sound disrupts rest and relaxation,Water,7
188
+ Dirty streets due to irregular cleaning,Sanitation,5
189
+ Cats creating noise at night,Stray Animals,6
190
+ Low pressure water supply prevents proper cleaning,Water,7
191
+ Blocked drainage causing flooding near park,Sanitation,5
192
+ Road near bridge uneven and dangerous,Roads,4
193
+ Garbage dumping near markets creates unhygienic conditions,Sanitation,5
194
+ Persistent water pump noise causes frustration and mental strain among residents,Water,7
195
+ Waste decomposition is releasing strong pollutants into the air,Pollution,2
196
+ Drain cleaning vehicle not coming,Sanitation,5
197
+ Oil spills polluting street drains,Pollution,2
198
+ No ramps in buses for wheelchairs,Public Transport,3
199
+ Traffic signs missing on busy roads,Roads,4
200
+ Stray animals fighting each other in streets,Stray Animals,6
201
+ Stray cats entering residential buildings,Stray Animals,6
202
+ Dirty streets near market area,Sanitation,5
203
+ Overhead wires sagging dangerously,Electricity,0
204
+ Stray cows wandering near schools,Stray Animals,6
205
+ Water quality test failed,Water,7
206
+ Excessive noise from generators violates permissible sound levels,Pollution,2
207
+ Supply water contains excessive chlorine smell,Water,7
208
+ Pollution from waste incineration spreads toxins,Pollution,2
209
+ Garbage bins are broken and unusable forcing people to dump waste outside,Garbage,1
210
+ Public sanitation facilities lack proper water supply,Sanitation,5
211
+ Inconsistent water flow damages household water storage systems,Water,7
212
+ Unreliable power disables water purification infrastructure intermittently,Electricity,0
213
+ Power cuts affecting businesses,Electricity,0
214
+ No night bus services available,Public Transport,3
215
+ Odor from sewage backup in street,Pollution,2
216
+ Traffic congestion near junction,Roads,4
217
+ Drainage problem near shops,Sanitation,5
218
+ Long term waste accumulation is degrading environmental quality,Pollution,2
219
+ Residents express concern over long-term exposure to water pump noise pollution,Water,7
220
+ Roadside erosion releases dust affecting respiratory health,Pollution,2
221
+ Dust from construction debris affecting children,Pollution,2
222
+ Uneven pavements disrupt public transport schedules,Roads,4
223
+ Streetlights off on main road,Electricity,0
224
+ Road surfaces have sunk creating deep depressions,Roads,4
225
+ Air pollution from brick kiln operations,Pollution,2
226
+ Old buses causing discomfort to passengers,Public Transport,3
227
+ Overhead tank valve is malfunctioning,Water,7
228
+ Streetlights not working in residential area,Electricity,0
229
+ Power cuts during peak hours,Electricity,0
230
+ Electric line damaged due to rain,Electricity,0
231
+ Bus drivers not following traffic rules,Public Transport,3
232
+ Drinking water tanker arrives late,Water,7
233
+ Stray animals leaving waste on streets,Stray Animals,6
234
+ Stray cattle sit on speed breakers causing visibility issues,Stray Animals,6
235
+ Poor sanitation maintenance increases mosquito breeding,Sanitation,5
236
+ Street littered with plastic bags,Garbage,1
237
+ Garbage trucks not covering all streets,Garbage,1
238
+ No emergency numbers displayed in buses,Public Transport,3
239
+ Airborne particulate matter rises due to prolonged vehicle idling in traffic heavy corridors,Pollution,2
240
+ Irregular water supply forces residents to rely on unsafe storage methods,Water,7
241
+ Stray dogs disrupt morning walks in residential colonies,Stray Animals,6
242
+ Voltage drops affect illuminated road signage clarity at night,Electricity,0
243
+ Garbage trucks create spillage during transportation,Sanitation,5
244
+ Electric wires exposed near playground,Electricity,0
245
+ Garbage remains uncleared despite municipal schedules,Garbage,1
246
+ Electricity department does not provide outage updates,Electricity,0
247
+ Road near market has cracks,Roads,4
248
+ Damaged road shoulders reduce usable driving space,Roads,4
249
+ Pollution levels rise due to unmanaged waste decay,Pollution,2
250
+ Plastic waste mixed with organic waste,Garbage,1
251
+ Stray dogs forming packs near temples,Stray Animals,6
252
+ Streetlight poles broken,Electricity,0
253
+ Road surface slippery due to oil spillage,Roads,4
254
+ Garbage bins are not sufficient for waste volume,Garbage,1
255
+ Garbage disposal points are poorly managed and constantly overflowing,Garbage,1
256
+ Stray cows blocking traffic on highways,Stray Animals,6
257
+ Burning leaves and trash releases harmful pollutants into the air,Pollution,2
258
+ Blocked drains causing flooding near park,Sanitation,5
259
+ Power failures affect monitoring of traffic density resulting in unmanaged congestion,Electricity,0
260
+ Long queues for public transport tickets,Public Transport,3
261
+ Buses without functional horn or lights,Public Transport,3
262
+ Uncontrolled pollution in urban area,Pollution,2
263
+ Auto drivers misbehaving with passengers,Public Transport,3
264
+ Electricity outages affect elevator operations in apartments,Electricity,0
265
+ Electric infrastructure maintenance is irregular and insufficient,Electricity,0
266
+ Fuse boxes not maintained,Electricity,0
267
+ No asphalt layer on road,Roads,4
268
+ Water supply interruptions affect hospitals and schools,Water,7
269
+ Bus staff not enforcing safety measures,Public Transport,3
270
+ Water leakage causes erosion around building foundations,Water,7
271
+ No shelter homes for injured strays,Stray Animals,6
272
+ Dirty drains causing mosquito nuisance,Sanitation,5
273
+ Frequent power cuts during peak hours,Electricity,0
274
+ Narrow road design leads to chronic congestion during working hours,Roads,4
275
+ Drain clogged for days,Sanitation,5
276
+ Stray animals defecate near homes causing hygiene problems,Stray Animals,6
277
+ No segregation of wet and dry waste,Garbage,1
278
+ Power cuts occur without any prior announcements,Electricity,0
279
+ Poor maintenance of public transport vehicles,Public Transport,3
280
+ Drain water flowing continuously,Sanitation,5
281
+ Road surfaces have lost structural integrity,Roads,4
282
+ Water contamination in handpump near residential block,Water,7
283
+ Power cuts affecting hospitals,Electricity,0
284
+ Garbage from nearby markets is dumped irresponsibly in residential zones,Garbage,1
285
+ Odor pollution from garbage dumping site,Pollution,2
286
+ Buses not stopping at requested locations,Public Transport,3
287
+ Water supply irregular in residential colony,Water,7
288
+ Environmental pollution is increasing due to lack of proper waste treatment,Pollution,2
289
+ Electricity department fails to upgrade outdated infrastructure,Electricity,0
290
+ Street littered with paper and plastic waste,Garbage,1
291
+ Noise from night-time clubs disturbing residents,Pollution,2
292
+ Open defecation near residential area,Sanitation,5
293
+ Electric failures affect emergency response systems across major road corridors,Electricity,0
294
+ Open garbage near hospital creating health hazard,Garbage,1
295
+ Water pollution due to sewage leakage,Pollution,2
296
+ Public dustbin overflowing,Garbage,1
297
+ Multiple pollution sources in area,Pollution,2
298
+ Stray cats spreading diseases in markets,Stray Animals,6
299
+ Road shoulders are damaged making pedestrian movement unsafe,Roads,4
300
+ Transformer making noise,Electricity,0
301
+ Bus drivers refusing to stop at requested locations,Public Transport,3
302
+ No proper animal control in residential areas,Stray Animals,6
303
+ Smoke from crematorium affecting local area,Pollution,2
304
+ Stray dogs near railway station scaring passengers,Stray Animals,6
305
+ Garbage dumping near drains causes sewage overflow during rainfall,Garbage,1
306
+ Drain overflow creating traffic issue,Sanitation,5
307
+ Dirty drains causing waterlogging during monsoon,Sanitation,5
308
+ Garbage not collected from high-rise buildings,Garbage,1
309
+ Garbage collection irregular in park area,Garbage,1
310
+ Supply water contains excessive sediment affecting water filters,Water,7
311
+ Sanitation backflow damages road surfaces,Sanitation,5
312
+ Stray animals blocking sidewalks,Stray Animals,6
313
+ Water supply stops abruptly without any official communication,Water,7
314
+ Stray animals damage parked vehicles while searching for food,Stray Animals,6
315
+ Overflowing bins near bus stops create hygiene and commuter discomfort,Garbage,1
316
+ Buses not running on weekends,Public Transport,3
317
+ Bus drivers ignoring signals,Public Transport,3
318
+ Incomplete road projects create traffic bottlenecks,Roads,4
319
+ Electricity outages occur frequently during weekends,Electricity,0
320
+ Traffic signal timing not optimized,Roads,4
321
+ Buses not following GPS routes,Public Transport,3
322
+ Stray animals sleeping on pavements,Stray Animals,6
323
+ Drain blockage causing flooding,Sanitation,5
324
+ Electricity supply issues disrupt online education,Electricity,0
325
+ Polluted soil contains hazardous chemicals,Pollution,2
326
+ The water pump noise remains constant without breaks causing ongoing stress to nearby households,Water,7
327
+ Water overflow near house,Water,7
328
+ Garbage collection vehicles do not arrive on time,Garbage,1
329
+ Trapped traffic emits concentrated pollutants impacting nearby pedestrians,Pollution,2
330
+ Road surface uneven near playground,Roads,4
331
+ Poor road quality is forcing vehicles to take long detours,Roads,4
332
+ Long term waste dumping is degrading environmental health in the locality,Pollution,2
333
+ Stray cats entering homes and damaging property,Stray Animals,6
334
+ Traffic signals malfunctioning,Roads,4
335
+ Odor from sewage backup in residential area,Pollution,2
336
+ Stray dogs gather near food waste sites,Stray Animals,6
337
+ Detour related fuel burn increases atmospheric contamination,Pollution,2
338
+ No announcements for stops for visually impaired,Public Transport,3
339
+ Garbage disposal practices need urgent improvement,Garbage,1
340
+ Uncollected household garbage near shops,Garbage,1
341
+ Pollution from heavy vehicles affects residential zones,Pollution,2
342
+ Noise from metro construction disturbing residents,Pollution,2
343
+ Waste bins are not covered allowing animals to scatter garbage,Garbage,1
344
+ Garbage is not collected daily leading to foul smells and health concerns,Sanitation,5
345
+ Loose electrical connections cause repeated outages,Electricity,0
346
+ Electric shock from pole,Electricity,0
347
+ Smoke from roadside eateries causing health issues,Pollution,2
348
+ Road repair complaint ignored,Roads,4
349
+ Broken roads increase travel stress and fatigue,Roads,4
350
+ Waste disposal practices increase environmental hazards,Sanitation,5
351
+ Decaying waste releases gases that significantly degrade air quality,Pollution,2
352
+ Electric outage since yesterday,Electricity,0
353
+ Public sanitation services lack accountability,Sanitation,5
354
+ Drivers not following assigned routes,Public Transport,3
355
+ Buses overcrowded with standing passengers,Public Transport,3
356
+ Water supply does not meet the needs of growing population in the area,Water,7
357
+ Frequent fare disputes in buses,Public Transport,3
358
+ Stray animals causing sanitation issues,Stray Animals,6
359
+ Accumulated waste attracts insects and rodents,Sanitation,5
360
+ Sewage water flowing on road,Sanitation,5
361
+ Air pollution from construction dust near hospital,Pollution,2
362
+ Drainage overflow in street,Sanitation,5
363
+ Sanitation workers do not report for duty regularly,Sanitation,5
364
+ Water pressure drops affect sanitation and hygiene practices,Water,7
365
+ Overflowing water tanks cause wastage due to faulty valves,Water,7
366
+ Animal interference increases sanitation workload,Stray Animals,6
367
+ Overflowing drains near school,Sanitation,5
368
+ Garbage is piling up near bus stops and public areas,Garbage,1
369
+ Open drains near houses,Sanitation,5
370
+ Power outages occur daily during peak usage hours without explanation,Electricity,0
371
+ Sewage stagnation creates unhygienic living conditions,Sanitation,5
372
+ Odor from chemical treatment plant near road,Pollution,2
373
+ Stray goats eating flowers in gardens,Stray Animals,6
374
+ Odor from garbage dump near residential area,Pollution,2
375
+ Animals crossing roads increase collision risk,Stray Animals,6
376
+ Speed bumps not visible at night,Roads,4
377
+ Odor from sewage near commercial complex,Pollution,2
378
+ Air pollution aggravates asthma and respiratory conditions,Pollution,2
379
+ Drain overflow near hospital,Sanitation,5
380
+ Water management inefficiency impacts urban resilience,Water,7
381
+ Dumped waste is polluting nearby agricultural land,Pollution,2
382
+ Street taps dry after pipeline maintenance,Water,7
383
+ Organic waste decomposition is polluting the air and attracting disease carrying insects,Pollution,2
384
+ Lack of municipal response to stray animal complaints is concerning,Stray Animals,6
385
+ Water meters show abnormal readings despite limited water usage,Water,7
386
+ Uneven road surface,Roads,4
387
+ Waste disposal sites attract stray animals,Sanitation,5
388
+ Industrial pollution affects nearby residential quality of life,Pollution,2
389
+ Electric lines are exposed and unsafe in public areas,Electricity,0
390
+ Public toilets lacking maintenance schedule,Sanitation,5
391
+ Ongoing water pump vibrations are creating a persistent nuisance for families living nearby,Water,7
392
+ Water tankers charge high prices due to municipal shortages,Water,7
393
+ Dogs entering houses frequently,Stray Animals,6
394
+ Water pipeline leakage near main road,Water,7
395
+ Waste segregation is not practiced consistently,Garbage,1
396
+ Street corners full of mixed garbage,Garbage,1
397
+ Streetlight poles corroded,Electricity,0
398
+ Buses not following scheduled intervals,Public Transport,3
399
+ Power failures disrupt home medical equipment usage,Electricity,0
400
+ Road surfaces are damaged by heavy construction vehicle movement,Roads,4
401
+ Air pollution from coal transport trucks,Pollution,2
402
+ Garbage collection trucks not available,Garbage,1
403
+ Stray animals knocking over dustbins,Stray Animals,6
404
+ Construction debris dumped illegally,Pollution,2
405
+ Pollution caused by waste decay is affecting nearby residential comfort,Pollution,2
406
+ Damaged culvert causing road erosion,Roads,4
407
+ Waste breakdown affects surrounding environmental conditions,Pollution,2
408
+ Stray goats wandering on highways,Stray Animals,6
409
+ Garbage is often burned causing air pollution,Sanitation,5
410
+ Power outage affecting local shops,Electricity,0
411
+ Sanitation neglect increases long term infrastructure damage,Sanitation,5
412
+ Street light flickering,Electricity,0
413
+ Poor road connectivity increases travel time and logistical inefficiency,Roads,4
414
+ Water contamination from untreated sewage,Pollution,2
415
+ Garbage piles draw stray dogs increasing public safety concerns,Garbage,1
416
+ Water pump noise disturbing residents,Water,7
417
+ Stray animals disrupt peaceful living in residential areas,Stray Animals,6
418
+ Garbage collection delayed during rainy season,Garbage,1
419
+ Residents complain about persistent water pump noise,Water,7
420
+ Road maintenance work is delayed for months without explanation,Roads,4
421
+ Stray goats entering parks,Stray Animals,6
422
+ Road surface cracks widening,Roads,4
423
+ Bus stations without toilets or drinking water,Public Transport,3
424
+ Traffic signals not functioning at major intersection,Roads,4
425
+ Road network gaps increase dependency on longer travel routes,Roads,4
426
+ Electric poles damaged due to storm,Electricity,0
427
+ Power cuts disrupting businesses,Electricity,0
428
+ Stray animals lack proper shelters leading to street occupation,Stray Animals,6
429
+ Stray cattle wander into marketplaces creating safety hazards,Stray Animals,6
430
+ Road near bus stop damaged,Roads,4
431
+ Improper waste management is causing resident dissatisfaction,Garbage,1
432
+ Garbage remains scattered after market hours,Garbage,1
433
+ Industrial wastewater discharge pollutes groundwater sources,Pollution,2
434
+ Garbage accumulation creates breeding grounds for pests,Sanitation,5
435
+ Road full of potholes near bus stand,Roads,4
436
+ Waste management practices do not meet basic standards,Sanitation,5
437
+ Odor from poultry market affecting houses,Pollution,2
438
+ Overhead tank overflowing constantly,Water,7
439
+ Sewage pipe damaged,Sanitation,5
440
+ Smoke from cooking chimneys in dense areas,Pollution,2
441
+ Dirty drains causing street flooding,Sanitation,5
442
+ Water tank cleaning required,Water,7
443
+ Garbage lying near drain,Garbage,1
444
+ Drivers not issuing proper receipts,Public Transport,3
445
+ Water tanker delivery inconsistent,Water,7
446
+ Suspended dust particles from road surfaces are contributing to respiratory discomfort,Pollution,2
447
+ Lack of buses in suburban areas,Public Transport,3
448
+ Stray dogs guard territories aggressively near houses,Stray Animals,6
449
+ Stagnant water near public park,Sanitation,5
450
+ Road near market slippery after rain,Roads,4
451
+ Sanitation workers not patrolling community areas,Sanitation,5
452
+ Water pipes frequently clog causing supply interruptions,Water,7
453
+ No electricity in residential block,Electricity,0
454
+ Odor from leather tanning unit near river,Pollution,2
455
+ Noise pollution from nearby nightclub,Pollution,2
456
+ Water mains corrode causing supply issues,Water,7
457
+ Drain water entering homes,Sanitation,5
458
+ Water contamination reported in residential area,Water,7
459
+ Water supply pressure is too weak to reach upper floors of buildings,Water,7
460
+ Voltage drop in colony during night,Electricity,0
461
+ Burning of garbage creating toxic smoke,Pollution,2
462
+ Garbage pile near street corner attracting rats,Garbage,1
463
+ Road shoulder erosion limits usable space leading to congestion,Roads,4
464
+ Damaged roads near hospitals are affecting ambulance movement,Roads,4
465
+ Potholes near shopping complex,Roads,4
466
+ Road surface dust contributes to respiratory discomfort,Roads,4
467
+ Water infrastructure repairs lack proper supervision,Water,7
468
+ No proper waiting areas at bus terminals,Public Transport,3
469
+ Garbage scattered by stray animals,Garbage,1
470
+ Garbage not segregated in bins,Garbage,1
471
+ Water tanker service not available on time,Water,7
472
+ Dust from open construction sites affecting market,Pollution,2
473
+ Air pollution increases hospital visits for breathing issues,Pollution,2
474
+ Public toilets locked or inaccessible,Sanitation,5
475
+ Overflowing sewage near commercial area,Sanitation,5
476
+ Electric outages halt automated water distribution scheduling systems,Electricity,0
477
+ The water pump emits a harsh mechanical sound that causes constant irritation to residents,Water,7
478
+ Bad road near hospital,Roads,4
479
+ Pollution affecting quality of life,Pollution,2
480
+ Water supply timing not communicated,Water,7
481
+ Water pipelines lack proper insulation and protection,Water,7
482
+ Excessive vehicle emissions in this area have significantly reduced air quality levels,Pollution,2
483
+ Stray animals cause night time disturbances near homes,Stray Animals,6
484
+ Bus drivers not obeying traffic signals,Public Transport,3
485
+ Sanitation services are poorly monitored,Sanitation,5
486
+ No separate buses for students,Public Transport,3
487
+ Temporary road repairs wash away during rains,Roads,4
488
+ Noise pollution from factories disturbs nearby residents,Pollution,2
489
+ Stray animals affect commuter comfort,Stray Animals,6
490
+ Damaged roads force vehicles to take longer detours increasing fuel consumption,Roads,4
491
+ Garbage has been left unattended near public places,Garbage,1
492
+ Sewage leaks enter drainage and road systems,Sanitation,5
493
+ Improper garbage disposal affects nearby residential buildings,Sanitation,5
494
+ Garbage heaps are visible across multiple streets,Garbage,1
495
+ Electric supply disruptions force manual traffic handling causing congestion,Electricity,0
496
+ Electric infrastructure failures disrupt adaptive traffic control systems,Electricity,0
497
+ Sanitation complaints are not addressed promptly by authorities,Sanitation,5
498
+ Garbage dumped in public spaces is affecting cleanliness,Garbage,1
499
+ Smoke from tire burning in industrial area,Pollution,2
500
+ Damaged guardrail causing accident,Roads,4
501
+ Low water pressure in newly built area,Water,7
502
+ Irregular water supply during festival season,Water,7
503
+ Unregulated tanker water sources raise safety concerns,Water,7
504
+ Water from taps has unusual odor,Water,7
505
+ Accumulated waste near bus stops,Garbage,1
506
+ Street water valves leaking,Water,7
507
+ Voltage spikes affecting ACs,Electricity,0
508
+ Smoke from burning tires on roadside,Pollution,2
509
+ Stray goats wandering in playgrounds,Stray Animals,6
510
+ Stray cattle block roads during peak hours disrupting traffic flow,Stray Animals,6
511
+ Unfriendly behavior from bus staff,Public Transport,3
512
+ Water supply infrastructure expansion has not kept pace with growth,Water,7
513
+ The loud humming of the water pump causes discomfort throughout the day and night,Water,7
514
+ Road construction debris restricts lane capacity,Roads,4
515
+ Streetlight malfunction causing darkness,Electricity,0
516
+ Water supply disrupted due to civic work,Water,7
517
+ Dirty water flowing from street taps,Water,7
518
+ Pollution from heavy machinery continues throughout night,Pollution,2
519
+ Stray animals causing traffic delays,Stray Animals,6
520
+ Industrial waste discharged into river,Pollution,2
521
+ Electric poles with broken cross arms,Electricity,0
522
+ Improper waste dumping worsens underground sanitation congestion,Garbage,1
523
+ Water supply schedules change without notification,Water,7
classification/bert_classify.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# =========================================================
# BERT MODEL — CATEGORY CLASSIFICATION (ENGLISH)
# =========================================================

import os
import re
import torch
import pickle
from transformers import BertForSequenceClassification

# ── Path config ───────────────────────────────────────────
# All paths are anchored to this module's directory so inference works
# regardless of the process working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
MODEL_DIR = os.path.join(ARTIFACT_DIR, "bert_model")
MAX_LENGTH = 128  # FIX: was 100 — aligned with IG explainer and indic module

# ── Load artifacts ────────────────────────────────────────
# NOTE(review): pickle.load is only acceptable because these artifacts
# ship with the repo; never point these paths at untrusted files.
with open(os.path.join(ARTIFACT_DIR, "tokenizer.pkl"), "rb") as f:
    tokenizer = pickle.load(f)

with open(os.path.join(ARTIFACT_DIR, "label_encoder.pkl"), "rb") as f:
    label_encoder = pickle.load(f)

# local_files_only=True — the deployed container must never reach the
# Hugging Face hub at runtime.
model = BertForSequenceClassification.from_pretrained(
    MODEL_DIR, local_files_only=True
)
model.eval()  # inference only: disables dropout
28
+
29
# ── Edge-case constants ───────────────────────────────────
LABEL_WORDS = {
    "water", "electricity", "roads", "garbage",
    "sanitation", "pollution", "transport", "animals",
}

NON_GRIEVANCE_PHRASES = {
    "hello", "hi", "hi there", "hey", "hey there",
    "good morning", "good afternoon", "good evening", "good day",
    "greetings", "namaste", "how are you", "how are you doing",
    "hope you are doing well", "hope everything is fine",
    "just checking in", "nice to meet you", "long time no see",
    "good weather", "nice weather", "weather is nice", "weather is good",
    "it is a sunny day", "it is raining today", "pleasant weather",
    "cool weather today", "hot weather today", "cold weather today",
    "it is a good day", "everything is fine", "all good", "no issues",
    "no problem", "things are okay", "everything looks good",
    "nothing to complain", "all services are working",
    "thank you", "thanks", "thanks a lot", "thank you very much",
    "appreciate it", "appreciate your help", "great work", "good job",
    "well done", "excellent service", "for your information",
    "just informing", "sharing information", "today is a holiday",
    "office opens at 10 am", "school reopens next week",
    "meeting scheduled tomorrow", "okay", "ok", "alright", "fine",
    "cool", "great", "nice", "regards", "best regards", "with regards",
    "kind regards", "thank you and regards", "thank you very much sir",
    "test", "testing", "demo", "sample text", "random text",
    "🙂", "👍", "🙏", "😂", "🔥", "!!!", "???",
}


# ── Text cleaning ─────────────────────────────────────────
def clean_text(text: str) -> str:
    """Strip HTML tags and collapse whitespace; keep non-ASCII intact.

    Non-ASCII is deliberately preserved: this module receives English
    only (language detection in main.py routes correctly), but stripping
    non-ASCII would silently corrupt any mis-routed Indic text.
    """
    text = str(text)
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


# ── Input validation ──────────────────────────────────────
def validate_input(text: str):
    """Return a rejection-reason string, or None if the text is usable.

    FIX: the membership checks (label_only / non_grievance_text) now run
    BEFORE the generic length gates. Previously every single-word label
    ("water", "electricity", …) hit "too_few_words" first, making the
    "label_only" branch unreachable, and short greetings ("hello", "ok")
    were reported as "too_short" instead of "non_grievance_text".
    """
    if not text or not text.strip():
        return "empty_text"
    text_l = text.strip().lower()
    # Specific rejections first — most informative reason wins.
    if text_l in LABEL_WORDS:
        return "label_only"
    if text_l in NON_GRIEVANCE_PHRASES:
        return "non_grievance_text"
    # Generic length gates.
    if len(text_l) < 10:
        return "too_short"
    if len(text_l.split()) < 3:
        return "too_few_words"
    return None
86
+
87
+
88
# ── Predict ───────────────────────────────────────────────
def predict(
    text: str,
    input_ids=None,  # O3: pre-tokenised tensor from main.py
    attention_mask=None,  # O3: pre-tokenised tensor from main.py
) -> dict:
    """
    Predict grievance category for English text.

    Args:
        text : Raw input string (always required for validation).
        input_ids : Optional pre-tokenised tensor (1, seq_len).
                    When provided by main.py the internal tokenisation
                    step is skipped — eliminates duplicate tokenisation.
        attention_mask : Required when input_ids is provided.

    Returns dict with keys: status, category, confidence, class_index.
    """
    # Rule-based validation always runs on the raw string.
    failure = validate_input(text)
    if failure is not None:
        return {
            "status": "failed",
            "reason": failure,
            "category": None,
            "confidence": 0.0,
            "class_index": None,
        }

    # O3: tokenise only when main.py did not supply ready-made tensors.
    # padding=False — a single sequence needs no [PAD] tokens, which also
    # keeps IG attributions free of padding artefacts.
    if input_ids is None:
        encoded = tokenizer(
            clean_text(text),
            return_tensors="pt",
            truncation=True,
            padding=False,
            max_length=MAX_LENGTH,
        )
        input_ids = encoded["input_ids"]
        attention_mask = encoded["attention_mask"]

    # Forward pass — gradients disabled for inference.
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    probabilities = torch.softmax(logits, dim=1)
    top_prob, top_idx = torch.max(probabilities, dim=1)
    confidence = top_prob.item()
    predicted_index = top_idx.item()

    # Confidence gate: weak predictions route to the catch-all bucket.
    if confidence < 0.30:
        return {
            "status": "success",
            "reason": "low_confidence",
            "category": "Other",
            "confidence": round(confidence, 4),
            "class_index": predicted_index,
        }

    return {
        "status": "success",
        "category": label_encoder.inverse_transform([predicted_index])[0],
        "confidence": round(confidence, 4),
        "class_index": predicted_index,
    }
161
+
162
+
163
def get_model_and_tokenizer():
    """Return the module-level (model, tokenizer) pair for external callers."""
    return (model, tokenizer)
classification/bert_model.py ADDED
@@ -0,0 +1,417 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# =========================================================
# NOTE(review): a stale earlier draft of this training script had been
# accidentally concatenated ahead of the final version below. It
# duplicated the imports / config / data-load / clean / label-encode
# steps and wrote the label artifacts to a CWD-relative path
# ("classification/artifacts") instead of the module-relative one, so
# running the file did all that work twice with inconsistent output
# locations (its last pickle.dump was even fused onto the next header
# line). Removed — the script that follows is the single authoritative,
# path-safe version.
# =========================================================
78
+ # BERT PREPROCESSING + TRAINING + ARTIFACT GENERATION
79
+ # =========================================================
80
+
81
+ import os
82
+ import re
83
+ import pickle
84
+ import pandas as pd
85
+ import numpy as np
86
+ import torch
87
+
88
+ from sklearn.model_selection import train_test_split
89
+ from sklearn.preprocessing import LabelEncoder
90
+ from sklearn.metrics import (
91
+ accuracy_score,
92
+ f1_score,
93
+ balanced_accuracy_score,
94
+ matthews_corrcoef
95
+ )
96
+
97
+ from transformers import (
98
+ BertTokenizer,
99
+ BertForSequenceClassification,
100
+ Trainer,
101
+ TrainingArguments
102
+ )
103
+
104
+ from torch.utils.data import Dataset
105
+
106
+
107
# ---------------------------------------------------------
# PATH CONFIG (WINDOWS SAFE)
# ---------------------------------------------------------
# Anchor every path to this file's directory so the script behaves the
# same regardless of the current working directory; os.path.join also
# keeps separators correct on Windows.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(BASE_DIR, "train.csv")

ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
MODEL_DIR = os.path.join(ARTIFACT_DIR, "bert_model")

MAX_LENGTH = 100      # max token length fed to the tokenizer
EPOCHS = 3            # fine-tuning epochs
BATCH_SIZE = 16       # per-device batch size
LEARNING_RATE = 2e-5  # conventional BERT fine-tuning learning rate

os.makedirs(ARTIFACT_DIR, exist_ok=True)
122
+
123
+
124
# ---------------------------------------------------------
# 1. LOAD DATA
# ---------------------------------------------------------
print(f"📄 Loading dataset from: {DATA_PATH}")

# Expected CSV schema: at least 'text' and 'label' columns.
df = pd.read_csv(DATA_PATH)
df = df[['text', 'label']]        # keep only the columns used for training
df.dropna(inplace=True)           # drop rows missing text or label
df.drop_duplicates(inplace=True)  # exact duplicates would leak across splits
133
+
134
+
135
+ # ---------------------------------------------------------
136
+ # 2. CLEAN TEXT (BERT SAFE)
137
+ # ---------------------------------------------------------
138
def clean_text(text):
    """Normalise raw grievance text for bert-base-uncased.

    Strips HTML-like tags, drops non-ASCII characters (the uncased
    English BERT vocabulary cannot represent them) and collapses runs
    of whitespace into single spaces.
    """
    cleaned = str(text)
    # Remove anything that looks like an HTML/XML tag.
    cleaned = re.sub(r"<.*?>", " ", cleaned)
    # bert-base-uncased is English-only: replace non-ASCII runs with a space.
    cleaned = re.sub(r"[^\x00-\x7F]+", " ", cleaned)
    # Collapse whitespace and trim the edges.
    return re.sub(r"\s+", " ", cleaned).strip()
144
+
145
+ df["text"] = df["text"].apply(clean_text)
146
+
147
+
148
+ # ---------------------------------------------------------
149
+ # 3. LABEL ENCODING
150
+ # ---------------------------------------------------------
151
+ label_encoder = LabelEncoder()
152
+ df["label_id"] = label_encoder.fit_transform(df["label"])
153
+
154
+ label_map = dict(zip(label_encoder.classes_,
155
+ label_encoder.transform(label_encoder.classes_)))
156
+
157
+ # Save label artifacts
158
+ with open(os.path.join(ARTIFACT_DIR, "label_encoder.pkl"), "wb") as f:
159
+ pickle.dump(label_encoder, f)
160
+
161
+ with open(os.path.join(ARTIFACT_DIR, "label_map.pkl"), "wb") as f:
162
+ pickle.dump(label_map, f)
163
+
164
+ NUM_LABELS = len(label_map)
165
+ print(f"✅ Number of classes: {NUM_LABELS}")
166
+
167
+
168
+ # ---------------------------------------------------------
169
+ # 4. TRAIN / VAL / TEST SPLIT
170
+ # ---------------------------------------------------------
171
+ train_df, temp_df = train_test_split(
172
+ df,
173
+ test_size=0.30,
174
+ stratify=df["label_id"],
175
+ random_state=42
176
+ )
177
+
178
+ val_df, test_df = train_test_split(
179
+ temp_df,
180
+ test_size=0.50,
181
+ stratify=temp_df["label_id"],
182
+ random_state=42
183
+ )
184
+
185
+ # Save processed splits
186
+ train_df.to_csv(os.path.join(ARTIFACT_DIR, "train.csv"), index=False)
187
+ val_df.to_csv(os.path.join(ARTIFACT_DIR, "val.csv"), index=False)
188
+ test_df.to_csv(os.path.join(ARTIFACT_DIR, "test.csv"), index=False)
189
+
190
+
191
+ # ---------------------------------------------------------
192
+ # 5. TOKENIZER
193
+ # ---------------------------------------------------------
194
+ tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
195
+
196
+ with open(os.path.join(ARTIFACT_DIR, "tokenizer.pkl"), "wb") as f:
197
+ pickle.dump(tokenizer, f)
198
+
199
+
200
+ # ---------------------------------------------------------
201
+ # 6. TORCH DATASET
202
+ # ---------------------------------------------------------
203
class GrievanceDataset(Dataset):
    """Torch map-style dataset over eagerly tokenised grievance texts.

    Tokenisation happens once in ``__init__`` using the module-level
    ``tokenizer`` and ``MAX_LENGTH``; each item is the tensor dict the
    Hugging Face ``Trainer`` expects (encoding fields plus ``labels``).
    """

    def __init__(self, texts, labels):
        # Batch-encode the whole split up front: pad to the longest
        # sequence in the batch, truncate at MAX_LENGTH tokens.
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH,
        )
        self.labels = list(labels)

    def __getitem__(self, idx):
        # Build the per-sample tensor dict field by field.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

    def __len__(self):
        return len(self.labels)
220
+
221
+
222
+ train_dataset = GrievanceDataset(train_df["text"], train_df["label_id"])
223
+ val_dataset = GrievanceDataset(val_df["text"], val_df["label_id"])
224
+ test_dataset = GrievanceDataset(test_df["text"], test_df["label_id"])
225
+
226
+
227
+ # ---------------------------------------------------------
228
+ # 7. MODEL
229
+ # ---------------------------------------------------------
230
+ model = BertForSequenceClassification.from_pretrained(
231
+ "bert-base-uncased",
232
+ num_labels=NUM_LABELS
233
+ )
234
+
235
+
236
+ # ---------------------------------------------------------
237
+ # 8. METRICS
238
+ # ---------------------------------------------------------
239
def compute_metrics(eval_pred):
    """Metrics hook for the HF Trainer's evaluation passes.

    Returns accuracy, balanced accuracy, weighted F1 and Matthews
    correlation coefficient for the predicted vs. gold labels.
    """
    logits, labels = eval_pred
    # Hard class predictions: arg-max over the class logits.
    predicted = np.argmax(logits, axis=1)
    scores = {}
    scores["accuracy"] = accuracy_score(labels, predicted)
    scores["balanced_accuracy"] = balanced_accuracy_score(labels, predicted)
    scores["f1_weighted"] = f1_score(labels, predicted, average="weighted")
    scores["mcc"] = matthews_corrcoef(labels, predicted)
    return scores
249
+
250
+
251
+ # ---------------------------------------------------------
252
+ # 9. TRAINING
253
+ # ---------------------------------------------------------
254
+ training_args = TrainingArguments(
255
+ output_dir=os.path.join(ARTIFACT_DIR, "results"),
256
+ learning_rate=LEARNING_RATE,
257
+ per_device_train_batch_size=BATCH_SIZE,
258
+ per_device_eval_batch_size=BATCH_SIZE,
259
+ num_train_epochs=EPOCHS,
260
+ weight_decay=0.01,
261
+ logging_steps=100,
262
+ save_strategy="no",
263
+ report_to="none"
264
+ )
265
+
266
+ trainer = Trainer(
267
+ model=model,
268
+ args=training_args,
269
+ train_dataset=train_dataset,
270
+ eval_dataset=val_dataset,
271
+ compute_metrics=compute_metrics
272
+ )
273
+
274
+ trainer.train()
275
+
276
+
277
+ # ---------------------------------------------------------
278
+ # 10. FINAL TEST EVALUATION
279
+ # ---------------------------------------------------------
280
+ predictions = trainer.predict(test_dataset)
281
+ y_true = predictions.label_ids
282
+ y_pred = np.argmax(predictions.predictions, axis=1)
283
+
284
+ print("\n===== FINAL TEST METRICS =====")
285
+ print(f"Accuracy : {accuracy_score(y_true, y_pred):.4f}")
286
+ print(f"Balanced Accuracy : {balanced_accuracy_score(y_true, y_pred):.4f}")
287
+ print(f"Weighted F1 : {f1_score(y_true, y_pred, average='weighted'):.4f}")
288
+ print(f"MCC : {matthews_corrcoef(y_true, y_pred):.4f}")
289
+
290
+
291
+ # ---------------------------------------------------------
292
+ # 11. SAVE TRAINED MODEL
293
+ # ---------------------------------------------------------
294
+ model.save_pretrained(MODEL_DIR)
295
+
296
+ print("\n✅ PREPROCESSING + TRAINING COMPLETED SUCCESSFULLY")
297
+
298
+
299
+ NUM_LABELS = len(label_map)
300
+
301
+ # ---------------------------------------------------------
302
+ # 4. TRAIN / VAL / TEST SPLIT
303
+ # ---------------------------------------------------------
304
+ train_df, temp_df = train_test_split(
305
+ df, test_size=0.30, stratify=df['label_id'], random_state=42
306
+ )
307
+
308
+ val_df, test_df = train_test_split(
309
+ temp_df, test_size=0.50, stratify=temp_df['label_id'], random_state=42
310
+ )
311
+
312
+ # SAVE PREPROCESSED SPLITS
313
+ train_df.to_csv(f"{ARTIFACT_DIR}/train.csv", index=False)
314
+ val_df.to_csv(f"{ARTIFACT_DIR}/val.csv", index=False)
315
+ test_df.to_csv(f"{ARTIFACT_DIR}/test.csv", index=False)
316
+
317
+ # ---------------------------------------------------------
318
+ # 5. TOKENIZER
319
+ # ---------------------------------------------------------
320
+ tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
321
+
322
+ # SAVE TOKENIZER
323
+ with open(f"{ARTIFACT_DIR}/tokenizer.pkl", "wb") as f:
324
+ pickle.dump(tokenizer, f)
325
+
326
+ # ---------------------------------------------------------
327
+ # 6. DATASET CLASS
328
+ # ---------------------------------------------------------
329
class GrievanceDataset(Dataset):
    """Torch-compatible view over tokenised grievance texts.

    NOTE(review): this duplicates the ``GrievanceDataset`` defined
    earlier in this script — the file appears to contain two
    concatenated copies of the training pipeline; consider deduplicating.
    """

    def __init__(self, texts, labels):
        # Eagerly tokenise the whole split; the tokenizer returns a
        # dict-like batch of Python lists (input_ids, attention_mask, ...).
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH,
        )
        self.labels = list(labels)

    def __getitem__(self, idx):
        tensors = {name: torch.tensor(column[idx])
                   for name, column in self.encodings.items()}
        tensors["labels"] = torch.tensor(self.labels[idx])
        return tensors

    def __len__(self):
        return len(self.labels)
346
+
347
+ train_dataset = GrievanceDataset(train_df['text'], train_df['label_id'])
348
+ val_dataset = GrievanceDataset(val_df['text'], val_df['label_id'])
349
+ test_dataset = GrievanceDataset(test_df['text'], test_df['label_id'])
350
+
351
+ # ---------------------------------------------------------
352
+ # 7. MODEL
353
+ # ---------------------------------------------------------
354
+ model = BertForSequenceClassification.from_pretrained(
355
+ "bert-base-uncased",
356
+ num_labels=NUM_LABELS
357
+ )
358
+
359
+ # ---------------------------------------------------------
360
+ # 8. METRICS
361
+ # ---------------------------------------------------------
362
def compute_metrics(eval_pred):
    """Trainer metrics hook: accuracy, balanced accuracy, weighted F1, MCC.

    NOTE(review): the earlier compute_metrics in this script reports the
    F1 value under "f1_weighted" while this duplicated copy uses "f1" —
    the key names are inconsistent between the two pipeline halves.
    """
    logits, labels = eval_pred
    # Convert raw logits into hard class predictions.
    predicted = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, predicted),
        "balanced_accuracy": balanced_accuracy_score(labels, predicted),
        "f1": f1_score(labels, predicted, average="weighted"),
        "mcc": matthews_corrcoef(labels, predicted),
    }
372
+
373
+ # ---------------------------------------------------------
374
+ # 9. TRAINING
375
+ # ---------------------------------------------------------
376
+ training_args = TrainingArguments(
377
+ output_dir=f"{ARTIFACT_DIR}/results",
378
+ learning_rate=LEARNING_RATE,
379
+ per_device_train_batch_size=BATCH_SIZE,
380
+ per_device_eval_batch_size=BATCH_SIZE,
381
+ num_train_epochs=EPOCHS,
382
+ weight_decay=0.01,
383
+ logging_steps=100,
384
+ save_strategy="no",
385
+ report_to="none"
386
+ )
387
+
388
+ trainer = Trainer(
389
+ model=model,
390
+ args=training_args,
391
+ train_dataset=train_dataset,
392
+ eval_dataset=val_dataset,
393
+ tokenizer=tokenizer,
394
+ compute_metrics=compute_metrics
395
+ )
396
+
397
+ trainer.train()
398
+
399
+ # ---------------------------------------------------------
400
+ # 10. FINAL TEST EVALUATION
401
+ # ---------------------------------------------------------
402
+ predictions = trainer.predict(test_dataset)
403
+ y_true = predictions.label_ids
404
+ y_pred = np.argmax(predictions.predictions, axis=1)
405
+
406
+ print("\n===== FINAL TEST METRICS =====")
407
+ print(f"Accuracy : {accuracy_score(y_true, y_pred):.4f}")
408
+ print(f"Balanced Accuracy : {balanced_accuracy_score(y_true, y_pred):.4f}")
409
+ print(f"Weighted F1 : {f1_score(y_true, y_pred, average='weighted'):.4f}")
410
+ print(f"MCC : {matthews_corrcoef(y_true, y_pred):.4f}")
411
+
412
+ # ---------------------------------------------------------
413
+ # 11. SAVE TRAINED MODEL
414
+ # ---------------------------------------------------------
415
+ model.save_pretrained(MODEL_DIR)
416
+
417
+ print("\n✅ PREPROCESSING + TRAINING + ARTIFACT GENERATION COMPLETED")
classification/classification/artifacts/label_encoder.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b0be0d88eed1838fba777af266556aea55e435b970076684d2ad1c8c9b3fb0b
3
+ size 342
classification/classification/artifacts/label_map.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8e10c5614e117fd9ccab4af3fa62c0e4c44d23195847586d4d1ddb47f4a00cc
3
+ size 321
classification/indic_bert_classify.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =========================================================
2
+ # INDICBERT MODEL — CATEGORY CLASSIFICATION (HINDI + TELUGU)
3
+ # =========================================================
4
+
5
+ import os
6
+ import re
7
+ import torch
8
+ import pickle
9
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
10
+
11
+ # ── Path config ───────────────────────────────────────────
12
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
13
+ ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
14
+ MODEL_DIR = os.path.join(ARTIFACT_DIR, "indicbert_model")
15
+ MAX_LENGTH = 128
16
+
17
+ # ── Load artifacts ────────────────────────────────────────
18
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)
19
+
20
+ with open(os.path.join(ARTIFACT_DIR, "label_encoder.pkl"), "rb") as f:
21
+ label_encoder = pickle.load(f)
22
+
23
+ model = AutoModelForSequenceClassification.from_pretrained(
24
+ MODEL_DIR, local_files_only=True
25
+ )
26
+ model.eval()
27
+
28
+ # ── Edge-case constants ───────────────────────────────────
29
# Exact strings that are just a category label and nothing else — a bare
# label is not a grievance description, so it is rejected up front.
LABEL_WORDS = {
    "water", "electricity", "roads", "garbage",
    "sanitation", "pollution", "transport", "animals",
    "पानी", "बिजली", "सड़क", "कचरा",
    "నీరు", "విద్యుత్", "రోడ్డు", "చెత్త",
}

# Greetings / acknowledgements (English, Hindi, Telugu) that carry no
# actionable complaint.
NON_GRIEVANCE_PHRASES = {
    "hello", "hi", "good morning", "good evening",
    "thank you", "thanks", "all good", "no issues", "test", "demo",
    "नमस्ते", "धन्यवाद", "सब ठीक है", "कोई समस्या नहीं",
    "నమస్తే", "ధన్యవాదాలు", "అన్నీ బాగున్నాయి", "సమస్య లేదు",
}


# ── Text cleaning (Indic-safe) ────────────────────────────
def clean_text(text: str) -> str:
    """Strip HTML tags and characters outside the Hindi / Telugu /
    printable-ASCII ranges, then collapse whitespace."""
    text = str(text)
    text = re.sub(r"<.*?>", " ", text)
    # Keep Hindi (0900-097F), Telugu (0C00-0C7F), basic ASCII (0020-007F)
    text = re.sub(r"[^\u0900-\u097F\u0C00-\u0C7F\u0020-\u007F]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


# ── Input validation ──────────────────────────────────────
def validate_input(text: str):
    """Return a rejection reason for unusable input, or None when it is OK.

    Possible reasons: "empty_text", "label_only", "non_grievance_text",
    "too_short", "too_few_words".
    """
    if not text or not text.strip():
        return "empty_text"
    text_l = text.strip().lower()
    # BUG FIX: exact-match checks must run BEFORE the generic length
    # heuristics. Every LABEL_WORDS entry is a single word and several
    # NON_GRIEVANCE_PHRASES entries are short ("hi", "hello"), so with
    # the length checks first the "label_only" / "non_grievance_text"
    # branches were unreachable for those inputs — they were misreported
    # as "too_short" / "too_few_words".
    if text_l in LABEL_WORDS:
        return "label_only"
    if text_l in NON_GRIEVANCE_PHRASES:
        return "non_grievance_text"
    if len(text_l) < 5:
        return "too_short"
    if len(text_l.split()) < 2:
        return "too_few_words"
    return None
68
+
69
+
70
+ # ── Predict ───────────────────────────────────────────────
71
def predict(
    text: str,
    input_ids=None,      # O3: pre-tokenised tensor from main.py
    attention_mask=None, # O3: pre-tokenised tensor from main.py
) -> dict:
    """
    Predict grievance category for Hindi / Telugu text.

    Args:
        text : Raw input string (always required for validation).
        input_ids : Optional pre-tokenised tensor (1, seq_len).
        attention_mask : Required when input_ids is provided.

    Returns:
        dict with keys: status, category, confidence, class_index
        (plus "reason" on validation failure or low confidence).
    """
    # Rule-based gate: reject empty / trivial / non-grievance inputs
    # before spending a model forward pass on them.
    failure_reason = validate_input(text)
    if failure_reason is not None:
        return {
            "status": "failed",
            "reason": failure_reason,
            "category": None,
            "confidence": 0.0,
            "class_index": None,
        }

    # Indic-safe normalisation of the raw text.
    normalised = clean_text(text)

    # O3: tokenise here unless the caller already supplied tensors.
    if input_ids is None:
        encoded = tokenizer(
            normalised,
            return_tensors="pt",
            truncation=True,
            padding=False,
            max_length=MAX_LENGTH,
        )
        input_ids = encoded["input_ids"]
        attention_mask = encoded["attention_mask"]

    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    probabilities = torch.softmax(logits, dim=1)
    top_prob, top_idx = torch.max(probabilities, dim=1)
    confidence = top_prob.item()
    predicted_index = top_idx.item()

    # Below 30% confidence we refuse to commit to a class and fall back
    # to the generic "Other" bucket (still a successful call).
    if confidence < 0.30:
        return {
            "status": "success",
            "reason": "low_confidence",
            "category": "Other",
            "confidence": round(confidence, 4),
            "class_index": predicted_index,
        }

    category = label_encoder.inverse_transform([predicted_index])[0]

    return {
        "status": "success",
        "category": category,
        "confidence": round(confidence, 4),
        "class_index": predicted_index,
    }
139
+
140
+
141
def get_model_and_tokenizer():
    """Return the module-level (model, tokenizer) pair so callers can reuse them (e.g. for pre-tokenising)."""
    return model, tokenizer
classification/indic_bert_model.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =========================================================
2
+ # INDICBERT PREPROCESSING + TRAINING + ARTIFACT GENERATION
3
+ # Hindi + Telugu Grievance Classification
4
+ # =========================================================
5
+
6
+ import os
7
+ import re
8
+ import pickle
9
+ import pandas as pd
10
+ import numpy as np
11
+ import torch
12
+
13
+ from sklearn.model_selection import train_test_split
14
+ from sklearn.preprocessing import LabelEncoder
15
+ from sklearn.metrics import (
16
+ accuracy_score,
17
+ f1_score,
18
+ balanced_accuracy_score,
19
+ matthews_corrcoef
20
+ )
21
+
22
+ from transformers import (
23
+ AutoTokenizer,
24
+ AutoModelForSequenceClassification,
25
+ Trainer,
26
+ TrainingArguments
27
+ )
28
+
29
+ from torch.utils.data import Dataset
30
+
31
+
32
+ # =========================================================
33
+ # CONFIG
34
+ # =========================================================
35
+
36
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
37
+ DATA_PATH = os.path.join(BASE_DIR, "indic_train.csv")
38
+
39
+ ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
40
+ MODEL_DIR = os.path.join(ARTIFACT_DIR, "indicbert_model")
41
+
42
+ MAX_LENGTH = 128
43
+ EPOCHS = 4
44
+ BATCH_SIZE = 16
45
+ LEARNING_RATE = 2e-5
46
+
47
+ MODEL_NAME = "ai4bharat/indic-bert"
48
+
49
+ os.makedirs(ARTIFACT_DIR, exist_ok=True)
50
+ os.makedirs(MODEL_DIR, exist_ok=True)
51
+
52
+ print(f"📄 Loading dataset from: {DATA_PATH}")
53
+
54
+
55
+ # =========================================================
56
+ # LOAD DATA
57
+ # =========================================================
58
+
59
+ df = pd.read_csv(DATA_PATH)
60
+
61
+ df = df[['text', 'label']]
62
+
63
+ df.dropna(inplace=True)
64
+ df.drop_duplicates(inplace=True)
65
+
66
+
67
+ # =========================================================
68
+ # CLEAN TEXT (KEEP HINDI & TELUGU SAFE)
69
+ # =========================================================
70
+
71
def clean_text(text):
    """Normalise text while preserving Devanagari and Telugu script.

    Removes HTML tags, drops characters outside the Hindi (U+0900–097F),
    Telugu (U+0C00–0C7F) and printable-ASCII (U+0020–007F) ranges, and
    collapses whitespace.
    """
    result = str(text)
    # Strip HTML/XML tags.
    result = re.sub(r"<.*?>", " ", result)
    # Drop everything outside the allowed Hindi / Telugu / ASCII ranges.
    result = re.sub(r"[^\u0900-\u097F\u0C00-\u0C7F\u0020-\u007F]", " ", result)
    # Collapse whitespace runs and trim.
    return re.sub(r"\s+", " ", result).strip()
84
+
85
+
86
+ df["text"] = df["text"].apply(clean_text)
87
+
88
+
89
+ # =========================================================
90
+ # LABEL ENCODING
91
+ # =========================================================
92
+
93
+ label_encoder = LabelEncoder()
94
+
95
+ df["label_id"] = label_encoder.fit_transform(df["label"])
96
+
97
+ label_map = dict(zip(
98
+ label_encoder.classes_,
99
+ label_encoder.transform(label_encoder.classes_)
100
+ ))
101
+
102
+
103
+ # SAVE LABEL ARTIFACTS
104
+ with open(os.path.join(ARTIFACT_DIR, "label_encoder.pkl"), "wb") as f:
105
+ pickle.dump(label_encoder, f)
106
+
107
+ with open(os.path.join(ARTIFACT_DIR, "label_map.pkl"), "wb") as f:
108
+ pickle.dump(label_map, f)
109
+
110
+
111
+ NUM_LABELS = len(label_map)
112
+
113
+ print(f"✅ Number of classes: {NUM_LABELS}")
114
+
115
+
116
+ # =========================================================
117
+ # TRAIN / VAL / TEST SPLIT
118
+ # =========================================================
119
+
120
+ train_df, temp_df = train_test_split(
121
+ df,
122
+ test_size=0.30,
123
+ stratify=df["label_id"],
124
+ random_state=42
125
+ )
126
+
127
+ val_df, test_df = train_test_split(
128
+ temp_df,
129
+ test_size=0.50,
130
+ stratify=temp_df["label_id"],
131
+ random_state=42
132
+ )
133
+
134
+
135
+ # SAVE SPLITS
136
+ train_df.to_csv(os.path.join(ARTIFACT_DIR, "indic_train.csv"), index=False)
137
+ val_df.to_csv(os.path.join(ARTIFACT_DIR, "indic_val.csv"), index=False)
138
+ test_df.to_csv(os.path.join(ARTIFACT_DIR, "indic_test.csv"), index=False)
139
+
140
+
141
+ # =========================================================
142
+ # TOKENIZER
143
+ # =========================================================
144
+
145
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
146
+
147
+ with open(os.path.join(ARTIFACT_DIR, "indic_tokenizer.pkl"), "wb") as f:
148
+ pickle.dump(tokenizer, f)
149
+
150
+
151
+ # =========================================================
152
+ # DATASET CLASS
153
+ # =========================================================
154
+
155
class GrievanceDataset(Dataset):
    """Dataset of tokenised Hindi/Telugu grievances for the HF Trainer."""

    def __init__(self, texts, labels):
        # One batched tokenizer call for the whole split (pads to the
        # longest sequence, truncates at MAX_LENGTH).
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH,
        )
        self.labels = list(labels)

    def __getitem__(self, idx):
        # Per-field tensors plus the gold label, as the Trainer expects.
        sample = {name: torch.tensor(values[idx])
                  for name, values in self.encodings.items()}
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

    def __len__(self):
        return len(self.labels)
184
+
185
+
186
+
187
+ train_dataset = GrievanceDataset(
188
+ train_df["text"],
189
+ train_df["label_id"]
190
+ )
191
+
192
+ val_dataset = GrievanceDataset(
193
+ val_df["text"],
194
+ val_df["label_id"]
195
+ )
196
+
197
+ test_dataset = GrievanceDataset(
198
+ test_df["text"],
199
+ test_df["label_id"]
200
+ )
201
+
202
+
203
+ # =========================================================
204
+ # MODEL
205
+ # =========================================================
206
+
207
+ model = AutoModelForSequenceClassification.from_pretrained(
208
+ MODEL_NAME,
209
+ num_labels=NUM_LABELS
210
+ )
211
+
212
+
213
+ # =========================================================
214
+ # METRICS
215
+ # =========================================================
216
+
217
def compute_metrics(eval_pred):
    """Evaluation hook for the HF Trainer.

    Computes accuracy, balanced accuracy, weighted F1 and Matthews
    correlation coefficient from (logits, gold labels).
    """
    logits, labels = eval_pred
    # Arg-max over class logits → hard predictions.
    predicted = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, predicted),
        "balanced_accuracy": balanced_accuracy_score(labels, predicted),
        "f1_weighted": f1_score(labels, predicted, average="weighted"),
        "mcc": matthews_corrcoef(labels, predicted),
    }
234
+
235
+
236
+ # =========================================================
237
+ # TRAINING
238
+ # =========================================================
239
+
240
+ training_args = TrainingArguments(
241
+ output_dir=f"{ARTIFACT_DIR}/indic_results",
242
+ learning_rate=LEARNING_RATE,
243
+ per_device_train_batch_size=BATCH_SIZE,
244
+ per_device_eval_batch_size=BATCH_SIZE,
245
+ num_train_epochs=EPOCHS,
246
+ weight_decay=0.01,
247
+ logging_steps=100,
248
+ save_strategy="no",
249
+ report_to="none"
250
+ )
251
+
252
+
253
+ trainer = Trainer(
254
+ model=model,
255
+ args=training_args,
256
+ train_dataset=train_dataset,
257
+ eval_dataset=val_dataset,
258
+ compute_metrics=compute_metrics
259
+ )
260
+
261
+
262
+
263
+ print("\n🚀 Training IndicBERT Model...\n")
264
+
265
+ trainer.train()
266
+
267
+
268
+ # =========================================================
269
+ # FINAL TEST EVALUATION
270
+ # =========================================================
271
+
272
+ predictions = trainer.predict(test_dataset)
273
+
274
+ y_true = predictions.label_ids
275
+
276
+ y_pred = np.argmax(predictions.predictions, axis=1)
277
+
278
+
279
+ print("\n===== FINAL TEST METRICS =====")
280
+
281
+ print(f"Accuracy : {accuracy_score(y_true, y_pred):.4f}")
282
+
283
+ print(f"Balanced Accuracy : {balanced_accuracy_score(y_true, y_pred):.4f}")
284
+
285
+ print(f"Weighted F1 : {f1_score(y_true, y_pred, average='weighted'):.4f}")
286
+
287
+ print(f"MCC : {matthews_corrcoef(y_true, y_pred):.4f}")
288
+
289
+
290
+ # =========================================================
291
+ # SAVE MODEL
292
+ # =========================================================
293
+
294
+ model.save_pretrained(MODEL_DIR)
295
+
296
+ tokenizer.save_pretrained(MODEL_DIR)
297
+
298
+
299
+ print("\n✅ INDICBERT TRAINING COMPLETED SUCCESSFULLY")
classification/indic_train.csv ADDED
The diff for this file is too large to render. See raw diff
 
classification/train.csv ADDED
The diff for this file is too large to render. See raw diff
 
gfas/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # =========================================================
2
+ # gfas/__init__.py
3
+ # Public surface of the GFAS package.
4
+ # main.py only needs to import `audit` from here.
5
+ # =========================================================
6
+
7
+ from .fairness_audit import audit
8
+
9
+ __all__ = ["audit"]
gfas/disparity_analysis.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =========================================================
2
+ # gfas/disparity_analysis.py
3
+ # Per-group metric computation and per-dimension disparity
4
+ # analysis (gaps, flags, breakdown table).
5
+ # =========================================================
6
+ import statistics
7
+
8
+ from .fairness_metrics import (
9
+ URGENCY_POSITIVE,
10
+ FAIRNESS_DIMENSIONS,
11
+ PARITY_FLAG_THRESHOLD,
12
+ PRIORITY_FLAG_THRESHOLD,
13
+ TPR_FLAG_THRESHOLD,
14
+ gap_to_score,
15
+ score_label,
16
+ severity,
17
+ )
18
+
19
+
20
+ # ── Internal helpers ──────────────────────────────────────
21
+
22
+ def _gap(values: list) -> float | None:
23
+ """Max – min over a list, ignoring Nones. Returns None if fewer than 2 clean values."""
24
+ clean = [v for v in values if v is not None]
25
+ return round(max(clean) - min(clean), 4) if len(clean) >= 2 else None
26
+
27
+
28
+ # ── Public API ────────────────────────────────────────────
29
+
30
def compute_group_metrics(items: list) -> dict:
    """
    Compute per-group fairness metrics for a single bucket of grievance records.

    Returns a dict with keys:
        count, resolution_rate, statistical_parity,
        equal_opportunity_tpr, mean_priority_score
    """
    total = len(items)

    # Statistical parity: share of records the model marked urgent.
    flagged_urgent = [r for r in items if r["predicted_urgency"] in URGENCY_POSITIVE]
    parity = round(len(flagged_urgent) / total, 4)

    # Equal opportunity: true-positive rate among records whose ground
    # truth is urgent. Undefined (None) when the group has none.
    truly_urgent = [r for r in items if r["true_urgency"] in URGENCY_POSITIVE]
    if truly_urgent:
        caught = [r for r in truly_urgent if r["predicted_urgency"] in URGENCY_POSITIVE]
        tpr = round(len(caught) / len(truly_urgent), 4)
    else:
        tpr = None

    # Average model-assigned priority score for the group.
    avg_priority = round(statistics.mean([r["priority_score"] for r in items]), 4)

    # Share of records whose workflow status is "resolved".
    resolved = [r for r in items if r.get("status", "") == "resolved"]

    return {
        "count": total,
        "resolution_rate": round(len(resolved) / total, 4),
        "statistical_parity": parity,
        "equal_opportunity_tpr": tpr,
        "mean_priority_score": avg_priority,
    }
61
+
62
+
63
def analyse_dimension(dimension: str, group_metrics: dict) -> dict:
    """
    Given a dimension name and its {group → metrics} dict, compute:
      - gap values across groups
      - fairness score, label, severity
      - flagged groups
      - breakdown table
      - fairness_flags list
    """
    # Collect each metric across all groups of this dimension. TPR may be
    # None for groups with no truly-urgent cases — those are skipped.
    parity_vals = [v["statistical_parity"] for v in group_metrics.values()]
    priority_vals = [v["mean_priority_score"] for v in group_metrics.values()]
    tpr_vals = [v["equal_opportunity_tpr"] for v in group_metrics.values()
                if v["equal_opportunity_tpr"] is not None]
    res_vals = [v["resolution_rate"] for v in group_metrics.values()]

    # Max-minus-min spread per metric (_gap returns None for < 2 values).
    sp_gap = _gap(parity_vals)
    tpr_gap = _gap(tpr_vals)
    pri_gap = _gap(priority_vals)
    res_gap = _gap(res_vals)

    # Overall score = worst (lowest) of the per-metric scores: a dimension
    # is only as fair as its worst metric. Resolution-rate gap is reported
    # below but does not feed the score.
    sub_scores = [s for s in [gap_to_score(sp_gap), gap_to_score(tpr_gap), gap_to_score(pri_gap)]
                  if s is not None]
    fairness_score = min(sub_scores) if sub_scores else 100

    avg_parity = round(statistics.mean(parity_vals), 4) if parity_vals else 0
    avg_resolution = round(statistics.mean(res_vals), 4) if res_vals else 0

    # Flag groups whose urgency rate trails the dimension average by
    # more than 10 percentage points.
    flagged_groups = [
        g for g, m in group_metrics.items()
        if m["statistical_parity"] < avg_parity - 0.10
    ]

    # Per-group breakdown table, sorted ascending by "resolutionRate".
    # NOTE(review): "resolutionRate" is computed from statistical_parity,
    # not from the resolution_rate metric — confirm this is intentional
    # (e.g. a frontend naming choice) rather than a copy-paste slip.
    breakdown = sorted(
        [
            {
                dimension: group,
                "resolutionRate": round(m["statistical_parity"] * 100, 2),
                "total": m["count"],
                "statisticalParity": m["statistical_parity"],
                "tpr": m["equal_opportunity_tpr"],
                "meanPriorityScore": m["mean_priority_score"],
                "isFlagged": group in flagged_groups,
            }
            for group, m in group_metrics.items()
        ],
        key=lambda x: x["resolutionRate"],
    )

    # Human-readable flags, one per metric whose gap exceeds its
    # imported threshold (gaps of None — too few groups — never flag).
    fairness_flags = []
    if sp_gap is not None and sp_gap > PARITY_FLAG_THRESHOLD:
        fairness_flags.append({
            "metric": "statistical_parity",
            "gap": sp_gap,
            "label": f"Urgency-rate gap of {sp_gap * 100:.1f}% across {dimension} groups",
            "interpretation": "Some groups are significantly more (or less) likely to have their grievances classified as high/critical urgency.",
        })
    if pri_gap is not None and pri_gap > PRIORITY_FLAG_THRESHOLD:
        fairness_flags.append({
            "metric": "mean_priority_score",
            "gap": pri_gap,
            "label": f"Priority-score gap of {pri_gap:.3f} across {dimension} groups",
            "interpretation": "Some groups receive systematically higher or lower priority scores, affecting response speed.",
        })
    if tpr_gap is not None and tpr_gap > TPR_FLAG_THRESHOLD:
        fairness_flags.append({
            "metric": "equal_opportunity_tpr",
            "gap": tpr_gap,
            "label": f"Detection-rate gap of {tpr_gap * 100:.1f}% for truly urgent cases across {dimension} groups",
            "interpretation": "The model misses urgent cases at different rates across groups.",
        })

    # Assemble the report payload. camelCase keys feed the UI directly;
    # snake_case keys carry the raw metric detail.
    # NOTE(review): "average" is the mean statistical parity (as a
    # percentage), despite the generic name — see breakdown note above.
    return {
        "fairnessScore": fairness_score,
        "fairnessLabel": score_label(fairness_score),
        "severity": severity(fairness_score),
        "groups_found": sorted(group_metrics.keys()),
        "average": round(avg_parity * 100, 2),
        "average_resolution": round(avg_resolution * 100, 2),
        "breakdown": breakdown,
        "flagged": flagged_groups,
        "group_metrics": group_metrics,
        "disparity_summary": {
            "statistical_parity_gap": sp_gap,
            "equal_opportunity_tpr_gap": tpr_gap,
            "mean_priority_score_gap": pri_gap,
            "resolution_rate_gap": res_gap,
            "statistical_parity_gap_label": f"{round(sp_gap * 100, 1)}% urgency-rate spread" if sp_gap is not None else None,
            "equal_opportunity_tpr_gap_label": f"{round(tpr_gap * 100, 1)}% detection-rate gap" if tpr_gap is not None else None,
            "mean_priority_score_gap_label": f"{round(pri_gap, 3)} priority-score spread" if pri_gap is not None else None,
            "resolution_rate_gap_label": f"{round(res_gap * 100, 1)}% resolution-rate gap" if res_gap is not None else None,
        },
        "fairness_flags": fairness_flags,
        "flags_raised": len(fairness_flags),
    }
gfas/fairness_audit.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =========================================================
2
+ # gfas/fairness_audit.py
3
+ # Input validation and the single callable that main.py
4
+ # imports to power the POST /fairness-audit route.
5
+ # =========================================================
6
+
7
+ from .fairness_metrics import (
8
+ VALID_AREAS,
9
+ VALID_CATEGORIES,
10
+ VALID_LANGUAGES,
11
+ VALID_URGENCY,
12
+ )
13
+ from .gfas_engine import run_fairness_audit
14
+
15
+
16
+ # ── Record-level validation ───────────────────────────────
17
+
18
def _validate_record(idx: int, r) -> tuple[dict | None, dict | None]:
    """
    Normalise and validate a single raw grievance dict.

    Returns (record, None) on success, (None, skip_entry) on failure.
    """
    if not isinstance(r, dict):
        return None, {"index": idx, "error": "Not a JSON object"}

    def _norm(key: str, default: str = "") -> str:
        # Coerce to a trimmed, lower-cased string for case-insensitive matching.
        return str(r.get(key, default)).strip().lower()

    try:
        area = _norm("area")
        category = _norm("category")
        language = _norm("language", "english")
        pred = _norm("predicted_urgency", "medium")
        true_urg = _norm("true_urgency", pred)
        score = float(r.get("priority_score", 0))
        status = _norm("status", "pending")
    except Exception as e:
        return None, {"index": idx, "error": f"Field parse error: {e}"}

    # Hard failures: area and category must come from the known vocabularies.
    if area not in VALID_AREAS:
        return None, {"index": idx, "error": f"area not in VALID_AREAS: '{area}'"}
    if category not in VALID_CATEGORIES:
        return None, {"index": idx, "error": f"category not in VALID_CATEGORIES: '{category}'"}

    # Soft-correct out-of-vocabulary enum values instead of rejecting the record.
    if language not in VALID_LANGUAGES:
        language = "english"
    if pred not in VALID_URGENCY:
        pred = "medium"
    if true_urg not in VALID_URGENCY:
        true_urg = pred

    record = {
        "area": area,
        "category": category,
        "language": language,
        "predicted_urgency": pred,
        "true_urgency": true_urg,
        "priority_score": score,
        "status": status,
    }
    return record, None
60
+
61
+
62
+ # ── Public callable used by the route ────────────────────
63
+
64
def audit(raw_grievances: list) -> tuple[dict | None, dict | None, int]:
    """
    Validate *raw_grievances* and run the full GFAS pipeline.

    Returns:
        (result_dict, None, 200) — success
        (None, error_dict, 4xx)  — validation failure
    """
    if not (isinstance(raw_grievances, list) and raw_grievances):
        error = {
            "status": "failed",
            "message": "'grievances' must be a non-empty list.",
        }
        return None, error, 422

    validated: list = []
    skipped: list = []
    for idx, raw in enumerate(raw_grievances):
        record, problem = _validate_record(idx, raw)
        if record:
            validated.append(record)
        else:
            skipped.append(problem)

    # Fairness metrics are meaningless with fewer than two comparable records.
    if len(validated) < 2:
        message = (
            f"Only {len(validated)} valid record(s) after validation "
            f"({len(skipped)} skipped). Need at least 2 records across different "
            f"groups to compute fairness metrics."
        )
        error = {
            "status": "failed",
            "message": message,
            "skipped": skipped[:10],
            "skipped_count": len(skipped),
            "received_count": len(raw_grievances),
        }
        return None, error, 422

    result = {
        "status": "success",
        "fairness_audit": run_fairness_audit(validated),
        "meta": {
            "received": len(raw_grievances),
            "valid": len(validated),
            "skipped": len(skipped),
            "skipped_details": skipped[:5] if skipped else [],
        },
    }
    return result, None, 200
gfas/fairness_metrics.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# =========================================================
# gfas/fairness_metrics.py
# Constants and primitive scoring helpers used across GFAS.
# =========================================================

# ── Urgency label sets ────────────────────────────────────
# Full label vocabulary accepted for predicted/true urgency fields.
VALID_URGENCY = {"low", "medium", "high", "critical"}
# Labels treated as the "positive" (urgent) class in parity/TPR computations.
URGENCY_POSITIVE = {"high", "critical"}

# ── Domain allow-lists ────────────────────────────────────
# Grievance categories the audit recognises; records outside this set are rejected.
VALID_CATEGORIES = {
    "electricity", "garbage", "pollution", "public transport",
    "roads", "sanitation", "stray animals", "water"
}

# Submission languages; anything else is soft-corrected to "english".
VALID_LANGUAGES = {"telugu", "english", "hindi"}

# Kakinada localities (lower-cased), grouped by municipal zone.
VALID_AREAS = {
    # Zone 1
    "suryaraopeta", "jagannaickpur", "raja rao peta", "bhanugudi",
    "old town", "rajah street", "main road",
    # Zone 2
    "gandhi nagar", "ashok nagar", "nethaji nagar",
    "srinivasa nagar", "tngo colony", "shankar vilas",
    "collector's colony",
    # Zone 3
    "new town", "bank colony", "drivers colony",
    "fci colony", "burma colony", "dwaraka nagar",
    "ayodhya nagar",
    # Zone 4
    "kakinada port area", "kakinada industrial area",
    "fishing harbour", "dairy farm", "auto nagar",
    "kaleswara rao nagar",
    # Zone 5
    "ramanayyapeta", "rama rao peta", "kondayya palem",
    "ganganapalle", "gudari gunta", "indrapalem",
    # Zone 6
    "sarpavaram", "uppada", "kaikavolu",
    "kothuru", "thammavaram", "thimmapuram",
    # Zone 7
    "vivekananda street", "jr ntr road",
    "jntu kakinada area", "govt general hospital area",
    "apsp camp",
    # Other
    "kakinada beach road", "kakinada bazar",
    "anjaneya nagar",
}

# ── Audit dimensions ──────────────────────────────────────
# Grouping attributes the fairness audit iterates over.
FAIRNESS_DIMENSIONS = ["area", "category", "language"]

# ── Flag thresholds ───────────────────────────────────────
# A disparity gap above the matching threshold raises a fairness flag.
PARITY_FLAG_THRESHOLD = 0.20
PRIORITY_FLAG_THRESHOLD = 0.20
TPR_FLAG_THRESHOLD = 0.20
56
+
57
+
58
+ # ── Primitive scorers ─────────────────────────────────────
59
+
60
def gap_to_score(gap) -> int:
    """Map a disparity gap in [0, 1] to a 0–100 fairness score (higher = fairer)."""
    if gap is None:
        # No measurable gap means nothing to penalise.
        return 100
    raw = round(100 - float(gap) * 200)
    return min(100, max(0, raw))
65
+
66
+
67
def score_label(score: int) -> str:
    """Human-readable label for a 0–100 fairness score."""
    for floor, label in ((80, "Fair"), (60, "Moderate")):
        if score >= floor:
            return label
    return "Biased"
73
+
74
+
75
def severity(score: int) -> str:
    """Severity bucket for a 0–100 fairness score: ok / warning / critical."""
    if score < 60:
        return "critical"
    return "ok" if score >= 80 else "warning"
gfas/gfas_engine.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =========================================================
2
+ # gfas/gfas_engine.py
3
+ # Top-level orchestrator: feeds validated records through
4
+ # disparity analysis and report generation to produce the
5
+ # final fairness audit payload.
6
+ # =========================================================
7
+ import statistics
8
+ from collections import defaultdict
9
+
10
+ from .fairness_metrics import FAIRNESS_DIMENSIONS, score_label, severity
11
+ from .disparity_analysis import compute_group_metrics, analyse_dimension
12
+ from .report_generator import build_alerts, build_recommendations
13
+
14
+
15
def run_fairness_audit(validated_grievances: list) -> dict:
    """
    Main entry point for GFAS.

    Args:
        validated_grievances: List of dicts already normalised by fairness_audit.py.
            Required keys: area, category, language, predicted_urgency,
            true_urgency, priority_score, status.

    Returns:
        Full fairness audit payload ready for JSON serialisation.
    """
    dimension_results: dict = {}

    for dim in FAIRNESS_DIMENSIONS:
        # Bucket records by their value along this dimension.
        grouped: dict = defaultdict(list)
        for record in validated_grievances:
            grouped[record[dim]].append(record)

        # Per-group metrics, then the dimension-level disparity analysis.
        per_group = {
            name: compute_group_metrics(members)
            for name, members in grouped.items()
        }
        dimension_results[dim] = analyse_dimension(dim, per_group)

    # Overall score is the plain mean of the per-dimension scores.
    overall_score = round(
        statistics.mean(
            dimension_results[d]["fairnessScore"] for d in FAIRNESS_DIMENSIONS
        ),
        2,
    )

    alerts = build_alerts(dimension_results)
    recommendations = build_recommendations(dimension_results)

    # Disparity index = worst statistical-parity gap seen across dimensions.
    sp_gaps = [
        dimension_results[d]["disparity_summary"]["statistical_parity_gap"]
        for d in FAIRNESS_DIMENSIONS
    ]
    known_gaps = [g for g in sp_gaps if g is not None]
    disparity_index = round(max(known_gaps), 4) if known_gaps else None

    resolved_flags = [r.get("status", "") == "resolved" for r in validated_grievances]

    return {
        "overallFairnessScore": overall_score,
        "fairnessLabel": score_label(overall_score),
        "severity": severity(overall_score),
        "area": dimension_results["area"],
        "category": dimension_results["category"],
        "language": dimension_results["language"],
        "summary": {
            "totalGrievances": len(validated_grievances),
            "avgResolutionRate": round(statistics.mean(resolved_flags) * 100, 2),
            "disparityIndex": disparity_index,
            "dimensionsAudited": FAIRNESS_DIMENSIONS,
            "flagsRaised": sum(
                dimension_results[d]["flags_raised"] for d in FAIRNESS_DIMENSIONS
            ),
        },
        "alerts": alerts,
        "recommendations": recommendations,
        "dimensions_audited": FAIRNESS_DIMENSIONS,
        "total_grievances": len(validated_grievances),
        "results": dimension_results,
    }
gfas/report_generator.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =========================================================
2
+ # gfas/report_generator.py
3
+ # Builds human-readable alerts and actionable recommendations
4
+ # from per-dimension disparity analysis results.
5
+ # =========================================================
6
+
7
+ from .fairness_metrics import FAIRNESS_DIMENSIONS
8
+
9
+
10
# ── Copy templates ────────────────────────────────────────

# One headline recommendation title per audited dimension.
_TITLE_MAP = {
    "area": "Improve urgency detection in under-served areas",
    "category": "Address priority-score gap across grievance categories",
    "language": "Ensure equitable urgency classification by submission language",
}
17
+
18
+ def _desc(dimension: str, dr: dict) -> str:
19
+ flagged_str = ", ".join(dr["flagged"][:2]) or "N/A"
20
+ if dimension == "area":
21
+ gap_label = dr["disparity_summary"]["statistical_parity_gap_label"] or "unknown"
22
+ return (
23
+ f"Areas {flagged_str} show urgency-rate gaps of {gap_label}. "
24
+ "Assign dedicated officers and increase patrol frequency in these localities."
25
+ )
26
+ if dimension == "category":
27
+ return (
28
+ f"Categories with low parity scores indicate the model under-prioritises certain complaint types. "
29
+ f"Retrain or re-weight the urgency classifier for {flagged_str}."
30
+ )
31
+ # language
32
+ return (
33
+ f"Grievances in {flagged_str} receive lower urgency scores. "
34
+ "Deploy multilingual reviewers or a translation-aware pre-processing step before classification."
35
+ )
36
+
37
+
38
+ # ── Public API ────────────────────────────────────────────
39
+
40
def build_alerts(dimension_results: dict) -> list[dict]:
    """
    Return a list of alert dicts for every dimension whose fairness score < 80.
    """
    alerts: list[dict] = []
    for dim in FAIRNESS_DIMENSIONS:
        dr = dimension_results[dim]
        score = dr["fairnessScore"]
        if score >= 80:
            continue

        # Summarise up to three affected group names, noting any overflow.
        flagged = dr["flagged"]
        flagged_str = ""
        if flagged:
            shown = flagged[:3]
            overflow = len(flagged) - 3
            flagged_str = f" Affected {dim}s: {', '.join(shown)}"
            if overflow > 0:
                flagged_str += f" +{overflow} more"
            flagged_str += "."

        flag_details = "; ".join(f["label"] for f in dr["fairness_flags"])
        if score < 60:
            severity_word = "Significant"
            action_word = "Immediate review recommended."
        else:
            severity_word = "Moderate"
            action_word = "Monitor resolution trends."

        alerts.append({
            "severity": dr["severity"],
            "message": f"{severity_word} {dim} fairness disparity ({flag_details}).{flagged_str} {action_word}",
            "dimension": dim,
        })

    return alerts
70
+
71
+
72
def build_recommendations(dimension_results: dict) -> list[dict]:
    """
    Return actionable recommendations for every dimension whose fairness score < 80,
    sorted from worst to best.
    """
    def _score(dim: str):
        return dimension_results[dim]["fairnessScore"]

    below_par = [d for d in FAIRNESS_DIMENSIONS if _score(d) < 80]

    recommendations: list[dict] = []
    for dim in sorted(below_par, key=_score):
        dr = dimension_results[dim]
        affected = ", ".join(dr["flagged"][:2])
        recommendations.append({
            "priority": "high" if dr["fairnessScore"] < 60 else "medium",
            "title": _TITLE_MAP[dim],
            "description": _desc(dim, dr),
            "dimension": dim,
            "affectedArea": affected or None,
        })

    return recommendations
main.py ADDED
@@ -0,0 +1,707 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =========================================================
2
+ # FLASK API — MULTILINGUAL GRIEVANCE + XPE + GFAS
3
+ # INTEGRATED GRADIENTS ONLY (PRODUCTION VERSION)
4
+ # Hugging Face Spaces — Production Deployment
5
+ # Multimodal: text / audio / image(evidence) support
6
+ # =========================================================
7
+ from flask import Flask, request, jsonify
8
+ import re
9
+ import io
10
+ import traceback
11
+ import logging
12
+ import math
13
+ import os
14
+ from concurrent.futures import ThreadPoolExecutor, as_completed
15
+ from datetime import datetime, timezone
16
+
17
+ # ── Silence noisy loggers ────────────────────────────────
18
+ logging.getLogger("prophet").setLevel(logging.ERROR)
19
+ logging.getLogger("cmdstanpy").setLevel(logging.ERROR)
20
+ logging.basicConfig(level=logging.INFO)
21
+ logger = logging.getLogger(__name__)
22
+
23
+ from prophet import Prophet
24
+ import pandas as pd
25
+
26
+ # ── EXIF extraction ──────────────────────────────────────
27
+ from PIL import Image
28
+ import piexif
29
+
30
+ # =========================================================
31
+ # CATEGORY PREDICTION
32
+ # =========================================================
33
+ from classification.bert_classify import (
34
+ predict as predict_category_en,
35
+ get_model_and_tokenizer as get_cat_en,
36
+ )
37
+ from classification.indic_bert_classify import (
38
+ predict as predict_category_indic,
39
+ get_model_and_tokenizer as get_cat_indic,
40
+ )
41
+
42
+ # =========================================================
43
+ # URGENCY PREDICTION
44
+ # =========================================================
45
+ from sentiment_analysis.bert_predict import (
46
+ predict_urgency as predict_urgency_en,
47
+ get_model_and_tokenizer as get_urg_en,
48
+ )
49
+ from sentiment_analysis.indic_bert_predict import (
50
+ predict as predict_urgency_indic,
51
+ get_model_and_tokenizer as get_urg_indic,
52
+ )
53
+
54
+ # =========================================================
55
+ # MULTIMODAL
56
+ # =========================================================
57
+ from multi_modal.audio_to_text import transcribe_audio
58
+ from multi_modal.image_to_text import extract_text_from_image
59
+
60
+ # =========================================================
61
+ # XPE MODULES
62
+ # =========================================================
63
+ from xpe.priority_engine import compute_priority_score
64
+ from xpe.integrated_gradients_explainer import IntegratedGradientsExplainer
65
+ from xpe.hybrid_explainer import generate_final_reason
66
+
67
+ # =========================================================
68
+ # GFAS — Grievance Fairness Audit System
69
+ # =========================================================
70
+ from gfas import audit as gfas_audit
71
+
72
+
73
+ # =========================================================
74
+ # COMPILED REGEX — MULTILINGUAL (EN + HI + TE)
75
+ # =========================================================
76
+ _RE_HINDI = re.compile(r'[\u0900-\u097F]')
77
+ _RE_TELUGU = re.compile(r'[\u0C00-\u0C7F]')
78
+
79
+ _SMALL_TALK_PATTERNS = re.compile(
80
+ r"""
81
+ ^(hi|hello|hey|dear|sir|madam)\b
82
+ | good\s+(morning|evening|afternoon|night)
83
+ | how\s+(are\s+you|is\s+it\s+going)
84
+ | what'?s\s+up
85
+ | hope\s+you\s+are\s+doing\s+well
86
+ | \b(thank(s|\s+you)|okay|ok|great|nice|good\s+job)\b
87
+ | \b(namaste|namaskar|dhanyavaad|shukriya|theek\s+hai|accha|acha|haan|helo)\b
88
+ | \b(namaskaram|dhanyavadalu|bayapadu|ela\s+unnaru|mee\s+seva)\b
89
+ """,
90
+ re.VERBOSE | re.IGNORECASE,
91
+ )
92
+
93
+ _GRIEVANCE_PATTERNS = re.compile(
94
+ r"""
95
+ \b(problem|issue|complain(t)?|grievance|concern
96
+ |inconvenience|harassment|injustice|negligence|misconduct)\b
97
+ | \b(not\s+working|stopped\s+working|not\s+responding|no\s+response
98
+ |no\s+action|fail(ed|ure)?|malfunction(ing)?|defective
99
+ |service\s+down|interrupted|disconnected|outage
100
+ |not\s+restored|not\s+repaired|not\s+fixed|not\s+resolved
101
+ |not\s+completed?|not\s+done|not\s+processed?
102
+ |not\s+functioning|non[-\s]functional)\b
103
+ | \b(delay(ed)?|pending|not\s+received|not\s+delivered
104
+ |still\s+waiting|no\s+update|no\s+resolution
105
+ |no\s+acknowledgm[e]?nt|not\s+credited|not\s+sanctioned
106
+ |not\s+approved|not\s+collected|not\s+cleared
107
+ |overdue|lapsed|under\s+process|under\s+review|awaiting)\b
108
+ | \b(refund|charg(ed|ing)|overcharged|overbilled
109
+ |extra\s+charge|double\s+charg(ed|e)
110
+ |charged\s+twice|billed\s+twice|debited\s+twice
111
+ |wrong\s+bill|wrong\s+amount|incorrect\s+(amount|bill)
112
+ |excess\s+(charge|amount|fee)
113
+ |payment\s+fail(ed|ure)?|transaction\s+fail(ed|ure)?
114
+ |unauthorized\s+transaction|debited|deducted
115
+ |not\s+refunded|duplicate\s+(charge|bill|payment)
116
+ |invoice)\b
117
+ | \bbill\b
118
+ | \b(pothole|waterlogging|no\s+water|water\s+supply
119
+ |power\s+(cut|outage|failure)|electricity\s+(cut|failure|issue)
120
+ |sewage|drainage|garbage|waste\s+collection
121
+ |road\s+(damage|broken|condition|repair)
122
+ |streetlight|footpath
123
+ |no\s+(electricity|water|gas|internet|signal|network)
124
+ |supply\s+(not|stopped|disrupted))\b
125
+ | \b(certificate|ration\s+card|pension|scholarship|subsidy
126
+ |license|passport
127
+ |application\s+(rejected|pending|delayed|not\s+processed)
128
+ |not\s+issued|not\s+granted|denied|rejected|withheld)\b
129
+ | \b(rude|misbehav(ed|iour)|bribe|corruption
130
+ |demanding\s+(money|bribe)|not\s+attending|irresponsible)\b
131
+ | \b(wrong|missing|damaged?|broken|poor\s+service|substandard
132
+ |bad\s+service|very\s+bad|worst\s+service|defect(ive)?)\b
133
+ | \b(unsatisfied|unhappy|disappointed|frustrated|harassed|ignored
134
+ |cheated|deceived|exploited|victimized)\b
135
+ | \b(cancel(l?(ed|ation))?|legal\s+action|escalate[d]?
136
+ |complaint\s+against|take\s+action|file\s+(a\s+)?complaint
137
+ |report\s+(this|the)|seeking\s+(help|redressal|justice)
138
+ |urgent(ly)?|immediately)\b
139
+ | \b(fraud|scam|error|mistake|violation|irregularity|malpractice)\b
140
+ | (समस्या|शिकायत|परेशानी|दिक्कत|नहीं\s*मिला|नहीं\s*आया
141
+ |वापसी|रिफंड|विलंब|देरी|धोखा|गलत|टूटा|खराब
142
+ |बंद\s*हो\s*गया|काम\s*नहीं|जवाब\s*नहीं|कार्रवाई\s*नहीं
143
+ |नाराज|परेशान|निराश|कानूनी\s*कार्रवाई|भुगतान\s*विफल
144
+ |दो\s*बार\s*काटा|दो\s*बार|काटा|बिजली|पानी|सड़क
145
+ |भ्रष्टाचार|रिश्वत|जमा\s*नहीं|जारी\s*नहीं
146
+ |अनधिकृत|अतिरिक्त\s*शुल्क|बिल)
147
+ | \b(samasya|shikayat|pareshani|dikkat|nahi\s+mila|vapasi
148
+ |vilamba|deri|dhokha|galat|tuta|kharab|kaam\s+nahi
149
+ |jawab\s+nahi|naraaz|nirash|kanuni|bhrashtachar
150
+ |do\s+baar|bijli|paani|sadak|jamaa\s+nahi)\b
151
+ | (సమస్య|ఫిర్యాదు|ఇబ్బంది|రాలేదు|పని\s*చేయడం\s*లేదు
152
+ |తిరిగి\s*చెల్లింపు|ఆలస్యం|మోసం|తప్పు|పాడైంది
153
+ |సేవ\s*లేదు|జవాబు\s*లేదు|చర్య\s*లేదు
154
+ |చెల్లింపు\s*విఫలమైంది|నిరాశ|వేధింపు|రద్దు
155
+ |బిల్లు|విద్యుత్|నీరు|రోడ్డు|రెండుసార్లు|వసూలు
156
+ |జమకట్టలేదు|లంచం|అవినీతి)
157
+ | \b(firyadu|ibbandi|raaledu|pani\s+cheyyatledu
158
+ |tirigichellinpu|aalasyam|mosam|tappu|paadaindi
159
+ |seva\s+ledu|nirasha|vedhimpu|raddu|rendu\s+sarlu
160
+ |vasulu|lantham|avineeti)\b
161
+ """,
162
+ re.VERBOSE | re.IGNORECASE,
163
+ )
164
+
165
+ _RE_JUNK = re.compile(r'^[\d\W_]+$')
166
+ MIN_TEXT_LENGTH = 8
167
+
168
+ UTC = timezone.utc
169
+
170
+
171
+ # =========================================================
172
+ # KAKINADA GEO HELPERS
173
+ # =========================================================
174
+
175
+ def _dms_to_decimal(dms, ref: str) -> float:
176
+ degrees = dms[0][0] / dms[0][1]
177
+ minutes = dms[1][0] / dms[1][1]
178
+ seconds = dms[2][0] / dms[2][1]
179
+ decimal = degrees + minutes / 60 + seconds / 3600
180
+ if ref in ("S", "W"):
181
+ decimal = -decimal
182
+ return decimal
183
+
184
+
185
def extract_gps_from_image(image_bytes: bytes) -> tuple | None:
    """
    Extract (latitude, longitude) in decimal degrees from an image's EXIF GPS tags.

    Returns None when the image carries no EXIF payload, no GPS IFD, or an
    incomplete set of GPS tags. Any parsing error is deliberately swallowed
    and reported as None — callers treat that as "no GPS available".
    """
    try:
        img = Image.open(io.BytesIO(image_bytes))
        exif_bytes = img.info.get("exif")
        if not exif_bytes:
            return None
        exif_data = piexif.load(exif_bytes)
        gps_data = exif_data.get("GPS", {})
        if not gps_data:
            return None
        # DMS rationals plus their hemisphere refs; all four must be present
        # to produce a usable coordinate.
        lat_dms = gps_data.get(piexif.GPSIFD.GPSLatitude)
        lat_ref = gps_data.get(piexif.GPSIFD.GPSLatitudeRef)
        lon_dms = gps_data.get(piexif.GPSIFD.GPSLongitude)
        lon_ref = gps_data.get(piexif.GPSIFD.GPSLongitudeRef)
        if not (lat_dms and lat_ref and lon_dms and lon_ref):
            return None
        # piexif returns refs as bytes (e.g. b"N") for real files — normalise
        # to str before passing to the DMS converter.
        lat = _dms_to_decimal(lat_dms, lat_ref.decode() if isinstance(lat_ref, bytes) else lat_ref)
        lon = _dms_to_decimal(lon_dms, lon_ref.decode() if isinstance(lon_ref, bytes) else lon_ref)
        return lat, lon
    except Exception:
        # Best-effort by design: corrupt images / EXIF count as "no GPS".
        return None
206
+
207
+
208
def is_kakinada(lat: float, lon: float) -> bool:
    """True when (lat, lon) falls inside the Kakinada bounding box.

    Accepts any value coercible to float; unparsable input yields False.
    """
    try:
        lat_f = float(lat)
        lon_f = float(lon)
    except (TypeError, ValueError):
        return False
    in_lat = 16.85 <= lat_f <= 17.10
    in_lon = 82.15 <= lon_f <= 82.35
    return in_lat and in_lon
215
+
216
+
217
def check_image_location(image_bytes: bytes) -> str:
    """Classify an image's GPS metadata: 'no_gps', 'valid' (Kakinada) or 'invalid'."""
    coords = extract_gps_from_image(image_bytes)
    if coords is None:
        return "no_gps"
    return "valid" if is_kakinada(*coords) else "invalid"
223
+
224
+
225
+ # =========================================================
226
+ # LANGUAGE DETECTION
227
+ # =========================================================
228
def detect_language(text: str) -> str:
    """Detect the script of *text*: Devanagari → 'hindi', Telugu → 'telugu', else 'english'."""
    for pattern, lang in ((_RE_HINDI, "hindi"), (_RE_TELUGU, "telugu")):
        if pattern.search(text):
            return lang
    return "english"
234
+
235
+
236
+ # =========================================================
237
+ # INPUT VALIDATION
238
+ # =========================================================
239
# User-facing explanations keyed by validation failure code (the codes
# returned by validate_text and the small-talk / grievance-signal checks).
_VALIDATION_MESSAGES = {
    "too_short": "Text is too short. Please provide at least 8 characters describing your issue.",
    "junk_input": "Input contains only numbers or special characters. Please describe your grievance in words.",
    "small_talk": "This looks like a greeting or small talk. Please describe the issue you are facing.",
    "no_grievance": (
        "No grievance signal detected. Please describe your problem clearly — "
        "e.g. 'My electricity bill was charged twice' or 'Water supply disrupted for 3 days'."
    ),
}
248
+
249
+
250
def validate_text(text) -> tuple:
    """
    Pre-flight check that *text* is usable grievance input.

    Args:
        text: Raw value from the request; may be any type.

    Returns:
        (True, None) when the text passes, otherwise (False, code) where
        *code* is a key of _VALIDATION_MESSAGES.
    """
    if not isinstance(text, str):
        return False, "too_short"
    stripped = text.strip()
    # Fix: the previous hard-coded `< 5` contradicted both MIN_TEXT_LENGTH (8)
    # and the "at least 8 characters" promise in _VALIDATION_MESSAGES["too_short"];
    # use the shared constant so message and behaviour agree.
    if len(stripped) < MIN_TEXT_LENGTH:
        return False, "too_short"
    # Reject input made up entirely of digits / punctuation / underscores.
    if _RE_JUNK.fullmatch(stripped.lower()):
        return False, "junk_input"
    return True, None
259
+
260
+
261
+ # =========================================================
262
+ # INITIALIZE APP
263
+ # =========================================================
264
+ app = Flask(__name__)
265
+
266
+ # ── Hugging Face Spaces: disable debug, allow large uploads ──────────────────
267
+ app.config["MAX_CONTENT_LENGTH"] = int(os.environ.get("MAX_UPLOAD_MB", "32")) * 1024 * 1024
268
+
269
+ # =========================================================
270
+ # LOAD MODELS (once at startup)
271
+ # =========================================================
272
+ logger.info("🔄 Loading models...")
273
+ cat_model_en, cat_tok_en = get_cat_en()
274
+ cat_model_indic, cat_tok_indic = get_cat_indic()
275
+ urg_model_en, urg_tok_en = get_urg_en()
276
+ urg_model_indic, urg_tok_indic = get_urg_indic()
277
+ logger.info("✅ Models loaded.")
278
+
279
+ # =========================================================
280
+ # INITIALIZE IG EXPLAINERS (once at startup)
281
+ # =========================================================
282
+ logger.info("🔄 Initializing Integrated Gradients explainers...")
283
+ category_explainer_en = IntegratedGradientsExplainer(cat_model_en, cat_tok_en)
284
+ category_explainer_indic = IntegratedGradientsExplainer(cat_model_indic, cat_tok_indic)
285
+ urgency_explainer_en = IntegratedGradientsExplainer(urg_model_en, urg_tok_en)
286
+ urgency_explainer_indic = IntegratedGradientsExplainer(urg_model_indic, urg_tok_indic)
287
+ logger.info("✅ Integrated Gradients ready.")
288
+
289
+ _RESOURCES = {
290
+ "english": {
291
+ "cat_fn": predict_category_en,
292
+ "urg_fn": predict_urgency_en,
293
+ "cat_exp": category_explainer_en,
294
+ "urg_exp": urgency_explainer_en,
295
+ }
296
+ }
297
+ _RESOURCES_INDIC = {
298
+ "cat_fn": predict_category_indic,
299
+ "urg_fn": predict_urgency_indic,
300
+ "cat_exp": category_explainer_indic,
301
+ "urg_exp": urgency_explainer_indic,
302
+ }
303
+
304
+
305
def _get_resources(language: str) -> dict:
    """Return the per-language model/explainer bundle, defaulting to the Indic set."""
    try:
        return _RESOURCES[language]
    except KeyError:
        return _RESOURCES_INDIC
307
+
308
+
309
+ # =========================================================
310
+ # HOTSPOT FORECAST CONSTANTS
311
+ # =========================================================
312
+ VALID_LABELS = [
313
+ "electricity", "garbage", "pollution",
314
+ "public transport", "roads",
315
+ "sanitation", "stray animals", "water",
316
+ ]
317
+
318
+ _PROPHET_MAX_WORKERS = int(os.environ.get("PROPHET_MAX_WORKERS", "4"))
319
+
320
+ RISK_LEVEL_THRESHOLDS = [
321
+ (75, "Critical"),
322
+ (50, "High"),
323
+ (25, "Medium"),
324
+ (0, "Low"),
325
+ ]
326
+
327
+
328
def _risk_to_level(score_0_100: float) -> str:
    """Map a 0–100 risk score onto its configured level name (thresholds are descending)."""
    matched = next(
        (label for threshold, label in RISK_LEVEL_THRESHOLDS if score_0_100 >= threshold),
        None,
    )
    return "Low" if matched is None else matched
333
+
334
+
335
def _fit_and_forecast(area: str, category: str, group_df, horizon: int) -> dict | None:
    """
    Fit a Prophet model on one (area, category) daily series and score its hotspot risk.

    Args:
        area: Area name, echoed into the result.
        category: Grievance category, echoed into the result.
        group_df: DataFrame with 'ds' (dates), 'y' (counts) and 'priorityScore'
            columns — presumably pre-aggregated per day by the caller; TODO confirm.
        horizon: Number of days to forecast ahead.

    Returns:
        Risk summary dict, or None when fewer than 2 distinct dates exist
        (not enough points to fit a trend).
    """
    if group_df["ds"].nunique() < 2:
        return None

    ts = group_df[["ds", "y"]].sort_values("ds")
    model = Prophet(weekly_seasonality=False, daily_seasonality=False)
    model.fit(ts)

    future = model.make_future_dataframe(periods=horizon)
    forecast = model.predict(future)

    # Growth: forecast-horizon mean vs the mean of the last 3 observed points.
    recent_avg = ts.tail(3)["y"].mean()
    forecast_avg = forecast.tail(horizon)["yhat"].mean()

    if recent_avg == 0:
        # Avoid division by zero on a flat/empty recent window.
        growth = 0.0
    else:
        raw_growth = ((forecast_avg - recent_avg) / recent_avg) * 100
        # Clamp to ±500% so one noisy series cannot dominate the risk score.
        growth = max(-500.0, min(500.0, raw_growth))

    # Weighted blend of growth, average priority and recent volume, squashed
    # through a sigmoid onto the 0–100 range.
    avg_priority = float(group_df["priorityScore"].mean())
    raw_risk = 0.5 * (growth / 100) + 0.3 * avg_priority + 0.2 * (recent_avg / 5)
    risk_score_100 = round(100 / (1 + math.exp(-raw_risk)), 2)

    # Confidence: narrow Prophet uncertainty bands relative to the forecast
    # magnitude → closer to 1.0 (the 1e-9 guards a zero-mean forecast).
    horizon_fc = forecast.tail(horizon)
    yhat_range = (horizon_fc["yhat_upper"] - horizon_fc["yhat_lower"]).mean()
    yhat_mean = horizon_fc["yhat"].abs().mean()
    confidence = round(1.0 - min(1.0, yhat_range / (yhat_mean + 1e-9)), 4)

    level = _risk_to_level(risk_score_100)

    return {
        "area": area,
        "category": category,
        "riskScore": risk_score_100,
        "level": level,
        "growthPercent": round(float(growth), 2),
        "forecastHorizonDays": horizon,
        "confidenceScore": confidence,
        # Underscore-prefixed diagnostics for debugging, not part of the API contract.
        "_recentAvg": round(float(recent_avg), 2),
        "_forecastAvg": round(float(forecast_avg), 2),
    }
377
+
378
+
379
+ # =========================================================
380
+ # HEALTH CHECK
381
+ # =========================================================
382
@app.route("/", methods=["GET"])
def health():
    """Root banner: service status, version and a short directory of endpoints."""
    endpoints = {
        "POST /predict": "Classify a single grievance — text / audio / image (multipart/form-data).",
        "POST /fairness-audit": "GFAS audit over N grievance records.",
        "POST /hotspot-forecast": "Prophet-based hotspot forecasting.",
    }
    payload = {
        "status": "ok",
        "version": os.environ.get("APP_VERSION", "1.0.0"),
        "message": "Multilingual Grievance API (EN / HI / TE) with IG + GFAS — running",
        "endpoints": endpoints,
    }
    return jsonify(payload)
394
+
395
+
396
@app.route("/health", methods=["GET"])
def health_check():
    """Dedicated health probe for HF Spaces liveness checks."""
    response = jsonify({"status": "ok"})
    return response, 200
400
+
401
+
402
+ # =========================================================
403
+ # POST /predict
404
+ # =========================================================
405
+ @app.route("/predict", methods=["POST"])
406
+ def predict_grievance():
407
+ try:
408
+ content_type = request.content_type or ""
409
+
410
+ if "application/json" in content_type:
411
+ data = request.get_json(silent=True) or {}
412
+ text_input = data.get("text", "").strip()
413
+ explain_flag = bool(data.get("explain", False))
414
+ has_text = bool(text_input)
415
+ has_audio = False
416
+ has_image = False
417
+ image_bytes = None
418
+ audio_file = None
419
+ else:
420
+ text_input = request.form.get("text", "").strip()
421
+ explain_raw = request.form.get("explain", "false").strip().lower()
422
+ explain_flag = explain_raw in ("true", "1", "yes")
423
+ has_text = bool(text_input)
424
+ has_audio = "audio" in request.files
425
+ has_image = "image" in request.files
426
+ image_bytes = request.files["image"].read() if has_image else None
427
+ audio_file = request.files["audio"] if has_audio else None
428
+
429
+ logger.info(
430
+ "[predict] content_type=%s has_text=%s has_audio=%s has_image=%s",
431
+ content_type[:40], has_text, has_audio, has_image,
432
+ )
433
+
434
+ if not has_text and not has_audio and not has_image:
435
+ return jsonify({
436
+ "status": "failed",
437
+ "code": "missing_input",
438
+ "message": "Please provide at least one of: 'text', 'audio', or 'image'.",
439
+ }), 400
440
+
441
+ # ── Mode A — IMAGE ONLY ────────────────────────────────────────────────
442
+ if has_image and not has_text and not has_audio:
443
+ location_status = check_image_location(image_bytes)
444
+ if location_status in ("invalid", "no_gps"):
445
+ return jsonify({
446
+ "status": "failed",
447
+ "code": "location_invalid",
448
+ "message": "Request rejected. Image location is outside Kakinada jurisdiction or contains no GPS metadata.",
449
+ "location": "invalid",
450
+ }), 403
451
+ grievance_text = extract_text_from_image(image_bytes)
452
+ input_mode = "image"
453
+ location_field = None
454
+
455
+ # ── Mode B — AUDIO ONLY ────────────────────────────────────────────────
456
+ elif has_audio and not has_text and not has_image:
457
+ grievance_text = transcribe_audio(audio_file)
458
+ input_mode = "audio"
459
+ location_field = None
460
+
461
+ # ── Mode C — TEXT ONLY ─────────────────────────────────────────────────
462
+ elif has_text and not has_image and not has_audio:
463
+ grievance_text = text_input
464
+ input_mode = "text"
465
+ location_field = None
466
+
467
+ # ── Mode D — TEXT + IMAGE (evidence) ──────────────────────────────────
468
+ elif has_text and has_image and not has_audio:
469
+ grievance_text = text_input
470
+ input_mode = "text+image"
471
+ loc_status = check_image_location(image_bytes)
472
+ location_field = "valid" if loc_status == "valid" else "invalid"
473
+
474
+ # ── Mode E — AUDIO + IMAGE (evidence) ─────────────────────────────────
475
+ elif has_audio and has_image and not has_text:
476
+ grievance_text = transcribe_audio(audio_file)
477
+ input_mode = "audio+image"
478
+ loc_status = check_image_location(image_bytes)
479
+ location_field = "valid" if loc_status == "valid" else "invalid"
480
+
481
+ else:
482
+ return jsonify({
483
+ "status": "failed",
484
+ "code": "missing_input",
485
+ "message": "Please provide at least one of: 'text', 'audio', or 'image'.",
486
+ }), 400
487
+
488
+ is_valid, error_code = validate_text(grievance_text)
489
+ if not is_valid:
490
+ return jsonify({
491
+ "status": "failed",
492
+ "code": error_code,
493
+ "message": _VALIDATION_MESSAGES[error_code],
494
+ }), 422
495
+
496
+ language = detect_language(grievance_text)
497
+ res = _get_resources(language)
498
+
499
+ category_result = res["cat_fn"](grievance_text)
500
+ category = category_result["category"]
501
+ category_conf = category_result["confidence"]
502
+ category_index = category_result.get("class_index", 0)
503
+
504
+ urgency_result = res["urg_fn"](grievance_text)
505
+ urgency = urgency_result["urgency"]
506
+ urgency_conf = urgency_result["confidence"]
507
+ urgency_index = urgency_result.get("class_index", 0)
508
+
509
+ priority_result = compute_priority_score(category, urgency, urgency_conf)
510
+ priority_score = priority_result["score"]
511
+ priority_band = priority_result["band"]
512
+
513
+ category_tokens: list = []
514
+ urgency_tokens: list = []
515
+ if explain_flag:
516
+ category_tokens = res["cat_exp"].explain(grievance_text, category_index)
517
+ urgency_tokens = res["urg_exp"].explain(grievance_text, urgency_index)
518
+
519
+ explanation = generate_final_reason(
520
+ grievance_text, category, urgency, priority_score,
521
+ category_tokens, urgency_tokens,
522
+ )
523
+
524
+ response_body = {
525
+ "status": "success",
526
+ "input_mode": input_mode,
527
+ "text": grievance_text,
528
+ "language": language,
529
+ "category": category,
530
+ "category_confidence": category_conf,
531
+ "urgency": urgency,
532
+ "urgency_confidence": urgency_conf,
533
+ "priority_score": priority_score,
534
+ "priority_band": priority_band,
535
+ "explanation": {
536
+ "category_tokens": category_tokens,
537
+ "urgency_tokens": urgency_tokens,
538
+ "category_decision": explanation["category_decision"],
539
+ "urgency_decision": explanation["urgency_decision"],
540
+ "priority_summary": explanation["priority_summary"],
541
+ "final_reason": explanation["final_reason"],
542
+ },
543
+ }
544
+
545
+ if location_field is not None:
546
+ response_body["location"] = location_field
547
+
548
+ return jsonify(response_body)
549
+
550
+ except Exception as e:
551
+ logger.exception("[predict] Unhandled exception")
552
+ return jsonify({
553
+ "status": "failed",
554
+ "code": "internal_error",
555
+ "message": str(e),
556
+ "trace": traceback.format_exc(),
557
+ }), 500
558
+
559
+
560
+ # =========================================================
561
+ # POST /fairness-audit
562
+ # =========================================================
563
+ @app.route("/fairness-audit", methods=["POST"])
564
+ def fairness_audit():
565
+ try:
566
+ data = request.get_json(silent=True)
567
+ if not data:
568
+ return jsonify({"status": "failed", "message": "Invalid JSON body."}), 400
569
+
570
+ result, error, status_code = gfas_audit(data.get("grievances"))
571
+
572
+ if error:
573
+ return jsonify(error), status_code
574
+ return jsonify(result), status_code
575
+
576
+ except Exception as e:
577
+ logger.exception("[fairness-audit] Unhandled exception")
578
+ return jsonify({
579
+ "status": "failed",
580
+ "error": str(e),
581
+ "trace": traceback.format_exc(),
582
+ }), 500
583
+
584
+
585
+ # =========================================================
586
+ # POST /hotspot-forecast
587
+ # =========================================================
588
+ @app.route("/hotspot-forecast", methods=["POST"])
589
+ def hotspot_forecast():
590
+ try:
591
+ data = request.get_json(force=True)
592
+ grievances = data.get("grievances", [])
593
+ horizon = int(data.get("horizon_days", 1))
594
+ top_n = int(data.get("top_n", 10))
595
+ source_window = int(data.get("source_window_days", 45))
596
+ generated_at = datetime.now(UTC).isoformat()
597
+
598
+ if not grievances:
599
+ return jsonify({"status": "failed", "message": "No grievances supplied"}), 422
600
+
601
+ df = pd.DataFrame(grievances)
602
+ if df.empty:
603
+ return jsonify({
604
+ "status": "success",
605
+ "generated_at": generated_at,
606
+ "top_hotspots": [],
607
+ })
608
+
609
+ df["area"] = df["area"].astype(str).str.lower().str.strip()
610
+ df["category"] = df["category"].astype(str).str.lower().str.strip()
611
+ df["ds"] = pd.to_datetime(df["createdAt"], errors="coerce", utc=True).dt.tz_convert(None)
612
+ df = df.dropna(subset=["ds"])
613
+ df["y"] = 1
614
+ df = df[df["category"].isin(VALID_LABELS)]
615
+
616
+ if df.empty:
617
+ return jsonify({
618
+ "status": "success",
619
+ "generated_at": generated_at,
620
+ "top_hotspots": [],
621
+ })
622
+
623
+ df = df.groupby(["area", "category", "ds"]).agg(
624
+ {"y": "sum", "priorityScore": "mean"}
625
+ ).reset_index()
626
+
627
+ groups = list(df.groupby(["area", "category"]))
628
+ hotspots = []
629
+ errors = []
630
+
631
+ with ThreadPoolExecutor(max_workers=_PROPHET_MAX_WORKERS) as executor:
632
+ futures = {
633
+ executor.submit(_fit_and_forecast, area, cat, gdf, horizon): (area, cat)
634
+ for (area, cat), gdf in groups
635
+ }
636
+ for future in as_completed(futures):
637
+ area, category = futures[future]
638
+ try:
639
+ result = future.result()
640
+ if result is None:
641
+ continue
642
+
643
+ result["flaskSnapshot"] = {
644
+ "recentAvg": result.pop("_recentAvg"),
645
+ "forecastAvg": result.pop("_forecastAvg"),
646
+ "sourceWindowDays": source_window,
647
+ "forecastHorizonDays": horizon,
648
+ "generatedAt": generated_at,
649
+ }
650
+ result["sourceWindowDays"] = source_window
651
+ hotspots.append(result)
652
+
653
+ except Exception as e:
654
+ errors.append({"area": area, "category": category, "error": str(e)})
655
+ logger.warning("[hotspot] Prophet failed for %s/%s: %s", area, category, e)
656
+
657
+ ranked = sorted(hotspots, key=lambda x: x["riskScore"], reverse=True)
658
+
659
+ return jsonify({
660
+ "status": "success",
661
+ "generated_at": generated_at,
662
+ "top_hotspots": ranked[:top_n],
663
+ "meta": {
664
+ "groups_evaluated": len(groups),
665
+ "forecasts_computed": len(hotspots),
666
+ "error_count": len(errors),
667
+ "errors": errors,
668
+ "source_window_days": source_window,
669
+ "horizon_days": horizon,
670
+ },
671
+ })
672
+
673
+ except Exception as e:
674
+ logger.exception("[hotspot-forecast] Unhandled exception")
675
+ return jsonify({"status": "failed", "message": str(e)}), 500
676
+
677
+
678
+ # =========================================================
679
+ # GLOBAL ERROR HANDLERS
680
+ # =========================================================
681
@app.errorhandler(413)
def request_entity_too_large(e):
    """Map oversized uploads onto a structured 413 JSON error."""
    limit_mb = app.config['MAX_CONTENT_LENGTH'] // (1024 * 1024)
    body = {
        "status": "failed",
        "code": "payload_too_large",
        "message": f"Upload exceeds the {limit_mb} MB limit.",
    }
    return jsonify(body), 413
688
+
689
+
690
@app.errorhandler(404)
def not_found(e):
    """Structured JSON body for unknown routes."""
    body = {"status": "failed", "code": "not_found", "message": "Endpoint not found."}
    return jsonify(body), 404
693
+
694
+
695
@app.errorhandler(405)
def method_not_allowed(e):
    """Structured JSON body for wrong-verb requests."""
    body = {"status": "failed", "code": "method_not_allowed", "message": "HTTP method not allowed."}
    return jsonify(body), 405
698
+
699
+
700
+ # =========================================================
701
+ # RUN SERVER — Hugging Face Spaces uses port 7860
702
+ # =========================================================
703
+ if __name__ == "__main__":
704
+ port = int(os.environ.get("PORT", 7860))
705
+ debug = os.environ.get("FLASK_DEBUG", "false").lower() == "true"
706
+ logger.info("🚀 Starting Multilingual Grievance API on port %d (debug=%s)", port, debug)
707
+ app.run(host="0.0.0.0", port=port, debug=debug, threaded=True)
multi_modal/audio_to_text.py ADDED
@@ -0,0 +1,463 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =========================================================
2
+ # multi_modal/audio_to_text.py
3
+ #
4
+ # Converts an uploaded audio file to text using Whisper.
5
+ #
6
+ # Supports: WAV, MP3, OGG, FLAC, M4A, WEBM (mobile browsers)
7
+ # Languages: Telugu / Hindi / English (forced, no random scripts)
8
+ #
9
+ # FIXES vs previous version:
10
+ # 1. Hallucination detection — Georgian/Chinese/Arabic output
11
+ # (ვვვვ... etc.) is detected and discarded, returns ""
12
+ # 2. Language forcing — tries TE → HI → EN in order instead
13
+ # of pure auto-detect which picks random scripts
14
+ # 3. Valid script check — only accepts Latin, Telugu,
15
+ # Devanagari output. Anything else = hallucination.
16
+ # 4. 500 error fix — empty/invalid transcription now safely
17
+ # returns "" instead of passing garbage to BERT classifier
18
+ # =========================================================
19
+
20
+ import os
21
+ import tempfile
22
+ import unicodedata
23
+ import torch
24
+ import numpy as np
25
+ from transformers import pipeline
26
+
27
+ # ── Environment ────────────────────────────────────────────────────────────────
28
# Transcription backend: "local" runs Whisper in-process (loaded below at
# import time); "hf_api" proxies to the HuggingFace Inference API.
_AUDIO_BACKEND = os.environ.get("AUDIO_BACKEND", "local")  # "local" | "hf_api"
_HF_TOKEN = os.environ.get("HF_TOKEN", "")

# ── Model selection ────────────────────────────────────────────────────────────
MODEL_ID = os.environ.get("WHISPER_MODEL", "openai/whisper-small")
_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ── Valid Unicode scripts for EN / HI / TE ────────────────────────────────────
# Whisper hallucinates Georgian (ვ), Chinese (的), Arabic (ال) on bad audio;
# only these script prefixes (first word of unicodedata.name) are accepted.
# NOTE(review): "COMMON" is intended to cover punctuation/digits, but
# unicodedata.name() never yields "COMMON" as a first word — verify against
# the validator's handling of non-alphabetic characters.
_VALID_SCRIPTS = {
    "LATIN",        # English
    "DEVANAGARI",   # Hindi
    "TELUGU",       # Telugu
    "COMMON",       # punctuation, digits, spaces (see note above)
}

# Forced-language attempt order for local transcription. EN first — the most
# common case and the fastest to validate. Only these 3 are permitted.
_LANGUAGE_ORDER = ["en", "te", "hi"]
_ALLOWED_LANGUAGES = {"en", "te", "hi"}

# Expected dominant script per forced language. If we force "te" but get
# Devanagari-heavy text back (or vice versa) the result is rejected — this
# prevents Telugu audio from being accepted as Hindi.
_LANG_EXPECTED_SCRIPT = {
    "en": {"LATIN"},
    "te": {"TELUGU"},
    "hi": {"DEVANAGARI"},
}


# ── Load Whisper ONCE at import time ──────────────────────────────────────────
# Loading here (instead of per request) keeps request latency down; the
# hf_api backend skips the download entirely.
if _AUDIO_BACKEND == "local":
    print(f"🔄 Loading Whisper '{MODEL_ID}' on {_DEVICE}…")
    _ASR_PIPELINE = pipeline(
        task = "automatic-speech-recognition",
        model = MODEL_ID,
        device = _DEVICE,
    )
    print(f"✅ Whisper '{MODEL_ID}' loaded.")
else:
    _ASR_PIPELINE = None
    print(f"ℹ️ Whisper skipped — using HF API backend.")
74
+
75
+
76
+ # ─────────────────────────────────────────────────────────────────────────────
77
+ # HALLUCINATION DETECTION
78
+ # ─────────────────────────────────────────────────────────────────────────────
79
+ def _is_valid_transcription(text: str) -> bool:
80
+ """
81
+ Returns True only if the transcription looks like real speech.
82
+
83
+ Checks:
84
+ 1. Script check -- must be mostly Latin / Devanagari / Telugu
85
+ 2. Repetition check -- rejects looping hallucinations like
86
+ "apne apne apne apne..." where a word repeats 5+ times
87
+ """
88
+ if not text or len(text.strip()) < 3:
89
+ return False
90
+
91
+ chars = [c for c in text if not c.isspace()]
92
+ if not chars:
93
+ return False
94
+
95
+ # Check 1: Script validation
96
+ valid_count = 0
97
+ for c in chars:
98
+ try:
99
+ char_name = unicodedata.name(c, "")
100
+ script = char_name.split()[0] if char_name else "UNKNOWN"
101
+ if script in _VALID_SCRIPTS:
102
+ valid_count += 1
103
+ except Exception:
104
+ pass
105
+
106
+ ratio = valid_count / len(chars)
107
+ if ratio < 0.60:
108
+ print(f"[audio_to_text] WARNING script hallucination "
109
+ f"(valid_ratio={ratio:.2f}) discarding: {text[:60]!r}")
110
+ return False
111
+
112
+ # Check 2: Repetition detection
113
+ # "apne apne apne apne apne apne..." = Whisper looping hallucination
114
+ words = text.strip().split()
115
+ if len(words) >= 6:
116
+ # Max consecutive repeated word
117
+ max_repeat = 1
118
+ cur_repeat = 1
119
+ for i in range(1, len(words)):
120
+ if words[i].lower() == words[i - 1].lower():
121
+ cur_repeat += 1
122
+ max_repeat = max(max_repeat, cur_repeat)
123
+ else:
124
+ cur_repeat = 1
125
+ if max_repeat >= 5:
126
+ print(f"[audio_to_text] WARNING repetition hallucination "
127
+ f"(word repeats {max_repeat}x) discarding: {text[:60]!r}")
128
+ return False
129
+
130
+ # Low vocabulary diversity = looping hallucination
131
+ # "I love you. I love you..." = 3 unique / 15 words = 0.20 unique ratio
132
+ # Real speech always has more variety — threshold: <0.15 for longer texts
133
+ unique_ratio = len(set(w.lower() for w in words)) / len(words)
134
+ if unique_ratio < 0.15 and len(words) > 15:
135
+ print(f"[audio_to_text] WARNING low-diversity hallucination "
136
+ f"(unique_ratio={unique_ratio:.2f}) discarding: {text[:60]!r}")
137
+ return False
138
+
139
+ # Check 3: Character-level repetition — catches "अग्वावावावाव..." patterns
140
+ # where substrings repeat at character level (not caught by word check)
141
+ if len(text) > 20:
142
+ # Take a 4-char ngram from position 10 and count how many times it appears
143
+ probe = text[8:12]
144
+ rep_count = text.count(probe)
145
+ if rep_count > len(text) // 8: # appears more than once per 8 chars = looping
146
+ print(f"[audio_to_text] WARNING char-level repetition "
147
+ f"(probe {probe!r} repeats {rep_count}x) discarding: {text[:60]!r}")
148
+ return False
149
+
150
+ return True
151
+
152
+
153
+ # ─────────────────────────────────────────────────────────────────────────────
154
+ # PUBLIC API
155
+ # ─────────────────────────────────────────────────────────────────────────────
156
def transcribe_audio(audio_file) -> str:
    """Transcribe an uploaded audio file to EN / HI / TE text.

    Parameters
    ----------
    audio_file : werkzeug.datastructures.FileStorage
        File from Flask request.files["audio"].
        Accepts WAV, MP3, OGG, FLAC, M4A, WEBM.

    Returns
    -------
    str
        Transcribed text, or "" on failure/hallucination — never raises.

    Dispatches to the HuggingFace Inference API when AUDIO_BACKEND=hf_api
    and a token is configured; otherwise runs the local Whisper pipeline.
    """
    use_remote = _AUDIO_BACKEND == "hf_api" and bool(_HF_TOKEN)
    if use_remote:
        return _transcribe_via_hf_api(audio_file)
    return _transcribe_local(audio_file)
175
+
176
+
177
+ # ─────────────────────────────────────────────────────────────────────────────
178
+ # LOCAL PATH
179
+ # ─────────────────────────────────────────────────────────────────────────────
180
def _transcribe_local(audio_file) -> str:
    """Local Whisper path: decode the upload, reject unusable audio, then
    try forced languages in _LANGUAGE_ORDER and return the first
    transcription that passes validation. Returns "" on any failure.
    """
    try:
        audio_bytes = audio_file.read()
        if not audio_bytes:
            print("[audio_to_text] ⚠️ Empty audio file.")
            return ""

        suffix = _get_suffix(audio_file)

        # Write to temp file — pydub needs a file path on disk
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            tmp.write(audio_bytes)
            tmp_path = tmp.name

        try:
            audio_array, sample_rate = _load_audio(tmp_path, suffix)
        finally:
            # Always remove the temp file, even if decoding failed.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

        if audio_array is None:
            print("[audio_to_text] ❌ Could not decode audio — is ffmpeg installed?")
            return ""

        # ── Audio quality diagnostics ──────────────────────────────────────────
        # _load_audio always returns 16 kHz mono, hence the fixed divisor.
        duration_sec = len(audio_array) / 16_000
        rms = float(np.sqrt(np.mean(audio_array ** 2)))
        peak = float(np.max(np.abs(audio_array)))
        print(f"[audio_to_text] 🔍 duration={duration_sec:.1f}s | rms={rms:.4f} | peak={peak:.4f}")

        # Reject silent or extremely quiet audio — Whisper hallucinates on silence
        if rms < 0.001:
            print("[audio_to_text] ❌ Audio is silent (rms<0.001) — nothing to transcribe")
            return ""
        if duration_sec < 0.5:
            print(f"[audio_to_text] ❌ Audio too short ({duration_sec:.2f}s) — minimum 0.5s")
            return ""

        # ── Try EN → TE → HI — never pure auto-detect ─────────────────────────
        # language=None causes Whisper to hallucinate Georgian/Chinese on bad audio.
        # Forcing each language and validating the output is far more reliable.
        #
        # IMPORTANT: the pipeline mutates the input dict internally on the first
        # call, so subsequent calls receive a broken dict. Fix: rebuild it fresh
        # for every language attempt using a copy of the original numpy array.
        audio_array_copy = audio_array.copy()

        for lang in _LANGUAGE_ORDER:
            try:
                # Fresh dict every iteration — never reuse across pipeline calls
                audio_input = {"raw": audio_array_copy.copy(), "sampling_rate": 16_000}
                result = _ASR_PIPELINE(
                    audio_input,
                    generate_kwargs={
                        "language": lang,
                        "task": "transcribe",
                        # temperature and compression_ratio_threshold cause a
                        # 'logprobs' bug in some transformers versions — removed.
                        # Hallucination is handled by our own validator instead.
                    },
                    return_timestamps=False,
                )
                text = result.get("text", "").strip()

                if not text:
                    print(f"[audio_to_text] ↩️ lang={lang} -> empty, trying next")
                    continue

                # Strict language whitelist — only EN / HI / TE accepted.
                # NOTE(review): detected_lang is computed but never used below —
                # the script-dominance check is what actually enforces this.
                detected_lang = result.get("chunks", [{}])[0].get("language", lang) if isinstance(result.get("chunks"), list) else lang

                if _is_valid_transcription(text):
                    # Extra check: does the output script match the forced language?
                    # Whisper-small often outputs Hindi (Devanagari) when forced to TE.
                    # Reject if dominant script does not match expected script for lang.
                    expected_scripts = _LANG_EXPECTED_SCRIPT.get(lang, None)
                    if expected_scripts and lang != "en":
                        chars = [c for c in text if not c.isspace()]
                        script_counts = {}
                        for c in chars:
                            try:
                                sc = unicodedata.name(c, "").split()[0]
                                script_counts[sc] = script_counts.get(sc, 0) + 1
                            except Exception:
                                pass
                        dominant = max(script_counts, key=script_counts.get) if script_counts else "UNKNOWN"
                        if dominant not in expected_scripts and dominant not in ("COMMON", "LATIN"):
                            print(f"[audio_to_text] script mismatch: forced {lang} but got {dominant} — trying next")
                            continue

                    print(f"[audio_to_text] OK lang={lang} | "
                          f"{len(text)} chars: {text[:100]}")
                    return text
                else:
                    print(f"[audio_to_text] lang={lang} hallucinated — trying next")
                    continue

            except Exception as e:
                # One language failing must not abort the others.
                print(f"[audio_to_text] ❌ lang={lang} error: {e}")
                continue

        print("[audio_to_text] ❌ All language attempts failed — returning empty")
        return ""

    except Exception as e:
        # Contract: never raise into the request handler.
        print(f"[audio_to_text] ❌ Transcription failed: {e}")
        return ""
292
+
293
+
294
def _load_audio(file_path: str, suffix: str):
    """
    Load audio file as float32 numpy array at 16 kHz mono.

    Strategy:
      1. pydub — handles MP3, OGG, WEBM, M4A, WAV, FLAC (needs ffmpeg)
      2. soundfile fallback — WAV and FLAC only (no ffmpeg needed)

    Returns (audio_array, 16000) or (None, None) on failure.
    """
    # ── pydub (primary) ────────────────────────────────────────────────────────
    try:
        from pydub import AudioSegment

        # pydub needs ffmpeg's container name, not the file extension —
        # notably ".m4a" files are MP4 containers.
        fmt = suffix.lstrip(".").lower()
        fmt_map = {"m4a": "mp4", "webm": "webm", "ogg": "ogg"}
        fmt = fmt_map.get(fmt, fmt)

        audio_seg = AudioSegment.from_file(file_path, format=fmt)
        audio_seg = audio_seg.set_channels(1).set_frame_rate(16_000)
        samples = np.array(audio_seg.get_array_of_samples(), dtype=np.float32)

        # Normalize based on actual sample width — pydub can return int16 OR int32
        # depending on source format. Always normalize to float32 [-1.0, 1.0]
        sample_width = audio_seg.sample_width  # bytes per sample: 1=8bit, 2=16bit, 4=32bit
        max_val = float(2 ** (8 * sample_width - 1))
        samples = samples / max_val
        # Safety clamp — should already be in range but guard against edge cases
        samples = np.clip(samples, -1.0, 1.0)

        print(f"[audio_to_text] pydub decoded: sample_width={sample_width}B "
              f"max_val={max_val:.0f} post_rms={float(np.sqrt(np.mean(samples**2))):.4f}")

        return samples, 16_000

    except ImportError:
        print("[audio_to_text] pydub not installed — falling back to soundfile")
        print("                pip install pydub + install ffmpeg")
    except Exception as e:
        print(f"[audio_to_text] pydub failed ({e}) — trying soundfile")

    # ── soundfile (fallback — WAV/FLAC only) ───────────────────────────────────
    try:
        import soundfile as sf
        audio_array, sample_rate = sf.read(file_path, dtype="float32")

        # Collapse multi-channel audio to mono by averaging channels.
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=1)

        # Whisper expects 16 kHz input.
        if sample_rate != 16_000:
            audio_array = _resample(audio_array, sample_rate, 16_000)

        return audio_array, 16_000

    except Exception as e:
        print(f"[audio_to_text] soundfile failed: {e}")
        return None, None
351
+
352
+
353
+ # ─────────────────────────────────────────────────────────────────────────────
354
+ # HF API PATH — production / HF Spaces
355
+ # ─────────────────────────────────────────────────────────────────────────────
356
def _transcribe_via_hf_api(audio_file) -> str:
    """
    Production path — HuggingFace Inference API (whisper-large-v3 on HF GPU).
    Set AUDIO_BACKEND=hf_api and HF_TOKEN=hf_xxx in HF Space Secrets.

    Why large-v3 via API instead of loading locally:
      - large-v3 = 3GB — too large to load on free HF Spaces
      - HF API runs it on GPU — faster than local CPU anyway (~15-30s vs 3min)
      - Free tier: 1000 requests/day — enough for a civic portal

    large-v3 auto-detect is accurate enough for EN/TE/HI — no need for
    the 3-attempt language loop used in local path.

    Returns "" on any failure (timeout, HTTP error, hallucination) — never raises.
    """
    import requests

    try:
        audio_bytes = audio_file.read()
        if not audio_bytes:
            return ""

        print(f"[audio_to_text] HF API: sending {len(audio_bytes)} bytes to whisper-large-v3...")

        # First attempt: auto-detect language (large-v3 is accurate enough)
        res = requests.post(
            "https://api-inference.huggingface.co/models/openai/whisper-large-v3",
            headers = {"Authorization": f"Bearer {_HF_TOKEN}"},
            data = audio_bytes,
            timeout = 120,  # HF free tier can queue up to 60s before processing
        )

        # Handle model loading (HF cold start) — 503 means "warming up";
        # wait once and retry a single time.
        if res.status_code == 503:
            import time
            print("[audio_to_text] HF API: model loading — waiting 20s...")
            time.sleep(20)
            res = requests.post(
                "https://api-inference.huggingface.co/models/openai/whisper-large-v3",
                headers = {"Authorization": f"Bearer {_HF_TOKEN}"},
                data = audio_bytes,
                timeout = 120,
            )

        if res.ok:
            data = res.json()
            # HF API returns {"text": "..."} or [{"generated_text": "..."}]
            if isinstance(data, dict):
                text = data.get("text", "").strip()
            elif isinstance(data, list) and data:
                text = data[0].get("generated_text", "").strip()
            else:
                text = ""

            # Same hallucination filter as the local path.
            if _is_valid_transcription(text):
                print(f"[audio_to_text] HF API OK: {len(text)} chars: {text[:100]}")
                return text
            else:
                print(f"[audio_to_text] HF API hallucination discarded: {text[:60]!r}")
                return ""
        else:
            print(f"[audio_to_text] HF API error {res.status_code}: {res.text[:300]}")
            return ""

    except requests.exceptions.Timeout:
        print("[audio_to_text] HF API timeout — model may be overloaded")
        return ""
    except Exception as e:
        print(f"[audio_to_text] HF API exception: {e}")
        return ""
424
+
425
+
426
+ # ─────────────────────────────────────────────────────────────────────────────
427
+ # HELPERS
428
+ # ─────────────────────────────────────────────────────────────────────────────
429
+ def _get_suffix(audio_file) -> str:
430
+ """Determine file extension from FileStorage. Defaults to .webm."""
431
+ filename = getattr(audio_file, "filename", "") or ""
432
+ mime = getattr(audio_file, "mimetype", "") or ""
433
+
434
+ _MIME_TO_EXT = {
435
+ "audio/wav": ".wav", "audio/x-wav": ".wav", "audio/wave": ".wav",
436
+ "audio/mpeg": ".mp3", "audio/mp3": ".mp3",
437
+ "audio/ogg": ".ogg",
438
+ "audio/flac": ".flac", "audio/x-flac": ".flac",
439
+ "audio/mp4": ".m4a", "audio/x-m4a": ".m4a",
440
+ "audio/webm": ".webm", "video/webm": ".webm",
441
+ }
442
+
443
+ if "." in filename:
444
+ return "." + filename.rsplit(".", 1)[-1].lower()
445
+
446
+ # Default to .webm — Chrome/Edge MediaRecorder always sends webm
447
+ return _MIME_TO_EXT.get(mime.lower(), ".webm")
448
+
449
+
450
+ def _resample(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
451
+ """Resample audio array from orig_sr to target_sr."""
452
+ try:
453
+ from scipy.signal import resample_poly
454
+ from math import gcd
455
+ g = gcd(orig_sr, target_sr)
456
+ return resample_poly(audio, target_sr // g, orig_sr // g).astype(np.float32)
457
+ except ImportError:
458
+ target_length = int(len(audio) * target_sr / orig_sr)
459
+ return np.interp(
460
+ np.linspace(0, len(audio) - 1, target_length),
461
+ np.arange(len(audio)),
462
+ audio,
463
+ ).astype(np.float32)
multi_modal/image_to_text.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =========================================================
2
+ # multi_modal/image_to_text.py
3
+ #
4
+ # Converts an uploaded image into grievance text for BERT.
5
+ #
6
+ # Labels: Electricity | Garbage | Pollution | Public Transport |
7
+ # Roads | Sanitation | Stray Animals | Water
8
+ #
9
+ # OUTPUT RULE:
10
+ # Only raw observed text from OCR + BLIP caption.
11
+ # No predefined phrases, no templates, no appended context.
12
+ # BERT classifier must receive unbiased descriptive text.
13
+ #
14
+ # PIPELINE:
15
+ # Step 1 — Preprocess (sharpen, contrast, resize)
16
+ # Step 2 — EasyOCR (visible text in EN/HI/TE)
17
+ # Step 3 — BLIP-base, 5 civic prompts, best-of-5 by keyword score
18
+ # Step 4 — Clean fusion: OCR + caption, no added words
19
+ #
20
+ # FOR RENDER:
21
+ # Set IMAGE_BACKEND=hf_api + HF_TOKEN=hf_xxx in .env
22
+ # =========================================================
23
+
24
+ import io
25
+ import os
26
+ import re
27
+ import torch
28
+ import numpy as np
29
+ from PIL import Image, ImageFilter, ImageEnhance
30
+
31
+ # ── Environment ────────────────────────────────────────────────────────────────
32
+ _BACKEND = os.environ.get("IMAGE_BACKEND", "local")
33
+ _HF_TOKEN = os.environ.get("HF_TOKEN", "")
34
+ _DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
35
+
36
+ # ── Lazy model handles ─────────────────────────────────────────────────────────
37
+ _ocr_reader = None
38
+ _blip_processor = None
39
+ _blip_model = None
40
+
41
+ # ── Five civic prompts for BLIP ────────────────────────────────────────────────
42
+ # Steers BLIP toward civic observation language.
43
+ # Best-scoring caption across all 5 is selected.
44
+ _CIVIC_PROMPTS = [
45
+ "a civic grievance showing",
46
+ "a public infrastructure problem showing",
47
+ "a sanitation or garbage problem showing",
48
+ "a water or drainage problem showing",
49
+ "a road or footpath damage showing",
50
+ ]
51
+
52
+ # ── Civic keyword set — for scoring captions only, never appended to output ───
53
+ _CIVIC_KEYWORDS = {
54
+ "garbage", "waste", "trash", "litter", "dumped", "overflowing", "filth",
55
+ "sewage", "drain", "clog", "smell", "foul", "unhygienic", "sanitation",
56
+ "pothole", "road", "damaged", "broken", "crack", "footpath", "pavement",
57
+ "accident", "vehicle", "commuter", "traffic",
58
+ "water", "flood", "waterlog", "overflow", "leak", "pipe", "supply",
59
+ "stagnant", "puddle", "inundated",
60
+ "electricity", "wire", "pole", "streetlight", "cable", "spark", "fallen",
61
+ "pollution", "smoke", "dust", "emission", "contamination",
62
+ "animal", "stray", "dog", "cattle", "menace",
63
+ "transport", "bus", "auto", "road", "signal",
64
+ "hazard", "risk", "danger", "health", "resident", "street", "public",
65
+ "municipal", "colony", "area", "locality", "civic", "problem",
66
+ "issue", "blocked", "accumulated", "piled", "scattered",
67
+ }
68
+
69
+
70
+ # ─────────────────────────────────────────────────────────────────────────────
71
+ # PUBLIC API
72
+ # ─────────────────────────────────────────────────────────────────────────────
73
def extract_text_from_image(image_bytes: bytes) -> str:
    """
    Turn raw image bytes into descriptive grievance text.

    The output is purely what the image contains — no predefined phrases —
    so the downstream BERT classifier receives unbiased input.

    Parameters
    ----------
    image_bytes : bytes
        Raw bytes from Flask request.files["image"].read()

    Returns
    -------
    str
        Observed description of the image, e.g.
        "garbage dumped on the road near residential area".
        "" on empty input or total failure (never raises).
    """
    if not image_bytes:
        return ""

    use_remote = _BACKEND == "hf_api" and bool(_HF_TOKEN)
    if use_remote:
        return _extract_via_hf_api(image_bytes)
    return _extract_local(image_bytes)
102
+
103
+
104
+ # ─────────────────────────────────────────────────────────────────────────────
105
+ # STEP 1 — PREPROCESSING
106
+ # ─────────────────────────────────────────────────────────────────────────────
107
def _preprocess_image(image_bytes: bytes) -> Image.Image:
    """
    Decode and enhance an image for best BLIP + OCR accuracy:
    RGB conversion, long edge capped at 1024px, unsharp mask to
    recover blurry phone shots, and a 20% contrast boost.
    """
    img = Image.open(io.BytesIO(image_bytes)).convert("RGB")

    width, height = img.size
    longest = max(width, height)
    if longest > 1024:
        ratio = 1024 / longest
        img = img.resize((int(width * ratio), int(height * ratio)), Image.LANCZOS)

    sharpened = img.filter(ImageFilter.UnsharpMask(radius=1.5, percent=120, threshold=3))
    return ImageEnhance.Contrast(sharpened).enhance(1.2)
125
+
126
+
127
+ # ─────────────────────────────────────────────────────────────────────────────
128
+ # STEP 2 — OCR
129
+ # ─────────────────────────────────────────────────────────────────────────────
130
def _load_ocr():
    """Lazily create the shared EasyOCR reader (EN + HI + TE).

    Returns the reader instance, or None when easyocr is missing or its
    initialisation failed. The module-level ``_ocr_reader`` caches either
    the reader or the sentinel string "unavailable", so a failed load is
    never retried on every request.
    """
    global _ocr_reader
    if _ocr_reader is not None:
        # Already attempted — map the failure sentinel back to None.
        return None if _ocr_reader == "unavailable" else _ocr_reader
    try:
        import easyocr
        print("🔄 Loading EasyOCR (EN + HI + TE)…")
        _ocr_reader = easyocr.Reader(
            ["en", "hi", "te"],
            gpu=torch.cuda.is_available(),
            verbose=False,
        )
        print("✅ EasyOCR loaded.")
    except ImportError:
        print("⚠️ easyocr not installed — run: pip install easyocr")
        _ocr_reader = "unavailable"
    except Exception as e:
        # e.g. model download failure — degrade gracefully, OCR step is skipped.
        print(f"⚠️ EasyOCR load error: {e}")
        _ocr_reader = "unavailable"
    return None if _ocr_reader == "unavailable" else _ocr_reader
150
+
151
+
152
def _run_ocr(img: Image.Image) -> str:
    """OCR the image; return '' when OCR is unavailable, fails, or the
    recognised text is too short (< 6 chars) to be meaningful."""
    try:
        ocr = _load_ocr()
        if ocr is None:
            return ""
        fragments = ocr.readtext(np.array(img), detail=0, paragraph=True)
        joined = " ".join(fragments).strip()
        if len(joined) < 6:
            return ""
        return joined
    except Exception as e:
        print(f"[image_to_text] OCR error: {e}")
        return ""
165
+
166
+
167
+ # ─────────────────────────────────────────────────────────────────────────────
168
+ # STEP 3 — BLIP MULTI-PROMPT CAPTIONING
169
+ # ─────────────────────────────────────────────────────────────────────────────
170
def _load_blip():
    """Lazily load the BLIP-base processor + model onto ``_DEVICE``.

    Returns ``(processor, model)``, or ``(None, None)`` when loading
    failed. The module-level handles cache either the objects or the
    sentinel string "unavailable" so a failed download is not retried
    on every request.
    """
    global _blip_processor, _blip_model
    if _blip_model is not None:
        # Already attempted — translate the failure sentinel to (None, None).
        return (None, None) if _blip_model == "unavailable" else (_blip_processor, _blip_model)
    try:
        from transformers import BlipProcessor, BlipForConditionalGeneration
        print("🔄 Loading BLIP-base captioning model (~450 MB)…")
        _blip_processor = BlipProcessor.from_pretrained(
            "Salesforce/blip-image-captioning-base"
        )
        _blip_model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-base"
        ).to(_DEVICE)
        _blip_model.eval()  # inference only — disables dropout
        print("✅ BLIP-base loaded.")
    except Exception as e:
        print(f"⚠️ BLIP load error: {e}")
        _blip_model = "unavailable"
    return (None, None) if _blip_model == "unavailable" else (_blip_processor, _blip_model)
189
+
190
+
191
def _score_caption(caption: str) -> int:
    """Count distinct civic keywords in a caption.

    Used only to rank candidate captions — keywords are never appended
    to the output.
    """
    tokens = re.findall(r'\b\w+\b', caption.lower())
    return sum(1 for tok in set(tokens) if tok in _CIVIC_KEYWORDS)
195
+
196
+
197
def _run_blip_multi_prompt(img: Image.Image) -> str:
    """
    Run BLIP with 5 civic prompts.
    Returns the caption with the highest civic keyword score.
    Raw caption only — no extra words added.

    Falls back to unconditional captioning when every prompted attempt
    fails or only echoes its prompt. Returns "" if BLIP is unavailable.
    """
    processor, model = _load_blip()
    if model is None:
        return ""

    best_caption = ""
    best_score = -1  # -1 so even a zero-keyword caption can win

    for prompt in _CIVIC_PROMPTS:
        try:
            inputs = processor(
                img,
                text = prompt,
                return_tensors = "pt",
            ).to(_DEVICE)

            with torch.no_grad():
                output = model.generate(
                    **inputs,
                    max_new_tokens = 60,
                    num_beams = 5,
                    early_stopping = True,
                    no_repeat_ngram_size = 3,
                )

            caption = processor.decode(output[0], skip_special_tokens=True).strip()

            # Skip if model just echoed the prompt with no new content
            if len(caption) <= len(prompt) + 5:
                continue

            score = _score_caption(caption)
            if score > best_score:
                best_score = score
                best_caption = caption

        except Exception as e:
            # One bad prompt must not kill the remaining attempts.
            print(f"[image_to_text] Prompt failed: {e}")
            continue

    # Unconditional fallback — plain caption with no prompt steering.
    if not best_caption:
        try:
            inputs = processor(img, return_tensors="pt").to(_DEVICE)
            with torch.no_grad():
                output = model.generate(
                    **inputs,
                    max_new_tokens = 60,
                    num_beams = 5,
                    no_repeat_ngram_size = 3,
                )
            best_caption = processor.decode(output[0], skip_special_tokens=True).strip()
        except Exception as e:
            print(f"[image_to_text] Unconditional fallback failed: {e}")

    return best_caption
258
+
259
+
260
+ # ─────────────────────────────────────────────────────────────────────────────
261
+ # STEP 4 — CLEAN FUSION (no predefined phrases added)
262
+ # ─────────────────────────────────────────────────────────────────────────────
263
+ def _is_redundant(text_a: str, text_b: str) -> bool:
264
+ """True if text_a is >60% word-overlap with text_b (already covered)."""
265
+ words_a = set(text_a.lower().split())
266
+ words_b = set(text_b.lower().split())
267
+ if not words_a:
268
+ return True
269
+ return len(words_a & words_b) / len(words_a) > 0.6
270
+
271
+
272
def _fuse(ocr_text: str, caption: str) -> str:
    """
    Merge OCR text and BLIP caption into one clean string without
    injecting any template words.

    Long OCR (> 20 chars) wins — it is the literal complaint text — and
    the caption is appended only when it contributes new words.
    Otherwise the caption leads, with short OCR appended when it is not
    redundant. Returns "" when both inputs are empty.
    """
    ocr = ocr_text.strip()
    cap = caption.strip()

    if len(ocr) > 20:
        # OCR carries the real complaint text — it is the primary output.
        if cap and not _is_redundant(cap, ocr):
            return f"{ocr}. {cap}"
        return ocr

    if cap:
        # Pure photo — the BLIP caption is emitted as-is.
        if ocr and not _is_redundant(ocr, cap):
            return f"{cap}. {ocr}"
        return cap

    return ocr or cap
298
+
299
+
300
+ # ─────────────────────────────────────────────────────────────────────────────
301
+ # LOCAL PIPELINE
302
+ # ─────────────────────────────────────────────────────────────────────────────
303
def _extract_local(image_bytes: bytes) -> str:
    """Full local pipeline: preprocess → OCR → BLIP caption → fuse.

    Never raises — any failure is logged and "" is returned.
    """
    try:
        picture = _preprocess_image(image_bytes)
        ocr_text = _run_ocr(picture)
        caption = _run_blip_multi_prompt(picture)

        if ocr_text:
            print(f"[image_to_text] OCR: {ocr_text[:100]}")
        if caption:
            print(f"[image_to_text] Caption (score={_score_caption(caption)}): {caption[:100]}")

        fused = _fuse(ocr_text, caption)
        print(f"[image_to_text] ✅ Output: {fused[:160]}")
        return fused
    except Exception as e:
        print(f"[image_to_text] ❌ Pipeline failed: {e}")
        return ""
321
+
322
+
323
+ # ─────────────────────────────────────────────────────────────────────────────
324
+ # HF API PATH — Render / production
325
+ # ─────────────────────────────────────────────────────────────────────────────
326
def _extract_via_hf_api(image_bytes: bytes) -> str:
    """
    Production path — HuggingFace Inference API (BLIP-large captioning).

    Raw caption returned as-is; no predefined text added.
    Returns "" on any failure (never raises).
    """
    import requests
    try:
        res = requests.post(
            "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large",
            headers = {"Authorization": f"Bearer {_HF_TOKEN}"},
            data = image_bytes,
            timeout = 30,
        )
        if res.ok:
            data = res.json()
            caption = ""
            # Guard against an empty list so a bare response cannot raise.
            if isinstance(data, list) and data:
                caption = data[0].get("generated_text", "").strip()
            print(f"[image_to_text] ✅ HF API output: {caption[:160]}")
            return caption
        # Previously non-2xx responses fell through silently; log them
        # (same style as the audio module's HF API path).
        print(f"[image_to_text] HF API error {res.status_code}: {res.text[:300]}")
    except Exception as e:
        print(f"[image_to_text] HF API error: {e}")
    return ""
requirements.txt ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ── Core ML / DL ──────────────────────────────────────────────
2
+ torch
3
+ transformers==5.2.0
4
+ tokenizers==0.22.2
5
+ accelerate>=1.1.0
6
+ safetensors>=0.4.3
7
+ huggingface-hub>=1.3.0
8
+
9
+ # ── Audio ──────────────────────────────────────────────────────
10
+ pydub
11
+ soundfile
12
+ scipy
13
+
14
+ # ── Image ──────────────────────────────────────────────────────
15
+ Pillow
16
+ easyocr
17
+ opencv-python-headless
18
+
19
+ # ── NLP / Text ─────────────────────────────────────────────────
20
+ sentencepiece
21
+ tiktoken
22
+ protobuf>=5.28.0
23
+ regex
24
+ nltk
25
+ indic-nlp-library
26
+ stopwordsiso
27
+
28
+ # ── Explainability ─────────────────────────────────────────────
29
+ captum
30
+ shap>=0.44
31
+
32
+ # ── Forecasting ────────────────────────────────────────────────
33
+ prophet
34
+
35
+ # ── Data / ML ──────────────────────────────────────────────────
36
+ pandas
37
+ numpy
38
+ scikit-learn
39
+ matplotlib
40
+ seaborn
41
+
42
+ # ── Backend ────────────────────────────────────────────────────
43
+ # Flask only — MongoDB + Cloudinary are handled by Express/Node
44
+ flask
45
+ flask-cors
46
+ gunicorn
47
+ werkzeug
48
+ python-dotenv
49
+ requests==2.32.3
50
+ python-dotenv==1.0.1
sentiment_analysis/artifacts/indic_urgency_model/config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "AlbertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0,
6
+ "bos_token_id": 2,
7
+ "classifier_dropout_prob": 0.1,
8
+ "down_scale_factor": 1,
9
+ "dtype": "float32",
10
+ "embedding_size": 128,
11
+ "eos_token_id": 3,
12
+ "gap_size": 0,
13
+ "hidden_act": "gelu",
14
+ "hidden_dropout_prob": 0,
15
+ "hidden_size": 768,
16
+ "id2label": {
17
+ "0": "LABEL_0",
18
+ "1": "LABEL_1",
19
+ "2": "LABEL_2",
20
+ "3": "LABEL_3"
21
+ },
22
+ "initializer_range": 0.02,
23
+ "inner_group_num": 1,
24
+ "intermediate_size": 3072,
25
+ "label2id": {
26
+ "LABEL_0": 0,
27
+ "LABEL_1": 1,
28
+ "LABEL_2": 2,
29
+ "LABEL_3": 3
30
+ },
31
+ "layer_norm_eps": 1e-12,
32
+ "max_position_embeddings": 512,
33
+ "model_type": "albert",
34
+ "net_structure_type": 0,
35
+ "num_attention_heads": 12,
36
+ "num_hidden_groups": 1,
37
+ "num_hidden_layers": 12,
38
+ "num_memory_blocks": 0,
39
+ "pad_token_id": 0,
40
+ "problem_type": "single_label_classification",
41
+ "tie_word_embeddings": true,
42
+ "transformers_version": "5.1.0",
43
+ "type_vocab_size": 2,
44
+ "use_cache": false,
45
+ "vocab_size": 200000
46
+ }
sentiment_analysis/artifacts/indic_urgency_model/label_encoder.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:693a9eaa2ea5336e580da8eb27a85d30bb0e2100184c6a491f5d60f5df14abf7
3
+ size 275
sentiment_analysis/artifacts/indic_urgency_model/label_map.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d80d3aacf4a3ee5cabfa43c871d944fce68e1e4602d2154417d1c4ca3899edf7
3
+ size 194
sentiment_analysis/artifacts/indic_urgency_model/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5eb8a6dd044987f78c003f910440efb162c76d3b780ff2c0026c19158fac2df
3
+ size 14969267
sentiment_analysis/artifacts/indic_urgency_model/tokenizer_config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": true,
3
+ "backend": "tokenizers",
4
+ "bos_token": "[CLS]",
5
+ "cls_token": "[CLS]",
6
+ "do_lower_case": true,
7
+ "eos_token": "[SEP]",
8
+ "extra_special_tokens": [
9
+ "<pad>",
10
+ "[CLS]",
11
+ "[SEP]",
12
+ "[MASK]"
13
+ ],
14
+ "is_local": false,
15
+ "keep_accents": false,
16
+ "mask_token": "[MASK]",
17
+ "model_max_length": 1000000000000000019884624838656,
18
+ "pad_token": "<pad>",
19
+ "sep_token": "[SEP]",
20
+ "tokenizer_class": "AlbertTokenizer",
21
+ "trim_offsets": true,
22
+ "unk_id": 1,
23
+ "unk_token": "<unk>"
24
+ }
sentiment_analysis/artifacts/urgency_bert_model/config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": null,
8
+ "classifier_dropout": null,
9
+ "dtype": "float32",
10
+ "eos_token_id": null,
11
+ "gradient_checkpointing": false,
12
+ "hidden_act": "gelu",
13
+ "hidden_dropout_prob": 0.1,
14
+ "hidden_size": 768,
15
+ "id2label": {
16
+ "0": "LABEL_0",
17
+ "1": "LABEL_1",
18
+ "2": "LABEL_2",
19
+ "3": "LABEL_3"
20
+ },
21
+ "initializer_range": 0.02,
22
+ "intermediate_size": 3072,
23
+ "is_decoder": false,
24
+ "label2id": {
25
+ "LABEL_0": 0,
26
+ "LABEL_1": 1,
27
+ "LABEL_2": 2,
28
+ "LABEL_3": 3
29
+ },
30
+ "layer_norm_eps": 1e-12,
31
+ "max_position_embeddings": 512,
32
+ "model_type": "bert",
33
+ "num_attention_heads": 12,
34
+ "num_hidden_layers": 12,
35
+ "pad_token_id": 0,
36
+ "position_embedding_type": "absolute",
37
+ "problem_type": "single_label_classification",
38
+ "tie_word_embeddings": true,
39
+ "transformers_version": "5.1.0",
40
+ "type_vocab_size": 2,
41
+ "use_cache": false,
42
+ "vocab_size": 30522
43
+ }
sentiment_analysis/artifacts/urgency_bert_model/label_encoder.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:693a9eaa2ea5336e580da8eb27a85d30bb0e2100184c6a491f5d60f5df14abf7
3
+ size 275
sentiment_analysis/artifacts/urgency_bert_model/label_map.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d80d3aacf4a3ee5cabfa43c871d944fce68e1e4602d2154417d1c4ca3899edf7
3
+ size 194
sentiment_analysis/artifacts/urgency_bert_model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
sentiment_analysis/artifacts/urgency_bert_model/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": true,
5
+ "is_local": false,
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 512,
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "strip_accents": null,
11
+ "tokenize_chinese_chars": true,
12
+ "tokenizer_class": "BertTokenizer",
13
+ "unk_token": "[UNK]"
14
+ }
sentiment_analysis/bert_model.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# =========================================================
# BERT URGENCY MODEL TRAINING
# File: bert_model.py
# Purpose: Train urgency prediction (Low, Medium, High, Critical)
# =========================================================

import os
import re
import pickle
import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, matthews_corrcoef

from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)

from torch.utils.data import Dataset


# =========================================================
# PATH CONFIGURATION
# =========================================================
# All paths are resolved relative to this file so the script works from
# any working directory.

BASE_DIR = os.path.dirname(os.path.abspath(__file__))

DATA_PATH = os.path.join(BASE_DIR, "urgency_train.csv")

ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")

MODEL_DIR = os.path.join(ARTIFACT_DIR, "urgency_bert_model")

os.makedirs(MODEL_DIR, exist_ok=True)


# =========================================================
# PARAMETERS
# =========================================================

MAX_LENGTH = 128      # token truncation length — the inference side uses 128 too
EPOCHS = 4
BATCH_SIZE = 16
LEARNING_RATE = 2e-5


# =========================================================
# LOAD DATASET
# =========================================================

print(f"\nLoading dataset from: {DATA_PATH}")

df = pd.read_csv(DATA_PATH)

# Keep only the two columns the model needs; drop incomplete and
# duplicate rows before splitting.
df = df[["text", "urgency"]]

df.dropna(inplace=True)

df.drop_duplicates(inplace=True)
66
+
67
+
68
+ # =========================================================
69
+ # CLEAN TEXT
70
+ # =========================================================
71
+
72
+ def clean_text(text):
73
+
74
+ text = str(text)
75
+
76
+ text = re.sub(r"<.*?>", " ", text)
77
+
78
+ text = re.sub(r"\s+", " ", text).strip()
79
+
80
+ return text
81
+
82
+
83
+ df["text"] = df["text"].apply(clean_text)
84
+
85
+
86
+ # =========================================================
87
+ # LABEL ENCODING
88
+ # =========================================================
89
+
90
+ label_encoder = LabelEncoder()
91
+
92
+ df["label_id"] = label_encoder.fit_transform(df["urgency"])
93
+
94
+ label_map = dict(zip(
95
+ label_encoder.classes_,
96
+ label_encoder.transform(label_encoder.classes_)
97
+ ))
98
+
99
+
100
+ # SAVE LABEL ARTIFACTS
101
+
102
+ with open(os.path.join(MODEL_DIR, "label_encoder.pkl"), "wb") as f:
103
+ pickle.dump(label_encoder, f)
104
+
105
+ with open(os.path.join(MODEL_DIR, "label_map.pkl"), "wb") as f:
106
+ pickle.dump(label_map, f)
107
+
108
+
109
+ NUM_LABELS = len(label_map)
110
+
111
+ print("Classes:", label_map)
112
+
113
+
114
+ # =========================================================
115
+ # SPLIT DATA
116
+ # =========================================================
117
+
118
+ train_df, temp_df = train_test_split(
119
+
120
+ df,
121
+ test_size=0.30,
122
+ stratify=df["label_id"],
123
+ random_state=42
124
+ )
125
+
126
+ val_df, test_df = train_test_split(
127
+
128
+ temp_df,
129
+ test_size=0.50,
130
+ stratify=temp_df["label_id"],
131
+ random_state=42
132
+ )
133
+
134
+
135
+ # =========================================================
136
+ # TOKENIZER
137
+ # =========================================================
138
+
139
+ tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
140
+
141
+ tokenizer.save_pretrained(MODEL_DIR)
142
+
143
+
144
+ # =========================================================
145
+ # DATASET CLASS
146
+ # =========================================================
147
+
148
class UrgencyDataset(Dataset):
    """Torch dataset wrapping tokenised grievance texts plus integer labels.

    Tokenisation happens eagerly in __init__ using the module-level
    ``tokenizer`` and ``MAX_LENGTH``.
    """

    def __init__(self, texts, labels):
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH
        )
        self.labels = list(labels)

    def __getitem__(self, idx):
        sample = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

    def __len__(self):
        return len(self.labels)
172
+
173
+
174
+ train_dataset = UrgencyDataset(train_df["text"], train_df["label_id"])
175
+
176
+ val_dataset = UrgencyDataset(val_df["text"], val_df["label_id"])
177
+
178
+ test_dataset = UrgencyDataset(test_df["text"], test_df["label_id"])
179
+
180
+
181
+ # =========================================================
182
+ # LOAD MODEL
183
+ # =========================================================
184
+
185
+ model = BertForSequenceClassification.from_pretrained(
186
+
187
+ "bert-base-uncased",
188
+
189
+ num_labels=NUM_LABELS
190
+ )
191
+
192
+
193
+ # =========================================================
194
+ # METRICS
195
+ # =========================================================
196
+
197
def compute_metrics(eval_pred):
    """Trainer metrics hook: accuracy, balanced accuracy, weighted F1, MCC."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    metrics = {
        "accuracy": accuracy_score(labels, preds),
        "balanced_accuracy": balanced_accuracy_score(labels, preds),
        "f1_weighted": f1_score(labels, preds, average="weighted"),
        "mcc": matthews_corrcoef(labels, preds),
    }
    return metrics
213
+
214
+
215
# =========================================================
# TRAINING CONFIG
# =========================================================

training_args = TrainingArguments(
    output_dir=os.path.join(ARTIFACT_DIR, "results"),
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_steps=100,
    save_strategy="no",   # no checkpoints — the final model is saved explicitly below
    report_to="none"
)

# NOTE(review): eval_dataset and compute_metrics are wired up but no eval
# strategy is configured here, so validation metrics may never be reported
# during training — confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)
print("\nTraining urgency BERT model...")

trainer.train()


# =========================================================
# FINAL TEST EVALUATION
# =========================================================
# Held-out test split — evaluated once after training.

predictions = trainer.predict(test_dataset)

y_true = predictions.label_ids

y_pred = np.argmax(predictions.predictions, axis=1)

print("\nFINAL TEST RESULTS")

print("Accuracy:", accuracy_score(y_true, y_pred))

print("F1:", f1_score(y_true, y_pred, average="weighted"))


# =========================================================
# SAVE MODEL
# =========================================================

trainer.save_model(MODEL_DIR)

print("\nUrgency BERT model saved successfully.")
267
+
268
+
sentiment_analysis/bert_predict.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# =========================================================
# BERT URGENCY PREDICTION — ENGLISH
# =========================================================

import os
import torch
import pickle
from transformers import BertTokenizer, BertForSequenceClassification

# ── Load artifacts ────────────────────────────────────────
# Model, tokenizer and label encoder are loaded once at import time so
# every prediction reuses the same in-memory objects.
BASE_DIR = os.path.dirname(__file__)
MODEL_DIR = os.path.join(BASE_DIR, "artifacts", "urgency_bert_model")

tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
model = BertForSequenceClassification.from_pretrained(MODEL_DIR)

# NOTE(review): the file handle given to pickle.load is never closed —
# harmless at import time, but a `with open(...)` block would be tidier.
label_encoder = pickle.load(
    open(os.path.join(MODEL_DIR, "label_encoder.pkl"), "rb")
)

model.eval()  # inference only — disables dropout

MAX_LENGTH = 128  # matches the training-time truncation length
24
+
25
+
26
+ # ── Predict ───────────────────────────────────────────────
27
+ def predict_urgency(
28
+ text: str,
29
+ input_ids=None, # O3: pre-tokenised tensor from main.py
30
+ attention_mask=None, # O3: pre-tokenised tensor from main.py
31
+ ) -> dict:
32
+ """
33
+ Predict urgency level for English grievance text.
34
+
35
+ Args:
36
+ text : Raw input string.
37
+ input_ids : Optional pre-tokenised tensor (1, seq_len).
38
+ attention_mask : Required when input_ids is provided.
39
+
40
+ Returns dict with keys: urgency, confidence, class_index.
41
+ """
42
+ # O3: use pre-tokenised tensors if supplied; otherwise tokenise now.
43
+ if input_ids is None:
44
+ enc = tokenizer(
45
+ text,
46
+ return_tensors="pt",
47
+ truncation=True,
48
+ padding=False,
49
+ max_length=MAX_LENGTH,
50
+ )
51
+ input_ids = enc["input_ids"]
52
+ attention_mask = enc["attention_mask"]
53
+
54
+ with torch.no_grad():
55
+ outputs = model(input_ids=input_ids, attention_mask=attention_mask)
56
+
57
+ probs = torch.softmax(outputs.logits, dim=1)
58
+ conf, pred = torch.max(probs, dim=1)
59
+ confidence = conf.item()
60
+ predicted_index = pred.item()
61
+
62
+ urgency = label_encoder.inverse_transform([predicted_index])[0]
63
+
64
+ return {
65
+ "urgency": urgency,
66
+ "confidence": round(confidence, 4),
67
+ "class_index": predicted_index,
68
+ }
69
+
70
+
71
+ def get_model_and_tokenizer():
72
+ return model, tokenizer
73
+
74
+
75
+ # ── Standalone test ───────────────────────────────────────
76
+ if __name__ == "__main__":
77
+ print("\nBERT Urgency Prediction Test")
78
+ while True:
79
+ text = input("\nEnter grievance (or 'exit'): ")
80
+ if text.lower() == "exit":
81
+ break
82
+ print(predict_urgency(text))
sentiment_analysis/indic_bert_model.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =========================================================
2
+ # INDICBERT URGENCY MODEL TRAINING
3
+ # File: indic_bert_model.py
4
+ # Supports: Hindi + Telugu urgency prediction
5
+ # Labels: Low, Medium, High, Critical
6
+ # =========================================================
7
+
8
+ import os
9
+ import re
10
+ import pickle
11
+ import pandas as pd
12
+ import numpy as np
13
+ import torch
14
+
15
+ from sklearn.model_selection import train_test_split
16
+ from sklearn.preprocessing import LabelEncoder
17
+ from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, matthews_corrcoef
18
+
19
+ from transformers import (
20
+ AutoTokenizer,
21
+ AutoModelForSequenceClassification,
22
+ Trainer,
23
+ TrainingArguments
24
+ )
25
+
26
+ from torch.utils.data import Dataset
27
+
28
+
29
# =========================================================
# PATH CONFIG
# =========================================================

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(BASE_DIR, "urgency_indic.csv")
ARTIFACT_DIR = os.path.join(BASE_DIR, "artifacts")
MODEL_DIR = os.path.join(ARTIFACT_DIR, "indic_urgency_model")

# Make sure the output directory exists before any artifact is written.
os.makedirs(MODEL_DIR, exist_ok=True)


# =========================================================
# PARAMETERS
# =========================================================

MODEL_NAME = "ai4bharat/indic-bert"  # multilingual backbone covering Hindi/Telugu

MAX_LENGTH = 128      # tokens per example
EPOCHS = 4
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
54
+
55
+
56
# =========================================================
# LOAD DATASET
# =========================================================

print(f"\nLoading Indic urgency dataset from: {DATA_PATH}")

# Keep only the two columns the model needs, then drop gaps and repeats.
df = pd.read_csv(DATA_PATH)
df = df[["text", "urgency"]]
df = df.dropna()
df = df.drop_duplicates()
69
+
70
+
71
# =========================================================
# CLEAN TEXT
# =========================================================

def clean_text(text):
    """Strip HTML-like tags and collapse whitespace runs into single spaces."""
    without_tags = re.sub(r"<.*?>", " ", str(text))
    return re.sub(r"\s+", " ", without_tags).strip()
84
+
85
+
86
+ df["text"] = df["text"].apply(clean_text)
87
+
88
+
89
# =========================================================
# LABEL ENCODING
# =========================================================

label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["urgency"])

# Human-readable class → integer id mapping, persisted with the model.
label_map = {
    cls: idx
    for cls, idx in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}

# SAVE LABEL ARTIFACTS
with open(os.path.join(MODEL_DIR, "label_encoder.pkl"), "wb") as f:
    pickle.dump(label_encoder, f)

with open(os.path.join(MODEL_DIR, "label_map.pkl"), "wb") as f:
    pickle.dump(label_map, f)

NUM_LABELS = len(label_map)

print("Classes:", label_map)
115
+
116
+
117
# =========================================================
# TRAIN / VAL / TEST SPLIT
# =========================================================

# 70 % train; the remaining 30 % is halved into val / test.
# Both splits are stratified so each keeps the urgency class balance.
train_df, temp_df = train_test_split(
    df,
    test_size=0.30,
    stratify=df["label_id"],
    random_state=42,
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df["label_id"],
    random_state=42,
)
136
+
137
+
138
# =========================================================
# TOKENIZER
# =========================================================

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Persist the tokenizer next to the model so inference loads both from MODEL_DIR.
tokenizer.save_pretrained(MODEL_DIR)
145
+
146
+
147
# =========================================================
# DATASET CLASS
# =========================================================

class IndicUrgencyDataset(Dataset):
    """Torch Dataset pairing pre-tokenised encodings with urgency label ids."""

    def __init__(self, texts, labels):
        # Tokenise the whole split up-front; padding=True pads every example
        # to the longest sequence in the split.
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH,
        )
        self.labels = list(labels)

    def __getitem__(self, idx):
        sample = {
            name: torch.tensor(values[idx])
            for name, values in self.encodings.items()
        }
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

    def __len__(self):
        return len(self.labels)
182
+
183
+
184
+ train_dataset = IndicUrgencyDataset(train_df["text"], train_df["label_id"])
185
+
186
+ val_dataset = IndicUrgencyDataset(val_df["text"], val_df["label_id"])
187
+
188
+ test_dataset = IndicUrgencyDataset(test_df["text"], test_df["label_id"])
189
+
190
+
191
# =========================================================
# MODEL
# =========================================================

# Pretrained backbone with a fresh classification head sized to the
# number of urgency classes.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)
201
+
202
+
203
# =========================================================
# METRICS
# =========================================================

def compute_metrics(eval_pred):
    """Return accuracy / balanced accuracy / weighted F1 / MCC for the Trainer."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    return {
        "accuracy": accuracy_score(labels, predictions),
        "balanced_accuracy": balanced_accuracy_score(labels, predictions),
        "f1_weighted": f1_score(labels, predictions, average="weighted"),
        "mcc": matthews_corrcoef(labels, predictions),
    }
223
+
224
+
225
# =========================================================
# TRAINING CONFIG
# =========================================================
training_args = TrainingArguments(
    output_dir=f"{ARTIFACT_DIR}/indic_results",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_steps=100,
    save_strategy="no",   # no mid-training checkpoints; final model saved below
    report_to="none",     # disable wandb/tensorboard reporting
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

print("\nTraining IndicBERT urgency model...")
trainer.train()


# =========================================================
# SAVE MODEL
# =========================================================

trainer.save_model(MODEL_DIR)

print("\nIndicBERT urgency model saved successfully.")
sentiment_analysis/indic_bert_predict.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =========================================================
2
+ # INDICBERT URGENCY PREDICTION — HINDI + TELUGU
3
+ # =========================================================
4
+
5
+ import os
6
+ import re
7
+ import torch
8
+ import pickle
9
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
10
+
11
# ── Load artifacts ────────────────────────────────────────
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_DIR = os.path.join(BASE_DIR, "artifacts", "indic_urgency_model")

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model.eval()  # inference-only mode

with open(os.path.join(MODEL_DIR, "label_encoder.pkl"), "rb") as f:
    label_encoder = pickle.load(f)

MAX_LENGTH = 128  # must match the training-time token budget
23
+
24
+
25
# ── Text cleaning ─────────────────────────────────────────
def clean_text(text: str) -> str:
    """Drop HTML-like tags and squeeze whitespace down to single spaces."""
    stripped = re.sub(r"<.*?>", " ", str(text))
    return re.sub(r"\s+", " ", stripped).strip()
31
+
32
+
33
# ── Predict ───────────────────────────────────────────────
def predict(
    text: str,
    input_ids=None,       # O3: pre-tokenised tensor from main.py
    attention_mask=None,  # O3: pre-tokenised tensor from main.py
) -> dict:
    """
    Predict urgency level for Hindi / Telugu grievance text.

    Args:
        text : Raw input string (ignored when input_ids is given).
        input_ids : Optional pre-tokenised tensor (1, seq_len).
        attention_mask : Required when input_ids is provided.

    Returns dict with keys: urgency, confidence, class_index.

    Raises:
        ValueError: if input_ids is supplied without attention_mask.
    """
    # O3: use pre-tokenised tensors if supplied; otherwise tokenise now.
    if input_ids is None:
        cleaned = clean_text(text)
        enc = tokenizer(
            cleaned,
            return_tensors="pt",
            truncation=True,
            padding=False,
            max_length=MAX_LENGTH,
        )
        input_ids = enc["input_ids"]
        attention_mask = enc["attention_mask"]
    elif attention_mask is None:
        # Fail fast with a clear message instead of an opaque model error.
        raise ValueError("attention_mask is required when input_ids is provided")

    with torch.no_grad():  # inference only — skip gradient bookkeeping
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    probs = torch.softmax(outputs.logits, dim=1)
    conf, pred = torch.max(probs, dim=1)
    confidence = conf.item()
    predicted_index = pred.item()

    # Map the class index back to its human-readable urgency label.
    urgency = label_encoder.inverse_transform([predicted_index])[0]

    return {
        "urgency": urgency,
        "confidence": round(confidence, 4),
        "class_index": predicted_index,
    }
77
+
78
+
79
def get_model_and_tokenizer():
    """Return the loaded (model, tokenizer) pair so callers avoid re-loading."""
    return (model, tokenizer)
81
+
82
+
83
# ── Standalone test ───────────────────────────────────────
if __name__ == "__main__":
    # Simple REPL: keep predicting until the user types 'exit'.
    while True:
        user_text = input("\nEnter Hindi/Telugu grievance (or 'exit'): ")
        if user_text.lower() == "exit":
            break
        print(predict(user_text))