sgAtdbd committed
Commit 8ad9255 · 1 Parent(s): 4a2bdba

Initial deployment of HateShield backend

.dockerignore ADDED
@@ -0,0 +1,7 @@
+ __pycache__/
+ *.pyc
+ *.pyo
+ venv/
+ data/*.csv
+ *.log
+ .env
.gitignore ADDED
@@ -0,0 +1,18 @@
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ .Python
+ venv/
+ env/
+ .env
+ .venv
+ *.log
+ .DS_Store
+ *.csv
+ .pytest_cache/
+ .coverage
+ htmlcov/
+ dist/
+ build/
+ *.egg-info/
Dockerfile ADDED
@@ -0,0 +1,30 @@
+ FROM python:3.10-slim
+
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements first for better caching
+ COPY requirements.txt .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy application files
+ COPY . .
+
+ # Create cache directories
+ RUN mkdir -p /tmp/transformers_cache /tmp/huggingface
+
+ # Set environment variables
+ ENV TRANSFORMERS_CACHE=/tmp/transformers_cache
+ ENV HF_HOME=/tmp/huggingface
+
+ # Expose port 7860 (Hugging Face Spaces default)
+ EXPOSE 7860
+
+ # Run the application
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,30 @@
  ---
- title: Hateshield Bn
- emoji: 👁
- colorFrom: green
- colorTo: indigo
+ title: HateShield Backend
+ emoji: 🛡️
+ colorFrom: blue
+ colorTo: purple
  sdk: docker
  pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # HateShield Backend API
+
+ Bilingual hate speech detection system using ensemble ML models.
+
+ ## Features
+ - English & Bengali hate speech detection
+ - Document analysis (PDF, DOCX, TXT)
+ - URL content scraping
+ - Real-time confidence scoring
+
+ ## API Endpoints
+ - `POST /api/analyze/text` - Analyze text input
+ - `POST /api/analyze/url` - Analyze URL content
+ - `POST /api/analyze/document` - Analyze uploaded documents
+ - `GET /health` - Health check
+
+ ## Tech Stack
+ - FastAPI
+ - Transformers (Hugging Face)
+ - scikit-learn
+ - PyTorch
__init__.py ADDED
File without changes
api/__init__.py ADDED
File without changes
api/routes.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,9 @@
+ import os
+ os.environ['TRANSFORMERS_CACHE'] = '/tmp/transformers_cache'
+ os.environ['HF_HOME'] = '/tmp/huggingface'
+
+ from main import app
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=7860)  # HF Spaces uses port 7860
main.py ADDED
@@ -0,0 +1,119 @@
+ from fastapi import FastAPI, HTTPException, UploadFile, File
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel, HttpUrl
+ from typing import Optional
+ import uvicorn
+
+ from services.analyzer import analyze_content
+ from services.text_extractor import extract_from_url, extract_from_document
+
+ app = FastAPI(
+     title="HateShield-BN API",
+     description="Bilingual Hate Speech Detection System",
+     version="1.0.0"
+ )
+
+ # CORS
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["http://localhost:5173", "http://localhost:3000"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Request models
+ class TextRequest(BaseModel):
+     text: str
+
+ class URLRequest(BaseModel):
+     url: HttpUrl
+
+ # Routes
+ @app.get("/")
+ async def root():
+     return {
+         "message": "HateShield-BN API is running!",
+         "version": "1.0.0",
+         "endpoints": {
+             "text": "/api/analyze/text",
+             "url": "/api/analyze/url",
+             "document": "/api/analyze/document"
+         }
+     }
+
+ @app.post("/api/analyze/text")
+ async def analyze_text(request: TextRequest):
+     """Analyze text for hate speech"""
+     try:
+         if not request.text or len(request.text.strip()) == 0:
+             raise HTTPException(status_code=400, detail="Text cannot be empty")
+
+         result = await analyze_content(request.text)
+         return result
+     except HTTPException:
+         # Re-raise so the 400 above is not converted into a 500
+         raise
+     except Exception as e:
+         print(f"Error analyzing text: {e}")
+         raise HTTPException(status_code=500, detail=str(e))
+
+ @app.post("/api/analyze/url")
+ async def analyze_url(request: URLRequest):
+     """Analyze content from URL"""
+     try:
+         # Note: extract_from_url is synchronous
+         text = extract_from_url(str(request.url))
+
+         if not text:
+             raise HTTPException(status_code=400, detail="Could not extract text from URL")
+
+         result = await analyze_content(text)
+         return result
+     except HTTPException:
+         raise
+     except Exception as e:
+         print(f"Error analyzing URL: {e}")
+         raise HTTPException(status_code=500, detail=str(e))
+
+ @app.post("/api/analyze/document")
+ async def analyze_document(file: UploadFile = File(...)):
+     """Analyze uploaded document"""
+     try:
+         # Check file type
+         allowed_types = [".pdf", ".docx", ".txt"]
+         file_ext = f".{file.filename.split('.')[-1].lower()}"
+
+         if file_ext not in allowed_types:
+             raise HTTPException(
+                 status_code=400,
+                 detail=f"File type {file_ext} not supported. Allowed: {', '.join(allowed_types)}"
+             )
+
+         # Read file content
+         content = await file.read()
+
+         # Note: extract_from_document is synchronous
+         text = extract_from_document(content, file_ext)
+
+         if not text:
+             raise HTTPException(status_code=400, detail="Could not extract text from document")
+
+         result = await analyze_content(text)
+         return result
+     except HTTPException:
+         raise
+     except Exception as e:
+         print(f"Error analyzing document: {e}")
+         raise HTTPException(status_code=500, detail=str(e))
+
+ @app.get("/health")
+ async def health_check():
+     return {"status": "healthy"}
+
+ if __name__ == "__main__":
+     uvicorn.run(
+         "main:app",
+         host="0.0.0.0",
+         port=8000,
+         reload=True
+     )
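For an in-process check of these routes, a minimal sketch with FastAPI's TestClient (assumes `httpx` is installed, which the test client needs but requirements.txt does not pin, and that importing `main` can build the global classifier locally):

```python
# Exercises the routes above without a running server.
from fastapi.testclient import TestClient  # requires httpx

from main import app

client = TestClient(app)
assert client.get("/health").json() == {"status": "healthy"}

resp = client.post("/api/analyze/text", json={"text": "sample input"})
print(resp.status_code, resp.json()["ensemble"]["category"])
```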
models/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .hate_speech_classifier import HateSpeechClassifier
+
+ __all__ = ['HateSpeechClassifier']
models/hate_speech_classifier.py ADDED
@@ -0,0 +1,416 @@
+ from typing import Dict, Optional
+ from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
+ import joblib
+ import os
+ import re
+ import torch
+ from deep_translator import GoogleTranslator
+
+ class HateSpeechClassifier:
+     def __init__(self):
+         base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+         models_dir = os.path.join(base_dir, "models", "model_weights", "custom_models")
+
+         # Initialize translator
+         self.translator = GoogleTranslator(source='bn', target='en')
+
+         # Use multiple pretrained models for better accuracy
+         self.pretrained_models = {
+             "primary": {
+                 "name": "facebook/roberta-hate-speech-dynabench-r4-target",
+                 "pipeline": None,
+                 "weight": 0.6
+             },
+             "secondary": {
+                 "name": "cardiffnlp/twitter-roberta-base-hate-latest",
+                 "pipeline": None,
+                 "weight": 0.4
+             }
+         }
+
+         # English custom model paths
+         self.english_model_path = os.path.join(models_dir, "english_model.pkl")
+         self.english_vectorizer_path = os.path.join(models_dir, "english_vectorizer.pkl")
+         self.english_model = None
+         self.english_vectorizer = None
+         self.english_model_loaded = False
+
+         # Bengali custom model paths
+         self.bengali_model_path = os.path.join(models_dir, "bengali_model.pkl")
+         self.bengali_vectorizer_path = os.path.join(models_dir, "bengali_vectorizer.pkl")
+         self.bengali_model = None
+         self.bengali_vectorizer = None
+         self.bengali_model_loaded = False
+
+         # Load models
+         self._load_custom_models()
+
+         # Enhanced hate keywords
+         self.hate_keywords = {
+             "english": [
+                 "hate", "kill", "death", "violence", "murder", "attack", "destroy", "eliminate",
+                 "die", "dead", "shoot", "stab", "burn", "hang", "lynch",
+                 "terrorist", "racist", "sexist", "discrimination", "discriminate",
+                 "scheduled caste", "scheduled tribe", "dalit", "lower caste", "untouchable",
+                 "chamar", "bhangi", "sc/st", "reservation quota",
+                 "no right to live", "don't deserve", "shouldn't exist", "subhuman",
+                 "inferior", "worthless", "scum", "vermin", "parasite",
+                 "should be killed", "must die", "deserve to die", "need to be eliminated",
+                 "jihadi", "kafir", "infidel", "terrorist religion", "religious extremist",
+                 "nigger", "chink", "paki", "kike", "faggot", "tranny"
+             ],
+             "bengali": [
+                 "শালা", "হালা", "মাগি", "কুত্তা", "হারামি", "চোদ", "বাল",
+                 "ঘৃণা", "মারো", "মৃত্যু", "সন্ত্রাসী", "বোকা", "মূর্খ",
+                 "বিদ্বেষ", "ভয়ঙ্কর", "জঘন্য", "হত্যা", "আক্রমণ",
+                 "দলিত", "নিম্নবর্ণ", "অস্পৃশ্য"
+             ]
+         }
+
+         self.hate_patterns = {
+             "english": [
+                 r"no right to (live|exist|be here|survive)",
+                 r"(should|must|need to|ought to) (die|be killed|be eliminated|perish)",
+                 r"don'?t deserve (to live|life|existence|to exist)",
+                 r"(get rid of|eliminate|exterminate|wipe out) (them|these|those|the)",
+                 r"(scheduled caste|dalit|lower caste|sc/st).{0,50}(no right|shouldn't|don't deserve)",
+                 r"(religious|ethnic|caste|racial) (cleansing|purification|genocide)",
+                 r"(send|throw|kick|drive) (them|back) (out|away|home)",
+                 r"(all|these) .{0,30} (should die|must be killed|need to go)",
+                 r"(death to|kill all|eliminate all) .{0,30}",
+                 r"(inferior|subhuman|less than human|not human)",
+             ],
+             "bengali": [
+                 r"বাঁচার অধিকার নেই",
+                 r"মরে যাওয়া উচিত",
+                 r"নিশ্চিহ্ন করা উচিত"
+             ]
+         }
+
+         self.offensive_keywords = {
+             "english": [
+                 "damn", "hell", "crap", "suck", "dumb", "loser", "trash",
+                 "stupid", "idiot", "moron", "pathetic", "bad", "ugly",
+                 "disgusting", "nasty", "filthy", "asshole", "bitch", "bastard"
+             ],
+             "bengali": ["বাজে", "খারাপ", "নোংরা", "বেকুব"]
+         }
+
+     def _translate_to_english(self, text: str) -> Optional[str]:
+         """Translate Bengali to English using deep-translator"""
+         try:
+             print("🔄 Translating Bengali text to English...")
+
+             # deep-translator has a 5000 character limit per request
+             max_chars = 4500
+             if len(text) > max_chars:
+                 text_to_translate = text[:max_chars]
+                 print(f"⚠️ Text truncated to {max_chars} characters for translation")
+             else:
+                 text_to_translate = text
+
+             # Translate using Google Translate
+             translated_text = self.translator.translate(text_to_translate)
+
+             print("✓ Translation successful")
+             print(f"   Original (Bengali): {text_to_translate[:100]}...")
+             print(f"   Translated (English): {translated_text[:100]}...")
+
+             return translated_text
+         except Exception as e:
+             print(f"❌ Translation failed: {e}")
+             # Try splitting into smaller chunks if it fails
+             try:
+                 print("🔄 Retrying with smaller chunks...")
+                 words = text.split()
+                 chunks = []
+                 current_chunk = []
+                 current_length = 0
+
+                 for word in words:
+                     if current_length + len(word) > 1000:  # Smaller chunks
+                         if current_chunk:
+                             chunks.append(' '.join(current_chunk))
+                         current_chunk = [word]
+                         current_length = len(word)
+                     else:
+                         current_chunk.append(word)
+                         current_length += len(word) + 1
+
+                 if current_chunk:
+                     chunks.append(' '.join(current_chunk))
+
+                 translated_chunks = []
+                 for chunk in chunks[:5]:  # Translate max 5 chunks
+                     translated_chunk = self.translator.translate(chunk)
+                     translated_chunks.append(translated_chunk)
+
+                 translated_text = ' '.join(translated_chunks)
+                 print("✓ Translation successful with chunking")
+                 return translated_text
+             except Exception as e2:
+                 print(f"❌ Translation with chunking also failed: {e2}")
+                 return None
+
+     def _load_custom_models(self):
+         """Load language-specific custom models"""
+         try:
+             if os.path.exists(self.english_model_path) and os.path.exists(self.english_vectorizer_path):
+                 print("Loading English custom model...")
+                 self.english_model = joblib.load(self.english_model_path)
+                 self.english_vectorizer = joblib.load(self.english_vectorizer_path)
+                 self.english_model_loaded = True
+                 print("✓ English custom model loaded")
+             else:
+                 print("❌ English custom model not found")
+                 self.english_model_loaded = False
+         except Exception as e:
+             print(f"❌ Error loading English model: {e}")
+             self.english_model_loaded = False
+
+         try:
+             if os.path.exists(self.bengali_model_path) and os.path.exists(self.bengali_vectorizer_path):
+                 print("Loading Bengali custom model...")
+                 self.bengali_model = joblib.load(self.bengali_model_path)
+                 self.bengali_vectorizer = joblib.load(self.bengali_vectorizer_path)
+                 self.bengali_model_loaded = True
+                 print("✓ Bengali custom model loaded")
+             else:
+                 print("❌ Bengali custom model not found")
+                 self.bengali_model_loaded = False
+         except Exception as e:
+             print(f"❌ Error loading Bengali model: {e}")
+             self.bengali_model_loaded = False
+
+     def _load_pretrained_model(self, model_key: str):
+         """Lazy load pretrained model"""
+         model_info = self.pretrained_models.get(model_key)
+         if not model_info:
+             return
+
+         if model_info["pipeline"] is None:
+             try:
+                 print(f"Loading {model_key} pretrained model: {model_info['name']}...")
+                 model_info["pipeline"] = pipeline(
+                     "text-classification",
+                     model=model_info["name"],
+                     device=-1,
+                     top_k=None,
+                     truncation=True,
+                     max_length=512
+                 )
+                 print(f"✓ {model_key} pretrained model loaded")
+             except Exception as e:
+                 print(f"❌ Error loading {model_key} pretrained model: {e}")
+                 model_info["pipeline"] = None
+
+     async def classify_with_custom_model(self, text: str, language: str) -> Optional[Dict]:
+         """Classify using language-specific custom model"""
+         if language == "english":
+             if not self.english_model_loaded:
+                 return None
+             model = self.english_model
+             vectorizer = self.english_vectorizer
+         elif language == "bengali":
+             if not self.bengali_model_loaded:
+                 return None
+             model = self.bengali_model
+             vectorizer = self.bengali_vectorizer
+         else:
+             return None
+
+         try:
+             X = vectorizer.transform([text])
+             prediction = model.predict(X)[0]
+
+             if hasattr(model, 'predict_proba'):
+                 probabilities = model.predict_proba(X)[0]
+                 confidence = float(max(probabilities))
+             else:
+                 confidence = 0.75
+
+             if language == "english":
+                 if prediction == 0:
+                     category = "neutral"
+                 else:
+                     category = "hate_speech"
+             else:
+                 if prediction == 0:
+                     category = "neutral"
+                 elif prediction == 1:
+                     category = "offensive"
+                 else:
+                     category = "hate_speech"
+
+             return {
+                 "category": category,
+                 "confidence": confidence,
+                 "method": f"custom_model_{language}",
+                 "raw_prediction": int(prediction)
+             }
+         except Exception as e:
+             print(f"❌ Custom model classification failed: {e}")
+             return None
+
+     async def classify_with_pretrained_model(self, text: str, language: str = "english") -> Optional[Dict]:
+         """Classify using ensemble of pretrained models with translation support"""
+
+         # Translate Bengali text to English
+         translated_text = None
+         if language == "bengali":
+             translated_text = self._translate_to_english(text)
+             if not translated_text:
+                 print("❌ Translation failed, skipping pretrained models")
+                 return None
+             text_to_analyze = translated_text
+         else:
+             text_to_analyze = text
+
+         results = []
+
+         # For long texts, analyze first 400 words
+         words = text_to_analyze.split()
+         if len(words) > 400:
+             truncated_text = ' '.join(words[:400])
+             print(f"⚠️ Text too long ({len(words)} words), analyzing first 400 words")
+         else:
+             truncated_text = text_to_analyze
+
+         # Try primary model
+         self._load_pretrained_model("primary")
+         primary = self.pretrained_models["primary"]
+
+         if primary["pipeline"] is not None:
+             try:
+                 result = primary["pipeline"](truncated_text)[0]
+
+                 if isinstance(result, list):
+                     result = result[0]
+
+                 label = result['label'].lower()
+                 confidence = float(result['score'])
+
+                 if 'hate' in label and 'not' not in label:
+                     category = "hate_speech"
+                 elif 'not' in label or 'non' in label:
+                     category = "neutral"
+                 else:
+                     category = "offensive"
+
+                 results.append({
+                     "category": category,
+                     "confidence": confidence,
+                     "weight": primary["weight"],
+                     "model": "primary",
+                     "raw_label": result['label']
+                 })
+
+                 print(f"[Primary Model] {result['label']} -> {category} ({confidence:.2%})")
+             except Exception as e:
+                 print(f"❌ Primary model failed: {e}")
+
+         # Try secondary model
+         self._load_pretrained_model("secondary")
+         secondary = self.pretrained_models["secondary"]
+
+         if secondary["pipeline"] is not None:
+             try:
+                 result = secondary["pipeline"](truncated_text)[0]
+
+                 if isinstance(result, list):
+                     result = result[0]
+
+                 label = result['label'].lower()
+                 confidence = float(result['score'])
+
+                 if 'hate' in label:
+                     category = "hate_speech"
+                 elif 'offensive' in label:
+                     category = "offensive"
+                 else:
+                     category = "neutral"
+
+                 results.append({
+                     "category": category,
+                     "confidence": confidence,
+                     "weight": secondary["weight"],
+                     "model": "secondary",
+                     "raw_label": result['label']
+                 })
+
+                 print(f"[Secondary Model] {result['label']} -> {category} ({confidence:.2%})")
+             except Exception as e:
+                 print(f"❌ Secondary model failed: {e}")
+
+         if not results:
+             return None
+
+         # Ensemble voting
+         category_scores = {}
+         for result in results:
+             cat = result["category"]
+             score = result["confidence"] * result["weight"]
+             category_scores[cat] = category_scores.get(cat, 0) + score
+
+         final_category = max(category_scores, key=category_scores.get)
+         total_weight = sum(r["weight"] for r in results)
+         final_confidence = category_scores[final_category] / total_weight
+
+         raw_labels = [r["raw_label"] for r in results]
+
+         return {
+             "category": final_category,
+             "confidence": final_confidence,
+             "method": "pretrained_ensemble",
+             "raw_labels": raw_labels,
+             "models_used": [r["model"] for r in results],
+             "translated": language == "bengali",
+             "translated_text": translated_text[:200] + "..." if translated_text and len(translated_text) > 200 else translated_text
+         }
+
+     def classify_with_keywords(self, text: str, language: str) -> Dict:
+         """Classify using keyword and pattern matching"""
+         text_lower = text.lower()
+
+         hate_count = sum(1 for keyword in self.hate_keywords.get(language, [])
+                          if keyword.lower() in text_lower)
+         offensive_count = sum(1 for keyword in self.offensive_keywords.get(language, [])
+                               if keyword.lower() in text_lower)
+
+         pattern_matches = []
+         matched_patterns = []
+         for pattern in self.hate_patterns.get(language, []):
+             match = re.search(pattern, text_lower, re.IGNORECASE)
+             if match:
+                 pattern_matches.append(pattern)
+                 matched_patterns.append(match.group(0))
+
+         if pattern_matches or hate_count > 0:
+             category = "hate_speech"
+             base_confidence = 0.90 if pattern_matches else 0.7
+             confidence = min(base_confidence + (hate_count * 0.03), 0.98)
+         elif offensive_count > 0:
+             category = "offensive"
+             confidence = min(0.6 + (offensive_count * 0.08), 0.88)
+         else:
+             category = "neutral"
+             confidence = 0.7
+
+         detected_keywords = []
+         for keyword in self.hate_keywords.get(language, []):
+             if keyword.lower() in text_lower:
+                 detected_keywords.append(keyword)
+         for keyword in self.offensive_keywords.get(language, []):
+             if keyword.lower() in text_lower:
+                 detected_keywords.append(keyword)
+
+         return {
+             "category": category,
+             "confidence": confidence,
+             "method": "keyword_matching",
+             "detected_keywords": detected_keywords,
+             "hate_count": hate_count,
+             "offensive_count": offensive_count,
+             "pattern_matches": len(pattern_matches),
+             "matched_patterns": matched_patterns[:3]
+         }
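Of the three classification paths, only `classify_with_keywords` is synchronous and independent of any model weights, so it is the easiest piece to sanity-check in isolation; a minimal sketch (constructing the classifier without the `.pkl` files just logs that the custom models are missing):

```python
# Keyword/pattern path only; no transformer downloads or pickles needed.
from models.hate_speech_classifier import HateSpeechClassifier

clf = HateSpeechClassifier()
result = clf.classify_with_keywords("they have no right to live", "english")
print(result["category"])          # hate_speech (regex pattern match)
print(result["matched_patterns"])  # e.g. ['no right to live']
```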
models/language_detector.py ADDED
@@ -0,0 +1,85 @@
+ from langdetect import detect, DetectorFactory, LangDetectException
+ import re
+
+ # Set seed for consistent results
+ DetectorFactory.seed = 0
+
+ def detect_language(text: str) -> str:
+     """
+     Detect if text is English, Bengali, Mixed, or Unknown
+     Uses multiple detection strategies for accuracy
+     """
+
+     if not text or len(text.strip()) < 3:
+         return "unknown"
+
+     # Strategy 1: Check for Bengali Unicode characters
+     bengali_pattern = r'[\u0980-\u09FF]'
+     has_bengali = bool(re.search(bengali_pattern, text))
+
+     # Strategy 2: Check for English characters
+     english_pattern = r'[a-zA-Z]'
+     has_english = bool(re.search(english_pattern, text))
+
+     # If both present, it's mixed
+     if has_bengali and has_english:
+         bengali_chars = len(re.findall(bengali_pattern, text))
+         english_chars = len(re.findall(english_pattern, text))
+
+         # If one language dominates heavily (>80%), classify as that language
+         total_chars = bengali_chars + english_chars
+         if bengali_chars / total_chars > 0.8:
+             return "bengali"
+         elif english_chars / total_chars > 0.8:
+             return "english"
+         else:
+             return "mixed"
+
+     # If only Bengali
+     if has_bengali:
+         return "bengali"
+
+     # If only English
+     if has_english:
+         try:
+             # Use langdetect for confirmation
+             detected = detect(text)
+             if detected == 'en':
+                 return "english"
+             elif detected == 'bn':
+                 return "bengali"
+             else:
+                 # If langdetect finds another language but we have English chars
+                 return "english"
+         except LangDetectException:
+             return "english"
+
+     # Fallback to langdetect
+     try:
+         detected = detect(text)
+         if detected == 'en':
+             return "english"
+         elif detected == 'bn':
+             return "bengali"
+         else:
+             return "unknown"
+     except LangDetectException:
+         return "unknown"
+
+ def get_language_script_info(text: str) -> dict:
+     """
+     Get detailed information about the scripts used in text
+     Useful for debugging and fine-tuning
+     """
+     bengali_chars = len(re.findall(r'[\u0980-\u09FF]', text))
+     english_chars = len(re.findall(r'[a-zA-Z]', text))
+     digits = len(re.findall(r'\d', text))
+     other_chars = len(text) - bengali_chars - english_chars - digits
+
+     return {
+         "bengali_characters": bengali_chars,
+         "english_characters": english_chars,
+         "digits": digits,
+         "other_characters": other_chars,
+         "total_length": len(text)
+     }
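A short sketch of the detector on the main cases (outputs follow the rules above: Bengali script wins outright, Latin-only text is confirmed via langdetect, and a roughly balanced mix returns "mixed"):

```python
from models.language_detector import detect_language, get_language_script_info

print(detect_language("This is plain English text."))       # english
print(detect_language("আমি ভাত খাই"))                        # bengali
print(detect_language("আমি rice খাই because ভাত is good"))  # mixed
print(get_language_script_info("abc 123"))                   # per-script character counts
```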
models/model_weights/custom_models/bengali_model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d332e9f2678d28c8d70a8ce7d003d3219a164168a4881ac962832235fd75f485
+ size 40879
models/model_weights/custom_models/bengali_vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:97332e9985a028a664f245948d6fdd4c6f4f604ef91b98b8865bef925971ba92
+ size 200620
models/model_weights/custom_models/english_model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d9a5a7ee8483b34cac119f50c01bb3806e3e6d5f5e8dff842ca4b599cfd32e14
+ size 40747
models/model_weights/custom_models/english_vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2732dbcd00696ba2022190a179a993a9d7a869bca51c266ffa368bc52dc26d06
+ size 186651
models/model_weights/custom_models/metadata.json ADDED
@@ -0,0 +1,47 @@
+ {
+   "training_date": "2025-11-10 12:05:38",
+   "models": {
+     "english": {
+       "best_model": "svm",
+       "f1_score": 0.824268566911743,
+       "num_classes": 2,
+       "samples": 726119,
+       "comparison": {
+         "logistic": {
+           "accuracy": 0.8236104225196937,
+           "f1_score": 0.8236057473045872,
+           "training_time": 5.804867267608643
+         },
+         "svm": {
+           "accuracy": 0.8242714702803944,
+           "f1_score": 0.824268566911743,
+           "training_time": 22.070060968399048
+         }
+       }
+     },
+     "bengali": {
+       "best_model": "logistic",
+       "f1_score": 0.8723120553261358,
+       "num_classes": 2,
+       "samples": 30000,
+       "comparison": {
+         "logistic": {
+           "accuracy": 0.872,
+           "f1_score": 0.8723120553261358,
+           "training_time": 1.3237473964691162
+         },
+         "svm": {
+           "accuracy": 0.8625,
+           "f1_score": 0.862875926779109,
+           "training_time": 0.345095157623291
+         }
+       }
+     }
+   },
+   "separate_models": true,
+   "algorithms_tested": [
+     "logistic",
+     "svm",
+     "random_forest"
+   ]
+ }
models/model_weights/custom_models/model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:33f55b33ac7ffa8fa0d1025978c589da482f0538cf6756cc8874adb115a556a5
+ size 120779
models/model_weights/custom_models/vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:24ba3e80100ca6511ec5a64f10233136f3a4a83d92cb39bf7e8e9eb5c4cbd942
+ size 186321
models/train_model.py ADDED
@@ -0,0 +1,482 @@
+ """
+ Training script for HateShield-BN Custom Model
+ Trains SEPARATE models for English and Bengali datasets
+ Compares multiple algorithms and saves the best one
+ """
+
+ import pandas as pd
+ import numpy as np
+ from sklearn.model_selection import train_test_split
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.svm import LinearSVC
+ from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
+ import joblib
+ import os
+ from typing import Tuple, Dict
+ import warnings
+ from tqdm import tqdm
+ import time
+ import json
+
+ warnings.filterwarnings('ignore')
+
+ # Configuration
+ ENGLISH_DATASET_PATH = "data/english_hate_speech.csv"
+ BENGALI_DATASET_PATH = "data/bengali_hate_speech.csv"
+ MODEL_OUTPUT_PATH = "models/model_weights/custom_models"
+ RANDOM_STATE = 42
+
+ def load_english_dataset() -> pd.DataFrame:
+     """Load and preprocess English dataset"""
+     print("📄 Loading English dataset...")
+
+     try:
+         df = pd.read_csv(ENGLISH_DATASET_PATH)
+         print(f"   ✓ Loaded: {len(df):,} samples")
+
+         # Standardize column names
+         if 'content' in df.columns:
+             df = df.rename(columns={'content': 'text'})
+         elif 'Content' in df.columns:
+             df = df.rename(columns={'Content': 'text'})
+
+         # Ensure label column
+         if 'Label' in df.columns:
+             df['label'] = df['Label'].astype(int)
+         elif 'label' in df.columns:
+             df['label'] = df['label'].astype(int)
+         else:
+             raise ValueError("English dataset must have 'Label' or 'label' column")
+
+         # Keep only text and label
+         df = df[['text', 'label']].copy()
+
+         # Clean data
+         df = df.dropna(subset=['text', 'label'])
+         df = df[df['text'].str.strip().str.len() > 0]
+
+         # Ensure binary labels (0, 1)
+         unique_labels = df['label'].unique()
+         print(f"   📊 Unique labels: {sorted(unique_labels)}")
+
+         if set(unique_labels) == {0, 1}:
+             print("   ✓ Binary classification: 0=Non-Hate, 1=Hate")
+         else:
+             print(f"   ⚠️ Warning: Expected binary labels, found: {unique_labels}")
+             # Convert to binary if needed
+             df['label'] = (df['label'] > 0).astype(int)
+
+         print(f"   ✓ After preprocessing: {len(df):,} samples")
+
+         return df
+
+     except FileNotFoundError:
+         print(f"   ❌ Error: File not found at {ENGLISH_DATASET_PATH}")
+         return pd.DataFrame(columns=['text', 'label'])
+     except Exception as e:
+         print(f"   ❌ Error loading English dataset: {e}")
+         return pd.DataFrame(columns=['text', 'label'])
+
+ def load_bengali_dataset() -> pd.DataFrame:
+     """Load and preprocess Bengali dataset"""
+     print("\n📄 Loading Bengali dataset...")
+
+     try:
+         df = pd.read_csv(BENGALI_DATASET_PATH)
+         print(f"   ✓ Loaded: {len(df):,} samples")
+
+         # Standardize column names
+         if 'sentence' in df.columns:
+             df = df.rename(columns={'sentence': 'text'})
+         elif 'sentences' in df.columns:
+             df = df.rename(columns={'sentences': 'text'})
+
+         # Convert hate/category to standard labels
+         if 'hate' in df.columns:
+             if 'category' in df.columns:
+                 category_map = {
+                     'non-hate': 0,
+                     'offensive': 1,
+                     'hate': 2,
+                 }
+                 df['label'] = df['category'].map(category_map)
+                 # Fill missing with hate column
+                 df.loc[df['label'].isna() & (df['hate'] == 1), 'label'] = 2
+                 df.loc[df['label'].isna() & (df['hate'] == 0), 'label'] = 0
+             else:
+                 # If only 'hate' column, map: 0=non-hate, 1=hate (as offensive), 2=hate
+                 df['label'] = df['hate'].apply(lambda x: 2 if x == 1 else 0)
+
+         df['label'] = df['label'].astype(int)
+         df = df[['text', 'label']].copy()
+
+         # Clean data
+         df = df.dropna(subset=['text', 'label'])
+         df = df[df['text'].str.strip().str.len() > 0]
+
+         # Ensure multi-class labels (0, 1, 2)
+         unique_labels = df['label'].unique()
+         print(f"   📊 Unique labels: {sorted(unique_labels)}")
+
+         if set(unique_labels) == {0, 1, 2}:
+             print("   ✓ Multi-class: 0=Neutral, 1=Offensive, 2=Hate Speech")
+         elif set(unique_labels) == {0, 1}:
+             print("   ⚠️ Warning: Only binary labels found, expected 3 classes")
+         else:
+             print(f"   ⚠️ Warning: Unexpected labels: {unique_labels}")
+
+         print(f"   ✓ After preprocessing: {len(df):,} samples")
+
+         return df
+
+     except FileNotFoundError:
+         print(f"   ❌ Error: File not found at {BENGALI_DATASET_PATH}")
+         return pd.DataFrame(columns=['text', 'label'])
+     except Exception as e:
+         print(f"   ❌ Error loading Bengali dataset: {e}")
+         return pd.DataFrame(columns=['text', 'label'])
+
+ def analyze_distribution(df: pd.DataFrame, name: str):
+     """Print dataset statistics"""
+     if len(df) == 0:
+         print(f"\n{'='*50}")
+         print(f"❌ {name} Dataset: EMPTY")
+         print('='*50)
+         return
+
+     print(f"\n{'='*50}")
+     print(f"📊 {name} Dataset Distribution")
+     print('='*50)
+
+     unique_labels = sorted(df['label'].unique())
+     print(f"Unique labels: {unique_labels}")
+     print(f"Total samples: {len(df):,}\n")
+
+     # Dynamic label names
+     if set(unique_labels) == {0, 1}:
+         label_names = {0: 'Non-Hate/Neutral', 1: 'Hate/Offensive'}
+     elif set(unique_labels) == {0, 1, 2}:
+         label_names = {0: 'Neutral', 1: 'Offensive', 2: 'Hate Speech'}
+     else:
+         label_names = {label: f'Class {label}' for label in unique_labels}
+
+     # Show distribution
+     for label in unique_labels:
+         count = len(df[df['label'] == label])
+         percentage = count / len(df) * 100
+         label_name = label_names.get(label, f'Unknown({label})')
+         print(f"   {label} - {label_name:20s}: {count:6,} ({percentage:5.1f}%)")
+
+ def train_single_model(X_train, X_test, y_train, y_test, model_type: str, language: str) -> Dict:
+     """Train a single model and return results"""
+     print(f"\n   🔧 Training {model_type.upper()}...")
+
+     # Choose model
+     if model_type == 'logistic':
+         model = LogisticRegression(
+             max_iter=1000,
+             random_state=RANDOM_STATE,
+             class_weight='balanced',
+             n_jobs=-1
+         )
+     elif model_type == 'svm':
+         model = LinearSVC(
+             random_state=RANDOM_STATE,
+             class_weight='balanced',
+             max_iter=2000
+         )
+     elif model_type == 'random_forest':
+         model = RandomForestClassifier(
+             n_estimators=100,
+             random_state=RANDOM_STATE,
+             class_weight='balanced',
+             n_jobs=-1
+         )
+     else:
+         raise ValueError(f"Unknown model type: {model_type}")
+
+     # Train
+     start_time = time.time()
+
+     model.fit(X_train, y_train)
+     y_pred = model.predict(X_test)
+
+     training_time = time.time() - start_time
+
+     # Evaluate
+     accuracy = accuracy_score(y_test, y_pred)
+     f1 = f1_score(y_test, y_pred, average='weighted')
+
+     print(f"      ✓ Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
+     print(f"      ✓ F1-Score: {f1:.4f}")
+     print(f"      ✓ Time: {training_time:.2f}s")
+
+     return {
+         'model': model,
+         'accuracy': accuracy,
+         'f1_score': f1,
+         'training_time': training_time,
+         'predictions': y_pred
+     }
+
+ def train_and_compare_models(X_train, X_test, y_train, y_test, language: str) -> Tuple:
+     """Train multiple models and return the best one"""
+     print(f"\n🤖 Training Multiple Models for {language.upper()}...")
+     print("=" * 60)
+
+     models_to_train = ['logistic', 'svm']
+     results = {}
+
+     # Train all models
+     for model_type in models_to_train:
+         try:
+             result = train_single_model(X_train, X_test, y_train, y_test, model_type, language)
+             results[model_type] = result
+         except Exception as e:
+             print(f"   ❌ Error training {model_type}: {e}")
+             continue
+
+     if not results:
+         print("❌ No models trained successfully!")
+         return None, None, {}
+
+     # Compare models
+     print(f"\n{'='*60}")
+     print(f"📊 Model Comparison for {language.upper()}")
+     print('='*60)
+     print(f"{'Model':<20} {'Accuracy':<12} {'F1-Score':<12} {'Time (s)':<10}")
+     print('-'*60)
+
+     best_model_name = None
+     best_score = 0
+
+     for model_name, result in results.items():
+         accuracy = result['accuracy']
+         f1 = result['f1_score']
+         time_taken = result['training_time']
+
+         # Use F1-score as primary metric (better for imbalanced datasets)
+         score = f1
+
+         print(f"{model_name:<20} {accuracy:<12.4f} {f1:<12.4f} {time_taken:<10.2f}")
+
+         if score > best_score:
+             best_score = score
+             best_model_name = model_name
+
+     print('='*60)
+     print(f"🏆 Best Model: {best_model_name.upper()} (F1-Score: {best_score:.4f})")
+     print('='*60)
+
+     # Get best model
+     best_result = results[best_model_name]
+     best_model = best_result['model']
+
+     # Detailed report for best model
+     print(f"\n📈 Detailed Report for {best_model_name.upper()}:")
+
+     unique_labels = sorted(np.unique(y_test))
+
+     if set(unique_labels) == {0, 1}:
+         target_names = ['Non-Hate', 'Hate']
+     elif set(unique_labels) == {0, 1, 2}:
+         target_names = ['Neutral', 'Offensive', 'Hate Speech']
+     else:
+         target_names = [f'Class {i}' for i in unique_labels]
+
+     print(classification_report(y_test, best_result['predictions'],
+                                 target_names=target_names,
+                                 zero_division=0))
+
+     print("🔢 Confusion Matrix:")
+     print(confusion_matrix(y_test, best_result['predictions']))
+
+     # Return comparison data
+     comparison = {
+         model_name: {
+             'accuracy': result['accuracy'],
+             'f1_score': result['f1_score'],
+             'training_time': result['training_time']
+         }
+         for model_name, result in results.items()
+     }
+
+     return best_model, best_model_name, comparison
+
+ def train_language_specific_model(df: pd.DataFrame, language: str):
+     """Train model for specific language with comparison"""
+     print(f"\n{'='*60}")
+     print(f"🎓 Training {language.upper()} Model")
+     print('='*60)
+
+     if len(df) == 0:
+         print(f"❌ No data for {language}!")
+         return None, None, None, None, {}
+
+     # Analyze distribution
+     analyze_distribution(df, language.capitalize())
+
+     # Split data
+     print(f"\n✂️ Splitting data (80/20 train/test)...")
+     X = df['text']
+     y = df['label'].astype(int)
+
+     X_train, X_test, y_train, y_test = train_test_split(
+         X, y,
+         test_size=0.2,
+         random_state=RANDOM_STATE,
+         stratify=y
+     )
+
+     print(f"   ✓ Train size: {len(X_train):,}")
+     print(f"   ✓ Test size: {len(X_test):,}")
+
+     # Create TF-IDF vectorizer
+     print(f"\n🔤 Creating TF-IDF vectorizer...")
+     vectorizer = TfidfVectorizer(
+         max_features=5000,
+         ngram_range=(1, 2),
+         min_df=2,
+         max_df=0.8,
+         strip_accents='unicode',
+         analyzer='word',
+         token_pattern=r'\w{1,}',
+         sublinear_tf=True
+     )
+
+     print("   ⏳ Vectorizing text...")
+     X_train_vec = vectorizer.fit_transform(X_train)
+     X_test_vec = vectorizer.transform(X_test)
+
+     print(f"   ✓ Feature dimension: {X_train_vec.shape[1]:,}")
+
+     # Train and compare models
+     best_model, best_model_name, comparison = train_and_compare_models(
+         X_train_vec, X_test_vec, y_train, y_test, language
+     )
+
+     if best_model is None:
+         return None, None, None, None, {}
+
+     # Get final accuracy
+     y_pred = best_model.predict(X_test_vec)
+     final_accuracy = accuracy_score(y_test, y_pred)
+     final_f1 = f1_score(y_test, y_pred, average='weighted')
+
+     return best_model, vectorizer, best_model_name, final_f1, comparison
+
+ def main():
+     """Main training pipeline"""
+     print("\n" + "=" * 70)
+     print("🛡️ HateShield-BN Model Training (Language-Specific with Comparison)")
+     print("=" * 70 + "\n")
+
+     # Load datasets separately
+     df_english = load_english_dataset()
+     df_bengali = load_bengali_dataset()
+
+     if len(df_english) == 0 and len(df_bengali) == 0:
+         print("\n❌ Error: No data found!")
+         return
+
+     os.makedirs(MODEL_OUTPUT_PATH, exist_ok=True)
+
+     results = {}
+
+     # Train English model
+     if len(df_english) > 0:
+         print("\n" + "🇬🇧 " * 35)
+         english_model, english_vectorizer, english_best_name, english_f1, english_comparison = train_language_specific_model(
+             df_english, 'english'
+         )
+
+         if english_model is not None:
+             # Save English model
+             print(f"\n💾 Saving English model ({english_best_name})...")
+             english_model_path = os.path.join(MODEL_OUTPUT_PATH, "english_model.pkl")
+             english_vec_path = os.path.join(MODEL_OUTPUT_PATH, "english_vectorizer.pkl")
+
+             joblib.dump(english_model, english_model_path)
+             joblib.dump(english_vectorizer, english_vec_path)
+
+             print(f"   ✓ Model saved to: {english_model_path}")
+             print(f"   ✓ Vectorizer saved to: {english_vec_path}")
+
+             results['english'] = {
+                 'best_model': english_best_name,
+                 'f1_score': english_f1,
+                 'num_classes': len(df_english['label'].unique()),
+                 'samples': len(df_english),
+                 'comparison': english_comparison
+             }
+
+     # Train Bengali model
+     if len(df_bengali) > 0:
+         print("\n" + "🇧🇩 " * 35)
+         bengali_model, bengali_vectorizer, bengali_best_name, bengali_f1, bengali_comparison = train_language_specific_model(
+             df_bengali, 'bengali'
+         )
+
+         if bengali_model is not None:
+             # Save Bengali model
+             print(f"\n💾 Saving Bengali model ({bengali_best_name})...")
+             bengali_model_path = os.path.join(MODEL_OUTPUT_PATH, "bengali_model.pkl")
+             bengali_vec_path = os.path.join(MODEL_OUTPUT_PATH, "bengali_vectorizer.pkl")
+
+             joblib.dump(bengali_model, bengali_model_path)
+             joblib.dump(bengali_vectorizer, bengali_vec_path)
+
+             print(f"   ✓ Model saved to: {bengali_model_path}")
+             print(f"   ✓ Vectorizer saved to: {bengali_vec_path}")
+
+             results['bengali'] = {
+                 'best_model': bengali_best_name,
+                 'f1_score': bengali_f1,
+                 'num_classes': len(df_bengali['label'].unique()),
+                 'samples': len(df_bengali),
+                 'comparison': bengali_comparison
+             }
+
+     # Save metadata
+     print(f"\n💾 Saving metadata...")
+     metadata = {
+         'training_date': time.strftime('%Y-%m-%d %H:%M:%S'),
+         'models': results,
+         'separate_models': True,
+         'algorithms_tested': ['logistic', 'svm', 'random_forest']
+     }
+
+     with open(os.path.join(MODEL_OUTPUT_PATH, "metadata.json"), 'w') as f:
+         json.dump(metadata, f, indent=2)
+
+     # Final Summary
+     print("\n" + "=" * 70)
+     print("✅ Training Complete!")
+     print("=" * 70)
+
+     if 'english' in results:
+         print(f"\n🇬🇧 English Model:")
+         print(f"   Best Algorithm: {results['english']['best_model'].upper()}")
+         print(f"   F1-Score: {results['english']['f1_score']:.4f}")
+         print(f"   Classes: {results['english']['num_classes']}")
+         print(f"   Samples: {results['english']['samples']:,}")
+         print(f"\n   Model Comparison:")
+         for model_name, scores in results['english']['comparison'].items():
+             print(f"      {model_name:<15}: Acc={scores['accuracy']:.4f}, F1={scores['f1_score']:.4f}")
+
+     if 'bengali' in results:
+         print(f"\n🇧🇩 Bengali Model:")
+         print(f"   Best Algorithm: {results['bengali']['best_model'].upper()}")
+         print(f"   F1-Score: {results['bengali']['f1_score']:.4f}")
+         print(f"   Classes: {results['bengali']['num_classes']}")
+         print(f"   Samples: {results['bengali']['samples']:,}")
+         print(f"\n   Model Comparison:")
+         for model_name, scores in results['bengali']['comparison'].items():
+             print(f"      {model_name:<15}: Acc={scores['accuracy']:.4f}, F1={scores['f1_score']:.4f}")
+
+     print("\n" + "=" * 70 + "\n")
+
+ if __name__ == "__main__":
+     main()
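To dry-run this pipeline before the real corpora are in place, tiny fixture CSVs matching the expected column names can be generated first. The rows below are entirely hypothetical and only large enough to survive the stratified 80/20 split; real datasets replace them.

```python
# Hypothetical fixtures: the English loader expects text/label columns,
# the Bengali loader expects sentence plus a hate flag.
import os
import pandas as pd

os.makedirs("data", exist_ok=True)
pd.DataFrame({
    "text": ["have a nice day"] * 10 + ["you are worthless scum"] * 10,
    "label": [0] * 10 + [1] * 10,
}).to_csv("data/english_hate_speech.csv", index=False)
pd.DataFrame({
    "sentence": ["তুমি ভালো মানুষ"] * 10 + ["তুমি কুত্তা"] * 10,
    "hate": [0] * 10 + [1] * 10,
}).to_csv("data/bengali_hate_speech.csv", index=False)
```

After that, `python models/train_model.py` run from the repository root writes the `.pkl` files and `metadata.json` into `models/model_weights/custom_models/`.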
requirements.txt ADDED
@@ -0,0 +1,22 @@
+ fastapi==0.104.1
+ uvicorn[standard]==0.24.0
+ python-multipart==0.0.6
+ pydantic==2.5.0
+
+ # Web scraping
+ requests==2.31.0
+ beautifulsoup4==4.12.2
+
+ # Document processing
+ PyPDF2==3.0.1
+ python-docx==1.1.0
+
+ # ML (optimized versions)
+ numpy<2.0.0
+ pandas<3.0.0
+ scikit-learn>=1.3.0,<2.0.0
+ transformers>=4.35.0,<5.0.0
+ torch>=2.0.0,<3.0.0
+ langdetect==1.0.9
+ deep-translator==1.11.4
+ joblib>=1.5.0
services/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .analyzer import analyze_content
+
+ __all__ = ['analyze_content']
services/analyzer.py ADDED
@@ -0,0 +1,161 @@
+ from typing import Dict, List
+ import re
+ from models.hate_speech_classifier import HateSpeechClassifier
+ from models.language_detector import detect_language
+
+ # Initialize classifier globally
+ classifier = HateSpeechClassifier()
+
+ def highlight_keywords(text: str, keywords: List[str]) -> List[str]:
+     """Extract phrases containing keywords"""
+     highlighted = []
+     text_lower = text.lower()
+
+     for keyword in keywords:
+         if keyword.lower() in text_lower:
+             sentences = re.split(r'[।.!?]+', text)
+             for sentence in sentences:
+                 if keyword.lower() in sentence.lower():
+                     highlighted.append(sentence.strip())
+                     break
+
+     return highlighted[:5]
+
+ async def analyze_content(text: str) -> Dict:
+     """
+     Main analysis function that combines all models
+     """
+     # Detect language
+     language = detect_language(text)
+
+     # Get results from all three methods
+     custom_result = await classifier.classify_with_custom_model(text, language)
+
+     # ✅ Pass language to pretrained model for translation support
+     pretrained_result = await classifier.classify_with_pretrained_model(text, language)
+
+     keyword_result = classifier.classify_with_keywords(text, language)
+
+     # Enhanced ensemble decision with adaptive weights
+     results = []
+
+     has_patterns = keyword_result.get("pattern_matches", 0) > 0
+     has_hate_keywords = keyword_result.get("hate_count", 0) > 0
+
+     if has_patterns or has_hate_keywords:
+         custom_weight = 0.5
+         pretrained_weight = 0.2
+         keyword_weight = 0.3
+     else:
+         custom_weight = 0.4
+         pretrained_weight = 0.4
+         keyword_weight = 0.2
+
+     if custom_result:
+         results.append({
+             "category": custom_result["category"],
+             "confidence": custom_result["confidence"],
+             "weight": custom_weight
+         })
+
+     if pretrained_result:
+         results.append({
+             "category": pretrained_result["category"],
+             "confidence": pretrained_result["confidence"],
+             "weight": pretrained_weight
+         })
+
+     if keyword_result:
+         results.append({
+             "category": keyword_result["category"],
+             "confidence": keyword_result["confidence"],
+             "weight": keyword_weight
+         })
+
+     # Weighted voting
+     category_scores = {}
+     for result in results:
+         cat = result["category"]
+         score = result["confidence"] * result["weight"]
+         category_scores[cat] = category_scores.get(cat, 0) + score
+
+     if category_scores:
+         sorted_categories = sorted(category_scores.items(), key=lambda x: x[1], reverse=True)
+         final_category = sorted_categories[0][0]
+         final_confidence = category_scores[final_category] / sum(r["weight"] for r in results)
+
+         if len(sorted_categories) > 1:
+             top_cat, top_score = sorted_categories[0]
+             second_cat, second_score = sorted_categories[1]
+
+             if (second_cat == "hate_speech" and
+                     top_cat != "hate_speech" and
+                     (top_score - second_score) < 0.15 and
+                     has_patterns):
+                 final_category = "hate_speech"
+                 final_confidence = second_score / sum(r["weight"] for r in results)
+     else:
+         final_category = "neutral"
+         final_confidence = 0.5
+
+     # Generate reasoning
+     reasons = []
+     if has_patterns:
+         reasons.append("Detected hate speech patterns in text structure")
+     if custom_result and custom_result["category"] == "hate_speech":
+         reasons.append(f"Custom model detected {custom_result['category']} with {custom_result['confidence']:.2%} confidence")
+     if pretrained_result:
+         if pretrained_result.get("translated"):
+             reasons.append(f"Pretrained model analyzed translated text and identified {pretrained_result['category']}")
+         elif pretrained_result["category"] != "neutral":
+             reasons.append(f"Pretrained model identified {pretrained_result['category']} patterns")
+     if keyword_result and keyword_result.get("detected_keywords"):
+         reasons.append(f"Found {len(keyword_result['detected_keywords'])} hate/offensive keywords")
+
+     if not reasons:
+         reasons = ["Classification based on content analysis"]
+
+     all_keywords = keyword_result.get("detected_keywords", [])
+     highlighted_phrases = highlight_keywords(text, all_keywords) if all_keywords else []
+
+     return {
+         "ensemble": {
+             "category": final_category,
+             "confidence": float(final_confidence),
+             "reasons": reasons,
+             "weights_used": {
+                 "custom_model": custom_weight,
+                 "pretrained_model": pretrained_weight,
+                 "keyword_analysis": keyword_weight
+             }
+         },
+         "custom_model": {
+             "available": custom_result is not None,
+             "category": custom_result["category"] if custom_result else None,
+             "confidence": custom_result["confidence"] if custom_result else None,
+             "method": custom_result.get("method") if custom_result else None,
+             "raw_prediction": custom_result.get("raw_prediction") if custom_result else None
+         },
+         "pretrained_model": {
+             "available": pretrained_result is not None,
+             "category": pretrained_result["category"] if pretrained_result else None,
+             "confidence": pretrained_result["confidence"] if pretrained_result else None,
+             "method": pretrained_result.get("method") if pretrained_result else None,
+             "raw_labels": pretrained_result.get("raw_labels") if pretrained_result else None,
+             "translated": pretrained_result.get("translated", False) if pretrained_result else False,
+             "translated_text": pretrained_result.get("translated_text") if pretrained_result else None
+         },
+         "keyword_analysis": {
+             "available": True,
+             "category": keyword_result["category"],
+             "confidence": keyword_result["confidence"],
+             "method": keyword_result["method"],
+             "detected_keywords": keyword_result.get("detected_keywords", []),
+             "hate_count": keyword_result.get("hate_count", 0),
+             "offensive_count": keyword_result.get("offensive_count", 0),
+             "pattern_matches": keyword_result.get("pattern_matches", 0)
+         },
+         "highlighted_phrases": highlighted_phrases,
+         "detected_language": language,
+         "original_text": text[:200] + "..." if len(text) > 200 else text
+     }
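`analyze_content` is a coroutine, so calling it outside FastAPI needs an event loop. A minimal sketch (note the module-level `classifier` is built at import time, so any model loading happens on import):

```python
# Direct use of the ensemble outside the API.
import asyncio

from services.analyzer import analyze_content

verdict = asyncio.run(analyze_content("they deserve to die"))
print(verdict["detected_language"])
print(verdict["ensemble"]["category"], f"{verdict['ensemble']['confidence']:.2%}")
print(verdict["ensemble"]["reasons"])
```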
services/text_extractor.py ADDED
@@ -0,0 +1,77 @@
+ import requests
+ from bs4 import BeautifulSoup
+ from typing import Optional
+ import PyPDF2
+ from docx import Document
+ import io
+
+ def extract_from_url(url: str) -> str:
+     """Extract text content from URL (synchronous)"""
+     try:
+         headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+         }
+         response = requests.get(url, headers=headers, timeout=10)
+         response.raise_for_status()
+
+         soup = BeautifulSoup(response.content, 'html.parser')
+
+         # Remove script and style elements
+         for script in soup(["script", "style", "nav", "footer", "header"]):
+             script.decompose()
+
+         # Get text
+         text = soup.get_text(separator=' ', strip=True)
+
+         # Clean up whitespace
+         lines = (line.strip() for line in text.splitlines())
+         chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+         text = ' '.join(chunk for chunk in chunks if chunk)
+
+         return text
+     except Exception as e:
+         print(f"Error extracting from URL: {e}")
+         raise Exception(f"Failed to extract text from URL: {str(e)}")
+
+ def extract_from_document(content: bytes, file_extension: str) -> str:
+     """Extract text from document (synchronous)"""
+     try:
+         if file_extension == ".pdf":
+             return _extract_from_pdf(content)
+         elif file_extension == ".docx":
+             return _extract_from_docx(content)
+         elif file_extension == ".txt":
+             return content.decode('utf-8')
+         else:
+             raise ValueError(f"Unsupported file type: {file_extension}")
+     except Exception as e:
+         print(f"Error extracting from document: {e}")
+         raise Exception(f"Failed to extract text from document: {str(e)}")
+
+ def _extract_from_pdf(content: bytes) -> str:
+     """Extract text from PDF"""
+     try:
+         pdf_file = io.BytesIO(content)
+         pdf_reader = PyPDF2.PdfReader(pdf_file)
+
+         text = ""
+         for page in pdf_reader.pages:
+             text += page.extract_text() + "\n"
+
+         return text.strip()
+     except Exception as e:
+         raise Exception(f"Error reading PDF: {str(e)}")
+
+ def _extract_from_docx(content: bytes) -> str:
+     """Extract text from DOCX"""
+     try:
+         doc_file = io.BytesIO(content)
+         doc = Document(doc_file)
+
+         text = ""
+         for paragraph in doc.paragraphs:
+             text += paragraph.text + "\n"
+
+         return text.strip()
+     except Exception as e:
+         raise Exception(f"Error reading DOCX: {str(e)}")
utils/__init__.py ADDED
File without changes
utils/helpers.py ADDED
File without changes