Initial deployment of HateShield backend
- .dockerignore +7 -0
- .gitignore +18 -0
- Dockerfile +30 -0
- README.md +25 -5
- __init__.py +0 -0
- api/__init__.py +0 -0
- api/routes.py +0 -0
- app.py +9 -0
- main.py +119 -0
- models/__init__.py +3 -0
- models/hate_speech_classifier.py +416 -0
- models/language_detector.py +85 -0
- models/model_weights/custom_models/bengali_model.pkl +3 -0
- models/model_weights/custom_models/bengali_vectorizer.pkl +3 -0
- models/model_weights/custom_models/english_model.pkl +3 -0
- models/model_weights/custom_models/english_vectorizer.pkl +3 -0
- models/model_weights/custom_models/metadata.json +47 -0
- models/model_weights/custom_models/model.pkl +3 -0
- models/model_weights/custom_models/vectorizer.pkl +3 -0
- models/train_model.py +482 -0
- requirements.txt +22 -0
- services/__init__.py +3 -0
- services/analyzer.py +161 -0
- services/text_extractor.py +77 -0
- utils/__init__.py +0 -0
- utils/helpers.py +0 -0
.dockerignore
ADDED
@@ -0,0 +1,7 @@
+__pycache__/
+*.pyc
+*.pyo
+venv/
+data/*.csv
+*.log
+.env
.gitignore
ADDED
@@ -0,0 +1,18 @@
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+venv/
+env/
+.env
+.venv
+*.log
+.DS_Store
+*.csv
+.pytest_cache/
+.coverage
+htmlcov/
+dist/
+build/
+*.egg-info/
Dockerfile
ADDED
@@ -0,0 +1,30 @@
+FROM python:3.10-slim
+
+WORKDIR /app
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements first for better caching
+COPY requirements.txt .
+
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy application files
+COPY . .
+
+# Create cache directories
+RUN mkdir -p /tmp/transformers_cache /tmp/huggingface
+
+# Set environment variables
+ENV TRANSFORMERS_CACHE=/tmp/transformers_cache
+ENV HF_HOME=/tmp/huggingface
+
+# Expose port 7860 (Hugging Face Spaces default)
+EXPOSE 7860
+
+# Run the application
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
CHANGED
@@ -1,10 +1,30 @@
 ---
-title:
+title: HateShield Backend
-emoji:
+emoji: 🛡️
-colorFrom:
+colorFrom: blue
-colorTo:
+colorTo: purple
 sdk: docker
 pinned: false
 ---
 
-
+# HateShield Backend API
+
+Bilingual hate speech detection system using ensemble ML models.
+
+## Features
+- English & Bengali hate speech detection
+- Document analysis (PDF, DOCX, TXT)
+- URL content scraping
+- Real-time confidence scoring
+
+## API Endpoints
+- `POST /api/analyze/text` - Analyze text input
+- `POST /api/analyze/url` - Analyze URL content
+- `POST /api/analyze/document` - Analyze uploaded documents
+- `GET /health` - Health check
+
+## Tech Stack
+- FastAPI
+- Transformers (Hugging Face)
+- scikit-learn
+- PyTorch
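Once the Space is up, a quick way to sanity-check the deployment is to hit the text endpoint. A minimal sketch, assuming the container is running locally on the port the Dockerfile exposes (7860); the response shape is whatever analyze_content in services/analyzer.py returns, which this commit view does not show in full:

import requests

BASE_URL = "http://localhost:7860"  # assumed local deployment; swap in the Space URL

resp = requests.post(
    f"{BASE_URL}/api/analyze/text",
    json={"text": "This is a perfectly friendly sentence."},
)
resp.raise_for_status()
print(resp.json())  # result dict produced by services.analyzer.analyze_content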
__init__.py
ADDED
File without changes

api/__init__.py
ADDED
File without changes

api/routes.py
ADDED
File without changes
app.py
ADDED
@@ -0,0 +1,9 @@
+import os
+os.environ['TRANSFORMERS_CACHE'] = '/tmp/transformers_cache'
+os.environ['HF_HOME'] = '/tmp/huggingface'
+
+from main import app
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)  # HF Spaces uses port 7860
main.py
ADDED
@@ -0,0 +1,119 @@
+from fastapi import FastAPI, HTTPException, UploadFile, File
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, HttpUrl
+from typing import Optional
+import uvicorn
+
+from services.analyzer import analyze_content
+from services.text_extractor import extract_from_url, extract_from_document
+
+app = FastAPI(
+    title="HateShield-BN API",
+    description="Bilingual Hate Speech Detection System",
+    version="1.0.0"
+)
+
+# CORS
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["http://localhost:5173", "http://localhost:3000"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Request models
+class TextRequest(BaseModel):
+    text: str
+
+class URLRequest(BaseModel):
+    url: HttpUrl
+
+# Routes
+@app.get("/")
+async def root():
+    return {
+        "message": "HateShield-BN API is running!",
+        "version": "1.0.0",
+        "endpoints": {
+            "text": "/api/analyze/text",
+            "url": "/api/analyze/url",
+            "document": "/api/analyze/document"
+        }
+    }
+
+@app.post("/api/analyze/text")
+async def analyze_text(request: TextRequest):
+    """Analyze text for hate speech"""
+    try:
+        if not request.text or len(request.text.strip()) == 0:
+            raise HTTPException(status_code=400, detail="Text cannot be empty")
+
+        result = await analyze_content(request.text)
+        return result
+    except HTTPException:
+        # Re-raise so the 400 above is not swallowed into a 500
+        raise
+    except Exception as e:
+        print(f"Error analyzing text: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/api/analyze/url")
+async def analyze_url(request: URLRequest):
+    """Analyze content from URL"""
+    try:
+        # Note: extract_from_url is now synchronous
+        text = extract_from_url(str(request.url))
+
+        if not text:
+            raise HTTPException(status_code=400, detail="Could not extract text from URL")
+
+        result = await analyze_content(text)
+        return result
+    except HTTPException:
+        raise
+    except Exception as e:
+        print(f"Error analyzing URL: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/api/analyze/document")
+async def analyze_document(file: UploadFile = File(...)):
+    """Analyze uploaded document"""
+    try:
+        # Check file type
+        allowed_types = [".pdf", ".docx", ".txt"]
+        file_ext = f".{file.filename.split('.')[-1].lower()}"
+
+        if file_ext not in allowed_types:
+            raise HTTPException(
+                status_code=400,
+                detail=f"File type {file_ext} not supported. Allowed: {', '.join(allowed_types)}"
+            )
+
+        # Read file content
+        content = await file.read()
+
+        # Note: extract_from_document is now synchronous
+        text = extract_from_document(content, file_ext)
+
+        if not text:
+            raise HTTPException(status_code=400, detail="Could not extract text from document")
+
+        result = await analyze_content(text)
+        return result
+    except HTTPException:
+        raise
+    except Exception as e:
+        print(f"Error analyzing document: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/health")
+async def health_check():
+    return {"status": "healthy"}
+
+if __name__ == "__main__":
+    uvicorn.run(
+        "main:app",
+        host="0.0.0.0",
+        port=8000,
+        reload=True
+    )
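For the document route, the endpoint derives the extension from the uploaded filename, so the filename in the multipart tuple is what gets validated against the allowed types. A hedged client sketch with the requests library; sample.txt is a placeholder file:

import requests

BASE_URL = "http://localhost:7860"  # assumed local deployment

# The filename in the multipart tuple drives the extension check in
# analyze_document, so name it with one of the allowed extensions.
with open("sample.txt", "rb") as fh:  # sample.txt is a placeholder
    resp = requests.post(
        f"{BASE_URL}/api/analyze/document",
        files={"file": ("sample.txt", fh, "text/plain")},
    )
print(resp.status_code, resp.json())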
models/__init__.py
ADDED
@@ -0,0 +1,3 @@
+from .hate_speech_classifier import HateSpeechClassifier
+
+__all__ = ['HateSpeechClassifier']
models/hate_speech_classifier.py
ADDED
@@ -0,0 +1,416 @@
+from typing import Dict, Optional
+from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
+import joblib
+import os
+import re
+import torch
+from deep_translator import GoogleTranslator
+
+class HateSpeechClassifier:
+    def __init__(self):
+        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        models_dir = os.path.join(base_dir, "models", "model_weights", "custom_models")
+
+        # Initialize translator
+        self.translator = GoogleTranslator(source='bn', target='en')
+
+        # Use multiple pretrained models for better accuracy
+        self.pretrained_models = {
+            "primary": {
+                "name": "facebook/roberta-hate-speech-dynabench-r4-target",
+                "pipeline": None,
+                "weight": 0.6
+            },
+            "secondary": {
+                "name": "cardiffnlp/twitter-roberta-base-hate-latest",
+                "pipeline": None,
+                "weight": 0.4
+            }
+        }
+
+        # English custom model paths
+        self.english_model_path = os.path.join(models_dir, "english_model.pkl")
+        self.english_vectorizer_path = os.path.join(models_dir, "english_vectorizer.pkl")
+        self.english_model = None
+        self.english_vectorizer = None
+        self.english_model_loaded = False
+
+        # Bengali custom model paths
+        self.bengali_model_path = os.path.join(models_dir, "bengali_model.pkl")
+        self.bengali_vectorizer_path = os.path.join(models_dir, "bengali_vectorizer.pkl")
+        self.bengali_model = None
+        self.bengali_vectorizer = None
+        self.bengali_model_loaded = False
+
+        # Load models
+        self._load_custom_models()
+
+        # Enhanced hate keywords
+        self.hate_keywords = {
+            "english": [
+                "hate", "kill", "death", "violence", "murder", "attack", "destroy", "eliminate",
+                "die", "dead", "shoot", "stab", "burn", "hang", "lynch",
+                "terrorist", "racist", "sexist", "discrimination", "discriminate",
+                "scheduled caste", "scheduled tribe", "dalit", "lower caste", "untouchable",
+                "chamar", "bhangi", "sc/st", "reservation quota",
+                "no right to live", "don't deserve", "shouldn't exist", "subhuman",
+                "inferior", "worthless", "scum", "vermin", "parasite",
+                "should be killed", "must die", "deserve to die", "need to be eliminated",
+                "jihadi", "kafir", "infidel", "terrorist religion", "religious extremist",
+                "nigger", "chink", "paki", "kike", "faggot", "tranny"
+            ],
+            "bengali": [
+                "শালা", "হালা", "মাগি", "কুত্তা", "হারামি", "চোদ", "বাল",
+                "ঘৃণা", "মারো", "মৃত্যু", "সন্ত্রাসী", "বোকা", "মূর্খ",
+                "বিদ্বেষ", "ভয়ঙ্কর", "জঘন্য", "হত্যা", "আক্রমণ",
+                "দলিত", "নিম্নবর্ণ", "অস্পৃশ্য"
+            ]
+        }
+
+        self.hate_patterns = {
+            "english": [
+                r"no right to (live|exist|be here|survive)",
+                r"(should|must|need to|ought to) (die|be killed|be eliminated|perish)",
+                r"don'?t deserve (to live|life|existence|to exist)",
+                r"(get rid of|eliminate|exterminate|wipe out) (them|these|those|the)",
+                r"(scheduled caste|dalit|lower caste|sc/st).{0,50}(no right|shouldn't|don't deserve)",
+                r"(religious|ethnic|caste|racial) (cleansing|purification|genocide)",
+                r"(send|throw|kick|drive) (them|back) (out|away|home)",
+                r"(all|these) .{0,30} (should die|must be killed|need to go)",
+                r"(death to|kill all|eliminate all) .{0,30}",
+                r"(inferior|subhuman|less than human|not human)",
+            ],
+            "bengali": [
+                r"বাঁচার অধিকার নেই",
+                r"মরে যাওয়া উচিত",
+                r"নিশ্চিহ্ন করা উচিত"
+            ]
+        }
+
+        self.offensive_keywords = {
+            "english": [
+                "damn", "hell", "crap", "suck", "dumb", "loser", "trash",
+                "stupid", "idiot", "moron", "pathetic", "bad", "ugly",
+                "disgusting", "nasty", "filthy", "asshole", "bitch", "bastard"
+            ],
+            "bengali": ["বাজে", "খারাপ", "নোংরা", "বেকুব"]
+        }
+
+    def _translate_to_english(self, text: str) -> Optional[str]:
+        """Translate Bengali to English using deep-translator"""
+        try:
+            print(f"🔄 Translating Bengali text to English...")
+
+            # deep-translator has a 5000 character limit per request
+            max_chars = 4500
+            if len(text) > max_chars:
+                text_to_translate = text[:max_chars]
+                print(f"⚠️ Text truncated to {max_chars} characters for translation")
+            else:
+                text_to_translate = text
+
+            # Translate using Google Translate
+            translated_text = self.translator.translate(text_to_translate)
+
+            print(f"✓ Translation successful")
+            print(f"   Original (Bengali): {text_to_translate[:100]}...")
+            print(f"   Translated (English): {translated_text[:100]}...")
+
+            return translated_text
+        except Exception as e:
+            print(f"❌ Translation failed: {e}")
+            # Try splitting into smaller chunks if it fails
+            try:
+                print("🔄 Retrying with smaller chunks...")
+                words = text.split()
+                chunks = []
+                current_chunk = []
+                current_length = 0
+
+                for word in words:
+                    if current_length + len(word) > 1000:  # Smaller chunks
+                        if current_chunk:
+                            chunks.append(' '.join(current_chunk))
+                        current_chunk = [word]
+                        current_length = len(word)
+                    else:
+                        current_chunk.append(word)
+                        current_length += len(word) + 1
+
+                if current_chunk:
+                    chunks.append(' '.join(current_chunk))
+
+                translated_chunks = []
+                for chunk in chunks[:5]:  # Translate max 5 chunks
+                    translated_chunk = self.translator.translate(chunk)
+                    translated_chunks.append(translated_chunk)
+
+                translated_text = ' '.join(translated_chunks)
+                print(f"✓ Translation successful with chunking")
+                return translated_text
+            except Exception as e2:
+                print(f"❌ Translation with chunking also failed: {e2}")
+                return None
+
+    def _load_custom_models(self):
+        """Load language-specific custom models"""
+        try:
+            if os.path.exists(self.english_model_path) and os.path.exists(self.english_vectorizer_path):
+                print("Loading English custom model...")
+                self.english_model = joblib.load(self.english_model_path)
+                self.english_vectorizer = joblib.load(self.english_vectorizer_path)
+                self.english_model_loaded = True
+                print("✓ English custom model loaded")
+            else:
+                print("❌ English custom model not found")
+                self.english_model_loaded = False
+        except Exception as e:
+            print(f"❌ Error loading English model: {e}")
+            self.english_model_loaded = False
+
+        try:
+            if os.path.exists(self.bengali_model_path) and os.path.exists(self.bengali_vectorizer_path):
+                print("Loading Bengali custom model...")
+                self.bengali_model = joblib.load(self.bengali_model_path)
+                self.bengali_vectorizer = joblib.load(self.bengali_vectorizer_path)
+                self.bengali_model_loaded = True
+                print("✓ Bengali custom model loaded")
+            else:
+                print("❌ Bengali custom model not found")
+                self.bengali_model_loaded = False
+        except Exception as e:
+            print(f"❌ Error loading Bengali model: {e}")
+            self.bengali_model_loaded = False
+
+    def _load_pretrained_model(self, model_key: str):
+        """Lazy load pretrained model"""
+        model_info = self.pretrained_models.get(model_key)
+        if not model_info:
+            return
+
+        if model_info["pipeline"] is None:
+            try:
+                print(f"Loading {model_key} pretrained model: {model_info['name']}...")
+                model_info["pipeline"] = pipeline(
+                    "text-classification",
+                    model=model_info["name"],
+                    device=-1,
+                    top_k=None,
+                    truncation=True,
+                    max_length=512
+                )
+                print(f"✓ {model_key} pretrained model loaded")
+            except Exception as e:
+                print(f"❌ Error loading {model_key} pretrained model: {e}")
+                model_info["pipeline"] = None
+
+    async def classify_with_custom_model(self, text: str, language: str) -> Dict:
+        """Classify using language-specific custom model"""
+        if language == "english":
+            if not self.english_model_loaded:
+                return None
+            model = self.english_model
+            vectorizer = self.english_vectorizer
+        elif language == "bengali":
+            if not self.bengali_model_loaded:
+                return None
+            model = self.bengali_model
+            vectorizer = self.bengali_vectorizer
+        else:
+            return None
+
+        try:
+            X = vectorizer.transform([text])
+            prediction = model.predict(X)[0]
+
+            if hasattr(model, 'predict_proba'):
+                probabilities = model.predict_proba(X)[0]
+                confidence = float(max(probabilities))
+            else:
+                confidence = 0.75
+
+            if language == "english":
+                if prediction == 0:
+                    category = "neutral"
+                else:
+                    category = "hate_speech"
+            else:
+                if prediction == 0:
+                    category = "neutral"
+                elif prediction == 1:
+                    category = "offensive"
+                else:
+                    category = "hate_speech"
+
+            return {
+                "category": category,
+                "confidence": confidence,
+                "method": f"custom_model_{language}",
+                "raw_prediction": int(prediction)
+            }
+        except Exception as e:
+            print(f"❌ Custom model classification failed: {e}")
+            return None
+
+    async def classify_with_pretrained_model(self, text: str, language: str = "english") -> Dict:
+        """Classify using ensemble of pretrained models with translation support"""
+
+        # Translate Bengali text to English
+        translated_text = None
+        if language == "bengali":
+            translated_text = self._translate_to_english(text)
+            if not translated_text:
+                print("❌ Translation failed, skipping pretrained models")
+                return None
+            text_to_analyze = translated_text
+        else:
+            text_to_analyze = text
+
+        results = []
+
+        # For long texts, analyze first 400 words
+        words = text_to_analyze.split()
+        if len(words) > 400:
+            truncated_text = ' '.join(words[:400])
+            print(f"⚠️ Text too long ({len(words)} words), analyzing first 400 words")
+        else:
+            truncated_text = text_to_analyze
+
+        # Try primary model
+        self._load_pretrained_model("primary")
+        primary = self.pretrained_models["primary"]
+
+        if primary["pipeline"] is not None:
+            try:
+                result = primary["pipeline"](truncated_text)[0]
+
+                if isinstance(result, list):
+                    result = result[0]
+
+                label = result['label'].lower()
+                confidence = float(result['score'])
+
+                if 'hate' in label and 'not' not in label:
+                    category = "hate_speech"
+                elif 'not' in label or 'non' in label:
+                    category = "neutral"
+                else:
+                    category = "offensive"
+
+                results.append({
+                    "category": category,
+                    "confidence": confidence,
+                    "weight": primary["weight"],
+                    "model": "primary",
+                    "raw_label": result['label']
+                })
+
+                print(f"[Primary Model] {result['label']} -> {category} ({confidence:.2%})")
+            except Exception as e:
+                print(f"❌ Primary model failed: {e}")
+
+        # Try secondary model
+        self._load_pretrained_model("secondary")
+        secondary = self.pretrained_models["secondary"]
+
+        if secondary["pipeline"] is not None:
+            try:
+                result = secondary["pipeline"](truncated_text)[0]
+
+                if isinstance(result, list):
+                    result = result[0]
+
+                label = result['label'].lower()
+                confidence = float(result['score'])
+
+                if 'hate' in label:
+                    category = "hate_speech"
+                elif 'offensive' in label:
+                    category = "offensive"
+                else:
+                    category = "neutral"
+
+                results.append({
+                    "category": category,
+                    "confidence": confidence,
+                    "weight": secondary["weight"],
+                    "model": "secondary",
+                    "raw_label": result['label']
+                })
+
+                print(f"[Secondary Model] {result['label']} -> {category} ({confidence:.2%})")
+            except Exception as e:
+                print(f"❌ Secondary model failed: {e}")
+
+        if not results:
+            return None
+
+        # Ensemble voting
+        category_scores = {}
+        for result in results:
+            cat = result["category"]
+            score = result["confidence"] * result["weight"]
+            category_scores[cat] = category_scores.get(cat, 0) + score
+
+        final_category = max(category_scores, key=category_scores.get)
+        total_weight = sum(r["weight"] for r in results)
+        final_confidence = category_scores[final_category] / total_weight
+
+        raw_labels = [r["raw_label"] for r in results]
+
+        return {
+            "category": final_category,
+            "confidence": final_confidence,
+            "method": "pretrained_ensemble",
+            "raw_labels": raw_labels,
+            "models_used": [r["model"] for r in results],
+            "translated": language == "bengali",
+            "translated_text": translated_text[:200] + "..." if translated_text and len(translated_text) > 200 else translated_text
+        }
+
+    def classify_with_keywords(self, text: str, language: str) -> Dict:
+        """Classify using keyword and pattern matching"""
+        text_lower = text.lower()
+
+        hate_count = sum(1 for keyword in self.hate_keywords.get(language, [])
+                         if keyword.lower() in text_lower)
+        offensive_count = sum(1 for keyword in self.offensive_keywords.get(language, [])
+                              if keyword.lower() in text_lower)
+
+        pattern_matches = []
+        matched_patterns = []
+        for pattern in self.hate_patterns.get(language, []):
+            match = re.search(pattern, text_lower, re.IGNORECASE)
+            if match:
+                pattern_matches.append(pattern)
+                matched_patterns.append(match.group(0))
+
+        if pattern_matches or hate_count > 0:
+            category = "hate_speech"
+            base_confidence = 0.90 if pattern_matches else 0.7
+            confidence = min(base_confidence + (hate_count * 0.03), 0.98)
+        elif offensive_count > 0:
+            category = "offensive"
+            confidence = min(0.6 + (offensive_count * 0.08), 0.88)
+        else:
+            category = "neutral"
+            confidence = 0.7
+
+        detected_keywords = []
+        for keyword in self.hate_keywords.get(language, []):
+            if keyword.lower() in text_lower:
+                detected_keywords.append(keyword)
+        for keyword in self.offensive_keywords.get(language, []):
+            if keyword.lower() in text_lower:
+                detected_keywords.append(keyword)
+
+        return {
+            "category": category,
+            "confidence": confidence,
+            "method": "keyword_matching",
+            "detected_keywords": detected_keywords,
+            "hate_count": hate_count,
+            "offensive_count": offensive_count,
+            "pattern_matches": len(pattern_matches),
+            "matched_patterns": matched_patterns[:3]
+        }
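The ensemble step in classify_with_pretrained_model is a weighted vote: each model's confidence is scaled by its weight, scores are summed per category, and the winning category's score is normalized by the total weight. A standalone sketch of that arithmetic with made-up model outputs (the 0.6/0.4 weights match the class defaults):

# Illustration of the weighted vote with hypothetical model outputs.
results = [
    {"category": "hate_speech", "confidence": 0.91, "weight": 0.6},  # primary
    {"category": "offensive",   "confidence": 0.72, "weight": 0.4},  # secondary
]

category_scores = {}
for r in results:
    category_scores[r["category"]] = (
        category_scores.get(r["category"], 0) + r["confidence"] * r["weight"]
    )

final_category = max(category_scores, key=category_scores.get)
total_weight = sum(r["weight"] for r in results)
final_confidence = category_scores[final_category] / total_weight

print(final_category, round(final_confidence, 3))
# hate_speech 0.546  (0.91 * 0.6 / (0.6 + 0.4)); offensive scored 0.288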
models/language_detector.py
ADDED
@@ -0,0 +1,85 @@
+from langdetect import detect, DetectorFactory, LangDetectException
+import re
+
+# Set seed for consistent results
+DetectorFactory.seed = 0
+
+def detect_language(text: str) -> str:
+    """
+    Detect if text is English, Bengali, Mixed, or Unknown
+    Uses multiple detection strategies for accuracy
+    """
+
+    if not text or len(text.strip()) < 3:
+        return "unknown"
+
+    # Strategy 1: Check for Bengali Unicode characters
+    bengali_pattern = r'[\u0980-\u09FF]'
+    has_bengali = bool(re.search(bengali_pattern, text))
+
+    # Strategy 2: Check for English characters
+    english_pattern = r'[a-zA-Z]'
+    has_english = bool(re.search(english_pattern, text))
+
+    # If both present, it's mixed
+    if has_bengali and has_english:
+        bengali_chars = len(re.findall(bengali_pattern, text))
+        english_chars = len(re.findall(english_pattern, text))
+
+        # If one language dominates heavily (>80%), classify as that language
+        total_chars = bengali_chars + english_chars
+        if bengali_chars / total_chars > 0.8:
+            return "bengali"
+        elif english_chars / total_chars > 0.8:
+            return "english"
+        else:
+            return "mixed"
+
+    # If only Bengali
+    if has_bengali:
+        return "bengali"
+
+    # If only English
+    if has_english:
+        try:
+            # Use langdetect for confirmation
+            detected = detect(text)
+            if detected == 'en':
+                return "english"
+            elif detected == 'bn':
+                return "bengali"
+            else:
+                # If langdetect finds another language but we have English chars
+                return "english"
+        except LangDetectException:
+            return "english"
+
+    # Fallback to langdetect
+    try:
+        detected = detect(text)
+        if detected == 'en':
+            return "english"
+        elif detected == 'bn':
+            return "bengali"
+        else:
+            return "unknown"
+    except LangDetectException:
+        return "unknown"
+
+def get_language_script_info(text: str) -> dict:
+    """
+    Get detailed information about the scripts used in text
+    Useful for debugging and fine-tuning
+    """
+    bengali_chars = len(re.findall(r'[\u0980-\u09FF]', text))
+    english_chars = len(re.findall(r'[a-zA-Z]', text))
+    digits = len(re.findall(r'\d', text))
+    other_chars = len(text) - bengali_chars - english_chars - digits
+
+    return {
+        "bengali_characters": bengali_chars,
+        "english_characters": english_chars,
+        "digits": digits,
+        "other_characters": other_chars,
+        "total_length": len(text)
+    }
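Expected behavior on a few inputs, as a minimal sketch; it assumes the repo root is on PYTHONPATH and that langdetect confirms the pure-English case:

from models.language_detector import detect_language, get_language_script_info

print(detect_language("This is plain English."))        # -> "english"
print(detect_language("আমি ভাত খাই"))                    # -> "bengali" (Bengali Unicode range hit)
print(detect_language("hello বন্ধু how are you আজকে?"))  # -> "mixed" (neither script exceeds 80%)
print(get_language_script_info("hello বন্ধু 123"))       # per-script character counts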
models/model_weights/custom_models/bengali_model.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d332e9f2678d28c8d70a8ce7d003d3219a164168a4881ac962832235fd75f485
+size 40879

models/model_weights/custom_models/bengali_vectorizer.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97332e9985a028a664f245948d6fdd4c6f4f604ef91b98b8865bef925971ba92
+size 200620

models/model_weights/custom_models/english_model.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d9a5a7ee8483b34cac119f50c01bb3806e3e6d5f5e8dff842ca4b599cfd32e14
+size 40747

models/model_weights/custom_models/english_vectorizer.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2732dbcd00696ba2022190a179a993a9d7a869bca51c266ffa368bc52dc26d06
+size 186651
models/model_weights/custom_models/metadata.json
ADDED
@@ -0,0 +1,47 @@
+{
+  "training_date": "2025-11-10 12:05:38",
+  "models": {
+    "english": {
+      "best_model": "svm",
+      "f1_score": 0.824268566911743,
+      "num_classes": 2,
+      "samples": 726119,
+      "comparison": {
+        "logistic": {
+          "accuracy": 0.8236104225196937,
+          "f1_score": 0.8236057473045872,
+          "training_time": 5.804867267608643
+        },
+        "svm": {
+          "accuracy": 0.8242714702803944,
+          "f1_score": 0.824268566911743,
+          "training_time": 22.070060968399048
+        }
+      }
+    },
+    "bengali": {
+      "best_model": "logistic",
+      "f1_score": 0.8723120553261358,
+      "num_classes": 2,
+      "samples": 30000,
+      "comparison": {
+        "logistic": {
+          "accuracy": 0.872,
+          "f1_score": 0.8723120553261358,
+          "training_time": 1.3237473964691162
+        },
+        "svm": {
+          "accuracy": 0.8625,
+          "f1_score": 0.862875926779109,
+          "training_time": 0.345095157623291
+        }
+      }
+    }
+  },
+  "separate_models": true,
+  "algorithms_tested": [
+    "logistic",
+    "svm",
+    "random_forest"
+  ]
+}
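A small sketch for inspecting this metadata at runtime; the keys mirror the JSON above, and the path assumes the repo root as working directory:

import json

with open("models/model_weights/custom_models/metadata.json") as f:
    meta = json.load(f)

# Report the winning algorithm and headline metrics per language.
for lang, info in meta["models"].items():
    print(f"{lang}: best={info['best_model']}, "
          f"F1={info['f1_score']:.4f}, samples={info['samples']:,}")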
models/model_weights/custom_models/model.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:33f55b33ac7ffa8fa0d1025978c589da482f0538cf6756cc8874adb115a556a5
+size 120779

models/model_weights/custom_models/vectorizer.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:24ba3e80100ca6511ec5a64f10233136f3a4a83d92cb39bf7e8e9eb5c4cbd942
+size 186321
models/train_model.py
ADDED
|
@@ -0,0 +1,482 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Training script for HateShield-BN Custom Model
|
| 3 |
+
Trains SEPARATE models for English and Bengali datasets
|
| 4 |
+
Compares multiple algorithms and saves the best one
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import numpy as np
|
| 9 |
+
from sklearn.model_selection import train_test_split
|
| 10 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 11 |
+
from sklearn.linear_model import LogisticRegression
|
| 12 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 13 |
+
from sklearn.svm import LinearSVC
|
| 14 |
+
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
|
| 15 |
+
import joblib
|
| 16 |
+
import os
|
| 17 |
+
from typing import Tuple, Dict
|
| 18 |
+
import warnings
|
| 19 |
+
from tqdm import tqdm
|
| 20 |
+
import time
|
| 21 |
+
import json
|
| 22 |
+
|
| 23 |
+
warnings.filterwarnings('ignore')
|
| 24 |
+
|
| 25 |
+
# Configuration
|
| 26 |
+
ENGLISH_DATASET_PATH = "data/english_hate_speech.csv"
|
| 27 |
+
BENGALI_DATASET_PATH = "data/bengali_hate_speech.csv"
|
| 28 |
+
MODEL_OUTPUT_PATH = "models/model_weights/custom_models"
|
| 29 |
+
RANDOM_STATE = 42
|
| 30 |
+
|
| 31 |
+
def load_english_dataset() -> pd.DataFrame:
|
| 32 |
+
"""Load and preprocess English dataset"""
|
| 33 |
+
print("📄 Loading English dataset...")
|
| 34 |
+
|
| 35 |
+
try:
|
| 36 |
+
df = pd.read_csv(ENGLISH_DATASET_PATH)
|
| 37 |
+
print(f" ✓ Loaded: {len(df):,} samples")
|
| 38 |
+
|
| 39 |
+
# Standardize column names
|
| 40 |
+
if 'content' in df.columns:
|
| 41 |
+
df = df.rename(columns={'content': 'text'})
|
| 42 |
+
elif 'Content' in df.columns:
|
| 43 |
+
df = df.rename(columns={'Content': 'text'})
|
| 44 |
+
|
| 45 |
+
# Ensure label column
|
| 46 |
+
if 'Label' in df.columns:
|
| 47 |
+
df['label'] = df['Label'].astype(int)
|
| 48 |
+
elif 'label' in df.columns:
|
| 49 |
+
df['label'] = df['label'].astype(int)
|
| 50 |
+
else:
|
| 51 |
+
raise ValueError("English dataset must have 'Label' or 'label' column")
|
| 52 |
+
|
| 53 |
+
# Keep only text and label
|
| 54 |
+
df = df[['text', 'label']].copy()
|
| 55 |
+
|
| 56 |
+
# Clean data
|
| 57 |
+
df = df.dropna(subset=['text', 'label'])
|
| 58 |
+
df = df[df['text'].str.strip().str.len() > 0]
|
| 59 |
+
|
| 60 |
+
# Ensure binary labels (0, 1)
|
| 61 |
+
unique_labels = df['label'].unique()
|
| 62 |
+
print(f" 📊 Unique labels: {sorted(unique_labels)}")
|
| 63 |
+
|
| 64 |
+
if set(unique_labels) == {0, 1}:
|
| 65 |
+
print(" ✓ Binary classification: 0=Non-Hate, 1=Hate")
|
| 66 |
+
else:
|
| 67 |
+
print(f" ⚠️ Warning: Expected binary labels, found: {unique_labels}")
|
| 68 |
+
# Convert to binary if needed
|
| 69 |
+
df['label'] = (df['label'] > 0).astype(int)
|
| 70 |
+
|
| 71 |
+
print(f" ✓ After preprocessing: {len(df):,} samples")
|
| 72 |
+
|
| 73 |
+
return df
|
| 74 |
+
|
| 75 |
+
except FileNotFoundError:
|
| 76 |
+
print(f" ❌ Error: File not found at {ENGLISH_DATASET_PATH}")
|
| 77 |
+
return pd.DataFrame(columns=['text', 'label'])
|
| 78 |
+
except Exception as e:
|
| 79 |
+
print(f" ❌ Error loading English dataset: {e}")
|
| 80 |
+
return pd.DataFrame(columns=['text', 'label'])
|
| 81 |
+
|
| 82 |
+
def load_bengali_dataset() -> pd.DataFrame:
|
| 83 |
+
"""Load and preprocess Bengali dataset"""
|
| 84 |
+
print("\n📄 Loading Bengali dataset...")
|
| 85 |
+
|
| 86 |
+
try:
|
| 87 |
+
df = pd.read_csv(BENGALI_DATASET_PATH)
|
| 88 |
+
print(f" ✓ Loaded: {len(df):,} samples")
|
| 89 |
+
|
| 90 |
+
# Standardize column names
|
| 91 |
+
if 'sentence' in df.columns:
|
| 92 |
+
df = df.rename(columns={'sentence': 'text'})
|
| 93 |
+
elif 'sentences' in df.columns:
|
| 94 |
+
df = df.rename(columns={'sentences': 'text'})
|
| 95 |
+
|
| 96 |
+
# Convert hate/category to standard labels
|
| 97 |
+
if 'hate' in df.columns:
|
| 98 |
+
if 'category' in df.columns:
|
| 99 |
+
category_map = {
|
| 100 |
+
'non-hate': 0,
|
| 101 |
+
'offensive': 1,
|
| 102 |
+
'hate': 2,
|
| 103 |
+
}
|
| 104 |
+
df['label'] = df['category'].map(category_map)
|
| 105 |
+
# Fill missing with hate column
|
| 106 |
+
df.loc[df['label'].isna() & (df['hate'] == 1), 'label'] = 2
|
| 107 |
+
df.loc[df['label'].isna() & (df['hate'] == 0), 'label'] = 0
|
| 108 |
+
else:
|
| 109 |
+
# If only 'hate' column, map: 0=non-hate, 1=hate (as offensive), 2=hate
|
| 110 |
+
df['label'] = df['hate'].apply(lambda x: 2 if x == 1 else 0)
|
| 111 |
+
|
| 112 |
+
df['label'] = df['label'].astype(int)
|
| 113 |
+
df = df[['text', 'label']].copy()
|
| 114 |
+
|
| 115 |
+
# Clean data
|
| 116 |
+
df = df.dropna(subset=['text', 'label'])
|
| 117 |
+
df = df[df['text'].str.strip().str.len() > 0]
|
| 118 |
+
|
| 119 |
+
# Ensure multi-class labels (0, 1, 2)
|
| 120 |
+
unique_labels = df['label'].unique()
|
| 121 |
+
print(f" 📊 Unique labels: {sorted(unique_labels)}")
|
| 122 |
+
|
| 123 |
+
if set(unique_labels) == {0, 1, 2}:
|
| 124 |
+
print(" ✓ Multi-class: 0=Neutral, 1=Offensive, 2=Hate Speech")
|
| 125 |
+
elif set(unique_labels) == {0, 1}:
|
| 126 |
+
print(" ⚠️ Warning: Only binary labels found, expected 3 classes")
|
| 127 |
+
else:
|
| 128 |
+
print(f" ⚠️ Warning: Unexpected labels: {unique_labels}")
|
| 129 |
+
|
| 130 |
+
print(f" ✓ After preprocessing: {len(df):,} samples")
|
| 131 |
+
|
| 132 |
+
return df
|
| 133 |
+
|
| 134 |
+
except FileNotFoundError:
|
| 135 |
+
print(f" ❌ Error: File not found at {BENGALI_DATASET_PATH}")
|
| 136 |
+
return pd.DataFrame(columns=['text', 'label'])
|
| 137 |
+
except Exception as e:
|
| 138 |
+
print(f" ❌ Error loading Bengali dataset: {e}")
|
| 139 |
+
return pd.DataFrame(columns=['text', 'label'])
|
| 140 |
+
|
| 141 |
+
def analyze_distribution(df: pd.DataFrame, name: str):
|
| 142 |
+
"""Print dataset statistics"""
|
| 143 |
+
if len(df) == 0:
|
| 144 |
+
print(f"\n{'='*50}")
|
| 145 |
+
print(f"❌ {name} Dataset: EMPTY")
|
| 146 |
+
print('='*50)
|
| 147 |
+
return
|
| 148 |
+
|
| 149 |
+
print(f"\n{'='*50}")
|
| 150 |
+
print(f"📊 {name} Dataset Distribution")
|
| 151 |
+
print('='*50)
|
| 152 |
+
|
| 153 |
+
unique_labels = sorted(df['label'].unique())
|
| 154 |
+
print(f"Unique labels: {unique_labels}")
|
| 155 |
+
print(f"Total samples: {len(df):,}\n")
|
| 156 |
+
|
| 157 |
+
# Dynamic label names
|
| 158 |
+
if set(unique_labels) == {0, 1}:
|
| 159 |
+
label_names = {0: 'Non-Hate/Neutral', 1: 'Hate/Offensive'}
|
| 160 |
+
elif set(unique_labels) == {0, 1, 2}:
|
| 161 |
+
label_names = {0: 'Neutral', 1: 'Offensive', 2: 'Hate Speech'}
|
| 162 |
+
else:
|
| 163 |
+
label_names = {label: f'Class {label}' for label in unique_labels}
|
| 164 |
+
|
| 165 |
+
# Show distribution
|
| 166 |
+
for label in unique_labels:
|
| 167 |
+
count = len(df[df['label'] == label])
|
| 168 |
+
percentage = count / len(df) * 100
|
| 169 |
+
label_name = label_names.get(label, f'Unknown({label})')
|
| 170 |
+
print(f" {label} - {label_name:20s}: {count:6,} ({percentage:5.1f}%)")
|
| 171 |
+
|
| 172 |
+
def train_single_model(X_train, X_test, y_train, y_test, model_type: str, language: str) -> Dict:
|
| 173 |
+
"""Train a single model and return results"""
|
| 174 |
+
print(f"\n 🔧 Training {model_type.upper()}...")
|
| 175 |
+
|
| 176 |
+
# Choose model
|
| 177 |
+
if model_type == 'logistic':
|
| 178 |
+
model = LogisticRegression(
|
| 179 |
+
max_iter=1000,
|
| 180 |
+
random_state=RANDOM_STATE,
|
| 181 |
+
class_weight='balanced',
|
| 182 |
+
n_jobs=-1
|
| 183 |
+
)
|
| 184 |
+
elif model_type == 'svm':
|
| 185 |
+
model = LinearSVC(
|
| 186 |
+
random_state=RANDOM_STATE,
|
| 187 |
+
class_weight='balanced',
|
| 188 |
+
max_iter=2000
|
| 189 |
+
)
|
| 190 |
+
elif model_type == 'random_forest':
|
| 191 |
+
model = RandomForestClassifier(
|
| 192 |
+
n_estimators=100,
|
| 193 |
+
random_state=RANDOM_STATE,
|
| 194 |
+
class_weight='balanced',
|
| 195 |
+
n_jobs=-1
|
| 196 |
+
)
|
| 197 |
+
else:
|
| 198 |
+
raise ValueError(f"Unknown model type: {model_type}")
|
| 199 |
+
|
| 200 |
+
# Train
|
| 201 |
+
start_time = time.time()
|
| 202 |
+
|
| 203 |
+
model.fit(X_train, y_train)
|
| 204 |
+
y_pred = model.predict(X_test)
|
| 205 |
+
|
| 206 |
+
training_time = time.time() - start_time
|
| 207 |
+
|
| 208 |
+
# Evaluate
|
| 209 |
+
accuracy = accuracy_score(y_test, y_pred)
|
| 210 |
+
f1 = f1_score(y_test, y_pred, average='weighted')
|
| 211 |
+
|
| 212 |
+
print(f" ✓ Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
|
| 213 |
+
print(f" ✓ F1-Score: {f1:.4f}")
|
| 214 |
+
print(f" ✓ Time: {training_time:.2f}s")
|
| 215 |
+
|
| 216 |
+
return {
|
| 217 |
+
'model': model,
|
| 218 |
+
'accuracy': accuracy,
|
| 219 |
+
'f1_score': f1,
|
| 220 |
+
'training_time': training_time,
|
| 221 |
+
'predictions': y_pred
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
+
def train_and_compare_models(X_train, X_test, y_train, y_test, language: str) -> Tuple:
|
| 225 |
+
"""Train multiple models and return the best one"""
|
| 226 |
+
print(f"\n🤖 Training Multiple Models for {language.upper()}...")
|
| 227 |
+
print("=" * 60)
|
| 228 |
+
|
| 229 |
+
models_to_train = ['logistic', 'svm']
|
| 230 |
+
results = {}
|
| 231 |
+
|
| 232 |
+
# Train all models
|
| 233 |
+
for model_type in models_to_train:
|
| 234 |
+
try:
|
| 235 |
+
result = train_single_model(X_train, X_test, y_train, y_test, model_type, language)
|
| 236 |
+
results[model_type] = result
|
| 237 |
+
except Exception as e:
|
| 238 |
+
print(f" ❌ Error training {model_type}: {e}")
|
| 239 |
+
continue
|
| 240 |
+
|
| 241 |
+
if not results:
|
| 242 |
+
print("❌ No models trained successfully!")
|
| 243 |
+
return None, None, {}
|
| 244 |
+
|
| 245 |
+
# Compare models
|
| 246 |
+
print(f"\n{'='*60}")
|
| 247 |
+
print(f"📊 Model Comparison for {language.upper()}")
|
| 248 |
+
print('='*60)
|
| 249 |
+
print(f"{'Model':<20} {'Accuracy':<12} {'F1-Score':<12} {'Time (s)':<10}")
|
| 250 |
+
print('-'*60)
|
| 251 |
+
|
| 252 |
+
best_model_name = None
|
| 253 |
+
best_score = 0
|
| 254 |
+
|
| 255 |
+
for model_name, result in results.items():
|
| 256 |
+
accuracy = result['accuracy']
|
| 257 |
+
f1 = result['f1_score']
|
| 258 |
+
time_taken = result['training_time']
|
| 259 |
+
|
| 260 |
+
# Use F1-score as primary metric (better for imbalanced datasets)
|
| 261 |
+
score = f1
|
| 262 |
+
|
| 263 |
+
print(f"{model_name:<20} {accuracy:<12.4f} {f1:<12.4f} {time_taken:<10.2f}")
|
| 264 |
+
|
| 265 |
+
if score > best_score:
|
| 266 |
+
best_score = score
|
| 267 |
+
best_model_name = model_name
|
| 268 |
+
|
| 269 |
+
print('='*60)
|
| 270 |
+
print(f"🏆 Best Model: {best_model_name.upper()} (F1-Score: {best_score:.4f})")
|
| 271 |
+
print('='*60)
|
| 272 |
+
|
| 273 |
+
# Get best model
|
| 274 |
+
best_result = results[best_model_name]
|
| 275 |
+
best_model = best_result['model']
|
| 276 |
+
|
| 277 |
+
# Detailed report for best model
|
| 278 |
+
print(f"\n📈 Detailed Report for {best_model_name.upper()}:")
|
| 279 |
+
|
| 280 |
+
unique_labels = sorted(np.unique(y_test))
|
| 281 |
+
|
| 282 |
+
if set(unique_labels) == {0, 1}:
|
| 283 |
+
target_names = ['Non-Hate', 'Hate']
|
| 284 |
+
elif set(unique_labels) == {0, 1, 2}:
|
| 285 |
+
target_names = ['Neutral', 'Offensive', 'Hate Speech']
|
| 286 |
+
else:
|
| 287 |
+
target_names = [f'Class {i}' for i in unique_labels]
|
| 288 |
+
|
| 289 |
+
print(classification_report(y_test, best_result['predictions'],
|
| 290 |
+
target_names=target_names,
|
| 291 |
+
zero_division=0))
|
| 292 |
+
|
| 293 |
+
print("🔢 Confusion Matrix:")
|
| 294 |
+
print(confusion_matrix(y_test, best_result['predictions']))
|
| 295 |
+
|
| 296 |
+
# Return comparison data
|
| 297 |
+
comparison = {
|
| 298 |
+
model_name: {
|
| 299 |
+
'accuracy': result['accuracy'],
|
| 300 |
+
'f1_score': result['f1_score'],
|
| 301 |
+
'training_time': result['training_time']
|
| 302 |
+
}
|
| 303 |
+
for model_name, result in results.items()
|
| 304 |
+
}
|
| 305 |
+
|
| 306 |
+
return best_model, best_model_name, comparison
|
| 307 |
+
|
| 308 |
+
def train_language_specific_model(df: pd.DataFrame, language: str):
|
| 309 |
+
"""Train model for specific language with comparison"""
|
| 310 |
+
print(f"\n{'='*60}")
|
| 311 |
+
print(f"🎓 Training {language.upper()} Model")
|
| 312 |
+
print('='*60)
|
| 313 |
+
|
| 314 |
+
if len(df) == 0:
|
| 315 |
+
print(f"❌ No data for {language}!")
|
| 316 |
+
return None, None, None, None, {}
|
| 317 |
+
|
| 318 |
+
# Analyze distribution
|
| 319 |
+
analyze_distribution(df, language.capitalize())
|
| 320 |
+
|
| 321 |
+
# Split data
|
| 322 |
+
print(f"\n✂️ Splitting data (80/20 train/test)...")
|
| 323 |
+
X = df['text']
|
| 324 |
+
y = df['label'].astype(int)
|
| 325 |
+
|
| 326 |
+
X_train, X_test, y_train, y_test = train_test_split(
|
| 327 |
+
X, y,
|
| 328 |
+
test_size=0.2,
|
| 329 |
+
random_state=RANDOM_STATE,
|
| 330 |
+
stratify=y
|
| 331 |
+
)
|
| 332 |
+
|
| 333 |
+
print(f" ✓ Train size: {len(X_train):,}")
|
| 334 |
+
print(f" ✓ Test size: {len(X_test):,}")
|
| 335 |
+
|
| 336 |
+
# Create TF-IDF vectorizer
|
| 337 |
+
print(f"\n🔤 Creating TF-IDF vectorizer...")
|
| 338 |
+
vectorizer = TfidfVectorizer(
|
| 339 |
+
max_features=5000,
|
| 340 |
+
ngram_range=(1, 2),
|
| 341 |
+
min_df=2,
|
| 342 |
+
max_df=0.8,
|
| 343 |
+
strip_accents='unicode',
|
| 344 |
+
analyzer='word',
|
| 345 |
+
token_pattern=r'\w{1,}',
|
| 346 |
+
sublinear_tf=True
|
| 347 |
+
)
|
| 348 |
+
|
| 349 |
+
print(" ⏳ Vectorizing text...")
|
| 350 |
+
X_train_vec = vectorizer.fit_transform(X_train)
|
| 351 |
+
X_test_vec = vectorizer.transform(X_test)
|
| 352 |
+
|
| 353 |
+
print(f" ✓ Feature dimension: {X_train_vec.shape[1]:,}")
|
| 354 |
+
|
| 355 |
+
# Train and compare models
|
| 356 |
+
best_model, best_model_name, comparison = train_and_compare_models(
|
| 357 |
+
X_train_vec, X_test_vec, y_train, y_test, language
|
| 358 |
+
)
|
| 359 |
+
|
| 360 |
+
if best_model is None:
|
| 361 |
+
return None, None, None, None, {}
|
| 362 |
+
|
| 363 |
+
# Get final accuracy
|
| 364 |
+
    y_pred = best_model.predict(X_test_vec)
    final_accuracy = accuracy_score(y_test, y_pred)
    final_f1 = f1_score(y_test, y_pred, average='weighted')

    return best_model, vectorizer, best_model_name, final_f1, comparison


def main():
    """Main training pipeline"""
    print("\n" + "=" * 70)
    print("🛡️ HateShield-BN Model Training (Language-Specific with Comparison)")
    print("=" * 70 + "\n")

    # Load datasets separately
    df_english = load_english_dataset()
    df_bengali = load_bengali_dataset()

    if len(df_english) == 0 and len(df_bengali) == 0:
        print("\n❌ Error: No data found!")
        return

    os.makedirs(MODEL_OUTPUT_PATH, exist_ok=True)

    results = {}

    # Train English model
    if len(df_english) > 0:
        print("\n" + "🇬🇧 " * 35)
        english_model, english_vectorizer, english_best_name, english_f1, english_comparison = train_language_specific_model(
            df_english, 'english'
        )

        if english_model is not None:
            # Save English model
            print(f"\n💾 Saving English model ({english_best_name})...")
            english_model_path = os.path.join(MODEL_OUTPUT_PATH, "english_model.pkl")
            english_vec_path = os.path.join(MODEL_OUTPUT_PATH, "english_vectorizer.pkl")

            joblib.dump(english_model, english_model_path)
            joblib.dump(english_vectorizer, english_vec_path)

            print(f"   ✓ Model saved to: {english_model_path}")
            print(f"   ✓ Vectorizer saved to: {english_vec_path}")

            results['english'] = {
                'best_model': english_best_name,
                'f1_score': english_f1,
                'num_classes': len(df_english['label'].unique()),
                'samples': len(df_english),
                'comparison': english_comparison
            }

    # Train Bengali model
    if len(df_bengali) > 0:
        print("\n" + "🇧🇩 " * 35)
        bengali_model, bengali_vectorizer, bengali_best_name, bengali_f1, bengali_comparison = train_language_specific_model(
            df_bengali, 'bengali'
        )

        if bengali_model is not None:
            # Save Bengali model
            print(f"\n💾 Saving Bengali model ({bengali_best_name})...")
            bengali_model_path = os.path.join(MODEL_OUTPUT_PATH, "bengali_model.pkl")
            bengali_vec_path = os.path.join(MODEL_OUTPUT_PATH, "bengali_vectorizer.pkl")

            joblib.dump(bengali_model, bengali_model_path)
            joblib.dump(bengali_vectorizer, bengali_vec_path)

            print(f"   ✓ Model saved to: {bengali_model_path}")
            print(f"   ✓ Vectorizer saved to: {bengali_vec_path}")

            results['bengali'] = {
                'best_model': bengali_best_name,
                'f1_score': bengali_f1,
                'num_classes': len(df_bengali['label'].unique()),
                'samples': len(df_bengali),
                'comparison': bengali_comparison
            }

    # Save metadata
    print("\n💾 Saving metadata...")
    metadata = {
        'training_date': time.strftime('%Y-%m-%d %H:%M:%S'),
        'models': results,
        'separate_models': True,
        'algorithms_tested': ['logistic', 'svm', 'random_forest']
    }

    with open(os.path.join(MODEL_OUTPUT_PATH, "metadata.json"), 'w') as f:
        json.dump(metadata, f, indent=2)

    # Final Summary
    print("\n" + "=" * 70)
    print("✅ Training Complete!")
    print("=" * 70)

    if 'english' in results:
        print("\n🇬🇧 English Model:")
        print(f"   Best Algorithm: {results['english']['best_model'].upper()}")
        print(f"   F1-Score: {results['english']['f1_score']:.4f}")
        print(f"   Classes: {results['english']['num_classes']}")
        print(f"   Samples: {results['english']['samples']:,}")
        print("\n   Model Comparison:")
        for model_name, scores in results['english']['comparison'].items():
            print(f"      {model_name:<15}: Acc={scores['accuracy']:.4f}, F1={scores['f1_score']:.4f}")

    if 'bengali' in results:
        print("\n🇧🇩 Bengali Model:")
        print(f"   Best Algorithm: {results['bengali']['best_model'].upper()}")
        print(f"   F1-Score: {results['bengali']['f1_score']:.4f}")
        print(f"   Classes: {results['bengali']['num_classes']}")
        print(f"   Samples: {results['bengali']['samples']:,}")
        print("\n   Model Comparison:")
        for model_name, scores in results['bengali']['comparison'].items():
            print(f"      {model_name:<15}: Acc={scores['accuracy']:.4f}, F1={scores['f1_score']:.4f}")

    print("\n" + "=" * 70 + "\n")


if __name__ == "__main__":
    main()
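For reference, a minimal sketch (not part of this commit) of how the saved artifacts could be loaded back for inference. It assumes the vectorizer and model are the fitted scikit-learn objects persisted by joblib.dump above; the sample sentence is a placeholder:

import os
import joblib

# Matches the artifact location committed in this change
# (models/model_weights/custom_models/).
MODEL_DIR = "models/model_weights/custom_models"

# Load the persisted English classifier and its paired vectorizer.
model = joblib.load(os.path.join(MODEL_DIR, "english_model.pkl"))
vectorizer = joblib.load(os.path.join(MODEL_DIR, "english_vectorizer.pkl"))

# Placeholder input: transform with the same vectorizer used at training time.
features = vectorizer.transform(["this is a sample sentence"])
print(model.predict(features))  # predicted label for the sample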
requirements.txt
ADDED
@@ -0,0 +1,22 @@
fastapi==0.104.1
uvicorn[standard]==0.24.0
python-multipart==0.0.6
pydantic==2.5.0

# Web scraping
requests==2.31.0
beautifulsoup4==4.12.2

# Document processing
PyPDF2==3.0.1
python-docx==1.1.0

# ML (optimized versions)
numpy<2.0.0
pandas<3.0.0
scikit-learn>=1.3.0,<2.0.0
transformers>=4.35.0,<5.0.0
torch>=2.0.0,<3.0.0
langdetect==1.0.9
deep-translator==1.11.4
joblib>=1.5.0
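If the pins above ever need to be verified inside the running container, a small diagnostic sketch (hypothetical, not part of this commit) can read the resolved versions from the standard library:

from importlib.metadata import PackageNotFoundError, version

# Spot-check a few of the pinned packages from requirements.txt.
for pkg in ("fastapi", "scikit-learn", "transformers", "torch", "joblib"):
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed")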
services/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .analyzer import analyze_content

__all__ = ['analyze_content']
services/analyzer.py
ADDED
@@ -0,0 +1,161 @@
from typing import Dict, List
import re
from models.hate_speech_classifier import HateSpeechClassifier
from models.language_detector import detect_language

# Initialize classifier globally
classifier = HateSpeechClassifier()

def highlight_keywords(text: str, keywords: List[str]) -> List[str]:
    """Extract phrases containing keywords"""
    highlighted = []
    text_lower = text.lower()

    for keyword in keywords:
        if keyword.lower() in text_lower:
            sentences = re.split(r'[।.!?]+', text)
            for sentence in sentences:
                if keyword.lower() in sentence.lower():
                    highlighted.append(sentence.strip())
                    break

    return highlighted[:5]

async def analyze_content(text: str) -> Dict:
    """
    Main analysis function that combines all models
    """
    # Detect language
    language = detect_language(text)

    # Get results from all three methods
    custom_result = await classifier.classify_with_custom_model(text, language)

    # ✅ Pass language to pretrained model for translation support
    pretrained_result = await classifier.classify_with_pretrained_model(text, language)

    keyword_result = classifier.classify_with_keywords(text, language)

    # Enhanced ensemble decision with adaptive weights
    results = []

    has_patterns = keyword_result.get("pattern_matches", 0) > 0
    has_hate_keywords = keyword_result.get("hate_count", 0) > 0

    if has_patterns or has_hate_keywords:
        custom_weight = 0.5
        pretrained_weight = 0.2
        keyword_weight = 0.3
    else:
        custom_weight = 0.4
        pretrained_weight = 0.4
        keyword_weight = 0.2

    if custom_result:
        results.append({
            "category": custom_result["category"],
            "confidence": custom_result["confidence"],
            "weight": custom_weight
        })

    if pretrained_result:
        results.append({
            "category": pretrained_result["category"],
            "confidence": pretrained_result["confidence"],
            "weight": pretrained_weight
        })

    if keyword_result:
        results.append({
            "category": keyword_result["category"],
            "confidence": keyword_result["confidence"],
            "weight": keyword_weight
        })

    # Weighted voting
    category_scores = {}
    for result in results:
        cat = result["category"]
        score = result["confidence"] * result["weight"]
        category_scores[cat] = category_scores.get(cat, 0) + score

    if category_scores:
        sorted_categories = sorted(category_scores.items(), key=lambda x: x[1], reverse=True)
        final_category = sorted_categories[0][0]
        final_confidence = category_scores[final_category] / sum(r["weight"] for r in results)

        if len(sorted_categories) > 1:
            top_cat, top_score = sorted_categories[0]
            second_cat, second_score = sorted_categories[1]

            if (second_cat == "hate_speech" and
                    top_cat != "hate_speech" and
                    (top_score - second_score) < 0.15 and
                    has_patterns):
                final_category = "hate_speech"
                final_confidence = second_score / sum(r["weight"] for r in results)
    else:
        final_category = "neutral"
        final_confidence = 0.5

    # Generate reasoning
    reasons = []
    if has_patterns:
        reasons.append("Detected hate speech patterns in text structure")
    if custom_result and custom_result["category"] == "hate_speech":
        reasons.append(f"Custom model detected {custom_result['category']} with {custom_result['confidence']:.2%} confidence")
    if pretrained_result:
        if pretrained_result.get("translated"):
            reasons.append(f"Pretrained model analyzed translated text and identified {pretrained_result['category']}")
        elif pretrained_result["category"] != "neutral":
            reasons.append(f"Pretrained model identified {pretrained_result['category']} patterns")
    if keyword_result and keyword_result.get("detected_keywords"):
        reasons.append(f"Found {len(keyword_result['detected_keywords'])} hate/offensive keywords")

    if not reasons:
        reasons = ["Classification based on content analysis"]

    all_keywords = keyword_result.get("detected_keywords", [])
    highlighted_phrases = highlight_keywords(text, all_keywords) if all_keywords else []

    return {
        "ensemble": {
            "category": final_category,
            "confidence": float(final_confidence),
            "reasons": reasons,
            "weights_used": {
                "custom_model": custom_weight,
                "pretrained_model": pretrained_weight,
                "keyword_analysis": keyword_weight
            }
        },
        "custom_model": {
            "available": custom_result is not None,
            "category": custom_result["category"] if custom_result else None,
            "confidence": custom_result["confidence"] if custom_result else None,
            "method": custom_result.get("method") if custom_result else None,
            "raw_prediction": custom_result.get("raw_prediction") if custom_result else None
        },
        "pretrained_model": {
            "available": pretrained_result is not None,
            "category": pretrained_result["category"] if pretrained_result else None,
            "confidence": pretrained_result["confidence"] if pretrained_result else None,
            "method": pretrained_result.get("method") if pretrained_result else None,
            "raw_labels": pretrained_result.get("raw_labels") if pretrained_result else None,
            "translated": pretrained_result.get("translated", False) if pretrained_result else False,
            "translated_text": pretrained_result.get("translated_text") if pretrained_result else None
        },
        "keyword_analysis": {
            "available": True,
            "category": keyword_result["category"],
            "confidence": keyword_result["confidence"],
            "method": keyword_result["method"],
            "detected_keywords": keyword_result.get("detected_keywords", []),
            "hate_count": keyword_result.get("hate_count", 0),
            "offensive_count": keyword_result.get("offensive_count", 0),
            "pattern_matches": keyword_result.get("pattern_matches", 0)
        },
        "highlighted_phrases": highlighted_phrases,
        "detected_language": language,
        "original_text": text[:200] + "..." if len(text) > 200 else text
    }
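To make the weighted vote above concrete, here is a standalone sketch with made-up confidences (taking the no-keyword branch, so the weights are 0.4/0.4/0.2); it reproduces the category_scores arithmetic from analyze_content:

# Hypothetical method outputs for one input text.
results = [
    {"category": "hate_speech", "confidence": 0.90, "weight": 0.4},  # custom model
    {"category": "neutral",     "confidence": 0.60, "weight": 0.4},  # pretrained model
    {"category": "hate_speech", "confidence": 0.70, "weight": 0.2},  # keyword analysis
]

category_scores = {}
for r in results:
    category_scores[r["category"]] = (
        category_scores.get(r["category"], 0) + r["confidence"] * r["weight"]
    )

# hate_speech: 0.90*0.4 + 0.70*0.2 = 0.50; neutral: 0.60*0.4 = 0.24
final_category = max(category_scores, key=category_scores.get)
final_confidence = category_scores[final_category] / sum(r["weight"] for r in results)
print(final_category, round(final_confidence, 2))  # -> hate_speech 0.5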
services/text_extractor.py
ADDED
@@ -0,0 +1,77 @@
import requests
from bs4 import BeautifulSoup
from typing import Optional
import PyPDF2
from docx import Document
import io

def extract_from_url(url: str) -> str:
    """Extract text content from URL (synchronous)"""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove script and style elements
        for script in soup(["script", "style", "nav", "footer", "header"]):
            script.decompose()

        # Get text
        text = soup.get_text(separator=' ', strip=True)

        # Clean up whitespace
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = ' '.join(chunk for chunk in chunks if chunk)

        return text
    except Exception as e:
        print(f"Error extracting from URL: {e}")
        raise Exception(f"Failed to extract text from URL: {str(e)}")

def extract_from_document(content: bytes, file_extension: str) -> str:
    """Extract text from document (synchronous)"""
    try:
        if file_extension == ".pdf":
            return _extract_from_pdf(content)
        elif file_extension == ".docx":
            return _extract_from_docx(content)
        elif file_extension == ".txt":
            return content.decode('utf-8')
        else:
            raise ValueError(f"Unsupported file type: {file_extension}")
    except Exception as e:
        print(f"Error extracting from document: {e}")
        raise Exception(f"Failed to extract text from document: {str(e)}")

def _extract_from_pdf(content: bytes) -> str:
    """Extract text from PDF"""
    try:
        pdf_file = io.BytesIO(content)
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text() + "\n"

        return text.strip()
    except Exception as e:
        raise Exception(f"Error reading PDF: {str(e)}")

def _extract_from_docx(content: bytes) -> str:
    """Extract text from DOCX"""
    try:
        doc_file = io.BytesIO(content)
        doc = Document(doc_file)

        text = ""
        for paragraph in doc.paragraphs:
            text += paragraph.text + "\n"

        return text.strip()
    except Exception as e:
        raise Exception(f"Error reading DOCX: {str(e)}")
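A short usage sketch for these extractors (the URL and file path are placeholders, not part of this commit):

from services.text_extractor import extract_from_document, extract_from_url

# Pull the visible text out of a web page (placeholder URL).
page_text = extract_from_url("https://example.com/article")

# Extract text from a local PDF by passing its raw bytes plus the extension.
with open("sample.pdf", "rb") as f:
    pdf_text = extract_from_document(f.read(), ".pdf")

print(page_text[:200])
print(pdf_text[:200])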
utils/__init__.py
ADDED
File without changes
utils/helpers.py
ADDED
File without changes