azzar committed on
Commit
ee913c5
·
verified ·
1 Parent(s): 9392708

Upload 9 files

Browse files
models/.gitkeep ADDED
File without changes
models/content_moderation_api.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Content Moderation API
3
+ Generated on: 2025-10-03 07:26:24
4
+ """
5
+
6
+ import joblib
7
+ import json
8
+ import re
9
+ from typing import Dict, List, Tuple
10
+
11
class ContentModeratorAPI:
    """Hybrid content moderator combining a blocked-word list with a pickled ML model.

    Only the rule-based blocked-word matching is functional here; the ML
    branches in `moderate` are stubs.
    """

    def __init__(self, model_path: str, config_path: str):
        """Load the classifier and the moderation configuration.

        Args:
            model_path: Path to a joblib-pickled classifier.
            config_path: Path to a JSON file with 'blocked_words',
                'sensitivity_threshold' and 'embedding_type' keys.
        """
        # NOTE(review): joblib.load deserializes a pickle and can execute
        # arbitrary code — only load model files from trusted sources.
        self.model = joblib.load(model_path)

        # Load configuration
        with open(config_path, 'r') as f:
            config = json.load(f)

        self.blocked_words = config['blocked_words']
        self.sensitivity_threshold = config['sensitivity_threshold']
        self.model_type = config['embedding_type']

        # Compile blocked-word patterns once, up front.
        self.patterns = self._compile_patterns(self.blocked_words)

    @staticmethod
    def _compile_patterns(words: List[str]) -> List[re.Pattern]:
        """Compile one case-insensitive, whole-word regex per blocked word.

        Bug fix: the original compiled bare substrings (the surrounding r''
        literals were no-ops), so short entries matched inside innocent
        words — 'ass' inside 'class', 'hell' inside 'hello' (the classic
        Scunthorpe problem). Anchoring with word boundaries restricts
        matches to whole words.
        """
        return [re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE)
                for word in words]

    def detect_blocked_words(self, text: str) -> List[str]:
        """Return the blocked words that occur as whole words in *text*."""
        return [word
                for word, pattern in zip(self.blocked_words, self.patterns)
                if pattern.search(text)]

    def censor_text(self, text: str, replacement: str = "***") -> str:
        """Return *text* with every blocked-word match replaced by *replacement*."""
        censored = text
        # Original also looped an unused `word` variable; only patterns matter.
        for pattern in self.patterns:
            censored = pattern.sub(replacement, censored)
        return censored

    def moderate(self, text: str) -> Dict:
        """Moderate a single text.

        Returns:
            Dict with keys 'text', 'is_inappropriate', 'blocked_words',
            'censored_text' and 'confidence'.
        """
        # Rule-based detection
        blocked_words = self.detect_blocked_words(text)
        rule_inappropriate = bool(blocked_words)

        # ML-based detection — both branches are stubs: the vectorizer /
        # embedder the config refers to is never loaded by this class.
        # NOTE(review): model_type comes from config['embedding_type'];
        # verify its value actually matches one of these branch labels.
        ml_confidence = 0.0
        ml_inappropriate = False

        if self.model_type == 'tfidf':
            # This would need the vectorizer loaded separately
            pass
        elif self.model_type == 'sentence_transformer':
            # This would need the embedder loaded separately
            pass

        # Combine predictions: a single rule hit is enough to flag the text.
        is_inappropriate = rule_inappropriate or ml_inappropriate
        censored_text = self.censor_text(text) if is_inappropriate else text

        return {"text": text, "is_inappropriate": is_inappropriate,
                "blocked_words": blocked_words, "censored_text": censored_text,
                "confidence": ml_confidence}

    def moderate_batch(self, texts: List[str]) -> List[Dict]:
        """Moderate each text in *texts*; returns one result dict per input."""
        return [self.moderate(text) for text in texts]
71
+
72
+ # Usage example:
73
+ # moderator = ContentModeratorAPI('models/model.pkl', 'models/config.json')
74
+ # result = moderator.moderate("Test text")
75
+ # print(result)
models/content_moderation_api_cuda.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Content Moderation API
3
+ Generated on: 2025-10-03 04:10:32
4
+ """
5
+
6
+ import joblib
7
+ import json
8
+ import re
9
+ from typing import Dict, List, Tuple
10
+
11
class ContentModeratorAPI:
    """Hybrid content moderator combining a blocked-word list with a pickled ML model.

    Only the rule-based blocked-word matching is functional here; the ML
    branches in `moderate` are stubs.
    """

    def __init__(self, model_path: str, config_path: str):
        """Load the classifier and the moderation configuration.

        Args:
            model_path: Path to a joblib-pickled classifier.
            config_path: Path to a JSON file with 'blocked_words',
                'sensitivity_threshold' and 'embedding_type' keys.
        """
        # NOTE(review): joblib.load deserializes a pickle and can execute
        # arbitrary code — only load model files from trusted sources.
        self.model = joblib.load(model_path)

        # Load configuration
        with open(config_path, 'r') as f:
            config = json.load(f)

        self.blocked_words = config['blocked_words']
        self.sensitivity_threshold = config['sensitivity_threshold']
        self.model_type = config['embedding_type']

        # Compile blocked-word patterns once, up front.
        self.patterns = self._compile_patterns(self.blocked_words)

    @staticmethod
    def _compile_patterns(words: List[str]) -> List[re.Pattern]:
        """Compile one case-insensitive, whole-word regex per blocked word.

        Bug fix: the original compiled bare substrings (the surrounding r''
        literals were no-ops), so short entries matched inside innocent
        words — 'ass' inside 'class', 'hell' inside 'hello' (the classic
        Scunthorpe problem). Anchoring with word boundaries restricts
        matches to whole words.
        """
        return [re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE)
                for word in words]

    def detect_blocked_words(self, text: str) -> List[str]:
        """Return the blocked words that occur as whole words in *text*."""
        return [word
                for word, pattern in zip(self.blocked_words, self.patterns)
                if pattern.search(text)]

    def censor_text(self, text: str, replacement: str = "***") -> str:
        """Return *text* with every blocked-word match replaced by *replacement*."""
        censored = text
        # Original also looped an unused `word` variable; only patterns matter.
        for pattern in self.patterns:
            censored = pattern.sub(replacement, censored)
        return censored

    def moderate(self, text: str) -> Dict:
        """Moderate a single text.

        Returns:
            Dict with keys 'text', 'is_inappropriate', 'blocked_words',
            'censored_text' and 'confidence'.
        """
        # Rule-based detection
        blocked_words = self.detect_blocked_words(text)
        rule_inappropriate = bool(blocked_words)

        # ML-based detection — both branches are stubs: the vectorizer /
        # embedder the config refers to is never loaded by this class.
        # NOTE(review): model_type comes from config['embedding_type'];
        # verify its value actually matches one of these branch labels.
        ml_confidence = 0.0
        ml_inappropriate = False

        if self.model_type == 'tfidf':
            # This would need the vectorizer loaded separately
            pass
        elif self.model_type == 'sentence_transformer':
            # This would need the embedder loaded separately
            pass

        # Combine predictions: a single rule hit is enough to flag the text.
        is_inappropriate = rule_inappropriate or ml_inappropriate
        censored_text = self.censor_text(text) if is_inappropriate else text

        return {"text": text, "is_inappropriate": is_inappropriate,
                "blocked_words": blocked_words, "censored_text": censored_text,
                "confidence": ml_confidence}

    def moderate_batch(self, texts: List[str]) -> List[Dict]:
        """Moderate each text in *texts*; returns one result dict per input."""
        return [self.moderate(text) for text in texts]
71
+
72
+ # Usage example:
73
+ # moderator = ContentModeratorAPI('models/model.pkl', 'models/config.json')
74
+ # result = moderator.moderate("Test text")
75
+ # print(result)
models/content_moderation_config.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "20251003_072624",
3
+ "models": {
4
+ "sentence_transformer": {
5
+ "model": "models/content_moderation_sentence_transformer_20251003_072624.pkl",
6
+ "embedder": "models/content_moderation_sentence_transformer_embedder_20251003_072624.pkl",
7
+ "type": "sentence_transformer"
8
+ }
9
+ },
10
+ "blocked_words": [
11
+ "hate",
12
+ "kill",
13
+ "murder",
14
+ "attack",
15
+ "threaten",
16
+ "bully",
17
+ "harass",
18
+ "abuse",
19
+ "violent",
20
+ "weapon",
21
+ "bomb",
22
+ "gun",
23
+ "knife",
24
+ "fight",
25
+ "war",
26
+ "death",
27
+ "sex",
28
+ "porn",
29
+ "nude",
30
+ "adult",
31
+ "explicit",
32
+ "xxx",
33
+ "nsfw",
34
+ "erotic",
35
+ "strip",
36
+ "escort",
37
+ "hooker",
38
+ "prostitute",
39
+ "condom",
40
+ "vibrator",
41
+ "damn",
42
+ "hell",
43
+ "bastard",
44
+ "bitch",
45
+ "ass",
46
+ "fuck",
47
+ "shit",
48
+ "cunt",
49
+ "piss",
50
+ "dick",
51
+ "cock",
52
+ "pussy",
53
+ "tits",
54
+ "boobs",
55
+ "slut",
56
+ "whore",
57
+ "racist",
58
+ "sexist",
59
+ "homophobic",
60
+ "transphobic",
61
+ "xenophobic"
62
+ ],
63
+ "sensitivity_threshold": 0.7,
64
+ "embedding_type": "sentence_transformer",
65
+ "model_name": "all-mpnet-base-v2"
66
+ }
models/content_moderation_config_cuda.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "20251003_041031",
3
+ "models": {
4
+ "sentence_transformer": {
5
+ "model": "models/content_moderation_sentence_transformer_20251003_041031.pkl",
6
+ "embedder": "models/content_moderation_sentence_transformer_embedder_20251003_041031.pkl",
7
+ "type": "sentence_transformer"
8
+ }
9
+ },
10
+ "blocked_words": [
11
+ "hate",
12
+ "kill",
13
+ "murder",
14
+ "attack",
15
+ "threaten",
16
+ "bully",
17
+ "harass",
18
+ "abuse",
19
+ "violent",
20
+ "weapon",
21
+ "bomb",
22
+ "gun",
23
+ "knife",
24
+ "fight",
25
+ "war",
26
+ "death",
27
+ "sex",
28
+ "porn",
29
+ "nude",
30
+ "adult",
31
+ "explicit",
32
+ "xxx",
33
+ "nsfw",
34
+ "erotic",
35
+ "strip",
36
+ "escort",
37
+ "hooker",
38
+ "prostitute",
39
+ "condom",
40
+ "vibrator",
41
+ "damn",
42
+ "hell",
43
+ "bastard",
44
+ "bitch",
45
+ "ass",
46
+ "fuck",
47
+ "shit",
48
+ "cunt",
49
+ "piss",
50
+ "dick",
51
+ "cock",
52
+ "pussy",
53
+ "tits",
54
+ "boobs",
55
+ "slut",
56
+ "whore",
57
+ "racist",
58
+ "sexist",
59
+ "homophobic",
60
+ "transphobic",
61
+ "xenophobic"
62
+ ],
63
+ "sensitivity_threshold": 0.7,
64
+ "embedding_type": "sentence_transformer",
65
+ "model_name": "all-mpnet-base-v2"
66
+ }
models/content_moderation_sentence_transformer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9747086d0f1d6e0c6262d6388db23b6b89ada16858d7e8bc06ce3a028990c8ff
3
+ size 7007
models/content_moderation_sentence_transformer_cuda.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a024f3b5cdaafc6d0f34de926f7819474a4bef503a0df03875f98f4d82fe76c3
3
+ size 7007
models/content_moderation_sentence_transformer_embedder.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dea50d157098740447cb1d4a1e6bc2acc4367f1969bedaa6c3fba2ae404ab98
3
+ size 438525899
models/content_moderation_sentence_transformer_embedder_cuda.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a52e0ef7d9ae61ac0bcd6369a0e4ca50ad263043e30bc55df37b1498a91ad66b
3
+ size 438526502