File size: 2,657 Bytes
292d884
 
 
 
 
 
 
 
 
434c6fc
dd1f349
434c6fc
 
 
 
292d884
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
434c6fc
292d884
 
 
 
 
 
434c6fc
292d884
 
 
434c6fc
292d884
 
434c6fc
292d884
 
434c6fc
292d884
 
 
 
 
 
dd1f349
fe3474b
 
 
dd1f349
 
 
 
 
 
 
423a505
33940f8
bc5fd0a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import logging
import os

import gradio as gr
import joblib
import requests
from lime.lime_text import LimeTextExplainer

# Constants
# Class labels, indexed by the classifier's integer prediction.
CLASSES = ["Non-Hate Speech", "Hate Speech"]

# Words that are never reported as hate words, even when the explainer gives
# them a positive weight.
# NOTE(review): these literals do not look like Georgian script although the UI
# asks for Georgian text -- they may be UTF-8 decoded with the wrong codec.
# Confirm against the original file before relying on this list.
STOPWORDS = {
    "แƒ™แƒ˜", "แƒแƒ แƒ", "แƒ“แƒ", "แƒ แƒแƒ›", "แƒ แƒแƒ“แƒ’แƒแƒœ", "แƒ˜แƒก", "แƒ”แƒก", "แƒ แƒ", "แƒ›แƒแƒก", "แƒ›แƒ˜แƒกแƒ˜",
    # The comma between the 4th and 5th entries below was missing, which made
    # Python concatenate them into a single bogus stopword.
    "แƒจแƒ”แƒœแƒ˜", "แƒฉแƒ”แƒ›แƒ˜", "แƒ แƒแƒ“", "แƒ แƒแƒขแƒแƒ›", "แƒ›แƒ”แƒ แƒ”", "แƒแƒœ", "แƒแƒฃ", "แƒแƒ›แƒ˜แƒก", "แƒ˜แƒ›แƒ˜แƒก",
    "แƒ แƒแƒ›แƒช", "แƒ”แƒ”", "แƒ”แƒ”แƒ”", "แƒฎแƒแƒ ", "แƒ•แƒแƒ ", "แƒ แƒแƒ’แƒแƒ แƒช", "แƒ แƒแƒช", "แƒ แƒแƒ“แƒ”แƒกแƒแƒช",
    "แƒกแƒแƒ“แƒแƒช", "แƒ—แƒฃ", "แƒ แƒ", "แƒ แƒแƒ›แƒ”แƒšแƒ˜", "แƒ แƒแƒ›แƒšแƒ˜แƒช", "แƒ แƒแƒ“แƒ˜แƒก", "แƒ แƒแƒฆแƒ", "แƒ›แƒแƒ’แƒ แƒแƒ›",
    "แƒแƒ ", "แƒแƒฅ", "แƒ˜แƒฅ", "แƒจแƒ”แƒ›แƒ“แƒ”แƒ’", "แƒกแƒแƒ“", "แƒ›แƒ”", "แƒจแƒ”แƒœ", "แƒ—แƒฅแƒ•แƒ”แƒœ", "แƒ›แƒ˜แƒ”แƒ ",
    "แƒ•แƒ˜แƒœ", "แƒ แƒแƒ’แƒแƒ ", "แƒ—แƒฃแƒœแƒ“แƒแƒช", "แƒ แƒแƒ—แƒ", "แƒ˜แƒกแƒ˜แƒœแƒ˜", "แƒ•แƒ˜แƒœแƒช", "แƒ แƒแƒขแƒ",
}

MODEL_URL = "https://raw.githubusercontent.com/RevazRevazashvili/geo-hate-speech-analysis/main/models/tfidf_logreg_classifier.pkl"
MODEL_PATH = "tfidf_logreg_classifier.pkl"

# Download the serialized classifier only when no local copy exists; later
# starts reuse the file at MODEL_PATH.
if not os.path.exists(MODEL_PATH):
    # The timeout keeps startup from hanging on a stalled connection, and
    # raise_for_status() stops an HTTP error page from being saved as the model.
    r = requests.get(MODEL_URL, timeout=60)
    r.raise_for_status()
    # Write under a temporary name and rename afterwards, so an interrupted
    # write never leaves a truncated file that the existence check above would
    # trust on the next start.
    _partial_path = MODEL_PATH + ".part"
    with open(_partial_path, "wb") as f:
        f.write(r.content)
    os.replace(_partial_path, MODEL_PATH)

# NOTE(security): joblib.load unpickles the file, and unpickling can execute
# arbitrary code. The artifact comes from a remote URL, so consider pinning a
# commit hash and verifying a checksum before loading.
model = joblib.load(MODEL_PATH)

def is_undecided(prob, *, lower=0.35, upper=0.7):
    """Return True when *prob* lies strictly inside the low-confidence band.

    Args:
        prob: Probability to test.
        lower: Exclusive lower bound of the band (default 0.35).
        upper: Exclusive upper bound of the band (default 0.7).

    Returns:
        True if ``lower < prob < upper``; False otherwise, including when
        *prob* equals either bound.
    """
    return lower < prob < upper

def get_hate_words(text):
    """Return the words that push *text* toward the "Hate Speech" class.

    The text is classified first. An empty list is returned when the
    predicted class is not "Hate Speech" or when the hate-speech probability
    is in the undecided band (see ``is_undecided``). Otherwise a LIME
    explanation is computed, and the words with a positive weight that are
    not in ``STOPWORDS`` are returned.

    Args:
        text: The input string to analyse.

    Returns:
        A list of words, in the order LIME reports them; empty when nothing
        qualifies or when the explanation step fails.
    """
    # Classify before explaining: the LIME explanation is the expensive step,
    # and its result is discarded unless the text is confidently classified
    # as hate speech.
    predicted_label = CLASSES[int(model.predict([text])[0])]
    hate_probability = model.predict_proba([text])[0][-1]
    if is_undecided(hate_probability) or predicted_label != "Hate Speech":
        return []

    explainer = LimeTextExplainer(class_names=CLASSES)
    try:
        explanation = explainer.explain_instance(
            text, model.predict_proba, num_features=10
        )
        weighted_words = explanation.as_list()
    except Exception:
        # Best effort: a failed explanation yields "no words" rather than an
        # error for the caller, but the failure is logged instead of being
        # silently dropped.
        logging.getLogger(__name__).exception("LIME explanation failed")
        return []

    return [
        word
        for word, score in weighted_words
        if score > 0 and word not in STOPWORDS
    ]

def api_predict(text):
    """Build the JSON payload for *text*: its hate words under "hate_words"."""
    return {"hate_words": get_hate_words(text)}

# UI definition: one text input, one JSON output, and a button that runs
# api_predict on the input and shows the result.
with gr.Blocks() as demo:
    input_text = gr.Textbox(label="Enter Georgian text")
    output_json = gr.JSON(label="Detected Hate Words")
    submit_btn = gr.Button("Predict")
    
    submit_btn.click(
        fn=api_predict, 
        inputs=input_text, 
        outputs=output_json,
        api_name="predict"  # names the API endpoint for this handler "predict"
    )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    # Listens on all interfaces at port 7860. share=True additionally asks
    # Gradio for a public share link, and ssr_mode=False turns off server-side
    # rendering (per the Gradio launch() documentation).
    demo.launch(share=True, server_name="0.0.0.0", server_port=7860, ssr_mode=False)