Abrhaley committed on
Commit
9027a7a
·
verified ·
1 Parent(s): 4f654c1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -19
app.py CHANGED
@@ -1,28 +1,66 @@
1
  import gradio as gr
2
- from transformers import pipeline
 
 
 
 
 
 
3
 
4
- # Load your Tigrinya model (replace with your actual model)
5
- # For now, using a multilingual model as placeholder
6
- classifier = pipeline(
7
- "text-classification",
8
- model="Davlan/bert-base-multilingual-cased-finetuned-amharic", # Replace with your model
9
- tokenizer="Davlan/bert-base-multilingual-cased-finetuned-amharic"
10
- )
11
 
12
- def detect_harmful(text):
13
- if not text.strip():
14
- return "Please enter Tigrinya text"
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
- results = classifier(text)
17
- return {
18
- "Harmful" if res["label"] == "LABEL_1" else "Safe": res["score"]
19
- for res in results
 
 
20
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
 
22
  gr.Interface(
23
- fn=detect_harmful,
24
- inputs=gr.Textbox(label="Enter Tigrinya Text", placeholder="ጽሑፍ ኣብዚ..."),
25
- outputs=gr.Label(label="Detection Results"),
 
26
  title="ጉዳት ዘለዎ ጽሑፍ ኣሳታሚ ትግርኛ (Tigrinya Harmful Content Detector)",
27
- description="ብትግርኛ ዝተጻሕፈ ጉዳት ዘለዎ ጽሑፍ ይለለጥ።"
 
 
 
 
28
  ).launch()
 
1
  import gradio as gr
2
+ import joblib
3
+ import pandas as pd
4
+ import numpy as np
5
+ import json
6
+ import re
7
+ from nltk.stem import SnowballStemmer
8
+ from sklearn.feature_extraction.text import TfidfVectorizer
9
 
10
# Load the trained classifier and the feature-name metadata stored next to it.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code; model.joblib must only ever come from a trusted source.
model = joblib.load('model.joblib')

# JSON is UTF-8 by specification, so pin the encoding instead of relying on
# the platform default.
with open('feature_names.json', 'r', encoding='utf-8') as f:
    # Column names used to lay out the feature frame handed to the model.
    feature_names = json.load(f)['feature_names']
 
 
 
14
 
15
+ # Tigrinya text preprocessing
16
def preprocess_tigrinya(text):
    """Clean and stem raw input text before feature extraction.

    Every character that is neither a word character nor whitespace is
    removed, the result is lowercased, and each whitespace-separated token
    is passed through NLTK's Snowball stemmer configured for Arabic.

    NOTE(review): the Arabic stemmer was picked by the author as the
    "closest match" for Tigrinya; whether it has any useful effect on
    Ge'ez-script text should be confirmed.

    Args:
        text: The raw input string.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    # Basic cleaning: strip punctuation/symbols, then lowercase.
    cleaned = re.sub(r'[^\w\s]', '', text).lower()

    # Stem token by token and stitch the tokens back together.
    arabic_stemmer = SnowballStemmer("arabic")
    return " ".join(map(arabic_stemmer.stem, cleaned.split()))
26
+
27
+ # Feature extraction
28
def extract_features(text):
    """Build the single-row feature frame for one piece of input text.

    The text is run through ``preprocess_tigrinya`` and four hand-crafted
    features are computed from the result: token count, distinct-token
    count, character count, and a 0/1 flag for the presence of a keyword
    from a small hard-coded list.

    Args:
        text: The raw input string.

    Returns:
        A one-row ``pandas.DataFrame`` whose columns are ``feature_names``
        followed by any computed feature not listed there. Columns with no
        computed value are filled with 0.
    """
    processed = preprocess_tigrinya(text)
    tokens = processed.split()  # split once, reuse for both counts

    # Create feature vector (customize based on your original features)
    features = {
        "word_count": len(tokens),
        "unique_words": len(set(tokens)),
        "char_count": len(processed),
        "contains_hate_keyword": int(any(kw in processed for kw in ["ዘሕደረ", "ጸረ"])),
    }

    # DataFrame.append was removed in pandas 2.0, so build the row directly
    # and reindex it. The column layout matches what append() used to give:
    # the expected columns first, then any extra computed features.
    columns = list(feature_names) + [
        name for name in features if name not in feature_names
    ]
    return pd.DataFrame([features]).reindex(columns=columns, fill_value=0)
43
+
44
+ # Prediction function
45
def predict(text):
    """Classify one piece of text as harmful or safe.

    Args:
        text: The text from the input box. ``None``, an empty string, or a
            whitespace-only string is treated as "no input".

    Returns:
        A prompt string asking for text when there is no input. Otherwise a
        dict mapping the two display labels to probabilities taken from
        ``model.predict_proba``.
    """
    # Guard against None as well as blank text: calling .strip() on None
    # would raise AttributeError.
    if not text or not text.strip():
        return "ጽሑፍ ኣእትዉ! (Please enter text)"

    features_df = extract_features(text)
    proba = model.predict_proba(features_df)[0]

    # NOTE(review): assumes index 1 is the "harmful" class and index 0 the
    # "safe" class; confirm against model.classes_.
    return {"ጉዳት ዘለዎ (Harmful)": float(proba[1]),
            "ሰላማዊ (Safe)": float(proba[0])}
53
 
54
# Gradio interface: describe the input and output widgets, then wire them to
# predict() and start the app.
_text_input = gr.Textbox(
    label="ትግርኛ ጽሑፍ ኣእትዉ (Enter Tigrinya Text)",
    placeholder="እዚ ጽሑፍ ጉዳት ዘለዎ ይመስል...",
)
_prediction_output = gr.Label(label="ውጽኢት (Prediction)")

# Sample inputs offered in the UI.
_sample_inputs = [
    ["እዚ ጽሑፍ ጥሩ እዩ"],  # Safe example
    ["ኣፍታዊ ጥልመት ኣለካ!"],  # Harmful example
]

gr.Interface(
    fn=predict,
    inputs=_text_input,
    outputs=_prediction_output,
    title="ጉዳት ዘለዎ ጽሑፍ ኣሳታሚ ትግርኛ (Tigrinya Harmful Content Detector)",
    description="ብትግርኛ ዝተጻሕፈ ጉዳት ዘለዎ ጽሑፍ ይለለጥ።",
    examples=_sample_inputs,
).launch()