"""Gradio app that scores Tigrinya text with a pre-trained classifier.

At import time the module loads ``model.joblib`` and the expected feature
column names from ``feature_names.json``. Running the file as a script
starts the Gradio web interface.
"""

import json
import re

import gradio as gr
import joblib
import numpy as np
import pandas as pd
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Load model and metadata
model = joblib.load('model.joblib')
# Explicit UTF-8 so the file is read the same way regardless of the
# platform's locale encoding.
with open('feature_names.json', 'r', encoding='utf-8') as f:
    feature_names = json.load(f)['feature_names']

# Anything that is neither a word character nor whitespace is stripped.
_NON_WORD_RE = re.compile(r'[^\w\s]')

# Substrings whose presence sets the "contains_hate_keyword" feature.
_HATE_KEYWORDS = ("ዘሕደረ", "ጸረ")


# Tigrinya text preprocessing
def preprocess_tigrinya(text: str) -> str:
    """Return *text* with punctuation removed, lower-cased and stemmed.

    Args:
        text: Raw input text.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    # Basic cleaning
    cleaned = _NON_WORD_RE.sub('', text).lower()

    # Simple stemmer (use NLTK's Arabic stemmer as closest match)
    # NOTE(review): an Arabic stemmer is unlikely to do meaningful stemming
    # on Ge'ez-script text; it is kept so features stay consistent with
    # whatever preprocessing the model was trained with - confirm.
    # Built per call so no stemmer instance is shared between requests.
    stemmer = SnowballStemmer("arabic")
    return " ".join(stemmer.stem(word) for word in cleaned.split())


# Feature extraction
def extract_features(text: str) -> pd.DataFrame:
    """Build the single-row feature frame passed to the model.

    Args:
        text: Raw input text; it is preprocessed with
            ``preprocess_tigrinya`` before features are computed.

    Returns:
        A one-row DataFrame whose columns are exactly ``feature_names``
        (in that order). Expected columns that are not computed here are
        filled with 0.
    """
    processed = preprocess_tigrinya(text)
    tokens = processed.split()

    # Create feature vector (customize based on your original features)
    features = {
        "word_count": len(tokens),
        "unique_words": len(set(tokens)),
        "char_count": len(processed),
        "contains_hate_keyword": int(
            any(kw in processed for kw in _HATE_KEYWORDS)
        ),
    }

    # DataFrame.append was removed in pandas 2.0, so build the row directly.
    # reindex() then pins the columns to exactly what the model expects:
    # missing features become 0 and keys not in feature_names are dropped
    # instead of widening the frame.
    row = pd.DataFrame([features])
    return row.reindex(columns=feature_names, fill_value=0)


# Prediction function
def predict(text):
    """Classify *text* and return the output for the Gradio label.

    Args:
        text: Text entered by the user; may be empty or ``None``.

    Returns:
        A prompt string when no text was supplied, otherwise a dict mapping
        the two display labels to the model's predicted probabilities.
    """
    if not text or not text.strip():
        return "ጽሑፍ ኣእትዉ! (Please enter text)"

    features_df = extract_features(text)
    proba = model.predict_proba(features_df)[0]
    # NOTE(review): assumes a binary model where index 1 is the harmful
    # class and index 0 the safe class - confirm against model.classes_.
    return {
        "ጉዳት ዘለዎ (Harmful)": float(proba[1]),
        "ሰላማዊ (Safe)": float(proba[0]),
    }


# Gradio interface
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(
        label="ትግርኛ ጽሑፍ ኣእትዉ (Enter Tigrinya Text)",
        placeholder="እዚ ጽሑፍ ጉዳት ዘለዎ ይመስል...",
    ),
    outputs=gr.Label(label="ውጽኢት (Prediction)"),
    title="ጉዳት ዘለዎ ጽሑፍ ኣሳታሚ ትግርኛ (Tigrinya Harmful Content Detector)",
    description="ብትግርኛ ዝተጻሕፈ ጉዳት ዘለዎ ጽሑፍ ይለለጥ።",
    examples=[
        ["እዚ ጽሑፍ ጥሩ እዩ"],  # Safe example
        ["ኣፍታዊ ጥልመት ኣለካ!"],  # Harmful example
    ],
)

# Launch only when executed as a script, so importing the module does not
# start (and block on) the web server.
if __name__ == "__main__":
    demo.launch()