Update app.py
Browse files
app.py
CHANGED
|
@@ -1,3 +1,89 @@
|
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
repo_id = "Alshargi/arabic-msa-dialects-segmentation"
|
| 3 |
-
gr.Interface.load(f"huggingface/{repo_id}"
|
|
|
|
|
|
| 1 |
+
import joblib
|
| 2 |
import gradio as gr
|
| 3 |
+
from nltk import word_tokenize
|
| 4 |
+
import re
|
| 5 |
+
|
| 6 |
+
# Load the pre-trained scikit-learn segmentation model from disk.
# NOTE(review): joblib.load unpickles this file — unpickling can execute
# arbitrary code, so the .pkl must come from a trusted source.
clf = joblib.load('arabic-msa-dialects-segmentation-v1.pkl')
|
| 8 |
+
|
| 9 |
+
# Build the per-token feature dict fed to the segmentation classifier.
#
# BUG FIX(review): the original body called prvwords_1..4 / nextwords_1..4,
# which are not defined anywhere in this file, so every call raised NameError.
# They are replaced by _word_at below, which yields the neighbouring token or
# '' when the offset falls outside the sentence — TODO confirm '' matches the
# sentinel the model saw at training time.
def _word_at(sentence, index, offset):
    """Return sentence[index + offset], or '' if that position is out of range."""
    pos = index + offset
    return sentence[pos] if 0 <= pos < len(sentence) else ''


def features(sentence, index):
    """Return the feature mapping for the token at ``sentence[index]``.

    Parameters:
        sentence: list of token strings.
        index: position of the token to featurize.

    Returns:
        dict of string-keyed features consumed by ``clf.predict``.
    """
    word = sentence[index]
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'lword': len(word),

        # Character prefixes of length 1..5; shorter words just yield the whole word.
        'prefix-1': word[:1],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'prefix-4': word[:4],
        'prefix-5': word[:5],

        # Character suffixes of length 1..5.
        # NOTE(review): original used word[-1] for suffix-1, which raises
        # IndexError on an empty token; word[-1:] is identical for non-empty
        # tokens and returns '' instead of crashing on ''.
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'suffix-4': word[-4:],
        'suffix-5': word[-5:],

        # Context window: up to four tokens on each side ('' past the edges).
        'prev_word_4': _word_at(sentence, index, -4),
        'prev_word_3': _word_at(sentence, index, -3),
        'prev_word_2': _word_at(sentence, index, -2),
        'prev_word_1': _word_at(sentence, index, -1),

        'next_word_1': _word_at(sentence, index, 1),
        'next_word_2': _word_at(sentence, index, 2),
        'next_word_3': _word_at(sentence, index, 3),
        'next_word_4': _word_at(sentence, index, 4),

        'is_numeric': word.isdigit(),
    }
|
| 42 |
+
|
| 43 |
+
# Reassemble a token from its predicted segmentation-label string.
def rebuildxx(ww, xres):
    """Return ``ww`` rendered as ``prefix+stem+suffix`` per the labels in ``xres``.

    The label string marks prefix characters with 'p' and suffix characters
    with 'f'; the counts of those labels decide where the word is cut.  A word
    with neither prefix nor suffix is returned unchanged.
    """
    n_pre = xres.count('p')
    n_suf = xres.count('f')

    if n_pre and n_suf:
        # Cut on both ends: prefix + stem + suffix.
        return "+".join((ww[:n_pre], ww[n_pre:-n_suf], ww[-n_suf:]))
    if n_pre:
        # Prefix only.
        return "+".join((ww[:n_pre], ww[n_pre:]))
    if n_suf:
        # Suffix only.
        return "+".join((ww[:-n_suf], ww[-n_suf:]))
    # No affixes detected.
    return ww
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# Normalise raw input text before tokenisation.
def prepare_text(text):
    """Pad punctuation with spaces and squeeze whitespace.

    Every character that is not a word character, digit, or whitespace gets a
    space on each side (so the tokenizer sees punctuation as its own token),
    runs of whitespace collapse to a single space, and the ends are trimmed.
    """
    spaced = re.sub(r'([^\w\s\d])', r' \1 ', text)
    collapsed = re.sub(r'\s+', ' ', spaced)
    return collapsed.strip()
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# End-to-end segmentation pipeline: normalise -> tokenise -> classify -> rebuild.
def predict(text):
    """Segment *text* and return the space-joined segmented tokens.

    Runs the text through prepare_text and nltk's word_tokenize, predicts one
    segmentation label per token with the loaded classifier, and rebuilds each
    token via rebuildxx.  Non-empty output carries a trailing space.
    """
    tokens = word_tokenize(prepare_text(text))
    labels = clf.predict([features(tokens, i) for i in range(len(tokens))])
    pieces = []
    for tok, lab in zip(tokens, labels):
        pieces.append(rebuildxx(tok, lab) + " ")
    return "".join(pieces)
|
| 85 |
+
|
| 86 |
+
# Gradio interface
repo_id = "Alshargi/arabic-msa-dialects-segmentation"

# BUG FIX(review): the original used gr.Interface.load(f"huggingface/{repo_id}", ...),
# which pulls a remote hosted pipeline and silently ignores the local `predict`
# pipeline defined above; gr.inputs.Textbox is also gone in Gradio 4.x
# (AttributeError at import time). Wire the interface to the local model instead.
iface = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(lines=5, label="Input Text"),
    outputs="text",
    title=repo_id,
)
iface.launch()
|