Delete app.py
Browse files
app.py
DELETED
|
@@ -1,89 +0,0 @@
|
|
| 1 |
-
import joblib
|
| 2 |
-
import gradio as gr
|
| 3 |
-
from nltk import word_tokenize
|
| 4 |
-
import re
|
| 5 |
-
|
| 6 |
-
# Load the scikit-learn model.
# NOTE(review): presumably a token-level classifier trained to emit one
# label character per input character ('p' prefix / 'f' suffix — see
# rebuildxx below); the .pkl file must sit next to this script. Confirm
# against the model card.
clf = joblib.load('arabic-msa-dialects-segmentation-v1.pkl')
-
# Function to define features for each word in the sentence
def _prev_word(sentence, index, n):
    """Return the word *n* positions before *index*, or '' when out of range."""
    return sentence[index - n] if index - n >= 0 else ''


def _next_word(sentence, index, n):
    """Return the word *n* positions after *index*, or '' when out of range."""
    return sentence[index + n] if index + n < len(sentence) else ''


def features(sentence, index):
    """Build the feature dict for the word at *sentence[index]*.

    *sentence* is a list of word tokens; the dict feeds the pre-trained
    segmentation classifier (character prefixes/suffixes plus a +/-4 word
    context window).

    The original file called prvwords_1..4 / nextwords_1..4, which were
    never defined anywhere in it (NameError at prediction time); they are
    replaced here by _prev_word/_next_word.
    NOTE(review): '' is assumed to be the out-of-range sentinel the model
    was trained with — confirm against the training pipeline.
    """
    word = sentence[index]
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'lword': len(word),

        'prefix-1': word[:1],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'prefix-4': word[:4],
        'prefix-5': word[:5],

        # [-1:] instead of [-1]: same value for non-empty tokens, but no
        # IndexError if an empty string ever slips through tokenization.
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'suffix-4': word[-4:],
        'suffix-5': word[-5:],

        'prev_word_4': _prev_word(sentence, index, 4),
        'prev_word_3': _prev_word(sentence, index, 3),
        'prev_word_2': _prev_word(sentence, index, 2),
        'prev_word_1': _prev_word(sentence, index, 1),

        'next_word_1': _next_word(sentence, index, 1),
        'next_word_2': _next_word(sentence, index, 2),
        'next_word_3': _next_word(sentence, index, 3),
        'next_word_4': _next_word(sentence, index, 4),

        'is_numeric': word.isdigit(),
    }
|
| 42 |
-
|
| 43 |
-
# Function to rebuild the word based on the segmentation results
def rebuildxx(ww, xres):
    """Reassemble word *ww* into 'prefix+stem+suffix' display form.

    *xres* is the label string predicted for the word: each 'p' counts one
    prefix character and each 'f' one suffix character; everything between
    them is the stem. Segments are joined with '+'; a word with neither
    prefix nor suffix is returned unchanged.
    """
    n_pre = xres.count('p')
    n_suf = xres.count('f')

    if n_pre and n_suf:
        return "{}+{}+{}".format(ww[:n_pre], ww[n_pre:-n_suf], ww[-n_suf:])
    if n_suf:
        return "{}+{}".format(ww[:-n_suf], ww[-n_suf:])
    if n_pre:
        return "{}+{}".format(ww[:n_pre], ww[n_pre:])
    return "{}".format(ww)
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
# Function to preprocess text
def prepare_text(text):
    """Normalise raw input text for tokenisation.

    Pads every symbol/punctuation character (any non-word, non-space
    character, from any script) with spaces so it becomes its own token,
    then collapses all whitespace runs to single spaces and strips the
    leading/trailing space left by the padding.
    """
    spaced = re.sub(r'([^\w\s\d])', r' \1 ', text)
    collapsed = re.sub(r'\s+', ' ', spaced)
    return collapsed.strip()
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
# Prediction function
def predict(text):
    """Segment *text*: normalise, tokenise, classify each token, rebuild.

    Returns the space-joined segmented words (with a trailing space when
    the input produced any tokens, matching the original output exactly).
    """
    tokens = word_tokenize(prepare_text(text))
    labels = clf.predict([features(tokens, i) for i in range(len(tokens))])
    pieces = (rebuildxx(tok, lab) for tok, lab in zip(tokens, labels))
    return "".join(piece + " " for piece in pieces)
|
| 85 |
-
|
| 86 |
-
# Interface: serve the locally loaded model through the predict() pipeline.
# The original used gr.Interface.load(f"huggingface/{repo_id}", ...), which
# (a) uses APIs removed in Gradio 4.x (Interface.load, gr.inputs.Textbox)
# and (b) routed every request to the remote hosted model, ignoring the
# local clf / predict() defined above.
repo_id = "Alshargi/arabic-msa-dialects-segmentation"
iface = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(lines=5, label="Input Text"),
    outputs="text",
    title=repo_id,
)
iface.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|