Alshargi committed on
Commit
99c01ad
·
verified ·
1 Parent(s): fceb57e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -1
app.py CHANGED
@@ -1,3 +1,89 @@
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  repo_id = "Alshargi/arabic-msa-dialects-segmentation"
3
- gr.Interface.load(f"huggingface/{repo_id}").launch()
 
 
1
+ import joblib
2
  import gradio as gr
3
+ from nltk import word_tokenize
4
+ import re
5
+
6
+ # Load the scikit-learn model
7
+ clf = joblib.load('arabic-msa-dialects-segmentation-v1.pkl')
8
+
9
+ # Function to define features for each word in the sentence
10
+ def features(sentence, index):
11
+ return {
12
+ 'word': sentence[index],
13
+ 'is_first': index == 0,
14
+ 'is_last': index == len(sentence) - 1,
15
+ 'lword': len(sentence[index]),
16
+
17
+ 'prefix-1': sentence[index][:1],
18
+ 'prefix-2': sentence[index][:2],
19
+ 'prefix-3': sentence[index][:3],
20
+ 'prefix-4': sentence[index][:4],
21
+ 'prefix-5': sentence[index][:5],
22
+
23
+ 'suffix-1': sentence[index][-1],
24
+ 'suffix-2': sentence[index][-2:],
25
+ 'suffix-3': sentence[index][-3:],
26
+ 'suffix-4': sentence[index][-4:],
27
+ 'suffix-5': sentence[index][-5:],
28
+
29
+ 'prev_word_4': prvwords_4(sentence, index),
30
+ 'prev_word_3': prvwords_3(sentence, index),
31
+ 'prev_word_2': prvwords_2(sentence, index),
32
+ 'prev_word_1': prvwords_1(sentence, index),
33
+
34
+
35
+ 'next_word_1': nextwords_1(sentence, index),
36
+ 'next_word_2': nextwords_2(sentence, index),
37
+ 'next_word_3': nextwords_3(sentence, index),
38
+ 'next_word_4': nextwords_4(sentence, index),
39
+
40
+ 'is_numeric': sentence[index].isdigit(),
41
+ }
42
+
43
+ # Function to rebuild the word based on the segmentation results
44
+ def rebuildxx(ww, xres):
45
+ numprfx = xres.count('p')
46
+ numsufx = xres.count('f')
47
+ resfinal = ''
48
+ if numprfx != 0 and numsufx != 0 :
49
+ resfinal = "{}+{}+{}".format(ww[:numprfx] , ww[numprfx:-numsufx] , ww[-numsufx:] )
50
+ if numprfx == 0 and numsufx == 0 :
51
+ #resfinal = "{}+{}+{}".format("", ww , "" )
52
+ resfinal = "{}".format(ww )
53
+
54
+ if numprfx == 0 and numsufx != 0 :
55
+ #resfinal = "{}+{}+{}".format("" , ww[:-numsufx], ww[-numsufx:] )
56
+ resfinal = "{}+{}".format(ww[:-numsufx], ww[-numsufx:] )
57
+
58
+ if numprfx != 0 and numsufx == 0 :
59
+ #resfinal = "{}+{}+{}".format(ww[:numprfx] , ww[numprfx:], "")
60
+ resfinal = "{}+{}".format(ww[:numprfx] , ww[numprfx:])
61
+
62
+ return resfinal
63
+
64
+
65
+
66
+ # Function to preprocess text
67
+ def prepare_text(text):
68
+ # Define regular expression pattern to match symbols and punctuation from any language
69
+ symbol_pattern = r'([^\w\s\d])' # Capture non-word, non-space, non-digit characters
70
+ prepared_text = re.sub(symbol_pattern, r' \1 ', text)
71
+ prepared_text = re.sub(r'\s+', ' ', prepared_text)
72
+
73
+ return prepared_text.strip() # Remove leading and trailing spaces
74
+
75
+
76
+ # Prediction function
77
+ def predict(text):
78
+ preprocessed_text = prepare_text(text)
79
+ tokenized_text = word_tokenize(preprocessed_text)
80
+ result = clf.predict([features(tokenized_text, index) for index in range(len(tokenized_text))])
81
+ segmented_text = ""
82
+ for word, segmentation in zip(tokenized_text, result):
83
+ segmented_text += rebuildxx(word, segmentation) + " "
84
+ return segmented_text
85
+
86
+ # Interface
87
  repo_id = "Alshargi/arabic-msa-dialects-segmentation"
88
+ iface = gr.Interface.load(f"huggingface/{repo_id}", inputs=gr.inputs.Textbox(lines=5, label="Input Text"), outputs="text")
89
+ iface.launch()