Delete app.py
Browse files
app.py
DELETED
|
@@ -1,89 +0,0 @@
|
|
| 1 |
-
import joblib
|
| 2 |
-
import gradio as gr
|
| 3 |
-
from nltk import word_tokenize
|
| 4 |
-
import re
|
| 5 |
-
|
| 6 |
-
# Load the scikit-learn model.
# NOTE(review): presumably a token-level classifier trained to emit one
# label character per input character ('p' prefix / 'f' suffix — see
# rebuildxx below); the .pkl file must sit next to this script. Confirm
# against the model card.
clf = joblib.load('arabic-msa-dialects-segmentation-v1.pkl')
-
# Function to define features for each word in the sentence
def _prev_word(sentence, index, n):
    """Return the word *n* positions before *index*, or '' when out of range."""
    return sentence[index - n] if index - n >= 0 else ''


def _next_word(sentence, index, n):
    """Return the word *n* positions after *index*, or '' when out of range."""
    return sentence[index + n] if index + n < len(sentence) else ''


def features(sentence, index):
    """Build the feature dict for the word at *sentence[index]*.

    *sentence* is a list of word tokens; the dict feeds the pre-trained
    segmentation classifier (character prefixes/suffixes plus a +/-4 word
    context window).

    The original file called prvwords_1..4 / nextwords_1..4, which were
    never defined anywhere in it (NameError at prediction time); they are
    replaced here by _prev_word/_next_word.
    NOTE(review): '' is assumed to be the out-of-range sentinel the model
    was trained with — confirm against the training pipeline.
    """
    word = sentence[index]
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'lword': len(word),

        'prefix-1': word[:1],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'prefix-4': word[:4],
        'prefix-5': word[:5],

        # [-1:] instead of [-1]: same value for non-empty tokens, but no
        # IndexError if an empty string ever slips through tokenization.
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'suffix-4': word[-4:],
        'suffix-5': word[-5:],

        'prev_word_4': _prev_word(sentence, index, 4),
        'prev_word_3': _prev_word(sentence, index, 3),
        'prev_word_2': _prev_word(sentence, index, 2),
        'prev_word_1': _prev_word(sentence, index, 1),

        'next_word_1': _next_word(sentence, index, 1),
        'next_word_2': _next_word(sentence, index, 2),
        'next_word_3': _next_word(sentence, index, 3),
        'next_word_4': _next_word(sentence, index, 4),

        'is_numeric': word.isdigit(),
    }
|
| 42 |
-
|
| 43 |
-
# Function to rebuild the word based on the segmentation results
def rebuildxx(ww, xres):
    """Reassemble word *ww* into 'prefix+stem+suffix' display form.

    *xres* is the label string predicted for the word: each 'p' counts one
    prefix character and each 'f' one suffix character; everything between
    them is the stem. Segments are joined with '+'; a word with neither
    prefix nor suffix is returned unchanged.
    """
    n_pre = xres.count('p')
    n_suf = xres.count('f')

    if n_pre and n_suf:
        return "{}+{}+{}".format(ww[:n_pre], ww[n_pre:-n_suf], ww[-n_suf:])
    if n_suf:
        return "{}+{}".format(ww[:-n_suf], ww[-n_suf:])
    if n_pre:
        return "{}+{}".format(ww[:n_pre], ww[n_pre:])
    return "{}".format(ww)
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
# Function to preprocess text
def prepare_text(text):
    """Normalise raw input text for tokenisation.

    Pads every symbol/punctuation character (any non-word, non-space
    character, from any script) with spaces so it becomes its own token,
    then collapses all whitespace runs to single spaces and strips the
    leading/trailing space left by the padding.
    """
    spaced = re.sub(r'([^\w\s\d])', r' \1 ', text)
    collapsed = re.sub(r'\s+', ' ', spaced)
    return collapsed.strip()
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
# Prediction function
def predict(text):
    """Segment *text*: normalise, tokenise, classify each token, rebuild.

    Returns the space-joined segmented words (with a trailing space when
    the input produced any tokens, matching the original output exactly).
    """
    tokens = word_tokenize(prepare_text(text))
    labels = clf.predict([features(tokens, i) for i in range(len(tokens))])
    pieces = (rebuildxx(tok, lab) for tok, lab in zip(tokens, labels))
    return "".join(piece + " " for piece in pieces)
|
| 85 |
-
|
| 86 |
-
# Interface: serve the locally loaded model through the predict() pipeline.
# The original used gr.Interface.load(f"huggingface/{repo_id}", ...), which
# (a) uses APIs removed in Gradio 4.x (Interface.load, gr.inputs.Textbox)
# and (b) routed every request to the remote hosted model, ignoring the
# local clf / predict() defined above.
repo_id = "Alshargi/arabic-msa-dialects-segmentation"
iface = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(lines=5, label="Input Text"),
    outputs="text",
    title=repo_id,
)
iface.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|