Alshargi committed on
Commit
99c01ad
·
verified ·
1 Parent(s): fceb57e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -1
app.py CHANGED
@@ -1,3 +1,89 @@
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  repo_id = "Alshargi/arabic-msa-dialects-segmentation"
3
- gr.Interface.load(f"huggingface/{repo_id}").launch()
 
 
1
+ import joblib
2
  import gradio as gr
3
+ from nltk import word_tokenize
4
+ import re
5
+
6
+ # Load the scikit-learn model
7
+ clf = joblib.load('arabic-msa-dialects-segmentation-v1.pkl')
8
+
9
+ # Function to define features for each word in the sentence
10
+ def features(sentence, index):
11
+ return {
12
+ 'word': sentence[index],
13
+ 'is_first': index == 0,
14
+ 'is_last': index == len(sentence) - 1,
15
+ 'lword': len(sentence[index]),
16
+
17
+ 'prefix-1': sentence[index][:1],
18
+ 'prefix-2': sentence[index][:2],
19
+ 'prefix-3': sentence[index][:3],
20
+ 'prefix-4': sentence[index][:4],
21
+ 'prefix-5': sentence[index][:5],
22
+
23
+ 'suffix-1': sentence[index][-1],
24
+ 'suffix-2': sentence[index][-2:],
25
+ 'suffix-3': sentence[index][-3:],
26
+ 'suffix-4': sentence[index][-4:],
27
+ 'suffix-5': sentence[index][-5:],
28
+
29
+ 'prev_word_4': prvwords_4(sentence, index),
30
+ 'prev_word_3': prvwords_3(sentence, index),
31
+ 'prev_word_2': prvwords_2(sentence, index),
32
+ 'prev_word_1': prvwords_1(sentence, index),
33
+
34
+
35
+ 'next_word_1': nextwords_1(sentence, index),
36
+ 'next_word_2': nextwords_2(sentence, index),
37
+ 'next_word_3': nextwords_3(sentence, index),
38
+ 'next_word_4': nextwords_4(sentence, index),
39
+
40
+ 'is_numeric': sentence[index].isdigit(),
41
+ }
42
+
43
+ # Function to rebuild the word based on the segmentation results
44
+ def rebuildxx(ww, xres):
45
+ numprfx = xres.count('p')
46
+ numsufx = xres.count('f')
47
+ resfinal = ''
48
+ if numprfx != 0 and numsufx != 0 :
49
+ resfinal = "{}+{}+{}".format(ww[:numprfx] , ww[numprfx:-numsufx] , ww[-numsufx:] )
50
+ if numprfx == 0 and numsufx == 0 :
51
+ #resfinal = "{}+{}+{}".format("", ww , "" )
52
+ resfinal = "{}".format(ww )
53
+
54
+ if numprfx == 0 and numsufx != 0 :
55
+ #resfinal = "{}+{}+{}".format("" , ww[:-numsufx], ww[-numsufx:] )
56
+ resfinal = "{}+{}".format(ww[:-numsufx], ww[-numsufx:] )
57
+
58
+ if numprfx != 0 and numsufx == 0 :
59
+ #resfinal = "{}+{}+{}".format(ww[:numprfx] , ww[numprfx:], "")
60
+ resfinal = "{}+{}".format(ww[:numprfx] , ww[numprfx:])
61
+
62
+ return resfinal
63
+
64
+
65
+
66
+ # Function to preprocess text
67
+ def prepare_text(text):
68
+ # Define regular expression pattern to match symbols and punctuation from any language
69
+ symbol_pattern = r'([^\w\s\d])' # Capture non-word, non-space, non-digit characters
70
+ prepared_text = re.sub(symbol_pattern, r' \1 ', text)
71
+ prepared_text = re.sub(r'\s+', ' ', prepared_text)
72
+
73
+ return prepared_text.strip() # Remove leading and trailing spaces
74
+
75
+
76
+ # Prediction function
77
+ def predict(text):
78
+ preprocessed_text = prepare_text(text)
79
+ tokenized_text = word_tokenize(preprocessed_text)
80
+ result = clf.predict([features(tokenized_text, index) for index in range(len(tokenized_text))])
81
+ segmented_text = ""
82
+ for word, segmentation in zip(tokenized_text, result):
83
+ segmented_text += rebuildxx(word, segmentation) + " "
84
+ return segmented_text
85
+
86
+ # Interface
87
  repo_id = "Alshargi/arabic-msa-dialects-segmentation"
88
+ iface = gr.Interface.load(f"huggingface/{repo_id}", inputs=gr.inputs.Textbox(lines=5, label="Input Text"), outputs="text")
89
+ iface.launch()