Alshargi committed on
Commit
ee28f70
·
verified ·
1 Parent(s): 16bf7ac

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -89
app.py DELETED
@@ -1,89 +0,0 @@
1
- import joblib
2
- import gradio as gr
3
- from nltk import word_tokenize
4
- import re
5
-
6
- # Load the pickled scikit-learn segmentation model once, at import time (the relative path is resolved against the current working directory)
7
- clf = joblib.load('arabic-msa-dialects-segmentation-v1.pkl')
8
-
# Function to define features for each word in the sentence
def features(sentence, index):
    """Build the feature dictionary for the token at ``sentence[index]``.

    Args:
        sentence: Sequence of token strings.
        index: Position of the token to describe.

    Returns:
        A dict mapping feature names to values: the token itself, first/last
        position flags, the token length, its prefixes and suffixes of one to
        five characters, the context values returned by the
        ``prvwords_*`` / ``nextwords_*`` helpers, and a digit flag.

    Raises:
        IndexError: If ``index`` is out of range for ``sentence``.
    """
    # NOTE(review): the prvwords_* / nextwords_* helpers are not defined in
    # the visible part of this file; they must be provided elsewhere for this
    # function to run.
    word = sentence[index]
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'lword': len(word),

        'prefix-1': word[:1],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'prefix-4': word[:4],
        'prefix-5': word[:5],

        # Slice (not index) so an empty token yields '' instead of raising
        # IndexError; for non-empty tokens the value is unchanged.
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'suffix-4': word[-4:],
        'suffix-5': word[-5:],

        'prev_word_4': prvwords_4(sentence, index),
        'prev_word_3': prvwords_3(sentence, index),
        'prev_word_2': prvwords_2(sentence, index),
        'prev_word_1': prvwords_1(sentence, index),

        'next_word_1': nextwords_1(sentence, index),
        'next_word_2': nextwords_2(sentence, index),
        'next_word_3': nextwords_3(sentence, index),
        'next_word_4': nextwords_4(sentence, index),

        'is_numeric': word.isdigit(),
    }
42
-
# Function to rebuild the word based on the segmentation results
def rebuildxx(ww, xres):
    """Split ``ww`` into prefix, stem and suffix joined by ``+``.

    The number of ``'p'`` characters in ``xres`` is taken as the prefix
    length and the number of ``'f'`` characters as the suffix length.

    Args:
        ww: The word to segment.
        xres: Label string for the word; only its ``'p'`` and ``'f'`` counts
            are used.

    Returns:
        The non-empty segments of ``ww`` joined with ``'+'``. A word with no
        prefix and no suffix is returned unchanged.
    """
    # Clamp both counts so the prefix and suffix can never overlap or run past
    # the word; otherwise characters would be duplicated in the output.
    prefix_len = min(xres.count('p'), len(ww))
    suffix_len = min(xres.count('f'), len(ww) - prefix_len)
    stem_end = len(ww) - suffix_len

    segments = (ww[:prefix_len], ww[prefix_len:stem_end], ww[stem_end:])
    # Drop empty segments so no dangling or doubled '+' is emitted.
    return "+".join(segment for segment in segments if segment)
63
-
64
-
65
-
# Any character that is neither a word character nor whitespace. Digits are
# already covered by \w, so no separate \d is needed.
_SYMBOL_RE = re.compile(r'([^\w\s])')
_WHITESPACE_RE = re.compile(r'\s+')


# Function to preprocess text
def prepare_text(text):
    """Surround symbols and punctuation with spaces and normalize whitespace.

    Args:
        text: Raw input string.

    Returns:
        ``text`` with every symbol/punctuation character separated from its
        neighbours by single spaces, runs of whitespace collapsed to one
        space, and leading/trailing whitespace removed.
    """
    import unicodedata

    def _pad_symbol(match):
        char = match.group(1)
        # Python's \w does not match combining marks (e.g. Arabic diacritics),
        # so they reach this callback; keep them attached to their base
        # letter instead of splitting the word apart.
        if unicodedata.category(char).startswith('M'):
            return char
        return f' {char} '

    spaced = _SYMBOL_RE.sub(_pad_symbol, text)
    return _WHITESPACE_RE.sub(' ', spaced).strip()
74
-
75
-
# Prediction function
def predict(text):
    """Segment every token of ``text`` and return the result as one string.

    The text is preprocessed with ``prepare_text``, tokenized with
    ``word_tokenize``, described token by token with ``features``, labelled
    by ``clf.predict`` and rebuilt with ``rebuildxx``.

    Args:
        text: Raw input string.

    Returns:
        The segmented tokens joined by single spaces, or ``""`` when the
        input contains no tokens.
    """
    tokens = word_tokenize(prepare_text(text))
    if not tokens:
        # Nothing to label; avoid handing the model an empty batch.
        return ""

    feature_rows = [features(tokens, position) for position in range(len(tokens))]
    labels = clf.predict(feature_rows)
    return " ".join(rebuildxx(token, label) for token, label in zip(tokens, labels))
85
-
# Interface: build the demo UI from the "huggingface/<repo_id>" target and
# start serving it.
# NOTE(review): the local `predict` function is never passed to this
# interface -- confirm that loading the hosted model is what is intended.
repo_id = "Alshargi/arabic-msa-dialects-segmentation"
_input_box = gr.inputs.Textbox(lines=5, label="Input Text")
iface = gr.Interface.load(
    f"huggingface/{repo_id}",
    inputs=_input_box,
    outputs="text",
)
iface.launch()