ShynBui committed on
Commit
b5a336b
·
verified ·
1 Parent(s): a5ded10

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -4
app.py CHANGED
@@ -1,7 +1,44 @@
1
  import gradio as gr
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
5
 
6
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
 
3
+ import re
4
+ import os
5
+ import py_vncorenlp
6
 
7
def preprocess_text(text):
    """Strip URLs, file-path segments, and punctuation from *text*.

    The substitutions run in this order:

    1. anything starting with ``http`` up to the next whitespace,
    2. every ``/`` followed by one or more word characters,
    3. every remaining character that is neither a word character
       nor whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace around removed spans is left as is,
        so doubled spaces may remain.
    """
    # URLs and paths must go first: their patterns depend on ':' and '/',
    # which the punctuation pass below would otherwise delete, leaving the
    # path pattern with nothing to match.

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove file paths
    text = re.sub(r'\/\w+', '', text)

    # Remove special characters and punctuation
    text = re.sub(r'[^\w\s]', '', text)

    return text
18
+
19
def remove_escape_sequences(text):
    """Return *text* with newlines, tabs, carriage returns and backslashes deleted."""
    # The third argument of str.maketrans lists characters to delete,
    # so all four are removed in one pass.
    unwanted = str.maketrans('', '', '\n\t\r\\')
    return text.translate(unwanted)
24
+
25
def remove_html_tags(text):
    """Return *text* with every ``<...>`` span removed."""
    # '<', then any run of non-'>' characters, then the closing '>'.
    tag_pattern = r'<[^>]*>'
    return re.sub(tag_pattern, '', text)
28
+
29
# Shared segmenter instance, created lazily on the first call that needs it.
_SEGMENTER = None


def _get_segmenter():
    """Return the shared VnCoreNLP word segmenter, creating it on first use."""
    global _SEGMENTER
    if _SEGMENTER is None:
        # Building the segmenter loads the model from save_dir; doing that
        # once instead of per request keeps every later call cheap.
        _SEGMENTER = py_vncorenlp.VnCoreNLP(
            annotators=["wseg"], save_dir=os.getcwd()
        )
    return _SEGMENTER


def vi_word_segment(text):
    """Word-segment *text* with VnCoreNLP and return one space-joined string.

    Args:
        text: The text to segment.

    Returns:
        The items returned by ``word_segment`` joined with single spaces,
        or an empty string when *text* is empty or only whitespace.
    """
    # Nothing to segment; avoid loading the model for blank input.
    if not text.strip():
        return ''
    output = _get_segmenter().word_segment(text)
    return ' '.join(output)
33
+
34
def process_text(text):
    """Clean *text* and return its word-segmented form.

    Steps, in order: keep the first 256 characters, drop HTML tags,
    run ``preprocess_text``, drop escape characters, then word-segment.

    Args:
        text: Raw input string.

    Returns:
        The string produced by ``vi_word_segment`` on the cleaned text.
    """
    text = text[:256]
    # Tags are removed before preprocess_text because that step strips
    # punctuation, including the '<' and '>' that delimit a tag.
    text = remove_html_tags(text)
    text = preprocess_text(text)
    text = remove_escape_sequences(text)
    text = vi_word_segment(text)
    return text
41
+
42
if __name__ == '__main__':
    # Serve process_text through a text-in / text-out Gradio interface.
    iface = gr.Interface(
        fn=process_text,
        inputs="text",
        outputs="text",
    )
    iface.launch()