Ralfouzan commited on
Commit
5792bf1
·
1 Parent(s): 037ac5a

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +199 -0
app.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from GoogleNews import GoogleNews
3
+
4
+ import pandas as pd
5
+ import numpy as np
6
+ import spacy
7
+ import gensim
8
+ import string
9
+ import re
10
+
11
+ import sklearn
12
+ from sklearn.metrics import classification_report, recall_score, precision_score, accuracy_score, f1_score
13
+ from sklearn.metrics.pairwise import cosine_similarity
14
+
15
# Load the spaCy pipeline whose vectors are used to compare headlines.
# NOTE(review): Streamlit re-runs this script on every interaction, so the
# model is loaded again each time — consider caching the load.
nlp = spacy.load("spacy.aravec.model")

#---------------------------------------------------------------------------------------------------------------
#---------------------------------------------- Side bar ------------------------------------------------------
#---------------------------------------------------------------------------------------------------------------
# Sidebar heading followed by links to three news sites.
st.sidebar.markdown('مواقع اخباريه معتمده ')
st.sidebar.markdown("[العربية](https://www.alarabiya.net/)")
st.sidebar.markdown("[الجزيرة نت](https://www.aljazeera.net/news/)")
st.sidebar.markdown("[وكالة الانباء الكويتية](https://www.kuna.net.kw/Default.aspx?language=ar)")
#---------------------------------------------------------------------------------------------------------------

# Page title.
st.write("""
Arabic headline news detection
""")

# Headline typed by the user; an empty string until something is submitted.
tx = st.text_input(''' الرجاء ادخال العنوان المراد التاكد من صحته ''')
31
+
32
+ #---------------------------------------------------------------------------------------------------------------
33
#----------------------------------------Pre-processing functions-----------------------------------------------
34
+ #---------------------------------------------------------------------------------------------------------------
35
# Arabic diacritics (tashkeel): U+0617-U+061A and U+064B-U+0652.
_TASHKEEL_RE = re.compile(r'[\u0617-\u061A\u064B-\u0652]')

# Any character repeated two or more times in a row.
_ELONGATION_RE = re.compile(r'(.)\1+')

# Ordered (search, replacement) pairs, applied one after another.
_CHAR_REPLACEMENTS = (
    ("أ", "ا"),
    ("إ", "ا"),
    ("آ", "ا"),
    ("ة", "ه"),
    ("_", " "),
    ("-", " "),
    ("/", ""),
    (".", ""),
    ("،", ""),
    (" و ", " و"),
    (" يا ", " يا"),
    ('"', ""),
    ("ـ", ""),
    ("'", ""),
    ("ى", "ي"),
    ("\\", ""),
    ("\n", " "),
    ("\t", " "),
    # Never matches: double quotes were already removed by the earlier entry.
    # NOTE(review): possibly an HTML entity originally — confirm.
    ('"', " "),
    ("?", " ? "),
    ("؟", " ؟ "),
    ("!", " ! "),
)


def clean_str(text):
    """Normalise an Arabic string before tokenisation.

    Steps, in order: strip diacritics, shorten every run of a repeated
    character to two, collapse doubled waw / yeh / alef to one, apply the
    character replacement table, and trim surrounding whitespace.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string.
    """
    text = _TASHKEEL_RE.sub("", text)

    # Runs of three or more become exactly two; pairs are left as they are.
    text = _ELONGATION_RE.sub(r"\1\1", text)

    # The long vowels are reduced further, from two down to one.
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')

    for old, new in _CHAR_REPLACEMENTS:
        text = text.replace(old, new)

    return text.strip()
59
+
60
+
61
# Capitalised words, digit runs, and all-caps runs inside a CamelCase tag.
_CAMEL_CASE_RE = re.compile(r"[A-Z][a-z]+|\d+|[A-Z]+(?![a-z])")


def split_hashtag_to_words(tag):
    """Split a hashtag into its component words.

    ``#`` characters are removed first. A tag containing underscores is split
    on them; otherwise it is split on CamelCase / digit boundaries. When
    neither applies (for example a single Arabic or lowercase word) the whole
    tag is returned as one word rather than being dropped.

    Args:
        tag: The hashtag, with or without its leading ``#``.

    Returns:
        A list of words; empty only when nothing remains after removing ``#``.
    """
    tag = tag.replace('#', '')

    parts = tag.split('_')
    if len(parts) > 1:
        return parts

    words = _CAMEL_CASE_RE.findall(tag)
    if words:
        return words

    # The pattern only knows Latin capitals and digits; keep anything else whole.
    return [tag] if tag else []
69
+
70
def clean_hashtag(text):
    """Replace every hashtag in *text* with the plain words it contains.

    The text is split on whitespace; words starting with ``#`` are expanded
    through ``extract_hashtag`` and all other words are kept unchanged.

    Args:
        text: The raw input string.

    Returns:
        The words joined back together with single spaces.
    """
    cleaned_words = []
    for word in text.split():
        if is_hashtag(word):
            cleaned_words.extend(extract_hashtag(word))
        else:
            cleaned_words.append(word)
    return " ".join(cleaned_words)
79
+
80
def is_hashtag(word):
    """Return True when *word* starts with ``#``, otherwise False."""
    return word.startswith("#")
85
+
86
# Trailing run of non-word characters (punctuation stuck to the end of a tag).
_TRAILING_NON_WORD_RE = re.compile(r"(\W+)$")


def extract_hashtag(text):
    """Collect the words contained in every hashtag found in *text*.

    Each whitespace-separated token starting with ``#`` has its trailing
    non-word characters removed and is then split by
    ``split_hashtag_to_words``.

    Args:
        text: A string that may contain one or more hashtags.

    Returns:
        A flat list of the words from all hashtags, in order of appearance.
    """
    hashtags = [
        _TRAILING_NON_WORD_RE.sub("", token)
        for token in text.split()
        if token.startswith("#")
    ]

    words = []
    for hashtag in hashtags:
        words.extend(split_hashtag_to_words(hashtag))
    return words
93
+
94
+ # Define the preprocessing Class
95
class Preprocessor:
    """Tokenizer wrapper that normalises text with ``clean_str`` first."""

    def __init__(self, tokenizer, **cfg):
        """Store the tokenizer to delegate to.

        Args:
            tokenizer: Callable that turns a string into tokens.
            **cfg: Accepted and ignored.
        """
        self.tokenizer = tokenizer

    def __call__(self, text):
        """Clean *text* and return the wrapped tokenizer's result for it."""
        preprocessed = clean_str(text)
        return self.tokenizer(preprocessed)
102
+
103
+ #---------------------------------------------------------------------------------------------------------------
104
+ #----------------------------------------- END OF PRE-PROCESSING------------------------------------------------
105
+ #---------------------------------------------------------------------------------------------------------------
106
+ # Apply the `Preprocessor` Class
107
+
108
# Wrap the pipeline's tokenizer so every text passed to `nlp` goes through
# clean_str before it is tokenised.
nlp.tokenizer = Preprocessor(nlp.tokenizer)

# A headline is reported as genuine when its best title or description match
# reaches this similarity score.
SIMILARITY_THRESHOLD = .85


def _best_match(texts, query_doc):
    """Return ``(index, score)`` of the entry in *texts* most similar to *query_doc*."""
    scores = [nlp(text).similarity(query_doc) for text in texts]
    best_index = int(np.argmax(scores))
    return best_index, scores[best_index]


if tx:
    googlenews = GoogleNews(lang='ar')
    googlenews.clear()

    st.markdown(f"Searching for: {tx}")
    # Horizontal rule drawn with tatweel (U+0640) characters.
    # NOTE(review): the length of the rule is approximate — adjust if needed.
    st.markdown("\u0640" * 94)

    tx = clean_hashtag(tx)
    tx = clean_str(tx)

    googlenews.search(tx)
    result = googlenews.page_at(1)
    googlenews.clear()

    # Defaults for the fields shown below; without them the no-results path
    # would reach the output section with these names undefined.
    date = ''
    link = ''

    if not result:
        Prediction = 'الخبر زائف'
        top_similar_news = 'لا يوجد اخبار مماثله'
        medium = 'لا يوجد مصدر'
        descr = 'لا يوجد وصف'

    else:
        # Parse the query once instead of once per search result.
        query_doc = nlp(tx)

        titles = [item['title'] for item in result]
        # A missing or None description is compared as an empty string.
        descriptions = [item.get('desc') or '' for item in result]

        top_similar_ind, sg300top = _best_match(titles, query_doc)
        _, sg300top2 = _best_match(descriptions, query_doc)

        # Everything displayed comes from the result with the closest title.
        best = result[top_similar_ind]
        top_similar_news = best['title']
        descr = best['desc']
        medium = best['media']
        date = best['date']
        link = best['link']

        if sg300top >= SIMILARITY_THRESHOLD or sg300top2 >= SIMILARITY_THRESHOLD:
            Prediction = 'الخبر صحيح'
        else:
            Prediction = ' الخبر زائف'

    st.markdown(f"System Prediction : {Prediction}")
    st.markdown(f"الخبر المماثل: {top_similar_news}")
    st.markdown("")
    st.markdown(f"تاريخ الخبر: {date}")
    st.markdown("")
    st.markdown(f"التفصيل: {descr}")
    st.markdown("")
    st.markdown(f"المصدر: {medium}")
    st.markdown("")
    st.markdown(f"رابط الخبر: {link}")
194
+
195
+
196
+
197
+
198
+
199
+ #st.markdown(f"Searching for: { tx }")