Update app.py
Browse files
app.py
CHANGED
|
@@ -11,81 +11,80 @@ default_value = "بيعت الأسلحة في السوق"
|
|
| 11 |
# sent is the variable holding the user's input
|
| 12 |
sent = st.text_area("مدخل", default_value, height=20)
|
| 13 |
|
| 14 |
-
st.checkbox('استعمال الرسم البياني', value=False)
|
| 15 |
-
|
| 16 |
-
tmt = {}
|
| 17 |
-
VocMap = './voc.csv'
|
| 18 |
-
ibra_gr = './BM25.csv'
|
| 19 |
-
|
| 20 |
-
df3 = pd.read_csv(VocMap, delimiter='\t')
|
| 21 |
-
df_g = pd.read_csv(ibra_gr, delimiter='\t')
|
| 22 |
-
df_g.set_index(['ID1','ID2'], inplace=True)
|
| 23 |
-
|
| 24 |
-
df_in = pd.read_csv(ibra_gr, delimiter='\t')
|
| 25 |
-
df_in.set_index(['ID1'], inplace=True)
|
| 26 |
-
|
| 27 |
-
def Query2id(voc, query):
|
| 28 |
-
return [voc.index[voc['word'] == word].values[0] for word in query.split()]
|
| 29 |
-
|
| 30 |
-
id_list = Query2id(df3, sent)
|
| 31 |
-
|
| 32 |
-
def setQueriesVoc(df, id_list):
|
| 33 |
-
res = []
|
| 34 |
-
for e in id_list:
|
| 35 |
-
res.extend(list(df.loc[e]['ID2'].values))
|
| 36 |
-
return list(set(res))
|
| 37 |
-
|
| 38 |
-
L = setQueriesVoc(df_in, id_list)
|
| 39 |
-
|
| 40 |
tokenizer = AutoTokenizer.from_pretrained("moussaKam/AraBART", max_length=128, padding=True, pad_to_max_length = True, truncation=True)
|
| 41 |
model = AutoModelForMaskedLM.from_pretrained("Hamda/test-1-finetuned-AraBART")
|
| 42 |
|
| 43 |
#@st.cache
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
pipe = pipeline("fill-mask", tokenizer=tokenizer, model=model, top_k=10)
|
| 58 |
-
dict_next_words = next_word(text_st, pipe)
|
| 59 |
-
df = pd.DataFrame.from_dict(dict_next_words)
|
| 60 |
-
df.reset_index(drop=True, inplace=True)
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
|
|
|
| 73 |
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
-
st.dataframe(df)
|
| 90 |
-
st.dataframe(res_df)
|
| 91 |
#st.table(df)
|
|
|
|
| 11 |
# sent is the variable holding the user's input
|
| 12 |
sent = st.text_area("مدخل", default_value, height=20)
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
tokenizer = AutoTokenizer.from_pretrained("moussaKam/AraBART", max_length=128, padding=True, pad_to_max_length = True, truncation=True)
|
| 15 |
model = AutoModelForMaskedLM.from_pretrained("Hamda/test-1-finetuned-AraBART")
|
| 16 |
|
| 17 |
#@st.cache
|
| 18 |
+
if (st.button('بحث', disabled=False)):
|
| 19 |
+
def next_word(text, pipe):
    """Collect the punctuation-free predictions produced by ``pipe`` for ``text``.

    Args:
        text: The input passed unchanged to ``pipe``.
        pipe: A callable returning an iterable of mappings that each expose
            a ``'token_str'`` and a ``'score'`` entry.

    Returns:
        A dict with two parallel lists: ``'Word'`` (the kept ``token_str``
        values) and ``'Score'`` (their scores), in the order ``pipe``
        yielded them.
    """
    # Build the punctuation lookup once instead of once per character.
    punctuation = frozenset(string.punctuation)
    res_dict = {
        'Word': [],
        'Score': [],
    }
    for e in pipe(text):
        # Keep a candidate only when none of its characters is punctuation.
        if punctuation.isdisjoint(e['token_str']):
            res_dict['Word'].append(e['token_str'])
            res_dict['Score'].append(e['score'])
    return res_dict
|
| 29 |
+
|
| 30 |
+
# Append the mask token to the user's input so the fill-mask pipeline has a
# slot to predict after it.
text_st = sent + ' <mask>'
pipe = pipeline("fill-mask", tokenizer=tokenizer, model=model, top_k=10)
dict_next_words = next_word(text_st, pipe)

# Tabulate the candidates with a fresh 0-based index and display them.
df = pd.DataFrame.from_dict(dict_next_words).reset_index(drop=True)
st.dataframe(df)
|
| 36 |
+
|
| 37 |
+
if (st.button('استعمال الرسم البياني', disabled=False)):
|
| 38 |
+
# Maps each candidate term to its accumulated score; filled in further down.
tmt = {}
VocMap = './voc.csv'
ScoreMap = './BM25.csv'

# Vocabulary table (tab-separated).
df3 = pd.read_csv(VocMap, delimiter='\t')

# Parse the score file once and derive both indexed views from that single
# read, instead of reading the same file from disk twice:
#   df_g  -> indexed by the (ID1, ID2) pair, for per-pair score lookups
#   df_in -> indexed by ID1 only, so ID2 stays available as a column
df_in = pd.read_csv(ScoreMap, delimiter='\t')
df_g = df_in.set_index(['ID1', 'ID2'])
df_in = df_in.set_index(['ID1'])
|
| 48 |
+
|
| 49 |
+
def Query2id(voc, query):
    """Translate the whitespace-separated words of ``query`` into row labels of ``voc``.

    Args:
        voc: A DataFrame with a ``'word'`` column.
        query: The text to split on whitespace.

    Returns:
        For each word found in ``voc['word']``, the index label of its first
        matching row, in query order. Words absent from the vocabulary are
        skipped instead of raising ``IndexError``, so free-form user input
        cannot crash the lookup.
    """
    ids = []
    for word in query.split():
        matches = voc.index[voc['word'] == word]
        # An empty match means the word is out of vocabulary: ignore it.
        if len(matches):
            ids.append(matches[0])
    return ids
|
| 51 |
+
|
| 52 |
+
id_list = Query2id(df3, sent)
|
| 53 |
+
|
| 54 |
+
def setQueriesVoc(df, id_list):
    """Gather the distinct ``ID2`` values attached to the given index labels.

    Args:
        df: A DataFrame whose index holds the labels in ``id_list`` and that
            has an ``'ID2'`` column.
        id_list: Index labels to look up in ``df``.

    Returns:
        A list of the unique ``ID2`` values found across all labels (order
        not guaranteed). Labels missing from ``df.index`` contribute nothing.
    """
    res = set()
    for e in id_list:
        if e not in df.index:
            # No rows for this label: nothing to add.
            continue
        # Selecting with a list of labels always yields a Series, even when
        # the label matches a single row (a scalar lookup would return a
        # bare value there and break the `.values` access).
        res.update(df.loc[[e], 'ID2'].values)
    return list(res)
|
| 59 |
|
| 60 |
+
L = setQueriesVoc(df_in, id_list)

# For every candidate term, sum its scores against each query term.
# NOTE(review): nesting was reconstructed from a flattened listing; the
# result is stored once per candidate, after the inner loop.
for nc in L:
    score = 0.0
    for ni in id_list:
        try:
            score = score + df_g.loc[(ni, nc), 'score']
        except KeyError:
            # This (query term, candidate) pair has no score entry.
            continue
    key = df3.loc[nc].values[0]
    tmt[key] = score

# Rank candidates by accumulated score, best first.
tmexp = sorted(tmt.items(), key=lambda x: x[1], reverse=True)
dict_res = {'word': [], 'score': []}
if tmexp:
    # The list is sorted, so its ends are the extremes; read them once
    # rather than rescanning all scores on every iteration.
    highest = tmexp[0][1]
    lowest = tmexp[-1][1]
    span = highest - lowest
    # Only the ten best candidates are shown.
    for key, value in tmexp[:10]:
        # Min-max normalisation. When every score is equal (including the
        # single-candidate case) the span is zero; treat all candidates as
        # top-ranked instead of dividing by zero.
        normalised = (value - lowest) / span if span else 1.0
        # The 0.0001 offset is kept from the original code.
        # NOTE(review): its purpose is not evident here - presumably it
        # keeps the best score strictly below 1; confirm.
        new_score = normalised - 0.0001
        # Fixed-point formatting: slicing str(float) misrenders values that
        # Python prints in scientific notation (e.g. 4.99e-05 -> "4.9999").
        dict_res['score'].append(f"{new_score:.4f}")
        dict_res['word'].append(key)

res_df = pd.DataFrame.from_dict(dict_res)
# Present ranks starting from 1 rather than 0.
res_df.index += 1
st.dataframe(res_df)
|
| 89 |
|
|
|
|
|
|
|
| 90 |
#st.table(df)
|