Spaces:
Sleeping
Sleeping
added cross encoder
Browse files
app.py
CHANGED
|
@@ -7,6 +7,7 @@ from datasets import Features
|
|
| 7 |
from datasets import Value
|
| 8 |
from datasets import Dataset
|
| 9 |
from sentence_transformers import SentenceTransformer
|
|
|
|
| 10 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 11 |
import os
|
| 12 |
import gradio as gr
|
|
@@ -39,14 +40,20 @@ df = joined_df.copy()
|
|
| 39 |
|
| 40 |
|
| 41 |
model = SentenceTransformer('FDSRashid/QulBERT', token=Secret_token)
|
|
|
|
| 42 |
arr = np.array(df['embed'].to_list())
|
| 43 |
|
| 44 |
def find_most_similar_matn(text, n):
|
| 45 |
-
|
|
|
|
| 46 |
cos_sim = cosine_similarity(embed_text.reshape(1, -1), arr)
|
| 47 |
indices = np.argsort(cos_sim)[0][-n:]
|
| 48 |
matns = df.iloc[indices]
|
| 49 |
matns['Similarity'] = cos_sim[0][indices]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
return matns[['Book_Name', 'matn', 'taraf_ID', 'Book_ID', 'Hadith Number', 'Author', 'Similarity']]
|
| 51 |
|
| 52 |
with gr.Blocks() as demo:
|
|
|
|
| 7 |
from datasets import Value
|
| 8 |
from datasets import Dataset
|
| 9 |
from sentence_transformers import SentenceTransformer
|
| 10 |
+
from sentence_transformers.cross_encoder import CrossEncoder
|
| 11 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 12 |
import os
|
| 13 |
import gradio as gr
|
|
|
|
| 40 |
|
| 41 |
|
| 42 |
# Bi-encoder: embeds the query text for the first-pass cosine-similarity search.
model = SentenceTransformer('FDSRashid/QulBERT', token=Secret_token)

# Cross-encoder: scores (candidate matn, query) pairs in find_most_similar_matn.
model_CE = CrossEncoder('FDSRashid/QulBERT-CE-2.0', token=Secret_token)

# Stack the per-row values of the 'embed' column into a single NumPy array so
# the whole corpus can be compared against a query in one call.
arr = np.array(df['embed'].tolist())
|
| 45 |
|
| 46 |
def find_most_similar_matn(text, n):
    """Return up to ``n`` rows of ``df`` whose matn resembles ``text``.

    The query is stripped of diacritics, embedded with the bi-encoder
    ``model`` and compared by cosine similarity against the precomputed
    embedding array ``arr``. The ``n`` highest-scoring rows are shortlisted,
    each shortlisted matn is paired with the query and scored by the
    cross-encoder ``model_CE``, and only rows scoring above 0.5 are kept.
    (Presumably that score is the model's confidence that the pair belongs to
    the same taraf -- confirm against the model card.)

    Args:
        text: Query string.
        n: Maximum number of rows to shortlist. Coerced with ``int()`` so a
            float such as ``5.0`` is accepted (fractions are truncated).
            A value of zero or less yields an empty result.

    Returns:
        A DataFrame with the columns ``Book_Name``, ``matn``, ``taraf_ID``,
        ``Book_ID``, ``Hadith Number``, ``Author`` and ``Similarity``.
        Rows are in order of increasing cosine similarity (the order
        ``np.argsort`` produces), so the best match is last.
    """
    columns = ['Book_Name', 'matn', 'taraf_ID', 'Book_ID', 'Hadith Number', 'Author', 'Similarity']

    top_n = int(n)
    if top_n <= 0:
        # A slice of ``[-0:]`` would select the whole corpus rather than
        # nothing, so handle the "no results requested" case explicitly.
        no_hits = df.iloc[:0].copy()
        no_hits['Similarity'] = np.array([], dtype=float)
        return no_hits[columns]

    prep_text = araby.strip_diacritics(text)
    embed_text = model.encode(prep_text)
    cos_sim = cosine_similarity(embed_text.reshape(1, -1), arr)[0]
    indices = np.argsort(cos_sim)[-top_n:]

    # Copy before adding a column so the write goes to an independent frame
    # and never to (a view of) the shared module-level ``df``.
    matns = df.iloc[indices].copy()
    matns['Similarity'] = cos_sim[indices]

    matns_prep = [araby.strip_diacritics(matn) for matn in matns['matn']]
    to_compare = [(candidate, prep_text) for candidate in matns_prep]
    if not to_compare:
        # Empty corpus: nothing for the cross-encoder to score.
        return matns[columns]

    is_taraf = np.asarray(model_CE.predict(to_compare))
    matns = matns[is_taraf > .5]
    return matns[columns]
|
| 58 |
|
| 59 |
with gr.Blocks() as demo:
|