Spaces:
Runtime error
Runtime error
Konrad Wojtasik
committed on
Commit
·
f1acfeb
1
Parent(s):
ffcea41
Add encoded corpus
Browse files- .gitattributes +9 -0
- app.py +26 -8
- distiluse-base-multilingual-cased-v1-fiqa-pl-corpus +3 -0
- distiluse-base-multilingual-cased-v1-nfcorpus-pl-corpus +3 -0
- distiluse-base-multilingual-cased-v1-scifact-pl-corpus +3 -0
- mcontriever-fiqa-pl-corpus +3 -0
- mcontriever-nfcorpus-pl-corpus +3 -0
- mcontriever-scifacts-pl-corpus +3 -0
- multilingual-e5-base-fiqa-pl-corpus +3 -0
- multilingual-e5-base-nfcorpus-pl-corpus +3 -0
- multilingual-e5-base-scifact-pl-corpus +3 -0
.gitattributes
CHANGED
|
@@ -32,3 +32,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
distiluse-base-multilingual-cased-v1-fiqa-pl-corpus filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
distiluse-base-multilingual-cased-v1-nfcorpus-pl-corpus filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
distiluse-base-multilingual-cased-v1-scifact-pl-corpus filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
mcontriever-nfcorpus-pl-corpus filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
mcontriever-scifacts-pl-corpus filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
mcontriever-fiqa-pl-corpus filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
multilingual-e5-base-nfcorpus-pl-corpus filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
multilingual-e5-base-scifact-pl-corpus filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
multilingual-e5-base-fiqa-pl-corpus filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
|
@@ -32,20 +32,35 @@ def load_data(dataset_type):
|
|
| 32 |
return queries, corpus
|
| 33 |
|
| 34 |
@st.cache_data()
|
| 35 |
-
def bi_encode(
|
| 36 |
|
| 37 |
global bi_encoder
|
| 38 |
# We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
|
| 39 |
-
bi_encoder = SentenceTransformer(
|
| 40 |
|
| 41 |
-
|
|
|
|
| 42 |
|
| 43 |
-
|
| 44 |
|
| 45 |
-
|
| 46 |
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
|
| 51 |
st.success(f"Embeddings computed. Shape: {corpus_embeddings.shape}")
|
|
@@ -150,7 +165,10 @@ def search_func(query, bi_encoder_type, top_k=top_k):
|
|
| 150 |
|
| 151 |
##### Semantic Search #####
|
| 152 |
# Encode the query using the bi-encoder and find potentially relevant passages
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
| 154 |
question_embedding = question_embedding.cpu()
|
| 155 |
hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k,score_function=util.dot_score)
|
| 156 |
hits = hits[0] # Get the hits for the first query
|
|
|
|
| 32 |
return queries, corpus
|
| 33 |
|
| 34 |
@st.cache_data()
|
| 35 |
+
def bi_encode(bi_encoder_name,passages, dataset_name='scifact-pl'):
|
| 36 |
|
| 37 |
global bi_encoder
|
| 38 |
# We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
|
| 39 |
+
bi_encoder = SentenceTransformer(bi_encoder_name,use_auth_token=auth_token)
|
| 40 |
|
| 41 |
+
# This code would be used if we embedded the passages here, but to keep startup fast we load already-embedded tensors instead:
|
| 42 |
+
# with st.spinner('Encoding passages into a vector space...'):
|
| 43 |
|
| 44 |
+
# if bi_encoder_name == 'intfloat/multilingual-e5-base':
|
| 45 |
|
| 46 |
+
# corpus_embeddings = bi_encoder.encode(['passage: ' + sentence for sentence in passages], convert_to_tensor=True)
|
| 47 |
|
| 48 |
+
# else:
|
| 49 |
+
# corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True)
|
| 50 |
+
|
| 51 |
+
with st.spinner('Loading encoded passages...'):
|
| 52 |
+
|
| 53 |
+
if bi_encoder_name == "sentence-transformers/distiluse-base-multilingual-cased-v1":
|
| 54 |
+
name = 'distiluse-base-multilingual-cased-v1'
|
| 55 |
+
|
| 56 |
+
elif bi_encoder_name == 'intfloat/multilingual-e5-base':
|
| 57 |
+
name = 'multilingual-e5-base'
|
| 58 |
+
|
| 59 |
+
elif bi_encoder_name == 'nthakur/mcontriever-base-msmarco':
|
| 60 |
+
name = 'mcontriever'
|
| 61 |
+
|
| 62 |
+
corpus_embeddings_name = "-".join([name, dataset_name, "corpus"])
|
| 63 |
+
corpus_embeddings = torch.load(corpus_embeddings_name)
|
| 64 |
|
| 65 |
|
| 66 |
st.success(f"Embeddings computed. Shape: {corpus_embeddings.shape}")
|
|
|
|
| 165 |
|
| 166 |
##### Semantic Search #####
|
| 167 |
# Encode the query using the bi-encoder and find potentially relevant passages
|
| 168 |
+
if bi_encoder_type == 'intfloat/multilingual-e5-base':
|
| 169 |
+
question_embedding = bi_encoder.encode("query: " + query, convert_to_tensor=True)
|
| 170 |
+
else:
|
| 171 |
+
question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
|
| 172 |
question_embedding = question_embedding.cpu()
|
| 173 |
hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k,score_function=util.dot_score)
|
| 174 |
hits = hits[0] # Get the hits for the first query
|
distiluse-base-multilingual-cased-v1-fiqa-pl-corpus
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2312ba328c80b17088d5ae7d704c0362ed086fe58ac7899230047ddc530e8f3f
|
| 3 |
+
size 118043631
|
distiluse-base-multilingual-cased-v1-nfcorpus-pl-corpus
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:855b2b61bb109bd2c4964af10b924e15a70b0f79e08380dfb9c39697e189c513
|
| 3 |
+
size 7441403
|
distiluse-base-multilingual-cased-v1-scifact-pl-corpus
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0e09e44d2269375c67add8ce85666882551fb547fee01e3259ecdf88842c3301
|
| 3 |
+
size 10615800
|
mcontriever-fiqa-pl-corpus
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7b7e6fe50d931782dd6c24019ebc1db17e5543d247fb4a212c3ef4faf2007b62
|
| 3 |
+
size 177064804
|
mcontriever-nfcorpus-pl-corpus
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8440623d38b4bd95c56cecbd21d9ca86f479393228a2934f3d0cc3c5edc9cad6
|
| 3 |
+
size 11161456
|
mcontriever-scifacts-pl-corpus
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:291dd709b7bbe015f1176c9fe37fe8e7a64d55673cac2729a304c7b26a40addd
|
| 3 |
+
size 15923056
|
multilingual-e5-base-fiqa-pl-corpus
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7f8183a87b7c515c54aeddd770e1531cc85fcf99b4f9e3715c0716d020689a8d
|
| 3 |
+
size 177064831
|
multilingual-e5-base-nfcorpus-pl-corpus
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:db3b211fc13de3a311addae59b061049e55a2794639127740c64b0680af8615b
|
| 3 |
+
size 11161547
|
multilingual-e5-base-scifact-pl-corpus
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4166d06d8037cc70990c3f1ac894aaae0b6187bf69e8f6a6e25d9236edf891ea
|
| 3 |
+
size 15923144
|