QueryExpansionForEtsy

Sleeping

App Files Files Community

Jackie2235 committed on Jun 30, 2023

Commit

7192529

1 Parent(s): 99af713

Upload app.py

Browse files

Files changed (1) hide show

app.py +68 -61

app.py CHANGED Viewed

@@ -1,16 +1,10 @@
 import streamlit as st
-from streamlit_tags import st_tags, st_tags_sidebar
-from keytotext import pipeline
 from PIL import Image
 import json
 from sentence_transformers import SentenceTransformer, CrossEncoder, util
-import gzip
-import os
-import torch
 import pickle
-import random
-import numpy as np
 import pandas as pd
 ############
@@ -41,7 +35,7 @@ option1 = st.sidebar.selectbox(
      ('multi-qa-MiniLM-L6-cos-v1','null','null'))
 option2 = st.sidebar.selectbox(
-     'Which corss-encoder model would you like to be selected?',
      ('cross-encoder/ms-marco-MiniLM-L-6-v2','null','null'))
 st.sidebar.success("Load Successfully!")
@@ -50,22 +44,28 @@ st.sidebar.success("Load Successfully!")
 #    print("Warning: No GPU found. Please add GPU to your notebook")
 #We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
-bi_encoder = SentenceTransformer(option1,device='cpu')
 bi_encoder.max_seq_length = 256    #Truncate long passages to 256 tokens
 top_k = 32                          #Number of passages we want to retrieve with the bi-encoder
-#The bi-encoder will retrieve 100 documents. We use a cross-encoder, to re-rank the results list to improve the quality
-cross_encoder = CrossEncoder(option2, device='cpu')
 passages = []
 # load pre-trained embeddings file
 embedding_cache_path = 'etsy-embeddings-cpu.pkl'
-print("Load pre-computed embeddings from disc")
-with open(embedding_cache_path, "rb") as fIn:
-  cache_data = pickle.load(fIn)
-  passages = cache_data['sentences']
-  corpus_embeddings = cache_data['embeddings']
 from rank_bm25 import BM25Okapi
 from sklearn.feature_extraction import _stop_words
@@ -76,18 +76,24 @@ import re
 import yake
-language = "en"
-max_ngram_size = 3
-deduplication_threshold = 0.9
-deduplication_algo = 'seqm'
-windowSize = 3
-numOfKeywords = 3
-custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, windowsSize=windowSize, top=numOfKeywords, features=None)
 # load query GMS information
-with open('query_gms.json', 'r') as file:
-    query_gms_dict = json.load(file)
 # We lower case our text and remove stop-words from indexing
 def bm25_tokenizer(text):
     tokenized_doc = []
@@ -98,10 +104,14 @@ def bm25_tokenizer(text):
             tokenized_doc.append(token)
     return tokenized_doc
-tokenized_corpus = []
-for passage in tqdm(passages):
-    tokenized_corpus.append(bm25_tokenizer(passage))
 bm25 = BM25Okapi(tokenized_corpus)
 def word_len(s):
@@ -126,13 +136,13 @@ def clean_string(input_string):
         output_string.append(string_strip)
     return output_string
-def add_gms_score_for_candidates(candidates, query_gms_dict):
-    for query_candidate in candidates:
-        value = candidates[query_candidate]
-        value['gms'] = query_gms_dict.get(query_candidate, 0)
-        candidates[query_candidate] = value
-    return candidates
 def generate_query_expansion_candidates(query):
     print("Input query:", query)
     expanded_query_set = {}
@@ -143,8 +153,8 @@ def generate_query_expansion_candidates(query):
     top_n_indices = np.argpartition(bm25_scores, -5)[-5:]
     bm25_hits = [{'corpus_id': idx, 'bm25_score': bm25_scores[idx]} for idx in top_n_indices]
     # bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
     ##### Semantic Search #####
     # Encode the query using the bi-encoder and find potentially relevant passages
     query_embedding = bi_encoder.encode(query, convert_to_tensor=True)
@@ -157,7 +167,7 @@ def generate_query_expansion_candidates(query):
     cross_scores = cross_encoder.predict(cross_inp)
     for idx in range(len(cross_scores)):
         encoder_hits[idx]['cross_score'] = cross_scores[idx]
     candidates = {}
     for hit in bm25_hits:
         corpus_id = hit['corpus_id']
@@ -170,25 +180,23 @@ def generate_query_expansion_candidates(query):
         else:
             bm25_score = candidates[corpus_id]['bm25_score']
             candidates[corpus_id].update({'bm25_score': bm25_score, 'bi_score': hit['score'], 'cross_score': hit['cross_score']})
     final_candidates = {}
     for key, value in candidates.items():
         input_string = passages[key].replace("\n", "")
         string_set = set(clean_string(input_string))
         for item in string_set:
-            final_candidates[item] = value
     # remove the query itself from candidates
-    if query in final_candidates:
         del final_candidates[query]
     # add gms column
-    for query_candidate in final_candidates:
-        value = final_candidates[query_candidate]
-        value['gms'] = query_gms_dict.get(query_candidate, 0)
-        final_candidates[query_candidate] = value
     # Total Results
-    st.write("E-Commerce Query Expansion Candidates: \n")
-    return final_candidates
 def re_rank_candidates(query, candidates, method):
     if method == 'bm25':
@@ -236,22 +244,21 @@ def re_rank_candidates(query, candidates, method):
 # st.write("## Raw Candidates:")
-if st.button('Generated Expansion'):
-    st.write("E-Commerce Query Expansion Candidates: \n")
     col1, col2 = st.columns(2)
     candidates = generate_query_expansion_candidates(query = user_query)
     with col1:
-        st.subheader('Query Candidates')
-        df = re_rank_candidates(user_query, candidates, method='cross_encoder')
-        result = list(df['query'][:maxtags_sidebar])
-        st.write(result)
     with col2:
-        st.subheader('Sorted Query Candidates')
-        df2 = re_rank_candidates(user_query, candidates, method='gms')
-        result_rank=list(df2[['query', 'gms']][:maxtags_sidebar])
-        st.write(result_rank)
     ## convert into dataframe
     # data_dicts = [{'query': key, **values} for key, values in candidates.items()]

 import streamlit as st
 from PIL import Image
 import json
 from sentence_transformers import SentenceTransformer, CrossEncoder, util
 import pickle
 import pandas as pd
 ############
      ('multi-qa-MiniLM-L6-cos-v1','null','null'))
 option2 = st.sidebar.selectbox(
+     'Which cross-encoder model would you like to be selected?',
      ('cross-encoder/ms-marco-MiniLM-L-6-v2','null','null'))
 st.sidebar.success("Load Successfully!")
 #    print("Warning: No GPU found. Please add GPU to your notebook")
 #We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
+@st.cache_resource
+def load_encoders(sentence_enc, cross_enc):
+    return SentenceTransformer(sentence_enc,device='cpu'), CrossEncoder(cross_enc,device='cpu')
+bi_encoder, cross_encoder = load_encoders(option1,option2)
 bi_encoder.max_seq_length = 256    #Truncate long passages to 256 tokens
 top_k = 32                          #Number of passages we want to retrieve with the bi-encoder
 passages = []
 # load pre-trained embeddings file
+@st.cache_resource
+def load_pickle(path):
+    with open(path, "rb") as fIn:
+        cache_data = pickle.load(fIn)
+        passages = cache_data['sentences']
+        corpus_embeddings = cache_data['embeddings']
+    print("Load pre-computed embeddings from disc")
+    return passages,corpus_embeddings
 embedding_cache_path = 'etsy-embeddings-cpu.pkl'
+passages,corpus_embeddings = load_pickle(embedding_cache_path)
 from rank_bm25 import BM25Okapi
 from sklearn.feature_extraction import _stop_words
 import yake
+@st.cache_resource
+def load_model():
+    language = "en"
+    max_ngram_size = 3
+    deduplication_threshold = 0.9
+    deduplication_algo = 'seqm'
+    windowSize = 3
+    numOfKeywords = 3
+    return yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, windowsSize=windowSize, top=numOfKeywords, features=None)
+custom_kw_extractor = load_model()
 # load query GMS information
+@st.cache_resource
+def load_json(path):
+    with open(path, 'r') as file:
+        query_gms_dict = json.load(file)
+    return query_gms_dict
+query_gms_dict = load_json('query_gms.json')
 # We lower case our text and remove stop-words from indexing
 def bm25_tokenizer(text):
     tokenized_doc = []
             tokenized_doc.append(token)
     return tokenized_doc
+@st.cache_resource
+def get_tokenized_corpus(passages,_tokenizer):
+    tokenized_corpus = []
+    for passage in passages:
+        tokenized_corpus.append(_tokenizer(passage))
+    return tokenized_corpus
+tokenized_corpus = get_tokenized_corpus(passages,bm25_tokenizer)
 bm25 = BM25Okapi(tokenized_corpus)
 def word_len(s):
         output_string.append(string_strip)
     return output_string
+# def add_gms_score_for_candidates(candidates, query_gms_dict):
+#     for query_candidate in candidates:
+#         value = candidates[query_candidate]
+#         value['gms'] = query_gms_dict.get(query_candidate, 0)
+#         candidates[query_candidate] = value
+#     return candidates
 def generate_query_expansion_candidates(query):
     print("Input query:", query)
     expanded_query_set = {}
     top_n_indices = np.argpartition(bm25_scores, -5)[-5:]
     bm25_hits = [{'corpus_id': idx, 'bm25_score': bm25_scores[idx]} for idx in top_n_indices]
     # bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
     ##### Semantic Search #####
     # Encode the query using the bi-encoder and find potentially relevant passages
     query_embedding = bi_encoder.encode(query, convert_to_tensor=True)
     cross_scores = cross_encoder.predict(cross_inp)
     for idx in range(len(cross_scores)):
         encoder_hits[idx]['cross_score'] = cross_scores[idx]
     candidates = {}
     for hit in bm25_hits:
         corpus_id = hit['corpus_id']
         else:
             bm25_score = candidates[corpus_id]['bm25_score']
             candidates[corpus_id].update({'bm25_score': bm25_score, 'bi_score': hit['score'], 'cross_score': hit['cross_score']})
     final_candidates = {}
     for key, value in candidates.items():
         input_string = passages[key].replace("\n", "")
         string_set = set(clean_string(input_string))
         for item in string_set:
+            final_candidates[item.replace("\n", " ")] = value
     # remove the query itself from candidates
+    if query in final_candidates:
         del final_candidates[query]
+    # print(final_candidates)
     # add gms column
+    df = pd.DataFrame(final_candidates).T
+    df['gms'] = [query_gms_dict.get(i,0) for i in df.index]
     # Total Results
+    return df.to_dict('index')
 def re_rank_candidates(query, candidates, method):
     if method == 'bm25':
 # st.write("## Raw Candidates:")
+if st.button('Generated Expansion'):
     col1, col2 = st.columns(2)
     candidates = generate_query_expansion_candidates(query = user_query)
     with col1:
+        st.subheader('Original Ranking')
+        ranking_cross = re_rank_candidates(user_query, candidates, method='cross_encoder')
+        ranking_cross.index = ranking_cross.index+1
+        st.table(ranking_cross['query'][:maxtags_sidebar])
     with col2:
+        st.subheader('GMS-sorted Ranking')
+        ranking_gms = re_rank_candidates(user_query, candidates, method='gms')
+        ranking_gms.index = ranking_gms.index + 1
+        st.table(ranking_gms[['query', 'gms']][:maxtags_sidebar])
     ## convert into dataframe
     # data_dicts = [{'query': key, **values} for key, values in candidates.items()]