File size: 8,277 Bytes
051dae2
 
 
 
75095b6
051dae2
 
 
 
 
 
 
 
 
af0ed7b
051dae2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
731e892
 
 
 
 
1fb6d8c
 
731e892
 
 
 
051dae2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7693508
9eb9f4e
7693508
 
051dae2
 
 
 
 
 
 
3c44dc3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbcebf2
 
 
 
 
 
 
7693508
3c44dc3
051dae2
3c44dc3
051dae2
 
 
 
65cd7ee
3c44dc3
 
051dae2
3c44dc3
 
051dae2
 
3c44dc3
 
 
 
 
342a876
 
3c44dc3
 
 
 
 
 
 
 
 
 
7693508
 
 
 
 
 
342a876
 
3c44dc3
051dae2
3c44dc3
342a876
3c44dc3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e25dc5a
 
 
 
 
 
 
 
db1f600
9111e7b
cc8fc07
14a64d7
cc8fc07
 
9111e7b
db1f600
9b2e9c2
db1f600
 
 
 
9b2e9c2
fbcebf2
 
4c16383
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
import streamlit as st
from streamlit_tags import st_tags, st_tags_sidebar
from keytotext import pipeline
from PIL import Image
from tabulate import tabulate

import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
import torch
import pickle
import random
import numpy as np
import pandas as pd

############
## Main page
############
# Page header, sidebar controls and the query input box. Streamlit renders
# widgets in the order these statements execute.

st.write("# Demonstration for Etsy Query Expansion(Etsy-QE)")

st.markdown("***Idea is to build a model which will take query as inputs and generate expansion information as outputs.***")
# Banner image is read from the working directory.
image = Image.open('etsy-shop-LLC.png')
st.image(image)

st.sidebar.write("# Top-N Selection")
# Slider (1..20, initial value 1). The chosen value is used further down to
# truncate both result lists to the first N candidates.
maxtags_sidebar = st.sidebar.slider('Number of query allowed?', 1, 20, 1, key='ehikwegrjifbwreuk')
# Earlier tag-based input widget, kept for reference but disabled:
#user_query = st_tags(
#    label='# Enter Query:',
#    text='Press enter to add more',
#    value=['Mother'],
#    suggestions=['gift', 'nike', 'wool'],
#    maxtags=maxtags_sidebar,
#    key="aljnf")

# Free-text query that is expanded when the button at the bottom is pressed.
user_query = st.text_input("Enter a query for the generated text: e.g., gift, home decoration ...")

# Sidebar select boxes for the model names. option1 is passed to
# SentenceTransformer and option2 to CrossEncoder below.
# NOTE(review): the 'null' entries look like placeholders; choosing one would
# hand the string 'null' to the model constructor -- confirm this is intended.
option1 = st.sidebar.selectbox(
     'Which transformers model would you like to be selected?',
     ('multi-qa-MiniLM-L6-cos-v1','null','null'))

option2 = st.sidebar.selectbox(
     'Which corss-encoder model would you like to be selected?',
     ('cross-encoder/ms-marco-MiniLM-L-6-v2','null','null'))

# NOTE(review): this banner is emitted before the models and embeddings are
# actually loaded (that happens in the statements that follow).
st.sidebar.success("Load Successfully!")

# GPU availability check, disabled because both models are pinned to the CPU:
#if not torch.cuda.is_available():
#    print("Warning: No GPU found. Please add GPU to your notebook")

# The bi-encoder embeds the query so it can be compared against the
# pre-computed passage embeddings with semantic search.
# NOTE(review): Streamlit re-executes the script on interaction, so these
# model/file loads presumably repeat on every rerun -- consider caching.
bi_encoder = SentenceTransformer(option1,device='cpu')
bi_encoder.max_seq_length = 256    #Truncate long passages to 256 tokens
top_k = 32                          #Number of passages we want to retrieve with the bi-encoder

# The bi-encoder retrieves top_k passages; the cross-encoder then scores each
# (query, passage) pair so the result list can be re-ranked.
cross_encoder = CrossEncoder(option2, device='cpu')

# Placeholder; replaced by the cached sentences loaded below.
passages = []

# Load the pre-computed embeddings file.
print("Load pre-computed embeddings from disc")
# Previous torch/json based cache format, disabled:
# embedding_cache_path = 'embeddings.pt'
# corpus_embeddings = torch.load(embedding_cache_path)
# with open('sentences.json', 'r') as file:
#     passages = json.load(file)

embedding_cache_path = 'etsy-embeddings-cpu.pkl'
# embedding_cache_path = 'etsy-embeddings-cpu-3parts-0530.pkl'
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a cache that was produced locally / comes from a trusted source.
with open(embedding_cache_path, "rb") as fIn:
  cache_data = pickle.load(fIn)
  # 'sentences' is indexed by the corpus_id of a search hit later on;
  # presumably it is aligned row-by-row with 'embeddings' -- verify against
  # the script that produced the cache.
  passages = cache_data['sentences']
  corpus_embeddings = cache_data['embeddings']

from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np
import re

import yake

# Settings handed to the YAKE keyword extractor that clean_string() uses on
# long passages.
language = "en"
max_ngram_size = 3
deduplication_threshold = 0.9
deduplication_algo = 'seqm'
windowSize = 3
# Passed as `top`; presumably caps the number of keywords returned per text.
numOfKeywords = 3

custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, windowsSize=windowSize, top=numOfKeywords, features=None)
# Load the query -> GMS lookup table. Candidates that are missing from it are
# given a GMS of 0 by the code that reads it.
# NOTE(review): opened with the platform default text encoding -- confirm the
# file is readable that way on the deployment host.
with open('query_gms_mock_2M.json', 'r') as file:
    query_gms_dict = json.load(file)
    

def word_len(s):
    """Count the words in ``s``, using the literal space character as separator.

    Empty pieces caused by leading, trailing or repeated spaces are ignored.
    Other whitespace (tabs, newlines) does not separate words.
    """
    return sum(1 for piece in s.split(' ') if piece)


# Score floor used by re_rank_candidates(): a candidate is kept only when the
# score(s) relevant to the chosen ranking method are strictly greater than it.
DEFAULT_SCORE = -100.0
def clean_string(input_string):
    """Normalise a passage and derive query-expansion candidates from it.

    The text is cleaned in three steps: every character outside the accepted
    ranges becomes a space, each pair of consecutive spaces becomes a newline,
    and the result is stripped and lower-cased.

    * Cleaned text longer than 20 characters is passed to the YAKE extractor
      and only keywords made of more than one word are kept.
    * Shorter cleaned text is returned as the single candidate.
    * Text that is empty after cleaning yields no candidate at all.

    Args:
        input_string: raw passage text.

    Returns:
        A list of candidate strings (possibly empty).
    """
    # NOTE(review): the second range (U+0041..U+007A) also spans the six
    # punctuation characters that sit between 'Z' and 'a', so brackets,
    # backslash, caret, underscore and backtick are preserved. Left as-is;
    # confirm whether letters-only was intended.
    replaced = re.sub("([^\u0030-\u0039\u0041-\u007a])", ' ', input_string)
    collapsed = re.sub("\x20\x20", "\n", replaced)
    normalized = collapsed.strip().lower()

    # Nothing usable left (empty or punctuation-only input): returning [""]
    # here would surface an empty string as an expansion candidate.
    if not normalized:
        return []

    if len(normalized) <= 20:
        return [normalized]

    # extract_keywords yields (keyword, score) pairs; single-word keywords
    # are dropped.
    keywords = custom_kw_extractor.extract_keywords(normalized)
    return [keyword[0] for keyword in keywords if word_len(keyword[0]) > 1]

# add gms column
def add_gms_score_for_candidates(candidates):
    """Return a copy of ``candidates`` in which every entry carries a GMS value.

    Args:
        candidates: mapping of candidate string to a dict holding at least
            ``'bi_score'`` and ``'cross_score'``.

    Returns:
        A new dict with the same keys. Each value is a fresh dict with exactly
        the keys ``'gms'``, ``'bi_score'`` and ``'cross_score'``; ``'gms'`` is
        looked up in ``query_gms_dict`` and is 0 for unknown candidates.
    """
    return {
        name: {
            'gms': query_gms_dict.get(name, 0),
            'bi_score': scores['bi_score'],
            'cross_score': scores['cross_score'],
        }
        for name, scores in candidates.items()
    }
    
def generate_query_expansion_candidates(query):
    """Retrieve passages related to ``query`` and turn them into expansion candidates.

    The query is embedded with the bi-encoder and matched against the
    pre-computed corpus embeddings (``top_k`` hits). Every hit is additionally
    scored by the cross-encoder, and its passage text is reduced to candidate
    strings by ``clean_string``.

    Args:
        query: the user's query string.

    Returns:
        A dict mapping each candidate string to its own dict with the keys
        ``'bi_score'``, ``'cross_score'`` and ``'gms'`` (0 when the candidate
        is not present in ``query_gms_dict``). When several passages produce
        the same candidate, the scores of the last such passage (in the order
        returned by the semantic search) are kept. The query itself is never
        returned as a candidate.
    """
    print("Input query:", query)

    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    query_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # Get the hits for the first (and only) query
    encoder_hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]

    # Score every retrieved passage against the query with the cross-encoder
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in encoder_hits]
    cross_scores = cross_encoder.predict(cross_inp)

    final_candidates = {}
    for hit, cross_score in zip(encoder_hits, cross_scores):
        passage_text = passages[hit['corpus_id']].replace("\n", "")
        for candidate in set(clean_string(passage_text)):
            # Build a separate dict per candidate. Sharing one dict between all
            # keywords of a passage would make the 'gms' written for one
            # keyword overwrite the value of its siblings.
            final_candidates[candidate] = {
                'bi_score': hit['score'],
                'cross_score': cross_score,
                'gms': query_gms_dict.get(candidate, 0),
            }

    # Remove the query itself from the candidates. Candidates are lower-cased
    # and stripped by clean_string, so the normalised form of the query is
    # removed as well as the raw one.
    for own_form in {query, query.strip().lower()}:
        final_candidates.pop(own_form, None)

    return final_candidates

def re_rank_candidates(query, candidates, method):
    """Filter and order expansion candidates, best first.

    Args:
        query: the original query (accepted for interface symmetry; not used).
        candidates: mapping of candidate string to its score dict.
        method: ranking strategy.
            ``'bi_encoder'``    - by ``bi_score``.
            ``'cross_encoder'`` - by ``cross_score``.
            ``'gms'``           - by ``gms``, ties ordered by
                                  ``cross_score + bi_score``.
            anything else       - by ``cross_score + bi_score``.

    Returns:
        A list of ``(candidate, scores)`` tuples in descending order. Entries
        whose relevant score(s) are not strictly above ``DEFAULT_SCORE`` are
        dropped.
    """
    entries = list(candidates.items())

    if method == 'bi_encoder':
        kept = [entry for entry in entries if entry[1]['bi_score'] > DEFAULT_SCORE]
        kept.sort(key=lambda entry: entry[1]['bi_score'], reverse=True)
        return kept

    if method == 'cross_encoder':
        kept = [entry for entry in entries if entry[1]['cross_score'] > DEFAULT_SCORE]
        kept.sort(key=lambda entry: entry[1]['cross_score'], reverse=True)
        return kept

    # 'gms' and the fallback share the same filter and combined-score ordering.
    kept = [
        entry for entry in entries
        if entry[1]['cross_score'] > DEFAULT_SCORE and entry[1]['bi_score'] > DEFAULT_SCORE
    ]
    kept.sort(key=lambda entry: entry[1]['cross_score'] + entry[1]['bi_score'], reverse=True)

    if method == 'gms':
        # Stable second pass: GMS becomes the primary key while the combined
        # encoder ordering above breaks ties.
        kept.sort(key=lambda entry: entry[1]['gms'], reverse=True)

    return kept


# Button handler: runs the retrieval pipeline for the entered query and shows
# two views of the result side by side, each cut to the top-N chosen in the
# sidebar.
if st.button('Generated Expansion'):
    if not user_query.strip():
        # A blank query would still be embedded and searched, which only
        # produces meaningless expansions.
        st.warning("Please enter a query before generating expansions.")
    else:
        st.write("E-Commerce Query Expansion Candidates: \n")
        col1, col2 = st.columns(2)
        candidates = generate_query_expansion_candidates(query=user_query)
        with col1:
            # Left column: candidate strings ordered by cross-encoder score.
            st.subheader('Raw Candidates:')
            candidates_rerank = re_rank_candidates(user_query, candidates, method='cross_encoder')[:maxtags_sidebar]
            result = [item[0] for item in candidates_rerank]
            st.write(result)
        with col2:
            # Right column: the same candidates ordered by GMS value.
            st.subheader('Rerank By GMS:')
            candidates_gms = add_gms_score_for_candidates(candidates)
            candidates_rerank = re_rank_candidates(user_query, candidates_gms, method='gms')[:maxtags_sidebar]
            data_dicts = [{'query': item[0], 'GMS Value': item[1]['gms']} for item in candidates_rerank]
            # Explicit columns keep the table headers even when no candidate
            # survives the filtering.
            df = pd.DataFrame(data_dicts, columns=['query', 'GMS Value'])
            st.write(df)