Spaces:
Runtime error
Runtime error
File size: 8,277 Bytes
051dae2 75095b6 051dae2 af0ed7b 051dae2 731e892 1fb6d8c 731e892 051dae2 7693508 9eb9f4e 7693508 051dae2 3c44dc3 fbcebf2 7693508 3c44dc3 051dae2 3c44dc3 051dae2 65cd7ee 3c44dc3 051dae2 3c44dc3 051dae2 3c44dc3 342a876 3c44dc3 7693508 342a876 3c44dc3 051dae2 3c44dc3 342a876 3c44dc3 e25dc5a db1f600 9111e7b cc8fc07 14a64d7 cc8fc07 9111e7b db1f600 9b2e9c2 db1f600 9b2e9c2 fbcebf2 4c16383 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 |
import streamlit as st
from streamlit_tags import st_tags, st_tags_sidebar
from keytotext import pipeline
from PIL import Image
from tabulate import tabulate
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
import torch
import pickle
import random
import numpy as np
import pandas as pd
############
## Main page
############
# Page title, tagline and banner image. Streamlit renders widgets in call
# order, so the order of the statements below is the page layout.
st.write("# Demonstration for Etsy Query Expansion(Etsy-QE)")
st.markdown("***Idea is to build a model which will take query as inputs and generate expansion information as outputs.***")
image = Image.open('etsy-shop-LLC.png')
st.image(image)
# Sidebar slider (arguments 1, 20, 1). Its value is used further down as the
# slice limit on the ranked candidate lists, i.e. how many results are shown.
st.sidebar.write("# Top-N Selection")
maxtags_sidebar = st.sidebar.slider('Number of query allowed?', 1, 20, 1, key='ehikwegrjifbwreuk')
# Previous tag-style (multi-value) query input, kept for reference:
#user_query = st_tags(
# label='# Enter Query:',
# text='Press enter to add more',
# value=['Mother'],
# suggestions=['gift', 'nike', 'wool'],
# maxtags=maxtags_sidebar,
# key="aljnf")
# Free-text query; this string is what gets expanded when the button is pressed.
user_query = st.text_input("Enter a query for the generated text: e.g., gift, home decoration ...")
# Sidebar select boxes for the bi-encoder (option1) and cross-encoder (option2)
# model names. The selected strings are passed unchanged to SentenceTransformer
# and CrossEncoder below.
# NOTE(review): the 'null' entries look like placeholders; selecting one hands
# the literal string 'null' to the model constructor -- confirm whether they
# should be removed or replaced by real model names.
option1 = st.sidebar.selectbox(
'Which transformers model would you like to be selected?',
('multi-qa-MiniLM-L6-cos-v1','null','null'))
option2 = st.sidebar.selectbox(
'Which corss-encoder model would you like to be selected?',
('cross-encoder/ms-marco-MiniLM-L-6-v2','null','null'))
# NOTE(review): this banner is emitted before the models and embeddings are
# actually loaded further down the script.
st.sidebar.success("Load Successfully!")
# Both models are created on the CPU; the GPU availability check is disabled.
#if not torch.cuda.is_available():
# print("Warning: No GPU found. Please add GPU to your notebook")
# The bi-encoder embeds the query so it can be compared against the
# pre-computed passage embeddings with semantic search.
# NOTE(review): this runs at module level, so the models are rebuilt whenever
# Streamlit re-executes the script -- consider caching them.
bi_encoder = SentenceTransformer(option1,device='cpu')
bi_encoder.max_seq_length = 256 #Truncate long passages to 256 tokens
top_k = 32 #Number of passages we want to retrieve with the bi-encoder
# The bi-encoder retrieves top_k passages; the cross-encoder then re-scores
# each (query, passage) pair to improve the quality of the ranking.
cross_encoder = CrossEncoder(option2, device='cpu')
# Pre-computed corpus: `passages` holds the passage strings and
# `corpus_embeddings` their bi-encoder embeddings, both read from one pickle.
passages = []
print("Load pre-computed embeddings from disc")
# Alternative caches used previously: 'embeddings.pt' (via torch.load, with
# the passages in 'sentences.json') and 'etsy-embeddings-cpu-3parts-0530.pkl'.
embedding_cache_path = 'etsy-embeddings-cpu.pkl'
# NOTE: pickle.load executes arbitrary code from the file; only load cache
# files that ship with this app.
with open(embedding_cache_path, "rb") as cache_file:
    cache_data = pickle.load(cache_file)
passages, corpus_embeddings = cache_data['sentences'], cache_data['embeddings']
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np
import re
import yake
# Settings for the YAKE keyword extractor. clean_string() calls this extractor
# on normalised passages longer than 20 characters.
language = "en"
max_ngram_size = 3  # passed as n= (presumably the longest keyphrase in words -- confirm against the yake docs)
deduplication_threshold = 0.9  # passed as dedupLim=
deduplication_algo = 'seqm'  # passed as dedupFunc=
windowSize = 3  # passed as windowsSize=
numOfKeywords = 3  # passed as top= (presumably the number of keywords returned per call -- confirm)
custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, windowsSize=windowSize, top=numOfKeywords, features=None)
# Query -> GMS lookup table; candidates that are missing from it are given a
# GMS of 0 by the code that reads it.
with open('query_gms_mock_2M.json', 'r') as gms_file:
    query_gms_dict = json.load(gms_file)
def word_len(s):
    """Count the non-empty pieces of *s* after splitting on single spaces.

    Only the space character separates words; tabs and newlines do not.
    """
    return sum(1 for piece in s.split(' ') if piece)
# Score floor used by re_rank_candidates(): a candidate is kept only when the
# score(s) relevant to the chosen ranking method are strictly greater than
# this value.
DEFAULT_SCORE = -100.0
def clean_string(input_string):
    """Normalise a passage and turn it into a list of candidate phrases.

    Characters outside the code-point ranges 0x30-0x39 and 0x41-0x7a are
    replaced by a space, every pair of consecutive spaces becomes a newline,
    and the result is stripped and lower-cased.

    Returns:
        A one-element list holding the normalised text when it is at most
        20 characters long. Otherwise, the phrases produced by
        ``custom_kw_extractor`` that contain more than one word; this list
        may be empty.
    """
    # NOTE(review): the 0x41-0x7a range also keeps the punctuation that sits
    # between 'Z' and 'a' ([ \ ] ^ _ `) -- confirm that this is intended.
    normalized = re.sub("([^\u0030-\u0039\u0041-\u007a])", ' ', input_string)
    normalized = re.sub("\x20\x20", "\n", normalized)
    normalized = normalized.strip().lower()

    # Short texts are used as-is; only longer ones go through keyword extraction.
    if len(normalized) <= 20:
        return [normalized]

    # Each extractor result is indexable and its first element is the phrase.
    return [
        keyword[0]
        for keyword in custom_kw_extractor.extract_keywords(normalized)
        if word_len(keyword[0]) > 1
    ]
# add gms column
def add_gms_score_for_candidates(candidates):
    """Build a new candidate mapping with a freshly looked-up GMS value.

    Args:
        candidates: Mapping of candidate phrase -> dict containing at least
            ``'bi_score'`` and ``'cross_score'``.

    Returns:
        A new dict with the same keys. Each value is a new dict with exactly
        the keys ``'gms'``, ``'bi_score'`` and ``'cross_score'``; ``'gms'`` is
        read from ``query_gms_dict`` and is 0 for unknown phrases. The input
        mapping is not modified.
    """
    return {
        phrase: {
            'gms': query_gms_dict.get(phrase, 0),
            'bi_score': scores['bi_score'],
            'cross_score': scores['cross_score'],
        }
        for phrase, scores in candidates.items()
    }
def generate_query_expansion_candidates(query):
    """Retrieve and score expansion candidates for *query*.

    The query is embedded with ``bi_encoder`` and the ``top_k`` closest
    passages are fetched from ``corpus_embeddings``. Every (query, passage)
    pair is then scored with ``cross_encoder``. Each retrieved passage is
    turned into candidate phrases by ``clean_string``.

    Args:
        query: The user's query string.

    Returns:
        A dict mapping candidate phrase -> ``{'bi_score', 'cross_score',
        'gms'}``. ``bi_score`` is the score reported by
        ``util.semantic_search``, ``cross_score`` the cross-encoder
        prediction for the source passage, and ``gms`` the value found in
        ``query_gms_dict`` (0 when the phrase is unknown). Every phrase owns
        its own score dict. The query itself is never returned as a
        candidate.
    """
    print("Input query:", query)

    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    query_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # Only one query is searched, so take the hit list of the first query.
    encoder_hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]
    if not encoder_hits:
        # Nothing retrieved: there is nothing to score with the cross-encoder.
        return {}

    ##### Re-Ranking #####
    # Score every retrieved passage against the query with the cross-encoder.
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in encoder_hits]
    cross_scores = cross_encoder.predict(cross_inp)

    final_candidates = {}
    for hit, cross_score in zip(encoder_hits, cross_scores):
        passage_text = passages[hit['corpus_id']].replace("\n", "")
        for phrase in set(clean_string(passage_text)):
            # Build a fresh dict per phrase. Sharing one dict between all
            # phrases of a passage made the last phrase's GMS overwrite the
            # GMS of its siblings.
            # NOTE(review): when two passages yield the same phrase, the
            # later hit still overwrites the earlier one -- confirm whether
            # the better-scoring entry should be kept instead.
            final_candidates[phrase] = {
                'bi_score': hit['score'],
                'cross_score': cross_score,
                'gms': query_gms_dict.get(phrase, 0),
            }

    # Remove the query itself from the candidates. Candidates are stripped
    # and lower-cased by clean_string, so the same normalisation of the
    # query has to be checked as well as the raw text.
    for own_form in {query, query.strip().lower()}:
        final_candidates.pop(own_form, None)

    return final_candidates
def re_rank_candidates(query, candidates, method):
    """Filter and order expansion candidates with the chosen ranking method.

    Args:
        query: The user query. Not used for ranking; kept so that existing
            callers keep working.
        candidates: Mapping of candidate phrase -> score dict containing
            ``'bi_score'`` and ``'cross_score'`` and, for the ``'gms'``
            method, ``'gms'`` (a missing ``'gms'`` entry counts as 0).
        method: One of

            * ``'bi_encoder'``: keep ``bi_score > DEFAULT_SCORE``, order by
              ``bi_score``.
            * ``'cross_encoder'``: keep ``cross_score > DEFAULT_SCORE``,
              order by ``cross_score``.
            * ``'gms'``: keep candidates whose two encoder scores both
              exceed ``DEFAULT_SCORE``, order by ``gms`` and break ties by
              ``cross_score + bi_score``.
            * anything else: same filter as ``'gms'``, ordered by
              ``cross_score + bi_score``.

    Returns:
        A list of ``(phrase, scores)`` tuples, best first. ``scores`` is the
        same dict object that was passed in. Candidates with equal sort keys
        keep the order they have in *candidates*.
    """
    def encoder_sum(scores):
        return scores['cross_score'] + scores['bi_score']

    both_encoders = ('cross_score', 'bi_score')
    # method -> (score fields that must beat DEFAULT_SCORE, sort key)
    strategies = {
        'bi_encoder': (('bi_score',), lambda scores: scores['bi_score']),
        'cross_encoder': (('cross_score',), lambda scores: scores['cross_score']),
        # A single tuple key gives the same order as sorting by the encoder
        # sum first and then stable-sorting by GMS.
        'gms': (
            both_encoders,
            lambda scores: (scores.get('gms', 0), encoder_sum(scores)),
        ),
    }
    required_fields, sort_key = strategies.get(method, (both_encoders, encoder_sum))

    ranked = [
        (phrase, scores)
        for phrase, scores in candidates.items()
        if all(scores[field] > DEFAULT_SCORE for field in required_fields)
    ]
    ranked.sort(key=lambda item: sort_key(item[1]), reverse=True)
    return ranked
# Button handler: run the expansion pipeline for the current query and show
# two views side by side. Both views are truncated to the sidebar's Top-N value.
if st.button('Generated Expansion'):
    if not user_query.strip():
        # An empty query has nothing to expand; skip the model calls.
        st.warning("Please enter a query first.")
    else:
        st.write("E-Commerce Query Expansion Candidates: \n")
        col1, col2 = st.columns(2)
        candidates = generate_query_expansion_candidates(query=user_query)

        with col1:
            # Candidate phrases ordered by cross-encoder score.
            st.subheader('Raw Candidates:')
            candidates_rerank = re_rank_candidates(user_query, candidates, method='cross_encoder')[:maxtags_sidebar]
            result = [item[0] for item in candidates_rerank]
            st.write(result)

        with col2:
            # The same candidates ordered by GMS, shown as a table.
            st.subheader('Rerank By GMS:')
            candidates_gms = add_gms_score_for_candidates(candidates)
            candidates_rerank = re_rank_candidates(user_query, candidates_gms, method='gms')[:maxtags_sidebar]
            data_dicts = [{'query': item[0], 'GMS Value': item[1]['gms']} for item in candidates_rerank]
            # Explicit columns keep the table header even when nothing matched.
            df = pd.DataFrame(data_dicts, columns=['query', 'GMS Value'])
            st.write(df)