Spaces:
Build error
another version of app.py adding scores
#1 by yinlinfu - opened
appv2.py ADDED
@@ -0,0 +1,255 @@
import streamlit as st
from streamlit_tags import st_tags, st_tags_sidebar
from keytotext import pipeline
from PIL import Image

import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
import torch
import pickle
import random
import numpy as np
import pandas as pd

############
## Main page
############

st.write("# Demonstration for Etsy Query Expansion (Etsy-QE)")

st.markdown("***The idea is to build a model that takes a query as input and generates expansion information as output.***")
image = Image.open('etsy-shop-LLC.png')
st.image(image)

st.sidebar.write("# Top-N Selection")
maxtags_sidebar = st.sidebar.slider('Number of queries allowed?', 1, 20, 1, key='ehikwegrjifbwreuk')
#user_query = st_tags(
#    label='# Enter Query:',
#    text='Press enter to add more',
#    value=['Mother'],
#    suggestions=['gift', 'nike', 'wool'],
#    maxtags=maxtags_sidebar,
#    key="aljnf")

user_query = st.text_input("Enter a query for the generated text: e.g., gift, home decoration ...")

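# Note: st.text_input returns an empty string until the user types something,
# so the buttons below will run against an empty query in that case.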
# Add model-selection boxes to the Streamlit sidebar.
# The 'null' entries are placeholders; any compatible bi-encoder /
# cross-encoder model name could be listed instead.
option1 = st.sidebar.selectbox(
    'Which transformer model would you like to select?',
    ('multi-qa-MiniLM-L6-cos-v1', 'null', 'null'))

option2 = st.sidebar.selectbox(
    'Which cross-encoder model would you like to select?',
    ('cross-encoder/ms-marco-MiniLM-L-6-v2', 'null', 'null'))

st.sidebar.success("Loaded successfully!")

#if not torch.cuda.is_available():
#    print("Warning: No GPU found. Please add GPU to your notebook")

# We use the bi-encoder to encode all passages, so that we can use it with semantic search
bi_encoder = SentenceTransformer(option1, device='cpu')
bi_encoder.max_seq_length = 256  # Truncate long passages to 256 tokens
top_k = 32  # Number of passages we want to retrieve with the bi-encoder

# The bi-encoder retrieves top_k documents. We use a cross-encoder to re-rank the result list and improve the quality
cross_encoder = CrossEncoder(option2, device='cpu')

passages = []

# Load pre-computed embeddings file
embedding_cache_path = 'etsy-embeddings-cpu.pkl'
print("Load pre-computed embeddings from disk")
with open(embedding_cache_path, "rb") as fIn:
    cache_data = pickle.load(fIn)
    passages = cache_data['sentences']
    corpus_embeddings = cache_data['embeddings']

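# The cache file is assumed to hold a dict with 'sentences' (a list of passage
# strings) and 'embeddings' (a tensor of matching length), e.g. written offline
# by a preparation step along these lines (hypothetical sketch, not part of this app):
#   embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)
#   with open(embedding_cache_path, "wb") as fOut:
#       pickle.dump({'sentences': passages, 'embeddings': embeddings}, fOut)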
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import re

import yake

# YAKE keyword-extractor settings
language = "en"
max_ngram_size = 3
deduplication_threshold = 0.9
deduplication_algo = 'seqm'
windowSize = 3
numOfKeywords = 3

custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, windowsSize=windowSize, top=numOfKeywords, features=None)
# Load query GMS information
with open('query_gms.json', 'r') as file:
    query_gms_dict = json.load(file)

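# query_gms.json is assumed to map query strings to GMS (gross merchandise
# sales) values, e.g. {"mothers day gift": 120000, ...}; this shape is inferred
# from how query_gms_dict is used below, not from a documented schema.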
# We lower-case our text and remove stop words before indexing
def bm25_tokenizer(text):
    tokenized_doc = []
    for token in text.lower().split():
        token = token.strip(string.punctuation)
        if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
            tokenized_doc.append(token)
    return tokenized_doc

tokenized_corpus = []
for passage in tqdm(passages):
    tokenized_corpus.append(bm25_tokenizer(passage))

bm25 = BM25Okapi(tokenized_corpus)

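# Illustrative example (hypothetical input):
#   bm25_tokenizer("A Gift for Mother!") -> ['gift', 'mother']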
# Count the whitespace-separated words in a string
def word_len(s):
    return len([i for i in s.split(' ') if i])

# Sentinel score for candidates missing from one of the retrieval paths
DEFAULT_SCORE = -100.0

# Clean a passage and extract candidate keyword strings from it
def clean_string(input_string):
    # Keep digits and the ASCII range A-z (which also covers a few punctuation
    # characters between 'Z' and 'a'); everything else becomes a space
    string_sub1 = re.sub(r"[^\u0030-\u0039\u0041-\u007a]", ' ', input_string)
    # Collapse double spaces into newlines
    string_sub2 = re.sub(r"\x20\x20", "\n", string_sub1)
    string_strip = string_sub2.strip().lower()
    output_string = []
    if len(string_strip) > 20:
        # Long strings: keep only multi-word YAKE keywords
        keywords = custom_kw_extractor.extract_keywords(string_strip)
        for tokens in keywords:
            string_clean = tokens[0]
            if word_len(string_clean) > 1:
                output_string.append(string_clean)
    else:
        # Short strings are kept as-is
        output_string.append(string_strip)
    return output_string

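# Illustrative example (hypothetical passage; actual YAKE output may differ):
#   clean_string("Handmade Mothers Day Gift Personalized Necklace for Mom")
#   -> ['mothers day gift', 'personalized necklace']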
# Attach the GMS value for each candidate query (0 if unknown)
def add_gms_score_for_candidates(candidates, query_gms_dict):
    for query_candidate in candidates:
        value = candidates[query_candidate]
        value['gms'] = query_gms_dict.get(query_candidate, 0)
        candidates[query_candidate] = value
    return candidates

# Search the passage corpus and turn the best hits into expansion candidates
def generate_query_expansion_candidates(query):
    print("Input query:", query)

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    # Find the indices of the top-5 scores
    top_n_indices = np.argpartition(bm25_scores, -5)[-5:]
    bm25_hits = [{'corpus_id': idx, 'bm25_score': bm25_scores[idx]} for idx in top_n_indices]
    # bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    ##### Semantic search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    query_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # query_embedding = query_embedding.cuda()
    # Get the hits for the first query
    encoder_hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]

    # For all retrieved passages, add the cross-encoder scores
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in encoder_hits]
    cross_scores = cross_encoder.predict(cross_inp)
    for idx in range(len(cross_scores)):
        encoder_hits[idx]['cross_score'] = cross_scores[idx]

    # Merge BM25 and encoder hits, keyed by corpus id; missing scores get DEFAULT_SCORE
    candidates = {}
    for hit in bm25_hits:
        corpus_id = hit['corpus_id']
        if corpus_id not in candidates:
            candidates[corpus_id] = {'bm25_score': hit['bm25_score'], 'bi_score': DEFAULT_SCORE, 'cross_score': DEFAULT_SCORE}
    for hit in encoder_hits:
        corpus_id = hit['corpus_id']
        if corpus_id not in candidates:
            candidates[corpus_id] = {'bm25_score': DEFAULT_SCORE, 'bi_score': hit['score'], 'cross_score': hit['cross_score']}
        else:
            bm25_score = candidates[corpus_id]['bm25_score']
            candidates[corpus_id].update({'bm25_score': bm25_score, 'bi_score': hit['score'], 'cross_score': hit['cross_score']})

    # Turn the winning passages into keyword strings
    final_candidates = {}
    for key, value in candidates.items():
        input_string = passages[key].replace("\n", "")
        string_set = set(clean_string(input_string))
        for item in string_set:
            final_candidates[item] = value
    # Remove the query itself from the candidates
    if query in final_candidates:
        del final_candidates[query]

    # Add the gms column
    final_candidates = add_gms_score_for_candidates(final_candidates, query_gms_dict)
    # Total results
    st.write("E-Commerce Query Expansion Candidates: \n")
    return final_candidates

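# Illustrative shape of the returned mapping (values are made up, not real data):
#   {'mothers day gift': {'bm25_score': 12.3, 'bi_score': 0.71, 'cross_score': 4.2, 'gms': 120000}, ...}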
# Re-rank the candidates by the chosen scoring method and return a DataFrame
def re_rank_candidates(query, candidates, method):
    if method == 'bm25':
        # Filter and sort by bm25_score
        filtered_sorted_result = sorted(
            [(k, v) for k, v in candidates.items() if v['bm25_score'] > DEFAULT_SCORE],
            key=lambda x: x[1]['bm25_score'],
            reverse=True
        )
    elif method == 'bi_encoder':
        # Filter and sort by bi_score
        filtered_sorted_result = sorted(
            [(k, v) for k, v in candidates.items() if v['bi_score'] > DEFAULT_SCORE],
            key=lambda x: x[1]['bi_score'],
            reverse=True
        )
    elif method == 'cross_encoder':
        # Filter and sort by cross_score
        filtered_sorted_result = sorted(
            [(k, v) for k, v in candidates.items() if v['cross_score'] > DEFAULT_SCORE],
            key=lambda x: x[1]['cross_score'],
            reverse=True
        )
    elif method == 'gms':
        # First sort by cross_score + bi_score ...
        filtered_sorted_by_encoder = sorted(
            [(k, v) for k, v in candidates.items() if v['cross_score'] > DEFAULT_SCORE and v['bi_score'] > DEFAULT_SCORE],
            key=lambda x: x[1]['cross_score'] + x[1]['bi_score'],
            reverse=True
        )
        # ... then sort by gms
        filtered_sorted_result = sorted(filtered_sorted_by_encoder, key=lambda x: x[1]['gms'], reverse=True)
    else:
        # Default method: filter and sort by cross_score + bi_score
        filtered_sorted_result = sorted(
            [(k, v) for k, v in candidates.items() if v['cross_score'] > DEFAULT_SCORE and v['bi_score'] > DEFAULT_SCORE],
            key=lambda x: x[1]['cross_score'] + x[1]['bi_score'],
            reverse=True
        )
    data_dicts = [{'query': item[0], **item[1]} for item in filtered_sorted_result]
    # Convert the list of dictionaries into a DataFrame
    df = pd.DataFrame(data_dicts)
    return df

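# Note: Python's sort is stable, so in the 'gms' branch the final ordering is by
# gms, with the cross_score + bi_score ordering preserved among equal gms values.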
# st.write("## Raw Candidates:")
if st.button('Generate Expansion'):
    candidates = generate_query_expansion_candidates(query=user_query)
    df = re_rank_candidates(user_query, candidates, method='cross_encoder')
    result = list(df['query'][:maxtags_sidebar])
    st.write(result)
    ## convert into dataframe
    # data_dicts = [{'query': key, **values} for key, values in candidates.items()]
    # df = pd.DataFrame(data_dicts)
    # st.write(list(candidates.keys())[0:maxtags_sidebar])
    # st.write(df)
    # st.dataframe(df)
    # st.success(raw_candidates)

if st.button('Rerank By GMS'):
    candidates = generate_query_expansion_candidates(query=user_query)
    df = re_rank_candidates(user_query, candidates, method='gms')
    st.dataframe(df[['query', 'gms']][:maxtags_sidebar])
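# To try this locally (standard Streamlit entry point):
#   streamlit run appv2.py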