Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| from tfidf_matcher.ngrams import ngrams | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.neighbors import NearestNeighbors | |
| import gradio as gr | |
def matcher(original=None, lookup=None, outname='Original', ngram_length=3, cutoff=0.8):
    """Fuzzy-match each name in *original* against the *lookup* corpus.

    Vectorizes both name lists with TF-IDF over character n-grams, then
    finds each original name's single nearest lookup name by cosine
    distance. Matches whose confidence (1 - rounded cosine distance)
    does not exceed ``cutoff`` are discarded.

    Args:
        original: Comma-separated string of names to match, or an
            iterable of name strings. Defaults to no names.
        lookup: Comma-separated string (or iterable) of candidate names.
        outname: Column header used for the original names in the result.
        ngram_length: Character n-gram size for the TF-IDF analyzer.
        cutoff: Minimum confidence, exclusive, for a match to be kept.

    Returns:
        pandas.DataFrame with columns ``[outname, "Match"]``, sorted by
        descending match confidence (the confidence column itself is
        dropped before returning).
    """
    k_matches = 1
    # Accept either a comma-separated string or an iterable of names.
    # (Previously the defaults were mutable lists and `.split` was called
    # unconditionally, so calling with defaults — or any list — crashed.)
    if original is None:
        original = []
    if lookup is None:
        lookup = []
    if isinstance(original, str):
        original = original.split(",")
    else:
        original = list(original)
    if isinstance(lookup, str):
        lookup = lookup.split(",")
    else:
        lookup = list(lookup)
    # Strip stray whitespace so " Ambrose" and "Ambrose" produce the
    # same n-grams (the demo input has spaces after each comma).
    original = [x.strip() for x in original]
    lookup = [x.strip() for x in lookup]
    # Guard: TfidfVectorizer raises on an empty corpus.
    if not original or not lookup:
        return pd.DataFrame(columns=[outname, "Match"])

    original_lower = [x.lower() for x in original]
    lookup_lower = [x.lower() for x in lookup]

    def ngrams_user(string, n=ngram_length):
        # Analyzer callable binding the configured n-gram length.
        return ngrams(string, n)

    # Fit TF-IDF on the lookup corpus, then a cosine KNN model on it.
    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams_user)
    tf_idf_lookup = vectorizer.fit_transform(lookup_lower)
    nbrs = NearestNeighbors(
        n_neighbors=k_matches, n_jobs=-1, metric="cosine"
    ).fit(tf_idf_lookup)

    # Vectorize the originals in the lookup's vocabulary and query.
    tf_idf_original = vectorizer.transform(original_lower)
    distances, lookup_indices = nbrs.kneighbors(tf_idf_original)

    # One row per original name: (original, best match, confidence).
    # Confidence is 1 minus the rounded cosine distance, as before.
    rows = []
    for i, neighbor_idxs in enumerate(lookup_indices):
        best_match = lookup[neighbor_idxs[0]]
        confidence = 1 - round(distances[i][0], 2)
        rows.append((original[i], best_match, confidence))
    matches = pd.DataFrame(rows, columns=[outname, "Match", "Match Confidence"])

    # Filter by cutoff, sort best-first, then hide the confidence column.
    # .copy() avoids chained-assignment warnings from in-place ops on a slice.
    matches = matches.loc[matches["Match Confidence"] > cutoff].copy()
    matches = matches.sort_values(by=["Match Confidence"], ascending=False)
    return matches.drop(columns=["Match Confidence"])
def combine(a, b):
    """Return the two strings joined by a single space."""
    return " ".join((a, b))
# Gradio UI: two textboxes (names + names-to-match) feeding `matcher`,
# with the resulting DataFrame rendered on the right.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            txt = gr.Textbox(
                label="Input a list of names",
                value='Courtney Walsh,Curtly Ambrose,Malcolm Marshall,Brian Lara,Viv Richards,Obama',
                lines=2,
            )
            txt_2 = gr.Textbox(
                label="Input some names to match",
                value="Walsh, Ambrose, Marshall, Lara",
                lines=2,
            )
        with gr.Column():
            outty = gr.Dataframe(
                headers=["Original", "Match"],
                datatype=["str", "str"],
                label="Matched",
            )
    btn = gr.Button(value="Submit")
    # matcher(original=txt, lookup=txt_2) -> DataFrame shown in `outty`.
    btn.click(matcher, inputs=[txt, txt_2], outputs=[outty])

# Guard the launch so importing this module doesn't start the server.
# (The guard was commented out, leaving demo.launch() to run on import.)
if __name__ == "__main__":
    demo.launch()