Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1 +1,110 @@
|
|
| 1 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from tfidf_matcher.ngrams import ngrams
|
| 3 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 4 |
+
from sklearn.neighbors import NearestNeighbors
|
| 5 |
+
import gradio as gr
|
| 6 |
+
|
| 7 |
+
def matcher(original=None, lookup=None, outname='Original', ngram_length=3, cutoff=0.8):
    """Match each name in ``original`` to its nearest name in ``lookup``.

    The lookup names are lower-cased and turned into a TF-IDF matrix whose
    terms are produced by ``ngrams``; the original names are projected into the
    same space and paired with their single nearest neighbour by cosine
    distance. Confidence is ``1 - distance``, rounded to two decimals.

    Args:
        original: Names to find matches for, given either as one
            comma-separated string (what the text boxes in this app send) or
            as an iterable of strings.
        lookup: Candidate names to match against, in the same forms.
        outname: Column name used for the original names in the result.
        ngram_length: Value passed to ``ngrams`` as the n-gram length.
        cutoff: Rows are kept only when their confidence is strictly greater
            than this value.

    Returns:
        A DataFrame with columns ``[outname, "Match", "Match Confidence"]``
        sorted by descending confidence. It is empty when either input
        contains no names.
    """
    columns = [outname, "Match", "Match Confidence"]

    def _split_names(value):
        # Accept the comma-separated text the UI sends as well as a ready-made
        # iterable. Each entry is trimmed so the space typed after a comma does
        # not become part of the name, and blank entries are dropped.
        if value is None:
            return []
        parts = value.split(",") if isinstance(value, str) else value
        return [name for name in (str(part).strip() for part in parts) if name]

    original = _split_names(original)
    lookup = _split_names(lookup)

    # Nothing to match, or nothing to match against: return an empty table
    # instead of handing empty input to the vectorizer.
    if not original or not lookup:
        return pd.DataFrame(columns=columns)

    # Bind the requested n-gram length into the analyzer callable.
    def ngrams_user(string, n=ngram_length):
        return ngrams(string, n)

    # Fit the TF-IDF vocabulary on the lower-cased lookup names.
    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams_user)
    tf_idf_lookup = vectorizer.fit_transform([name.lower() for name in lookup])

    # One nearest neighbour per original name, measured by cosine distance.
    nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1, metric="cosine").fit(tf_idf_lookup)
    tf_idf_original = vectorizer.transform([name.lower() for name in original])
    distances, lookup_indices = nbrs.kneighbors(tf_idf_original)

    # Report the names as entered (not the lower-cased copies used for matching).
    matched_names = [lookup[index] for index in lookup_indices[:, 0]]
    # Round after subtracting so values such as 0.30000000000000004 cannot appear.
    confidences = [round(1 - float(dist), 2) for dist in distances[:, 0]]

    matches = pd.DataFrame(
        list(zip(original, matched_names, confidences)),
        columns=columns,
    )

    # Filter first, then sort into a new frame rather than sorting a slice in place.
    confident = matches.loc[matches["Match Confidence"] > cutoff]
    return confident.sort_values(by=["Match Confidence"], ascending=False)
|
| 82 |
+
|
| 83 |
+
def combine(a, b):
    """Return ``a`` and ``b`` concatenated with a single space between them."""
    separator = " "
    left_part = a + separator
    return left_part + b
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# Gradio UI: one column holds the two text boxes for the comma-separated name
# lists, a second column holds the table of matches, and the Submit button
# runs matcher on the two text boxes.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            txt = gr.Textbox(
                label="Input a list of names",
                value='Courtney Walsh,Curtly Ambrose,Malcolm Marshall,Brian Lara,Viv Richards,Obama',
                lines=2,
            )
            txt_2 = gr.Textbox(
                label="Input some names to match",
                value="Walsh, Ambrose, Marshall, Lara",
                lines=2,
            )

        with gr.Column():
            outty = gr.Dataframe(
                # Headers mirror the column names of the frame matcher returns.
                headers=["Original", "Match", "Match Confidence"],
                datatype=["str", "str", "number"],
                label="Matched",
            )

    btn = gr.Button(value="Submit")
    btn.click(matcher, inputs=[txt, txt_2], outputs=[outty])


if __name__ == "__main__":
    demo.launch()
|