Spaces:
Running
Running
Upload 2 files
Browse files- app.py +50 -7
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -9,6 +9,9 @@ import re
|
|
| 9 |
import random
|
| 10 |
import compress_fasttext
|
| 11 |
from collections import OrderedDict
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
faq_content="""
|
|
@@ -52,6 +55,34 @@ You can read more about TF-IDF on its [Wikipedia page](https://en.wikipedia.org/
|
|
| 52 |
"""
|
| 53 |
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
# Load the model and data once at startup
|
| 56 |
with h5py.File('complete_artist_data.hdf5', 'r') as f:
|
| 57 |
# Deserialize the vectorizer
|
|
@@ -99,11 +130,16 @@ def find_similar_tags(test_tags):
|
|
| 99 |
# Find similar tags and prepare data for dataframe.
|
| 100 |
results_data = []
|
| 101 |
for tag in test_tags:
|
| 102 |
-
|
|
|
|
| 103 |
result, seen = [], set()
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
else:
|
| 108 |
for item in similar_words:
|
| 109 |
similar_word, similarity = item
|
|
@@ -127,13 +163,20 @@ def find_similar_tags(test_tags):
|
|
| 127 |
results_data.append(["", word, sim])
|
| 128 |
results_data.append(["", "", ""]) # Adds a blank line after each group of tags
|
| 129 |
|
|
|
|
|
|
|
| 130 |
|
| 131 |
return results_data # Return list of lists for Dataframe
|
| 132 |
|
| 133 |
def find_similar_artists(new_tags_string, top_n):
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
X_new_image = vectorizer.transform([','.join(new_image_tags)])
|
| 139 |
similarities = cosine_similarity(X_new_image, X_artist)[0]
|
|
|
|
| 9 |
import random
|
| 10 |
import compress_fasttext
|
| 11 |
from collections import OrderedDict
|
| 12 |
+
from lark import Lark
|
| 13 |
+
from lark import Token
|
| 14 |
+
|
| 15 |
|
| 16 |
|
| 17 |
faq_content="""
|
|
|
|
| 55 |
"""
|
| 56 |
|
| 57 |
|
| 58 |
+
grammar=r"""
|
| 59 |
+
!start: (prompt | /[][():]/+)*
|
| 60 |
+
prompt: (emphasized | plain | comma | WHITESPACE)*
|
| 61 |
+
!emphasized: "(" prompt ")"
|
| 62 |
+
| "(" prompt ":" [WHITESPACE] NUMBER [WHITESPACE] ")"
|
| 63 |
+
comma: ","
|
| 64 |
+
WHITESPACE: /\s+/
|
| 65 |
+
plain: /([^,\\\[\]():|]|\\.)+/
|
| 66 |
+
%import common.SIGNED_NUMBER -> NUMBER
|
| 67 |
+
"""
|
| 68 |
+
# Initialize the parser
|
| 69 |
+
parser = Lark(grammar, start='start')
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# Function to extract tags
|
| 73 |
+
def extract_tags(tree):
    """Collect the plain-text tags from a parsed prompt tree.

    Walks the tree depth-first, left to right, and gathers the stripped
    text of every token that sits under a ``plain`` rule node.

    Args:
        tree: Root of the parse tree returned by ``parser.parse``.

    Returns:
        list: Tag strings in the order they appear in the prompt.
    """
    tags = []
    # Explicit stack instead of recursion, so deeply nested parentheses in
    # a prompt cannot exhaust the interpreter's recursion limit.
    stack = [tree]
    while stack:
        node = stack.pop()
        # Leaves have no children to descend into: tokens, and the None
        # placeholders that newer Lark releases insert for an absent
        # optional ``[...]`` item.
        if node is None or isinstance(node, Token):
            continue
        if node.data == 'plain':
            # Match on the rule name rather than the auto-generated
            # terminal name ('__ANON_1'), which is renumbered whenever
            # another anonymous pattern is added to the grammar.
            tags.extend(
                child.value.strip()
                for child in node.children
                if isinstance(child, Token)
            )
        else:
            # Reversed so the leftmost child is popped (visited) first.
            stack.extend(reversed(node.children))
    return tags
|
| 84 |
+
|
| 85 |
+
|
| 86 |
# Load the model and data once at startup
|
| 87 |
with h5py.File('complete_artist_data.hdf5', 'r') as f:
|
| 88 |
# Deserialize the vectorizer
|
|
|
|
| 130 |
# Find similar tags and prepare data for dataframe.
|
| 131 |
results_data = []
|
| 132 |
for tag in test_tags:
|
| 133 |
+
modified_tag_for_search = tag.replace(' ','_')
|
| 134 |
+
similar_words = find_similar_tags.fasttext_small_model.most_similar(modified_tag_for_search)
|
| 135 |
result, seen = [], set()
|
| 136 |
+
|
| 137 |
+
if modified_tag_for_search in find_similar_tags.tag2aliases:
|
| 138 |
+
if tag in find_similar_tags.tag2aliases and "_" in tag: #Implicitly tell the user that they should get rid of the underscore
|
| 139 |
+
# list.append accepts exactly one argument, so the word and its score of 1 are stored together as one tuple.
result.append((modified_tag_for_search.replace('_',' '), 1))
|
| 140 |
+
seen.add(tag)
|
| 141 |
+
else: #The user correctly did not put underscores in their tag
|
| 142 |
+
continue
|
| 143 |
else:
|
| 144 |
for item in similar_words:
|
| 145 |
similar_word, similarity = item
|
|
|
|
| 163 |
results_data.append(["", word, sim])
|
| 164 |
results_data.append(["", "", ""]) # Adds a blank line after each group of tags
|
| 165 |
|
| 166 |
+
if not results_data:
|
| 167 |
+
results_data.append(["No Unknown Tags Found", "", ""])
|
| 168 |
|
| 169 |
return results_data # Return list of lists for Dataframe
|
| 170 |
|
| 171 |
def find_similar_artists(new_tags_string, top_n):
|
| 172 |
+
# Parse the prompt
|
| 173 |
+
parsed = parser.parse(new_tags_string)
|
| 174 |
+
# Extract tags from the parsed tree
|
| 175 |
+
new_image_tags = extract_tags(parsed)
|
| 176 |
+
new_image_tags = [tag.replace('_', ' ').strip() for tag in new_image_tags]
|
| 177 |
+
|
| 178 |
+
###unseen_tags = list(set(OrderedDict.fromkeys(new_image_tags)) - set(vectorizer.vocabulary_.keys()))
|
| 179 |
+
unseen_tags_data = find_similar_tags(new_image_tags)
|
| 180 |
|
| 181 |
X_new_image = vectorizer.transform([','.join(new_image_tags)])
|
| 182 |
similarities = cosine_similarity(X_new_image, X_artist)[0]
|
requirements.txt
CHANGED
|
@@ -4,3 +4,4 @@ scikit-learn==1.2.2
|
|
| 4 |
h5py==3.8.0
|
| 5 |
joblib==1.2.0
|
| 6 |
compress-fasttext
|
|
|
|
|
|
| 4 |
h5py==3.8.0
|
| 5 |
joblib==1.2.0
|
| 6 |
compress-fasttext
|
| 7 |
+
lark-parser
|