Spaces:

FoodDesert
/

Prompt_Squirrel

Running

App Files Files Community

FoodDesert committed on Apr 22, 2024

Commit

c442eda

verified ·

1 Parent(s): 273251e

Upload 2 files

Browse files

Files changed (2) hide show

ConvertSampleImagesToJpeg.ipynb +147 -0
app.py +16 -100

ConvertSampleImagesToJpeg.ipynb ADDED Viewed

	@@ -0,0 +1,147 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "4aa04654",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "098e115f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import glob\n",
+    "import os\n",
+    "import json\n",
+    "from PIL import Image\n",
+    "from sd_parsers import ParserManager\n",
+    "\n",
+    "# Directory with PNG images\n",
+    "image_directory = 'E:/image/holder/Tagset_Completer/sampleimages/02landscape'\n",
+    "\n",
+    "# Initialize the ParserManager\n",
+    "parser_manager = ParserManager()\n",
+    "\n",
+    "# Dictionary mapping each artist name to its corresponding JPG file name\n",
+    "artist_to_file_map = {}\n",
+    "\n",
+    "# Iterate through PNG files in the directory\n",
+    "for png_file in glob.glob(os.path.join(image_directory, '*.png')):\n",
+    "    with Image.open(png_file) as img:\n",
+    "        # Extract metadata using ParserManager\n",
+    "        prompt_info = parser_manager.parse(img)\n",
+    "        if prompt_info and prompt_info.prompts:\n",
+    "            first_prompt_text = list(prompt_info.prompts)[0].value.split(',')[0].strip()\n",
+    "            if first_prompt_text.startswith(\"by \"):\n",
+    "                first_prompt_text = first_prompt_text[3:]  # Remove \"by \" prefix\n",
+    "            artist_to_file_map[first_prompt_text] = os.path.basename(png_file).replace('.png', '.jpg')\n",
+    "        else:\n",
+    "            artist_to_file_map[\"\"] = os.path.basename(png_file).replace('.png', '.jpg')\n",
+    "\n",
+    "# Save the mapping to a JSON file in the same directory\n",
+    "json_path = os.path.join(image_directory, 'artist_to_file_map.json')\n",
+    "with open(json_path, 'w') as json_file:\n",
+    "    json.dump(artist_to_file_map, json_file, indent=4)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "ac5cba7f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Iterate through PNG files in the directory\n",
+    "for png_file in glob.glob(os.path.join(image_directory, '*.png')):\n",
+    "    # Open the image\n",
+    "    with Image.open(png_file) as img:\n",
+    "        # Convert the image to RGB mode, since JPEG cannot store RGBA (alpha) or P (palette) images\n",
+    "        img = img.convert('RGB')\n",
+    "        # Define the output filename replacing .png with .jpg\n",
+    "        jpg_file = png_file.rsplit('.', 1)[0] + '.jpg'\n",
+    "        # Save the image in JPG format\n",
+    "        img.save(jpg_file, 'JPEG')\n",
+    "        # Remove the original PNG file now that the JPG copy has been saved (this always runs; it is not optional)\n",
+    "        os.remove(png_file)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "32bfb9cc",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3648a9fc",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "09f74cbd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "d2e18c17",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "354fda37",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ac4e5911",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

app.py CHANGED Viewed

@@ -185,39 +185,10 @@ def load_model_components(file_path):
         model_components['row_to_tag'] = {idx: tag for tag, idx in model_components['tag_to_row_index'].items()}
     return model_components
 # Load all components at the start
 tf_idf_components = load_model_components('tf_idf_files_420.joblib')
-# Load the model and data once at startup
-with h5py.File('complete_artist_data.hdf5', 'r') as f:
-    # Deserialize the vectorizer
-    vectorizer_bytes = f['vectorizer'][()].tobytes()
-    # Use io.BytesIO to convert bytes back to a file-like object for joblib to load
-    vectorizer_buffer = BytesIO(vectorizer_bytes)
-    vectorizer = load(vectorizer_buffer)
-    # Load X_artist
-    X_artist = f['X_artist'][:]
-    # Load artist names and decode to strings
-    artist_names = [name.decode() for name in f['artist_names'][:]]
-with h5py.File('conditional_tag_probabilities_matrix.h5', 'r') as f:
-    # Reconstruct the sparse co-occurrence matrix
-    conditional_co_occurrence_matrix = csr_matrix(
-        (f['co_occurrence_data'][:], f['co_occurrence_indices'][:], f['co_occurrence_indptr'][:]),
-        shape=f['co_occurrence_shape'][:]
-    )
-    # Reconstruct the vocabulary
-    conditional_words = f['vocabulary_words'][:]
-    conditional_indices = f['vocabulary_indices'][:]
-    conditional_vocabulary = {key.decode('utf-8'): value for key, value in zip(conditional_words, conditional_indices)}
-    # Load the document count
-    conditional_doc_count = f['doc_count'][()]
-    conditional_smoothing = 100. / conditional_doc_count
 nsfw_tags = set()  # Initialize an empty set to store words meeting the threshold
 # Open and read the CSV file
@@ -349,50 +320,6 @@ def build_tag_id_wiki_dict(filename='wiki_pages-2023-08-08.csv'):
     return tag_data
-#Imagine we are adding smoothing_value to the number of times word_j occurs in each document for smoothing.
-#Note the intention is that sum_i(P(word_i|word_j)) =(approx) # of words in a document rather than 1.
-def conditional_probability(word_i, word_j, co_occurrence_matrix, vocabulary, doc_count, smoothing_value=0.01):
-    word_i_index = vocabulary.get(word_i)
-    word_j_index = vocabulary.get(word_j)
-    if word_i_index is not None and word_j_index is not None:
-        # Directly access the sparse matrix elements
-        word_j_count = co_occurrence_matrix[word_j_index, word_j_index]
-        smoothed_word_j_count =  word_j_count + (smoothing_value * doc_count)
-        word_i_count = co_occurrence_matrix[word_i_index, word_i_index]
-        co_occurrence_count = co_occurrence_matrix[word_i_index, word_j_index]
-        smoothed_co_occurrence_count = co_occurrence_count + (smoothing_value * word_i_count)
-        # Calculate the conditional probability with smoothing
-        conditional_prob = smoothed_co_occurrence_count / smoothed_word_j_count
-        return conditional_prob
-    elif word_i_index is None:
-        return 0
-    else:
-        return None
-def geometric_mean_given_words(target_word, context_words, co_occurrence_matrix, vocabulary, doc_count, smoothing_value=0.01):
-    probabilities = []
-    # Collect the conditional probabilities of the target word given each context word, ignoring None values
-    for context_word in context_words:
-        prob = conditional_probability(target_word, context_word, co_occurrence_matrix, vocabulary, doc_count, smoothing_value)
-        if prob is not None:
-            probabilities.append(prob)
-    # Compute the geometric mean of the probabilities, avoiding division by zero
-    if probabilities:  # Check if the list is not empty
-        geometric_mean = np.prod(probabilities) ** (1.0 / len(probabilities))
-    else:
-        geometric_mean = 0.5  # Or assign some default value if all probabilities are None
-    return geometric_mean
 def create_html_tables_for_tags(subtable_heading, word_similarity_tuples, tag2count, tag2idwiki):
     # Wrap the tag part in a <span> with styles for bold and larger font
@@ -511,7 +438,7 @@ def create_html_placeholder(title="", content="", placeholder_height=400, placeh
     return html_placeholder
-def find_similar_tags(test_tags, similarity_weight, allow_nsfw_tags):
     #Initialize stuff
     if not hasattr(find_similar_tags, "fasttext_small_model"):
         find_similar_tags.fasttext_small_model = compress_fasttext.models.CompressedFastTextKeyedVectors.load('e621FastTextModel010Replacement_small.bin')
@@ -584,10 +511,8 @@ def find_similar_tags(test_tags, similarity_weight, allow_nsfw_tags):
         #Adjust score based on context
         for i in range(len(result)):
             word, score = result[i]  # Unpack the tuple
-            geometric_mean = geometric_mean_given_words(word.replace(' ','_'), [context_tag for context_tag in transformed_tags if context_tag != word and context_tag != modified_tag], conditional_co_occurrence_matrix, conditional_vocabulary, conditional_doc_count, smoothing_value=conditional_smoothing)
-            adjusted_score = (similarity_weight * geometric_mean) + ((1-similarity_weight)*score)  # Apply the adjustment function
-            result[i] = (word, adjusted_score)  # Update the tuple with the adjusted score
-            #print(word, score, geometric_mean, adjusted_score)
         result = sorted(result, key=lambda x: x[1], reverse=True)[:10]
         html_content += create_html_tables_for_tags(modified_tag, result, find_similar_tags.tag2count, find_similar_tags.tag2idwiki)
@@ -650,32 +575,30 @@ def augment_bad_entities_with_regex(text):
     return bad_entities
-def find_similar_artists(original_tags_string, top_n, similarity_weight, allow_nsfw_tags):
     try:
         new_tags_string = original_tags_string.lower()
         new_tags_string, removed_tags = remove_special_tags(new_tags_string)
         # Parse the prompt
         parsed = parser.parse(new_tags_string)
         # Extract tags from the parsed tree
         new_image_tags = extract_tags(parsed)
         tag_data = build_tag_offsets_dicts(new_image_tags)
-        ###unseen_tags = list(set(OrderedDict.fromkeys(new_image_tags)) - set(vectorizer.vocabulary_.keys()))   #We may want this line again later.  These are the tags that were not used to calculate the artists list.
-        unseen_tags_data, bad_entities = find_similar_tags(tag_data, similarity_weight, allow_nsfw_tags)
         #Bad tags stuff
         bad_entities.extend(augment_bad_entities_with_regex(new_tags_string))
         bad_entities.sort(key=lambda x: x['start'])
         bad_tags_illustrated_string = {"text":new_tags_string, "entities":bad_entities}
-        #Suggested tags stuff
-        suggested_tags_html_content = "<div class=\"scrollable-content\" style='display: inline-block; margin: 20px; text-align: center;'>"
-        suggested_tags_html_content += "<h1>Suggested Tags</h1>"  # Heading for the table
-        suggested_tags = get_tfidf_reduced_similar_tags([item["tf_idf_matrix_tag"] for item in tag_data] + removed_tags, allow_nsfw_tags)
         # Create a set of tags that should be filtered out
         filter_tags = {entry["original_tag"].strip() for entry in tag_data}
@@ -690,13 +613,6 @@ def find_similar_artists(original_tags_string, top_n, similarity_weight, allow_n
         suggested_tags_html_content += create_html_tables_for_tags("Suggested Tag", topnsuggestions, find_similar_tags.tag2count, find_similar_tags.tag2idwiki)
         #Artist stuff
-        #artist_matrix_tags = [tag_info['artist_matrix_tag'] for tag_info in tag_data if tag_info['node_type'] == "tag"]
-        #X_new_image = vectorizer.transform([','.join(artist_matrix_tags + removed_tags)])
-        #similarities = cosine_similarity(X_new_image, X_artist)[0]
-        #
-        #top_artist_indices = np.argsort(similarities)[-(top_n + 1):][::-1]
-        #top_artists = [(artist_names[i], similarities[i]) for i in top_artist_indices if artist_names[i].lower() != "by conditional dnp"][:top_n]
         excluded_artists = ["by conditional dnp", "by unknown artist"]
         top_artists = [(key, value) for key, value in  suggested_artist_tags_filtered.items() if key.lower() not in excluded_artists][:top_n]
         top_artists_str = create_top_artists_table(top_artists)
@@ -737,7 +653,7 @@ with gr.Blocks(css=css) as app:
         with gr.Column(scale=3):
             with gr.Group():
                 with gr.Row():
-                    similarity_weight = gr.Slider(minimum=0, maximum=1, value=0.5, step=0.1, label="Similarity weight")
                     allow_nsfw = gr.Checkbox(label="Allow NSFW Tags", value=False)
                 with gr.Row():
                     with gr.Column(scale=2):
@@ -759,7 +675,7 @@ with gr.Blocks(css=css) as app:
     submit_button.click(
         find_similar_artists,
-        inputs=[image_tags, num_artists, similarity_weight, allow_nsfw],
         outputs=[unseen_tags, bad_tags_illustrated_string, suggested_tags, top_artists, dynamic_prompts] + galleries
     )

         model_components['row_to_tag'] = {idx: tag for tag, idx in model_components['tag_to_row_index'].items()}
     return model_components
 # Load all components at the start
 tf_idf_components = load_model_components('tf_idf_files_420.joblib')
 nsfw_tags = set()  # Initialize an empty set to store words meeting the threshold
 # Open and read the CSV file
     return tag_data
 def create_html_tables_for_tags(subtable_heading, word_similarity_tuples, tag2count, tag2idwiki):
     # Wrap the tag part in a <span> with styles for bold and larger font
     return html_placeholder
+def find_similar_tags(test_tags, tag_to_context_similarity, context_similarity_weight, allow_nsfw_tags):
     #Initialize stuff
     if not hasattr(find_similar_tags, "fasttext_small_model"):
         find_similar_tags.fasttext_small_model = compress_fasttext.models.CompressedFastTextKeyedVectors.load('e621FastTextModel010Replacement_small.bin')
         #Adjust score based on context
         for i in range(len(result)):
             word, score = result[i]  # Unpack the tuple
+            context_score = tag_to_context_similarity.get(word,0)
+            result[i] = (word, .5 * ((context_similarity_weight * context_score) + ((1 - context_similarity_weight) * score)))
         result = sorted(result, key=lambda x: x[1], reverse=True)[:10]
         html_content += create_html_tables_for_tags(modified_tag, result, find_similar_tags.tag2count, find_similar_tags.tag2idwiki)
     return bad_entities
+def find_similar_artists(original_tags_string, top_n, context_similarity_weight, allow_nsfw_tags):
     try:
         new_tags_string = original_tags_string.lower()
         new_tags_string, removed_tags = remove_special_tags(new_tags_string)
         # Parse the prompt
         parsed = parser.parse(new_tags_string)
         # Extract tags from the parsed tree
         new_image_tags = extract_tags(parsed)
         tag_data = build_tag_offsets_dicts(new_image_tags)
+        #Suggested tags stuff
+        suggested_tags_html_content = "<div class=\"scrollable-content\" style='display: inline-block; margin: 20px; text-align: center;'>"
+        suggested_tags_html_content += "<h1>Suggested Tags</h1>"  # Heading for the table
+        suggested_tags = get_tfidf_reduced_similar_tags([item["tf_idf_matrix_tag"] for item in tag_data] + removed_tags, allow_nsfw_tags)
+        unseen_tags_data, bad_entities = find_similar_tags(tag_data, suggested_tags, context_similarity_weight, allow_nsfw_tags)
         #Bad tags stuff
         bad_entities.extend(augment_bad_entities_with_regex(new_tags_string))
         bad_entities.sort(key=lambda x: x['start'])
         bad_tags_illustrated_string = {"text":new_tags_string, "entities":bad_entities}
         # Create a set of tags that should be filtered out
         filter_tags = {entry["original_tag"].strip() for entry in tag_data}
         suggested_tags_html_content += create_html_tables_for_tags("Suggested Tag", topnsuggestions, find_similar_tags.tag2count, find_similar_tags.tag2idwiki)
         #Artist stuff
         excluded_artists = ["by conditional dnp", "by unknown artist"]
         top_artists = [(key, value) for key, value in  suggested_artist_tags_filtered.items() if key.lower() not in excluded_artists][:top_n]
         top_artists_str = create_top_artists_table(top_artists)
         with gr.Column(scale=3):
             with gr.Group():
                 with gr.Row():
+                    context_similarity_weight = gr.Slider(minimum=0, maximum=1, value=0.5, step=0.1, label="Context Similarity Weight")
                     allow_nsfw = gr.Checkbox(label="Allow NSFW Tags", value=False)
                 with gr.Row():
                     with gr.Column(scale=2):
     submit_button.click(
         find_similar_artists,
+        inputs=[image_tags, num_artists, context_similarity_weight, allow_nsfw],
         outputs=[unseen_tags, bad_tags_illustrated_string, suggested_tags, top_artists, dynamic_prompts] + galleries
     )