# Fix (commit 8d18a79): properly check numpy array length instead of
# truthiness to avoid the "ambiguous truth value" error.
#!/usr/bin/env python3
"""
Semantic Explorer - Hugging Face Spaces Version
Multi-model support with lazy loading for HF Spaces deployment
"""
import gradio as gr
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
import numpy as np
from pathlib import Path
# Project paths
PROJECT_ROOT = Path(__file__).parent
CHROMA_DIR = PROJECT_ROOT / "chromadb"  # persistent ChromaDB storage directory
# Model registry with dimensions
# Maps a short model key to the metadata needed to load it from the
# Hugging Face Hub and to render it in the UI ("display" string).
AVAILABLE_MODELS = {
    "all-MiniLM-L6-v2": {
        "name": "all-MiniLM-L6-v2",
        "display": "all-MiniLM-L6-v2: 384 dimensions",
        "hf_id": "sentence-transformers/all-MiniLM-L6-v2",
        "dimensions": 384,
        "trust_remote_code": False
    },
    "bge-large-en-v1.5": {
        "name": "bge-large-en-v1.5",
        "display": "BGE Large EN v1.5: 1024 dimensions",
        "hf_id": "BAAI/bge-large-en-v1.5",
        "dimensions": 1024,
        "trust_remote_code": False
    }
}
# Global variables for caching models and collections
# Populated lazily by load_model_and_collection, keyed by model key.
loaded_models = {}
loaded_collections = {}
def get_collection_name(model_key):
    """Derive the ChromaDB collection name for a given model key.

    Hyphens and dots are mapped to underscores and the result is
    prefixed with ``words_``.
    """
    sanitized = model_key.translate(str.maketrans("-.", "__"))
    return f"words_{sanitized}"
def load_model_and_collection(model_key):
    """Lazy-load the embedding model and its ChromaDB collection, with caching.

    Args:
        model_key: Key into AVAILABLE_MODELS selecting which model to use.

    Returns:
        A ``(model, collection)`` tuple, or ``(None, None)`` if the
        ChromaDB collection could not be opened.
    """
    global loaded_models, loaded_collections
    # Return cached if already loaded (both model and collection must be cached)
    if model_key in loaded_models and model_key in loaded_collections:
        print(f"Using cached model: {AVAILABLE_MODELS[model_key]['display']}")
        return loaded_models[model_key], loaded_collections[model_key]
    model_info = AVAILABLE_MODELS[model_key]
    print(f"Loading model: {model_info['display']}")
    # Load embedding model directly from Hugging Face Hub
    trust_remote_code = model_info.get("trust_remote_code", False)
    print(f"Loading from Hugging Face: {model_info['hf_id']}")
    model = SentenceTransformer(
        model_info["hf_id"],
        trust_remote_code=trust_remote_code
    )
    # Load ChromaDB collection from the local persistent store
    collection_name = get_collection_name(model_key)
    client = chromadb.PersistentClient(
        path=str(CHROMA_DIR),
        settings=Settings(anonymized_telemetry=False)
    )
    try:
        collection = client.get_collection(collection_name)
        count = collection.count()
        print(f"Loaded collection '{collection_name}' with {count} words")
    except Exception as e:
        # Missing/corrupt database: report and signal failure to the caller
        # rather than raising into the Gradio handler.
        error_msg = f"Could not load collection '{collection_name}'.\nPlease ensure the ChromaDB database is uploaded correctly."
        print(f"Error: {e}")
        print(error_msg)
        return None, None
    # Cache them only after both loads succeeded
    loaded_models[model_key] = model
    loaded_collections[model_key] = collection
    return model, collection
def compute_similarity(vec1, vec2):
    """Compute the cosine similarity between two 1-D vectors.

    Args:
        vec1: First vector (numpy array or array-like).
        vec2: Second vector of the same length.

    Returns:
        Cosine similarity in [-1, 1]; 0.0 when either vector has zero
        magnitude (the original expression divided by zero and produced
        NaN, which would silently corrupt the similarity rankings).
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0.0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product
def find_most_and_least_similar(model_key, reference_word, n=20):
    """
    Find both the n most similar AND n least similar words to the reference word.

    Queries a sample of words to find the range of similarities.
    NOTE(review): "least similar" is a heuristic — it only samples words near
    a few hand-picked "opposite" probe words plus a negated-phrase query, so
    it is not a true global minimum over the collection.

    Args:
        model_key: Key into AVAILABLE_MODELS selecting the embedding model.
        reference_word: Word to compare against the collection.
        n: Number of results to show in each category (default 20).

    Returns:
        Markdown-formatted string with both most and least similar words,
        or a human-readable error string on failure.
    """
    model, collection = load_model_and_collection(model_key)
    if model is None or collection is None:
        return "Error: Model or collection not loaded. Please ensure ChromaDB database is uploaded correctly."
    if not reference_word.strip():
        return "Please enter a reference word."
    try:
        # Get embedding for reference word
        ref_embedding = model.encode([reference_word], convert_to_numpy=True)[0]
        # Strategy: Use query() for most similar (should work)
        # For least similar, we'll query with negated embedding and compute actual similarities
        # Query for most similar words using query_texts (more reliable than query_embeddings)
        # This should work since Comparison Tool uses similar approach
        most_results = collection.query(
            query_texts=[reference_word],
            n_results=n + 10,  # over-fetch: the reference word itself is filtered out below
            include=["documents", "embeddings"]
        )
        # Process most similar results
        most_similar = []
        # len() checks (not truthiness) because embeddings may be numpy arrays,
        # whose boolean value is ambiguous.
        if most_results and most_results.get('documents') and len(most_results['documents']) > 0:
            docs = most_results['documents'][0]
            embs_list = most_results.get('embeddings', [[]])
            embs = embs_list[0] if embs_list and len(embs_list) > 0 else []
            # Compute actual cosine similarities when embeddings were returned
            if len(embs) > 0 and len(embs) == len(docs):
                for word, emb in zip(docs, embs):
                    # Skip the reference word itself (case-insensitive)
                    if word.lower() != reference_word.lower():
                        word_embedding = np.array(emb)
                        similarity = compute_similarity(ref_embedding, word_embedding)
                        most_similar.append((word, similarity))
                        if len(most_similar) >= n:
                            break
            else:
                # Fallback: use distances (approximate similarity as 1 - distance,
                # clamped at 0; valid for cosine distance — TODO confirm the
                # collection's distance metric)
                distances_list = most_results.get('distances', [[]])
                distances = distances_list[0] if distances_list and len(distances_list) > 0 else []
                for i, word in enumerate(docs):
                    if word.lower() != reference_word.lower():
                        if i < len(distances):
                            distance = distances[i]
                            similarity = max(0.0, 1.0 - distance)
                        else:
                            similarity = 0.0
                        most_similar.append((word, similarity))
                        if len(most_similar) >= n:
                            break
        if len(most_similar) == 0:
            return f"Error: Query returned no results. Collection count: {collection.count()}"
        # For least similar, query with a semantically opposite word
        # Use words that are likely to be dissimilar (abstract concepts, opposites, etc.)
        opposite_words = ["nothing", "abstract", "void", "nonexistent", "opposite"]
        least_similar = []
        # Query with opposite words to find dissimilar words
        for opp_word in opposite_words:
            try:
                opp_results = collection.query(
                    query_texts=[opp_word],
                    n_results=n + 5,
                    include=["documents", "embeddings"]
                )
                if opp_results and opp_results.get('documents') and len(opp_results['documents']) > 0:
                    docs = opp_results['documents'][0]
                    embs_list = opp_results.get('embeddings', [[]])
                    embs = embs_list[0] if embs_list and len(embs_list) > 0 else []
                    if len(embs) > 0 and len(embs) == len(docs):
                        for word, emb in zip(docs, embs):
                            # De-duplicate against words already collected
                            if word.lower() != reference_word.lower() and word not in [w for w, _ in least_similar]:
                                word_embedding = np.array(emb)
                                similarity = compute_similarity(ref_embedding, word_embedding)
                                least_similar.append((word, similarity))
                                if len(least_similar) >= n * 2:  # Get extra to sort
                                    break
                if len(least_similar) >= n * 2:
                    break
            except Exception as e:
                # Best-effort: a failed probe word just moves on to the next one
                print(f"Error querying with '{opp_word}': {e}")
                continue
        # If we still don't have enough, try querying with negated embedding using query_texts
        if len(least_similar) < n:
            # Encode a negated concept ("not <word>") as an additional probe
            neg_word_embedding = model.encode(["not " + reference_word], convert_to_numpy=True)[0]
            try:
                neg_results = collection.query(
                    query_embeddings=[neg_word_embedding.tolist()],
                    n_results=n + 10,
                    include=["documents", "embeddings"]
                )
                if neg_results and neg_results.get('documents') and len(neg_results['documents']) > 0:
                    docs = neg_results['documents'][0]
                    embs_list = neg_results.get('embeddings', [[]])
                    embs = embs_list[0] if embs_list and len(embs_list) > 0 else []
                    if len(embs) > 0 and len(embs) == len(docs):
                        for word, emb in zip(docs, embs):
                            if word.lower() != reference_word.lower() and word not in [w for w, _ in least_similar]:
                                word_embedding = np.array(emb)
                                similarity = compute_similarity(ref_embedding, word_embedding)
                                least_similar.append((word, similarity))
                                if len(least_similar) >= n * 2:
                                    break
            except Exception as e:
                print(f"Error with negated query: {e}")
        # Sort least similar (ascending - lowest similarity first)
        least_similar.sort(key=lambda x: x[1])
        least_similar = least_similar[:n]  # Take top n least similar
        # Format output as markdown with both sections
        model_display = AVAILABLE_MODELS[model_key]['display']
        output = [
            f"**Using: {model_display}**\n",
            "**MOST SIMILAR:**"
        ]
        output.extend([f"{word}: {sim:.4f}" for word, sim in most_similar])
        output.append("\n**LEAST SIMILAR:**")
        output.extend([f"{word}: {sim:.4f}" for word, sim in least_similar])
        return "\n".join(output)
    except Exception as e:
        # Catch-all boundary for the Gradio handler: log full traceback,
        # return a short error string to the UI.
        import traceback
        error_details = traceback.format_exc()
        print(f"Error in find_most_and_least_similar: {error_details}")
        print(f"Collection count: {collection.count() if collection else 'N/A'}")
        return f"Error: {str(e)}\n\nPlease check the logs for details."
def compare_words(model_key, reference_word, comparison_words):
    """Compare a reference word against a user-supplied list of words.

    Encodes the reference word and every comparison word with the
    selected model in one batch, computes cosine similarities, and
    returns a markdown-formatted ranking (highest similarity first).
    """
    model, collection = load_model_and_collection(model_key)
    if model is None or collection is None:
        return "Error: Model or collection not loaded. Please ensure ChromaDB database is uploaded correctly."
    # Guard clauses for empty inputs
    if not reference_word.strip():
        return "Please enter a reference word."
    if not comparison_words.strip():
        return "Please enter at least one comparison word."
    # One word per line; blank lines are ignored; at most 10 allowed.
    candidates = [line.strip() for line in comparison_words.split('\n') if line.strip()]
    if len(candidates) > 10:
        return "Maximum 10 comparison words allowed. Please reduce your list."
    if not candidates:
        return "Please enter at least one comparison word."
    # Encode everything in a single batch; index 0 holds the reference word.
    vectors = model.encode([reference_word] + candidates, convert_to_numpy=True)
    reference_vec = vectors[0]
    # Score each candidate and sort by similarity, descending.
    scored = sorted(
        ((word, compute_similarity(reference_vec, vec))
         for word, vec in zip(candidates, vectors[1:])),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Assemble the markdown result
    header = f"**Using: {AVAILABLE_MODELS[model_key]['display']}**\n"
    lines = [header] + [f"{word}: {sim:.4f}" for word, sim in scored]
    return "\n".join(lines)
def create_interface():
    """Create the Gradio interface: a model selector plus two tool tabs."""
    # Parallel lists: choices are the human-readable display strings shown in
    # the Radio widget, keys are the AVAILABLE_MODELS keys at the same index.
    model_choices = [info["display"] for info in AVAILABLE_MODELS.values()]
    model_keys = list(AVAILABLE_MODELS.keys())
    with gr.Blocks(title="Semantic Explorer") as app:
        gr.Markdown("# 🔍 Semantic Explorer")
        gr.Markdown("Explore semantic similarity between words using embedding vectors")
        # Model selector at the top, shared by every tab
        model_selector = gr.Radio(
            choices=model_choices,
            value=model_choices[0],
            label="Select Embedding Model",
            info="Choose which embedding model to use for similarity calculations"
        )
        with gr.Tabs():
            # Tab 1: Comparison Tool (FIRST)
            with gr.Tab("Comparison Tool"):
                gr.Markdown("### Compare a reference word to specific comparison words")
                gr.Markdown("*Enter up to 10 comparison words, one per line.*")
                with gr.Row():
                    with gr.Column():
                        ct_reference = gr.Textbox(
                            label="Reference Word",
                            placeholder="Enter a word...",
                            lines=1
                        )
                        ct_comparisons = gr.Textbox(
                            label="Comparison Words (one per line, max 10)",
                            placeholder="word1\nword2\nword3\n...",
                            lines=10
                        )
                        ct_button = gr.Button("Compare Words", variant="primary")
                    with gr.Column():
                        ct_output = gr.Textbox(
                            label="Results",
                            lines=15,
                            placeholder="Results will appear here..."
                        )
                # The lambda maps the selected display string back to its
                # model key before delegating to compare_words.
                ct_button.click(
                    fn=lambda selector, ref, comp: compare_words(
                        model_keys[model_choices.index(selector)], ref, comp
                    ),
                    inputs=[model_selector, ct_reference, ct_comparisons],
                    outputs=ct_output
                )
            # Tab 2: Most & Least Similar (COMBINED)
            with gr.Tab("Most & Least Similar"):
                gr.Markdown("### Find both the most AND least semantically similar words")
                gr.Markdown("*Shows 20 results for each category*")
                with gr.Row():
                    with gr.Column():
                        ml_reference = gr.Textbox(
                            label="Reference Word",
                            placeholder="Enter a word...",
                            lines=1
                        )
                        ml_button = gr.Button("Find Similar & Dissimilar Words", variant="primary")
                    with gr.Column():
                        ml_output = gr.Textbox(
                            label="Results",
                            lines=25,
                            placeholder="Results will appear here..."
                        )
                # Same display-string -> key mapping; n fixed at 20 results
                ml_button.click(
                    fn=lambda selector, ref: find_most_and_least_similar(
                        model_keys[model_choices.index(selector)], ref, 20
                    ),
                    inputs=[model_selector, ml_reference],
                    outputs=ml_output
                )
        gr.Markdown("---")
        gr.Markdown("*Select different embedding models to compare their semantic representations*")
    return app
# Create the interface at import time so the `demo` object is available
# to the Hugging Face Spaces runtime, which imports this module.
demo = create_interface()

# Launch when run directly (startup banner + blocking Gradio server)
if __name__ == "__main__":
    print("=" * 70)
    print("SEMANTIC EXPLORER - HUGGING FACE SPACES")
    print("=" * 70)
    print("\nAvailable models:")
    # Only the display string is needed; iterate values() directly instead
    # of unpacking an unused key from .items().
    for info in AVAILABLE_MODELS.values():
        print(f" - {info['display']}")
    print("\nNote: Models and collections will be loaded on-demand.")
    print("=" * 70)
    demo.launch()