#!/usr/bin/env python3
"""
Semantic Explorer - Hugging Face Spaces Version
Multi-model support with lazy loading for HF Spaces deployment
"""
import gradio as gr
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
import numpy as np
from pathlib import Path

# Project paths
PROJECT_ROOT = Path(__file__).parent
CHROMA_DIR = PROJECT_ROOT / "chromadb"

# Model registry with dimensions
AVAILABLE_MODELS = {
    "all-MiniLM-L6-v2": {
        "name": "all-MiniLM-L6-v2",
        "display": "all-MiniLM-L6-v2: 384 dimensions",
        "hf_id": "sentence-transformers/all-MiniLM-L6-v2",
        "dimensions": 384,
        "trust_remote_code": False,
    },
    "bge-large-en-v1.5": {
        "name": "bge-large-en-v1.5",
        "display": "BGE Large EN v1.5: 1024 dimensions",
        "hf_id": "BAAI/bge-large-en-v1.5",
        "dimensions": 1024,
        "trust_remote_code": False,
    },
}

# Process-level caches so each model/collection pair is loaded at most once.
loaded_models = {}
loaded_collections = {}


def get_collection_name(model_key):
    """Generate the ChromaDB collection name for a model key."""
    return f"words_{model_key.replace('-', '_').replace('.', '_')}"


def load_model_and_collection(model_key):
    """Lazily load the embedding model and its ChromaDB collection, caching both.

    Args:
        model_key: A key of AVAILABLE_MODELS.

    Returns:
        (model, collection) on success; (None, None) when the collection
        cannot be opened (e.g. the database was not uploaded to the Space).
    """
    # Serve from cache when both pieces are already loaded.
    if model_key in loaded_models and model_key in loaded_collections:
        print(f"Using cached model: {AVAILABLE_MODELS[model_key]['display']}")
        return loaded_models[model_key], loaded_collections[model_key]

    model_info = AVAILABLE_MODELS[model_key]
    print(f"Loading model: {model_info['display']}")

    # Load embedding model directly from the Hugging Face Hub.
    trust_remote_code = model_info.get("trust_remote_code", False)
    print(f"Loading from Hugging Face: {model_info['hf_id']}")
    model = SentenceTransformer(
        model_info["hf_id"],
        trust_remote_code=trust_remote_code,
    )

    # Open the matching persisted ChromaDB collection.
    collection_name = get_collection_name(model_key)
    client = chromadb.PersistentClient(
        path=str(CHROMA_DIR),
        settings=Settings(anonymized_telemetry=False),
    )
    try:
        collection = client.get_collection(collection_name)
        count = collection.count()
        print(f"Loaded collection '{collection_name}' with {count} words")
    except Exception as e:
        error_msg = (
            f"Could not load collection '{collection_name}'.\n"
            "Please ensure the ChromaDB database is uploaded correctly."
        )
        print(f"Error: {e}")
        print(error_msg)
        return None, None

    # Cache for subsequent calls.
    loaded_models[model_key] = model
    loaded_collections[model_key] = collection
    return model, collection


def compute_similarity(vec1, vec2):
    """Compute cosine similarity between two vectors.

    Returns 0.0 for a zero-length vector instead of dividing by zero.
    """
    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denom == 0:
        return 0.0
    return np.dot(vec1, vec2) / denom


def _similarities_from_results(results, ref_embedding, reference_word, seen=None):
    """Extract (word, cosine-similarity-to-reference) pairs from a query result.

    Skips the reference word itself (case-insensitive) and, when `seen` is a
    set, any word already in it (new words are added to `seen`). Returns []
    when the result carries no documents or when embeddings are missing or
    mismatched, so the caller can fall back to distance-based scoring.
    """
    if not results or results.get('documents') is None or len(results['documents']) == 0:
        return []
    docs = results['documents'][0]
    # Avoid bare truthiness: chromadb may return embeddings as numpy arrays,
    # whose truth value is ambiguous.
    embs_list = results.get('embeddings')
    embs = embs_list[0] if embs_list is not None and len(embs_list) > 0 else []
    if len(embs) == 0 or len(embs) != len(docs):
        return []

    ref_lower = reference_word.lower()
    pairs = []
    for word, emb in zip(docs, embs):
        if word.lower() == ref_lower:
            continue
        if seen is not None:
            if word in seen:
                continue
            seen.add(word)
        similarity = compute_similarity(ref_embedding, np.array(emb))
        pairs.append((word, similarity))
    return pairs


def find_most_and_least_similar(model_key, reference_word, n=20):
    """
    Find both the n most similar AND n least similar words to the reference word

    Queries a sample of words to find the range of similarities.

    Returns:
        String with both most and least similar formatted
    """
    model, collection = load_model_and_collection(model_key)
    if model is None or collection is None:
        return "Error: Model or collection not loaded. Please ensure ChromaDB database is uploaded correctly."
    if not reference_word.strip():
        return "Please enter a reference word."

    try:
        # Get embedding for reference word
        ref_embedding = model.encode([reference_word], convert_to_numpy=True)[0]

        # Most similar: let Chroma rank candidates by the reference text, then
        # re-score with exact cosine similarity. "distances" is requested
        # explicitly because with an include list Chroma returns ONLY the
        # listed fields — without it the fallback below would always see an
        # empty distance list.
        most_results = collection.query(
            query_texts=[reference_word],
            n_results=n + 10,
            include=["documents", "embeddings", "distances"],
        )

        most_similar = _similarities_from_results(
            most_results, ref_embedding, reference_word
        )[:n]

        if not most_similar:
            # Fallback: derive approximate similarities from Chroma distances
            # when embeddings were unavailable.
            docs = (
                most_results['documents'][0]
                if most_results and most_results.get('documents') is not None
                and len(most_results['documents']) > 0
                else []
            )
            distances_list = most_results.get('distances') if most_results else None
            distances = (
                distances_list[0]
                if distances_list is not None and len(distances_list) > 0
                else []
            )
            ref_lower = reference_word.lower()
            for i, word in enumerate(docs):
                if word.lower() == ref_lower:
                    continue
                if i < len(distances):
                    # Clamp at 0: distance metrics other than cosine can
                    # exceed 1, which would yield a negative "similarity".
                    similarity = max(0.0, 1.0 - distances[i])
                else:
                    similarity = 0.0
                most_similar.append((word, similarity))
                if len(most_similar) >= n:
                    break

        if len(most_similar) == 0:
            return f"Error: Query returned no results. Collection count: {collection.count()}"

        # Least similar: query with semantically "opposite" probe words and
        # keep the lowest-scoring candidates found.
        opposite_words = ["nothing", "abstract", "void", "nonexistent", "opposite"]
        least_similar = []
        seen = set()  # O(1) duplicate check instead of rescanning the list

        for opp_word in opposite_words:
            try:
                opp_results = collection.query(
                    query_texts=[opp_word],
                    n_results=n + 5,
                    include=["documents", "embeddings"],
                )
            except Exception as e:
                print(f"Error querying with '{opp_word}': {e}")
                continue
            least_similar.extend(
                _similarities_from_results(
                    opp_results, ref_embedding, reference_word, seen=seen
                )
            )
            if len(least_similar) >= n * 2:  # enough candidates to sort
                break

        # If we still don't have enough, probe with a negated-concept embedding.
        if len(least_similar) < n:
            neg_word_embedding = model.encode(
                ["not " + reference_word], convert_to_numpy=True
            )[0]
            try:
                neg_results = collection.query(
                    query_embeddings=[neg_word_embedding.tolist()],
                    n_results=n + 10,
                    include=["documents", "embeddings"],
                )
                least_similar.extend(
                    _similarities_from_results(
                        neg_results, ref_embedding, reference_word, seen=seen
                    )
                )
            except Exception as e:
                print(f"Error with negated query: {e}")

        # Sort ascending (lowest similarity first) and keep the n least similar.
        least_similar.sort(key=lambda x: x[1])
        least_similar = least_similar[:n]

        # Format output
        model_display = AVAILABLE_MODELS[model_key]['display']
        output = [
            f"**Using: {model_display}**\n",
            "**MOST SIMILAR:**",
        ]
        output.extend([f"{word}: {sim:.4f}" for word, sim in most_similar])
        output.append("\n**LEAST SIMILAR:**")
        output.extend([f"{word}: {sim:.4f}" for word, sim in least_similar])
        return "\n".join(output)

    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Error in find_most_and_least_similar: {error_details}")
        print(f"Collection count: {collection.count() if collection else 'N/A'}")
        return f"Error: {str(e)}\n\nPlease check the logs for details."


def compare_words(model_key, reference_word, comparison_words):
    """
    Compare reference word to a list of comparison words

    Args:
        model_key: A key of AVAILABLE_MODELS.
        reference_word: The anchor word.
        comparison_words: Newline-separated words (max 10).

    Returns:
        Formatted string of cosine similarities, sorted descending.
    """
    model, collection = load_model_and_collection(model_key)
    if model is None or collection is None:
        return "Error: Model or collection not loaded. Please ensure ChromaDB database is uploaded correctly."
    if not reference_word.strip():
        return "Please enter a reference word."
    if not comparison_words.strip():
        return "Please enter at least one comparison word."

    # Parse comparison words (one per line, max 10)
    comp_list = [w.strip() for w in comparison_words.split('\n') if w.strip()]
    if len(comp_list) > 10:
        return "Maximum 10 comparison words allowed. Please reduce your list."
    if not comp_list:
        return "Please enter at least one comparison word."

    # One batched encode for the reference plus all comparison words.
    embeddings = model.encode([reference_word] + comp_list, convert_to_numpy=True)
    ref_embedding = embeddings[0]
    comp_embeddings = embeddings[1:]

    # Compute similarities and sort descending.
    results = [
        (word, compute_similarity(ref_embedding, embedding))
        for word, embedding in zip(comp_list, comp_embeddings)
    ]
    results.sort(key=lambda x: -x[1])

    # Format output
    model_display = AVAILABLE_MODELS[model_key]['display']
    output = [f"**Using: {model_display}**\n"]
    output.extend([f"{word}: {sim:.4f}" for word, sim in results])
    return "\n".join(output)


def create_interface():
    """Create the Gradio interface with model dropdown and tabs."""
    # Get model choices for the selector; keys are positionally aligned with
    # the display strings so the event lambdas can map display -> key.
    model_choices = [info["display"] for info in AVAILABLE_MODELS.values()]
    model_keys = list(AVAILABLE_MODELS.keys())

    with gr.Blocks(title="Semantic Explorer") as app:
        gr.Markdown("# 🔍 Semantic Explorer")
        gr.Markdown("Explore semantic similarity between words using embedding vectors")

        # Model selector at the top
        model_selector = gr.Radio(
            choices=model_choices,
            value=model_choices[0],
            label="Select Embedding Model",
            info="Choose which embedding model to use for similarity calculations",
        )

        with gr.Tabs():
            # Tab 1: Comparison Tool (FIRST)
            with gr.Tab("Comparison Tool"):
                gr.Markdown("### Compare a reference word to specific comparison words")
                gr.Markdown("*Enter up to 10 comparison words, one per line.*")
                with gr.Row():
                    with gr.Column():
                        ct_reference = gr.Textbox(
                            label="Reference Word",
                            placeholder="Enter a word...",
                            lines=1,
                        )
                        ct_comparisons = gr.Textbox(
                            label="Comparison Words (one per line, max 10)",
                            placeholder="word1\nword2\nword3\n...",
                            lines=10,
                        )
                        ct_button = gr.Button("Compare Words", variant="primary")
                    with gr.Column():
                        ct_output = gr.Textbox(
                            label="Results",
                            lines=15,
                            placeholder="Results will appear here...",
                        )

                ct_button.click(
                    fn=lambda selector, ref, comp: compare_words(
                        model_keys[model_choices.index(selector)], ref, comp
                    ),
                    inputs=[model_selector, ct_reference, ct_comparisons],
                    outputs=ct_output,
                )

            # Tab 2: Most & Least Similar (COMBINED)
            with gr.Tab("Most & Least Similar"):
                gr.Markdown("### Find both the most AND least semantically similar words")
                gr.Markdown("*Shows 20 results for each category*")
                with gr.Row():
                    with gr.Column():
                        ml_reference = gr.Textbox(
                            label="Reference Word",
                            placeholder="Enter a word...",
                            lines=1,
                        )
                        ml_button = gr.Button(
                            "Find Similar & Dissimilar Words", variant="primary"
                        )
                    with gr.Column():
                        ml_output = gr.Textbox(
                            label="Results",
                            lines=25,
                            placeholder="Results will appear here...",
                        )

                ml_button.click(
                    fn=lambda selector, ref: find_most_and_least_similar(
                        model_keys[model_choices.index(selector)], ref, 20
                    ),
                    inputs=[model_selector, ml_reference],
                    outputs=ml_output,
                )

        gr.Markdown("---")
        gr.Markdown("*Select different embedding models to compare their semantic representations*")

    return app


# Create the interface at import time (required by Hugging Face Spaces).
demo = create_interface()

# Launch when run directly
if __name__ == "__main__":
    print("=" * 70)
    print("SEMANTIC EXPLORER - HUGGING FACE SPACES")
    print("=" * 70)
    print("\nAvailable models:")
    for key, info in AVAILABLE_MODELS.items():
        print(f"  - {info['display']}")
    print("\nNote: Models and collections will be loaded on-demand.")
    print("=" * 70)
    demo.launch()