Spaces:
Paused
Paused
| #!/usr/bin/env python3 | |
| """ | |
| Semantic Explorer - Hugging Face Spaces Version | |
| Multi-model support with lazy loading for HF Spaces deployment | |
| """ | |
| import gradio as gr | |
| import chromadb | |
| from chromadb.config import Settings | |
| from sentence_transformers import SentenceTransformer | |
| import numpy as np | |
| from pathlib import Path | |
# Filesystem layout: the ChromaDB database ships alongside this script.
PROJECT_ROOT = Path(__file__).parent
CHROMA_DIR = PROJECT_ROOT / "chromadb"

# Registry of embedding models offered in the UI, keyed by short name.
# Each entry records the Hugging Face model id, the embedding width, and
# whether loading the model requires executing remote code.
AVAILABLE_MODELS = {
    "all-MiniLM-L6-v2": {
        "name": "all-MiniLM-L6-v2",
        "display": "all-MiniLM-L6-v2: 384 dimensions",
        "hf_id": "sentence-transformers/all-MiniLM-L6-v2",
        "dimensions": 384,
        "trust_remote_code": False,
    },
    "bge-large-en-v1.5": {
        "name": "bge-large-en-v1.5",
        "display": "BGE Large EN v1.5: 1024 dimensions",
        "hf_id": "BAAI/bge-large-en-v1.5",
        "dimensions": 1024,
        "trust_remote_code": False,
    },
}

# Per-process caches so each model/collection pair is loaded at most once.
loaded_models = {}
loaded_collections = {}
def get_collection_name(model_key):
    """Build the ChromaDB collection name for a model key.

    Dashes and dots are normalized to underscores so the key becomes a
    valid collection identifier, e.g. 'bge-large-en-v1.5' ->
    'words_bge_large_en_v1_5'.
    """
    sanitized = model_key.replace("-", "_").replace(".", "_")
    return "words_" + sanitized
def load_model_and_collection(model_key):
    """Return the (model, collection) pair for *model_key*, loading lazily.

    The SentenceTransformer model and its ChromaDB collection are fetched
    on first use and memoised in the module-level caches, so subsequent
    calls are cheap. Returns (None, None) when the collection cannot be
    opened.
    """
    global loaded_models, loaded_collections

    # Fast path: both pieces already cached by an earlier call.
    if model_key in loaded_models and model_key in loaded_collections:
        print(f"Using cached model: {AVAILABLE_MODELS[model_key]['display']}")
        return loaded_models[model_key], loaded_collections[model_key]

    model_info = AVAILABLE_MODELS[model_key]
    print(f"Loading model: {model_info['display']}")

    # Pull the embedding model straight from the Hugging Face Hub.
    print(f"Loading from Hugging Face: {model_info['hf_id']}")
    model = SentenceTransformer(
        model_info["hf_id"],
        trust_remote_code=model_info.get("trust_remote_code", False),
    )

    # Open the persistent ChromaDB store and look up this model's collection.
    client = chromadb.PersistentClient(
        path=str(CHROMA_DIR),
        settings=Settings(anonymized_telemetry=False),
    )
    collection_name = get_collection_name(model_key)
    try:
        collection = client.get_collection(collection_name)
        count = collection.count()
        print(f"Loaded collection '{collection_name}' with {count} words")
    except Exception as e:
        error_msg = f"Could not load collection '{collection_name}'.\nPlease ensure the ChromaDB database is uploaded correctly."
        print(f"Error: {e}")
        print(error_msg)
        return None, None

    # Memoise for subsequent requests.
    loaded_models[model_key] = model
    loaded_collections[model_key] = collection
    return model, collection
def compute_similarity(vec1, vec2):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        vec1: First vector (any array-like accepted by NumPy).
        vec2: Second vector of the same length.

    Returns:
        float in [-1.0, 1.0]; 0.0 when either vector has zero magnitude
        (the original formula divided by zero there, yielding NaN and a
        NumPy runtime warning).
    """
    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denom == 0.0:
        return 0.0
    return float(np.dot(vec1, vec2) / denom)
def find_most_and_least_similar(model_key: str, reference_word: str, n: int = 20) -> str:
    """
    Find both the n most similar AND the n least similar words to a reference word.

    Most-similar words come from a straight nearest-neighbour query on the
    collection. Least-similar words are approximated: the collection is probed
    with a few hand-picked "opposite" concepts (and, if that yields too few
    candidates, with the embedding of "not <word>"), and all gathered candidates
    are ranked by their true cosine similarity to the reference. This samples
    the corpus rather than scanning it, so the "least similar" list is a
    heuristic, not an exact global minimum.

    Args:
        model_key: Key into AVAILABLE_MODELS selecting the embedding model.
        reference_word: The word to compare against.
        n: Number of results to show in each category (default 20).

    Returns:
        A markdown-formatted string with both lists, or an error message.
    """
    model, collection = load_model_and_collection(model_key)
    if model is None or collection is None:
        return "Error: Model or collection not loaded. Please ensure ChromaDB database is uploaded correctly."
    if not reference_word.strip():
        return "Please enter a reference word."
    try:
        # Embed the reference word once; reused for every similarity below.
        ref_embedding = model.encode([reference_word], convert_to_numpy=True)[0]
        # Query for most similar words using query_texts (more reliable here
        # than query_embeddings). Over-fetch by 10 so the reference word itself
        # can be filtered out and n results still remain.
        most_results = collection.query(
            query_texts=[reference_word],
            n_results=n + 10,
            include=["documents", "embeddings"]
        )
        # Process most-similar results.
        most_similar = []
        if most_results and most_results.get('documents') and len(most_results['documents']) > 0:
            docs = most_results['documents'][0]
            embs_list = most_results.get('embeddings', [[]])
            embs = embs_list[0] if embs_list and len(embs_list) > 0 else []
            # Preferred path: compute actual cosine similarities from the
            # returned embeddings (results keep Chroma's nearest-first order).
            if len(embs) > 0 and len(embs) == len(docs):
                for word, emb in zip(docs, embs):
                    # Skip the reference word itself (case-insensitive).
                    if word.lower() != reference_word.lower():
                        word_embedding = np.array(emb)
                        similarity = compute_similarity(ref_embedding, word_embedding)
                        most_similar.append((word, similarity))
                        if len(most_similar) >= n:
                            break
            else:
                # Fallback: embeddings missing/mismatched — approximate the
                # similarity from the reported distances instead.
                distances_list = most_results.get('distances', [[]])
                distances = distances_list[0] if distances_list and len(distances_list) > 0 else []
                for i, word in enumerate(docs):
                    if word.lower() != reference_word.lower():
                        if i < len(distances):
                            distance = distances[i]
                            # NOTE(review): 1 - distance is only a rough
                            # similarity proxy; exact mapping depends on the
                            # collection's distance metric — confirm.
                            similarity = max(0.0, 1.0 - distance)
                        else:
                            similarity = 0.0
                        most_similar.append((word, similarity))
                        if len(most_similar) >= n:
                            break
        if len(most_similar) == 0:
            return f"Error: Query returned no results. Collection count: {collection.count()}"
        # For least similar, probe with words likely to be semantically remote
        # (abstract concepts, negations); their neighbourhoods supply candidates.
        opposite_words = ["nothing", "abstract", "void", "nonexistent", "opposite"]
        least_similar = []
        # Query with each opposite word until ~2n unique candidates are gathered.
        for opp_word in opposite_words:
            try:
                opp_results = collection.query(
                    query_texts=[opp_word],
                    n_results=n + 5,
                    include=["documents", "embeddings"]
                )
                if opp_results and opp_results.get('documents') and len(opp_results['documents']) > 0:
                    docs = opp_results['documents'][0]
                    embs_list = opp_results.get('embeddings', [[]])
                    embs = embs_list[0] if embs_list and len(embs_list) > 0 else []
                    if len(embs) > 0 and len(embs) == len(docs):
                        for word, emb in zip(docs, embs):
                            # De-duplicate against candidates already collected.
                            if word.lower() != reference_word.lower() and word not in [w for w, _ in least_similar]:
                                word_embedding = np.array(emb)
                                similarity = compute_similarity(ref_embedding, word_embedding)
                                least_similar.append((word, similarity))
                                # Collect 2n candidates so sorting can pick the true lowest n.
                                if len(least_similar) >= n * 2:
                                    break
                if len(least_similar) >= n * 2:
                    break
            except Exception as e:
                # Best-effort probing: a failed opposite-word query is logged
                # and skipped, not fatal.
                print(f"Error querying with '{opp_word}': {e}")
                continue
        # If still short of n candidates, probe once more with the embedding
        # of the negated phrase "not <reference_word>".
        if len(least_similar) < n:
            neg_word_embedding = model.encode(["not " + reference_word], convert_to_numpy=True)[0]
            try:
                neg_results = collection.query(
                    query_embeddings=[neg_word_embedding.tolist()],
                    n_results=n + 10,
                    include=["documents", "embeddings"]
                )
                if neg_results and neg_results.get('documents') and len(neg_results['documents']) > 0:
                    docs = neg_results['documents'][0]
                    embs_list = neg_results.get('embeddings', [[]])
                    embs = embs_list[0] if embs_list and len(embs_list) > 0 else []
                    if len(embs) > 0 and len(embs) == len(docs):
                        for word, emb in zip(docs, embs):
                            if word.lower() != reference_word.lower() and word not in [w for w, _ in least_similar]:
                                word_embedding = np.array(emb)
                                similarity = compute_similarity(ref_embedding, word_embedding)
                                least_similar.append((word, similarity))
                                if len(least_similar) >= n * 2:
                                    break
            except Exception as e:
                print(f"Error with negated query: {e}")
        # Rank candidates ascending (lowest similarity first) and keep n.
        least_similar.sort(key=lambda x: x[1])
        least_similar = least_similar[:n]
        # Format the two lists as a single markdown string.
        model_display = AVAILABLE_MODELS[model_key]['display']
        output = [
            f"**Using: {model_display}**\n",
            "**MOST SIMILAR:**"
        ]
        output.extend([f"{word}: {sim:.4f}" for word, sim in most_similar])
        output.append("\n**LEAST SIMILAR:**")
        output.extend([f"{word}: {sim:.4f}" for word, sim in least_similar])
        return "\n".join(output)
    except Exception as e:
        # Catch-all boundary for the UI: log the full traceback, return a
        # short message to the user.
        import traceback
        error_details = traceback.format_exc()
        print(f"Error in find_most_and_least_similar: {error_details}")
        print(f"Collection count: {collection.count() if collection else 'N/A'}")
        return f"Error: {str(e)}\n\nPlease check the logs for details."
def compare_words(model_key, reference_word, comparison_words):
    """Rank up to 10 user-supplied words by cosine similarity to a reference.

    *comparison_words* is a newline-separated string, one word per line.
    Returns a markdown string listing each word with its similarity score,
    highest first, or a human-readable error message for invalid input.
    """
    model, collection = load_model_and_collection(model_key)
    if model is None or collection is None:
        return "Error: Model or collection not loaded. Please ensure ChromaDB database is uploaded correctly."
    if not reference_word.strip():
        return "Please enter a reference word."
    if not comparison_words.strip():
        return "Please enter at least one comparison word."

    # One candidate per non-blank line, capped at 10.
    candidates = [line.strip() for line in comparison_words.split('\n') if line.strip()]
    if len(candidates) > 10:
        return "Maximum 10 comparison words allowed. Please reduce your list."
    if not candidates:
        return "Please enter at least one comparison word."

    # Encode reference and candidates together in a single batch.
    vectors = model.encode([reference_word] + candidates, convert_to_numpy=True)
    ref_vec = vectors[0]
    candidate_vecs = vectors[1:]

    # Pair each candidate with its cosine similarity to the reference.
    scored = [
        (word, compute_similarity(ref_vec, vec))
        for word, vec in zip(candidates, candidate_vecs)
    ]
    # Best match first (stable sort keeps input order on ties).
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # Render results as markdown.
    header = f"**Using: {AVAILABLE_MODELS[model_key]['display']}**\n"
    lines = [header] + [f"{word}: {sim:.4f}" for word, sim in scored]
    return "\n".join(lines)
def create_interface():
    """Create the Gradio Blocks interface: a model radio selector plus two tabs.

    Tab 1 ("Comparison Tool") compares a reference word against a
    user-supplied word list; tab 2 ("Most & Least Similar") finds the most
    and least similar words in the ChromaDB corpus. Returns the
    (unlaunched) gr.Blocks app.
    """
    # Parallel lists: display strings shown in the radio and the registry
    # keys used for lookup — index i of one maps to index i of the other.
    model_choices = [info["display"] for info in AVAILABLE_MODELS.values()]
    model_keys = list(AVAILABLE_MODELS.keys())
    with gr.Blocks(title="Semantic Explorer") as app:
        gr.Markdown("# 🔍 Semantic Explorer")
        gr.Markdown("Explore semantic similarity between words using embedding vectors")
        # Model selector at the top, shared by both tabs.
        model_selector = gr.Radio(
            choices=model_choices,
            value=model_choices[0],
            label="Select Embedding Model",
            info="Choose which embedding model to use for similarity calculations"
        )
        with gr.Tabs():
            # Tab 1: Comparison Tool (shown first).
            with gr.Tab("Comparison Tool"):
                gr.Markdown("### Compare a reference word to specific comparison words")
                gr.Markdown("*Enter up to 10 comparison words, one per line.*")
                with gr.Row():
                    with gr.Column():
                        ct_reference = gr.Textbox(
                            label="Reference Word",
                            placeholder="Enter a word...",
                            lines=1
                        )
                        ct_comparisons = gr.Textbox(
                            label="Comparison Words (one per line, max 10)",
                            placeholder="word1\nword2\nword3\n...",
                            lines=10
                        )
                        ct_button = gr.Button("Compare Words", variant="primary")
                    with gr.Column():
                        ct_output = gr.Textbox(
                            label="Results",
                            lines=15,
                            placeholder="Results will appear here..."
                        )
                # Map the selected display string back to its registry key
                # before dispatching to compare_words.
                ct_button.click(
                    fn=lambda selector, ref, comp: compare_words(
                        model_keys[model_choices.index(selector)], ref, comp
                    ),
                    inputs=[model_selector, ct_reference, ct_comparisons],
                    outputs=ct_output
                )
            # Tab 2: Most & Least Similar (combined view).
            with gr.Tab("Most & Least Similar"):
                gr.Markdown("### Find both the most AND least semantically similar words")
                gr.Markdown("*Shows 20 results for each category*")
                with gr.Row():
                    with gr.Column():
                        ml_reference = gr.Textbox(
                            label="Reference Word",
                            placeholder="Enter a word...",
                            lines=1
                        )
                        ml_button = gr.Button("Find Similar & Dissimilar Words", variant="primary")
                    with gr.Column():
                        ml_output = gr.Textbox(
                            label="Results",
                            lines=25,
                            placeholder="Results will appear here..."
                        )
                # Same display-string -> key translation; n is fixed at 20.
                ml_button.click(
                    fn=lambda selector, ref: find_most_and_least_similar(
                        model_keys[model_choices.index(selector)], ref, 20
                    ),
                    inputs=[model_selector, ml_reference],
                    outputs=ml_output
                )
        gr.Markdown("---")
        gr.Markdown("*Select different embedding models to compare their semantic representations*")
    return app
# Build the interface at import time so Hugging Face Spaces (which imports
# this module rather than executing it as a script) can find `demo`.
demo = create_interface()

# Launch the app when run directly.
if __name__ == "__main__":
    print("=" * 70)
    print("SEMANTIC EXPLORER - HUGGING FACE SPACES")
    print("=" * 70)
    print("\nAvailable models:")
    # Only the display string is printed, so iterate values() directly
    # (the original iterated items() and never used the key).
    for info in AVAILABLE_MODELS.values():
        print(f" - {info['display']}")
    print("\nNote: Models and collections will be loaded on-demand.")
    print("=" * 70)
    demo.launch()