# Fix (commit 8d18a79): properly check numpy array length instead of
# truthiness to avoid the "ambiguous truth value" error.
#!/usr/bin/env python3
"""
Semantic Explorer - Hugging Face Spaces Version
Multi-model support with lazy loading for HF Spaces deployment
"""
import gradio as gr
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
import numpy as np
from pathlib import Path
# Project paths
PROJECT_ROOT = Path(__file__).parent
CHROMA_DIR = PROJECT_ROOT / "chromadb"  # persistent ChromaDB storage directory
# Model registry with dimensions
# Maps a short model key to the metadata needed to load it from the
# Hugging Face Hub and to render it in the UI ("display" string).
AVAILABLE_MODELS = {
    "all-MiniLM-L6-v2": {
        "name": "all-MiniLM-L6-v2",
        "display": "all-MiniLM-L6-v2: 384 dimensions",
        "hf_id": "sentence-transformers/all-MiniLM-L6-v2",
        "dimensions": 384,
        "trust_remote_code": False
    },
    "bge-large-en-v1.5": {
        "name": "bge-large-en-v1.5",
        "display": "BGE Large EN v1.5: 1024 dimensions",
        "hf_id": "BAAI/bge-large-en-v1.5",
        "dimensions": 1024,
        "trust_remote_code": False
    }
}
# Global variables for caching models and collections
# Populated lazily by load_model_and_collection, keyed by model key.
loaded_models = {}
loaded_collections = {}
def get_collection_name(model_key):
    """Derive the ChromaDB collection name for a given model key.

    Hyphens and dots are mapped to underscores and the result is
    prefixed with ``words_``.
    """
    sanitized = model_key.translate(str.maketrans("-.", "__"))
    return f"words_{sanitized}"
def load_model_and_collection(model_key):
    """Lazy-load the embedding model and its ChromaDB collection, with caching.

    Args:
        model_key: Key into AVAILABLE_MODELS selecting which model to use.

    Returns:
        A ``(model, collection)`` tuple, or ``(None, None)`` if the
        ChromaDB collection could not be opened.
    """
    global loaded_models, loaded_collections
    # Return cached if already loaded (both model and collection must be cached)
    if model_key in loaded_models and model_key in loaded_collections:
        print(f"Using cached model: {AVAILABLE_MODELS[model_key]['display']}")
        return loaded_models[model_key], loaded_collections[model_key]
    model_info = AVAILABLE_MODELS[model_key]
    print(f"Loading model: {model_info['display']}")
    # Load embedding model directly from Hugging Face Hub
    trust_remote_code = model_info.get("trust_remote_code", False)
    print(f"Loading from Hugging Face: {model_info['hf_id']}")
    model = SentenceTransformer(
        model_info["hf_id"],
        trust_remote_code=trust_remote_code
    )
    # Load ChromaDB collection from the local persistent store
    collection_name = get_collection_name(model_key)
    client = chromadb.PersistentClient(
        path=str(CHROMA_DIR),
        settings=Settings(anonymized_telemetry=False)
    )
    try:
        collection = client.get_collection(collection_name)
        count = collection.count()
        print(f"Loaded collection '{collection_name}' with {count} words")
    except Exception as e:
        # Missing/corrupt database: report and signal failure to the caller
        # rather than raising into the Gradio handler.
        error_msg = f"Could not load collection '{collection_name}'.\nPlease ensure the ChromaDB database is uploaded correctly."
        print(f"Error: {e}")
        print(error_msg)
        return None, None
    # Cache them only after both loads succeeded
    loaded_models[model_key] = model
    loaded_collections[model_key] = collection
    return model, collection
def compute_similarity(vec1, vec2):
    """Compute the cosine similarity between two 1-D vectors.

    Args:
        vec1: First vector (numpy array or array-like).
        vec2: Second vector of the same length.

    Returns:
        Cosine similarity in [-1, 1]; 0.0 when either vector has zero
        magnitude (the original expression divided by zero and produced
        NaN, which would silently corrupt the similarity rankings).
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0.0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product
def find_most_and_least_similar(model_key, reference_word, n=20):
    """
    Find both the n most similar AND n least similar words to the reference word.

    Queries a sample of words to find the range of similarities.
    NOTE(review): "least similar" is a heuristic — it only samples words near
    a few hand-picked "opposite" probe words plus a negated-phrase query, so
    it is not a true global minimum over the collection.

    Args:
        model_key: Key into AVAILABLE_MODELS selecting the embedding model.
        reference_word: Word to compare against the collection.
        n: Number of results to show in each category (default 20).

    Returns:
        Markdown-formatted string with both most and least similar words,
        or a human-readable error string on failure.
    """
    model, collection = load_model_and_collection(model_key)
    if model is None or collection is None:
        return "Error: Model or collection not loaded. Please ensure ChromaDB database is uploaded correctly."
    if not reference_word.strip():
        return "Please enter a reference word."
    try:
        # Get embedding for reference word
        ref_embedding = model.encode([reference_word], convert_to_numpy=True)[0]
        # Strategy: Use query() for most similar (should work)
        # For least similar, we'll query with negated embedding and compute actual similarities
        # Query for most similar words using query_texts (more reliable than query_embeddings)
        # This should work since Comparison Tool uses similar approach
        most_results = collection.query(
            query_texts=[reference_word],
            n_results=n + 10,  # over-fetch: the reference word itself is filtered out below
            include=["documents", "embeddings"]
        )
        # Process most similar results
        most_similar = []
        # len() checks (not truthiness) because embeddings may be numpy arrays,
        # whose boolean value is ambiguous.
        if most_results and most_results.get('documents') and len(most_results['documents']) > 0:
            docs = most_results['documents'][0]
            embs_list = most_results.get('embeddings', [[]])
            embs = embs_list[0] if embs_list and len(embs_list) > 0 else []
            # Compute actual cosine similarities when embeddings were returned
            if len(embs) > 0 and len(embs) == len(docs):
                for word, emb in zip(docs, embs):
                    # Skip the reference word itself (case-insensitive)
                    if word.lower() != reference_word.lower():
                        word_embedding = np.array(emb)
                        similarity = compute_similarity(ref_embedding, word_embedding)
                        most_similar.append((word, similarity))
                        if len(most_similar) >= n:
                            break
            else:
                # Fallback: use distances (approximate similarity as 1 - distance,
                # clamped at 0; valid for cosine distance — TODO confirm the
                # collection's distance metric)
                distances_list = most_results.get('distances', [[]])
                distances = distances_list[0] if distances_list and len(distances_list) > 0 else []
                for i, word in enumerate(docs):
                    if word.lower() != reference_word.lower():
                        if i < len(distances):
                            distance = distances[i]
                            similarity = max(0.0, 1.0 - distance)
                        else:
                            similarity = 0.0
                        most_similar.append((word, similarity))
                        if len(most_similar) >= n:
                            break
        if len(most_similar) == 0:
            return f"Error: Query returned no results. Collection count: {collection.count()}"
        # For least similar, query with a semantically opposite word
        # Use words that are likely to be dissimilar (abstract concepts, opposites, etc.)
        opposite_words = ["nothing", "abstract", "void", "nonexistent", "opposite"]
        least_similar = []
        # Query with opposite words to find dissimilar words
        for opp_word in opposite_words:
            try:
                opp_results = collection.query(
                    query_texts=[opp_word],
                    n_results=n + 5,
                    include=["documents", "embeddings"]
                )
                if opp_results and opp_results.get('documents') and len(opp_results['documents']) > 0:
                    docs = opp_results['documents'][0]
                    embs_list = opp_results.get('embeddings', [[]])
                    embs = embs_list[0] if embs_list and len(embs_list) > 0 else []
                    if len(embs) > 0 and len(embs) == len(docs):
                        for word, emb in zip(docs, embs):
                            # De-duplicate against words already collected
                            if word.lower() != reference_word.lower() and word not in [w for w, _ in least_similar]:
                                word_embedding = np.array(emb)
                                similarity = compute_similarity(ref_embedding, word_embedding)
                                least_similar.append((word, similarity))
                                if len(least_similar) >= n * 2:  # Get extra to sort
                                    break
                if len(least_similar) >= n * 2:
                    break
            except Exception as e:
                # Best-effort: a failed probe word just moves on to the next one
                print(f"Error querying with '{opp_word}': {e}")
                continue
        # If we still don't have enough, try querying with negated embedding using query_texts
        if len(least_similar) < n:
            # Encode a negated concept ("not <word>") as an additional probe
            neg_word_embedding = model.encode(["not " + reference_word], convert_to_numpy=True)[0]
            try:
                neg_results = collection.query(
                    query_embeddings=[neg_word_embedding.tolist()],
                    n_results=n + 10,
                    include=["documents", "embeddings"]
                )
                if neg_results and neg_results.get('documents') and len(neg_results['documents']) > 0:
                    docs = neg_results['documents'][0]
                    embs_list = neg_results.get('embeddings', [[]])
                    embs = embs_list[0] if embs_list and len(embs_list) > 0 else []
                    if len(embs) > 0 and len(embs) == len(docs):
                        for word, emb in zip(docs, embs):
                            if word.lower() != reference_word.lower() and word not in [w for w, _ in least_similar]:
                                word_embedding = np.array(emb)
                                similarity = compute_similarity(ref_embedding, word_embedding)
                                least_similar.append((word, similarity))
                                if len(least_similar) >= n * 2:
                                    break
            except Exception as e:
                print(f"Error with negated query: {e}")
        # Sort least similar (ascending - lowest similarity first)
        least_similar.sort(key=lambda x: x[1])
        least_similar = least_similar[:n]  # Take top n least similar
        # Format output as markdown with both sections
        model_display = AVAILABLE_MODELS[model_key]['display']
        output = [
            f"**Using: {model_display}**\n",
            "**MOST SIMILAR:**"
        ]
        output.extend([f"{word}: {sim:.4f}" for word, sim in most_similar])
        output.append("\n**LEAST SIMILAR:**")
        output.extend([f"{word}: {sim:.4f}" for word, sim in least_similar])
        return "\n".join(output)
    except Exception as e:
        # Catch-all boundary for the Gradio handler: log full traceback,
        # return a short error string to the UI.
        import traceback
        error_details = traceback.format_exc()
        print(f"Error in find_most_and_least_similar: {error_details}")
        print(f"Collection count: {collection.count() if collection else 'N/A'}")
        return f"Error: {str(e)}\n\nPlease check the logs for details."
def compare_words(model_key, reference_word, comparison_words):
    """Compare a reference word against a user-supplied list of words.

    Encodes the reference word and every comparison word with the
    selected model in one batch, computes cosine similarities, and
    returns a markdown-formatted ranking (highest similarity first).
    """
    model, collection = load_model_and_collection(model_key)
    if model is None or collection is None:
        return "Error: Model or collection not loaded. Please ensure ChromaDB database is uploaded correctly."
    # Guard clauses for empty inputs
    if not reference_word.strip():
        return "Please enter a reference word."
    if not comparison_words.strip():
        return "Please enter at least one comparison word."
    # One word per line; blank lines are ignored; at most 10 allowed.
    candidates = [line.strip() for line in comparison_words.split('\n') if line.strip()]
    if len(candidates) > 10:
        return "Maximum 10 comparison words allowed. Please reduce your list."
    if not candidates:
        return "Please enter at least one comparison word."
    # Encode everything in a single batch; index 0 holds the reference word.
    vectors = model.encode([reference_word] + candidates, convert_to_numpy=True)
    reference_vec = vectors[0]
    # Score each candidate and sort by similarity, descending.
    scored = sorted(
        ((word, compute_similarity(reference_vec, vec))
         for word, vec in zip(candidates, vectors[1:])),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Assemble the markdown result
    header = f"**Using: {AVAILABLE_MODELS[model_key]['display']}**\n"
    lines = [header] + [f"{word}: {sim:.4f}" for word, sim in scored]
    return "\n".join(lines)
def create_interface():
    """Create the Gradio interface: a model selector plus two tool tabs."""
    # Parallel lists: choices are the human-readable display strings shown in
    # the Radio widget, keys are the AVAILABLE_MODELS keys at the same index.
    model_choices = [info["display"] for info in AVAILABLE_MODELS.values()]
    model_keys = list(AVAILABLE_MODELS.keys())
    with gr.Blocks(title="Semantic Explorer") as app:
        gr.Markdown("# 🔍 Semantic Explorer")
        gr.Markdown("Explore semantic similarity between words using embedding vectors")
        # Model selector at the top, shared by every tab
        model_selector = gr.Radio(
            choices=model_choices,
            value=model_choices[0],
            label="Select Embedding Model",
            info="Choose which embedding model to use for similarity calculations"
        )
        with gr.Tabs():
            # Tab 1: Comparison Tool (FIRST)
            with gr.Tab("Comparison Tool"):
                gr.Markdown("### Compare a reference word to specific comparison words")
                gr.Markdown("*Enter up to 10 comparison words, one per line.*")
                with gr.Row():
                    with gr.Column():
                        ct_reference = gr.Textbox(
                            label="Reference Word",
                            placeholder="Enter a word...",
                            lines=1
                        )
                        ct_comparisons = gr.Textbox(
                            label="Comparison Words (one per line, max 10)",
                            placeholder="word1\nword2\nword3\n...",
                            lines=10
                        )
                        ct_button = gr.Button("Compare Words", variant="primary")
                    with gr.Column():
                        ct_output = gr.Textbox(
                            label="Results",
                            lines=15,
                            placeholder="Results will appear here..."
                        )
                # The lambda maps the selected display string back to its
                # model key before delegating to compare_words.
                ct_button.click(
                    fn=lambda selector, ref, comp: compare_words(
                        model_keys[model_choices.index(selector)], ref, comp
                    ),
                    inputs=[model_selector, ct_reference, ct_comparisons],
                    outputs=ct_output
                )
            # Tab 2: Most & Least Similar (COMBINED)
            with gr.Tab("Most & Least Similar"):
                gr.Markdown("### Find both the most AND least semantically similar words")
                gr.Markdown("*Shows 20 results for each category*")
                with gr.Row():
                    with gr.Column():
                        ml_reference = gr.Textbox(
                            label="Reference Word",
                            placeholder="Enter a word...",
                            lines=1
                        )
                        ml_button = gr.Button("Find Similar & Dissimilar Words", variant="primary")
                    with gr.Column():
                        ml_output = gr.Textbox(
                            label="Results",
                            lines=25,
                            placeholder="Results will appear here..."
                        )
                # Same display-string -> key mapping; n fixed at 20 results
                ml_button.click(
                    fn=lambda selector, ref: find_most_and_least_similar(
                        model_keys[model_choices.index(selector)], ref, 20
                    ),
                    inputs=[model_selector, ml_reference],
                    outputs=ml_output
                )
        gr.Markdown("---")
        gr.Markdown("*Select different embedding models to compare their semantic representations*")
    return app
# Create the interface at import time so the `demo` object is available
# to the Hugging Face Spaces runtime, which imports this module.
demo = create_interface()

# Launch when run directly (startup banner + blocking Gradio server)
if __name__ == "__main__":
    print("=" * 70)
    print("SEMANTIC EXPLORER - HUGGING FACE SPACES")
    print("=" * 70)
    print("\nAvailable models:")
    # Only the display string is needed; iterate values() directly instead
    # of unpacking an unused key from .items().
    for info in AVAILABLE_MODELS.values():
        print(f" - {info['display']}")
    print("\nNote: Models and collections will be loaded on-demand.")
    print("=" * 70)
    demo.launch()