#!/usr/bin/env python3
"""
Semantic Explorer - Hugging Face Spaces Version
Multi-model support with lazy loading for HF Spaces deployment
"""
import gradio as gr
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
import numpy as np
from pathlib import Path

# Project paths
PROJECT_ROOT = Path(__file__).parent
CHROMA_DIR = PROJECT_ROOT / "chromadb"

# Model registry with dimensions
AVAILABLE_MODELS = {
    "all-MiniLM-L6-v2": {
        "name": "all-MiniLM-L6-v2",
        "display": "all-MiniLM-L6-v2: 384 dimensions",
        "hf_id": "sentence-transformers/all-MiniLM-L6-v2",
        "dimensions": 384,
        "trust_remote_code": False,
    },
    "bge-large-en-v1.5": {
        "name": "bge-large-en-v1.5",
        "display": "BGE Large EN v1.5: 1024 dimensions",
        "hf_id": "BAAI/bge-large-en-v1.5",
        "dimensions": 1024,
        "trust_remote_code": False,
    },
}

# Process-level caches so each model/collection pair is loaded at most once.
loaded_models = {}
loaded_collections = {}


def get_collection_name(model_key):
    """Generate the ChromaDB collection name for a model key."""
    return f"words_{model_key.replace('-', '_').replace('.', '_')}"


def load_model_and_collection(model_key):
    """Lazily load the embedding model and its ChromaDB collection, caching both.

    Args:
        model_key: A key of AVAILABLE_MODELS.

    Returns:
        (model, collection) on success; (None, None) when the collection
        cannot be opened (e.g. the database was not uploaded to the Space).
    """
    # Serve from cache when both pieces are already loaded.
    if model_key in loaded_models and model_key in loaded_collections:
        print(f"Using cached model: {AVAILABLE_MODELS[model_key]['display']}")
        return loaded_models[model_key], loaded_collections[model_key]

    model_info = AVAILABLE_MODELS[model_key]
    print(f"Loading model: {model_info['display']}")

    # Load embedding model directly from the Hugging Face Hub.
    trust_remote_code = model_info.get("trust_remote_code", False)
    print(f"Loading from Hugging Face: {model_info['hf_id']}")
    model = SentenceTransformer(
        model_info["hf_id"],
        trust_remote_code=trust_remote_code,
    )

    # Open the matching persisted ChromaDB collection.
    collection_name = get_collection_name(model_key)
    client = chromadb.PersistentClient(
        path=str(CHROMA_DIR),
        settings=Settings(anonymized_telemetry=False),
    )
    try:
        collection = client.get_collection(collection_name)
        count = collection.count()
        print(f"Loaded collection '{collection_name}' with {count} words")
    except Exception as e:
        error_msg = (
            f"Could not load collection '{collection_name}'.\n"
            "Please ensure the ChromaDB database is uploaded correctly."
        )
        print(f"Error: {e}")
        print(error_msg)
        return None, None

    # Cache for subsequent calls.
    loaded_models[model_key] = model
    loaded_collections[model_key] = collection
    return model, collection


def compute_similarity(vec1, vec2):
    """Compute cosine similarity between two vectors.

    Returns 0.0 for a zero-length vector instead of dividing by zero.
    """
    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denom == 0:
        return 0.0
    return np.dot(vec1, vec2) / denom


def _similarities_from_results(results, ref_embedding, reference_word, seen=None):
    """Extract (word, cosine-similarity-to-reference) pairs from a query result.

    Skips the reference word itself (case-insensitive) and, when `seen` is a
    set, any word already in it (new words are added to `seen`). Returns []
    when the result carries no documents or when embeddings are missing or
    mismatched, so the caller can fall back to distance-based scoring.
    """
    if not results or results.get('documents') is None or len(results['documents']) == 0:
        return []
    docs = results['documents'][0]
    # Avoid bare truthiness: chromadb may return embeddings as numpy arrays,
    # whose truth value is ambiguous.
    embs_list = results.get('embeddings')
    embs = embs_list[0] if embs_list is not None and len(embs_list) > 0 else []
    if len(embs) == 0 or len(embs) != len(docs):
        return []

    ref_lower = reference_word.lower()
    pairs = []
    for word, emb in zip(docs, embs):
        if word.lower() == ref_lower:
            continue
        if seen is not None:
            if word in seen:
                continue
            seen.add(word)
        similarity = compute_similarity(ref_embedding, np.array(emb))
        pairs.append((word, similarity))
    return pairs


def find_most_and_least_similar(model_key, reference_word, n=20):
    """
    Find both the n most similar AND n least similar words to the reference word

    Queries a sample of words to find the range of similarities.

    Returns:
        String with both most and least similar formatted
    """
    model, collection = load_model_and_collection(model_key)
    if model is None or collection is None:
        return "Error: Model or collection not loaded. Please ensure ChromaDB database is uploaded correctly."
    if not reference_word.strip():
        return "Please enter a reference word."

    try:
        # Get embedding for reference word
        ref_embedding = model.encode([reference_word], convert_to_numpy=True)[0]

        # Most similar: let Chroma rank candidates by the reference text, then
        # re-score with exact cosine similarity. "distances" is requested
        # explicitly because with an include list Chroma returns ONLY the
        # listed fields — without it the fallback below would always see an
        # empty distance list.
        most_results = collection.query(
            query_texts=[reference_word],
            n_results=n + 10,
            include=["documents", "embeddings", "distances"],
        )

        most_similar = _similarities_from_results(
            most_results, ref_embedding, reference_word
        )[:n]

        if not most_similar:
            # Fallback: derive approximate similarities from Chroma distances
            # when embeddings were unavailable.
            docs = (
                most_results['documents'][0]
                if most_results and most_results.get('documents') is not None
                and len(most_results['documents']) > 0
                else []
            )
            distances_list = most_results.get('distances') if most_results else None
            distances = (
                distances_list[0]
                if distances_list is not None and len(distances_list) > 0
                else []
            )
            ref_lower = reference_word.lower()
            for i, word in enumerate(docs):
                if word.lower() == ref_lower:
                    continue
                if i < len(distances):
                    # Clamp at 0: distance metrics other than cosine can
                    # exceed 1, which would yield a negative "similarity".
                    similarity = max(0.0, 1.0 - distances[i])
                else:
                    similarity = 0.0
                most_similar.append((word, similarity))
                if len(most_similar) >= n:
                    break

        if len(most_similar) == 0:
            return f"Error: Query returned no results. Collection count: {collection.count()}"

        # Least similar: query with semantically "opposite" probe words and
        # keep the lowest-scoring candidates found.
        opposite_words = ["nothing", "abstract", "void", "nonexistent", "opposite"]
        least_similar = []
        seen = set()  # O(1) duplicate check instead of rescanning the list

        for opp_word in opposite_words:
            try:
                opp_results = collection.query(
                    query_texts=[opp_word],
                    n_results=n + 5,
                    include=["documents", "embeddings"],
                )
            except Exception as e:
                print(f"Error querying with '{opp_word}': {e}")
                continue
            least_similar.extend(
                _similarities_from_results(
                    opp_results, ref_embedding, reference_word, seen=seen
                )
            )
            if len(least_similar) >= n * 2:  # enough candidates to sort
                break

        # If we still don't have enough, probe with a negated-concept embedding.
        if len(least_similar) < n:
            neg_word_embedding = model.encode(
                ["not " + reference_word], convert_to_numpy=True
            )[0]
            try:
                neg_results = collection.query(
                    query_embeddings=[neg_word_embedding.tolist()],
                    n_results=n + 10,
                    include=["documents", "embeddings"],
                )
                least_similar.extend(
                    _similarities_from_results(
                        neg_results, ref_embedding, reference_word, seen=seen
                    )
                )
            except Exception as e:
                print(f"Error with negated query: {e}")

        # Sort ascending (lowest similarity first) and keep the n least similar.
        least_similar.sort(key=lambda x: x[1])
        least_similar = least_similar[:n]

        # Format output
        model_display = AVAILABLE_MODELS[model_key]['display']
        output = [
            f"**Using: {model_display}**\n",
            "**MOST SIMILAR:**",
        ]
        output.extend([f"{word}: {sim:.4f}" for word, sim in most_similar])
        output.append("\n**LEAST SIMILAR:**")
        output.extend([f"{word}: {sim:.4f}" for word, sim in least_similar])
        return "\n".join(output)

    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Error in find_most_and_least_similar: {error_details}")
        print(f"Collection count: {collection.count() if collection else 'N/A'}")
        return f"Error: {str(e)}\n\nPlease check the logs for details."


def compare_words(model_key, reference_word, comparison_words):
    """
    Compare reference word to a list of comparison words

    Args:
        model_key: A key of AVAILABLE_MODELS.
        reference_word: The anchor word.
        comparison_words: Newline-separated words (max 10).

    Returns:
        Formatted string of cosine similarities, sorted descending.
    """
    model, collection = load_model_and_collection(model_key)
    if model is None or collection is None:
        return "Error: Model or collection not loaded. Please ensure ChromaDB database is uploaded correctly."
    if not reference_word.strip():
        return "Please enter a reference word."
    if not comparison_words.strip():
        return "Please enter at least one comparison word."

    # Parse comparison words (one per line, max 10)
    comp_list = [w.strip() for w in comparison_words.split('\n') if w.strip()]
    if len(comp_list) > 10:
        return "Maximum 10 comparison words allowed. Please reduce your list."
    if not comp_list:
        return "Please enter at least one comparison word."

    # One batched encode for the reference plus all comparison words.
    embeddings = model.encode([reference_word] + comp_list, convert_to_numpy=True)
    ref_embedding = embeddings[0]
    comp_embeddings = embeddings[1:]

    # Compute similarities and sort descending.
    results = [
        (word, compute_similarity(ref_embedding, embedding))
        for word, embedding in zip(comp_list, comp_embeddings)
    ]
    results.sort(key=lambda x: -x[1])

    # Format output
    model_display = AVAILABLE_MODELS[model_key]['display']
    output = [f"**Using: {model_display}**\n"]
    output.extend([f"{word}: {sim:.4f}" for word, sim in results])
    return "\n".join(output)


def create_interface():
    """Create the Gradio interface with model dropdown and tabs."""
    # Get model choices for the selector; keys are positionally aligned with
    # the display strings so the event lambdas can map display -> key.
    model_choices = [info["display"] for info in AVAILABLE_MODELS.values()]
    model_keys = list(AVAILABLE_MODELS.keys())

    with gr.Blocks(title="Semantic Explorer") as app:
        gr.Markdown("# 🔍 Semantic Explorer")
        gr.Markdown("Explore semantic similarity between words using embedding vectors")

        # Model selector at the top
        model_selector = gr.Radio(
            choices=model_choices,
            value=model_choices[0],
            label="Select Embedding Model",
            info="Choose which embedding model to use for similarity calculations",
        )

        with gr.Tabs():
            # Tab 1: Comparison Tool (FIRST)
            with gr.Tab("Comparison Tool"):
                gr.Markdown("### Compare a reference word to specific comparison words")
                gr.Markdown("*Enter up to 10 comparison words, one per line.*")
                with gr.Row():
                    with gr.Column():
                        ct_reference = gr.Textbox(
                            label="Reference Word",
                            placeholder="Enter a word...",
                            lines=1,
                        )
                        ct_comparisons = gr.Textbox(
                            label="Comparison Words (one per line, max 10)",
                            placeholder="word1\nword2\nword3\n...",
                            lines=10,
                        )
                        ct_button = gr.Button("Compare Words", variant="primary")
                    with gr.Column():
                        ct_output = gr.Textbox(
                            label="Results",
                            lines=15,
                            placeholder="Results will appear here...",
                        )

                ct_button.click(
                    fn=lambda selector, ref, comp: compare_words(
                        model_keys[model_choices.index(selector)], ref, comp
                    ),
                    inputs=[model_selector, ct_reference, ct_comparisons],
                    outputs=ct_output,
                )

            # Tab 2: Most & Least Similar (COMBINED)
            with gr.Tab("Most & Least Similar"):
                gr.Markdown("### Find both the most AND least semantically similar words")
                gr.Markdown("*Shows 20 results for each category*")
                with gr.Row():
                    with gr.Column():
                        ml_reference = gr.Textbox(
                            label="Reference Word",
                            placeholder="Enter a word...",
                            lines=1,
                        )
                        ml_button = gr.Button(
                            "Find Similar & Dissimilar Words", variant="primary"
                        )
                    with gr.Column():
                        ml_output = gr.Textbox(
                            label="Results",
                            lines=25,
                            placeholder="Results will appear here...",
                        )

                ml_button.click(
                    fn=lambda selector, ref: find_most_and_least_similar(
                        model_keys[model_choices.index(selector)], ref, 20
                    ),
                    inputs=[model_selector, ml_reference],
                    outputs=ml_output,
                )

        gr.Markdown("---")
        gr.Markdown("*Select different embedding models to compare their semantic representations*")

    return app


# Create the interface at import time (required by Hugging Face Spaces).
demo = create_interface()

# Launch when run directly
if __name__ == "__main__":
    print("=" * 70)
    print("SEMANTIC EXPLORER - HUGGING FACE SPACES")
    print("=" * 70)
    print("\nAvailable models:")
    for key, info in AVAILABLE_MODELS.items():
        print(f"  - {info['display']}")
    print("\nNote: Models and collections will be loaded on-demand.")
    print("=" * 70)
    demo.launch()