import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import google as genai
from sklearn.metrics.pairwise import cosine_similarity
import io

# Page setup: must be the first Streamlit command issued by the script.
st.set_page_config(page_title="Sentence Classification Tool", layout="wide")

st.title("📊 Sentence Classification Tool")
st.markdown("Upload your reference vectors and source data to classify sentences based on cosine similarity.")

# Everything created inside this context is rendered in the sidebar, in the
# order the calls appear below.
with st.sidebar:
    st.header("⚙️ Configuration")

    # Step 1: the two CSV inputs (each is None until the user uploads a file).
    st.subheader("1. Upload Files")
    reference_file = st.file_uploader("Reference Vector File (tag, sentence)", type=['csv'])
    source_file = st.file_uploader("Source Data File (sentence)", type=['csv'])

    # Step 2: which embedding backend to use.
    st.subheader("2. Select Embedding Model")
    model_choice = st.selectbox(
        "Embedding Model",
        ["sentence-transformers/all-MiniLM-L6-v2", "Google Gemini text-embedding-004"],
    )

    # The API key field is only shown for the Gemini backend; otherwise the
    # key stays None.
    api_key = (
        st.text_input("Gemini API Key", type="password")
        if model_choice == "Google Gemini text-embedding-004"
        else None
    )

    # Step 3: minimum similarity a match needs in order to receive a tag.
    st.subheader("3. Set Threshold")
    threshold = st.number_input(
        "Cosine Similarity Threshold",
        min_value=0.0,
        max_value=1.0,
        value=0.7,
        step=0.01,
        format="%.2f",
    )

# Number of sentences sent to the embedding backend per request/call.
BATCH_SIZE = 50

def load_and_validate_files(ref_file, src_file):
    """Load the two uploaded CSV files and check they have the required columns.

    Header names are stripped of surrounding whitespace before validation so
    that a header written as ``tag, sentence`` is accepted.

    Args:
        ref_file: Path or file-like object for the reference CSV; it must
            provide ``tag`` and ``sentence`` columns.
        src_file: Path or file-like object for the source CSV; it must
            provide a ``sentence`` column.

    Returns:
        ``(ref_df, src_df)`` with rows lacking a required value removed, or
        ``(None, None)`` after the problem has been reported via ``st.error``
        (unreadable file, missing column, or no usable reference rows).
    """

    def _clean_header(name):
        # Only string labels can carry stray whitespace; leave others alone.
        return name.strip() if isinstance(name, str) else name

    try:
        ref_df = pd.read_csv(ref_file).rename(columns=_clean_header)
        src_df = pd.read_csv(src_file).rename(columns=_clean_header)

        # Validate reference file columns
        if 'tag' not in ref_df.columns or 'sentence' not in ref_df.columns:
            st.error("Reference file must contain 'tag' and 'sentence' columns")
            return None, None

        # Validate source file columns
        if 'sentence' not in src_df.columns:
            st.error("Source file must contain 'sentence' column")
            return None, None

        # Remove any rows with missing sentences (and, for references, tags)
        ref_df = ref_df.dropna(subset=['sentence', 'tag'])
        src_df = src_df.dropna(subset=['sentence'])

        # Without at least one reference row there is nothing to compare
        # against; report it here instead of failing later during the
        # similarity computation.
        if ref_df.empty:
            st.error("Reference file contains no rows with both a tag and a sentence")
            return None, None

        return ref_df, src_df
    except Exception as e:
        # Boundary handler: any read/parse failure is surfaced in the UI.
        st.error(f"Error loading files: {str(e)}")
        return None, None

def get_embeddings_sentence_transformer(sentences, model, batch_size=None):
    """Generate embeddings using sentence-transformers in batches.

    Args:
        sentences: Sequence of sentences to embed; it is sliced into batches.
        model: Object exposing ``encode(batch)`` that returns one vector per
            sentence in the batch.
        batch_size: Number of sentences per ``encode`` call. Defaults to the
            module-level ``BATCH_SIZE`` when omitted.

    Returns:
        ``np.ndarray`` built from all vectors, in the same order as
        ``sentences`` (an empty array when ``sentences`` is empty).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if batch_size < 1:
        # A non-positive step would either crash range() or silently yield
        # no batches at all.
        raise ValueError("batch_size must be a positive integer")

    all_embeddings = []
    for start in range(0, len(sentences), batch_size):
        all_embeddings.extend(model.encode(sentences[start:start + batch_size]))

    return np.array(all_embeddings)

def get_embeddings_gemini(sentences, client, batch_size=None, model_name="text-embedding-004"):
    """Generate embeddings using Google Gemini in batches.

    Args:
        sentences: Sequence of sentences to embed; it is sliced into batches.
        client: Gemini client exposing ``models.embed_content(model=...,
            contents=...)``.
        batch_size: Number of sentences per request. Defaults to the
            module-level ``BATCH_SIZE`` when omitted.
        model_name: Embedding model identifier passed to the API.

    Returns:
        ``np.ndarray`` built from all returned vectors, in the same order as
        ``sentences`` (an empty array when ``sentences`` is empty).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        RuntimeError: If a response does not contain exactly one embedding
            per sentence in the batch.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    all_embeddings = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        result = client.models.embed_content(model=model_name, contents=batch)

        # The embeddings are matched to dataframe rows purely by position, so
        # a missing or short response must not be accepted silently.
        embeddings = result.embeddings or []
        if len(embeddings) != len(batch):
            raise RuntimeError(
                f"Gemini returned {len(embeddings)} embeddings "
                f"for a batch of {len(batch)} sentences"
            )
        all_embeddings.extend(embedding.values for embedding in embeddings)

    return np.array(all_embeddings)

def classify_sentences(ref_embeddings, src_embeddings, ref_df, src_df, threshold):
    """Tag each source sentence with the tag of its most similar reference.

    Args:
        ref_embeddings: Reference vectors; row ``i`` is looked up as
            ``ref_df.iloc[i]``, so the two must be aligned by position.
        src_embeddings: Source vectors, one per row of ``src_df``.
        ref_df: Reference dataframe providing ``tag`` and ``sentence``.
        src_df: Source dataframe; it is copied, not modified.
        threshold: Minimum best-match cosine similarity (inclusive) required
            to assign a reference tag instead of ``"Other"``.

    Returns:
        A copy of ``src_df`` with the columns ``Predicted Tag``,
        ``Cosine Similarity Score`` (rounded to 4 decimals) and
        ``Matched Reference Sentence`` appended.
    """
    # Transient UI elements; both are removed again once the loop finishes.
    bar = st.progress(0)
    status = st.empty()

    total = len(src_embeddings)
    tags, scores, matches = [], [], []

    for position, vector in enumerate(src_embeddings, start=1):
        bar.progress(position / total)
        status.text(f"Processing sentence {position} of {total}")

        # Similarity of this one source vector against every reference vector.
        row_scores = cosine_similarity([vector], ref_embeddings)[0]
        best = np.argmax(row_scores)
        best_score = row_scores[best]

        if best_score >= threshold:
            hit = ref_df.iloc[best]
            tag, match = hit['tag'], hit['sentence']
        else:
            # Below the threshold: no tag, and no reference sentence reported.
            tag, match = "Other", ""

        tags.append(tag)
        # The best score is recorded even when it did not reach the threshold.
        scores.append(round(best_score, 4))
        matches.append(match)

    bar.empty()
    status.empty()

    # Work on a copy so the caller's dataframe stays untouched.
    results_df = src_df.copy()
    new_columns = (
        ('Predicted Tag', tags),
        ('Cosine Similarity Score', scores),
        ('Matched Reference Sentence', matches),
    )
    for column, values in new_columns:
        results_df[column] = values

    return results_df


# Main processing
# Runs only once both CSV files have been uploaded; otherwise the usage
# instructions at the bottom are shown instead.
if reference_file and source_file:
    # Load and validate files; on failure the helper has already displayed an
    # error and returns (None, None), so nothing further is rendered.
    ref_df, src_df = load_and_validate_files(reference_file, source_file)

    if ref_df is not None and src_df is not None:
        # Display file information
        col1, col2 = st.columns(2)

        with col1:
            st.subheader("📄 Reference Vectors")
            st.write(f"Total reference sentences: {len(ref_df)}")
            st.dataframe(ref_df.head(), use_container_width=True)

        with col2:
            st.subheader("📄 Source Data")
            st.write(f"Total source sentences: {len(src_df)}")
            st.dataframe(src_df.head(), use_container_width=True)

        # NOTE(review): the results below are rendered only in the run in
        # which this button was clicked. Clicking the download button likely
        # triggers a rerun in which st.button returns False, so the results
        # would disappear - consider keeping them in st.session_state.
        if st.button("🚀 Start Classification", type="primary"):
            # Validate Gemini API key if needed
            if model_choice == "Google Gemini text-embedding-004" and not api_key:
                st.error("Please provide a Gemini API key")
            else:
                try:
                    with st.spinner("Loading embedding model..."):
                        if model_choice == "sentence-transformers/all-MiniLM-L6-v2":
                            model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
                            st.success("✅ Model loaded successfully")
                        else:
                            # The Gen AI SDK lives in the `google.genai`
                            # submodule. The file-level `import google as
                            # genai` only binds the `google` namespace
                            # package, which has no `Client` attribute, so
                            # import the real module here before using it.
                            from google import genai
                            client = genai.Client(api_key=api_key)
                            st.success("✅ Gemini client initialized")

                    # Generate embeddings for reference vectors
                    with st.spinner("Generating embeddings for reference vectors..."):
                        ref_sentences = ref_df['sentence'].tolist()
                        if model_choice == "sentence-transformers/all-MiniLM-L6-v2":
                            ref_embeddings = get_embeddings_sentence_transformer(ref_sentences, model)
                        else:
                            ref_embeddings = get_embeddings_gemini(ref_sentences, client)
                        st.success(f"✅ Generated {len(ref_embeddings)} reference embeddings")

                    # Generate embeddings for source data
                    with st.spinner("Generating embeddings for source data..."):
                        src_sentences = src_df['sentence'].tolist()
                        if model_choice == "sentence-transformers/all-MiniLM-L6-v2":
                            src_embeddings = get_embeddings_sentence_transformer(src_sentences, model)
                        else:
                            src_embeddings = get_embeddings_gemini(src_sentences, client)
                        st.success(f"✅ Generated {len(src_embeddings)} source embeddings")

                    # Classify sentences
                    st.subheader("🔄 Classification Progress")
                    results_df = classify_sentences(
                        ref_embeddings,
                        src_embeddings,
                        ref_df,
                        src_df,
                        threshold
                    )

                    st.success("✅ Classification complete!")

                    # Display results
                    st.subheader("📊 Results")

                    # Summary statistics: rows that received a reference tag
                    # versus rows that fell back to "Other".
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.metric("Total Sentences", len(results_df))
                    with col2:
                        classified = len(results_df[results_df['Predicted Tag'] != 'Other'])
                        st.metric("Classified", classified)
                    with col3:
                        other = len(results_df[results_df['Predicted Tag'] == 'Other'])
                        st.metric("Other", other)

                    # Display results table
                    st.dataframe(results_df, use_container_width=True)

                    # Serialize the results in memory for the download button
                    csv_buffer = io.StringIO()
                    results_df.to_csv(csv_buffer, index=False)
                    csv_bytes = csv_buffer.getvalue().encode()

                    st.download_button(
                        label="⬇️ Download Results CSV",
                        data=csv_bytes,
                        file_name="classification_results.csv",
                        mime="text/csv"
                    )

                except Exception as e:
                    # Top-level boundary: show any failure (model download,
                    # missing SDK, API error, ...) in the UI instead of
                    # crashing the app.
                    st.error(f"Error during processing: {str(e)}")

else:
    st.info("👈 Please upload both reference and source files to begin")
    
    # Instructions
    st.markdown("""
    ### 📖 Instructions
    
    1. **Upload Reference Vector File**: CSV with columns `tag` and `sentence`
    2. **Upload Source Data File**: CSV with column `sentence`
    3. **Select Embedding Model**: Choose between Sentence Transformers or Google Gemini
    4. **Set Threshold**: Adjust the cosine similarity threshold (0-1)
    5. **Click Start Classification**: Process your data
    
    The tool will classify each source sentence by finding the most similar reference sentence and assigning its tag if the similarity exceeds the threshold.
    """)