import os
import pickle

import streamlit as st
import torch
import faiss
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import AutoTokenizer, AutoModel
import chromadb
from huggingface_hub import login

hf_token = os.getenv("HF_TOKEN")
if hf_token:
    try:
        # Use the official login function
        login(token=hf_token)
        st.success("Hugging Face token successfully validated (or cached).")
    except Exception as e:
        # This might reveal a more specific error than AxiosError
        st.error(f"Hugging Face login failed (check token validity): {e}")
    # Still set the environment variables for downstream libraries
    os.environ["HF_TOKEN"] = hf_token
    os.environ["HUGGINGFACE_HUB_TOKEN"] = hf_token
else:
    st.warning("HF_TOKEN not found. Using anonymous access (may lead to 403 errors for gated models).")

BASE_DIR = "/tmp"
os.makedirs(BASE_DIR, exist_ok=True)

# File paths shared across the tabs
collected_file = f"{BASE_DIR}/collected_data.txt"
vector_db_file = f"{BASE_DIR}/vector_db.faiss"
embedding_file = f"{BASE_DIR}/embeddings.npy"
chunks_file = f"{BASE_DIR}/chunks.pkl"
emb_choice_file = f"{BASE_DIR}/embedding_choice.txt"
index_choice_file = f"{BASE_DIR}/index_choice.txt"
chroma_dir = f"{BASE_DIR}/chroma_db"
os.makedirs(chroma_dir, exist_ok=True)


def bert_encode(model, tokenizer, texts, batch_size=300, device="cpu"):
    """Embed texts with a vanilla BERT model by mean-pooling the last hidden state."""
    model.to(device)
    model.eval()
    all_embeddings = []
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            inputs = tokenizer(batch_texts, return_tensors="pt", padding=True,
                               truncation=True, max_length=512).to(device)
            outputs = model(**inputs)
            # Simple mean over all token positions; note that padding tokens are
            # averaged in as well (see the masked-pooling sketch below for an alternative).
            embeddings = outputs.last_hidden_state.mean(dim=1)
            all_embeddings.append(embeddings.cpu().numpy().astype("float16"))  # save memory
    return np.vstack(all_embeddings)
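
# A minimal sketch (not part of the original app): mean pooling that excludes
# padding tokens by weighting with the attention mask. bert_encode() above
# averages every position, padding included, so short texts in a padded batch
# get diluted vectors; swapping this helper in usually gives cleaner sentence
# embeddings. Shown for illustration only.
def masked_mean_pool(last_hidden_state, attention_mask):
    # Expand the mask over the hidden dimension: (batch, seq_len, 1)
    mask = attention_mask.unsqueeze(-1).type_as(last_hidden_state)
    # Sum only real tokens, then divide by the count of real tokens
    summed = (last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts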
st.title("Retriever Web Application")
st.header("Collect Data -> Chunking & Embedding -> Vector DB Creation -> Inquiry & Reranking")

tab1, tab2, tab3 = st.tabs(["Collect Data", "DB Formation", "Inquiry Vector DB"])

# Tab 1: Collect Data
with tab1:
    st.header("Collect Data")
    # Use a text area for data input
    text_input = st.text_area(
        "Paste your data file content here:",
        height=300,
        placeholder="Paste your text data here...",
    )
    collected_file_path = collected_file
    # Give the button a unique 'key' so Streamlit can tell it apart from other buttons
    collect_button_pressed = st.button("Collect", key="collect_data_button_tab1")
    if collect_button_pressed and text_input:
        # Save the pasted content to the temporary file
        with open(collected_file_path, "w", encoding="utf-8") as f:
            f.write(text_input)
        st.success("Collected the pasted content successfully!")
    elif collect_button_pressed and not text_input:
        st.warning("Please paste content into the text area before clicking Collect.")
    else:
        st.write("Waiting for data input...")

# Tab 2: DB Formation
with tab2:
    st.header("Vector DB Formation")
    chunk_size = st.number_input("Chunk size:", 50, 1000, 200, step=50)
    overlap = st.number_input("Overlap size:", 0, 500, 50, step=10)
    embedding_choice = st.selectbox("Embedding Technique", ["SentenceTransformer", "TF-IDF", "BERT"])
    index_choice = st.selectbox("Vector DB", ["FAISS", "ChromaDB"])
    embeddings = None
    if st.button("Create DB"):
        if not os.path.exists(collected_file):
            st.error("No collected data found. Use the Collect Data tab first.")
            st.stop()
        with open(collected_file, "r", encoding="utf-8") as f:
            text_data = f.read()
        # Guard the stride so a large overlap cannot produce a zero or negative step
        step = max(chunk_size - overlap, 1)
        chunks = [text_data[i:i + chunk_size] for i in range(0, len(text_data), step)]
        if embedding_choice == "SentenceTransformer":
            model = SentenceTransformer("all-MiniLM-L6-v2", token=hf_token)
            embeddings = model.encode(chunks, batch_size=300)
        elif embedding_choice == "TF-IDF":
            vectorizer = TfidfVectorizer()
            embeddings = vectorizer.fit_transform(chunks).toarray()
        elif embedding_choice == "BERT":
            model_name = "bert-base-uncased"
            tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
            model = AutoModel.from_pretrained(model_name, token=hf_token)
            embeddings = bert_encode(model, tokenizer, chunks)
        if index_choice == "FAISS":
            dim = len(embeddings[0])
            index = faiss.IndexFlatL2(dim)
            index.add(np.array(embeddings).astype("float32"))
            faiss.write_index(index, vector_db_file)
            np.save(embedding_file, embeddings)
        else:  # ChromaDB
            client = chromadb.PersistentClient(path=chroma_dir)
            try:
                client.delete_collection("rag_collection")
            except Exception:
                pass
            # Request cosine space so query distances are cosine distances (1 - cosine similarity)
            collection = client.get_or_create_collection(
                "rag_collection", metadata={"hnsw:space": "cosine"}
            )
            collection.add(
                documents=chunks,
                embeddings=np.asarray(embeddings, dtype="float32").tolist(),
                ids=[str(i) for i in range(len(chunks))],
            )
        with open(chunks_file, "wb") as f:
            pickle.dump(chunks, f)
        with open(emb_choice_file, "w") as f:
            f.write(embedding_choice)
        with open(index_choice_file, "w") as f:
            f.write(index_choice)
        st.write(f"Saved embeddings with shape: {np.asarray(embeddings).shape}")

# Tab 3: Inquiry Vector DB
with tab3:
    st.header("Inquiry Vector DB")
    user_query = st.text_area("User Query")
    expert_answer = st.text_area("Expert Answer")
    k = st.number_input("Number of retrieved data (k):", 1, 20, 5, step=1)
    if st.button("Search"):
        if not os.path.exists(chunks_file):
            st.error("No vector DB found. Create one in the DB Formation tab first.")
            st.stop()
        # Load the chunks plus the embedding and index choices saved in Tab 2
        with open(chunks_file, "rb") as f:
            chunks = pickle.load(f)
        with open(emb_choice_file, "r") as f:
            embedding_choice = f.read().strip()
        with open(index_choice_file, "r") as f:
            index_choice = f.read().strip()
        # Display the embedding and index choices in use
        st.header(f"Using Embedding: {embedding_choice}, Index: {index_choice}")
        query_emb = None
        if embedding_choice == "SentenceTransformer":
            model = SentenceTransformer("all-MiniLM-L6-v2")
            query_emb = model.encode([user_query])
        elif embedding_choice == "TF-IDF":
            # Refitting on the same chunks reproduces the identical vocabulary;
            # persisting the fitted vectorizer in Tab 2 would be more robust.
            vectorizer = TfidfVectorizer()
            vectorizer.fit(chunks)
            query_emb = vectorizer.transform([user_query]).toarray()
        elif embedding_choice == "BERT":
            model_name = "bert-base-uncased"
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModel.from_pretrained(model_name)
            query_emb = bert_encode(model, tokenizer, [user_query])
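        # Hedged guard (not in the original): stop early if no branch above
        # produced a query vector, e.g. when the saved choice file holds an
        # unexpected value.
        if query_emb is None:
            st.error("Could not embed the query; check the saved embedding choice.")
            st.stop()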
        if index_choice == "ChromaDB":
            # Explain the score the user is about to see: with the collection
            # created in cosine space, ChromaDB returns cosine *distances*
            # (1 - cosine similarity), so smaller means more similar.
            st.write("Using ChromaDB with cosine distance (1 - cosine similarity). "
                     "Scores range from 0 to 2; a smaller score means more similar, "
                     "with 0 indicating vectors pointing in the same direction.")
            client = chromadb.PersistentClient(path=chroma_dir)
            collection = client.get_or_create_collection("rag_collection")
            results = collection.query(
                query_embeddings=query_emb.tolist(),
                n_results=k,
                include=["documents", "distances"],
            )
            # Display each retrieved text next to its score
            retrieved_texts = results["documents"][0]
            retrieved_scores = results["distances"][0]
            st.subheader("Retrieved texts and scores:")
            for doc, score in zip(retrieved_texts, retrieved_scores):
                st.markdown(f"**Score:** {score:.4f}")
                st.write(doc)
                st.markdown("---")
        else:  # FAISS
            # IndexFlatL2 returns squared L2 distances: 0 means identical vectors,
            # larger values mean less similar, with no upper bound.
            st.write("Using FAISS with (squared) L2 distance. A smaller score means "
                     "more similar: scores range from 0 (identical vectors) upward, "
                     "with larger values indicating less similarity.")
            index = faiss.read_index(vector_db_file)
            D, I = index.search(np.asarray(query_emb).astype("float32"), k)
            # Display each retrieved text next to its score
            retrieved_texts = [chunks[i] for i in I[0]]
            st.subheader("Retrieved texts and scores:")
            for doc, score in zip(retrieved_texts, D[0]):
                st.markdown(f"**Score:** {score:.4f}")
                st.write(doc)
                st.markdown("---")

        # Reranking with a cross-encoder
        st.write("Reranking with a cross-encoder: a higher score means more relevant. "
                 "The scores are relative ranking values, so compare them within this "
                 "result list rather than reading the absolute magnitude.")
        # Authentication flows through the HF_TOKEN environment variable set above
        reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
        # Score all query/document pairs in one batched call
        scores = reranker.predict([(user_query, doc) for doc in retrieved_texts])
        st.subheader("Reranked scores:")
        for doc, score in zip(retrieved_texts, scores):
            st.markdown(f"**Rerank Score:** {score:.4f}")
            st.write(doc)
            st.markdown("---")

        # Measure relevance against the expert answer, if one was provided
        if expert_answer.strip():
            relevance_scores = reranker.predict([(expert_answer, doc) for doc in retrieved_texts])
            st.subheader("Relevance to Expert Answer:")
            for doc, score in zip(retrieved_texts, relevance_scores):
                st.markdown(f"**Relevance Score:** {score:.4f}")
                st.write(doc)
                st.markdown("---")
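
# Usage note (an assumption: the script's filename is not given here, so
# 'app.py' is a placeholder). With the dependencies installed, e.g.
#   pip install streamlit faiss-cpu chromadb sentence-transformers transformers scikit-learn
# the app can be launched with:
#   streamlit run app.py
# Setting HF_TOKEN in the environment beforehand enables authenticated
# Hugging Face downloads; without it the app falls back to anonymous access.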