# import gradio as gr
# import faiss
# import json
# import numpy as np
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline

# embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# index = faiss.read_index("faiss_index.bin")  
# with open("processed_chunks.json", "r") as f:
#     chunks = json.load(f)


# #weights load automatically to the Space
# pipe = pipeline("text-generation", model="ibm-granite/granite-3.1-2b-instruct")
# # pipe = pipeline("text-generation", model="ibm-granite/granite-3.3-8b-instruct")


# def ask_hr_bot(question):
#     # Retrieval
#     query_vec = embed_model.encode([question])
#     distances, indices = index.search(np.array(query_vec).astype('float32'), k=1)
    
#     # out-of-scope questions
#     if distances[0][0] > 1.5:
#         return "I could not find information on this in the HR documents.", "N/A"

#     # Context Retrieval
#     relevant_chunk = chunks[indices[0][0]]
#     context = relevant_chunk['text']
#     source = relevant_chunk['doc_name']

#     # Grounded Generation
#     prompt = f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:"
#     res = pipe(prompt, max_new_tokens=60, do_sample=False)
    
#     answer = res[0]['generated_text'].split("Answer:")[-1].strip()
#     return answer, source


# interface = gr.Interface(
#     fn=ask_hr_bot,
#     inputs=gr.Textbox(label="Ask an HR Question"),
#     outputs=[gr.Textbox(label="Bot Answer"), gr.Textbox(label="Source Used")],
#     title="HR Knowledge Assistant",
#     description="Enterprise RAG Prototype using IBM Granite Instruct Family."
# )

# interface.launch()

# import gradio as gr
# import faiss
# import json
# import numpy as np
# import torch
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline

# # 1. Load Retrieval Logic
# embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# index = faiss.read_index("faiss_index.bin")
# with open("processed_chunks.json", "r") as f:
#     chunks = json.load(f)

# # (Using Path 1: Ungated models to avoid the 401 Unauthorized Error)
# MODELS = {
#     "Granite 3.1 2B": "ibm-granite/granite-3.1-2b-instruct",
#     "Qwen 2.5 1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
#     "SmolLM 1.7B": "HuggingFaceTB/SmolLM-1.7B-Instruct",
#     "Phi 3.5 Mini": "microsoft/Phi-3.5-mini-instruct",
# }

# def ask_specific_model(model_name, prompt):
#     # Load model, generate, and delete to save RAM
#     pipe = pipeline("text-generation", model=MODELS[model_name], device_map="cpu")
#     res = pipe(prompt, max_new_tokens=60, do_sample=False)
#     return res[0]['generated_text'].split("Answer:")[-1].strip()

# def compare_hr_bots(question):
#     # 1. Retrieval
#     query_vec = embed_model.encode([question])
#     distances, indices = index.search(np.array(query_vec).astype('float32'), k=1)
    
#     # Fallback Rule
#     if distances[0][0] > 1.5:
#         # Yield the fallback immediately
#         yield "Out of scope", "Out of scope", "Out of scope", "Out of scope", "N/A"
#         return

#     # 2. Context Setup
#     relevant_chunk = chunks[indices[0][0]]
#     context = relevant_chunk['text']
#     source = relevant_chunk['doc_name']
#     prompt = f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:"

#     # 3. Setup progressive output array
#     # This acts as our visual placeholders
#     results = ["⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting..."]
    
#     # Yield the initial state so the user sees the source document immediately
#     yield results[0], results[1], results[2], results[3], source

#     # 4. Sequential Generation with Yield
#     model_names = list(MODELS.keys())
#     for i, name in enumerate(model_names):
#         # Update the current box to show it is actively generating
#         results[i] = "⚙️ Generating..."
#         yield results[0], results[1], results[2], results[3], source
        
#         # Run the model
#         ans = ask_specific_model(name, prompt)
        
#         # Save the answer and yield the updated UI
#         results[i] = ans
#         yield results[0], results[1], results[2], results[3], source

# # 5. Multi-Output Interface
# interface = gr.Interface(
#     fn=compare_hr_bots,
#     inputs=gr.Textbox(label="Ask an HR Question"),
#     outputs=[
#         gr.Textbox(label="IBM Granite 3.1"),
#         gr.Textbox(label="Alibaba Qwen 2.5 1.5B"),
#         gr.Textbox(label="HuggingFaceTB SmolLM 1.7B"),
#         gr.Textbox(label="Microsoft Phi 3.5"),
#         gr.Textbox(label="Source Used")
#     ],
#     title="RAG Model Benchmarking",
#     description="Sequential model comparison. Answers yield progressively to manage CPU RAM limits."
# )

# interface.launch()


# import gradio as gr
# import faiss
# import json
# import numpy as np
# import torch
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline

# embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# index = faiss.read_index("faiss_index.bin")
# with open("processed_chunks.json", "r") as f:
#     chunks = json.load(f)

# MODELS = {
#     "Granite 3.1 2B": "ibm-granite/granite-3.1-2b-instruct",
#     "Qwen 2.5 1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
#     "SmolLM 1.7B": "HuggingFaceTB/SmolLM-1.7B-Instruct",
#      "Phi 3.5 Mini": "microsoft/Phi-3.5-mini-instruct"
# }

# def ask_specific_model(model_name, prompt):
#     pipe = pipeline("text-generation", model=MODELS[model_name], device_map="cpu")
#     res = pipe(prompt, max_new_tokens=60, do_sample=False)
#     return res[0]['generated_text'].split("Answer:")[-1].strip()

# def compare_hr_bots(question):
#     query_vec = embed_model.encode([question])
#     distances, indices = index.search(np.array(query_vec).astype('float32'), k=1)
    
#     if distances[0][0] > 1.5:
#         yield "Out of scope", "Out of scope", "Out of scope", "Out of scope", "N/A"
#         return

#     relevant_chunk = chunks[indices[0][0]]
#     context = relevant_chunk['text']
#     source = relevant_chunk['doc_name']
#     prompt = f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:"

#     results = ["⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting..."]
#     yield results[0], results[1], results[2], results[3], source

#     # Sequential Generation
#     model_names = list(MODELS.keys())
#     for i, name in enumerate(model_names):
#         results[i] = "⚙️ Generating..."
#         yield results[0], results[1], results[2], results[3], source
#         ans = ask_specific_model(name, prompt)
#         results[i] = ans
#         yield results[0], results[1], results[2], results[3], source

# interface = gr.Interface(
#     fn=compare_hr_bots,
#     inputs=gr.Textbox(label="Ask an HR Question", placeholder="e.g., How many annual leave days do I get?"),
#     outputs=[
#         gr.Textbox(label="IBM Granite 3.1 2B"),
#         gr.Textbox(label="Qwen 2.5 1.5B"),
#         gr.Textbox(label="SmolLM 1.7B"),
#         gr.Textbox(label="Microsoft Phi 3.5 Mini"),
#         gr.Textbox(label="Source Used")
#     ],
#     title="ADU Enterprise HR Knowledge Assistant: Model Benchmarking",
#     description="Comparing grounding quality across 4 open-source LLMs using Enterprise HR Policies. Please be patient since there is a limit of 16GB RAM :)"
# )

# interface.launch()


import gradio as gr
import faiss
import json
import numpy as np
import torch
import gc
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer


# Retrieval assets, loaded once at import time and shared by every request.
# Sentence encoder used by compare_hr_bots to embed the incoming question.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# Pre-built FAISS index that compare_hr_bots searches with the question embedding.
# NOTE(review): presumably built from the same chunks, in the same order and
# with the same encoder, since search results are used to index `chunks` — confirm.
index = faiss.read_index("faiss_index.bin")
# Chunk records; the code in this file reads their 'text' and 'doc_name' keys.
# The encoding is pinned to UTF-8 so loading does not depend on the platform's
# default locale encoding.
with open("processed_chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)


# Display name -> model identifier handed to transformers when loading.
# Insertion order matters: compare_hr_bots fills its result slots in this
# order, and the UI wires those slots to the Granite, Phi, Qwen and SmolLM
# output boxes by position.
MODELS = dict(
    (
        ("IBM Granite 3.1 2B", "ibm-granite/granite-3.1-2b-instruct"),
        ("Microsoft Phi 3.5 Mini", "microsoft/Phi-3.5-mini-instruct"),
        ("Qwen 2.5 1.5B", "Qwen/Qwen2.5-1.5B-Instruct"),
        ("SmolLM 1.7B", "HuggingFaceTB/SmolLM-1.7B-Instruct"),
    )
)


def ask_specific_model(model_name, prompt):
    """Generate one model's answer for an already-assembled RAG prompt.

    A fresh text-generation pipeline is built on CPU for every call and
    released again before returning, so models loaded one after another do
    not accumulate in memory.

    Args:
        model_name: Key into ``MODELS`` selecting which model to load.
        prompt: Full prompt text; callers in this file end it with "Answer:".

    Returns:
        The newly generated text only (the prompt is not echoed back),
        with surrounding whitespace stripped.

    Raises:
        KeyError: If ``model_name`` is not a key of ``MODELS``.
    """
    pipe = pipeline("text-generation", model=MODELS[model_name], device_map="cpu")
    try:
        # return_full_text=False yields just the continuation. Splitting the
        # echoed prompt on "Answer:" picked the wrong span whenever the model
        # itself emitted another "Answer:" in its output.
        res = pipe(
            prompt,
            max_new_tokens=60,
            do_sample=False,
            return_full_text=False,
        )
        return res[0]['generated_text'].strip()
    finally:
        # Drop the only reference to the model weights and collect right away,
        # before the caller loads the next model.
        del pipe
        gc.collect()

def compare_hr_bots(question):
    """Stream side-by-side answers from every model in ``MODELS``.

    The question is embedded, the single nearest chunk is retrieved from the
    FAISS index, and each model is then asked in turn with the same grounded
    prompt. This is a generator: every yield is a full snapshot of the UI
    state, so the caller can refresh progressively while models run one at a
    time.

    Args:
        question: The user's question text.

    Yields:
        Tuples of ``len(MODELS) + 1`` strings: one cell per model (in
        ``MODELS`` order) followed by the source document name, or "N/A"
        when nothing was retrieved.
    """
    slot_count = len(MODELS)

    # A blank question would otherwise trigger a full run of every model.
    if not question or not question.strip():
        yield (*["Please enter a question."] * slot_count, "N/A")
        return

    query_vec = embed_model.encode([question])
    distances, indices = index.search(np.array(query_vec).astype('float32'), k=1)
    best_distance = distances[0][0]
    best_index = indices[0][0]

    # FAISS reports a label of -1 when it has no neighbour to return; without
    # this check chunks[-1] would silently be used as context. Matches whose
    # distance exceeds 1.5 are treated as out of scope.
    if best_index < 0 or best_distance > 1.5:
        yield (*["Out of scope"] * slot_count, "N/A")
        return

    relevant_chunk = chunks[best_index]
    context = relevant_chunk['text']
    source = relevant_chunk['doc_name']
    prompt = f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:"

    # One placeholder per model; emitted immediately so the source document
    # is visible before any model has been loaded.
    results = ["⏳ Waiting..."] * slot_count
    yield (*results, source)

    for i, name in enumerate(MODELS):
        results[i] = "⚙️ Generating..."
        yield (*results, source)
        try:
            results[i] = ask_specific_model(name, prompt)
        except Exception as exc:
            # Report the failure in this model's cell and keep going, so one
            # broken model does not discard the answers already produced.
            results[i] = f"Error generating answer: {exc}"
        yield (*results, source)


def calculate_perplexity(model_name):
    """Compute a model's perplexity on a small sample of the loaded chunks.

    The text of the first three entries of ``chunks`` is joined with spaces,
    scored by the selected causal language model in a single forward pass,
    and reported as ``exp(loss)``.

    Args:
        model_name: Key into ``MODELS`` selecting the model to evaluate.

    Returns:
        A human-readable result string. Failures (unknown model, load or
        inference errors, no usable text) are returned as an error message
        rather than raised, so the UI always has something to display.
    """
    model = tokenizer = inputs = outputs = None
    try:
        model_id = MODELS[model_name]

        test_text = " ".join(chunk['text'] for chunk in chunks[:3])
        if not test_text.strip():
            return "Error calculating perplexity: no text available to evaluate."

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu")

        # Truncate to the tokenizer's configured maximum length so unusually
        # long chunks cannot exceed the model's context window.
        inputs = tokenizer(test_text, return_tensors="pt", truncation=True)
        if inputs["input_ids"].shape[1] < 2:
            # The loss is over next-token predictions; fewer than two tokens
            # leaves nothing to predict and the loss would not be a number.
            return "Error calculating perplexity: sample text is too short to score."

        with torch.no_grad():
            outputs = model(input_ids=inputs["input_ids"], labels=inputs["input_ids"])

        perplexity = torch.exp(outputs.loss).item()
        return f"Perplexity Score: {perplexity:.2f}\n\n(Tested on internal HR policies. Lower is better.)"

    except Exception as e:
        # UI boundary: surface the problem as text instead of crashing the app.
        return f"Error calculating perplexity: {str(e)}"

    finally:
        # Runs on success and on failure alike, so a model that loaded but
        # then errored is still released before the next evaluation.
        model = tokenizer = inputs = outputs = None
        gc.collect()


# Two-tab Gradio UI: a side-by-side RAG comparison tab and a perplexity
# evaluation tab. Components are created in the order they appear below.
with gr.Blocks(theme=gr.themes.Soft()) as interface:
    gr.Markdown("# ADQ Enterprise HR Knowledge Assistant & Evaluation Toolkit")
    gr.Markdown("Comparing grounding quality across 4 open-source LLMs using Enterprise HR Policies. Please be patient since there is a limit of 16GB RAM :)")
    
    with gr.Tabs():
        # Tab 1: ask one question and compare every model's answer.
        with gr.TabItem("💬 RAG Chatbot (Benchmarking)"):
            question_input = gr.Textbox(label="Ask an HR Question", placeholder="e.g., How many annual leave days do I get?")
            submit_btn = gr.Button("Compare Models")
            
            # One answer box per model, arranged as two rows of two.
            with gr.Row():
                out_granite = gr.Textbox(label="IBM Granite 3.1 2B")
                out_phi = gr.Textbox(label="Microsoft Phi 3.5 Mini")
            with gr.Row():
                out_qwen = gr.Textbox(label="Alibaba Qwen 2.5 1.5B")
                out_smol = gr.Textbox(label="HuggingFace SmolLM 1.7B")
            out_source = gr.Textbox(label="Source Document Used")
            
            # compare_hr_bots yields 5-tuples: four model cells in MODELS key
            # order (Granite, Phi, Qwen, SmolLM) followed by the source name.
            # The outputs list below must stay in that same order.
            submit_btn.click(
                fn=compare_hr_bots,
                inputs=question_input,
                outputs=[out_granite, out_phi, out_qwen, out_smol, out_source]
            )
            
       
        # Tab 2: compute perplexity for a single selected model.
        with gr.TabItem("📊 Perplexity Evaluator"):
            gr.Markdown("Select a single model to calculate its perplexity against our internal HR dataset. **Warning: Takes 30-60 seconds on CPU.**")
            
            # Choices come from MODELS; the preselected value must remain one
            # of its keys, since calculate_perplexity looks it up in MODELS.
            model_dropdown = gr.Dropdown(choices=list(MODELS.keys()), label="Select Model to Evaluate", value="IBM Granite 3.1 2B")
            eval_btn = gr.Button("Calculate Perplexity")
            eval_output = gr.Textbox(label="Evaluation Result")
            
            # calculate_perplexity returns one string (a result or an error
            # message), shown in the single output box.
            eval_btn.click(
                fn=calculate_perplexity,
                inputs=model_dropdown,
                outputs=eval_output
            )

# Start the app as soon as this module is executed.
interface.launch()