# import gradio as gr # import faiss # import json # import numpy as np # from sentence_transformers import SentenceTransformer # from transformers import pipeline # embed_model = SentenceTransformer('all-MiniLM-L6-v2') # index = faiss.read_index("faiss_index.bin") # with open("processed_chunks.json", "r") as f: # chunks = json.load(f) # #weights load automatically to the Space # pipe = pipeline("text-generation", model="ibm-granite/granite-3.1-2b-instruct") # # pipe = pipeline("text-generation", model="ibm-granite/granite-3.3-8b-instruct") # def ask_hr_bot(question): # # Retrieval # query_vec = embed_model.encode([question]) # distances, indices = index.search(np.array(query_vec).astype('float32'), k=1) # # out-of-scope questions # if distances[0][0] > 1.5: # return "I could not find information on this in the HR documents.", "N/A" # # Context Retrieval # relevant_chunk = chunks[indices[0][0]] # context = relevant_chunk['text'] # source = relevant_chunk['doc_name'] # # Grounded Generation # prompt = f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:" # res = pipe(prompt, max_new_tokens=60, do_sample=False) # answer = res[0]['generated_text'].split("Answer:")[-1].strip() # return answer, source # interface = gr.Interface( # fn=ask_hr_bot, # inputs=gr.Textbox(label="Ask an HR Question"), # outputs=[gr.Textbox(label="Bot Answer"), gr.Textbox(label="Source Used")], # title="HR Knowledge Assistant", # description="Enterprise RAG Prototype using IBM Granite Instruct Family." # ) # interface.launch() # import gradio as gr # import faiss # import json # import numpy as np # import torch # from sentence_transformers import SentenceTransformer # from transformers import pipeline # # 1. 
# Load Retrieval Logic # embed_model = SentenceTransformer('all-MiniLM-L6-v2') # index = faiss.read_index("faiss_index.bin") # with open("processed_chunks.json", "r") as f: # chunks = json.load(f) # # (Using Path 1: Ungated models to avoid the 401 Unauthorized Error) # MODELS = { # "Granite 3.1 2B": "ibm-granite/granite-3.1-2b-instruct", # "Qwen 2.5 1.5B": "Qwen/Qwen2.5-1.5B-Instruct", # "SmolLM 1.7B": "HuggingFaceTB/SmolLM-1.7B-Instruct", # "Phi 3.5 Mini": "microsoft/Phi-3.5-mini-instruct", # } # def ask_specific_model(model_name, prompt): # # Load model, generate, and delete to save RAM # pipe = pipeline("text-generation", model=MODELS[model_name], device_map="cpu") # res = pipe(prompt, max_new_tokens=60, do_sample=False) # return res[0]['generated_text'].split("Answer:")[-1].strip() # def compare_hr_bots(question): # # 1. Retrieval # query_vec = embed_model.encode([question]) # distances, indices = index.search(np.array(query_vec).astype('float32'), k=1) # # Fallback Rule # if distances[0][0] > 1.5: # # Yield the fallback immediately # yield "Out of scope", "Out of scope", "Out of scope", "Out of scope", "N/A" # return # # 2. Context Setup # relevant_chunk = chunks[indices[0][0]] # context = relevant_chunk['text'] # source = relevant_chunk['doc_name'] # prompt = f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:" # # 3. Setup progressive output array # # This acts as our visual placeholders # results = ["⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting..."] # # Yield the initial state so the user sees the source document immediately # yield results[0], results[1], results[2], results[3], source # # 4. Sequential Generation with Yield # model_names = list(MODELS.keys()) # for i, name in enumerate(model_names): # # Update the current box to show it is actively generating # results[i] = "⚙️ Generating..." 
# yield results[0], results[1], results[2], results[3], source # # Run the model # ans = ask_specific_model(name, prompt) # # Save the answer and yield the updated UI # results[i] = ans # yield results[0], results[1], results[2], results[3], source # # 5. Multi-Output Interface # interface = gr.Interface( # fn=compare_hr_bots, # inputs=gr.Textbox(label="Ask an HR Question"), # outputs=[ # gr.Textbox(label="IBM Granite 3.1"), # gr.Textbox(label="Alibaba Qwen 2.5 1.5B"), # gr.Textbox(label="HuggingFaceTB SmolLM 1.7B"), # gr.Textbox(label="Microsoft Phi 3.5"), # gr.Textbox(label="Source Used") # ], # title="RAG Model Benchmarking", # description="Sequential model comparison. Answers yield progressively to manage CPU RAM limits." # ) # interface.launch() # import gradio as gr # import faiss # import json # import numpy as np # import torch # from sentence_transformers import SentenceTransformer # from transformers import pipeline # embed_model = SentenceTransformer('all-MiniLM-L6-v2') # index = faiss.read_index("faiss_index.bin") # with open("processed_chunks.json", "r") as f: # chunks = json.load(f) # MODELS = { # "Granite 3.1 2B": "ibm-granite/granite-3.1-2b-instruct", # "Qwen 2.5 1.5B": "Qwen/Qwen2.5-1.5B-Instruct", # "SmolLM 1.7B": "HuggingFaceTB/SmolLM-1.7B-Instruct", # "Phi 3.5 Mini": "microsoft/Phi-3.5-mini-instruct" # } # def ask_specific_model(model_name, prompt): # pipe = pipeline("text-generation", model=MODELS[model_name], device_map="cpu") # res = pipe(prompt, max_new_tokens=60, do_sample=False) # return res[0]['generated_text'].split("Answer:")[-1].strip() # def compare_hr_bots(question): # query_vec = embed_model.encode([question]) # distances, indices = index.search(np.array(query_vec).astype('float32'), k=1) # if distances[0][0] > 1.5: # yield "Out of scope", "Out of scope", "Out of scope", "Out of scope", "N/A" # return # relevant_chunk = chunks[indices[0][0]] # context = relevant_chunk['text'] # source = relevant_chunk['doc_name'] # prompt = 
# f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:" # results = ["⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting..."] # yield results[0], results[1], results[2], results[3], source # # Sequential Generation # model_names = list(MODELS.keys()) # for i, name in enumerate(model_names): # results[i] = "⚙️ Generating..." # yield results[0], results[1], results[2], results[3], source # ans = ask_specific_model(name, prompt) # results[i] = ans # yield results[0], results[1], results[2], results[3], source # interface = gr.Interface( # fn=compare_hr_bots, # inputs=gr.Textbox(label="Ask an HR Question", placeholder="e.g., How many annual leave days do I get?"), # outputs=[ # gr.Textbox(label="IBM Granite 3.1 2B"), # gr.Textbox(label="Qwen 2.5 1.5B"), # gr.Textbox(label="SmolLM 1.7B"), # gr.Textbox(label="Microsoft Phi 3.5 Mini"), # gr.Textbox(label="Source Used") # ], # title="ADU Enterprise HR Knowledge Assistant: Model Benchmarking", # description="Comparing grounding quality across 4 open-source LLMs using Enterprise HR Policies. 
# Please be patient since there is a limit of 16GB RAM :)" # ) # interface.launch()

# ---------------------------------------------------------------------------
# Active implementation: HR RAG chatbot benchmark + perplexity evaluator.
# ---------------------------------------------------------------------------
import gc
import json

import gradio as gr
import faiss
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# Retrieval assets are loaded once at start-up and shared by every request.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
index = faiss.read_index("faiss_index.bin")
# JSON is UTF-8 by specification; do not depend on the platform default.
with open("processed_chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

# Display name -> Hugging Face model id.
# NOTE: insertion order matters. compare_hr_bots() yields one answer per
# entry in this order, and the UI below wires its answer textboxes in the
# same order (Granite, Phi, Qwen, SmolLM).
MODELS = {
    "IBM Granite 3.1 2B": "ibm-granite/granite-3.1-2b-instruct",
    "Microsoft Phi 3.5 Mini": "microsoft/Phi-3.5-mini-instruct",
    "Qwen 2.5 1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
    "SmolLM 1.7B": "HuggingFaceTB/SmolLM-1.7B-Instruct",
}

# Retrieval hits whose distance exceeds this value are treated as out of scope.
MAX_RETRIEVAL_DISTANCE = 1.5
# Upper bound on generated tokens per answer.
MAX_NEW_TOKENS = 60
# Number of leading chunks concatenated into the perplexity test text.
PERPLEXITY_SAMPLE_CHUNKS = 3


def ask_specific_model(model_name: str, prompt: str) -> str:
    """Generate an answer for *prompt* with one of the configured models.

    The pipeline is created for this call only and released before returning,
    so that models run one after another do not pile up in RAM.

    Args:
        model_name: A key of ``MODELS``.
        prompt: Full prompt text; it is expected to end with ``"Answer:"``.

    Returns:
        The text following the last ``"Answer:"`` marker in the pipeline
        output, stripped of surrounding whitespace.

    Raises:
        KeyError: If *model_name* is not a key of ``MODELS``.
    """
    pipe = None
    try:
        pipe = pipeline("text-generation", model=MODELS[model_name], device_map="cpu")
        res = pipe(prompt, max_new_tokens=MAX_NEW_TOKENS, do_sample=False)
        return res[0]['generated_text'].split("Answer:")[-1].strip()
    finally:
        # Same cleanup discipline as calculate_perplexity(): drop the model
        # reference and collect before the next model is loaded.
        del pipe
        gc.collect()


def compare_hr_bots(question):
    """Answer *question* with every model in ``MODELS``, streaming progress.

    This is a generator used as a Gradio event handler. Every yielded value
    is a tuple with one entry per model (in ``MODELS`` order) followed by the
    source document name, so the UI can update progressively while models run
    sequentially.

    Args:
        question: The user's question text.

    Yields:
        ``(answer_1, ..., answer_n, source)`` tuples of strings, where
        ``n == len(MODELS)``.
    """
    model_names = list(MODELS)
    slot_count = len(model_names)

    # Nothing to retrieve for a blank question; avoid loading any model.
    if not question or not question.strip():
        blank = ["Please enter a question."] * slot_count
        yield (*blank, "N/A")
        return

    query_vec = embed_model.encode([question])
    distances, indices = index.search(np.array(query_vec).astype('float32'), k=1)
    best_distance = distances[0][0]
    best_index = int(indices[0][0])

    # FAISS reports a label of -1 when it has no neighbour to return; without
    # this check chunks[-1] would silently be used as context.
    if best_index < 0 or best_distance > MAX_RETRIEVAL_DISTANCE:
        rejected = ["Out of scope"] * slot_count
        yield (*rejected, "N/A")
        return

    relevant_chunk = chunks[best_index]
    context = relevant_chunk['text']
    source = relevant_chunk['doc_name']

    prompt = f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:"

    # Show placeholders right away so the user sees the source immediately.
    results = ["⏳ Waiting..."] * slot_count
    yield (*results, source)

    for i, name in enumerate(model_names):
        results[i] = "⚙️ Generating..."
        yield (*results, source)

        try:
            ans = ask_specific_model(name, prompt)
        except Exception as exc:
            # UI boundary: report the failure in this model's box (as the
            # perplexity tab does) and keep going with the remaining models.
            ans = f"Error generating answer: {exc}"

        results[i] = ans
        yield (*results, source)


def calculate_perplexity(model_name: str) -> str:
    """Compute a model's perplexity on a sample of the HR chunks.

    The first ``PERPLEXITY_SAMPLE_CHUNKS`` chunk texts are joined with spaces,
    tokenized, and scored with the causal-LM loss; perplexity is ``exp(loss)``.

    Args:
        model_name: A key of ``MODELS``.

    Returns:
        A human-readable result string, or an error message string if
        anything fails (this function does not raise to the UI).
    """
    model = tokenizer = inputs = outputs = None
    try:
        model_id = MODELS[model_name]

        sample_texts = [chunk['text'] for chunk in chunks[:PERPLEXITY_SAMPLE_CHUNKS]]
        test_text = " ".join(sample_texts)

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu")

        # Truncate to the tokenizer's configured maximum length so the sample
        # is not scored beyond what the tokenizer declares as supported.
        inputs = tokenizer(test_text, return_tensors="pt", truncation=True)

        with torch.no_grad():
            outputs = model(input_ids=inputs["input_ids"], labels=inputs["input_ids"])

        loss = outputs.loss
        perplexity = torch.exp(loss).item()

        return f"Perplexity Score: {perplexity:.2f}\n\n(Tested on internal HR policies. Lower is better.)"
    except Exception as e:
        return f"Error calculating perplexity: {str(e)}"
    finally:
        # Release the model on both the success and the error path.
        del model
        del tokenizer
        del inputs
        del outputs
        gc.collect()


with gr.Blocks(theme=gr.themes.Soft()) as interface:
    gr.Markdown("# ADQ Enterprise HR Knowledge Assistant & Evaluation Toolkit")
    gr.Markdown("Comparing grounding quality across 4 open-source LLMs using Enterprise HR Policies. Please be patient since there is a limit of 16GB RAM :)")

    with gr.Tabs():
        # TAB 1 UI
        with gr.TabItem("💬 RAG Chatbot (Benchmarking)"):
            question_input = gr.Textbox(label="Ask an HR Question", placeholder="e.g., How many annual leave days do I get?")
            submit_btn = gr.Button("Compare Models")

            with gr.Row():
                out_granite = gr.Textbox(label="IBM Granite 3.1 2B")
                out_phi = gr.Textbox(label="Microsoft Phi 3.5 Mini")
            with gr.Row():
                out_qwen = gr.Textbox(label="Alibaba Qwen 2.5 1.5B")
                out_smol = gr.Textbox(label="HuggingFace SmolLM 1.7B")

            out_source = gr.Textbox(label="Source Document Used")

            # Output order must match the key order of MODELS, then source.
            submit_btn.click(
                fn=compare_hr_bots,
                inputs=question_input,
                outputs=[out_granite, out_phi, out_qwen, out_smol, out_source]
            )

        # TAB 2 UI
        with gr.TabItem("📊 Perplexity Evaluator"):
            gr.Markdown("Select a single model to calculate its perplexity against our internal HR dataset. **Warning: Takes 30-60 seconds on CPU.**")

            model_dropdown = gr.Dropdown(choices=list(MODELS.keys()), label="Select Model to Evaluate", value="IBM Granite 3.1 2B")
            eval_btn = gr.Button("Calculate Perplexity")
            eval_output = gr.Textbox(label="Evaluation Result")

            eval_btn.click(
                fn=calculate_perplexity,
                inputs=model_dropdown,
                outputs=eval_output
            )

interface.launch()