| # import gradio as gr | |
| # import faiss | |
| # import json | |
| # import numpy as np | |
| # from sentence_transformers import SentenceTransformer | |
| # from transformers import pipeline | |
| # embed_model = SentenceTransformer('all-MiniLM-L6-v2') | |
| # index = faiss.read_index("faiss_index.bin") | |
| # with open("processed_chunks.json", "r") as f: | |
| # chunks = json.load(f) | |
| # #weights load automatically to the Space | |
| # pipe = pipeline("text-generation", model="ibm-granite/granite-3.1-2b-instruct") | |
| # # pipe = pipeline("text-generation", model="ibm-granite/granite-3.3-8b-instruct") | |
| # def ask_hr_bot(question): | |
| # # Retrieval | |
| # query_vec = embed_model.encode([question]) | |
| # distances, indices = index.search(np.array(query_vec).astype('float32'), k=1) | |
| # # out-of-scope questions | |
| # if distances[0][0] > 1.5: | |
| # return "I could not find information on this in the HR documents.", "N/A" | |
| # # Context Retrieval | |
| # relevant_chunk = chunks[indices[0][0]] | |
| # context = relevant_chunk['text'] | |
| # source = relevant_chunk['doc_name'] | |
| # # Grounded Generation | |
| # prompt = f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:" | |
| # res = pipe(prompt, max_new_tokens=60, do_sample=False) | |
| # answer = res[0]['generated_text'].split("Answer:")[-1].strip() | |
| # return answer, source | |
| # interface = gr.Interface( | |
| # fn=ask_hr_bot, | |
| # inputs=gr.Textbox(label="Ask an HR Question"), | |
| # outputs=[gr.Textbox(label="Bot Answer"), gr.Textbox(label="Source Used")], | |
| # title="HR Knowledge Assistant", | |
| # description="Enterprise RAG Prototype using IBM Granite Instruct Family." | |
| # ) | |
| # interface.launch() | |
| # import gradio as gr | |
| # import faiss | |
| # import json | |
| # import numpy as np | |
| # import torch | |
| # from sentence_transformers import SentenceTransformer | |
| # from transformers import pipeline | |
| # # 1. Load Retrieval Logic | |
| # embed_model = SentenceTransformer('all-MiniLM-L6-v2') | |
| # index = faiss.read_index("faiss_index.bin") | |
| # with open("processed_chunks.json", "r") as f: | |
| # chunks = json.load(f) | |
| # # (Using Path 1: Ungated models to avoid the 401 Unauthorized Error) | |
| # MODELS = { | |
| # "Granite 3.1 2B": "ibm-granite/granite-3.1-2b-instruct", | |
| # "Qwen 2.5 1.5B": "Qwen/Qwen2.5-1.5B-Instruct", | |
| # "SmolLM 1.7B": "HuggingFaceTB/SmolLM-1.7B-Instruct", | |
| # "Phi 3.5 Mini": "microsoft/Phi-3.5-mini-instruct", | |
| # } | |
| # def ask_specific_model(model_name, prompt): | |
| # # Load model, generate, and delete to save RAM | |
| # pipe = pipeline("text-generation", model=MODELS[model_name], device_map="cpu") | |
| # res = pipe(prompt, max_new_tokens=60, do_sample=False) | |
| # return res[0]['generated_text'].split("Answer:")[-1].strip() | |
| # def compare_hr_bots(question): | |
| # # 1. Retrieval | |
| # query_vec = embed_model.encode([question]) | |
| # distances, indices = index.search(np.array(query_vec).astype('float32'), k=1) | |
| # # Fallback Rule | |
| # if distances[0][0] > 1.5: | |
| # # Yield the fallback immediately | |
| # yield "Out of scope", "Out of scope", "Out of scope", "Out of scope", "N/A" | |
| # return | |
| # # 2. Context Setup | |
| # relevant_chunk = chunks[indices[0][0]] | |
| # context = relevant_chunk['text'] | |
| # source = relevant_chunk['doc_name'] | |
| # prompt = f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:" | |
| # # 3. Setup progressive output array | |
| # # This acts as our visual placeholders | |
| # results = ["⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting..."] | |
| # # Yield the initial state so the user sees the source document immediately | |
| # yield results[0], results[1], results[2], results[3], source | |
| # # 4. Sequential Generation with Yield | |
| # model_names = list(MODELS.keys()) | |
| # for i, name in enumerate(model_names): | |
| # # Update the current box to show it is actively generating | |
| # results[i] = "⚙️ Generating..." | |
| # yield results[0], results[1], results[2], results[3], source | |
| # # Run the model | |
| # ans = ask_specific_model(name, prompt) | |
| # # Save the answer and yield the updated UI | |
| # results[i] = ans | |
| # yield results[0], results[1], results[2], results[3], source | |
| # # 5. Multi-Output Interface | |
| # interface = gr.Interface( | |
| # fn=compare_hr_bots, | |
| # inputs=gr.Textbox(label="Ask an HR Question"), | |
| # outputs=[ | |
| # gr.Textbox(label="IBM Granite 3.1"), | |
| # gr.Textbox(label="Alibaba Qwen 2.5 1.5B"), | |
| # gr.Textbox(label="HuggingFaceTB SmolLM 1.7B"), | |
| # gr.Textbox(label="Microsoft Phi 3.5"), | |
| # gr.Textbox(label="Source Used") | |
| # ], | |
| # title="RAG Model Benchmarking", | |
| # description="Sequential model comparison. Answers yield progressively to manage CPU RAM limits." | |
| # ) | |
| # interface.launch() | |
| # import gradio as gr | |
| # import faiss | |
| # import json | |
| # import numpy as np | |
| # import torch | |
| # from sentence_transformers import SentenceTransformer | |
| # from transformers import pipeline | |
| # embed_model = SentenceTransformer('all-MiniLM-L6-v2') | |
| # index = faiss.read_index("faiss_index.bin") | |
| # with open("processed_chunks.json", "r") as f: | |
| # chunks = json.load(f) | |
| # MODELS = { | |
| # "Granite 3.1 2B": "ibm-granite/granite-3.1-2b-instruct", | |
| # "Qwen 2.5 1.5B": "Qwen/Qwen2.5-1.5B-Instruct", | |
| # "SmolLM 1.7B": "HuggingFaceTB/SmolLM-1.7B-Instruct", | |
| # "Phi 3.5 Mini": "microsoft/Phi-3.5-mini-instruct" | |
| # } | |
| # def ask_specific_model(model_name, prompt): | |
| # pipe = pipeline("text-generation", model=MODELS[model_name], device_map="cpu") | |
| # res = pipe(prompt, max_new_tokens=60, do_sample=False) | |
| # return res[0]['generated_text'].split("Answer:")[-1].strip() | |
| # def compare_hr_bots(question): | |
| # query_vec = embed_model.encode([question]) | |
| # distances, indices = index.search(np.array(query_vec).astype('float32'), k=1) | |
| # if distances[0][0] > 1.5: | |
| # yield "Out of scope", "Out of scope", "Out of scope", "Out of scope", "N/A" | |
| # return | |
| # relevant_chunk = chunks[indices[0][0]] | |
| # context = relevant_chunk['text'] | |
| # source = relevant_chunk['doc_name'] | |
| # prompt = f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:" | |
| # results = ["⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting..."] | |
| # yield results[0], results[1], results[2], results[3], source | |
| # # Sequential Generation | |
| # model_names = list(MODELS.keys()) | |
| # for i, name in enumerate(model_names): | |
| # results[i] = "⚙️ Generating..." | |
| # yield results[0], results[1], results[2], results[3], source | |
| # ans = ask_specific_model(name, prompt) | |
| # results[i] = ans | |
| # yield results[0], results[1], results[2], results[3], source | |
| # interface = gr.Interface( | |
| # fn=compare_hr_bots, | |
| # inputs=gr.Textbox(label="Ask an HR Question", placeholder="e.g., How many annual leave days do I get?"), | |
| # outputs=[ | |
| # gr.Textbox(label="IBM Granite 3.1 2B"), | |
| # gr.Textbox(label="Qwen 2.5 1.5B"), | |
| # gr.Textbox(label="SmolLM 1.7B"), | |
| # gr.Textbox(label="Microsoft Phi 3.5 Mini"), | |
| # gr.Textbox(label="Source Used") | |
| # ], | |
| # title="ADU Enterprise HR Knowledge Assistant: Model Benchmarking", | |
| # description="Comparing grounding quality across 4 open-source LLMs using Enterprise HR Policies. Please be patient since there is a limit of 16GB RAM :)" | |
| # ) | |
| # interface.launch() | |
| import gradio as gr | |
| import faiss | |
| import json | |
| import numpy as np | |
| import torch | |
| import gc | |
| from sentence_transformers import SentenceTransformer | |
| from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer | |
# Sentence encoder used to embed incoming questions.
# NOTE(review): presumably the same model that produced the vectors stored in
# faiss_index.bin — confirm against the indexing script.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Pre-built vector index, read from the working directory at import time.
index = faiss.read_index("faiss_index.bin")

# Chunk records; the rest of this file reads their 'text' and 'doc_name' keys
# and looks them up by the row id that the index search returns.
# The encoding is pinned to UTF-8 so decoding does not depend on the host's
# locale default (open() otherwise uses the platform encoding).
with open("processed_chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

# Display name -> Hugging Face model id.
# Order matters: compare_hr_bots walks this dict in insertion order and the UI
# binds the answers positionally to the Granite, Phi, Qwen and SmolLM boxes.
MODELS = {
    "IBM Granite 3.1 2B": "ibm-granite/granite-3.1-2b-instruct",
    "Microsoft Phi 3.5 Mini": "microsoft/Phi-3.5-mini-instruct",
    "Qwen 2.5 1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
    "SmolLM 1.7B": "HuggingFaceTB/SmolLM-1.7B-Instruct",
}
def ask_specific_model(model_name, prompt):
    """Generate a short, greedy completion for *prompt* with one configured model.

    A fresh text-generation pipeline is built on CPU for every call and is
    released again before returning, so only one model needs to be resident
    at a time.

    Args:
        model_name: Key of ``MODELS`` selecting the Hugging Face model id.
        prompt: Complete prompt text passed to the pipeline as-is.

    Returns:
        The text the model generated after the prompt (at most 60 new
        tokens), with surrounding whitespace stripped.

    Raises:
        KeyError: If ``model_name`` is not a key of ``MODELS``.
        Exception: Anything raised while loading or running the pipeline is
            propagated to the caller.
    """
    pipe = pipeline("text-generation", model=MODELS[model_name], device_map="cpu")
    try:
        # return_full_text=False makes the pipeline return only the newly
        # generated continuation. Splitting the echoed prompt on "Answer:"
        # picked the wrong span whenever the model emitted that marker itself.
        res = pipe(
            prompt,
            max_new_tokens=60,
            do_sample=False,
            return_full_text=False,
        )
        return res[0]['generated_text'].strip()
    finally:
        # Drop the model and force a collection pass so its memory can be
        # reclaimed before the next model is loaded, even when generation fails.
        del pipe
        gc.collect()
def compare_hr_bots(question):
    """Stream answers to *question* from each configured model, one at a time.

    The question is embedded and matched against the FAISS index (top-1). The
    best chunk becomes the context of a grounded prompt that is sent to each
    model in ``MODELS`` in turn. This is a generator so the UI can refresh
    after every step.

    Args:
        question: The user's question text. Blank or ``None`` input is
            rejected without touching the index or any model.

    Yields:
        5-tuples of strings: four per-model answer/status strings (in
        ``MODELS`` order) followed by the source document name, or ``"N/A"``
        when no document was used.
    """
    # The UI binds exactly four answer boxes plus one source box to this
    # generator, so every yielded row must have this shape.
    slots = 4

    def row(answers, source_name):
        return (*answers, source_name)

    # Nothing to search for: answer immediately instead of embedding an empty
    # string and loading four models for it.
    if not question or not question.strip():
        yield row(["Please enter a question."] * slots, "N/A")
        return

    query_vec = embed_model.encode([question])
    distances, indices = index.search(np.array(query_vec).astype('float32'), k=1)

    best = int(indices[0][0])
    # FAISS reports -1 as the id when it has no neighbour to return; used as a
    # list index that would silently select the *last* chunk.
    # Hits with a distance above 1.5 are treated as out of scope.
    # NOTE(review): assumes the index returns distances where smaller means
    # closer (e.g. L2) — confirm against how faiss_index.bin was built.
    if best < 0 or distances[0][0] > 1.5:
        yield row(["Out of scope"] * slots, "N/A")
        return

    relevant_chunk = chunks[best]
    context = relevant_chunk['text']
    source = relevant_chunk['doc_name']
    prompt = f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:"

    # Show the source document right away while the models are still pending.
    results = ["⏳ Waiting..."] * slots
    yield row(results, source)

    # zip() caps the loop at the number of answer boxes, so a longer MODELS
    # dict cannot index past the end of `results`.
    for i, name in zip(range(slots), MODELS):
        results[i] = "⚙️ Generating..."
        yield row(results, source)
        try:
            results[i] = ask_specific_model(name, prompt)
        except Exception as exc:
            # UI boundary: report the failure in this model's box and carry on
            # with the remaining models instead of aborting the comparison.
            results[i] = f"Error: {exc}"
        yield row(results, source)
def calculate_perplexity(model_name):
    """Compute one model's perplexity on a small sample of the HR chunks.

    The text of the first three entries of ``chunks`` is joined with spaces
    and scored in a single forward pass with the input ids as labels;
    perplexity is ``exp(loss)``.

    Args:
        model_name: Key of ``MODELS`` selecting the model to evaluate.

    Returns:
        A display string with the perplexity score, or a string starting with
        ``"Error calculating perplexity:"`` if anything goes wrong. This
        function does not raise for ordinary failures.
    """
    # Pre-bind so the cleanup in `finally` is valid no matter where the
    # `try` body stops.
    model = tokenizer = inputs = outputs = None
    try:
        model_id = MODELS[model_name]
        test_text = " ".join(chunk['text'] for chunk in chunks[:3])

        # Without any text there is nothing to score; bail out before paying
        # for a model load.
        if not test_text.strip():
            return "Error calculating perplexity: no HR text available to evaluate."

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu")
        inputs = tokenizer(test_text, return_tensors="pt")

        # Inference only: no gradients are needed for the loss value.
        with torch.no_grad():
            outputs = model(input_ids=inputs["input_ids"], labels=inputs["input_ids"])
        perplexity = torch.exp(outputs.loss).item()

        return f"Perplexity Score: {perplexity:.2f}\n\n(Tested on internal HR policies. Lower is better.)"
    except Exception as e:
        return f"Error calculating perplexity: {str(e)}"
    finally:
        # Release the model on success *and* on failure, then force a
        # collection pass so the memory is available for the next evaluation.
        model = tokenizer = inputs = outputs = None
        gc.collect()
# Two-tab Gradio UI: a side-by-side RAG comparison and a perplexity check.
# NOTE(review): the nesting below was reconstructed from a source whose
# indentation was lost — in particular, confirm that `out_source` belongs
# below the two rows rather than inside the second one.
with gr.Blocks(theme=gr.themes.Soft()) as interface:
    gr.Markdown("# ADQ Enterprise HR Knowledge Assistant & Evaluation Toolkit")
    gr.Markdown("Comparing grounding quality across 4 open-source LLMs using Enterprise HR Policies. Please be patient since there is a limit of 16GB RAM :)")
    with gr.Tabs():
        # TAB 1 UI
        with gr.TabItem("💬 RAG Chatbot (Benchmarking)"):
            question_input = gr.Textbox(label="Ask an HR Question", placeholder="e.g., How many annual leave days do I get?")
            submit_btn = gr.Button("Compare Models")
            with gr.Row():
                out_granite = gr.Textbox(label="IBM Granite 3.1 2B")
                out_phi = gr.Textbox(label="Microsoft Phi 3.5 Mini")
            with gr.Row():
                out_qwen = gr.Textbox(label="Alibaba Qwen 2.5 1.5B")
                out_smol = gr.Textbox(label="HuggingFace SmolLM 1.7B")
            out_source = gr.Textbox(label="Source Document Used")
            # The output order must match the order in which compare_hr_bots
            # yields its values: MODELS order, then the source name.
            submit_btn.click(
                fn=compare_hr_bots,
                inputs=question_input,
                outputs=[out_granite, out_phi, out_qwen, out_smol, out_source],
            )
        # TAB 2 UI
        with gr.TabItem("📊 Perplexity Evaluator"):
            gr.Markdown("Select a single model to calculate its perplexity against our internal HR dataset. **Warning: Takes 30-60 seconds on CPU.**")
            model_dropdown = gr.Dropdown(choices=list(MODELS.keys()), label="Select Model to Evaluate", value="IBM Granite 3.1 2B")
            eval_btn = gr.Button("Calculate Perplexity")
            eval_output = gr.Textbox(label="Evaluation Result")
            eval_btn.click(
                fn=calculate_perplexity,
                inputs=model_dropdown,
                outputs=eval_output,
            )

# compare_hr_bots is a generator handler; streaming its intermediate yields
# relies on the request queue. Enabling it explicitly keeps the app working on
# Gradio versions where the queue is not on by default.
interface.queue()
interface.launch()