jayjay-12345's picture
Update app.py
f1b302b verified
# import gradio as gr
# import faiss
# import json
# import numpy as np
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline
# embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# index = faiss.read_index("faiss_index.bin")
# with open("processed_chunks.json", "r") as f:
# chunks = json.load(f)
# #weights load automatically to the Space
# pipe = pipeline("text-generation", model="ibm-granite/granite-3.1-2b-instruct")
# # pipe = pipeline("text-generation", model="ibm-granite/granite-3.3-8b-instruct")
# def ask_hr_bot(question):
# # Retrieval
# query_vec = embed_model.encode([question])
# distances, indices = index.search(np.array(query_vec).astype('float32'), k=1)
# # out-of-scope questions
# if distances[0][0] > 1.5:
# return "I could not find information on this in the HR documents.", "N/A"
# # Context Retrieval
# relevant_chunk = chunks[indices[0][0]]
# context = relevant_chunk['text']
# source = relevant_chunk['doc_name']
# # Grounded Generation
# prompt = f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:"
# res = pipe(prompt, max_new_tokens=60, do_sample=False)
# answer = res[0]['generated_text'].split("Answer:")[-1].strip()
# return answer, source
# interface = gr.Interface(
# fn=ask_hr_bot,
# inputs=gr.Textbox(label="Ask an HR Question"),
# outputs=[gr.Textbox(label="Bot Answer"), gr.Textbox(label="Source Used")],
# title="HR Knowledge Assistant",
# description="Enterprise RAG Prototype using IBM Granite Instruct Family."
# )
# interface.launch()
# import gradio as gr
# import faiss
# import json
# import numpy as np
# import torch
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline
# # 1. Load Retrieval Logic
# embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# index = faiss.read_index("faiss_index.bin")
# with open("processed_chunks.json", "r") as f:
# chunks = json.load(f)
# # (Using Path 1: Ungated models to avoid the 401 Unauthorized Error)
# MODELS = {
# "Granite 3.1 2B": "ibm-granite/granite-3.1-2b-instruct",
# "Qwen 2.5 1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
# "SmolLM 1.7B": "HuggingFaceTB/SmolLM-1.7B-Instruct",
# "Phi 3.5 Mini": "microsoft/Phi-3.5-mini-instruct",
# }
# def ask_specific_model(model_name, prompt):
# # Load model, generate, and delete to save RAM
# pipe = pipeline("text-generation", model=MODELS[model_name], device_map="cpu")
# res = pipe(prompt, max_new_tokens=60, do_sample=False)
# return res[0]['generated_text'].split("Answer:")[-1].strip()
# def compare_hr_bots(question):
# # 1. Retrieval
# query_vec = embed_model.encode([question])
# distances, indices = index.search(np.array(query_vec).astype('float32'), k=1)
# # Fallback Rule
# if distances[0][0] > 1.5:
# # Yield the fallback immediately
# yield "Out of scope", "Out of scope", "Out of scope", "Out of scope", "N/A"
# return
# # 2. Context Setup
# relevant_chunk = chunks[indices[0][0]]
# context = relevant_chunk['text']
# source = relevant_chunk['doc_name']
# prompt = f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:"
# # 3. Setup progressive output array
# # This acts as our visual placeholders
# results = ["⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting..."]
# # Yield the initial state so the user sees the source document immediately
# yield results[0], results[1], results[2], results[3], source
# # 4. Sequential Generation with Yield
# model_names = list(MODELS.keys())
# for i, name in enumerate(model_names):
# # Update the current box to show it is actively generating
# results[i] = "⚙️ Generating..."
# yield results[0], results[1], results[2], results[3], source
# # Run the model
# ans = ask_specific_model(name, prompt)
# # Save the answer and yield the updated UI
# results[i] = ans
# yield results[0], results[1], results[2], results[3], source
# # 5. Multi-Output Interface
# interface = gr.Interface(
# fn=compare_hr_bots,
# inputs=gr.Textbox(label="Ask an HR Question"),
# outputs=[
# gr.Textbox(label="IBM Granite 3.1"),
# gr.Textbox(label="Alibaba Qwen 2.5 1.5B"),
# gr.Textbox(label="HuggingFaceTB SmolLM 1.7B"),
# gr.Textbox(label="Microsoft Phi 3.5"),
# gr.Textbox(label="Source Used")
# ],
# title="RAG Model Benchmarking",
# description="Sequential model comparison. Answers yield progressively to manage CPU RAM limits."
# )
# interface.launch()
# import gradio as gr
# import faiss
# import json
# import numpy as np
# import torch
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline
# embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# index = faiss.read_index("faiss_index.bin")
# with open("processed_chunks.json", "r") as f:
# chunks = json.load(f)
# MODELS = {
# "Granite 3.1 2B": "ibm-granite/granite-3.1-2b-instruct",
# "Qwen 2.5 1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
# "SmolLM 1.7B": "HuggingFaceTB/SmolLM-1.7B-Instruct",
# "Phi 3.5 Mini": "microsoft/Phi-3.5-mini-instruct"
# }
# def ask_specific_model(model_name, prompt):
# pipe = pipeline("text-generation", model=MODELS[model_name], device_map="cpu")
# res = pipe(prompt, max_new_tokens=60, do_sample=False)
# return res[0]['generated_text'].split("Answer:")[-1].strip()
# def compare_hr_bots(question):
# query_vec = embed_model.encode([question])
# distances, indices = index.search(np.array(query_vec).astype('float32'), k=1)
# if distances[0][0] > 1.5:
# yield "Out of scope", "Out of scope", "Out of scope", "Out of scope", "N/A"
# return
# relevant_chunk = chunks[indices[0][0]]
# context = relevant_chunk['text']
# source = relevant_chunk['doc_name']
# prompt = f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:"
# results = ["⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting..."]
# yield results[0], results[1], results[2], results[3], source
# # Sequential Generation
# model_names = list(MODELS.keys())
# for i, name in enumerate(model_names):
# results[i] = "⚙️ Generating..."
# yield results[0], results[1], results[2], results[3], source
# ans = ask_specific_model(name, prompt)
# results[i] = ans
# yield results[0], results[1], results[2], results[3], source
# interface = gr.Interface(
# fn=compare_hr_bots,
# inputs=gr.Textbox(label="Ask an HR Question", placeholder="e.g., How many annual leave days do I get?"),
# outputs=[
# gr.Textbox(label="IBM Granite 3.1 2B"),
# gr.Textbox(label="Qwen 2.5 1.5B"),
# gr.Textbox(label="SmolLM 1.7B"),
# gr.Textbox(label="Microsoft Phi 3.5 Mini"),
# gr.Textbox(label="Source Used")
# ],
# title="ADU Enterprise HR Knowledge Assistant: Model Benchmarking",
# description="Comparing grounding quality across 4 open-source LLMs using Enterprise HR Policies. Please be patient since there is a limit of 16GB RAM :)"
# )
# interface.launch()
import gradio as gr
import faiss
import json
import numpy as np
import torch
import gc
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
# Sentence-embedding model used to turn each incoming question into a query
# vector for the FAISS search in compare_hr_bots.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Pre-built FAISS index, read from the working directory.
# NOTE(review): presumably built with the same embedding model as above - confirm.
index = faiss.read_index("faiss_index.bin")

# Chunk records addressed by the row number returned from the FAISS search.
# The rest of this file reads each record's 'text' and 'doc_name' keys.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default encoding.
with open("processed_chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

# Display name -> Hugging Face model id.
# Insertion order matters: compare_hr_bots runs the models in this order and
# the UI output boxes are wired to its results positionally.
MODELS = {
    "IBM Granite 3.1 2B": "ibm-granite/granite-3.1-2b-instruct",
    "Microsoft Phi 3.5 Mini": "microsoft/Phi-3.5-mini-instruct",
    "Qwen 2.5 1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
    "SmolLM 1.7B": "HuggingFaceTB/SmolLM-1.7B-Instruct",
}
def ask_specific_model(model_name, prompt):
    """Generate a short, greedy completion for *prompt* with one model.

    A fresh text-generation pipeline is built on CPU for every call and is
    explicitly released before returning, so that models loaded one after
    another do not pile up in memory.

    Args:
        model_name: Key into ``MODELS`` selecting the checkpoint to load.
        prompt: Full prompt text. The caller in this file ends it with an
            ``Answer:`` cue.

    Returns:
        The text the model produced after the prompt, with surrounding
        whitespace stripped.

    Raises:
        KeyError: If ``model_name`` is not a key of ``MODELS``.
    """
    pipe = pipeline("text-generation", model=MODELS[model_name], device_map="cpu")
    try:
        res = pipe(prompt, max_new_tokens=60, do_sample=False)
        generated = res[0]['generated_text']
    finally:
        # Drop the model weights right away; the next call loads another model.
        del pipe
        gc.collect()

    # By default the pipeline returns prompt + completion. Cutting off the
    # known prompt prefix is exact, whereas splitting on "Answer:" picks the
    # wrong segment whenever the completion (or the context/question inside
    # the prompt) contains another "Answer:".
    if generated.startswith(prompt):
        return generated[len(prompt):].strip()
    # Fallback for outputs that do not echo the prompt verbatim.
    return generated.split("Answer:")[-1].strip()
def compare_hr_bots(question):
    """Stream the answers of every configured model for one HR question.

    The question is embedded and the single nearest chunk is fetched from the
    FAISS index. If the best distance is above 1.5 the question is treated as
    out of scope. Otherwise each model in ``MODELS`` is run in turn and the UI
    state is re-yielded after every status change.

    Args:
        question: The user's question text.

    Yields:
        5-tuples of strings: the four per-model slots (placeholder, progress
        marker, or final answer) followed by the source document name
        (``"N/A"`` when out of scope).
    """
    embedding = np.array(embed_model.encode([question])).astype('float32')
    distances, indices = index.search(embedding, k=1)

    best_distance = distances[0][0]
    if best_distance > 1.5:
        yield ("Out of scope",) * 4 + ("N/A",)
        return

    hit = chunks[indices[0][0]]
    context, source = hit['text'], hit['doc_name']
    prompt = f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:"

    # One slot per output textbox; all start as placeholders.
    results = ["⏳ Waiting..."] * 4

    def current_state():
        # Snapshot of the four answer slots plus the source name.
        return (*results, source)

    # Show the source document before any model has run.
    yield current_state()

    for slot, name in enumerate(tuple(MODELS)):
        results[slot] = "⚙️ Generating..."
        yield current_state()

        results[slot] = ask_specific_model(name, prompt)
        yield current_state()
def calculate_perplexity(model_name):
    """Compute a model's perplexity on a small sample of the HR chunks.

    The text of the first three entries of ``chunks`` is joined with spaces,
    tokenized, and passed through the causal LM with the input ids as labels.
    Perplexity is ``exp`` of the returned loss.

    Args:
        model_name: Key into ``MODELS`` selecting the checkpoint to evaluate.

    Returns:
        A human-readable result string. Any failure (unknown model name,
        load error, no sample text, ...) is reported as an
        ``"Error calculating perplexity: ..."`` string instead of raising,
        so the UI always receives text to display.
    """
    # Pre-bind so the cleanup in ``finally`` is valid on every path.
    model = tokenizer = inputs = outputs = None
    try:
        model_id = MODELS[model_name]
        test_text = " ".join(chunk['text'] for chunk in chunks[:3])
        if not test_text.strip():
            # Without any tokens the loss is undefined; report this clearly
            # rather than surfacing an opaque model error or a NaN score.
            return "Error calculating perplexity: no HR text available to evaluate."

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu")
        inputs = tokenizer(test_text, return_tensors="pt")

        # Inference only: no gradients needed.
        with torch.no_grad():
            outputs = model(input_ids=inputs["input_ids"], labels=inputs["input_ids"])
        perplexity = torch.exp(outputs.loss).item()

        return f"Perplexity Score: {perplexity:.2f}\n\n(Tested on internal HR policies. Lower is better.)"
    except Exception as e:
        # UI boundary: turn any failure into a message for the result box.
        return f"Error calculating perplexity: {str(e)}"
    finally:
        # Release the model on success AND on failure; previously a failed
        # run skipped this cleanup entirely.
        del model, tokenizer, inputs, outputs
        gc.collect()
# Two-tab Gradio app: tab 1 streams the answers of all models side by side,
# tab 2 computes the perplexity of one selected model.
with gr.Blocks(theme=gr.themes.Soft()) as interface:
    gr.Markdown("# ADQ Enterprise HR Knowledge Assistant & Evaluation Toolkit")
    gr.Markdown("Comparing grounding quality across 4 open-source LLMs using Enterprise HR Policies. Please be patient since there is a limit of 16GB RAM :)")

    with gr.Tabs():
        # --- Tab 1: side-by-side RAG comparison ---
        with gr.TabItem("💬 RAG Chatbot (Benchmarking)"):
            question_input = gr.Textbox(
                label="Ask an HR Question",
                placeholder="e.g., How many annual leave days do I get?",
            )
            submit_btn = gr.Button("Compare Models")

            # Two rows of two answer boxes each.
            with gr.Row():
                out_granite = gr.Textbox(label="IBM Granite 3.1 2B")
                out_phi = gr.Textbox(label="Microsoft Phi 3.5 Mini")
            with gr.Row():
                out_qwen = gr.Textbox(label="Alibaba Qwen 2.5 1.5B")
                out_smol = gr.Textbox(label="HuggingFace SmolLM 1.7B")

            out_source = gr.Textbox(label="Source Document Used")

            # Order must match the 5-tuples yielded by compare_hr_bots:
            # four model answers (in MODELS order), then the source name.
            benchmark_outputs = [out_granite, out_phi, out_qwen, out_smol, out_source]
            submit_btn.click(
                fn=compare_hr_bots,
                inputs=question_input,
                outputs=benchmark_outputs,
            )

        # --- Tab 2: single-model perplexity check ---
        with gr.TabItem("📊 Perplexity Evaluator"):
            gr.Markdown("Select a single model to calculate its perplexity against our internal HR dataset. **Warning: Takes 30-60 seconds on CPU.**")
            model_dropdown = gr.Dropdown(
                choices=list(MODELS.keys()),
                label="Select Model to Evaluate",
                value="IBM Granite 3.1 2B",
            )
            eval_btn = gr.Button("Calculate Perplexity")
            eval_output = gr.Textbox(label="Evaluation Result")

            eval_btn.click(
                fn=calculate_perplexity,
                inputs=model_dropdown,
                outputs=eval_output,
            )

# Start the web server once the layout is fully defined.
interface.launch()