Spaces:

jayjay-12345
/

CODS641_Enterprise_FAQ_Bot

Sleeping

App Files Files Community

CODS641_Enterprise_FAQ_Bot / app.py

jayjay-12345

Update app.py

f1b302b verified 16 days ago

raw

history blame contribute delete

12.3 kB

	# import gradio as gr
	# import faiss
	# import json
	# import numpy as np
	# from sentence_transformers import SentenceTransformer
	# from transformers import pipeline

	# embed_model = SentenceTransformer('all-MiniLM-L6-v2')
	# index = faiss.read_index("faiss_index.bin")
	# with open("processed_chunks.json", "r") as f:
	# chunks = json.load(f)


	# #weights load automatically to the Space
	# pipe = pipeline("text-generation", model="ibm-granite/granite-3.1-2b-instruct")
	# # pipe = pipeline("text-generation", model="ibm-granite/granite-3.3-8b-instruct")


	# def ask_hr_bot(question):
	# # Retrieval
	# query_vec = embed_model.encode([question])
	# distances, indices = index.search(np.array(query_vec).astype('float32'), k=1)

	# # out-of-scope questions
	# if distances[0][0] > 1.5:
	# return "I could not find information on this in the HR documents.", "N/A"

	# # Context Retrieval
	# relevant_chunk = chunks[indices[0][0]]
	# context = relevant_chunk['text']
	# source = relevant_chunk['doc_name']

	# # Grounded Generation
	# prompt = f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:"
	# res = pipe(prompt, max_new_tokens=60, do_sample=False)

	# answer = res[0]['generated_text'].split("Answer:")[-1].strip()
	# return answer, source


	# interface = gr.Interface(
	# fn=ask_hr_bot,
	# inputs=gr.Textbox(label="Ask an HR Question"),
	# outputs=[gr.Textbox(label="Bot Answer"), gr.Textbox(label="Source Used")],
	# title="HR Knowledge Assistant",
	# description="Enterprise RAG Prototype using IBM Granite Instruct Family."
	# )

	# interface.launch()

	# import gradio as gr
	# import faiss
	# import json
	# import numpy as np
	# import torch
	# from sentence_transformers import SentenceTransformer
	# from transformers import pipeline

	# # 1. Load Retrieval Logic
	# embed_model = SentenceTransformer('all-MiniLM-L6-v2')
	# index = faiss.read_index("faiss_index.bin")
	# with open("processed_chunks.json", "r") as f:
	# chunks = json.load(f)

	# # (Using Path 1: Ungated models to avoid the 401 Unauthorized Error)
	# MODELS = {
	# "Granite 3.1 2B": "ibm-granite/granite-3.1-2b-instruct",
	# "Qwen 2.5 1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
	# "SmolLM 1.7B": "HuggingFaceTB/SmolLM-1.7B-Instruct",
	# "Phi 3.5 Mini": "microsoft/Phi-3.5-mini-instruct",
	# }

	# def ask_specific_model(model_name, prompt):
	# # Load model, generate, and delete to save RAM
	# pipe = pipeline("text-generation", model=MODELS[model_name], device_map="cpu")
	# res = pipe(prompt, max_new_tokens=60, do_sample=False)
	# return res[0]['generated_text'].split("Answer:")[-1].strip()

	# def compare_hr_bots(question):
	# # 1. Retrieval
	# query_vec = embed_model.encode([question])
	# distances, indices = index.search(np.array(query_vec).astype('float32'), k=1)

	# # Fallback Rule
	# if distances[0][0] > 1.5:
	# # Yield the fallback immediately
	# yield "Out of scope", "Out of scope", "Out of scope", "Out of scope", "N/A"
	# return

	# # 2. Context Setup
	# relevant_chunk = chunks[indices[0][0]]
	# context = relevant_chunk['text']
	# source = relevant_chunk['doc_name']
	# prompt = f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:"

	# # 3. Setup progressive output array
	# # This acts as our visual placeholders
	# results = ["⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting..."]

	# # Yield the initial state so the user sees the source document immediately
	# yield results[0], results[1], results[2], results[3], source

	# # 4. Sequential Generation with Yield
	# model_names = list(MODELS.keys())
	# for i, name in enumerate(model_names):
	# # Update the current box to show it is actively generating
	# results[i] = "⚙️ Generating..."
	# yield results[0], results[1], results[2], results[3], source

	# # Run the model
	# ans = ask_specific_model(name, prompt)

	# # Save the answer and yield the updated UI
	# results[i] = ans
	# yield results[0], results[1], results[2], results[3], source

	# # 5. Multi-Output Interface
	# interface = gr.Interface(
	# fn=compare_hr_bots,
	# inputs=gr.Textbox(label="Ask an HR Question"),
	# outputs=[
	# gr.Textbox(label="IBM Granite 3.1"),
	# gr.Textbox(label="Alibaba Qwen 2.5 1.5B"),
	# gr.Textbox(label="HuggingFaceTB SmolLM 1.7B"),
	# gr.Textbox(label="Microsoft Phi 3.5"),
	# gr.Textbox(label="Source Used")
	# ],
	# title="RAG Model Benchmarking",
	# description="Sequential model comparison. Answers yield progressively to manage CPU RAM limits."
	# )

	# interface.launch()










	# import gradio as gr
	# import faiss
	# import json
	# import numpy as np
	# import torch
	# from sentence_transformers import SentenceTransformer
	# from transformers import pipeline

	# embed_model = SentenceTransformer('all-MiniLM-L6-v2')
	# index = faiss.read_index("faiss_index.bin")
	# with open("processed_chunks.json", "r") as f:
	# chunks = json.load(f)

	# MODELS = {
	# "Granite 3.1 2B": "ibm-granite/granite-3.1-2b-instruct",
	# "Qwen 2.5 1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
	# "SmolLM 1.7B": "HuggingFaceTB/SmolLM-1.7B-Instruct",
	# "Phi 3.5 Mini": "microsoft/Phi-3.5-mini-instruct"
	# }

	# def ask_specific_model(model_name, prompt):
	# pipe = pipeline("text-generation", model=MODELS[model_name], device_map="cpu")
	# res = pipe(prompt, max_new_tokens=60, do_sample=False)
	# return res[0]['generated_text'].split("Answer:")[-1].strip()

	# def compare_hr_bots(question):
	# query_vec = embed_model.encode([question])
	# distances, indices = index.search(np.array(query_vec).astype('float32'), k=1)

	# if distances[0][0] > 1.5:
	# yield "Out of scope", "Out of scope", "Out of scope", "Out of scope", "N/A"
	# return

	# relevant_chunk = chunks[indices[0][0]]
	# context = relevant_chunk['text']
	# source = relevant_chunk['doc_name']
	# prompt = f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:"

	# results = ["⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting..."]
	# yield results[0], results[1], results[2], results[3], source

	# # Sequential Generation
	# model_names = list(MODELS.keys())
	# for i, name in enumerate(model_names):
	# results[i] = "⚙️ Generating..."
	# yield results[0], results[1], results[2], results[3], source
	# ans = ask_specific_model(name, prompt)
	# results[i] = ans
	# yield results[0], results[1], results[2], results[3], source

	# interface = gr.Interface(
	# fn=compare_hr_bots,
	# inputs=gr.Textbox(label="Ask an HR Question", placeholder="e.g., How many annual leave days do I get?"),
	# outputs=[
	# gr.Textbox(label="IBM Granite 3.1 2B"),
	# gr.Textbox(label="Qwen 2.5 1.5B"),
	# gr.Textbox(label="SmolLM 1.7B"),
	# gr.Textbox(label="Microsoft Phi 3.5 Mini"),
	# gr.Textbox(label="Source Used")
	# ],
	# title="ADU Enterprise HR Knowledge Assistant: Model Benchmarking",
	# description="Comparing grounding quality across 4 open-source LLMs using Enterprise HR Policies. Please be patient since there is a limit of 16GB RAM :)"
	# )

	# interface.launch()


	import gradio as gr
	import faiss
	import json
	import numpy as np
	import torch
	import gc
	from sentence_transformers import SentenceTransformer
	from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer


	# Sentence-embedding model; compare_hr_bots uses it to encode the question
	# into the query vector that is searched against the FAISS index.
	embed_model = SentenceTransformer('all-MiniLM-L6-v2')

	# Pre-built FAISS index loaded from disk. The row number it returns is used
	# directly as a position into `chunks`.
	index = faiss.read_index("faiss_index.bin")

	# Chunk records; each one is read via its 'text' and 'doc_name' keys.
	# The encoding is stated explicitly: JSON files are UTF-8, while open()
	# would otherwise decode with the platform's locale encoding.
	with open("processed_chunks.json", "r", encoding="utf-8") as f:
	    chunks = json.load(f)


	# Display name -> Hugging Face model id.
	# Insertion order is significant: compare_hr_bots walks this mapping in
	# order and writes the i-th answer into the i-th output box of the UI.
	MODELS = {
	    "IBM Granite 3.1 2B": "ibm-granite/granite-3.1-2b-instruct",
	    "Microsoft Phi 3.5 Mini": "microsoft/Phi-3.5-mini-instruct",
	    "Qwen 2.5 1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
	    "SmolLM 1.7B": "HuggingFaceTB/SmolLM-1.7B-Instruct",
	}


	def ask_specific_model(model_name, prompt):
	    """Generate an answer to `prompt` with one of the configured models.

	    A fresh text-generation pipeline is built on CPU for every call and
	    released again before returning, so that only one model has to be
	    held in memory at a time.

	    Args:
	        model_name: Key into MODELS selecting the model to load.
	        prompt: Full prompt text; callers in this file end it with "Answer:".

	    Returns:
	        The generated answer with surrounding whitespace stripped.

	    Raises:
	        KeyError: If `model_name` is not a key of MODELS.
	    """
	    pipe = pipeline("text-generation", model=MODELS[model_name], device_map="cpu")
	    try:
	        res = pipe(prompt, max_new_tokens=60, do_sample=False)
	        generated = res[0]['generated_text']
	        if generated.startswith(prompt):
	            # The output echoes the prompt: cut exactly that prefix so a
	            # completion that itself contains "Answer:" is kept in full.
	            answer = generated[len(prompt):]
	        else:
	            # No echoed prompt to cut; keep the original marker-based split.
	            answer = generated.split("Answer:")[-1]
	        return answer.strip()
	    finally:
	        # Drop the pipeline and collect right away instead of waiting for a
	        # later GC pass, mirroring the cleanup done in calculate_perplexity.
	        del pipe
	        gc.collect()

	def compare_hr_bots(question):
	    """Stream grounded answers from every configured model for one question.

	    Generator intended as a Gradio event handler. Every yielded value is a
	    5-tuple: four answer-box strings (in MODELS order) followed by the name
	    of the source document.

	    Flow:
	      1. A blank question yields a single "Please enter a question." row.
	      2. The question is embedded and the single nearest chunk is looked up
	         in the FAISS index; if its distance is above 1.5 a single
	         "Out of scope" row is yielded.
	      3. Otherwise placeholders are yielded, then each model is run in turn
	         and the row is re-yielded before and after each generation.

	    Args:
	        question: The user's question text (may be empty or None).

	    Yields:
	        tuple[str, str, str, str, str]: current contents of the four answer
	        boxes and the source box.
	    """
	    # Nothing to retrieve for a blank question; skip embedding and search.
	    if not question or not question.strip():
	        notice = "Please enter a question."
	        yield notice, notice, notice, notice, "N/A"
	        return

	    query_vec = embed_model.encode([question])
	    distances, indices = index.search(np.array(query_vec).astype('float32'), k=1)

	    # Best match is too far away: report out-of-scope instead of generating.
	    if distances[0][0] > 1.5:
	        yield "Out of scope", "Out of scope", "Out of scope", "Out of scope", "N/A"
	        return

	    relevant_chunk = chunks[indices[0][0]]
	    context = relevant_chunk['text']
	    source = relevant_chunk['doc_name']
	    prompt = f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:"

	    # First row shows placeholders plus the source document straight away.
	    results = ["⏳ Waiting..."] * 4
	    yield (*results, source)

	    for i, name in enumerate(MODELS):
	        results[i] = "⚙️ Generating..."
	        yield (*results, source)
	        try:
	            results[i] = ask_specific_model(name, prompt)
	        except Exception as e:
	            # Handler boundary: one failing model must not abort the stream
	            # and leave the remaining boxes stuck on "Waiting...".
	            results[i] = f"Error generating answer: {e}"
	        yield (*results, source)



	def _compute_perplexity(model_id, text):
	    """Load `model_id` on CPU and return exp(loss) of the model on `text`.

	    The loss is the one the model reports when the tokenized text is passed
	    as both input and labels. All heavy objects are locals of this helper,
	    so they become unreachable as soon as it returns or raises.
	    """
	    tokenizer = AutoTokenizer.from_pretrained(model_id)
	    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu")
	    inputs = tokenizer(text, return_tensors="pt")
	    with torch.no_grad():
	        outputs = model(input_ids=inputs["input_ids"], labels=inputs["input_ids"])
	    return torch.exp(outputs.loss).item()


	def calculate_perplexity(model_name):
	    """Report the perplexity of one configured model on sample HR text.

	    The sample is the 'text' of the first three entries of `chunks`, joined
	    with single spaces.

	    Args:
	        model_name: Key into MODELS selecting the model to evaluate.

	    Returns:
	        A human-readable result string. Failures are never raised to the
	        caller; they are returned as "Error calculating perplexity: ...".
	    """
	    try:
	        model_id = MODELS[model_name]
	        test_text = " ".join(chunk['text'] for chunk in chunks[:3])

	        # Without any text there is nothing to score; say so explicitly
	        # rather than passing an empty sequence to the model.
	        if not test_text.strip():
	            return "Error calculating perplexity: no sample text available."

	        perplexity = _compute_perplexity(model_id, test_text)
	        return f"Perplexity Score: {perplexity:.2f}\n\n(Tested on internal HR policies. Lower is better.)"

	    except Exception as e:
	        # UI boundary: show the problem in the result box.
	        return f"Error calculating perplexity: {str(e)}"

	    finally:
	        # Runs on success and on failure, so a model that failed part-way
	        # through is reclaimed as promptly as one that finished.
	        gc.collect()



	# Two-tab UI: tab 1 streams the side-by-side RAG answers, tab 2 runs the
	# perplexity check for a single model chosen from a dropdown.
	with gr.Blocks(theme=gr.themes.Soft()) as interface:
	    gr.Markdown("# ADQ Enterprise HR Knowledge Assistant & Evaluation Toolkit")
	    gr.Markdown(
	        "Comparing grounding quality across 4 open-source LLMs using Enterprise HR Policies. Please be patient since there is a limit of 16GB RAM :)"
	    )

	    with gr.Tabs():
	        # ---- Tab 1: model comparison --------------------------------------
	        with gr.TabItem("💬 RAG Chatbot (Benchmarking)"):
	            question_input = gr.Textbox(
	                label="Ask an HR Question",
	                placeholder="e.g., How many annual leave days do I get?",
	            )
	            submit_btn = gr.Button("Compare Models")

	            # Answer boxes in a 2x2 grid; the order they are passed to
	            # click() below follows the MODELS order.
	            with gr.Row():
	                out_granite = gr.Textbox(label="IBM Granite 3.1 2B")
	                out_phi = gr.Textbox(label="Microsoft Phi 3.5 Mini")
	            with gr.Row():
	                out_qwen = gr.Textbox(label="Alibaba Qwen 2.5 1.5B")
	                out_smol = gr.Textbox(label="HuggingFace SmolLM 1.7B")
	            out_source = gr.Textbox(label="Source Document Used")

	            # compare_hr_bots is a generator: each 5-tuple it yields
	            # refreshes these five components.
	            submit_btn.click(
	                fn=compare_hr_bots,
	                inputs=question_input,
	                outputs=[out_granite, out_phi, out_qwen, out_smol, out_source],
	            )

	        # ---- Tab 2: perplexity evaluation ---------------------------------
	        with gr.TabItem("📊 Perplexity Evaluator"):
	            gr.Markdown(
	                "Select a single model to calculate its perplexity against our internal HR dataset. Warning: Takes 30-60 seconds on CPU."
	            )

	            model_dropdown = gr.Dropdown(
	                choices=list(MODELS.keys()),
	                label="Select Model to Evaluate",
	                value="IBM Granite 3.1 2B",
	            )
	            eval_btn = gr.Button("Calculate Perplexity")
	            eval_output = gr.Textbox(label="Evaluation Result")

	            eval_btn.click(
	                fn=calculate_perplexity,
	                inputs=model_dropdown,
	                outputs=eval_output,
	            )

	# Start the web app once the layout has been declared.
	interface.launch()