Spaces:

jayjay-12345
/

CODS641_Enterprise_FAQ_Bot

Sleeping

App Files Community

jayjay-12345 committed on 23 days ago

Commit

777df97

verified ·

1 Parent(s): 3233665

Update app.py

Browse files

Files changed (1) hide show

app.py +157 -18

app.py CHANGED Viewed

@@ -131,26 +131,106 @@
 # interface.launch()
 import gradio as gr
 import faiss
 import json
 import numpy as np
 import torch
 from sentence_transformers import SentenceTransformer
-from transformers import pipeline
 embed_model = SentenceTransformer('all-MiniLM-L6-v2')
 index = faiss.read_index("faiss_index.bin")
 with open("processed_chunks.json", "r") as f:
     chunks = json.load(f)
 MODELS = {
-    "Granite 3.1 2B": "ibm-granite/granite-3.1-2b-instruct",
     "Qwen 2.5 1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
-    "SmolLM 1.7B": "HuggingFaceTB/SmolLM-1.7B-Instruct",
-     "Phi 3.5 Mini": "microsoft/Phi-3.5-mini-instruct"
 }
 def ask_specific_model(model_name, prompt):
     pipe = pipeline("text-generation", model=MODELS[model_name], device_map="cpu")
     res = pipe(prompt, max_new_tokens=60, do_sample=False)
@@ -172,7 +252,6 @@ def compare_hr_bots(question):
     results = ["⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting..."]
     yield results[0], results[1], results[2], results[3], source
-    # Sequential Generation
     model_names = list(MODELS.keys())
     for i, name in enumerate(model_names):
         results[i] = "⚙️ Generating..."
@@ -181,18 +260,78 @@ def compare_hr_bots(question):
         results[i] = ans
         yield results[0], results[1], results[2], results[3], source
-interface = gr.Interface(
-    fn=compare_hr_bots,
-    inputs=gr.Textbox(label="Ask an HR Question", placeholder="e.g., How many annual leave days do I get?"),
-    outputs=[
-        gr.Textbox(label="IBM Granite 3.1 2B"),
-        gr.Textbox(label="Qwen 2.5 1.5B"),
-        gr.Textbox(label="SmolLM 1.7B"),
-        gr.Textbox(label="Microsoft Phi 3.5 Mini"),
-        gr.Textbox(label="Source Used")
-    ],
-    title="ADU Enterprise HR Knowledge Assistant: Model Benchmarking",
-    description="Comparing grounding quality across 4 open-source LLMs using Enterprise HR Policies. Please be patient since there is a limit of 16GB RAM :)"
-)
 interface.launch()

 # interface.launch()
+# import gradio as gr
+# import faiss
+# import json
+# import numpy as np
+# import torch
+# from sentence_transformers import SentenceTransformer
+# from transformers import pipeline
+# embed_model = SentenceTransformer('all-MiniLM-L6-v2')
+# index = faiss.read_index("faiss_index.bin")
+# with open("processed_chunks.json", "r") as f:
+#     chunks = json.load(f)
+# MODELS = {
+#     "Granite 3.1 2B": "ibm-granite/granite-3.1-2b-instruct",
+#     "Qwen 2.5 1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
+#     "SmolLM 1.7B": "HuggingFaceTB/SmolLM-1.7B-Instruct",
+#      "Phi 3.5 Mini": "microsoft/Phi-3.5-mini-instruct"
+# }
+# def ask_specific_model(model_name, prompt):
+#     pipe = pipeline("text-generation", model=MODELS[model_name], device_map="cpu")
+#     res = pipe(prompt, max_new_tokens=60, do_sample=False)
+#     return res[0]['generated_text'].split("Answer:")[-1].strip()
+# def compare_hr_bots(question):
+#     query_vec = embed_model.encode([question])
+#     distances, indices = index.search(np.array(query_vec).astype('float32'), k=1)
+#     if distances[0][0] > 1.5:
+#         yield "Out of scope", "Out of scope", "Out of scope", "Out of scope", "N/A"
+#         return
+#     relevant_chunk = chunks[indices[0][0]]
+#     context = relevant_chunk['text']
+#     source = relevant_chunk['doc_name']
+#     prompt = f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:"
+#     results = ["⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting..."]
+#     yield results[0], results[1], results[2], results[3], source
+#     # Sequential Generation
+#     model_names = list(MODELS.keys())
+#     for i, name in enumerate(model_names):
+#         results[i] = "⚙️ Generating..."
+#         yield results[0], results[1], results[2], results[3], source
+#         ans = ask_specific_model(name, prompt)
+#         results[i] = ans
+#         yield results[0], results[1], results[2], results[3], source
+# interface = gr.Interface(
+#     fn=compare_hr_bots,
+#     inputs=gr.Textbox(label="Ask an HR Question", placeholder="e.g., How many annual leave days do I get?"),
+#     outputs=[
+#         gr.Textbox(label="IBM Granite 3.1 2B"),
+#         gr.Textbox(label="Qwen 2.5 1.5B"),
+#         gr.Textbox(label="SmolLM 1.7B"),
+#         gr.Textbox(label="Microsoft Phi 3.5 Mini"),
+#         gr.Textbox(label="Source Used")
+#     ],
+#     title="ADU Enterprise HR Knowledge Assistant: Model Benchmarking",
+#     description="Comparing grounding quality across 4 open-source LLMs using Enterprise HR Policies. Please be patient since there is a limit of 16GB RAM :)"
+# )
+# interface.launch()
 import gradio as gr
 import faiss
 import json
 import numpy as np
 import torch
+import gc
 from sentence_transformers import SentenceTransformer
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
+# 1. Load RAG Memory
 embed_model = SentenceTransformer('all-MiniLM-L6-v2')
 index = faiss.read_index("faiss_index.bin")
 with open("processed_chunks.json", "r") as f:
     chunks = json.load(f)
+# 2. Define Models
 MODELS = {
+    "IBM Granite 3.1 2B": "ibm-granite/granite-3.1-2b-instruct",
+    "Microsoft Phi 3.5 Mini": "microsoft/Phi-3.5-mini-instruct",
     "Qwen 2.5 1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
+    "SmolLM 1.7B": "HuggingFaceTB/SmolLM-1.7B-Instruct"
 }
+# --- TAB 1: RAG CHATBOT LOGIC ---
 def ask_specific_model(model_name, prompt):
     pipe = pipeline("text-generation", model=MODELS[model_name], device_map="cpu")
     res = pipe(prompt, max_new_tokens=60, do_sample=False)
     results = ["⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting..."]
     yield results[0], results[1], results[2], results[3], source
     model_names = list(MODELS.keys())
     for i, name in enumerate(model_names):
         results[i] = "⚙️ Generating..."
         results[i] = ans
         yield results[0], results[1], results[2], results[3], source
+# --- TAB 2: PERPLEXITY EVALUATION LOGIC ---
+def calculate_perplexity(model_name):
+    try:
+        model_id = MODELS[model_name]
+        # Grab a chunk of our actual HR data to test on
+        sample_texts = [chunk['text'] for chunk in chunks[:3]]
+        test_text = " ".join(sample_texts)
+        # Load Model
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu")
+        # Tokenize and compute loss
+        inputs = tokenizer(test_text, return_tensors="pt")
+        with torch.no_grad():
+            outputs = model(input_ids=inputs["input_ids"], labels=inputs["input_ids"])
+            loss = outputs.loss
+        perplexity = torch.exp(loss).item()
+        # STRICT MEMORY CLEANUP
+        del model
+        del tokenizer
+        del inputs
+        del outputs
+        gc.collect()
+        return f"Perplexity Score: {perplexity:.2f}\n\n(Tested on internal HR policies. Lower is better.)"
+    except Exception as e:
+        return f"Error calculating perplexity: {str(e)}"
+# --- GRADIO UI BUILDER ---
+with gr.Blocks(theme=gr.themes.Soft()) as interface:
+    gr.Markdown("# ADU HR Knowledge Assistant & Evaluation Toolkit")
+    gr.Markdown("Enterprise RAG Prototype using strictly Open-Source LLMs.")
+    with gr.Tabs():
+        # TAB 1 UI
+        with gr.TabItem("💬 RAG Chatbot (Benchmarking)"):
+            question_input = gr.Textbox(label="Ask an HR Question", placeholder="e.g., How many annual leave days do I get?")
+            submit_btn = gr.Button("Compare Models")
+            with gr.Row():
+                out_granite = gr.Textbox(label="IBM Granite 3.1 2B")
+                out_phi = gr.Textbox(label="Microsoft Phi 3.5 Mini")
+            with gr.Row():
+                out_qwen = gr.Textbox(label="Qwen 2.5 1.5B")
+                out_smol = gr.Textbox(label="SmolLM 1.7B")
+            out_source = gr.Textbox(label="Source Document Used")
+            submit_btn.click(
+                fn=compare_hr_bots,
+                inputs=question_input,
+                outputs=[out_granite, out_phi, out_qwen, out_smol, out_source]
+            )
+        # TAB 2 UI
+        with gr.TabItem("📊 Perplexity Evaluator"):
+            gr.Markdown("Select a single model to calculate its perplexity against our internal HR dataset. **Warning: Takes 30-60 seconds on CPU.**")
+            model_dropdown = gr.Dropdown(choices=list(MODELS.keys()), label="Select Model to Evaluate", value="IBM Granite 3.1 2B")
+            eval_btn = gr.Button("Calculate Perplexity")
+            eval_output = gr.Textbox(label="Evaluation Result")
+            eval_btn.click(
+                fn=calculate_perplexity,
+                inputs=model_dropdown,
+                outputs=eval_output
+            )
 interface.launch()