jayjay-12345 committed on
Commit
777df97
·
verified ·
1 Parent(s): 3233665

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +157 -18
app.py CHANGED
@@ -131,26 +131,106 @@
131
 
132
  # interface.launch()
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  import gradio as gr
135
  import faiss
136
  import json
137
  import numpy as np
138
  import torch
 
139
  from sentence_transformers import SentenceTransformer
140
- from transformers import pipeline
141
 
 
142
  embed_model = SentenceTransformer('all-MiniLM-L6-v2')
143
  index = faiss.read_index("faiss_index.bin")
144
  with open("processed_chunks.json", "r") as f:
145
  chunks = json.load(f)
146
 
 
147
  MODELS = {
148
- "Granite 3.1 2B": "ibm-granite/granite-3.1-2b-instruct",
 
149
  "Qwen 2.5 1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
150
- "SmolLM 1.7B": "HuggingFaceTB/SmolLM-1.7B-Instruct",
151
- "Phi 3.5 Mini": "microsoft/Phi-3.5-mini-instruct"
152
  }
153
 
 
154
  def ask_specific_model(model_name, prompt):
155
  pipe = pipeline("text-generation", model=MODELS[model_name], device_map="cpu")
156
  res = pipe(prompt, max_new_tokens=60, do_sample=False)
@@ -172,7 +252,6 @@ def compare_hr_bots(question):
172
  results = ["⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting..."]
173
  yield results[0], results[1], results[2], results[3], source
174
 
175
- # Sequential Generation
176
  model_names = list(MODELS.keys())
177
  for i, name in enumerate(model_names):
178
  results[i] = "⚙️ Generating..."
@@ -181,18 +260,78 @@ def compare_hr_bots(question):
181
  results[i] = ans
182
  yield results[0], results[1], results[2], results[3], source
183
 
184
- interface = gr.Interface(
185
- fn=compare_hr_bots,
186
- inputs=gr.Textbox(label="Ask an HR Question", placeholder="e.g., How many annual leave days do I get?"),
187
- outputs=[
188
- gr.Textbox(label="IBM Granite 3.1 2B"),
189
- gr.Textbox(label="Qwen 2.5 1.5B"),
190
- gr.Textbox(label="SmolLM 1.7B"),
191
- gr.Textbox(label="Microsoft Phi 3.5 Mini"),
192
- gr.Textbox(label="Source Used")
193
- ],
194
- title="ADU Enterprise HR Knowledge Assistant: Model Benchmarking",
195
- description="Comparing grounding quality across 4 open-source LLMs using Enterprise HR Policies. Please be patient since there is a limit of 16GB RAM :)"
196
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
  interface.launch()
 
131
 
132
  # interface.launch()
133
 
134
+
135
+
136
+
137
+
138
+
139
+
140
+
141
+
142
+
143
+ # import gradio as gr
144
+ # import faiss
145
+ # import json
146
+ # import numpy as np
147
+ # import torch
148
+ # from sentence_transformers import SentenceTransformer
149
+ # from transformers import pipeline
150
+
151
+ # embed_model = SentenceTransformer('all-MiniLM-L6-v2')
152
+ # index = faiss.read_index("faiss_index.bin")
153
+ # with open("processed_chunks.json", "r") as f:
154
+ # chunks = json.load(f)
155
+
156
+ # MODELS = {
157
+ # "Granite 3.1 2B": "ibm-granite/granite-3.1-2b-instruct",
158
+ # "Qwen 2.5 1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
159
+ # "SmolLM 1.7B": "HuggingFaceTB/SmolLM-1.7B-Instruct",
160
+ # "Phi 3.5 Mini": "microsoft/Phi-3.5-mini-instruct"
161
+ # }
162
+
163
+ # def ask_specific_model(model_name, prompt):
164
+ # pipe = pipeline("text-generation", model=MODELS[model_name], device_map="cpu")
165
+ # res = pipe(prompt, max_new_tokens=60, do_sample=False)
166
+ # return res[0]['generated_text'].split("Answer:")[-1].strip()
167
+
168
+ # def compare_hr_bots(question):
169
+ # query_vec = embed_model.encode([question])
170
+ # distances, indices = index.search(np.array(query_vec).astype('float32'), k=1)
171
+
172
+ # if distances[0][0] > 1.5:
173
+ # yield "Out of scope", "Out of scope", "Out of scope", "Out of scope", "N/A"
174
+ # return
175
+
176
+ # relevant_chunk = chunks[indices[0][0]]
177
+ # context = relevant_chunk['text']
178
+ # source = relevant_chunk['doc_name']
179
+ # prompt = f"Context: {context}\nQuestion: {question}\nAnswer only from context. Cite source: {source}\nAnswer:"
180
+
181
+ # results = ["⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting..."]
182
+ # yield results[0], results[1], results[2], results[3], source
183
+
184
+ # # Sequential Generation
185
+ # model_names = list(MODELS.keys())
186
+ # for i, name in enumerate(model_names):
187
+ # results[i] = "⚙️ Generating..."
188
+ # yield results[0], results[1], results[2], results[3], source
189
+ # ans = ask_specific_model(name, prompt)
190
+ # results[i] = ans
191
+ # yield results[0], results[1], results[2], results[3], source
192
+
193
+ # interface = gr.Interface(
194
+ # fn=compare_hr_bots,
195
+ # inputs=gr.Textbox(label="Ask an HR Question", placeholder="e.g., How many annual leave days do I get?"),
196
+ # outputs=[
197
+ # gr.Textbox(label="IBM Granite 3.1 2B"),
198
+ # gr.Textbox(label="Qwen 2.5 1.5B"),
199
+ # gr.Textbox(label="SmolLM 1.7B"),
200
+ # gr.Textbox(label="Microsoft Phi 3.5 Mini"),
201
+ # gr.Textbox(label="Source Used")
202
+ # ],
203
+ # title="ADU Enterprise HR Knowledge Assistant: Model Benchmarking",
204
+ # description="Comparing grounding quality across 4 open-source LLMs using Enterprise HR Policies. Please be patient since there is a limit of 16GB RAM :)"
205
+ # )
206
+
207
+ # interface.launch()
208
+
209
+
210
  import gradio as gr
211
  import faiss
212
  import json
213
  import numpy as np
214
  import torch
215
+ import gc
216
  from sentence_transformers import SentenceTransformer
217
+ from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
218
 
219
# 1. Load RAG Memory
# Sentence encoder used to embed incoming questions before the FAISS lookup.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# Pre-built FAISS index; search results are used as positions into `chunks`.
index = faiss.read_index("faiss_index.bin")
# JSON is UTF-8 by specification; be explicit so the read does not depend on
# the host's locale default encoding.
with open("processed_chunks.json", "r", encoding="utf-8") as f:
    # Each entry is read elsewhere in this file via chunk['text'].
    chunks = json.load(f)

# 2. Define Models
# Display name -> Hugging Face model id.
# NOTE: insertion order matters. The chatbot fills its result slots by
# enumerating MODELS.keys(), and the UI wires those slots positionally to the
# Granite / Phi / Qwen / SmolLM textboxes, so keep this order in sync with the
# `outputs=[...]` list of the "Compare Models" button.
MODELS = {
    "IBM Granite 3.1 2B": "ibm-granite/granite-3.1-2b-instruct",
    "Microsoft Phi 3.5 Mini": "microsoft/Phi-3.5-mini-instruct",
    "Qwen 2.5 1.5B": "Qwen/Qwen2.5-1.5B-Instruct",
    "SmolLM 1.7B": "HuggingFaceTB/SmolLM-1.7B-Instruct",
}
232
 
233
+ # --- TAB 1: RAG CHATBOT LOGIC ---
234
  def ask_specific_model(model_name, prompt):
235
  pipe = pipeline("text-generation", model=MODELS[model_name], device_map="cpu")
236
  res = pipe(prompt, max_new_tokens=60, do_sample=False)
 
252
  results = ["⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting...", "⏳ Waiting..."]
253
  yield results[0], results[1], results[2], results[3], source
254
 
 
255
  model_names = list(MODELS.keys())
256
  for i, name in enumerate(model_names):
257
  results[i] = "⚙️ Generating..."
 
260
  results[i] = ans
261
  yield results[0], results[1], results[2], results[3], source
262
 
263
+
264
+ # --- TAB 2: PERPLEXITY EVALUATION LOGIC ---
265
def calculate_perplexity(model_name, *, num_chunks=3, max_length=None):
    """Compute a model's perplexity on a sample of the loaded HR chunks.

    The first ``num_chunks`` entries of the module-level ``chunks`` list are
    joined into one text, tokenized, and fed to the causal LM with the input
    ids also passed as labels. The reported score is ``exp(loss)`` of the
    loss returned by that forward pass.

    Args:
        model_name: Key into the module-level ``MODELS`` mapping (the value
            selected in the UI dropdown).
        num_chunks: How many leading chunks to concatenate as evaluation
            text. Defaults to 3.
        max_length: Optional cap on the number of tokens evaluated. ``None``
            (the default) tokenizes the full text without truncation.

    Returns:
        A human-readable string: either the formatted perplexity score or a
        message starting with ``"Error calculating perplexity: "``. This
        function is a UI callback, so failures are reported as text rather
        than raised.
    """
    # Pre-bind so the `finally` block can release them on every exit path.
    model = tokenizer = inputs = outputs = None
    try:
        model_id = MODELS.get(model_name)
        if model_id is None:
            # E.g. the dropdown was cleared and the callback received None.
            return f"Error calculating perplexity: unknown model {model_name!r}"

        # Grab a chunk of our actual HR data to test on
        sample_texts = [chunk['text'] for chunk in chunks[:num_chunks]]
        test_text = " ".join(sample_texts)
        if not test_text.strip():
            return "Error calculating perplexity: no HR text available to evaluate"

        # Load Model
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu")

        # Tokenize (optionally truncated) and compute loss
        if max_length is None:
            inputs = tokenizer(test_text, return_tensors="pt")
        else:
            inputs = tokenizer(
                test_text,
                return_tensors="pt",
                truncation=True,
                max_length=max_length,
            )

        input_ids = inputs["input_ids"]
        if input_ids.shape[-1] < 2:
            # A next-token loss needs at least one (context, target) pair.
            return "Error calculating perplexity: evaluation text is too short"

        with torch.no_grad():
            outputs = model(input_ids=input_ids, labels=input_ids)

        perplexity = torch.exp(outputs.loss).item()
        return f"Perplexity Score: {perplexity:.2f}\n\n(Tested on internal HR policies. Lower is better.)"

    except Exception as e:
        # UI boundary: surface the failure in the result textbox.
        return f"Error calculating perplexity: {e}"

    finally:
        # STRICT MEMORY CLEANUP - runs on success *and* failure so a model
        # that failed mid-evaluation does not linger in RAM.
        del model, tokenizer, inputs, outputs
        gc.collect()
296
+
297
+
298
+ # --- GRADIO UI BUILDER ---
299
# Two-tab Gradio app: tab 1 streams answers from every model side by side,
# tab 2 evaluates a single model's perplexity on demand.
with gr.Blocks(theme=gr.themes.Soft()) as interface:
    gr.Markdown("# ADU HR Knowledge Assistant & Evaluation Toolkit")
    gr.Markdown("Enterprise RAG Prototype using strictly Open-Source LLMs.")

    with gr.Tabs():
        # ---- Tab 1: model comparison ----
        with gr.TabItem("💬 RAG Chatbot (Benchmarking)"):
            question_input = gr.Textbox(
                label="Ask an HR Question",
                placeholder="e.g., How many annual leave days do I get?",
            )
            submit_btn = gr.Button("Compare Models")

            # Answers are laid out as a 2x2 grid, followed by the source box.
            with gr.Row():
                out_granite = gr.Textbox(label="IBM Granite 3.1 2B")
                out_phi = gr.Textbox(label="Microsoft Phi 3.5 Mini")
            with gr.Row():
                out_qwen = gr.Textbox(label="Qwen 2.5 1.5B")
                out_smol = gr.Textbox(label="SmolLM 1.7B")
            out_source = gr.Textbox(label="Source Document Used")

            # compare_hr_bots yields five values per update, matched
            # positionally to the five components listed here.
            submit_btn.click(
                fn=compare_hr_bots,
                inputs=question_input,
                outputs=[
                    out_granite,
                    out_phi,
                    out_qwen,
                    out_smol,
                    out_source,
                ],
            )

        # ---- Tab 2: perplexity evaluation ----
        with gr.TabItem("📊 Perplexity Evaluator"):
            gr.Markdown("Select a single model to calculate its perplexity against our internal HR dataset. **Warning: Takes 30-60 seconds on CPU.**")

            model_dropdown = gr.Dropdown(
                choices=list(MODELS),
                label="Select Model to Evaluate",
                value="IBM Granite 3.1 2B",
            )
            eval_btn = gr.Button("Calculate Perplexity")
            eval_output = gr.Textbox(label="Evaluation Result")

            eval_btn.click(
                fn=calculate_perplexity,
                inputs=model_dropdown,
                outputs=eval_output,
            )

interface.launch()