heerjtdev commited on
Commit
2307277
·
verified ·
1 Parent(s): 0e4e76a

Update app.py

Browse files

extensive evaluations

Files changed (1) hide show
  1. app.py +173 -51
app.py CHANGED
@@ -1,3 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import json
3
  import re
@@ -12,83 +124,93 @@ from sentence_transformers import SentenceTransformer, util
12
  GEMINI_API_KEY = "<REDACTED — leaked key; rotate it and load from an environment variable>"
13
  genai.configure(api_key=GEMINI_API_KEY)
14
 
15
- # UPDATED: Use a supported 2026 model
16
  MODEL = genai.GenerativeModel("gemini-2.5-flash")
17
 
18
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
19
  EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
20
  SIM_THRESHOLD = 0.55
21
 
22
- print("Loading embedding model...")
23
  embedder = SentenceTransformer(EMBED_MODEL, device=DEVICE)
24
  print("✅ Ready")
25
 
26
  # ============================================================
27
- # LOGIC
28
  # ============================================================
29
- def get_evaluation_data(kb, question):
30
- """Gets both intent and rubric in one single API request."""
31
  prompt = f"""
32
- Acting as an examiner, analyze the Knowledge Base (KB) and Question.
33
- 1. Determine the intent (FACTUAL, EXPLANATORY, PROCESS, or COMPARISON).
34
- 2. Create a rubric of 3-5 atomic grading criteria based ONLY on the KB.
35
 
36
- KB: {kb}
 
 
 
 
 
 
 
 
 
 
37
  Question: {question}
 
38
 
39
- OUTPUT JSON ONLY:
40
  {{
41
- "intent": "LABEL",
42
- "criteria": ["criterion 1", "criterion 2"]
 
 
 
 
 
 
43
  }}
44
  """
45
  try:
46
  response = MODEL.generate_content(prompt)
47
- # Handle cases where model might wrap JSON in backticks
48
  clean_text = re.sub(r'```json|```', '', response.text).strip()
49
  return json.loads(clean_text)
50
  except Exception as e:
51
- print(f"API Error: {e}")
52
- return {"intent": "ERROR", "criteria": []}
53
 
54
- def evaluate(answer, question, kb):
55
- # API Call
56
- data = get_evaluation_data(kb, question)
57
- rubric = data.get("criteria", [])
58
 
59
- if not rubric:
60
- return {"error": "Could not generate rubric. Check API status."}
61
 
62
- # Semantic Matching (Local)
 
63
  sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', answer) if len(s.strip()) > 5]
64
- if not sents:
65
- return {"error": "Answer is too short to evaluate."}
66
-
67
- ans_emb = embedder.encode(sents, convert_to_tensor=True)
68
- results = []
69
- for crit in rubric:
70
- crit_emb = embedder.encode(crit, convert_to_tensor=True)
71
- sims = util.cos_sim(crit_emb, ans_emb)[0]
72
- score = float(torch.max(sims)) if sims.numel() else 0.0
73
- results.append({"criterion": crit, "satisfied": score >= SIM_THRESHOLD})
74
-
75
- # Verdict
76
- hits = sum(r["satisfied"] for r in results)
77
- verdict = "✅ CORRECT" if hits == len(results) else "⚠️ PARTIAL" if hits > 0 else "❌ INCORRECT"
78
-
79
- return {
80
- "intent": data.get("intent"),
81
- "rubric_results": results,
82
- "final_verdict": verdict
83
- }
84
-
85
- # UI
86
- with gr.Blocks() as demo:
87
- gr.Markdown("## 🧠 Gemini 2.5 Answer Grader")
88
- kb_input = gr.Textbox(label="Knowledge Base", lines=5)
89
- q_input = gr.Textbox(label="Question")
90
- a_input = gr.Textbox(label="Student Answer", lines=4)
91
- out = gr.JSON(label="Evaluation Result")
92
- gr.Button("Evaluate").click(evaluate, [a_input, q_input, kb_input], out)
93
 
94
  demo.launch()
 
1
+ # import os
2
+ # import json
3
+ # import re
4
+ # import torch
5
+ # import gradio as gr
6
+ # import google.generativeai as genai
7
+ # from sentence_transformers import SentenceTransformer, util
8
+
9
+ # # ============================================================
10
+ # # CONFIG
11
+ # # ============================================================
12
+ # GEMINI_API_KEY = "<REDACTED — leaked key; rotate it and load from an environment variable>"
13
+ # genai.configure(api_key=GEMINI_API_KEY)
14
+
15
+ # # UPDATED: Use a supported 2026 model
16
+ # MODEL = genai.GenerativeModel("gemini-2.5-flash")
17
+
18
+ # DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
19
+ # EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
20
+ # SIM_THRESHOLD = 0.55
21
+
22
+ # print("Loading embedding model...")
23
+ # embedder = SentenceTransformer(EMBED_MODEL, device=DEVICE)
24
+ # print("✅ Ready")
25
+
26
+ # # ============================================================
27
+ # # LOGIC
28
+ # # ============================================================
29
+ # def get_evaluation_data(kb, question):
30
+ # """Gets both intent and rubric in one single API request."""
31
+ # prompt = f"""
32
+ # Acting as an examiner, analyze the Knowledge Base (KB) and Question.
33
+ # 1. Determine the intent (FACTUAL, EXPLANATORY, PROCESS, or COMPARISON).
34
+ # 2. Create a rubric of 3-5 atomic grading criteria based ONLY on the KB.
35
+
36
+ # KB: {kb}
37
+ # Question: {question}
38
+
39
+ # OUTPUT JSON ONLY:
40
+ # {{
41
+ # "intent": "LABEL",
42
+ # "criteria": ["criterion 1", "criterion 2"]
43
+ # }}
44
+ # """
45
+ # try:
46
+ # response = MODEL.generate_content(prompt)
47
+ # # Handle cases where model might wrap JSON in backticks
48
+ # clean_text = re.sub(r'```json|```', '', response.text).strip()
49
+ # return json.loads(clean_text)
50
+ # except Exception as e:
51
+ # print(f"API Error: {e}")
52
+ # return {"intent": "ERROR", "criteria": []}
53
+
54
+ # def evaluate(answer, question, kb):
55
+ # # API Call
56
+ # data = get_evaluation_data(kb, question)
57
+ # rubric = data.get("criteria", [])
58
+
59
+ # if not rubric:
60
+ # return {"error": "Could not generate rubric. Check API status."}
61
+
62
+ # # Semantic Matching (Local)
63
+ # sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', answer) if len(s.strip()) > 5]
64
+ # if not sents:
65
+ # return {"error": "Answer is too short to evaluate."}
66
+
67
+ # ans_emb = embedder.encode(sents, convert_to_tensor=True)
68
+ # results = []
69
+ # for crit in rubric:
70
+ # crit_emb = embedder.encode(crit, convert_to_tensor=True)
71
+ # sims = util.cos_sim(crit_emb, ans_emb)[0]
72
+ # score = float(torch.max(sims)) if sims.numel() else 0.0
73
+ # results.append({"criterion": crit, "satisfied": score >= SIM_THRESHOLD})
74
+
75
+ # # Verdict
76
+ # hits = sum(r["satisfied"] for r in results)
77
+ # verdict = "✅ CORRECT" if hits == len(results) else "⚠️ PARTIAL" if hits > 0 else "❌ INCORRECT"
78
+
79
+ # return {
80
+ # "intent": data.get("intent"),
81
+ # "rubric_results": results,
82
+ # "final_verdict": verdict
83
+ # }
84
+
85
+ # # UI
86
+ # with gr.Blocks() as demo:
87
+ # gr.Markdown("## 🧠 Gemini 2.5 Answer Grader")
88
+ # kb_input = gr.Textbox(label="Knowledge Base", lines=5)
89
+ # q_input = gr.Textbox(label="Question")
90
+ # a_input = gr.Textbox(label="Student Answer", lines=4)
91
+ # out = gr.JSON(label="Evaluation Result")
92
+ # gr.Button("Evaluate").click(evaluate, [a_input, q_input, kb_input], out)
93
+
94
+ # demo.launch()
95
+
96
+
97
+
98
+
99
+
100
+
101
+
102
+
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+
113
  import os
114
  import json
115
  import re
 
124
# ============================================================
# CONFIG
# ============================================================
# SECURITY: never hard-code API keys in source — the previous literal key is
# exposed in the repository history and must be rotated. Load the key from
# the environment (e.g. a Hugging Face Space secret named GEMINI_API_KEY).
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
if not GEMINI_API_KEY:
    print("⚠️ GEMINI_API_KEY is not set — Gemini API calls will fail.")
genai.configure(api_key=GEMINI_API_KEY)

# Gemini model used for the single rubric/grading request.
MODEL = genai.GenerativeModel("gemini-2.5-flash")

# Local embedding model used for the semantic cross-check of the rubric.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Cosine-similarity cutoff retained from the earlier version; the current
# cross-check reports raw similarity instead of thresholding — TODO confirm
# whether this constant should still be applied.
SIM_THRESHOLD = 0.55

print("Loading local embedding auditor...")
embedder = SentenceTransformer(EMBED_MODEL, device=DEVICE)
print("✅ Ready")
136
 
137
# ============================================================
# SOPHISTICATED EVALUATION LOGIC
# ============================================================
def get_advanced_evaluation(kb, question, answer):
    """Ask Gemini for a full grading report in one API request.

    Args:
        kb: Reference material the answer is graded against.
        question: The question being answered.
        answer: The student answer to grade.

    Returns:
        The parsed JSON report (dict with keys 'intent', 'rubric',
        'irrelevant_snippets', 'contradictions', 'suggested_mark',
        'feedback') on success, or {"error": "<message>"} when the API
        call or JSON parsing fails.
    """
    # FIX: the task text previously asked for 'total_score' while the schema
    # below demands 'suggested_mark' — the contradiction is removed so the
    # model receives consistent instructions.
    prompt = f"""
You are a Senior Academic Evaluator. Compare the Answer against the Knowledge Base (KB) for the specific Question.

TASK:
1. Identify 'intent' (e.g., FACTUAL, PROCEDURAL).
2. Create a 'rubric' of 3-5 criteria from the KB.
3. For each criterion:
   - Determine if 'satisfied' (true/false).
   - Provide a 'confidence' score (0-100) based on how clearly the answer matches the KB.
4. Extract 'irrelevant_snippets': Parts of the answer that don't help answer the question.
5. Extract 'contradictions': Parts of the answer that factually conflict with the KB.
6. Suggest a 'suggested_mark' (0-100) and 'feedback'.

Knowledge Base: {kb}
Question: {question}
Student Answer: {answer}

STRICT JSON OUTPUT ONLY:
{{
  "intent": "...",
  "rubric": [
    {{"criterion": "...", "satisfied": true, "confidence": 95}}
  ],
  "irrelevant_snippets": ["...", "..."],
  "contradictions": ["...", "..."],
  "suggested_mark": 85,
  "feedback": "..."
}}
"""
    try:
        response = MODEL.generate_content(prompt)
        # The model sometimes wraps its JSON in ```json fences; strip them
        # before parsing.
        clean_text = re.sub(r'```json|```', '', response.text).strip()
        return json.loads(clean_text)
    except Exception as e:
        # Surface the failure to the caller as data instead of raising
        # into the UI layer.
        return {"error": str(e)}
 
176
 
177
def evaluate(kb, question, answer):
    """Grade `answer` against `kb` for `question` and return a JSON report.

    Performs one Gemini API call, then augments each rubric item with a
    local embedding-based similarity score so the user can cross-check
    the LLM's judgement without extra API cost.
    """
    # Perform the single heavy-duty API call.
    eval_data = get_advanced_evaluation(kb, question, answer)

    # FIX: guard against the model returning a non-dict JSON payload
    # (e.g. a bare list), which would break the dict operations below.
    if not isinstance(eval_data, dict):
        return {"error": f"Unexpected response type: {type(eval_data).__name__}"}
    if "error" in eval_data:
        return eval_data

    # --- Local Semantic Cross-Check (local, no API cost) ---
    # This helps catch if Gemini was "too nice" or missed a nuance.
    sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', answer) if len(s.strip()) > 5]
    if sents:
        ans_emb = embedder.encode(sents, convert_to_tensor=True)
        for item in eval_data.get("rubric", []):
            # FIX: skip malformed rubric entries instead of raising
            # KeyError when the LLM omits the 'criterion' key.
            if not isinstance(item, dict) or "criterion" not in item:
                continue
            crit_emb = embedder.encode(item["criterion"], convert_to_tensor=True)
            sims = util.cos_sim(crit_emb, ans_emb)[0]
            max_sim = float(torch.max(sims)) if sims.numel() else 0.0
            # We add this 'local_check' to the JSON so the user can compare
            # it against the LLM's own 'satisfied'/'confidence' verdicts.
            item["local_semantic_similarity"] = round(max_sim * 100, 1)

    return eval_data
197
+
198
# ============================================================
# IMPROVED UI
# ============================================================
# Two-column Gradio layout: grading inputs (KB, question, answer) on the
# left, the raw JSON grading report on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎓 Advanced AI Grading System (Gemini 2.5)")

    with gr.Row():
        with gr.Column():
            kb_input = gr.Textbox(label="1. Reference Material (KB)", lines=8, placeholder="Paste the factual source here...")
            q_input = gr.Textbox(label="2. Question", placeholder="What are you asking?")
            a_input = gr.Textbox(label="3. Student Answer", lines=8, placeholder="Paste the answer to grade...")
            btn = gr.Button("🔍 Run Deep Analysis", variant="primary")

        with gr.Column():
            out = gr.JSON(label="Grading Report & Forensic Analysis")

    # Wire the button to evaluate(kb, question, answer) -> JSON report.
    btn.click(evaluate, [kb_input, q_input, a_input], out)

# Start the Gradio server (blocking call).
demo.launch()