Spaces:

heerjtdev
/

try_answer

Running

App Files Community

heerjtdev committed on Feb 4

Commit

6662485

verified ·

1 Parent(s): bdf342a

Update app.py

Browse files

Files changed (1) hide show

app.py +139 -30

app.py CHANGED Viewed

@@ -138,13 +138,12 @@ class OnnxBgeEmbeddings(Embeddings):
 # ---------------------------------------------------------
 # 2. OPTIMIZED LLM (Qwen 2.5 - 0.5B) - STRICT GRADING
 # ---------------------------------------------------------
 class LLMEvaluator:
     def __init__(self):
-        # Qwen 0.5B is great for speed, but needs VERY specific prompts to be strict.
         self.repo_id = "onnx-community/Qwen2.5-0.5B-Instruct"
         self.local_dir = "onnx_qwen_local"
@@ -175,33 +174,42 @@ class LLMEvaluator:
         )
     def evaluate(self, context, question, student_answer, max_marks):
-        # --- STRATEGY: FEW-SHOT PROMPTING & CHAIN OF THOUGHT ---
-        # Small models (0.5B) need examples to understand "Strictness".
-        system_prompt = """You are a strict automated grader. You grade ONLY based on the provided Context.
-        RULES:
-        1. If the Student Answer contains facts NOT found in the Context, Score is 0.
-        2. If the Student Answer contradicts the Context, Score is 0.
-        3. Do not use outside knowledge. If it's not in the text, it's wrong.
-        --- EXAMPLE 1 (WRONG ANSWER) ---
-        Context: The sky is blue because of Rayleigh scattering.
         Question: Why is the sky blue?
-        Student Answer: Because the ocean reflects into it.
-        Analysis: The context mentions Rayleigh scattering. The student mentioned ocean reflection. These do not match.
         Score: 0/{max_marks}
-        --- EXAMPLE 2 (CORRECT ANSWER) ---
         Context: Mitochondria is the powerhouse of the cell.
-        Question: What is the mitochondria?
-        Student Answer: It is the powerhouse of the cell.
-        Analysis: The student answer matches the context text exactly.
         Score: {max_marks}/{max_marks}
         """
         user_prompt = f"""
-        --- NOW GRADE THIS ---
         Context:
         {context}
@@ -211,12 +219,8 @@ class LLMEvaluator:
         Student Answer:
         {student_answer}
-        Task:
-        1. Analyze if the specific keywords in Student Answer exist in Context.
-        2. Assign a Score.
-        Output format:
-        Analysis: [Analysis here]
         Score: [X]/{max_marks}
         """
@@ -228,14 +232,15 @@ class LLMEvaluator:
         input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         inputs = self.tokenizer(input_text, return_tensors="pt")
         with torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
                 max_new_tokens=150,
-                temperature=0.1,    # Low temperature for facts
-                top_p=0.1,          # Reduce creativity
                 do_sample=True,
-                repetition_penalty=1.1
             )
         input_length = inputs['input_ids'].shape[1]
@@ -243,6 +248,110 @@ class LLMEvaluator:
         return response
 # ---------------------------------------------------------
 # 3. Main Application Logic
 # ---------------------------------------------------------

 # ---------------------------------------------------------
 # 2. OPTIMIZED LLM (Qwen 2.5 - 0.5B) - STRICT GRADING
 # ---------------------------------------------------------
 class LLMEvaluator:
     def __init__(self):
+        # Qwen 2.5 0.5B is fast but needs "Few-Shot" examples to be strict.
         self.repo_id = "onnx-community/Qwen2.5-0.5B-Instruct"
         self.local_dir = "onnx_qwen_local"
         )
     def evaluate(self, context, question, student_answer, max_marks):
+        # --- IMPROVED PROMPT STRATEGY ---
+        # 1. Role: We set the persona to a "Strict Logical Validator" not a "Teacher".
+        # 2. Few-Shot: We give examples of HALLUCINATIONS getting 0 marks.
+        system_prompt = f"""You are a strict Logic Validator. You are NOT a helpful assistant.
+        Your job is to check if the Student Answer is FACTUALLY present in the Context.
+        GRADING ALGORITHM:
+        1. IF the Student Answer mentions things NOT in the Context -> PENALTY (-100%).
+        2. IF the Student Answer interprets the text opposite to its meaning -> PENALTY (-100%).
+        3. IF the Student Answer is generic fluff -> SCORE: 0.
+        --- EXAMPLE 1 (HALLUCINATION) ---
+        Context: The sky is blue due to Rayleigh scattering.
         Question: Why is the sky blue?
+        Student Answer: Because the ocean reflects the water into the sky.
+        Analysis: The Context mentions 'Rayleigh scattering'. The student mentions 'ocean reflection'. These are different. The student is hallucinating outside facts.
         Score: 0/{max_marks}
+        --- EXAMPLE 2 (CONTRADICTION) ---
+        Context: One must efface one's own personality. Good prose is like a windowpane.
+        Question: What does the author mean?
+        Student Answer: It means we should see the author's personality clearly.
+        Analysis: The text says 'efface' (remove) personality. The student says 'see' personality. This is a direct contradiction.
+        Score: 0/{max_marks}
+        --- EXAMPLE 3 (CORRECT) ---
         Context: Mitochondria is the powerhouse of the cell.
+        Question: What is mitochondria?
+        Student Answer: It is the cell's powerhouse.
+        Analysis: Matches the text meaning exactly.
         Score: {max_marks}/{max_marks}
         """
         user_prompt = f"""
+        --- YOUR TASK ---
         Context:
         {context}
         Student Answer:
         {student_answer}
+        OUTPUT FORMAT:
+        Analysis: [Compare Student Answer vs Context. List any hallucinations or contradictions.]
         Score: [X]/{max_marks}
         """
         input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         inputs = self.tokenizer(input_text, return_tensors="pt")
+        # Lower temperature for strictness
         with torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
                 max_new_tokens=150,
+                temperature=0.1,    # Strict logic, no creativity
+                top_p=0.2,          # Cut off unlikely tokens
                 do_sample=True,
+                repetition_penalty=1.2 # Penalize repetition
             )
         input_length = inputs['input_ids'].shape[1]
         return response
+# # ---------------------------------------------------------
+# # 2. OPTIMIZED LLM (Qwen 2.5 - 0.5B) - STRICT GRADING
+# # ---------------------------------------------------------
+# class LLMEvaluator:
+#     def __init__(self):
+#         # Qwen 0.5B is great for speed, but needs VERY specific prompts to be strict.
+#         self.repo_id = "onnx-community/Qwen2.5-0.5B-Instruct"
+#         self.local_dir = "onnx_qwen_local"
+#         print(f"🔄 Preparing CPU LLM: {self.repo_id}...")
+#         if not os.path.exists(self.local_dir):
+#             print(f"📥 Downloading FP16 model to {self.local_dir}...")
+#             snapshot_download(
+#                 repo_id=self.repo_id,
+#                 local_dir=self.local_dir,
+#                 allow_patterns=["config.json", "generation_config.json", "tokenizer*", "special_tokens_map.json", "*.jinja", "onnx/model_fp16.onnx*"]
+#             )
+#             print("✅ Download complete.")
+#         self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir)
+#         sess_options = SessionOptions()
+#         sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_DISABLE_ALL
+#         self.model = ORTModelForCausalLM.from_pretrained(
+#             self.local_dir,
+#             subfolder="onnx",
+#             file_name="model_fp16.onnx",
+#             use_cache=True,
+#             use_io_binding=False,
+#             provider=PROVIDERS[0],
+#             session_options=sess_options
+#         )
+#     def evaluate(self, context, question, student_answer, max_marks):
+#         # --- STRATEGY: FEW-SHOT PROMPTING & CHAIN OF THOUGHT ---
+#         # Small models (0.5B) need examples to understand "Strictness".
+#         system_prompt = """You are a strict automated grader. You grade ONLY based on the provided Context.
+#         RULES:
+#         1. If the Student Answer contains facts NOT found in the Context, Score is 0.
+#         2. If the Student Answer contradicts the Context, Score is 0.
+#         3. Do not use outside knowledge. If it's not in the text, it's wrong.
+#         --- EXAMPLE 1 (WRONG ANSWER) ---
+#         Context: The sky is blue because of Rayleigh scattering.
+#         Question: Why is the sky blue?
+#         Student Answer: Because the ocean reflects into it.
+#         Analysis: The context mentions Rayleigh scattering. The student mentioned ocean reflection. These do not match.
+#         Score: 0/{max_marks}
+#         --- EXAMPLE 2 (CORRECT ANSWER) ---
+#         Context: Mitochondria is the powerhouse of the cell.
+#         Question: What is the mitochondria?
+#         Student Answer: It is the powerhouse of the cell.
+#         Analysis: The student answer matches the context text exactly.
+#         Score: {max_marks}/{max_marks}
+#         """
+#         user_prompt = f"""
+#         --- NOW GRADE THIS ---
+#         Context:
+#         {context}
+#         Question:
+#         {question}
+#         Student Answer:
+#         {student_answer}
+#         Task:
+#         1. Analyze if the specific keywords in Student Answer exist in Context.
+#         2. Assign a Score.
+#         Output format:
+#         Analysis: [Analysis here]
+#         Score: [X]/{max_marks}
+#         """
+#         messages = [
+#             {"role": "system", "content": system_prompt},
+#             {"role": "user", "content": user_prompt}
+#         ]
+#         input_text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+#         inputs = self.tokenizer(input_text, return_tensors="pt")
+#         with torch.no_grad():
+#             outputs = self.model.generate(
+#                 **inputs,
+#                 max_new_tokens=150,
+#                 temperature=0.1,    # Low temperature for facts
+#                 top_p=0.1,          # Reduce creativity
+#                 do_sample=True,
+#                 repetition_penalty=1.1
+#             )
+#         input_length = inputs['input_ids'].shape[1]
+#         response = self.tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
+#         return response
 # ---------------------------------------------------------
 # 3. Main Application Logic
 # ---------------------------------------------------------