File size: 6,763 Bytes
358dfff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import json
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from models.assessment import Assessment
from models.job import Job
from models.user import User
from models.application import Application
from models.base import Base
from config import settings
from schemas.assessment import AssessmentQuestion, AssessmentQuestionOption
from schemas.enums import QuestionType
from services.ai_service import score_answer
from uuid import uuid4

def test_scoring_methodology():
    """Verify score_answer's scoring methodology end to end.

    Checks three behaviors of ``services.ai_service.score_answer``:

    - choose-one questions are scored by direct option comparison,
    - text-based questions are scored via AI-evaluation heuristics,
    - choose-many questions are scored by direct option-set comparison.

    Raises:
        AssertionError: if any scoring result deviates from the expected
            methodology.
    """
    print("Testing scoring methodology...")

    _check_choose_one_scoring()
    _check_text_based_scoring()
    _check_choose_many_scoring()

    print("\n[PASS] Scoring methodology test completed successfully!")
    print("- Multiple choice questions are scored directly by comparing options")
    print("- Text-based questions use AI evaluation (heuristic scoring in mock)")
    print("- This approach optimizes performance by avoiding unnecessary AI calls")


def _check_choose_one_scoring():
    """Choose-one questions must be scored by direct comparison (no AI call)."""
    print("\n1. Testing multiple choice question scoring (direct comparison)...")
    mc_question = AssessmentQuestion(
        id=str(uuid4()),
        text="What is the capital of France?",
        weight=3,
        skill_categories=["geography", "knowledge"],
        type=QuestionType.choose_one,
        options=[
            AssessmentQuestionOption(text="London", value="a"),
            AssessmentQuestionOption(text="Paris", value="b"),
            AssessmentQuestionOption(text="Berlin", value="c"),
        ],
        correct_options=["b"],
    )

    # Correct selection should score exactly 1.0 and be flagged correct.
    correct_mc_result = score_answer(
        question=mc_question,
        answer_text="",
        selected_options=["b"],
    )
    print(f"   Correct MC answer score: {correct_mc_result['score']}")
    print(f"   Correct MC answer rationale: {correct_mc_result['rationale']}")
    assert correct_mc_result['score'] == 1.0, f"Expected 1.0 for correct MC answer, got {correct_mc_result['score']}"
    # `is True` instead of `== True`: identity check is the idiomatic (E712) form.
    assert correct_mc_result['correct'] is True, f"Expected True for correct MC answer, got {correct_mc_result['correct']}"
    print("   [PASS] Correct multiple choice answer scored directly")

    # Incorrect selection should score exactly 0.0 and be flagged incorrect.
    incorrect_mc_result = score_answer(
        question=mc_question,
        answer_text="",
        selected_options=["a"],  # London is wrong
    )
    print(f"   Incorrect MC answer score: {incorrect_mc_result['score']}")
    print(f"   Incorrect MC answer rationale: {incorrect_mc_result['rationale']}")
    assert incorrect_mc_result['score'] == 0.0, f"Expected 0.0 for incorrect MC answer, got {incorrect_mc_result['score']}"
    assert incorrect_mc_result['correct'] is False, f"Expected False for incorrect MC answer, got {incorrect_mc_result['correct']}"
    print("   [PASS] Incorrect multiple choice answer scored directly")


def _check_text_based_scoring():
    """Text-based questions must be scored via AI-evaluation heuristics."""
    print("\n2. Testing text-based question scoring (AI evaluation)...")
    text_question = AssessmentQuestion(
        id=str(uuid4()),
        text="Explain the importance of renewable energy.",
        weight=5,
        skill_categories=["environment", "science"],
        type=QuestionType.text_based,
        options=[],
        correct_options=[],
    )

    # A detailed free-text answer: score must land in the normalized [0, 1] range.
    text_result = score_answer(
        question=text_question,
        answer_text="Renewable energy is important because it reduces carbon emissions and is sustainable for future generations.",
        selected_options=[],
    )
    print(f"   Text answer score: {text_result['score']}")
    print(f"   Text answer rationale: {text_result['rationale']}")
    # The score should be based on our heuristic evaluation (length, keywords, etc.)
    assert 0.0 <= text_result['score'] <= 1.0, f"Text score {text_result['score']} is not in range [0,1]"
    print("   [PASS] Text-based answer scored using AI evaluation heuristics")

    # A terse answer should be penalized relative to the detailed one.
    poor_text_result = score_answer(
        question=text_question,
        answer_text="It's good.",
        selected_options=[],
    )
    print(f"   Poor text answer score: {poor_text_result['score']}")
    print(f"   Poor text answer rationale: {poor_text_result['rationale']}")
    # Short answers should receive lower scores
    assert poor_text_result['score'] < text_result['score'], f"Short answer should score lower than detailed answer"
    print("   [PASS] Poor text answer received lower score")


def _check_choose_many_scoring():
    """Choose-many questions must be scored by direct option-set comparison."""
    print("\n3. Testing choose-many question scoring (direct comparison)...")
    multichoice_question = AssessmentQuestion(
        id=str(uuid4()),
        text="Which of the following are programming languages?",
        weight=4,
        skill_categories=["programming", "computer-science"],
        type=QuestionType.choose_many,
        options=[
            AssessmentQuestionOption(text="Python", value="a"),
            AssessmentQuestionOption(text="HTML", value="b"),
            AssessmentQuestionOption(text="Java", value="c"),
            AssessmentQuestionOption(text="CSS", value="d"),
        ],
        correct_options=["a", "c"],  # Python and Java are programming languages
    )

    # Exact match of the correct option set scores 1.0.
    correct_multichoice_result = score_answer(
        question=multichoice_question,
        answer_text="",
        selected_options=["a", "c"],  # Correct answers
    )
    print(f"   Correct multichoice score: {correct_multichoice_result['score']}")
    print(f"   Correct multichoice rationale: {correct_multichoice_result['rationale']}")
    assert correct_multichoice_result['score'] == 1.0, f"Expected 1.0 for correct multichoice answer, got {correct_multichoice_result['score']}"
    assert correct_multichoice_result['correct'] is True, f"Expected True for correct multichoice answer, got {correct_multichoice_result['correct']}"
    print("   [PASS] Correct choose-many answer scored directly")

    # Any deviation from the correct set (here: HTML included) scores 0.0.
    incorrect_multichoice_result = score_answer(
        question=multichoice_question,
        answer_text="",
        selected_options=["a", "b"],  # Partially incorrect (includes HTML)
    )
    print(f"   Incorrect multichoice score: {incorrect_multichoice_result['score']}")
    print(f"   Incorrect multichoice rationale: {incorrect_multichoice_result['rationale']}")
    assert incorrect_multichoice_result['score'] == 0.0, f"Expected 0.0 for incorrect multichoice answer, got {incorrect_multichoice_result['score']}"
    assert incorrect_multichoice_result['correct'] is False, f"Expected False for incorrect multichoice answer, got {incorrect_multichoice_result['correct']}"
    print("   [PASS] Incorrect choose-many answer scored directly")

if __name__ == "__main__":
    # Entry point: run the scoring-methodology check when executed as a script.
    test_scoring_methodology()