masharma commited on
Commit
73da86c
Β·
verified Β·
1 Parent(s): 3f3323d

Upload 11 files

Browse files
README_HF.md ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: ROAR Item Generator
3
+ emoji: 🦁
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.11.0
8
+ app_file: app_gradio.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # ROAR Assessment Item Generator
14
+
15
+ Generate reading comprehension items with AI-powered difficulty estimation.
16
+
17
+ ## Features
18
+ - AI-powered item generation using Claude
19
+ - Automatic difficulty estimation using ModernBERT
20
+ - Save and export items to CSV
21
+ - Interactive chat interface
22
+
23
+ ## Model
24
+ Uses a custom-trained difficulty estimation model (ModernBERT + Ridge Regression)
app_gradio.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import os
from anthropic import Anthropic
from difficulty_estimator import DifficultyEstimator
from dotenv import load_dotenv

# Load environment variables (ANTHROPIC_API_KEY, optional MODEL_PATH) from a
# local .env file, if present.
load_dotenv()

# Initialize Anthropic client used by chat_with_ai below.
client = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))

# Initialize difficulty estimator; if the artifacts under MODEL_PATH are
# missing, the estimator stays in a "not loaded" state and difficulty
# estimation is simply skipped.
MODEL_PATH = os.getenv('MODEL_PATH', './models')
difficulty_estimator = DifficultyEstimator(MODEL_PATH)

# Load ROAR prompt; it is appended to the system message for every request.
with open('prompts/roar_prompt.md', 'r') as f:
    ROAR_PROMPT = f.read()

SYSTEM_MESSAGE = """You are an expert educational assessment designer specializing in creating reading comprehension items.
Generate high-quality assessment items following the exact format provided."""

# Module-level mutable state shared by all UI callbacks:
#   history      - full chat transcript resent to Claude on every turn
#   current_item - most recently parsed item dict, or None
#   collection   - items the user explicitly saved
# NOTE(review): this state is process-global, so concurrent users of the
# same app instance share one conversation and collection — confirm that is
# acceptable for the deployment.
conversation_state = {"history": [], "current_item": None, "collection": []}
28
def parse_item_from_response(text):
    """Parse a generated assessment item out of Claude's free-text response.

    Expects the labelled format described in prompts/roar_prompt.md
    (Passage: / Question: / Target Answer: / Distractor N: / METADATA:).

    Labels are matched case-insensitively and may carry a parenthetical
    qualifier before the colon — the prompt's OUTPUT FORMAT actually emits
    "Distractor 1 (Partial Coherence):" and "METADATA:", which the previous
    exact, case-sensitive `str.find` lookups never matched.

    Args:
        text: raw assistant message (may contain markdown bold markers).

    Returns:
        dict with whichever item/metadata fields were found; empty dict if
        the response contains no labelled item.
    """
    import re

    # Remove markdown bold formatting so labels like **Passage:** match.
    text = text.replace('**', '')

    def locate(label):
        """Find `label` + optional "(...)" qualifier + colon, any case.

        Returns (label_start, content_start) or (None, None).
        [^\\S\\n] matches horizontal whitespace only, so a label match can
        never run across lines.
        """
        pattern = re.compile(
            re.escape(label) + r'[^\S\n]*(?:\([^)\n]*\))?[^\S\n]*:',
            re.IGNORECASE,
        )
        m = pattern.search(text)
        return (m.start(), m.end()) if m else (None, None)

    # Labels in the order they appear in a response; each field's content
    # runs until the start of the next label that was actually found.
    order = [
        ('passage', 'Passage'),
        ('question', 'Question'),
        ('target_answer', 'Target Answer'),
        ('distractor_1', 'Distractor 1'),
        ('distractor_2', 'Distractor 2'),
        ('distractor_3', 'Distractor 3'),
        (None, 'Metadata'),  # terminator only; parsed separately below
    ]
    marks = [(field, *locate(label)) for field, label in order]

    item = {}
    for i, (field, label_start, content_start) in enumerate(marks):
        if field is None or label_start is None:
            continue

        # Content ends at the next label that exists after this one.
        end = len(text)
        for _, next_start, _ in marks[i + 1:]:
            if next_start is not None and next_start > label_start:
                end = next_start
                break

        content = text[content_start:end].strip()

        # Distractors often carry extra formatting; keep just the answer text.
        if field.startswith('distractor'):
            # Drop parenthetical notes appended after the answer.
            if '(' in content:
                content = content[:content.find('(')].strip()
            # Keep only the first line.
            if '\n' in content:
                content = content.split('\n')[0].strip()
            # Remove markdown horizontal rules.
            content = content.replace('---', '').strip()

        item[field] = content

    # Parse the metadata block (prompt emits "METADATA:", so match any case).
    meta_start, _ = locate('Metadata')
    metadata_section = text[meta_start:] if meta_start is not None else ''

    metadata_fields = {
        'event_chain_relation': 'Event-Chain Relation:',
        'knowledge_base_inference': 'Knowledge-Base Inference:',
        'qar_level': 'QAR Level:',
        'coherence_level': 'Coherence Level:',
        'explanatory_stance': 'Explanatory Stance:'
    }

    for field, label in metadata_fields.items():
        if label in metadata_section:
            start = metadata_section.find(label) + len(label)
            end = metadata_section.find('\n', start)
            if end == -1:
                end = len(metadata_section)
            value = metadata_section[start:end].strip()
            # Strip parenthetical qualifiers, e.g. "Local (adjacent sentences)".
            if '(' in value:
                value = value[:value.find('(')].strip()
            item[field] = value

    return item
111
def chat_with_ai(user_message, history):
    """Handle one chat turn with Claude and try to parse an item from it.

    Args:
        user_message: text the user typed (empty input is a no-op).
        history: gradio Chatbot history as a list of (user, assistant) tuples.

    Returns:
        (updated history, Markdown for the item panel or a placeholder,
        parsed item dict or None) — wired to [chatbot, item_display,
        item_state] in the UI.

    Side effects: appends both turns to conversation_state["history"] and,
    when an item is parsed, updates conversation_state["current_item"].
    """
    if not user_message:
        return history, None, None

    # Add user message to the persistent transcript.
    conversation_state["history"].append({
        'role': 'user',
        'content': user_message
    })

    # Get response from Claude; the full transcript is resent every turn.
    messages = [{'role': msg['role'], 'content': msg['content']}
                for msg in conversation_state["history"]]

    with client.messages.stream(
        model='claude-sonnet-4-20250514',
        max_tokens=4000,
        temperature=1,
        system=SYSTEM_MESSAGE + "\n\n" + ROAR_PROMPT,
        messages=messages
    ) as stream:
        # NOTE(review): the stream is fully drained before returning, so the
        # UI shows nothing until the whole response has arrived.
        assistant_message = ""
        for text in stream.text_stream:
            assistant_message += text

    conversation_state["history"].append({
        'role': 'assistant',
        'content': assistant_message
    })

    # Parse item from response; parsing failures are logged, not fatal.
    item = None
    difficulty = None
    try:
        item = parse_item_from_response(assistant_message)
        if item and (item.get('passage') or item.get('question')):
            conversation_state["current_item"] = item
            # Only score when the estimator's artifacts actually loaded.
            if difficulty_estimator.is_loaded():
                difficulty = difficulty_estimator.estimate_difficulty(item)
    except Exception as e:
        print(f"Error parsing item: {e}")

    # Update chat history for display.
    history.append((user_message, assistant_message))

    # Format item display (placeholder when nothing was parsed).
    item_display = format_item_display(item, difficulty) if item else "No item generated yet"

    return history, item_display, item
163
def format_item_display(item, difficulty=None):
    """Format a parsed item (and optional difficulty estimate) as Markdown.

    Args:
        item: dict produced by parse_item_from_response, or None/empty.
        difficulty: optional dict from DifficultyEstimator.estimate_difficulty
            with keys 'score' (0-1 float), 'irt_difficulty', 'interpretation'.

    Returns:
        Markdown string for the "Current Item" panel.
    """
    if not item:
        return "No item to display"

    display = "# Current Item\n\n"

    # Difficulty summary, when the estimator produced one.
    if difficulty:
        score = difficulty['score']
        irt_score = difficulty.get('irt_difficulty', 'N/A')
        label = difficulty.get('interpretation', 'Medium')
        # BUG FIX: the conditional must wrap the f-string, not sit inside the
        # format spec — "{x:.3f if ...}" is an invalid format specifier and
        # raised ValueError whenever a difficulty was displayed.
        if isinstance(irt_score, float):
            irt_text = f"{irt_score:.3f}"
        else:
            irt_text = str(irt_score)
        display += f"**Estimated Difficulty:** {label}\n"
        display += f"- Normalized: {score*100:.1f}%\n"
        display += f"- IRT Score: {irt_text}\n\n"

    # Item fields ('N/A' for anything the parser did not find).
    display += f"**Passage:**\n{item.get('passage', 'N/A')}\n\n"
    display += f"**Question:**\n{item.get('question', 'N/A')}\n\n"
    display += f"**Target Answer:**\n{item.get('target_answer', 'N/A')}\n\n"
    display += f"**Distractor 1:**\n{item.get('distractor_1', 'N/A')}\n\n"
    display += f"**Distractor 2:**\n{item.get('distractor_2', 'N/A')}\n\n"
    display += f"**Distractor 3:**\n{item.get('distractor_3', 'N/A')}\n\n"

    # Metadata block.
    display += "---\n**Metadata:**\n"
    display += f"- Event-Chain Relation: {item.get('event_chain_relation', 'N/A')}\n"
    display += f"- Knowledge-Base Inference: {item.get('knowledge_base_inference', 'N/A')}\n"
    display += f"- QAR Level: {item.get('qar_level', 'N/A')}\n"
    display += f"- Coherence Level: {item.get('coherence_level', 'N/A')}\n"
    display += f"- Explanatory Stance: {item.get('explanatory_stance', 'N/A')}\n"

    return display
198
def save_to_collection(item_data):
    """Copy the most recently generated item into the saved collection.

    `item_data` (the gr.State value) is accepted for event-wiring
    compatibility, but the item is read from conversation_state.

    Returns:
        (status message, refreshed collection Markdown) pair.
    """
    current = conversation_state["current_item"]
    if not current:
        return "No item to save", format_collection_display()

    collection = conversation_state["collection"]
    # Shallow copy so later turns cannot mutate the saved row.
    saved = current.copy()
    saved['item_id'] = len(collection) + 1

    # Attach a difficulty estimate when the model artifacts are available.
    if difficulty_estimator.is_loaded():
        estimate = difficulty_estimator.estimate_difficulty(saved)
        if estimate:
            saved['difficulty_score'] = estimate['score']
            saved['difficulty_irt'] = estimate.get('irt_difficulty')
            saved['difficulty_label'] = estimate.get('interpretation')

    collection.append(saved)

    status = f"βœ… Item saved! ({len(conversation_state['collection'])} items total)"
    return status, format_collection_display()
220
def format_collection_display():
    """Render the saved collection as a Markdown summary (one entry per item)."""
    collection = conversation_state["collection"]
    if not collection:
        return "No items in collection yet"

    parts = [f"# Collection ({len(collection)} items)\n\n"]
    for entry in collection:
        parts.append(f"## Item #{entry['item_id']}\n")
        # Difficulty line only for items that were scored when saved.
        if 'difficulty_label' in entry:
            parts.append(f"**Difficulty:** {entry['difficulty_label']} ")
            parts.append(f"({entry.get('difficulty_score', 0)*100:.1f}%)\n")
        # Truncate long questions to keep the summary compact.
        parts.append(f"**Question:** {entry.get('question', 'N/A')[:100]}...\n\n")

    return "".join(parts)
237
def export_collection():
    """Write the saved collection to a timestamped CSV in the working directory.

    Returns:
        The CSV filename for the gr.File download component, or None when
        there is nothing to export.
    """
    if not conversation_state["collection"]:
        return None

    import pandas as pd
    from datetime import datetime

    frame = pd.DataFrame(conversation_state["collection"])

    # Timestamped name so repeated exports never overwrite each other.
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    out_name = f'roar_items_{stamp}.csv'
    frame.to_csv(out_name, index=False)

    return out_name
256
def clear_chat():
    """Reset the conversation and current item (the saved collection is kept).

    Returns the cleared Chatbot history and the item-panel placeholder text.
    """
    conversation_state.update(history=[], current_item=None)
    return [], "No item generated yet"
263
# Create Gradio interface: a two-column chat + current-item layout with a
# collapsible collection panel underneath.
with gr.Blocks(title="ROAR Item Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🦁 ROAR Assessment Item Generator")
    gr.Markdown("Generate reading comprehension items with AI guidance and difficulty estimation")

    with gr.Row():
        # Left column: chat with the generator.
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Chat", height=500)
            msg = gr.Textbox(
                label="Your message",
                placeholder="Try: Generate a reading comprehension item about ocean animals",
                lines=2
            )
            with gr.Row():
                send_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear Chat")

        # Right column: most recently parsed item plus save control.
        with gr.Column(scale=1):
            item_display = gr.Markdown("No item generated yet", label="Current Item")
            save_btn = gr.Button("πŸ’Ύ Save to Collection", variant="secondary")
            save_status = gr.Textbox(label="Status", lines=1, interactive=False)

    gr.Markdown("---")

    with gr.Accordion("πŸ“š Collection", open=False):
        collection_display = gr.Markdown("No items in collection yet")
        export_btn = gr.Button("πŸ“₯ Export Collection as CSV")
        export_file = gr.File(label="Download CSV")

    # Hidden state to pass the parsed item dict between callbacks.
    item_state = gr.State(None)

    # Event handlers: Enter in the textbox and the Send button both run the
    # chat, then clear the input afterwards.
    msg.submit(chat_with_ai, [msg, chatbot], [chatbot, item_display, item_state]).then(
        lambda: "", None, msg
    )
    send_btn.click(chat_with_ai, [msg, chatbot], [chatbot, item_display, item_state]).then(
        lambda: "", None, msg
    )
    clear_btn.click(clear_chat, None, [chatbot, item_display])
    save_btn.click(save_to_collection, item_state, [save_status, collection_display])
    export_btn.click(export_collection, None, export_file)

if __name__ == "__main__":
    demo.launch()
difficulty_estimator.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import joblib
3
+ import numpy as np
4
+ import pandas as pd
5
+ import torch
6
+ from transformers import AutoTokenizer, AutoModel
7
+
8
+
9
class DifficultyEstimator:
    """
    Estimates item difficulty using ModernBERT + PCA + Ridge model.
    Matches the training pipeline from [item_difficulty]_difficulty_estimator_model.py
    """

    # Artifact attribute name -> filename inside model_dir (load order matters
    # only for log readability; each artifact is independent).
    _ARTIFACTS = {
        'ridge': 'ridge_model.pkl',
        'pca': 'pca.pkl',
        'scaler_emb': 'scaler_emb.pkl',
        'scaler_features': 'scaler_features.pkl',
        'grade_columns': 'grade_columns.pkl',
    }

    def __init__(self, model_dir=None):
        """Load pipeline artifacts and ModernBERT from model_dir.

        If model_dir is None/missing or any load fails, the estimator stays
        in a "not loaded" state and estimate_difficulty() returns None.
        """
        self.ridge = None
        self.pca = None
        self.scaler_emb = None
        self.scaler_features = None
        self.grade_columns = None
        self.tokenizer = None
        self.bert_model = None
        self.device = None

        if not (model_dir and os.path.exists(model_dir)):
            return

        try:
            print("Loading difficulty model components...")

            # Load all sklearn/joblib artifacts.
            for attr, filename in self._ARTIFACTS.items():
                setattr(self, attr, joblib.load(f'{model_dir}/{filename}'))

            print("Loading ModernBERT...")
            self.tokenizer = AutoTokenizer.from_pretrained('answerdotai/ModernBERT-base')
            self.bert_model = AutoModel.from_pretrained('answerdotai/ModernBERT-base')
            self.bert_model.eval()
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
            self.bert_model.to(self.device)

            print(f"βœ… Difficulty model loaded successfully (using {self.device})")

        except Exception as e:
            print(f"⚠️ Could not load model: {e}")
            import traceback
            traceback.print_exc()

    def is_loaded(self):
        """Return True once every artifact plus tokenizer/model is available."""
        components = (
            self.ridge,
            self.pca,
            self.scaler_emb,
            self.scaler_features,
            self.grade_columns,
            self.tokenizer,
            self.bert_model,
        )
        return all(part is not None for part in components)

    def build_text(self, item):
        """
        Build input text matching training format (Figure 2 in paper):

            Question: {question}
            Correct: {target_answer}
            Wrong 1: {distractor_1}
            Wrong 2: {distractor_2}
            Wrong 3: {distractor_3}   (left empty: ROAR items carry only 2)
            Passage: {passage}
        """
        rows = [
            f"Question: {item.get('question', '')}",
            f"Correct: {item.get('target_answer', '')}",
            f"Wrong 1: {item.get('distractor_1', '')}",
            f"Wrong 2: {item.get('distractor_2', '')}",
            "Wrong 3: ",  # empty third distractor since ROAR only has 2
            f"Passage: {item.get('passage', '')}",
        ]
        return "\n".join(rows)

    def get_embedding(self, text):
        """
        Extract a ModernBERT embedding by averaging hidden states over the
        real (non-padding) tokens, matching the training code.
        """
        encoded = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=512,
            padding=True
        )
        encoded = {name: tensor.to(self.device) for name, tensor in encoded.items()}

        with torch.no_grad():
            output = self.bert_model(**encoded)

        hidden_states = output.last_hidden_state   # (1, seq_len, hidden_dim)
        attention = encoded['attention_mask']      # (1, seq_len)

        # Index of the last non-padding token.
        last = attention[0].nonzero(as_tuple=True)[0][-1].item()

        # Mean over all real tokens -> numpy vector.
        return hidden_states[0, :last + 1, :].mean(dim=0).cpu().numpy()

    def get_grade_ohe(self, grade):
        """
        One-hot encode the grade against the training grade columns.
        When no grade is supplied, defaults to Grade4 (ROAR items don't
        carry grade info).
        """
        ohe = pd.DataFrame(0, index=[0], columns=self.grade_columns)

        if grade:
            # NOTE(review): an unrecognized grade leaves the vector all-zero
            # rather than falling back to Grade4 — preserved from original.
            column = f'grade_{grade}'
            if column in self.grade_columns:
                ohe[column] = 1
        elif 'grade_Grade4' in self.grade_columns:
            ohe['grade_Grade4'] = 1

        return ohe.values

    def estimate_difficulty(self, item):
        """
        Estimate difficulty of an item.

        Returns a dict with 'score' (0-1 display scale), 'irt_difficulty'
        (raw IRT estimate: negative = easier, positive = harder, roughly
        -3..+3) and 'interpretation', or None when the model is not loaded
        or inference fails.
        """
        if not self.is_loaded():
            return None

        try:
            # 1-2. Text -> ModernBERT embedding.
            embedding = self.get_embedding(self.build_text(item))

            # 3. Standardize, then project with the fitted PCA.
            reduced = self.pca.transform(
                self.scaler_emb.transform(embedding.reshape(1, -1))
            )

            # 4-5. Append the grade one-hot (Grade4 default for ROAR items).
            ohe = self.get_grade_ohe(item.get('grade', 'Grade4'))
            combined = np.hstack([reduced, ohe])

            # 6. Final scaling + ridge regression -> raw IRT difficulty.
            irt_score = self.ridge.predict(
                self.scaler_features.transform(combined)
            )[0]

            # Map the roughly -3..+3 IRT scale onto 0..1 for display
            # (0 = very easy, 1 = very hard).
            normalized = np.clip((irt_score + 3) / 6, 0, 1)

            return {
                'score': float(normalized),           # 0-1 for display
                'irt_difficulty': float(irt_score),   # raw IRT score
                'interpretation': self.get_interpretation(normalized)
            }

        except Exception as e:
            print(f"Error estimating difficulty: {e}")
            import traceback
            traceback.print_exc()
            return None

    def get_interpretation(self, score):
        """Map a 0-1 normalized score to Easy (<0.4) / Medium (<0.7) / Hard."""
        for upper, label in ((0.4, "Easy"), (0.7, "Medium")):
            if score < upper:
                return label
        return "Hard"
models/.DS_Store ADDED
Binary file (6.15 kB). View file
 
models/grade_columns.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6eb4ad5acd0917adbdecc1713e78dd8886f991e2eaf4e88ac68644458df24b0
3
+ size 106
models/pca.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fa1e465d70f40b09987f5e0cbf6267648c6b62436c3399b9622b76faaf00195
3
+ size 158271
models/ridge_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c10ac18867fd5143f7912c31a5b87267bdd7a23f3dcfe5a94e9fc3e90d27525f
3
+ size 1015
models/scaler_emb.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a70f072019e087abe093450f270bb6ee745219ce8f0d3729502388383c97652b
3
+ size 19047
models/scaler_features.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efdac3fd91d73c99048ac49772996d742959ea1fd5df1642f9734b10a4209a99
3
+ size 1959
prompts/roar_prompt.md ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ROAR Reading Comprehension Item Generation Prompt
2
+
3
+ This prompt template can be used for generating ROAR-Inference assessment items.
4
+ To use it, add it to the system message in app.py when needed.
5
+
6
+ ---
7
+
8
+ You are an expert educational content designer creating reading comprehension items for the ROAR-Inference assessment. Generate ONE complete item per request following all rules below.
9
+
10
+ ---
11
+
12
+ ## ITEM STRUCTURE
13
+
14
+ Create items with:
15
+ - **Passage:** 3-5 sentences, age-appropriate (grades 2-5)
16
+ - **Question:** Targets one inference type
17
+ - **Target Answer:** Full coherence (Level 2)
18
+ - **Distractor 1:** Partial coherence (Level 1) - uses passage info incorrectly
19
+ - **Distractor 2:** Minimal coherence (Level 0) - outside text, world knowledge only
20
+
21
+ ---
22
+
23
+ ## CORE FRAMEWORKS (Choose one from each)
24
+
25
+ ### 1. EVENT-CHAIN RELATION
26
+ - **Logical:** Why/how questions (causes, motivations, enabling conditions)
27
+ - **Informational:** Who/what/when/where questions (referential/spatiotemporal tracking)
28
+ - **Evaluative:** Themes, lessons, significance (global interpretation only)
29
+
30
+ ### 2. KNOWLEDGE-BASE INFERENCE
31
+ - **Superordinate goal:** Purpose, intent, future goals (teleological)
32
+ - **Causal-antecedent:** Prior causes, mechanisms (mechanistic)
33
+ - **State:** Emotions, traits, beliefs explaining behavior (mechanistic)
34
+ - **Referential:** Pronoun resolution, textual connections
35
+ - **Thematic:** Moral/lesson (evaluative)
36
+
37
+ ### 3. QAR LEVEL
38
+ **Text-Explicit:**
39
+ - Answer verbatim/near-verbatim in passage
40
+ - Grammatical link between question and answer
41
+ - Use exact passage wording
42
+
43
+ **Text-Implicit:**
44
+ - Combine adjacent passage details
45
+ - NO grammatical link
46
+ - Local coherence only
47
+ - Must use passage vocabulary (no synonyms/elevated terms)
48
+
49
+ **Script-Implicit:**
50
+ - Requires world knowledge + passage
51
+ - NO grammatical link
52
+ - Global coherence
53
+ - May use terms not in passage
54
+
55
+ ### 4. COHERENCE LEVEL
56
+ - **Local:** Adjacent sentences, working memory span
57
+ - **Global:** Distant text parts + world knowledge integration
58
+
59
+ **Mapping:** Text-Explicit/Implicit → Local | Script-Implicit → Global
60
+
61
+ ---
62
+
63
+ ## CRITICAL CONSTRAINTS
64
+
65
+ ### Vocabulary Matching (Text-Explicit/Implicit ONLY)
66
+ ✅ **MUST** use exact passage wording
67
+ ❌ **NEVER** replace with synonyms or higher-level terms
68
+
69
+ **Violations:**
70
+ - "thin air" → "high elevation" ❌
71
+ - "butterfly emerge" → "metamorphosis" ❌
72
+ - "land was scarce" → "limited land" ❌
73
+
74
+ ### Target Answer Rules
75
+ **DO NOT ADD:**
76
+ - Teleological additions not in text ("safely", "to be safe")
77
+ - Emotions not stated ("scared", "fearful")
78
+ - Purposes not indicated
79
+ - Higher-level vocabulary (for Text-Explicit/Implicit)
80
+
81
+ **Coherence Quality (Breadth + Simplicity):**
82
+ - **Breadth:** Target should connect/explain multiple story elements, not just one detail
83
+ - **Simplicity:** Target should require minimal additional assumptions beyond the passage
84
+ - Best answers integrate multiple pieces of evidence while remaining straightforward
85
+
86
+ ---
87
+
88
+ ## DISTRACTOR CONSTRUCTION
89
+
90
+ **Psychometric Ordering Requirement:**
91
+ Distractors must follow attractiveness hierarchy:
92
+ - **D1 (Partial Coherence):** Should attract mid-ability students who engage with text but miss full inference
93
+ - **D2 (Minimal Coherence):** Should attract low-ability students who rely on world knowledge without text integration
94
+ - D1 must be MORE plausible than D2 to create proper difficulty ordering
95
+
96
+ ### Distractor 1 (Partial Coherence)
97
+ **Pattern:** Text-based misconnection
98
+ - References details FROM passage
99
+ - Connects them incorrectly to question
100
+ - Shows partial text engagement
101
+ - Lacks full explanatory integration
102
+ - **Attractiveness:** Plausible enough to tempt students who read the passage but don't make full inference
103
+
104
+ ### Distractor 2 (Minimal Coherence)
105
+ **Pattern:** Over-reliance on world knowledge
106
+ - Based on question/general knowledge only
107
+ - Ignores passage content
108
+ - Plausible generally, not for this story
109
+ - Represents reading question without passage
110
+ - **Attractiveness:** Less plausible than D1; attracts students who don't engage with passage
111
+
112
+ ---
113
+
114
+ ## OUTPUT FORMAT
115
+
116
+ ```
117
+ Passage: [3-5 sentences]
118
+
119
+ Question: [Your question]
120
+
121
+ Target Answer: [Full coherence]
122
+
123
+ Distractor 1 (Partial Coherence): [Text-based misconnection]
124
+
125
+ Distractor 2 (Minimal Coherence): [World knowledge only]
126
+
127
+ ---
128
+ METADATA:
129
+ Event-Chain Relation: [Logical/Informational/Evaluative]
130
+ Knowledge-Base Inference: [Superordinate Goal/Causal-Antecedent/State/Referential/Thematic]
131
+ QAR Level: [Text-Explicit/Text-Implicit/Script-Implicit]
132
+ Coherence Level: [Local/Global]
133
+ Explanatory Stance: [Teleological/Mechanistic/N/A]
134
+ ---
135
+ ```
136
+
137
+ ---
138
+
139
+ ## KEY PRINCIPLES
140
+
141
+ 1. **Vocabulary matching mandatory** for Text-Explicit/Implicit (no synonyms/elevated terms)
142
+ 2. **Never add to story** (no unstated safety/emotions/purposes)
143
+ 3. **Clear distractor hierarchy** (D1=partial text, D2=world knowledge only)
144
+ 4. **Attractiveness ordering** (Target > D1 > D2 in plausibility for different ability levels)
145
+ 5. **Coherence quality** (Target shows breadth across story elements + simplicity in assumptions)
146
+ 6. **No redundancy** (distractors must be qualitatively different)
147
+ 7. **Plausible distractors** (wrong due to coherence, not impossibility)
148
+ 8. **QAR consistency** (question-answer-passage relationship must match chosen level)
149
+
150
+ ---
151
+
152
+ Generate items that provide diagnostic information about students' inferential reasoning and coherence evaluation processes.
requirements_hf.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ anthropic>=0.40.0
2
+ gradio>=5.0.0
3
+ pandas>=2.2.0
4
+ python-dotenv>=1.0.0
5
+ joblib>=1.3.2
6
+ scikit-learn>=1.4.0
7
+ numpy>=1.26.0
8
+ torch>=2.0.0
9
+ transformers>=4.30.0