Spaces:
Sleeping
Sleeping
Update fix.py
Browse files
fix.py
CHANGED
|
@@ -1,22 +1,22 @@
|
|
| 1 |
# fix.py
|
| 2 |
|
| 3 |
-
import
|
| 4 |
-
import
|
| 5 |
import json
|
| 6 |
import logging
|
| 7 |
-
import
|
| 8 |
-
|
| 9 |
-
import
|
| 10 |
import time
|
| 11 |
from datetime import datetime
|
| 12 |
-
import
|
| 13 |
-
import functools
|
| 14 |
|
| 15 |
-
from openai import AzureOpenAI
|
| 16 |
-
from supabase import create_client, Client
|
| 17 |
-
from tqdm import tqdm
|
| 18 |
from dotenv import load_dotenv
|
|
|
|
| 19 |
from ratelimiter import RateLimiter
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
# Set up logging with thread safety and custom formatting
|
| 22 |
class CustomFormatter(logging.Formatter):
|
|
@@ -81,7 +81,7 @@ SUPABASE_URL = os.getenv("SUPABASE_DB_URL")
|
|
| 81 |
SUPABASE_API_KEY = os.getenv("SUPABASE_API_KEY")
|
| 82 |
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
|
| 83 |
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
|
| 84 |
-
AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv("
|
| 85 |
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2023-05-15")
|
| 86 |
|
| 87 |
# Validate environment variables
|
|
@@ -94,8 +94,6 @@ if not AZURE_OPENAI_KEY:
|
|
| 94 |
missing_vars.append("AZURE_OPENAI_KEY")
|
| 95 |
if not AZURE_OPENAI_ENDPOINT:
|
| 96 |
missing_vars.append("AZURE_OPENAI_ENDPOINT")
|
| 97 |
-
if not AZURE_OPENAI_DEPLOYMENT_NAME:
|
| 98 |
-
missing_vars.append("AZURE_OPENAI_DEPLOYMENT_NAME")
|
| 99 |
|
| 100 |
if missing_vars:
|
| 101 |
error_msg = f"Missing required environment variables: {', '.join(missing_vars)}"
|
|
@@ -166,7 +164,11 @@ def generate_fixed_content(row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
|
| 166 |
Returns a dictionary with fixed content or None if generation fails.
|
| 167 |
"""
|
| 168 |
try:
|
| 169 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
system_message = """You are an expert in standardized English test content. You must return your response as a valid JSON object with the following structure:
|
| 171 |
{
|
| 172 |
"reading_passage": "formatted passage text",
|
|
@@ -176,37 +178,35 @@ def generate_fixed_content(row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
|
| 176 |
"option_c": "option C text",
|
| 177 |
"option_d": "option D text",
|
| 178 |
"explanation": "explanation text"
|
| 179 |
-
}
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
4. Quality:
|
| 203 |
-
- Fix grammar and clarity issues
|
| 204 |
-
- Ensure proper organization
|
| 205 |
-
- Use clear, unambiguous language"""
|
| 206 |
|
| 207 |
# Create user message with the content to fix
|
| 208 |
user_message = f"""Please format and fix the following exam content, returning a JSON object with the specified structure:
|
| 209 |
|
|
|
|
|
|
|
| 210 |
Reading Passage:
|
| 211 |
{row.get('reading_passage', '')}
|
| 212 |
|
|
@@ -243,28 +243,35 @@ Explanation:
|
|
| 243 |
|
| 244 |
content = response.choices[0].message.content
|
| 245 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
try:
|
| 247 |
# Parse JSON response
|
| 248 |
fixed_data = json.loads(content)
|
| 249 |
|
| 250 |
-
#
|
| 251 |
-
|
| 252 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
|
| 254 |
-
if missing_fields:
|
| 255 |
-
logger.error(f"Missing or empty required fields: {', '.join(missing_fields)}")
|
| 256 |
-
return None
|
| 257 |
-
|
| 258 |
-
# Validate content length
|
| 259 |
-
short_fields = [field for field in required_fields if len(str(fixed_data.get(field, ''))) < 2]
|
| 260 |
-
if short_fields:
|
| 261 |
-
logger.error(f"Fields with insufficient content: {', '.join(short_fields)}")
|
| 262 |
-
return None
|
| 263 |
-
|
| 264 |
# Copy over unchanged fields
|
| 265 |
for key in row:
|
| 266 |
if key not in fixed_data and key != 'id':
|
| 267 |
fixed_data[key] = row[key]
|
|
|
|
|
|
|
|
|
|
| 268 |
|
| 269 |
return fixed_data
|
| 270 |
|
|
@@ -295,74 +302,59 @@ def clean_text(text: str) -> str:
|
|
| 295 |
|
| 296 |
def check_row_quality(row: Dict[str, Any]) -> bool:
|
| 297 |
"""
|
| 298 |
-
|
| 299 |
-
Returns True if the row is good, False if it needs fixing.
|
| 300 |
"""
|
| 301 |
-
# Skip if already fixed
|
| 302 |
-
if row.get('is_fixed'):
|
| 303 |
return True
|
| 304 |
|
| 305 |
-
#
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
|
|
|
|
|
|
|
|
|
| 311 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
|
| 313 |
-
#
|
| 314 |
-
for
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
# Check for valid exam type
|
| 320 |
-
if row['exam_type'] not in EXAM_TYPES:
|
| 321 |
-
return False
|
| 322 |
-
|
| 323 |
-
# Check for valid difficulty level
|
| 324 |
-
if row['difficulty_level'] not in DIFFICULTY_LEVELS:
|
| 325 |
-
return False
|
| 326 |
-
|
| 327 |
-
# Check for valid correct answer format
|
| 328 |
-
if not is_valid_correct_answer(row['correct_answer']):
|
| 329 |
-
return False
|
| 330 |
-
|
| 331 |
-
# Check for common OCR and formatting issues
|
| 332 |
-
text_fields = ['reading_passage', 'question_text', 'option_a', 'option_b', 'option_c', 'option_d', 'explanation']
|
| 333 |
-
for field in text_fields:
|
| 334 |
-
text = row.get(field, '')
|
| 335 |
-
if isinstance(text, str):
|
| 336 |
-
# Check for OCR artifacts
|
| 337 |
-
if any(artifact in text.lower() for artifact in [
|
| 338 |
-
'arebasedonthe', 'lineno', 'click here', 'seenext', 'seebelow',
|
| 339 |
-
'answerthefollowing', 'choosethebest', 'selectthe'
|
| 340 |
-
]):
|
| 341 |
-
return False
|
| 342 |
-
|
| 343 |
-
# Check for formatting issues
|
| 344 |
-
if text.count('.') > 20: # Too many periods might indicate formatting issues
|
| 345 |
-
return False
|
| 346 |
-
if text.count('\n') > 20: # Too many newlines might indicate formatting issues
|
| 347 |
-
return False
|
| 348 |
-
if len(text.split()) < 2: # Text should have at least 2 words
|
| 349 |
-
return False
|
| 350 |
-
|
| 351 |
-
# Check minimum length requirements
|
| 352 |
-
if len(row['reading_passage'].split()) < MIN_PASSAGE_WORDS:
|
| 353 |
-
return False
|
| 354 |
-
|
| 355 |
# Check for duplicate options
|
| 356 |
-
options =
|
| 357 |
-
|
| 358 |
-
return
|
| 359 |
-
|
| 360 |
-
#
|
| 361 |
-
explanation = row
|
| 362 |
-
if len(explanation
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
return False
|
| 366 |
|
| 367 |
return True
|
| 368 |
|
|
@@ -388,77 +380,116 @@ def update_row_in_supabase(row_id: str, fixed_data: Dict[str, Any]) -> bool:
|
|
| 388 |
return False
|
| 389 |
|
| 390 |
def process_row(row: Dict[str, Any], progress_counter: AtomicCounter, total_rows: int, row_number: int) -> Dict[str, Any]:
|
| 391 |
-
"""Process a single row
|
| 392 |
-
result = {
|
| 393 |
-
'row_id': row.get('id'),
|
| 394 |
-
'success': False,
|
| 395 |
-
'message': '',
|
| 396 |
-
'changes_made': []
|
| 397 |
-
}
|
| 398 |
-
|
| 399 |
try:
|
| 400 |
row_id = row.get('id')
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 430 |
|
| 431 |
# Generate fixed content
|
| 432 |
fixed_data = generate_fixed_content(row)
|
| 433 |
if not fixed_data:
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 452 |
else:
|
| 453 |
-
|
| 454 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 455 |
|
| 456 |
except Exception as e:
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
|
|
|
| 462 |
|
| 463 |
def fetch_all_unfixed_rows(supabase_client: Client, batch_size: int = 1000):
|
| 464 |
"""
|
|
@@ -499,6 +530,7 @@ def main():
|
|
| 499 |
total_rows = 0
|
| 500 |
success_count = 0
|
| 501 |
failure_count = 0
|
|
|
|
| 502 |
changes_by_field = {
|
| 503 |
'reading_passage': 0,
|
| 504 |
'question_text': 0,
|
|
@@ -534,6 +566,9 @@ def main():
|
|
| 534 |
# Update changes counter
|
| 535 |
for field in result['changes_made']:
|
| 536 |
changes_by_field[field] = changes_by_field.get(field, 0) + 1
|
|
|
|
|
|
|
|
|
|
| 537 |
else:
|
| 538 |
failure_count += 1
|
| 539 |
pbar.update(1)
|
|
@@ -547,6 +582,7 @@ def main():
|
|
| 547 |
f"Total questions processed: {total_rows}",
|
| 548 |
f"Successful updates: {success_count}",
|
| 549 |
f"Failed updates: {failure_count}",
|
|
|
|
| 550 |
f"Execution time: {execution_time:.2f} seconds",
|
| 551 |
"\nChanges by field:",
|
| 552 |
*[f"- {field}: {count}" for field, count in changes_by_field.items() if count > 0],
|
|
|
|
| 1 |
# fix.py
|
| 2 |
|
| 3 |
+
import concurrent.futures
|
| 4 |
+
import functools
|
| 5 |
import json
|
| 6 |
import logging
|
| 7 |
+
import os
|
| 8 |
+
import re
|
| 9 |
+
import threading
|
| 10 |
import time
|
| 11 |
from datetime import datetime
|
| 12 |
+
from typing import Any, Dict, Optional
|
|
|
|
| 13 |
|
|
|
|
|
|
|
|
|
|
| 14 |
from dotenv import load_dotenv
|
| 15 |
+
from openai import AzureOpenAI
|
| 16 |
from ratelimiter import RateLimiter
|
| 17 |
+
from supabase import Client, create_client
|
| 18 |
+
from tqdm import tqdm
|
| 19 |
+
|
| 20 |
|
| 21 |
# Set up logging with thread safety and custom formatting
|
| 22 |
class CustomFormatter(logging.Formatter):
|
|
|
|
| 81 |
SUPABASE_API_KEY = os.getenv("SUPABASE_API_KEY")
|
| 82 |
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
|
| 83 |
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
|
| 84 |
+
AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME_FIX", "gpt-4o-mini") # Use specific deployment for fixing
|
| 85 |
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2023-05-15")
|
| 86 |
|
| 87 |
# Validate environment variables
|
|
|
|
| 94 |
missing_vars.append("AZURE_OPENAI_KEY")
|
| 95 |
if not AZURE_OPENAI_ENDPOINT:
|
| 96 |
missing_vars.append("AZURE_OPENAI_ENDPOINT")
|
|
|
|
|
|
|
| 97 |
|
| 98 |
if missing_vars:
|
| 99 |
error_msg = f"Missing required environment variables: {', '.join(missing_vars)}"
|
|
|
|
| 164 |
Returns a dictionary with fixed content or None if generation fails.
|
| 165 |
"""
|
| 166 |
try:
|
| 167 |
+
# Determine if this is a math question
|
| 168 |
+
domain = row.get('domain', '').lower()
|
| 169 |
+
is_math = any(math_term in domain.lower() for math_term in ['math', 'algebra', 'geometry', 'calculus', 'arithmetic'])
|
| 170 |
+
|
| 171 |
+
# Create system message with domain-specific instructions
|
| 172 |
system_message = """You are an expert in standardized English test content. You must return your response as a valid JSON object with the following structure:
|
| 173 |
{
|
| 174 |
"reading_passage": "formatted passage text",
|
|
|
|
| 178 |
"option_c": "option C text",
|
| 179 |
"option_d": "option D text",
|
| 180 |
"explanation": "explanation text"
|
| 181 |
+
}"""
|
| 182 |
+
|
| 183 |
+
if is_math:
|
| 184 |
+
system_message += """
|
| 185 |
+
IMPORTANT: For ALL mathematics questions:
|
| 186 |
+
- You MUST set reading_passage to an empty string (""). No exceptions.
|
| 187 |
+
- Move any context or problem setup from the reading passage into the question_text
|
| 188 |
+
- The question_text should contain all necessary mathematical information
|
| 189 |
+
- Format: reading_passage must be "", question_text contains everything
|
| 190 |
+
|
| 191 |
+
Example math question format:
|
| 192 |
+
{
|
| 193 |
+
"reading_passage": "",
|
| 194 |
+
"question_text": "In the given system of equations, y = -1.5 and y = x^2 + 8x + a, where a is a positive constant. The system has exactly one distinct real solution. What is the value of a?",
|
| 195 |
+
...
|
| 196 |
+
}"""
|
| 197 |
+
else:
|
| 198 |
+
system_message += """
|
| 199 |
+
For reading comprehension questions:
|
| 200 |
+
- Format the reading_passage professionally with proper paragraphing
|
| 201 |
+
- Ensure the question is answerable from the passage
|
| 202 |
+
- Make answer options clear and distinct
|
| 203 |
+
- Reference the passage in the explanation"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
# Create user message with the content to fix
|
| 206 |
user_message = f"""Please format and fix the following exam content, returning a JSON object with the specified structure:
|
| 207 |
|
| 208 |
+
Domain: {domain}
|
| 209 |
+
|
| 210 |
Reading Passage:
|
| 211 |
{row.get('reading_passage', '')}
|
| 212 |
|
|
|
|
| 243 |
|
| 244 |
content = response.choices[0].message.content
|
| 245 |
|
| 246 |
+
# Calculate cost (gpt-4o-mini pricing)
|
| 247 |
+
input_tokens = (len(system_message) + len(user_message)) / 4 # Rough estimate: 4 chars per token
|
| 248 |
+
output_tokens = len(content) / 4
|
| 249 |
+
# gpt-4o-mini pricing:
|
| 250 |
+
# Input: $0.300 per 1M tokens
|
| 251 |
+
# Output: $1.200 per 1M tokens
|
| 252 |
+
fix_cost = (input_tokens / 1_000_000 * 0.300) + (output_tokens / 1_000_000 * 1.200)
|
| 253 |
+
logger.info(f"Estimated cost for fixing this question: ${fix_cost:.6f}")
|
| 254 |
+
|
| 255 |
try:
|
| 256 |
# Parse JSON response
|
| 257 |
fixed_data = json.loads(content)
|
| 258 |
|
| 259 |
+
# For math questions, ensure reading passage is empty
|
| 260 |
+
if is_math and fixed_data.get('reading_passage', '').strip():
|
| 261 |
+
# Move reading passage content to question text if needed
|
| 262 |
+
current_passage = fixed_data.get('reading_passage', '').strip()
|
| 263 |
+
current_question = fixed_data.get('question_text', '').strip()
|
| 264 |
+
if current_passage:
|
| 265 |
+
fixed_data['question_text'] = f"{current_passage} {current_question}"
|
| 266 |
+
fixed_data['reading_passage'] = ""
|
| 267 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
# Copy over unchanged fields
|
| 269 |
for key in row:
|
| 270 |
if key not in fixed_data and key != 'id':
|
| 271 |
fixed_data[key] = row[key]
|
| 272 |
+
|
| 273 |
+
# Add the fix cost to the data
|
| 274 |
+
fixed_data['fix_cost'] = fix_cost
|
| 275 |
|
| 276 |
return fixed_data
|
| 277 |
|
|
|
|
| 302 |
|
| 303 |
# Words and phrases suggesting that a question depends on a visual asset.
# They are matched on word boundaries (with an optional plural "s") rather
# than as raw substrings, so that e.g. "paragraph" does not match "graph" and
# "photosynthesis" does not match "photo".
IMAGE_REFERENCE_PATTERN = re.compile(
    r"\b(?:image|picture|diagram|graph|figure|photo|photograph|illustration|"
    r"shown|depicted|displayed|above|below|following figure|"
    r"look at the|in this picture|as shown|pictured)s?\b"
)


def check_row_quality(row: Dict[str, Any]) -> Optional[bool]:
    """
    Check if a row meets quality standards.

    Args:
        row: One exam-content record, keyed by column name.

    Returns:
        True if the row is already marked as fixed or passes every check
        below; None if the row should be deleted (image reference, missing
        field or option, invalid correct answer, duplicate options, or an
        explanation shorter than 50 characters). This function never
        returns False.
    """
    row_id = row.get('id')

    # Skip if already marked as fixed
    if row.get('is_fixed', False):
        return True

    # A column that exists but holds NULL comes back as None, so fall back
    # to '' before calling string methods on it.
    question_text = str(row.get('question_text') or '').lower()
    reading_passage = str(row.get('reading_passage') or '').lower()

    # Check if question or passage refers to images
    if IMAGE_REFERENCE_PATTERN.search(question_text) or IMAGE_REFERENCE_PATTERN.search(reading_passage):
        logger.info(f"Row {row_id}: Marked for deletion - contains image references")
        return None  # Return None to indicate deletion

    # Basic validation for required fields
    if not row.get('question_text') or not row.get('explanation'):
        logger.info(f"Row {row_id}: Marked for deletion - missing required fields")
        return None

    option_keys = [f'option_{opt}' for opt in ('a', 'b', 'c', 'd')]
    if not all(row.get(key) for key in option_keys):
        logger.info(f"Row {row_id}: Marked for deletion - missing options")
        return None

    if not is_valid_correct_answer(row.get('correct_answer', '')):
        logger.info(f"Row {row_id}: Marked for deletion - invalid correct answer")
        return None

    # Option quality checks: whitespace-only options count as empty
    options = [str(row.get(key) or '').strip() for key in option_keys]
    if not all(options):
        logger.info(f"Row {row_id}: Marked for deletion - empty options")
        return None

    # Check for duplicate options
    if len(set(options)) != 4:
        logger.info(f"Row {row_id}: Marked for deletion - duplicate options")
        return None

    # Basic explanation quality check
    explanation = str(row.get('explanation') or '')
    if len(explanation) < 50 or not explanation.strip():
        logger.info(f"Row {row_id}: Marked for deletion - insufficient explanation")
        return None

    return True
|
| 360 |
|
|
|
|
| 380 |
return False
|
| 381 |
|
| 382 |
def _row_result(row_id: Any, success: bool, changes_made: Optional[list] = None, cost: float = 0.0) -> Dict[str, Any]:
    """Build the result record returned by process_row, always with the same four keys."""
    return {
        'success': success,
        'changes_made': list(changes_made) if changes_made else [],
        'row_id': row_id,
        'cost': cost,
    }


def process_row(row: Dict[str, Any], progress_counter: AtomicCounter, total_rows: int, row_number: int) -> Dict[str, Any]:
    """Process a single row and return the result.

    The row is first classified by check_row_quality:
      * None  -> the row is deleted from the "exam_contents" table.
      * True  -> the row is only flagged with is_fixed=True.
      * other -> generate_fixed_content is asked for a corrected version,
                 which is written back together with is_fixed=True.

    Args:
        row: The record to process; its 'id' identifies the database row.
        progress_counter: Not used by this function.
        total_rows: Not used by this function.
        row_number: Not used by this function.

    Returns:
        A dict with the keys 'success', 'changes_made', 'row_id' and 'cost'.
        'changes_made' is an empty list on failure, so callers can always
        index it. Exceptions are logged and reported as a failed result
        rather than raised.
    """
    try:
        row_id = row.get('id')

        # Check quality first
        quality_check = check_row_quality(row)

        # If quality_check is None, delete the row
        if quality_check is None:
            try:
                supabase.table("exam_contents").delete().eq("id", row_id).execute()
            except Exception as e:
                logger.error(f"Row {row_id}: Failed to delete - {str(e)}")
                return _row_result(row_id, False)
            logger.info(f"Row {row_id}: Deleted due to quality issues.")
            return _row_result(row_id, True, ['deleted'])

        # If row passes quality check, no need to fix
        if quality_check is True:
            try:
                supabase.table("exam_contents").update({"is_fixed": True}).eq("id", row_id).execute()
            except Exception as e:
                logger.error(f"Row {row_id}: Failed to update fixed status - {str(e)}")
                return _row_result(row_id, False)
            logger.info(f"Row {row_id}: Already good quality. Marked as fixed.")
            return _row_result(row_id, True, ['marked_fixed'])

        # NOTE(review): check_row_quality as visible in this file returns only
        # True or None, so nothing below appears reachable unless it is
        # changed to return another value (e.g. False) - confirm intent.

        # Generate fixed content
        fixed_data = generate_fixed_content(row)
        if not fixed_data:
            logger.error(f"Row {row_id}: Failed to generate fixed content.")
            return _row_result(row_id, False)

        fix_cost = fixed_data.get('fix_cost', 0.0)

        # Track what fields were modified
        changes_made = [
            field for field in fixed_data
            if field in row and fixed_data[field] != row[field]
        ]

        if not changes_made:
            # No changes needed, just mark as fixed
            try:
                supabase.table("exam_contents").update({"is_fixed": True}).eq("id", row_id).execute()
            except Exception as e:
                logger.error(f"Row {row_id}: Failed to update fixed status - {str(e)}")
                return _row_result(row_id, False)
            logger.info(f"Row {row_id}: Fixed successfully. Modified: No changes needed")
            # The model was still called, so its cost is reported
            return _row_result(row_id, True, ['marked_fixed'], fix_cost)

        # Add is_fixed flag
        fixed_data['is_fixed'] = True

        # Update in database
        # NOTE(review): fixed_data still carries 'fix_cost' here; this presumably
        # requires a matching column on exam_contents - verify.
        try:
            supabase.table("exam_contents").update(fixed_data).eq("id", row_id).execute()
        except Exception as e:
            logger.error(f"Row {row_id}: Failed to update - {str(e)}")
            return _row_result(row_id, False)
        change_list = ', '.join(changes_made)
        logger.info(f"Row {row_id}: Fixed successfully. Modified: {change_list}")
        return _row_result(row_id, True, changes_made, fix_cost)

    except Exception as e:
        # Last-resort boundary: one bad row must not abort the whole batch
        logger.error(f"Error processing row {row.get('id', 'unknown')}: {str(e)}")
        return _row_result(row.get('id', 'unknown'), False)
|
| 493 |
|
| 494 |
def fetch_all_unfixed_rows(supabase_client: Client, batch_size: int = 1000):
|
| 495 |
"""
|
|
|
|
| 530 |
total_rows = 0
|
| 531 |
success_count = 0
|
| 532 |
failure_count = 0
|
| 533 |
+
total_cost = 0.0
|
| 534 |
changes_by_field = {
|
| 535 |
'reading_passage': 0,
|
| 536 |
'question_text': 0,
|
|
|
|
| 566 |
# Update changes counter
|
| 567 |
for field in result['changes_made']:
|
| 568 |
changes_by_field[field] = changes_by_field.get(field, 0) + 1
|
| 569 |
+
# Add cost if available
|
| 570 |
+
if 'cost' in result:
|
| 571 |
+
total_cost += result['cost']
|
| 572 |
else:
|
| 573 |
failure_count += 1
|
| 574 |
pbar.update(1)
|
|
|
|
| 582 |
f"Total questions processed: {total_rows}",
|
| 583 |
f"Successful updates: {success_count}",
|
| 584 |
f"Failed updates: {failure_count}",
|
| 585 |
+
f"Total cost: ${total_cost:.6f}",
|
| 586 |
f"Execution time: {execution_time:.2f} seconds",
|
| 587 |
"\nChanges by field:",
|
| 588 |
*[f"- {field}: {count}" for field, count in changes_by_field.items() if count > 0],
|