| | |
| | """ |
| | Simple TOEFL Independent Speaking Judge - Command Line Interface |
| | Usage: python toefl_judge_cli.py |
| | """ |
| |
|
| | from mlx_lm import load, generate |
| | import re |
| | import argparse |
| | import sys |
| |
|
| | class SimpleTOEFLJudge: |
| | def __init__(self, model_path="mlx-community/Llama-3.2-3B-Instruct-4bit", adapter_path="toefl_judge_adapter"): |
| | """Initialize the TOEFL judge with model paths""" |
| | self.model_path = model_path |
| | self.adapter_path = adapter_path |
| | self.model = None |
| | self.tokenizer = None |
| | |
| | print("π TOEFL Independent Speaking Judge") |
| | print("="*50) |
| | print("Loading model... (this may take a moment)") |
| | |
| | self.load_model() |
| | |
| | def load_model(self): |
| | """Load the fine-tuned model""" |
| | try: |
| | self.model, self.tokenizer = load( |
| | path_or_hf_repo=self.model_path, |
| | adapter_path=self.adapter_path |
| | ) |
| | print("β
Model loaded successfully!") |
| | print(f"π Model Performance: 86.1% accuracy, 0.943 correlation") |
| | print("="*50) |
| | except Exception as e: |
| | print(f"β Error loading model: {e}") |
| | print("Please check that your model and adapter paths are correct.") |
| | sys.exit(1) |
| | |
| | def create_prompt(self, question, answer): |
| | """Create the evaluation prompt""" |
| | system_prompt = """You are an expert TOEFL iBT Independent Speaking evaluator. Your task is to score student responses on a scale of 0-4 based on the official TOEFL Independent Speaking rubric. |
| | |
| | Scoring Criteria: |
| | - Score 4: Fulfills task demands with sustained, coherent discourse. Well-paced delivery, effective grammar/vocabulary, well-developed and coherent ideas. |
| | - Score 3: Addresses task appropriately but may lack full development. Generally clear speech with some fluency, fairly effective language use, mostly coherent with some limitations. |
| | - Score 2: Addresses task but limited development. Intelligible speech requiring listener effort, limited grammar/vocabulary range, basic ideas with limited elaboration. |
| | - Score 1: Very limited content/coherence, largely unintelligible speech, severely limited language control, lacks substance beyond basic ideas. |
| | - Score 0: No attempt or unrelated to topic. |
| | |
| | Provide your score and a brief explanation focusing on delivery, language use, and topic development.""" |
| | |
| | prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> |
| | {system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|> |
| | Please evaluate this TOEFL Independent Speaking response: |
| | |
| | Question: {question} |
| | |
| | Student Response: {answer}<|eot_id|><|start_header_id|>assistant<|end_header_id|> |
| | """ |
| | return prompt |
| | |
| | def extract_score(self, response): |
| | """Extract numerical score from response""" |
| | score_match = re.search(r'Score:\s*(\d+)', response) |
| | return int(score_match.group(1)) if score_match else None |
| | |
| | def evaluate(self, question, answer): |
| | """Evaluate a TOEFL response""" |
| | print("π€ Evaluating response...") |
| | |
| | try: |
| | prompt = self.create_prompt(question, answer) |
| | |
| | response = generate( |
| | self.model, |
| | self.tokenizer, |
| | prompt=prompt, |
| | max_tokens=400 |
| | ) |
| | |
| | score = self.extract_score(response) |
| | |
| | return { |
| | 'score': score, |
| | 'response': response, |
| | 'success': True |
| | } |
| | |
| | except Exception as e: |
| | return { |
| | 'score': None, |
| | 'response': f"Error: {e}", |
| | 'success': False |
| | } |
| | |
| | def print_result(self, question, answer, result): |
| | """Print evaluation results in a nice format""" |
| | print("\n" + "="*60) |
| | print("π EVALUATION RESULTS") |
| | print("="*60) |
| | |
| | print(f"\n㪠Question:") |
| | print(f"{question}") |
| | |
| | print(f"\nπ£οΈ Student Response:") |
| | print(f"{answer}") |
| | |
| | if result['success'] and result['score'] is not None: |
| | score = result['score'] |
| | |
| | |
| | if score == 4: |
| | print(f"\nπ SCORE: {score}/4 (Excellent)") |
| | elif score == 3: |
| | print(f"\nβ
SCORE: {score}/4 (Good)") |
| | elif score == 2: |
| | print(f"\nβ οΈ SCORE: {score}/4 (Limited)") |
| | elif score == 1: |
| | print(f"\nβ SCORE: {score}/4 (Very Limited)") |
| | else: |
| | print(f"\nπ SCORE: {score}/4 (No Response)") |
| | |
| | print(f"\nπ Detailed Feedback:") |
| | print(f"{result['response']}") |
| | |
| | else: |
| | print(f"\nβ Evaluation failed:") |
| | print(f"{result['response']}") |
| | |
| | print("\n" + "="*60) |
| | |
| | def interactive_mode(self): |
| | """Run in interactive mode""" |
| | print("\nπ― Interactive Mode") |
| | print("Enter 'quit' or 'exit' to stop, 'help' for assistance") |
| | print("-" * 50) |
| | |
| | while True: |
| | try: |
| | |
| | print("\nπ Enter the TOEFL Speaking question:") |
| | question = input("> ").strip() |
| | |
| | if question.lower() in ['quit', 'exit', 'q']: |
| | print("π Goodbye!") |
| | break |
| | elif question.lower() == 'help': |
| | self.show_help() |
| | continue |
| | elif not question: |
| | print("β οΈ Please enter a question.") |
| | continue |
| | |
| | |
| | print("\nπ£οΈ Enter the student's response:") |
| | answer = input("> ").strip() |
| | |
| | if not answer: |
| | print("β οΈ Please enter a response.") |
| | continue |
| | |
| | |
| | result = self.evaluate(question, answer) |
| | self.print_result(question, answer, result) |
| | |
| | |
| | print("\nπ Evaluate another response? (y/n)") |
| | continue_choice = input("> ").strip().lower() |
| | if continue_choice in ['n', 'no']: |
| | print("π Goodbye!") |
| | break |
| | |
| | except KeyboardInterrupt: |
| | print("\nπ Goodbye!") |
| | break |
| | except Exception as e: |
| | print(f"β Error: {e}") |
| | |
| | def show_help(self): |
| | """Show help information""" |
| | print("\nπ HELP - TOEFL Independent Speaking Judge") |
| | print("="*50) |
| | print("This tool evaluates TOEFL Independent Speaking responses on a 0-4 scale.") |
| | print("\nScoring Rubric:") |
| | print("π Score 4: Excellent - Sustained, coherent discourse") |
| | print("β
Score 3: Good - Mostly coherent with minor limitations") |
| | print("β οΈ Score 2: Limited - Basic ideas with limited development") |
| | print("β Score 1: Very Limited - Major language/content issues") |
| | print("π Score 0: No attempt or unrelated to topic") |
| | print("\nTips:") |
| | print("β’ Enter the exact question as given in TOEFL") |
| | print("β’ Provide the student's transcribed spoken response") |
| | print("β’ The model evaluates based on language use, delivery, and topic development") |
| | print("β’ Type 'quit' or 'exit' to stop") |
| | print("="*50) |
| |
|
| | def main(): |
| | """Main function with command line argument support""" |
| | parser = argparse.ArgumentParser(description='TOEFL Independent Speaking Judge') |
| | parser.add_argument('--model', default='mlx-community/Llama-3.2-3B-Instruct-4bit', |
| | help='Model path') |
| | parser.add_argument('--adapter', default='toefl_judge_adapter', |
| | help='Adapter path') |
| | parser.add_argument('--question', '-q', help='TOEFL question') |
| | parser.add_argument('--answer', '-a', help='Student response') |
| | parser.add_argument('--interactive', '-i', action='store_true', |
| | help='Run in interactive mode') |
| | |
| | args = parser.parse_args() |
| | |
| | |
| | judge = SimpleTOEFLJudge(model_path=args.model, adapter_path=args.adapter) |
| | |
| | |
| | if args.question and args.answer: |
| | result = judge.evaluate(args.question, args.answer) |
| | judge.print_result(args.question, args.answer, result) |
| | |
| | |
| | else: |
| | judge.interactive_mode() |
| |
|
| | |
| | SAMPLE_QUESTIONS = [ |
| | "Do you agree or disagree with the following statement: it is better to work in teams than to work alone? Use specific examples to support your answer.", |
| | "Some people prefer to live in small towns, while others prefer big cities. Which do you prefer and why?", |
| | "Do you think students should be required to wear uniforms in school? Explain your opinion with specific reasons.", |
| | "Would you rather have a job that pays well but is stressful, or a job that pays less but is enjoyable? Explain your choice.", |
| | "Some people believe that technology has made our lives easier, while others think it has made life more complicated. What is your opinion?" |
| | ] |
| |
|
| | if __name__ == "__main__": |
| | print("π Quick Test Mode") |
| | print("Want to try a sample question? Here are some examples:") |
| | for i, q in enumerate(SAMPLE_QUESTIONS[:3], 1): |
| | print(f"{i}. {q}") |
| | print("\nStarting TOEFL Judge...") |
| | print("-" * 70) |
| | |
| | main() |