agentic-language-partner

Sleeping

File size: 9,726 Bytes

aa3fdef

# -*- coding: utf-8 -*-
"""
Difficulty Scorer - Multi-language Support

Supports 6 languages with proficiency test databases:
- English (en): CEFR A1-C2
- Chinese (zh-cn): HSK 1-6
- German (de): CEFR A1-C2
- Spanish (es): CEFR A1-C2
- Japanese (ja): JLPT N5-N1
- Korean (ko): TOPIK 1-6
"""

import json
from typing import Dict, Any, List, Optional
from pathlib import Path


class DifficultyScorer:
    """Score vocabulary difficulty from proficiency-test word lists.

    Each test system (CEFR, HSK, JLPT, TOPIK) is read from
    ``<data_dir>/<system>/<system>_words.json``. A database is a JSON object
    with a ``"levels"`` mapping; every level entry may carry a ``"score"``
    (defaults to 3) and word lists. CEFR levels hold one list per language
    code (``"en"``, ``"de"``, ...); the other systems hold a single
    ``"words"`` list. Missing or unreadable databases are skipped, in which
    case scoring falls back to a word-length estimate.
    """

    # Language code -> test system whose word list is used to score it.
    LANGUAGE_TESTS = {
        'en': 'cefr',
        'de': 'cefr',
        'es': 'cefr',
        'fr': 'cefr',
        'it': 'cefr',
        'zh-cn': 'hsk',
        'zh-tw': 'hsk',
        'ja': 'jlpt',
        'ko': 'topik',
        'ru': 'cefr',
    }

    # JLPT level label -> numeric rank (N5 maps to the lowest rank).
    # Not referenced by the methods of this class; kept as public data.
    JLPT_MAPPING = {
        'N5': 1, 'N4': 2, 'N3': 3, 'N2': 4, 'N1': 5
    }

    # Languages whose words are stored and looked up verbatim; every other
    # language is case-folded with str.lower() on both sides.
    _CASE_PRESERVING_LANGUAGES = frozenset({'zh-cn', 'zh-tw', 'ja', 'ko'})

    # Score given to a level entry that has no "score" field.
    _DEFAULT_LEVEL_SCORE = 3

    def __init__(self, data_dir: Optional[str] = None):
        """
        Initialize multi-language difficulty scorer

        Args:
            data_dir: Path (str or Path) to the data directory containing the
                proficiency databases. When omitted, ``data`` under the
                directory two levels above this file's directory is used.
        """
        if data_dir is None:
            current_dir = Path(__file__).parent
            project_root = current_dir.parent.parent
            data_dir = project_root / "data"

        self.data_dir = Path(data_dir)
        self.databases = self._load_all_databases()
        self.word_lookups = self._create_word_lookups()

    def _load_all_databases(self) -> Dict[str, Dict]:
        """Load every proficiency database that exists under ``data_dir``.

        Returns:
            Mapping of test-system name (``'cefr'``, ``'hsk'``, ...) to the
            parsed JSON object. Systems whose file is missing, unreadable,
            not valid JSON, or not a JSON object are left out; all but the
            missing-file case print a diagnostic.
        """
        databases: Dict[str, Dict] = {}

        # dict.fromkeys de-duplicates while keeping first-seen order, so
        # every test system is loaded exactly once.
        for test_name in dict.fromkeys(self.LANGUAGE_TESTS.values()):
            db_path = self.data_dir / test_name / f"{test_name}_words.json"
            if not db_path.exists():
                continue

            label = test_name.upper()
            try:
                with open(db_path, 'r', encoding='utf-8') as f:
                    payload = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
                print(f"[DifficultyScorer] Failed to load {label}: {e}")
                continue

            if not isinstance(payload, dict):
                # A non-object root cannot hold "levels"; treat it as a
                # failed load instead of carrying a unusable value around.
                print(
                    f"[DifficultyScorer] Failed to load {label}: "
                    "expected a JSON object at the top level"
                )
                continue

            databases[test_name] = payload

        return databases

    def _create_word_lookups(self) -> Dict[str, Dict[str, int]]:
        """Create word-to-score lookup tables for all languages.

        Returns:
            Mapping of language code to ``{word: score}``. A language gets an
            entry (possibly empty) only when its test system's database was
            loaded.
        """
        lookups: Dict[str, Dict[str, int]] = {}

        for lang_code, test_name in self.LANGUAGE_TESTS.items():
            database = self.databases.get(test_name)
            if database is None:
                continue

            # CEFR levels keep one word list per language code; the other
            # systems keep a single shared "words" list per level.
            words_key = lang_code if test_name == 'cefr' else 'words'
            fold_case = lang_code not in self._CASE_PRESERVING_LANGUAGES
            lookups[lang_code] = self._build_lookup(database, words_key, fold_case)

        return lookups

    @classmethod
    def _build_lookup(cls, database: Any, words_key: str, fold_case: bool) -> Dict[str, int]:
        """Flatten one database into a ``{word: score}`` table.

        Malformed parts (non-dict levels, non-list word collections,
        non-string words) are skipped rather than aborting construction.
        When a word appears in several levels, the last level read wins.
        """
        lookup: Dict[str, int] = {}

        levels = database.get('levels') if isinstance(database, dict) else None
        if not isinstance(levels, dict):
            return lookup

        for level_data in levels.values():
            if not isinstance(level_data, dict):
                continue

            score = level_data.get('score', cls._DEFAULT_LEVEL_SCORE)
            words = level_data.get(words_key, [])
            # A bare string would be iterated character by character, so only
            # real collections are accepted (a dict contributes its keys).
            if not isinstance(words, (list, tuple, dict)):
                continue

            for word in words:
                if isinstance(word, str):
                    lookup[word.lower() if fold_case else word] = score

        return lookup

    @staticmethod
    def _normalize_language(language: Optional[str]) -> str:
        """Return a canonical language code for table lookups.

        Trims whitespace, lower-cases, and turns ``_`` into ``-`` so that
        ``'zh_CN'`` matches ``'zh-cn'``. Anything that is not a string
        (including ``None``) becomes ``''``, which matches no language.
        """
        if not isinstance(language, str):
            return ''
        return language.strip().lower().replace('_', '-')

    def get_proficiency_score(self, word: str, language: str) -> float:
        """
        Get proficiency test score for a word

        Args:
            word: Word or phrase
            language: Language code (case-insensitive; ``_`` and ``-`` are
                treated alike)

        Returns:
            The score stored for the word in the language's lookup table, or
            a length-based estimate (2.0-5.5) when the language or the word
            is not in the tables.
        """
        language = self._normalize_language(language)

        lookup = self.word_lookups.get(language)
        if lookup is None:
            return self._estimate_by_length(word)

        if language in self._CASE_PRESERVING_LANGUAGES:
            search_word = word
        else:
            search_word = word.lower()

        score = lookup.get(search_word)
        if score is None:
            return self._estimate_by_length(word)

        return float(score)

    def _estimate_by_length(self, word: str) -> float:
        """Estimate difficulty by word length (fallback when no data exists)."""
        length = len(word)
        if length <= 3:
            return 2.0
        elif length <= 6:
            return 3.5
        elif length <= 10:
            return 4.5
        else:
            return 5.5

    def get_length_score(self, word: str) -> float:
        """Score a word from 1.0 (shortest) to 6.0 (longest) by its length."""
        length = len(word)
        # "<= 1" so that an empty string is not scored harder than a
        # single-character word.
        if length <= 1:
            return 1.0
        elif length <= 3:
            return 2.0
        elif length <= 6:
            return 3.0
        elif length <= 10:
            return 4.0
        elif length <= 15:
            return 5.0
        else:
            return 6.0

    def calculate_difficulty(self, word: str, language: str) -> Dict[str, Any]:
        """
        Calculate comprehensive difficulty score

        Weights:
        - Proficiency level: 60%
        - Word length: 40%

        Args:
            word: Word or phrase to score
            language: Language code

        Returns:
            Dict with ``overall_score`` (rounded to 2 decimals), ``level``
            (``beginner`` up to 2.5, ``intermediate`` up to 4.5, otherwise
            ``advanced``) and ``factors`` (the component scores, the word
            length and the upper-cased test system name, ``UNKNOWN`` for
            languages without a mapping).
        """
        proficiency_score = self.get_proficiency_score(word, language)
        length_score = self.get_length_score(word)

        overall_score = proficiency_score * 0.6 + length_score * 0.4

        if overall_score <= 2.5:
            level = "beginner"
        elif overall_score <= 4.5:
            level = "intermediate"
        else:
            level = "advanced"

        test_name = self.LANGUAGE_TESTS.get(self._normalize_language(language), 'unknown')

        return {
            "overall_score": round(overall_score, 2),
            "level": level,
            "factors": {
                "proficiency_score": round(proficiency_score, 2),
                "length": len(word),
                "length_score": round(length_score, 2),
                "test_system": test_name.upper()
            }
        }

    def score_flashcard(self, card: Dict[str, Any]) -> Dict[str, Any]:
        """Return a shallow copy of ``card`` with a ``difficulty`` entry added.

        The word is read from ``card['front']`` and the language from
        ``card['language']``. A missing, ``None`` or empty value falls back
        to ``''`` and ``'en'`` respectively. The input card is not modified.
        """
        # "or" also covers keys that are present but hold None / ''.
        word = card.get('front') or ''
        language = card.get('language') or 'en'

        card_with_difficulty = card.copy()
        card_with_difficulty['difficulty'] = self.calculate_difficulty(word, language)

        return card_with_difficulty

    def score_all_flashcards(self, flashcards: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Score all flashcards, returning new card dicts in the same order."""
        return [self.score_flashcard(card) for card in flashcards]

    def get_statistics(self, flashcards: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Generate difficulty statistics.

        Only cards carrying a well-formed ``difficulty`` dict (with a
        ``level`` and a numeric ``overall_score``) contribute to the level
        counts, per-language figures and score aggregates; ``total_cards``
        counts every card passed in.

        Returns:
            ``{}`` for an empty input, otherwise a dict with ``total_cards``,
            ``by_level``, ``by_language`` (``count`` and ``avg_score`` per
            language), ``average_score``, ``min_score`` and ``max_score``
            (the last three are 0 when no card was scored).
        """
        if not flashcards:
            return {}

        level_counts = {"beginner": 0, "intermediate": 0, "advanced": 0}
        scores: List[float] = []
        scores_by_language: Dict[str, List[float]] = {}

        for card in flashcards:
            difficulty = card.get('difficulty') if isinstance(card, dict) else None
            if not isinstance(difficulty, dict):
                continue

            level = difficulty.get('level')
            score = difficulty.get('overall_score')
            if level is None or not isinstance(score, (int, float)):
                continue

            # Tolerate level labels outside the three predefined ones
            # instead of raising KeyError.
            level_counts[level] = level_counts.get(level, 0) + 1
            scores.append(score)

            lang = card.get('language', 'unknown')
            scores_by_language.setdefault(lang, []).append(score)

        by_language = {
            lang: {
                "count": len(lang_scores),
                "avg_score": round(sum(lang_scores) / len(lang_scores), 2),
            }
            for lang, lang_scores in scores_by_language.items()
        }

        return {
            "total_cards": len(flashcards),
            "by_level": level_counts,
            "by_language": by_language,
            "average_score": round(sum(scores) / len(scores), 2) if scores else 0,
            "min_score": round(min(scores), 2) if scores else 0,
            "max_score": round(max(scores), 2) if scores else 0
        }


# Module-wide scorer, built on first request so the JSON databases are only
# read when a scorer is actually needed.
_difficulty_scorer = None


def get_difficulty_scorer() -> DifficultyScorer:
    """Return the shared DifficultyScorer, constructing it on first use."""
    global _difficulty_scorer
    scorer = _difficulty_scorer
    if scorer is None:
        # First call: build with the default data directory and remember it.
        scorer = DifficultyScorer()
        _difficulty_scorer = scorer
    return scorer