# File size: 11,576 Bytes

"""

Moduł do ekstrakcji podstawowych, statystycznych cech tekstu.



Zawiera funkcje do analizy na poziomie znaków, słów i linii, które nie

wymagają zaawansowanych modeli lingwistycznych.

"""
import re
from collections import Counter
from typing import Dict, List

from ..utils import safe_divide
from ..constants import (PUNCTUATION_PATTERN, EXCESSIVE_SPACES_PATTERN,
                       ALLOWED_CHARS_PATTERN, COMMON_CHARACTERS)

# --- Funkcje analizujące znaki ---

def analyze_character_stats(text: str, text_lower: str) -> Dict[str, float]:
    """Compute basic character-level statistics for a text.

    Args:
        text: The raw text to analyse.
        text_lower: Text used only for ``unique_characters_lower``;
            presumably the lower-cased form of ``text`` (supplied by the
            caller, not derived here).

    Returns:
        A dict with:
          * ``characters``: length of ``text``.
          * ``digit_count`` / ``digit_ratio``: number of digit characters and
            that number divided by the text length.
          * ``overall_uppercase_ratio``: uppercase characters divided by the
            text length.
          * ``unique_characters_all`` / ``unique_characters_lower``: number of
            distinct characters in ``text`` and in ``text_lower``.
          * ``characters_out_of_common``: number of characters (repetitions
            included) that are not in ``COMMON_CHARACTERS``.
          * ``tabs``: number of tab characters.
          * ``multispaces``: number of ``EXCESSIVE_SPACES_PATTERN`` matches.
        For empty ``text`` every value is zero.
    """
    total_chars = len(text)
    if not total_chars:
        return {
            'characters': 0, 'digit_count': 0, 'digit_ratio': 0.0,
            'overall_uppercase_ratio': 0.0, 'unique_characters_all': 0,
            'unique_characters_lower': 0, 'characters_out_of_common': 0,
            'tabs': 0, 'multispaces': 0
        }

    # Scan for digits once and reuse the result for both the count and ratio.
    digit_count = sum(ch.isdigit() for ch in text)
    uppercase_count = sum(ch.isupper() for ch in text)

    return {
        'characters': total_chars,
        'digit_count': digit_count,
        'digit_ratio': safe_divide(digit_count, total_chars),
        'overall_uppercase_ratio': safe_divide(uppercase_count, total_chars),
        'unique_characters_all': len(set(text)),
        'unique_characters_lower': len(set(text_lower)),
        'characters_out_of_common': sum(1 for c in text if c not in COMMON_CHARACTERS),
        'tabs': text.count('\t'),
        'multispaces': len(EXCESSIVE_SPACES_PATTERN.findall(text)),
    }

def analyze_punctuation_stats(text: str) -> Dict[str, float]:
    """Measure punctuation density and bracket usage in a text.

    Returns:
        A dict with:
          * ``punct_frequency``: number of ``PUNCTUATION_PATTERN`` matches
            divided by the text length.
          * ``bracet_count``: total number of ``(``, ``)``, ``[`` and ``]``
            characters (the key spelling is part of the output contract).
          * ``bracket_ratio``: number of ``[`` divided by number of ``]``.
          * ``count_special_chars``: number of non-overlapping runs of three
            or more consecutive ``?`` / ``!`` characters.
        For empty ``text`` every value is zero.
    """
    text_length = len(text)
    if text_length == 0:
        return {
            'punct_frequency': 0.0,
            'bracet_count': 0,
            'bracket_ratio': 0.0,
            'count_special_chars': 0,
        }

    # One count per bracket symbol, keyed by the symbol itself.
    occurrences = {symbol: text.count(symbol) for symbol in '()[]'}
    punctuation_hits = PUNCTUATION_PATTERN.findall(text)
    emphasis_runs = re.findall(r'(\?|!){3,}', text)

    result = {}
    result['punct_frequency'] = safe_divide(len(punctuation_hits), text_length)
    result['bracet_count'] = sum(occurrences.values())
    result['bracket_ratio'] = safe_divide(occurrences['['], occurrences[']'])
    result['count_special_chars'] = len(emphasis_runs)
    return result

def analyze_advanced_char_features(text: str) -> Dict[str, float]:
    """Compute word-frequency, diacritic, encoding and one-character-word features.

    Formerly ``analyze_char_features``. Words are the non-overlapping runs of
    regex word characters found in ``text`` and are counted case-sensitively.

    Returns:
        A dict with four groups of keys:
          * word frequency: ``word_count``, ``unique_word_count``,
            ``top_word_count``, ``top_word_ratio``, ``top_5_ratio``,
            ``top_10_ratio``, ``hapax_legomena_ratio`` (words occurring once
            divided by ``word_count``) and ``looping_suspicion`` (1 when the
            top word exceeds 15% of all words or the top five exceed 40%).
          * Polish diacritics: ``polish_diacritics_count`` and its ratios to
            the text length, the word count and the number of letters.
          * encoding: ``replacement_char_count`` (U+FFFD),
            ``not_allowed_chars_count`` (characters not matched by
            ``ALLOWED_CHARS_PATTERN``), their ratios to the text length, and
            ``encoding_suspicion`` (1 when the replacement ratio exceeds 1%
            or the not-allowed ratio exceeds 5%).
          * one-character words: total/unique counts, upper/lower-case
            counts, and the code points of the three most frequent ones
            (0 where fewer than three exist).

        Empty text yields all zeros. Text that has characters but no words
        still gets its encoding features computed; all other values are zero.
    """
    features = {
        'word_count': 0, 'unique_word_count': 0, 'top_word_count': 0, 'top_word_ratio': 0.0,
        'top_5_ratio': 0.0, 'top_10_ratio': 0.0, 'hapax_legomena_ratio': 0.0,
        'looping_suspicion': 0, 'polish_diacritics_count': 0, 'polish_diacritics_ratio': 0.0,
        'polish_diacritics_per_word': 0.0, 'diacritics_to_letters_ratio': 0.0,
        'replacement_char_count': 0, 'replacement_char_ratio': 0.0,
        'not_allowed_chars_count': 0, 'not_allowed_chars_ratio': 0.0,
        'encoding_suspicion': 0, 'single_char_word_count': 0, 'single_char_unique_count': 0,
        'single_char_upper_count': 0, 'single_char_lower_count': 0,
        'single_char_upper_unique_count': 0, 'single_char_lower_unique_count': 0,
        'single_char_top_1_codepoint': 0, 'single_char_top_2_codepoint': 0,
        'single_char_top_3_codepoint': 0
    }

    total_chars = len(text)
    if not total_chars:
        return features

    char_counts = Counter(text)

    # Encoding diagnostics are purely character-level, so compute them before
    # the word check: text consisting only of replacement characters or
    # symbols contains no words, yet it is exactly what these features flag.
    replacement_count = char_counts.get('\uFFFD', 0)
    not_allowed_count = sum(1 for ch in text if not ALLOWED_CHARS_PATTERN.match(ch))
    replacement_ratio = safe_divide(replacement_count, total_chars)
    not_allowed_ratio = safe_divide(not_allowed_count, total_chars)
    features.update({
        'replacement_char_count': replacement_count,
        'replacement_char_ratio': replacement_ratio,
        'not_allowed_chars_count': not_allowed_count,
        'not_allowed_chars_ratio': not_allowed_ratio,
        'encoding_suspicion': 1 if (replacement_ratio > 0.01 or not_allowed_ratio > 0.05) else 0,
    })

    words_found = re.findall(r'\w+', text)
    word_count = len(words_found)
    if not word_count:
        # Letters are word characters, so a text without words has no
        # letters or diacritics either; the remaining zeros are accurate.
        return features

    # --- Word frequency distribution ---
    word_freq = Counter(words_found)
    most_common = word_freq.most_common(10)
    top_word_count = most_common[0][1]
    top_word_ratio = safe_divide(top_word_count, word_count)
    top_5_ratio = safe_divide(sum(cnt for _, cnt in most_common[:5]), word_count)

    # --- Polish diacritics ---
    polish_diacritics = 'ąćęłńóśźżĄĆĘŁŃÓŚŹŻ'
    diac_count = sum(char_counts.get(ch, 0) for ch in polish_diacritics)
    letters_count = sum(1 for ch in text if ch.isalpha())

    # --- One-character words ---
    single_chars = [w for w in words_found if len(w) == 1]
    single_char_freq = Counter(single_chars)
    top_codes = [ord(w) for w, _ in single_char_freq.most_common(3)]
    top_codes += [0] * (3 - len(top_codes))  # pad so three slots always exist

    features.update({
        'word_count': word_count,
        'unique_word_count': len(word_freq),
        'top_word_count': top_word_count,
        'top_word_ratio': top_word_ratio,
        'top_5_ratio': top_5_ratio,
        'top_10_ratio': safe_divide(sum(cnt for _, cnt in most_common), word_count),
        'hapax_legomena_ratio': safe_divide(sum(1 for cnt in word_freq.values() if cnt == 1), word_count),
        'looping_suspicion': 1 if (top_word_ratio > 0.15 or top_5_ratio > 0.4) else 0,
        'polish_diacritics_count': diac_count,
        'polish_diacritics_ratio': safe_divide(diac_count, total_chars),
        'polish_diacritics_per_word': safe_divide(diac_count, word_count),
        'diacritics_to_letters_ratio': safe_divide(diac_count, letters_count),
        'single_char_word_count': len(single_chars),
        'single_char_unique_count': len(single_char_freq),
        'single_char_upper_count': sum(1 for w in single_chars if w.isupper()),
        'single_char_lower_count': sum(1 for w in single_chars if w.islower()),
        'single_char_upper_unique_count': sum(1 for w in single_char_freq if w.isupper()),
        'single_char_lower_unique_count': sum(1 for w in single_char_freq if w.islower()),
        'single_char_top_1_codepoint': top_codes[0],
        'single_char_top_2_codepoint': top_codes[1],
        'single_char_top_3_codepoint': top_codes[2],
    })
    return features

# --- Funkcje analizujące słowa ---

def analyze_word_stats(words: List[str], words_lower: List[str]) -> Dict[str, float]:
    """Compute word-level length, diversity and capitalisation statistics.

    Args:
        words: Tokens in their original casing; used for the total count and
            for the uppercase / digit checks.
        words_lower: Tokens used for the mean length and lexical diversity;
            presumably the lower-cased counterparts of ``words``.

    Returns:
        A dict with:
          * ``mean_word_length``: summed length of ``words_lower`` divided by
            the number of ``words``.
          * ``lexical_diversity``: distinct ``words_lower`` divided by the
            number of ``words``.
          * ``count_caps``: share of words for which ``str.isupper`` is true.
          * ``word_isupper<5`` / ``word_isupper>5``: uppercase words shorter
            than five characters / with five or more characters.
          * ``count_digit_to_caps``: words containing a digit divided by the
            number of uppercase words.
        For an empty ``words`` list every value is zero.
    """
    word_total = len(words)
    if word_total == 0:
        return {
            'mean_word_length': 0.0,
            'lexical_diversity': 0.0,
            'count_caps': 0.0,
            'word_isupper<5': 0,
            'word_isupper>5': 0,
            'count_digit_to_caps': 0.0,
        }

    # Lengths of the all-uppercase words; every caps statistic derives from it.
    uppercase_lengths = [len(word) for word in words if word.isupper()]
    uppercase_total = len(uppercase_lengths)

    words_with_digit = 0
    for word in words:
        if any(ch.isdigit() for ch in word):
            words_with_digit += 1

    summed_length = sum(map(len, words_lower))

    return {
        'mean_word_length': safe_divide(summed_length, word_total),
        'lexical_diversity': safe_divide(len(set(words_lower)), word_total),
        'count_caps': safe_divide(uppercase_total, word_total),
        'word_isupper<5': sum(1 for size in uppercase_lengths if size < 5),
        'word_isupper>5': sum(1 for size in uppercase_lengths if size >= 5),
        'count_digit_to_caps': safe_divide(words_with_digit, uppercase_total),
    }

def count_contextual_word_repetitions(words_lower: List[str]) -> Dict[str, float]:
    """Count words that are immediately followed by an identical word.

    Returns:
        A dict with ``contextual_word_repetitions_count`` (number of adjacent
        equal pairs) and ``contextual_word_repetitions_ratio`` (that count
        divided by the number of words).
    """
    repeats = 0
    # Pair every word with its direct successor.
    for current, following in zip(words_lower, words_lower[1:]):
        if current == following:
            repeats += 1

    return {
        "contextual_word_repetitions_count": repeats,
        "contextual_word_repetitions_ratio": safe_divide(repeats, len(words_lower)),
    }

def count_single_chars_and_ratio(text: str) -> Dict[str, float]:
    """Count characters that have whitespace on both sides (legacy metric).

    The text is padded with one space on each side so that characters at the
    very start or end can qualify. Behaviour matches the original
    implementation exactly:
      * the character itself is not inspected, so a whitespace character
        inside a longer whitespace run is counted as well;
      * the ratio's denominator is the padded length, ``len(text) + 2``.

    Returns:
        A dict with ``single_char_count`` and ``single_char_ratio``.
    """
    padded = "".join((" ", text, " "))
    is_space = [ch.isspace() for ch in padded]

    hits = 0
    # Compare each position's left neighbour with its right neighbour.
    for left, right in zip(is_space, is_space[2:]):
        if left and right:
            hits += 1

    return {
        'single_char_count': hits,
        'single_char_ratio': safe_divide(hits, len(padded)),
    }

# --- Funkcje analizujące linie ---

def analyze_line_length_stats(lines: List[str]) -> Dict[str, float]:
    """Compute line-length statistics.

    Returns:
        A dict with ``average_lines`` (mean line length in characters) and,
        for each threshold in 3, 5, 10 and 20, ``short_line_count_<t>`` (lines
        shorter than the threshold) and ``short_line_ratio_<t>`` (that count
        divided by the number of lines). For an empty list every value is
        zero.
    """
    thresholds = (3, 5, 10, 20)
    line_total = len(lines)

    if line_total == 0:
        # Key order of the empty result: all counts first, then all ratios.
        empty = {'average_lines': 0.0}
        empty.update({f'short_line_count_{threshold}': 0 for threshold in thresholds})
        empty.update({f'short_line_ratio_{threshold}': 0.0 for threshold in thresholds})
        return empty

    lengths = list(map(len, lines))
    result = {'average_lines': safe_divide(sum(lengths), line_total)}

    for threshold in thresholds:
        short_total = len([size for size in lengths if size < threshold])
        result.update({
            f'short_line_count_{threshold}': short_total,
            f'short_line_ratio_{threshold}': safe_divide(short_total, line_total),
        })
    return result

def analyze_line_content(lines: List[str]) -> Dict[str, float]:
    """Analyse lines for blank, ellipsis-terminated, digit-led and duplicated content.

    Returns:
        A dict with:
          * ``blank_lines`` / ``blank_lines_ratio``: lines that are empty
            after stripping, and their share of all lines.
          * ``ellipsis_fractions``: share of lines whose stripped text ends
            with ``...`` or ``…``.
          * ``line_counts``: total number of lines.
          * ``digit_start_lines``: non-blank lines whose first non-whitespace
            character is a digit.
          * ``duplicated_lines`` / ``duplicate_line_ratio``: surplus
            repetitions among non-blank lines (compared unstripped), and that
            number divided by the count of non-blank lines.
        For an empty list every value is zero.
    """
    line_total = len(lines)
    if line_total == 0:
        return {
            'blank_lines': 0, 'blank_lines_ratio': 0.0,
            'ellipsis_fractions': 0.0, 'line_counts': 0,
            'digit_start_lines': 0, 'duplicated_lines': 0, 'duplicate_line_ratio': 0.0
        }

    stripped = [line.strip() for line in lines]
    # Keep the raw line text so duplicates are compared exactly as given.
    content_lines = [raw for raw, core in zip(lines, stripped) if core]
    content_total = len(content_lines)

    blank_total = line_total - content_total
    ellipsis_total = len([core for core in stripped if core.endswith(('...', '…'))])
    digit_led_total = len([core for core in stripped if core and core[0].isdigit()])

    # Every occurrence beyond the first of each distinct line is a duplicate.
    duplicate_total = content_total - len(Counter(content_lines))

    result = {}
    result['blank_lines'] = blank_total
    result['blank_lines_ratio'] = safe_divide(blank_total, line_total)
    result['ellipsis_fractions'] = safe_divide(ellipsis_total, line_total)
    result['line_counts'] = line_total
    result['digit_start_lines'] = digit_led_total
    result['duplicated_lines'] = duplicate_total
    result['duplicate_line_ratio'] = safe_divide(duplicate_total, content_total)
    return result

def count_lorem_ipsum(text_lower: str) -> Dict[str, float]:
    """Relate the number of 'lorem ipsum' occurrences to the text length.

    Args:
        text_lower: Text to search; the phrase is matched exactly as written
            in lower case, so the caller is expected to pass lower-cased text.

    Returns:
        A dict with ``lorem_ipsum_ratio``: non-overlapping occurrences of the
        phrase divided by the number of characters in ``text_lower``.
    """
    placeholder = 'lorem ipsum'
    occurrences = text_lower.count(placeholder)
    text_length = len(text_lower)
    ratio = safe_divide(occurrences, text_length)
    return {'lorem_ipsum_ratio': ratio}

# --- Główna funkcja agregująca ---

def calculate_all_base_features(text: str, text_lower: str, words: List[str], words_lower: List[str], lines: List[str]) -> Dict[str, float]:
    """Aggregate every base text feature computed by this module.

    Args:
        text: The raw text.
        text_lower: Text passed to the extractors that take a lower-cased
            text argument.
        words: Tokens in their original casing.
        words_lower: Tokens passed to the extractors that take lower-cased
            words.
        lines: The text split into lines.

    Returns:
        One dict merging the results of all extractors, applied in the order
        listed below; on a key clash a later extractor's value wins.
    """
    partial_results = (
        analyze_character_stats(text, text_lower),
        analyze_punctuation_stats(text),
        analyze_advanced_char_features(text),
        analyze_word_stats(words, words_lower),
        count_contextual_word_repetitions(words_lower),
        count_single_chars_and_ratio(text),
        analyze_line_length_stats(lines),
        analyze_line_content(lines),
        count_lorem_ipsum(text_lower),
    )

    merged = {}
    for partial in partial_results:
        merged.update(partial)
    return merged