# NOTE: removed repository-page scrape residue ("Joblib", avatar caption,
# commit line "ef22613 verified") that was not valid Python.
"""
Moduł do ekstrakcji podstawowych, statystycznych cech tekstu.
Zawiera funkcje do analizy na poziomie znaków, słów i linii, które nie
wymagają zaawansowanych modeli lingwistycznych.
"""
import re
from collections import Counter
from typing import Dict, List
from ..utils import safe_divide
from ..constants import (PUNCTUATION_PATTERN, EXCESSIVE_SPACES_PATTERN,
ALLOWED_CHARS_PATTERN, COMMON_CHARACTERS)
# --- Funkcje analizujące znaki ---
def analyze_character_stats(text: str, text_lower: str) -> Dict[str, float]:
    """Compute basic character-level statistics of *text*.

    Args:
        text: Original input text.
        text_lower: Lowercased variant of ``text`` (precomputed by the caller).

    Returns:
        Mapping of character-level feature names to values; all zeros for
        empty input.
    """
    total_chars = len(text)
    if not total_chars:
        return {
            'characters': 0, 'digit_count': 0, 'digit_ratio': 0.0,
            'overall_uppercase_ratio': 0.0, 'unique_characters_all': 0,
            'unique_characters_lower': 0, 'characters_out_of_common': 0,
            'tabs': 0, 'multispaces': 0
        }
    # Scan for digits once instead of twice (was computed separately for
    # 'digit_count' and 'digit_ratio'); the unused Counter(text) was dropped.
    digit_count = sum(ch.isdigit() for ch in text)
    return {
        'characters': total_chars,
        'digit_count': digit_count,
        'digit_ratio': safe_divide(digit_count, total_chars),
        'overall_uppercase_ratio': safe_divide(sum(ch.isupper() for ch in text), total_chars),
        'unique_characters_all': len(set(text)),
        'unique_characters_lower': len(set(text_lower)),
        'characters_out_of_common': len([c for c in text if c not in COMMON_CHARACTERS]),
        'tabs': text.count('\t'),
        'multispaces': len(EXCESSIVE_SPACES_PATTERN.findall(text))
    }
def analyze_punctuation_stats(text: str) -> Dict[str, float]:
    """Analyze punctuation frequency, bracket usage and exclamation runs.

    Returns a zero-valued feature dict for empty input. Note: the key
    'bracet_count' is kept as-is for compatibility with downstream consumers.
    """
    total_chars = len(text)
    if total_chars == 0:
        return {
            'punct_frequency': 0.0, 'bracet_count': 0, 'bracket_ratio': 0.0,
            'count_special_chars': 0
        }
    # Counts for both round and square bracket characters.
    paren_open, paren_close, sq_open, sq_close = (text.count(c) for c in '()[]')
    punct_hits = PUNCTUATION_PATTERN.findall(text)
    # Runs of three or more '?' / '!' characters (possibly mixed).
    shouting_runs = re.findall(r'(\?|!){3,}', text)
    return {
        'punct_frequency': safe_divide(len(punct_hits), total_chars),
        'bracet_count': paren_open + paren_close + sq_open + sq_close,
        'bracket_ratio': safe_divide(sq_open, sq_close),
        'count_special_chars': len(shouting_runs)
    }
def analyze_advanced_char_features(text: str) -> Dict[str, float]:
    """Compute advanced character- and word-distribution features
    (formerly ``analyze_char_features``).

    Covers word-frequency concentration ("looping" suspicion), Polish
    diacritics, encoding-damage indicators and single-character-word
    statistics.

    Args:
        text: Raw input text.

    Returns:
        Mapping of feature names to numeric values; all zeros when the text
        is empty or contains no ``\\w`` word tokens.
    """
    total_chars = len(text)
    words_found = re.findall(r'\w+', text)
    word_count = len(words_found)
    if not total_chars or not word_count:
        return {
            'word_count': 0, 'unique_word_count': 0, 'top_word_count': 0, 'top_word_ratio': 0.0,
            'top_5_ratio': 0.0, 'top_10_ratio': 0.0, 'hapax_legomena_ratio': 0.0,
            'looping_suspicion': 0, 'polish_diacritics_count': 0, 'polish_diacritics_ratio': 0.0,
            'polish_diacritics_per_word': 0.0, 'diacritics_to_letters_ratio': 0.0,
            'replacement_char_count': 0, 'replacement_char_ratio': 0.0,
            'not_allowed_chars_count': 0, 'not_allowed_chars_ratio': 0.0,
            'encoding_suspicion': 0, 'single_char_word_count': 0, 'single_char_unique_count': 0,
            'single_char_upper_count': 0, 'single_char_lower_count': 0,
            'single_char_upper_unique_count': 0, 'single_char_lower_unique_count': 0,
            'single_char_top_1_codepoint': 0, 'single_char_top_2_codepoint': 0,
            'single_char_top_3_codepoint': 0
        }
    word_freq = Counter(words_found)
    # Only the 10 most frequent words are needed for the top-k ratios below.
    most_common = word_freq.most_common(10)
    polish_diacritics = 'ąćęłńóśźżĄĆĘŁŃÓŚŹŻ'
    char_counts = Counter(text)
    diac_count = sum(char_counts.get(ch, 0) for ch in polish_diacritics)
    letters_count = sum(1 for ch in text if ch.isalpha())
    # One-character word tokens are profiled separately (counts, case, codepoints).
    single_chars = [w for w in words_found if len(w) == 1]
    single_char_freq = Counter(single_chars)
    top_3_single = single_char_freq.most_common(3)
    top_codes = [ord(w) for w, _ in top_3_single]
    # Pad with zeros so the three codepoint features always exist.
    while len(top_codes) < 3: top_codes.append(0)
    # U+FFFD is the Unicode replacement character -> indicates decoding damage.
    replacement_count = char_counts.get('\uFFFD', 0)
    not_allowed_count = sum(1 for ch in text if not ALLOWED_CHARS_PATTERN.match(ch))
    replacement_ratio = safe_divide(replacement_count, total_chars)
    not_allowed_ratio = safe_divide(not_allowed_count, total_chars)
    top_word_ratio = safe_divide(most_common[0][1] if most_common else 0, word_count)
    top_5_ratio = safe_divide(sum(cnt for _, cnt in most_common[:5]), word_count)
    features = {
        'word_count': word_count,
        'unique_word_count': len(word_freq),
        'top_word_count': most_common[0][1] if most_common else 0,
        'top_word_ratio': top_word_ratio,
        'top_5_ratio': top_5_ratio,
        'top_10_ratio': safe_divide(sum(cnt for _, cnt in most_common[:10]), word_count),
        # Share of words that occur exactly once.
        'hapax_legomena_ratio': safe_divide(sum(1 for cnt in word_freq.values() if cnt == 1), word_count),
        # Heuristic thresholds: one word >15% of tokens, or top five >40%.
        'looping_suspicion': 1 if (top_word_ratio > 0.15 or top_5_ratio > 0.4) else 0,
        'polish_diacritics_count': diac_count,
        'polish_diacritics_ratio': safe_divide(diac_count, total_chars),
        'polish_diacritics_per_word': safe_divide(diac_count, word_count),
        'diacritics_to_letters_ratio': safe_divide(diac_count, letters_count),
        'replacement_char_count': replacement_count,
        'replacement_char_ratio': replacement_ratio,
        'not_allowed_chars_count': not_allowed_count,
        'not_allowed_chars_ratio': not_allowed_ratio,
        # Heuristic thresholds: >1% replacement chars or >5% disallowed chars.
        'encoding_suspicion': 1 if (replacement_ratio > 0.01 or not_allowed_ratio > 0.05) else 0,
        'single_char_word_count': len(single_chars),
        'single_char_unique_count': len(single_char_freq),
        'single_char_upper_count': sum(1 for w in single_chars if w.isupper()),
        'single_char_lower_count': sum(1 for w in single_chars if w.islower()),
        'single_char_upper_unique_count': len({w for w in single_chars if w.isupper()}),
        'single_char_lower_unique_count': len({w for w in single_chars if w.islower()}),
        'single_char_top_1_codepoint': top_codes[0],
        'single_char_top_2_codepoint': top_codes[1],
        'single_char_top_3_codepoint': top_codes[2],
    }
    return features
# --- Funkcje analizujące słowa ---
def analyze_word_stats(words: List[str], words_lower: List[str]) -> Dict[str, float]:
    """Compute word-level statistics: mean length, lexical diversity and
    capitalization/digit features.

    Args:
        words: Word tokens in original casing.
        words_lower: The same tokens lowercased (precomputed by the caller).
    """
    total_words = len(words)
    if not total_words:
        return {'mean_word_length': 0.0, 'lexical_diversity': 0.0, 'count_caps': 0.0,
                'word_isupper<5': 0, 'word_isupper>5': 0, 'count_digit_to_caps': 0.0}
    uppercase_words = [w for w in words if w.isupper()]
    words_with_digits = sum(1 for w in words if any(ch.isdigit() for ch in w))
    return {
        'mean_word_length': safe_divide(sum(map(len, words_lower)), total_words),
        'lexical_diversity': safe_divide(len(set(words_lower)), total_words),
        'count_caps': safe_divide(len(uppercase_words), total_words),
        'word_isupper<5': sum(1 for w in uppercase_words if len(w) < 5),
        'word_isupper>5': sum(1 for w in uppercase_words if len(w) >= 5),
        'count_digit_to_caps': safe_divide(words_with_digits, len(uppercase_words))
    }
def count_contextual_word_repetitions(words_lower: List[str]) -> Dict[str, float]:
    """Count immediate repetitions: a word directly followed by itself."""
    repeated = sum(prev == nxt for prev, nxt in zip(words_lower, words_lower[1:]))
    return {
        "contextual_word_repetitions_count": repeated,
        "contextual_word_repetitions_ratio": safe_divide(repeated, len(words_lower))
    }
def count_single_chars_and_ratio(text: str) -> Dict[str, float]:
    """Count one-character words: non-whitespace characters surrounded by
    whitespace.

    The text is padded with one space on each side so single characters at the
    very start/end of the text are counted too. The ratio is computed against
    the padded length (legacy behavior kept for feature compatibility).
    """
    t = " " + text + " "
    # Fix: the character itself must not be whitespace — previously a space in
    # a run of three spaces (or a newline between spaces) was counted as a
    # "single-char word".
    count = sum(1 for i in range(1, len(t) - 1)
                if not t[i].isspace() and t[i-1].isspace() and t[i+1].isspace())
    return {
        'single_char_count': count,
        'single_char_ratio': safe_divide(count, len(t))
    }
# --- Funkcje analizujące linie ---
def analyze_line_length_stats(lines: List[str]) -> Dict[str, float]:
    """Compute line-length statistics: average length plus counts and ratios
    of lines shorter than a few fixed thresholds."""
    total_lines = len(lines)
    if total_lines == 0:
        return {
            'average_lines': 0.0, 'short_line_count_3': 0,
            'short_line_count_5': 0, 'short_line_count_10': 0, 'short_line_count_20': 0,
            'short_line_ratio_3': 0.0, 'short_line_ratio_5': 0.0,
            'short_line_ratio_10': 0.0, 'short_line_ratio_20': 0.0
        }
    lengths = [len(ln) for ln in lines]
    result: Dict[str, float] = {'average_lines': safe_divide(sum(lengths), total_lines)}
    for limit in (3, 5, 10, 20):
        below = sum(1 for n in lengths if n < limit)
        result[f'short_line_count_{limit}'] = below
        result[f'short_line_ratio_{limit}'] = safe_divide(below, total_lines)
    return result
def analyze_line_content(lines: List[str]) -> Dict[str, float]:
    """Analyze line contents: blank lines, ellipsis endings, digit-led lines
    and exact duplicates among non-empty lines."""
    total_lines = len(lines)
    if total_lines == 0:
        return {
            'blank_lines': 0, 'blank_lines_ratio': 0.0,
            'ellipsis_fractions': 0.0, 'line_counts': 0,
            'digit_start_lines': 0, 'duplicated_lines': 0, 'duplicate_line_ratio': 0.0
        }
    non_empty = [ln for ln in lines if ln.strip()]
    blank_count = total_lines - len(non_empty)
    ellipsis_count = sum(1 for ln in lines if ln.strip().endswith(('...', '…')))
    # Lines in non_empty are guaranteed to have a non-empty strip().
    digit_led = sum(1 for ln in non_empty if ln.strip()[0].isdigit())
    occurrences = Counter(non_empty)
    # Each duplicate counts its repetitions beyond the first occurrence.
    duplicates = sum(n - 1 for n in occurrences.values() if n > 1)
    return {
        'blank_lines': blank_count,
        'blank_lines_ratio': safe_divide(blank_count, total_lines),
        'ellipsis_fractions': safe_divide(ellipsis_count, total_lines),
        'line_counts': total_lines,
        'digit_start_lines': digit_led,
        'duplicated_lines': duplicates,
        'duplicate_line_ratio': safe_divide(duplicates, len(non_empty))
    }
def count_lorem_ipsum(text_lower: str) -> Dict[str, float]:
    """Return the ratio of 'lorem ipsum' occurrences to the text length
    (in characters)."""
    occurrences = text_lower.count('lorem ipsum')
    return {'lorem_ipsum_ratio': safe_divide(occurrences, len(text_lower))}
# --- Główna funkcja agregująca ---
def calculate_all_base_features(text: str, text_lower: str, words: List[str], words_lower: List[str], lines: List[str]) -> Dict[str, float]:
    """Aggregate every basic text feature produced by this module.

    Calls each analyzer in a fixed order and merges their feature dicts
    (later results would overwrite earlier keys on collision).
    """
    partial_results = (
        analyze_character_stats(text, text_lower),
        analyze_punctuation_stats(text),
        analyze_advanced_char_features(text),
        analyze_word_stats(words, words_lower),
        count_contextual_word_repetitions(words_lower),
        count_single_chars_and_ratio(text),
        analyze_line_length_stats(lines),
        analyze_line_content(lines),
        count_lorem_ipsum(text_lower),
    )
    merged: Dict[str, float] = {}
    for part in partial_results:
        merged.update(part)
    return merged