|
|
"""
|
|
|
Moduł do ekstrakcji podstawowych, statystycznych cech tekstu.
|
|
|
|
|
|
Zawiera funkcje do analizy na poziomie znaków, słów i linii, które nie
|
|
|
wymagają zaawansowanych modeli lingwistycznych.
|
|
|
"""
|
|
|
import re
|
|
|
from collections import Counter
|
|
|
from typing import Dict, List
|
|
|
|
|
|
from ..utils import safe_divide
|
|
|
from ..constants import (PUNCTUATION_PATTERN, EXCESSIVE_SPACES_PATTERN,
|
|
|
ALLOWED_CHARS_PATTERN, COMMON_CHARACTERS)
|
|
|
|
|
|
|
|
|
|
|
|
def analyze_character_stats(text: str, text_lower: str) -> Dict[str, float]:
    """Compute basic character-level statistics of *text*.

    Args:
        text: Original text.
        text_lower: Lower-cased version of the same text.

    Returns:
        Mapping of character-statistic names to values; all zeros for
        empty input.
    """
    total_chars = len(text)

    if not total_chars:
        return {
            'characters': 0, 'digit_count': 0, 'digit_ratio': 0.0,
            'overall_uppercase_ratio': 0.0, 'unique_characters_all': 0,
            'unique_characters_lower': 0, 'characters_out_of_common': 0,
            'tabs': 0, 'multispaces': 0
        }

    # Count digits once; the original scanned the text twice (count + ratio)
    # and also built an unused Counter(text).
    digit_count = sum(ch.isdigit() for ch in text)

    return {
        'characters': total_chars,
        'digit_count': digit_count,
        'digit_ratio': safe_divide(digit_count, total_chars),
        'overall_uppercase_ratio': safe_divide(sum(ch.isupper() for ch in text), total_chars),
        'unique_characters_all': len(set(text)),
        'unique_characters_lower': len(set(text_lower)),
        'characters_out_of_common': len([c for c in text if c not in COMMON_CHARACTERS]),
        'tabs': text.count('\t'),
        'multispaces': len(EXCESSIVE_SPACES_PATTERN.findall(text))
    }
|
|
|
|
|
|
def analyze_punctuation_stats(text: str) -> Dict[str, float]:
    """Analyze punctuation usage and bracket counts in *text*."""
    total_chars = len(text)
    if not total_chars:
        return {
            'punct_frequency': 0.0, 'bracet_count': 0, 'bracket_ratio': 0.0,
            'count_special_chars': 0
        }

    # Per-symbol counts for both round and square brackets.
    bracket_counts = {sym: text.count(sym) for sym in '()[]'}

    return {
        'punct_frequency': safe_divide(len(PUNCTUATION_PATTERN.findall(text)), total_chars),
        # NOTE: the misspelled key 'bracet_count' is kept for backward compatibility.
        'bracet_count': sum(bracket_counts.values()),
        # Ratio of opening to closing *square* brackets only.
        'bracket_ratio': safe_divide(bracket_counts['['], bracket_counts[']']),
        'count_special_chars': len(re.findall(r'(\?|!){3,}', text))
    }
|
|
|
|
|
|
def _word_frequency_features(words_found: List[str], word_count: int) -> Dict[str, float]:
    """Word-frequency features: top-word dominance, hapax ratio, loop suspicion."""
    word_freq = Counter(words_found)
    most_common = word_freq.most_common(10)
    # Hoisted: the original evaluated `most_common[0][1] if most_common else 0` twice.
    top_word_count = most_common[0][1] if most_common else 0
    top_word_ratio = safe_divide(top_word_count, word_count)
    top_5_ratio = safe_divide(sum(cnt for _, cnt in most_common[:5]), word_count)
    return {
        'word_count': word_count,
        'unique_word_count': len(word_freq),
        'top_word_count': top_word_count,
        'top_word_ratio': top_word_ratio,
        'top_5_ratio': top_5_ratio,
        'top_10_ratio': safe_divide(sum(cnt for _, cnt in most_common[:10]), word_count),
        'hapax_legomena_ratio': safe_divide(sum(1 for cnt in word_freq.values() if cnt == 1), word_count),
        # Heuristic: a single word > 15% or the top five > 40% of all tokens.
        'looping_suspicion': 1 if (top_word_ratio > 0.15 or top_5_ratio > 0.4) else 0,
    }


def _diacritics_and_encoding_features(text: str, char_counts: Counter,
                                      total_chars: int, word_count: int) -> Dict[str, float]:
    """Polish-diacritics density and mojibake/encoding-problem indicators."""
    diac_count = sum(char_counts.get(ch, 0) for ch in 'ąćęłńóśźżĄĆĘŁŃÓŚŹŻ')
    letters_count = sum(1 for ch in text if ch.isalpha())
    # U+FFFD is the Unicode replacement character produced by bad decoding.
    replacement_count = char_counts.get('\uFFFD', 0)
    not_allowed_count = sum(1 for ch in text if not ALLOWED_CHARS_PATTERN.match(ch))
    replacement_ratio = safe_divide(replacement_count, total_chars)
    not_allowed_ratio = safe_divide(not_allowed_count, total_chars)
    return {
        'polish_diacritics_count': diac_count,
        'polish_diacritics_ratio': safe_divide(diac_count, total_chars),
        'polish_diacritics_per_word': safe_divide(diac_count, word_count),
        'diacritics_to_letters_ratio': safe_divide(diac_count, letters_count),
        'replacement_char_count': replacement_count,
        'replacement_char_ratio': replacement_ratio,
        'not_allowed_chars_count': not_allowed_count,
        'not_allowed_chars_ratio': not_allowed_ratio,
        # Heuristic thresholds: >1% replacement chars or >5% disallowed chars.
        'encoding_suspicion': 1 if (replacement_ratio > 0.01 or not_allowed_ratio > 0.05) else 0,
    }


def _single_char_word_features(words_found: List[str]) -> Dict[str, float]:
    """Statistics over one-character words, incl. codepoints of the top three."""
    single_chars = [w for w in words_found if len(w) == 1]
    single_char_freq = Counter(single_chars)
    top_codes = [ord(w) for w, _ in single_char_freq.most_common(3)]
    # Pad to exactly three entries (0 = "no such word") instead of the
    # original one-line `while ...: append` loop.
    top_codes += [0] * (3 - len(top_codes))
    return {
        'single_char_word_count': len(single_chars),
        'single_char_unique_count': len(single_char_freq),
        'single_char_upper_count': sum(1 for w in single_chars if w.isupper()),
        'single_char_lower_count': sum(1 for w in single_chars if w.islower()),
        'single_char_upper_unique_count': len({w for w in single_chars if w.isupper()}),
        'single_char_lower_unique_count': len({w for w in single_chars if w.islower()}),
        'single_char_top_1_codepoint': top_codes[0],
        'single_char_top_2_codepoint': top_codes[1],
        'single_char_top_3_codepoint': top_codes[2],
    }


def analyze_advanced_char_features(text: str) -> Dict[str, float]:
    """Analyze advanced character/word distribution features (formerly analyze_char_features).

    Args:
        text: Text to analyze.

    Returns:
        Mapping of feature names to values; all zeros when the text is
        empty or contains no words.
    """
    total_chars = len(text)
    words_found = re.findall(r'\w+', text)
    word_count = len(words_found)

    if not total_chars or not word_count:
        return {
            'word_count': 0, 'unique_word_count': 0, 'top_word_count': 0, 'top_word_ratio': 0.0,
            'top_5_ratio': 0.0, 'top_10_ratio': 0.0, 'hapax_legomena_ratio': 0.0,
            'looping_suspicion': 0, 'polish_diacritics_count': 0, 'polish_diacritics_ratio': 0.0,
            'polish_diacritics_per_word': 0.0, 'diacritics_to_letters_ratio': 0.0,
            'replacement_char_count': 0, 'replacement_char_ratio': 0.0,
            'not_allowed_chars_count': 0, 'not_allowed_chars_ratio': 0.0,
            'encoding_suspicion': 0, 'single_char_word_count': 0, 'single_char_unique_count': 0,
            'single_char_upper_count': 0, 'single_char_lower_count': 0,
            'single_char_upper_unique_count': 0, 'single_char_lower_unique_count': 0,
            'single_char_top_1_codepoint': 0, 'single_char_top_2_codepoint': 0,
            'single_char_top_3_codepoint': 0
        }

    char_counts = Counter(text)

    features: Dict[str, float] = {}
    features.update(_word_frequency_features(words_found, word_count))
    features.update(_diacritics_and_encoding_features(text, char_counts, total_chars, word_count))
    features.update(_single_char_word_features(words_found))
    return features
|
|
|
|
|
|
|
|
|
|
|
|
def analyze_word_stats(words: List[str], words_lower: List[str]) -> Dict[str, float]:
    """Compute word-level statistics: mean length, lexical diversity, capitalization."""
    total_words = len(words)
    if not total_words:
        return {
            'mean_word_length': 0.0, 'lexical_diversity': 0.0, 'count_caps': 0.0,
            'word_isupper<5': 0, 'word_isupper>5': 0, 'count_digit_to_caps': 0.0
        }

    all_caps_words = [w for w in words if w.isupper()]
    words_with_digits = sum(1 for w in words if any(ch.isdigit() for ch in w))

    return {
        'mean_word_length': safe_divide(sum(map(len, words_lower)), total_words),
        'lexical_diversity': safe_divide(len(set(words_lower)), total_words),
        'count_caps': safe_divide(len(all_caps_words), total_words),
        'word_isupper<5': sum(1 for w in all_caps_words if len(w) < 5),
        'word_isupper>5': sum(1 for w in all_caps_words if len(w) >= 5),
        'count_digit_to_caps': safe_divide(words_with_digits, len(all_caps_words))
    }
|
|
|
|
|
|
def count_contextual_word_repetitions(words_lower: List[str]) -> Dict[str, float]:
    """Count occurrences of the same word repeated immediately after itself."""
    repetitions = sum(
        1 for prev, curr in zip(words_lower, words_lower[1:]) if prev == curr
    )
    return {
        "contextual_word_repetitions_count": repetitions,
        "contextual_word_repetitions_ratio": safe_divide(repetitions, len(words_lower))
    }
|
|
|
|
|
|
def count_single_chars_and_ratio(text: str) -> Dict[str, float]:
    """Count single-character words (version from the original code).

    A position counts whenever BOTH of its neighbours are whitespace —
    the character itself is not inspected, matching the legacy behavior.
    """
    padded = " " + text + " "
    count = 0
    for before, _middle, after in zip(padded, padded[1:], padded[2:]):
        if before.isspace() and after.isspace():
            count += 1
    return {
        'single_char_count': count,
        # NOTE: denominator includes the two padding spaces, as in the original.
        'single_char_ratio': safe_divide(count, len(padded))
    }
|
|
|
|
|
|
|
|
|
|
|
|
def analyze_line_length_stats(lines: List[str]) -> Dict[str, float]:
    """Compute statistics related to line lengths."""
    total_lines = len(lines)
    if not total_lines:
        return {
            'average_lines': 0.0, 'short_line_count_3': 0,
            'short_line_count_5': 0, 'short_line_count_10': 0, 'short_line_count_20': 0,
            'short_line_ratio_3': 0.0, 'short_line_ratio_5': 0.0,
            'short_line_ratio_10': 0.0, 'short_line_ratio_20': 0.0
        }

    lengths = [len(line) for line in lines]
    result = {'average_lines': safe_divide(sum(lengths), total_lines)}

    thresholds = (3, 5, 10, 20)
    short_counts = {t: sum(1 for n in lengths if n < t) for t in thresholds}
    for t in thresholds:
        result[f'short_line_count_{t}'] = short_counts[t]
        result[f'short_line_ratio_{t}'] = safe_divide(short_counts[t], total_lines)
    return result
|
|
|
|
|
|
def analyze_line_content(lines: List[str]) -> Dict[str, float]:
    """Analyze line content for specific patterns.

    Args:
        lines: Text split into lines.

    Returns:
        Counts and ratios of blank lines, ellipsis-terminated lines,
        digit-starting lines, and duplicated non-empty lines.
    """
    total_lines = len(lines)
    if not total_lines:
        return {
            'blank_lines': 0, 'blank_lines_ratio': 0.0,
            'ellipsis_fractions': 0.0, 'line_counts': 0,
            'digit_start_lines': 0, 'duplicated_lines': 0, 'duplicate_line_ratio': 0.0
        }

    non_empty_lines = [line for line in lines if line.strip()]
    blanks_count = total_lines - len(non_empty_lines)
    ellipsis_lines_count = sum(1 for line in lines if line.strip().endswith(('...', '…')))
    # Every entry of non_empty_lines has a non-empty strip(), so the
    # original's extra truthiness check (and second strip() call) was
    # redundant; strip once per line.
    digit_start_lines_count = sum(1 for line in non_empty_lines if line.strip()[0].isdigit())

    line_counts = Counter(non_empty_lines)
    # Each line appearing k > 1 times contributes k - 1 duplicates.
    duplicated_lines_count = sum(cnt - 1 for cnt in line_counts.values() if cnt > 1)

    return {
        'blank_lines': blanks_count,
        'blank_lines_ratio': safe_divide(blanks_count, total_lines),
        'ellipsis_fractions': safe_divide(ellipsis_lines_count, total_lines),
        'line_counts': total_lines,
        'digit_start_lines': digit_start_lines_count,
        'duplicated_lines': duplicated_lines_count,
        'duplicate_line_ratio': safe_divide(duplicated_lines_count, len(non_empty_lines))
    }
|
|
|
|
|
|
def count_lorem_ipsum(text_lower: str) -> Dict[str, float]:
    """Compute the ratio of 'lorem ipsum' occurrences to total text length."""
    occurrences = text_lower.count('lorem ipsum')
    ratio = safe_divide(occurrences, len(text_lower))
    return {'lorem_ipsum_ratio': ratio}
|
|
|
|
|
|
|
|
|
|
|
|
def calculate_all_base_features(text: str, text_lower: str, words: List[str], words_lower: List[str], lines: List[str]) -> Dict[str, float]:
    """Aggregate all basic text features from this module into one dict."""
    # The tuple is built eagerly, so each analyzer runs exactly once and
    # in the same order as before.
    feature_groups = (
        analyze_character_stats(text, text_lower),
        analyze_punctuation_stats(text),
        analyze_advanced_char_features(text),
        analyze_word_stats(words, words_lower),
        count_contextual_word_repetitions(words_lower),
        count_single_chars_and_ratio(text),
        analyze_line_length_stats(lines),
        analyze_line_content(lines),
        count_lorem_ipsum(text_lower),
    )
    merged: Dict[str, float] = {}
    for group in feature_groups:
        merged.update(group)
    return merged