# NOTE: removed repository-page scrape residue ("Joblib", avatar caption,
# commit line "ef22613 verified") that was not valid Python.
"""
Moduł do ekstrakcji podstawowych, statystycznych cech tekstu.
Zawiera funkcje do analizy na poziomie znaków, słów i linii, które nie
wymagają zaawansowanych modeli lingwistycznych.
"""
import re
from collections import Counter
from typing import Dict, List
from ..utils import safe_divide
from ..constants import (PUNCTUATION_PATTERN, EXCESSIVE_SPACES_PATTERN,
ALLOWED_CHARS_PATTERN, COMMON_CHARACTERS)
# --- Funkcje analizujące znaki ---
def analyze_character_stats(text: str, text_lower: str) -> Dict[str, float]:
    """Compute basic character-level statistics of *text*.

    Args:
        text: Original input text.
        text_lower: Lowercased variant of ``text`` (precomputed by the caller).

    Returns:
        Mapping of character-level feature names to values; all zeros for
        empty input.
    """
    total_chars = len(text)
    if not total_chars:
        return {
            'characters': 0, 'digit_count': 0, 'digit_ratio': 0.0,
            'overall_uppercase_ratio': 0.0, 'unique_characters_all': 0,
            'unique_characters_lower': 0, 'characters_out_of_common': 0,
            'tabs': 0, 'multispaces': 0
        }
    # Scan for digits once instead of twice (was computed separately for
    # 'digit_count' and 'digit_ratio'); the unused Counter(text) was dropped.
    digit_count = sum(ch.isdigit() for ch in text)
    return {
        'characters': total_chars,
        'digit_count': digit_count,
        'digit_ratio': safe_divide(digit_count, total_chars),
        'overall_uppercase_ratio': safe_divide(sum(ch.isupper() for ch in text), total_chars),
        'unique_characters_all': len(set(text)),
        'unique_characters_lower': len(set(text_lower)),
        'characters_out_of_common': len([c for c in text if c not in COMMON_CHARACTERS]),
        'tabs': text.count('\t'),
        'multispaces': len(EXCESSIVE_SPACES_PATTERN.findall(text))
    }
def analyze_punctuation_stats(text: str) -> Dict[str, float]:
    """Analyze punctuation frequency, bracket usage and exclamation runs.

    Returns a zero-valued feature dict for empty input. Note: the key
    'bracet_count' is kept as-is for compatibility with downstream consumers.
    """
    total_chars = len(text)
    if total_chars == 0:
        return {
            'punct_frequency': 0.0, 'bracet_count': 0, 'bracket_ratio': 0.0,
            'count_special_chars': 0
        }
    # Counts for both round and square bracket characters.
    paren_open, paren_close, sq_open, sq_close = (text.count(c) for c in '()[]')
    punct_hits = PUNCTUATION_PATTERN.findall(text)
    # Runs of three or more '?' / '!' characters (possibly mixed).
    shouting_runs = re.findall(r'(\?|!){3,}', text)
    return {
        'punct_frequency': safe_divide(len(punct_hits), total_chars),
        'bracet_count': paren_open + paren_close + sq_open + sq_close,
        'bracket_ratio': safe_divide(sq_open, sq_close),
        'count_special_chars': len(shouting_runs)
    }
def analyze_advanced_char_features(text: str) -> Dict[str, float]:
    """Compute advanced character- and word-distribution features
    (formerly ``analyze_char_features``).

    Covers word-frequency concentration ("looping" suspicion), Polish
    diacritics, encoding-damage indicators and single-character-word
    statistics.

    Args:
        text: Raw input text.

    Returns:
        Mapping of feature names to numeric values; all zeros when the text
        is empty or contains no ``\\w`` word tokens.
    """
    total_chars = len(text)
    words_found = re.findall(r'\w+', text)
    word_count = len(words_found)
    if not total_chars or not word_count:
        return {
            'word_count': 0, 'unique_word_count': 0, 'top_word_count': 0, 'top_word_ratio': 0.0,
            'top_5_ratio': 0.0, 'top_10_ratio': 0.0, 'hapax_legomena_ratio': 0.0,
            'looping_suspicion': 0, 'polish_diacritics_count': 0, 'polish_diacritics_ratio': 0.0,
            'polish_diacritics_per_word': 0.0, 'diacritics_to_letters_ratio': 0.0,
            'replacement_char_count': 0, 'replacement_char_ratio': 0.0,
            'not_allowed_chars_count': 0, 'not_allowed_chars_ratio': 0.0,
            'encoding_suspicion': 0, 'single_char_word_count': 0, 'single_char_unique_count': 0,
            'single_char_upper_count': 0, 'single_char_lower_count': 0,
            'single_char_upper_unique_count': 0, 'single_char_lower_unique_count': 0,
            'single_char_top_1_codepoint': 0, 'single_char_top_2_codepoint': 0,
            'single_char_top_3_codepoint': 0
        }
    word_freq = Counter(words_found)
    # Only the 10 most frequent words are needed for the top-k ratios below.
    most_common = word_freq.most_common(10)
    polish_diacritics = 'ąćęłńóśźżĄĆĘŁŃÓŚŹŻ'
    char_counts = Counter(text)
    diac_count = sum(char_counts.get(ch, 0) for ch in polish_diacritics)
    letters_count = sum(1 for ch in text if ch.isalpha())
    # One-character word tokens are profiled separately (counts, case, codepoints).
    single_chars = [w for w in words_found if len(w) == 1]
    single_char_freq = Counter(single_chars)
    top_3_single = single_char_freq.most_common(3)
    top_codes = [ord(w) for w, _ in top_3_single]
    # Pad with zeros so the three codepoint features always exist.
    while len(top_codes) < 3: top_codes.append(0)
    # U+FFFD is the Unicode replacement character -> indicates decoding damage.
    replacement_count = char_counts.get('\uFFFD', 0)
    not_allowed_count = sum(1 for ch in text if not ALLOWED_CHARS_PATTERN.match(ch))
    replacement_ratio = safe_divide(replacement_count, total_chars)
    not_allowed_ratio = safe_divide(not_allowed_count, total_chars)
    top_word_ratio = safe_divide(most_common[0][1] if most_common else 0, word_count)
    top_5_ratio = safe_divide(sum(cnt for _, cnt in most_common[:5]), word_count)
    features = {
        'word_count': word_count,
        'unique_word_count': len(word_freq),
        'top_word_count': most_common[0][1] if most_common else 0,
        'top_word_ratio': top_word_ratio,
        'top_5_ratio': top_5_ratio,
        'top_10_ratio': safe_divide(sum(cnt for _, cnt in most_common[:10]), word_count),
        # Share of words that occur exactly once.
        'hapax_legomena_ratio': safe_divide(sum(1 for cnt in word_freq.values() if cnt == 1), word_count),
        # Heuristic thresholds: one word >15% of tokens, or top five >40%.
        'looping_suspicion': 1 if (top_word_ratio > 0.15 or top_5_ratio > 0.4) else 0,
        'polish_diacritics_count': diac_count,
        'polish_diacritics_ratio': safe_divide(diac_count, total_chars),
        'polish_diacritics_per_word': safe_divide(diac_count, word_count),
        'diacritics_to_letters_ratio': safe_divide(diac_count, letters_count),
        'replacement_char_count': replacement_count,
        'replacement_char_ratio': replacement_ratio,
        'not_allowed_chars_count': not_allowed_count,
        'not_allowed_chars_ratio': not_allowed_ratio,
        # Heuristic thresholds: >1% replacement chars or >5% disallowed chars.
        'encoding_suspicion': 1 if (replacement_ratio > 0.01 or not_allowed_ratio > 0.05) else 0,
        'single_char_word_count': len(single_chars),
        'single_char_unique_count': len(single_char_freq),
        'single_char_upper_count': sum(1 for w in single_chars if w.isupper()),
        'single_char_lower_count': sum(1 for w in single_chars if w.islower()),
        'single_char_upper_unique_count': len({w for w in single_chars if w.isupper()}),
        'single_char_lower_unique_count': len({w for w in single_chars if w.islower()}),
        'single_char_top_1_codepoint': top_codes[0],
        'single_char_top_2_codepoint': top_codes[1],
        'single_char_top_3_codepoint': top_codes[2],
    }
    return features
# --- Funkcje analizujące słowa ---
def analyze_word_stats(words: List[str], words_lower: List[str]) -> Dict[str, float]:
    """Compute word-level statistics: mean length, lexical diversity and
    capitalization/digit features.

    Args:
        words: Word tokens in original casing.
        words_lower: The same tokens lowercased (precomputed by the caller).
    """
    total_words = len(words)
    if not total_words:
        return {'mean_word_length': 0.0, 'lexical_diversity': 0.0, 'count_caps': 0.0,
                'word_isupper<5': 0, 'word_isupper>5': 0, 'count_digit_to_caps': 0.0}
    uppercase_words = [w for w in words if w.isupper()]
    words_with_digits = sum(1 for w in words if any(ch.isdigit() for ch in w))
    return {
        'mean_word_length': safe_divide(sum(map(len, words_lower)), total_words),
        'lexical_diversity': safe_divide(len(set(words_lower)), total_words),
        'count_caps': safe_divide(len(uppercase_words), total_words),
        'word_isupper<5': sum(1 for w in uppercase_words if len(w) < 5),
        'word_isupper>5': sum(1 for w in uppercase_words if len(w) >= 5),
        'count_digit_to_caps': safe_divide(words_with_digits, len(uppercase_words))
    }
def count_contextual_word_repetitions(words_lower: List[str]) -> Dict[str, float]:
    """Count immediate repetitions: a word directly followed by itself."""
    repeated = sum(prev == nxt for prev, nxt in zip(words_lower, words_lower[1:]))
    return {
        "contextual_word_repetitions_count": repeated,
        "contextual_word_repetitions_ratio": safe_divide(repeated, len(words_lower))
    }
def count_single_chars_and_ratio(text: str) -> Dict[str, float]:
    """Count one-character words: non-whitespace characters surrounded by
    whitespace.

    The text is padded with one space on each side so single characters at the
    very start/end of the text are counted too. The ratio is computed against
    the padded length (legacy behavior kept for feature compatibility).
    """
    t = " " + text + " "
    # Fix: the character itself must not be whitespace — previously a space in
    # a run of three spaces (or a newline between spaces) was counted as a
    # "single-char word".
    count = sum(1 for i in range(1, len(t) - 1)
                if not t[i].isspace() and t[i-1].isspace() and t[i+1].isspace())
    return {
        'single_char_count': count,
        'single_char_ratio': safe_divide(count, len(t))
    }
# --- Funkcje analizujące linie ---
def analyze_line_length_stats(lines: List[str]) -> Dict[str, float]:
    """Compute line-length statistics: average length plus counts and ratios
    of lines shorter than a few fixed thresholds."""
    total_lines = len(lines)
    if total_lines == 0:
        return {
            'average_lines': 0.0, 'short_line_count_3': 0,
            'short_line_count_5': 0, 'short_line_count_10': 0, 'short_line_count_20': 0,
            'short_line_ratio_3': 0.0, 'short_line_ratio_5': 0.0,
            'short_line_ratio_10': 0.0, 'short_line_ratio_20': 0.0
        }
    lengths = [len(ln) for ln in lines]
    result: Dict[str, float] = {'average_lines': safe_divide(sum(lengths), total_lines)}
    for limit in (3, 5, 10, 20):
        below = sum(1 for n in lengths if n < limit)
        result[f'short_line_count_{limit}'] = below
        result[f'short_line_ratio_{limit}'] = safe_divide(below, total_lines)
    return result
def analyze_line_content(lines: List[str]) -> Dict[str, float]:
    """Analyze line contents: blank lines, ellipsis endings, digit-led lines
    and exact duplicates among non-empty lines."""
    total_lines = len(lines)
    if total_lines == 0:
        return {
            'blank_lines': 0, 'blank_lines_ratio': 0.0,
            'ellipsis_fractions': 0.0, 'line_counts': 0,
            'digit_start_lines': 0, 'duplicated_lines': 0, 'duplicate_line_ratio': 0.0
        }
    non_empty = [ln for ln in lines if ln.strip()]
    blank_count = total_lines - len(non_empty)
    ellipsis_count = sum(1 for ln in lines if ln.strip().endswith(('...', '…')))
    # Lines in non_empty are guaranteed to have a non-empty strip().
    digit_led = sum(1 for ln in non_empty if ln.strip()[0].isdigit())
    occurrences = Counter(non_empty)
    # Each duplicate counts its repetitions beyond the first occurrence.
    duplicates = sum(n - 1 for n in occurrences.values() if n > 1)
    return {
        'blank_lines': blank_count,
        'blank_lines_ratio': safe_divide(blank_count, total_lines),
        'ellipsis_fractions': safe_divide(ellipsis_count, total_lines),
        'line_counts': total_lines,
        'digit_start_lines': digit_led,
        'duplicated_lines': duplicates,
        'duplicate_line_ratio': safe_divide(duplicates, len(non_empty))
    }
def count_lorem_ipsum(text_lower: str) -> Dict[str, float]:
    """Return the ratio of 'lorem ipsum' occurrences to the text length
    (in characters)."""
    occurrences = text_lower.count('lorem ipsum')
    return {'lorem_ipsum_ratio': safe_divide(occurrences, len(text_lower))}
# --- Główna funkcja agregująca ---
def calculate_all_base_features(text: str, text_lower: str, words: List[str], words_lower: List[str], lines: List[str]) -> Dict[str, float]:
    """Aggregate every basic text feature produced by this module.

    Calls each analyzer in a fixed order and merges their feature dicts
    (later results would overwrite earlier keys on collision).
    """
    partial_results = (
        analyze_character_stats(text, text_lower),
        analyze_punctuation_stats(text),
        analyze_advanced_char_features(text),
        analyze_word_stats(words, words_lower),
        count_contextual_word_repetitions(words_lower),
        count_single_chars_and_ratio(text),
        analyze_line_length_stats(lines),
        analyze_line_content(lines),
        count_lorem_ipsum(text_lower),
    )
    merged: Dict[str, float] = {}
    for part in partial_results:
        merged.update(part)
    return merged