"""Module for extracting linguistic and stylistic features of text."""
|
|
|
import math
|
|
|
import re
|
|
|
from collections import Counter
|
|
|
from typing import Dict, List
|
|
|
|
|
|
from ..utils import safe_divide
|
|
|
from ..constants import STOP_WORDS, BAD_WORDS, NON_WORD_CHARS_PATTERN
|
|
|
|
|
|
|
|
|
|
|
|
def calculate_stop_word_ratio(words_lower: List[str]) -> Dict[str, float]:
    """Return the fraction of tokens that are stop words.

    Args:
        words_lower: Lower-cased word tokens.

    Returns:
        Dict with the single key ``'stop_word_ratio'``.
    """
    hits = 0
    for token in words_lower:
        if token in STOP_WORDS:
            hits += 1
    return {'stop_word_ratio': safe_divide(hits, len(words_lower))}
|
|
|
|
|
|
def count_bad_words(words_lower: List[str]) -> Dict[str, int]:
    """Count occurrences of profanities among the tokens.

    Args:
        words_lower: Lower-cased word tokens.

    Returns:
        Dict with the single key ``'bad_word_count'``.
    """
    bad_hits = [w for w in words_lower if w in BAD_WORDS]
    return {'bad_word_count': len(bad_hits)}
|
|
|
|
|
|
def calculate_unigram_entropy(words_lower: List[str]) -> Dict[str, float]:
    """Compute the Shannon entropy (natural log) of the unigram distribution.

    Args:
        words_lower: Lower-cased word tokens.

    Returns:
        Dict with the single key ``'entropy'``; 0.0 for empty input.
    """
    n = len(words_lower)
    if n == 0:
        return {'entropy': 0.0}
    entropy = 0.0
    for count in Counter(words_lower).values():
        p = count / n
        entropy -= p * math.log(p)
    return {'entropy': entropy}
|
|
|
|
|
|
def count_non_alpha_words(text: str) -> Dict[str, float]:
    """Return the fraction of characters that are not alphabetic.

    Args:
        text: Raw input text.

    Returns:
        Dict with the single key ``'non_alpha_word_fractions'``; 0.0 for
        empty input.
    """
    if not text:
        return {'non_alpha_word_fractions': 0.0}
    alpha = sum(1 for ch in text if ch.isalpha())
    non_alpha = len(text) - alpha
    return {'non_alpha_word_fractions': safe_divide(non_alpha, len(text))}
|
|
|
|
|
|
def calculate_symbol_to_word_ratio(words: List[str], text: str) -> Dict[str, float]:
    """Compute the ratio of selected symbols ('#', '...', '…') to word count.

    Args:
        words: Word tokens of the text.
        text: Raw input text.

    Returns:
        Dict with the single key ``'symbol_to_word_ratio'``.
    """
    # str.count('...') counts non-overlapping ellipsis runs, matching the
    # single-character counts for '#' and '…'.
    symbols = text.count('#') + text.count('...') + text.count('…')
    return {'symbol_to_word_ratio': safe_divide(symbols, len(words))}
|
|
|
|
|
|
|
|
|
|
|
|
def calculate_ngram_fractions(words: List[str]) -> Dict[str, float]:
    """Measure duplicated and dominant word n-grams as character fractions.

    For n in 5..10 reports the fraction of normalized characters covered by
    n-grams occurring more than once; for n in 2..5 the fraction covered by
    the single most frequent n-gram.

    Args:
        words: Word tokens of the text.

    Returns:
        Dict mapping ``fraction_duplicate_{n}_ngram`` and
        ``fraction_top_{n}_ngram`` keys to fractions; all zeros when the
        text is too short to measure.
    """
    normalized = NON_WORD_CHARS_PATTERN.sub('', ' '.join(words))
    char_total = len(normalized)

    dup_keys = [f'fraction_duplicate_{n}_ngram' for n in range(5, 11)]
    top_keys = [f'fraction_top_{n}_ngram' for n in range(2, 6)]

    # Too little material to measure anything meaningful.
    if char_total == 0 or len(words) < 10:
        return dict.fromkeys(dup_keys + top_keys, 0.0)

    metrics: Dict[str, float] = {}
    for n in range(2, 11):
        if len(words) < n:
            continue
        counts = Counter(' '.join(words[i:i + n]) for i in range(len(words) - n + 1))
        if n >= 5:
            repeated_chars = sum(len(gram) for gram, cnt in counts.items() if cnt > 1)
            metrics[f'fraction_duplicate_{n}_ngram'] = safe_divide(repeated_chars, char_total)
        if n <= 5:
            if counts:
                most_common_gram = counts.most_common(1)[0][0]
                metrics[f'fraction_top_{n}_ngram'] = safe_divide(len(most_common_gram), char_total)
            else:
                metrics[f'fraction_top_{n}_ngram'] = 0.0
    return metrics
|
|
|
|
|
|
|
|
|
|
|
|
def analyze_stylistic_metrics(text: str, words: List[str], sentences: List[str]) -> Dict[str, float]:
    """Compute sentence-level stylistic metrics of the text.

    Args:
        text: Raw input text.
        words: Word tokens of the text.
        sentences: Pre-split sentences (currently unused; sentences are
            re-derived from ``text`` so terminators are preserved).

    Returns:
        Dict of per-sentence and per-word stylistic ratios.
    """
    # Sentences are re-extracted together with their terminating punctuation.
    sentences_from_regex = re.findall(r'[^.!?]+[.!?]', text)
    num_sentences = len(sentences_from_regex)
    words_per_sentence = [len(s.split()) for s in sentences_from_regex]

    # Formal-register markers; matched case-sensitively (Polish honorifics
    # are capitalized).
    formal_words = ['Pan', 'Pani', 'Państwo', 'uprzejmie', 'proszę', 'dziękuję']
    formal_count = sum(text.count(word) for word in formal_words)

    # Discourse connectives; matched case-insensitively.
    cohesive_words = ['jednak', 'ponadto', 'w konsekwencji', 'zatem', 'więc', 'dlatego', 'natomiast', 'niemniej']
    text_lower = text.lower()  # hoisted: was recomputed for every connective
    cohesion_count = sum(text_lower.count(word) for word in cohesive_words)

    # Paired quotation marks -> one quote per two quote characters.
    quote_count = len(re.findall(r'[„"]', text)) // 2
    reference_count = len(re.findall(r'\[[0-9]+\]', text))

    first_words = [s.split()[0].lower() for s in sentences_from_regex if s.split()]

    return {
        'formal_words_ratio': safe_divide(formal_count, len(words)),
        'cohesive_words_per_sentence': safe_divide(cohesion_count, num_sentences),
        'quotes_and_references_per_sentence': safe_divide(quote_count + reference_count, num_sentences),
        'unique_sentence_beginnings_ratio': safe_divide(len(set(first_words)), num_sentences),
        'commas_per_sentence': safe_divide(text.count(','), num_sentences),
        'semicolons_per_sentence': safe_divide(text.count(';'), num_sentences),
        'dashes_per_sentence': safe_divide(text.count('—') + text.count('–'), num_sentences),
        'colons_per_sentence': safe_divide(text.count(':'), num_sentences),
        'short_sentences_ratio': safe_divide(sum(1 for c in words_per_sentence if c < 5), num_sentences),
        'long_sentences_ratio': safe_divide(sum(1 for c in words_per_sentence if c > 30), num_sentences)
    }
|
|
|
|
|
|
|
|
|
|
|
|
def calculate_all_linguistic_features(text: str, text_lower: str, words: List[str], words_lower: List[str], sentences: List[str]) -> Dict[str, float]:
    """Aggregate all linguistic and stylistic features from this module.

    Args:
        text: Raw input text.
        text_lower: Lower-cased text.
        words: Word tokens.
        words_lower: Lower-cased word tokens.
        sentences: Pre-split sentences.

    Returns:
        Merged dict of every feature produced by this module.
    """
    features: Dict[str, float] = {}
    # Tuple literal evaluates left-to-right, preserving both call order and
    # the resulting dict's insertion order.
    for partial in (
        calculate_stop_word_ratio(words_lower),
        count_bad_words(words_lower),
        calculate_unigram_entropy(words_lower),
        count_non_alpha_words(text),
        calculate_symbol_to_word_ratio(words, text),
        calculate_ngram_fractions(words),
        analyze_stylistic_metrics(text, words, sentences),
    ):
        features.update(partial)
    features['javascript_counts_per_line'] = text_lower.count('javascript')
    return features