|
|
"""
|
|
|
Moduł do ekstrakcji cech strukturalnych i formatowania tekstu.
|
|
|
"""
|
|
|
import re
|
|
|
from collections import Counter
|
|
|
from statistics import mean, variance
|
|
|
from typing import Dict, List
|
|
|
|
|
|
from ..utils import safe_divide
|
|
|
from ..constants import MARKDOWN_PATTERNS
|
|
|
|
|
|
|
|
|
|
|
|
def analyze_paragraph_stats(text: str) -> Dict[str, float]:
    """Compute word-count statistics over the paragraphs of ``text``.

    A paragraph is a chunk obtained by splitting ``text`` on two consecutive
    newline characters; chunks that are empty or whitespace-only are ignored.
    Paragraph length is the number of whitespace-separated tokens.

    Returns:
        A dict with two entries:
        ``avg_paragraph_length`` - mean paragraph length in words
        (0.0 when there are no paragraphs);
        ``paragraph_length_variance`` - sample variance of the paragraph
        lengths (0.0 when there are fewer than two paragraphs).
    """
    word_counts = []
    for chunk in text.split('\n\n'):
        stripped = chunk.strip()
        if stripped:
            word_counts.append(len(stripped.split()))

    stats = {'avg_paragraph_length': 0.0, 'paragraph_length_variance': 0.0}
    if word_counts:
        stats['avg_paragraph_length'] = mean(word_counts)
        # statistics.variance needs at least two data points.
        if len(word_counts) >= 2:
            stats['paragraph_length_variance'] = variance(word_counts)
    return stats
|
|
|
|
|
|
|
|
|
|
|
|
def analyze_formatting_and_links(text: str, sentences: List[str]) -> Dict[str, float]:
    """Count markup tags, links, emoticons, slang and similar informal signals.

    Args:
        text: The raw text to scan.
        sentences: The sentences of the text; only used to count short ones.

    Returns:
        A dict with the following entries:
        ``html_tags`` - number of ``<...>`` spans;
        ``bbcode_tags`` - number of ``[...]`` spans;
        ``urls`` - number of ``http(s)://`` or ``www.`` tokens;
        ``text_to_markup_ratio`` - characters outside the HTML/BBCode spans
        divided by all characters;
        ``emoticons`` - matches of an eyes / optional nose / mouth pattern;
        ``slang_words`` - case-insensitive whole-word hits of a fixed slang list;
        ``slang_words_ratio`` - slang hits divided by whitespace-separated tokens;
        ``excessive_chars`` - runs of four or more of ``.``, ``,``, ``-``, ``_``;
        ``incomplete_sentences`` - sentences with fewer than three tokens.
    """
    char_total = len(text)
    token_total = len(text.split())

    html_matches = re.findall(r'<[^>]+>', text)
    bbcode_matches = re.findall(r'\[[^\]]+\]', text)
    # Total number of characters taken up by the matched tags.
    markup_chars = sum(map(len, html_matches)) + sum(map(len, bbcode_matches))

    # Lower-casing the text first makes the slang match case-insensitive.
    slang_total = len(re.findall(r'\b(?:lol|omg|lmao|xd|wtf)\b', text.lower()))

    short_sentences = [s for s in sentences if len(s.split()) < 3]

    url_matches = re.findall(r'https?://\S+|www\.\S+', text)
    emoticon_matches = re.findall(r'[:;=]-?[)(DPp]', text)
    char_run_matches = re.findall(r'(\.|,|-|_){4,}', text)

    result = {}
    result['html_tags'] = len(html_matches)
    result['bbcode_tags'] = len(bbcode_matches)
    result['urls'] = len(url_matches)
    # NOTE(review): safe_divide presumably returns a fallback for a zero
    # divisor (empty text) - confirm against ..utils.
    result['text_to_markup_ratio'] = safe_divide(char_total - markup_chars, char_total)
    result['emoticons'] = len(emoticon_matches)
    result['slang_words'] = slang_total
    result['slang_words_ratio'] = safe_divide(slang_total, token_total)
    result['excessive_chars'] = len(char_run_matches)
    result['incomplete_sentences'] = len(short_sentences)
    return result
|
|
|
|
|
|
def analyze_markdown_features(text: str) -> Dict[str, float]:
    """Measure Markdown syntax usage and per-character ratios in ``text``.

    The returned dict always has the same keys, whether or not ``text`` is
    empty (for empty text every value is 0.0):

    * ``<stem>_per_1000_chars_md`` for every entry of ``MARKDOWN_PATTERNS`` -
      number of pattern matches per 1000 characters. ``<stem>`` is the
      pluralised feature name from the mapping below; a pattern name missing
      from that mapping is used unchanged.
    * ``average_header_level_md`` - mean number of ``#`` characters per
      header match (0.0 when there are no headers).
    * ``char_ratio_<ch>`` for each Markdown-relevant character - occurrences
      divided by the text length.
    * ``special_chars_ratio_md`` - combined share of those characters.
    * ``lowercase_ratio_md``, ``uppercase_ratio_md``, ``digit_ratio_md``,
      ``whitespace_ratio_md`` - share of characters in each class.
    """
    # Maps MARKDOWN_PATTERNS keys to the stems used in the feature names.
    feature_stems = {
        'header': 'headers',
        'bold': 'bold',
        'italic': 'italic',
        'unordered_list': 'unordered_list_items',
        'ordered_list': 'ordered_list_items',
        'link': 'links',
        'image': 'images',
        'inline_code': 'inline_code_fragments',
        'code_block': 'code_blocks',
        'blockquote': 'blockquotes',
        'horizontal_rule': 'horizontal_rules',
    }
    markdown_chars = ('#', '*', '-', '+', '[', ']', '(', ')', '`', '>', '_', '!')
    class_ratio_keys = ('lowercase_ratio_md', 'uppercase_ratio_md',
                        'digit_ratio_md', 'whitespace_ratio_md')

    # Built once so the empty and non-empty branches share the same key names.
    per_1000_keys = {
        name: f'{feature_stems.get(name, name)}_per_1000_chars_md'
        for name in MARKDOWN_PATTERNS
    }

    total_chars = len(text)
    if not total_chars:
        zero_keys = ['average_header_level_md']
        zero_keys.extend(per_1000_keys.values())
        zero_keys.extend(f'char_ratio_{ch}' for ch in markdown_chars)
        zero_keys.append('special_chars_ratio_md')
        zero_keys.extend(class_ratio_keys)
        return dict.fromkeys(zero_keys, 0.0)

    features = {'average_header_level_md': 0.0}

    for name, pattern in MARKDOWN_PATTERNS.items():
        match_count = len(pattern.findall(text))
        features[per_1000_keys[name]] = safe_divide(match_count * 1000, total_chars)

    # NOTE(review): assumes findall() on the header pattern yields strings that
    # contain the leading '#' run - confirm against ..constants.
    headers = MARKDOWN_PATTERNS['header'].findall(text)
    if headers:
        header_levels = [h.count('#') for h in headers]
        features['average_header_level_md'] = safe_divide(sum(header_levels), len(header_levels))

    char_counts = Counter(text)
    for ch in markdown_chars:
        features[f'char_ratio_{ch}'] = safe_divide(char_counts.get(ch, 0), total_chars)

    special_total = sum(char_counts.get(ch, 0) for ch in markdown_chars)
    features['special_chars_ratio_md'] = safe_divide(special_total, total_chars)
    features['lowercase_ratio_md'] = safe_divide(sum(1 for c in text if c.islower()), total_chars)
    features['uppercase_ratio_md'] = safe_divide(sum(1 for c in text if c.isupper()), total_chars)
    features['digit_ratio_md'] = safe_divide(sum(1 for c in text if c.isdigit()), total_chars)
    features['whitespace_ratio_md'] = safe_divide(sum(1 for c in text if c.isspace()), total_chars)

    return features
|
|
|
|
|
|
def analyze_markdown_table_structure(text: str, lines: List[str]) -> Dict[str, float]:
    """Derive Markdown-table features from pipe characters in the text.

    Args:
        text: The full text; used for the pipe count and character total.
        lines: The lines of the text; a line counts as a table line when it
            contains at least one ``|``.

    Returns:
        A dict with the following entries (all zero for empty ``text``):
        ``table_pipe_count`` - number of ``|`` characters in ``text``;
        ``table_pipe_ratio`` - pipes divided by the text length;
        ``table_pipe_per_1000_chars`` - pipes per 1000 characters;
        ``table_lines_count`` - number of table lines;
        ``table_lines_ratio`` - table lines divided by ``len(lines)``;
        ``table_header_separators_count`` - table lines that look like a
        delimiter row (only ``|``, ``-``, ``:`` and whitespace, with at
        least one ``-``);
        ``avg_pipes_per_table_line`` - pipes divided by table lines;
        ``estimated_avg_columns`` - pipes divided by twice the number of
        table lines (0.0 when there are no table lines).
    """
    total_chars = len(text)
    if not total_chars:
        return {'table_pipe_count': 0, 'table_pipe_ratio': 0.0, 'table_pipe_per_1000_chars': 0.0,
                'table_lines_count': 0, 'table_lines_ratio': 0.0, 'table_header_separators_count': 0,
                'avg_pipes_per_table_line': 0.0, 'estimated_avg_columns': 0.0}

    total_lines = len(lines)
    pipe_count = text.count('|')
    table_lines = [line for line in lines if '|' in line]

    # A delimiter row must contain a hyphen; without that check, rows made only
    # of pipes/colons/spaces (e.g. "| | |") would be counted as separators.
    header_separators = sum(
        1 for line in table_lines
        if '-' in line and re.fullmatch(r'[|\-:\s]+', line.strip())
    )
    table_line_count = len(table_lines)

    return {
        'table_pipe_count': pipe_count,
        'table_pipe_ratio': safe_divide(pipe_count, total_chars),
        'table_pipe_per_1000_chars': safe_divide(pipe_count * 1000, total_chars),
        'table_lines_count': table_line_count,
        'table_lines_ratio': safe_divide(table_line_count, total_lines),
        'table_header_separators_count': header_separators,
        'avg_pipes_per_table_line': safe_divide(pipe_count, table_line_count),
        'estimated_avg_columns': (
            safe_divide(pipe_count, table_line_count * 2) if table_lines else 0.0
        ),
    }
|
|
|
|
|
|
|
|
|
|
|
|
def analyze_line_structure(lines: List[str]) -> Dict[str, float]:
    """Measure bullet-point, single-word and repeated-word lines.

    Lines are stripped before inspection and blank lines are skipped.

    Returns:
        A dict with the following entries (all zero when ``lines`` is empty):
        ``lines_with_bullet`` - non-blank lines whose first character is a
        bullet symbol;
        ``ratio_of_bulletpoints`` - that count divided by the number of
        non-blank lines;
        ``single_word_line_ratio`` - lines holding exactly one token, divided
        by the total number of lines (blank ones included);
        ``repeated_word_line_ratio`` - lines with several tokens that are all
        the same word, divided by the total number of lines.
    """
    line_total = len(lines)
    if line_total == 0:
        return {'lines_with_bullet': 0, 'ratio_of_bulletpoints': 0.0,
                'single_word_line_ratio': 0.0, 'repeated_word_line_ratio': 0.0}

    bullet_marks = {'•', '○', '‣', '-', '–', '—', '·', '⚪', '⚫', '▢', '■', '→', '★', '✓', '✕', '◇', '◆', '➤', '«', '»'}

    stripped_lines = []
    for raw in lines:
        content = raw.strip()
        if content:
            stripped_lines.append(content)

    bulleted = 0
    single_word = 0
    repeated_word = 0
    for content in stripped_lines:
        if content[0] in bullet_marks:
            bulleted += 1
        tokens = content.split()
        if len(tokens) == 1:
            single_word += 1
        elif len(set(tokens)) == 1:
            # Several tokens, all identical.
            repeated_word += 1

    return {
        'lines_with_bullet': bulleted,
        # Bullet ratio is relative to non-blank lines; the two ratios below
        # are relative to all lines.
        'ratio_of_bulletpoints': safe_divide(bulleted, len(stripped_lines)),
        'single_word_line_ratio': safe_divide(single_word, line_total),
        'repeated_word_line_ratio': safe_divide(repeated_word, line_total),
    }
|
|
|
|
|
|
|
|
|
|
|
|
def calculate_all_structural_features(text: str, lines: List[str], sentences: List[str]) -> Dict[str, float]:
    """Merge the output of every structural/formatting analyzer into one dict.

    Args:
        text: The full text to analyze.
        lines: The lines of the text.
        sentences: The sentences of the text.

    Returns:
        A single dict holding the entries produced by the paragraph,
        formatting/link, Markdown, Markdown-table and line-structure
        analyzers. If two analyzers emit the same key, the later one wins.
    """
    partial_results = (
        analyze_paragraph_stats(text),
        analyze_formatting_and_links(text, sentences),
        analyze_markdown_features(text),
        analyze_markdown_table_structure(text, lines),
        analyze_line_structure(lines),
    )

    combined = {}
    for partial in partial_results:
        combined.update(partial)
    return combined