# File size: 7,651 Bytes | 5c8f9d2
"""
Moduł do ekstrakcji cech strukturalnych i formatowania tekstu.
"""
import re
from collections import Counter
from statistics import mean, variance
from typing import Dict, List
from ..utils import safe_divide
from ..constants import MARKDOWN_PATTERNS
# --- Funkcje analizujące strukturę paragrafów ---
def analyze_paragraph_stats(text: str) -> Dict[str, float]:
    """Compute word-count statistics over the paragraphs of ``text``.

    A paragraph is a non-blank chunk of text delimited by at least one blank
    line. A blank line may be empty or contain only whitespace, and both LF
    and CRLF line endings are recognised.

    Args:
        text: Raw text to analyse.

    Returns:
        A dict with two float entries:

        * ``avg_paragraph_length`` - mean number of whitespace-separated
          words per paragraph.
        * ``paragraph_length_variance`` - sample variance of those word
          counts; ``0.0`` when there are fewer than two paragraphs.

        Both values are ``0.0`` when ``text`` contains no paragraphs.
    """
    # A literal '\n\n' split misses "\r\n\r\n" and blank lines that hold only
    # spaces/tabs, which would merge separate paragraphs into one.
    chunks = (chunk.strip() for chunk in re.split(r'\n\s*\n', text))
    paragraphs = [chunk for chunk in chunks if chunk]
    if not paragraphs:
        return {'avg_paragraph_length': 0.0, 'paragraph_length_variance': 0.0}

    word_counts = [len(paragraph.split()) for paragraph in paragraphs]
    # statistics.mean/variance may return int for int data; coerce so the
    # result matches the declared Dict[str, float].
    return {
        'avg_paragraph_length': float(mean(word_counts)),
        # Sample variance is undefined for a single data point.
        'paragraph_length_variance': float(variance(word_counts)) if len(word_counts) > 1 else 0.0,
    }
# --- Funkcje analizujące formatowanie i elementy nietekstowe ---
def analyze_formatting_and_links(text: str, sentences: List[str]) -> Dict[str, float]:
    """Count markup tags, links, emoticons, slang and very short sentences.

    Args:
        text: Raw text to scan.
        sentences: Sentences to check for length (presumably the sentences of
            ``text`` as split by the caller - confirm).

    Returns:
        A dict with these entries:

        * ``html_tags`` / ``bbcode_tags`` - number of ``<...>`` and ``[...]``
          spans found in ``text``.
        * ``urls`` - number of ``http(s)://`` or ``www.`` tokens.
        * ``text_to_markup_ratio`` - characters outside those spans divided
          by all characters.
        * ``emoticons`` - number of simple ASCII emoticons such as ``:)``.
        * ``slang_words`` / ``slang_words_ratio`` - case-insensitive hits for
          a fixed slang list, absolute and per whitespace-separated word.
        * ``excessive_chars`` - runs of four or more characters drawn from
          ``.``, ``,``, ``-`` and ``_``.
        * ``incomplete_sentences`` - sentences with fewer than three
          whitespace-separated tokens.

        The ratios are computed with the project helper ``safe_divide``
        (presumably guarding against a zero denominator - confirm).
    """
    html_matches = re.findall(r'<[^>]+>', text)
    bbcode_matches = re.findall(r'\[[^\]]+\]', text)
    # Total characters taken up by the matched tag spans, brackets included.
    markup_chars = sum(map(len, html_matches)) + sum(map(len, bbcode_matches))

    # Lower-casing first makes the slang match case-insensitive.
    slang_total = len(re.findall(r'\b(?:lol|omg|lmao|xd|wtf)\b', text.lower()))
    short_sentence_total = len([s for s in sentences if len(s.split()) < 3])

    char_total = len(text)
    word_total = len(text.split())

    stats = {}
    stats['html_tags'] = len(html_matches)
    stats['bbcode_tags'] = len(bbcode_matches)
    stats['urls'] = len(re.findall(r'https?://\S+|www\.\S+', text))
    stats['text_to_markup_ratio'] = safe_divide(char_total - markup_chars, char_total)
    stats['emoticons'] = len(re.findall(r'[:;=]-?[)(DPp]', text))
    stats['slang_words'] = slang_total
    stats['slang_words_ratio'] = safe_divide(slang_total, word_total)
    stats['excessive_chars'] = len(re.findall(r'(\.|,|-|_){4,}', text))
    stats['incomplete_sentences'] = short_sentence_total
    return stats
def analyze_markdown_features(text: str) -> Dict[str, float]:
    """Measure Markdown syntax usage and character-class ratios in ``text``.

    The same set of keys is returned for empty and non-empty input, so every
    call yields the same feature schema.

    Args:
        text: Raw text to analyse.

    Returns:
        A dict containing:

        * ``average_header_level_md`` - mean number of ``#`` characters per
          header match, ``0.0`` when there are none.
        * ``<element>_per_1000_chars_md`` - matches per 1000 characters for
          every entry of ``MARKDOWN_PATTERNS``.
        * ``char_ratio_<ch>`` - share of each Markdown-significant character.
        * ``special_chars_ratio_md`` - combined share of those characters.
        * ``lowercase_ratio_md``, ``uppercase_ratio_md``, ``digit_ratio_md``,
          ``whitespace_ratio_md`` - share of each character class.

        All values are ``0.0`` for empty ``text``.

    Raises:
        KeyError: If ``MARKDOWN_PATTERNS`` has no ``'header'`` entry and
            ``text`` is non-empty.
    """
    # Pattern names in MARKDOWN_PATTERNS -> stems used in the emitted feature
    # names. Built once and shared by both branches so they cannot diverge.
    key_name_map = {'header': 'headers', 'bold': 'bold', 'italic': 'italic',
                    'unordered_list': 'unordered_list_items', 'ordered_list': 'ordered_list_items',
                    'link': 'links', 'image': 'images', 'inline_code': 'inline_code_fragments',
                    'code_block': 'code_blocks', 'blockquote': 'blockquotes',
                    'horizontal_rule': 'horizontal_rules'}
    special_chars_list = ['#', '*', '-', '+', '[', ']', '(', ')', '`', '>', '_', '!']
    class_ratio_keys = ['special_chars_ratio_md', 'lowercase_ratio_md', 'uppercase_ratio_md',
                        'digit_ratio_md', 'whitespace_ratio_md']

    def density_key(pattern_name: str) -> str:
        # A pattern without a mapping keeps its own name instead of failing.
        return f'{key_name_map.get(pattern_name, pattern_name)}_per_1000_chars_md'

    total_chars = len(text)
    if not total_chars:
        # Return zeros under exactly the keys the non-empty path produces.
        keys = ['average_header_level_md']
        keys += [density_key(name) for name in MARKDOWN_PATTERNS]
        keys += [f'char_ratio_{ch}' for ch in special_chars_list]
        keys += class_ratio_keys
        return dict.fromkeys(keys, 0.0)

    features = {'average_header_level_md': 0.0}

    # Density of each basic Markdown element.
    for name, pattern in MARKDOWN_PATTERNS.items():
        count = len(pattern.findall(text))
        features[density_key(name)] = safe_divide(count * 1000, total_chars)

    # Average header level.
    # NOTE(review): assumes each findall() result is a string that contains
    # the header's '#' run - confirm against MARKDOWN_PATTERNS['header'].
    headers = MARKDOWN_PATTERNS['header'].findall(text)
    if headers:
        header_levels = [h.count('#') for h in headers]
        features['average_header_level_md'] = safe_divide(sum(header_levels), len(header_levels))

    # Character-level features, all derived from one frequency table.
    char_counts = Counter(text)
    for ch in special_chars_list:
        features[f'char_ratio_{ch}'] = safe_divide(char_counts.get(ch, 0), total_chars)
    features['special_chars_ratio_md'] = safe_divide(
        sum(char_counts.get(ch, 0) for ch in special_chars_list), total_chars)

    def class_ratio(predicate) -> float:
        # Summing per distinct character gives the same total as testing
        # every character of the text individually.
        matching = sum(n for ch, n in char_counts.items() if predicate(ch))
        return safe_divide(matching, total_chars)

    features['lowercase_ratio_md'] = class_ratio(str.islower)
    features['uppercase_ratio_md'] = class_ratio(str.isupper)
    features['digit_ratio_md'] = class_ratio(str.isdigit)
    features['whitespace_ratio_md'] = class_ratio(str.isspace)
    return features
def analyze_markdown_table_structure(text: str, lines: List[str]) -> Dict[str, float]:
    """Derive Markdown-table indicators from pipe characters.

    Args:
        text: Raw text; pipes are counted over the whole string.
        lines: Lines to inspect individually (presumably the lines of
            ``text`` as split by the caller - confirm).

    Returns:
        A dict containing:

        * ``table_pipe_count`` - number of ``|`` characters in ``text``.
        * ``table_pipe_ratio`` / ``table_pipe_per_1000_chars`` - pipes per
          character and per 1000 characters.
        * ``table_lines_count`` / ``table_lines_ratio`` - lines containing a
          pipe, absolute and relative to ``len(lines)``.
        * ``table_header_separators_count`` - pipe lines made up only of
          pipes, hyphens, colons and whitespace, with at least one hyphen.
        * ``avg_pipes_per_table_line`` - pipes divided by pipe lines.
        * ``estimated_avg_columns`` - pipes divided by twice the number of
          pipe lines.

        Every value is zero when ``text`` is empty.
    """
    total_chars = len(text)
    total_lines = len(lines)
    if not total_chars:
        return {'table_pipe_count': 0, 'table_pipe_ratio': 0.0, 'table_pipe_per_1000_chars': 0.0,
                'table_lines_count': 0, 'table_lines_ratio': 0.0, 'table_header_separators_count': 0,
                'avg_pipes_per_table_line': 0.0, 'estimated_avg_columns': 0.0}

    pipe_count = text.count('|')
    table_lines = [line for line in lines if '|' in line]

    def is_delimiter_row(line: str) -> bool:
        stripped = line.strip()
        # A delimiter row needs at least one hyphen; without this check an
        # empty row such as "| | |" would be counted as a header separator.
        return '-' in stripped and re.match(r'^[\|\-\:\s]+$', stripped) is not None

    header_separators = sum(1 for line in table_lines if is_delimiter_row(line))

    return {
        'table_pipe_count': pipe_count,
        'table_pipe_ratio': safe_divide(pipe_count, total_chars),
        'table_pipe_per_1000_chars': safe_divide(pipe_count * 1000, total_chars),
        'table_lines_count': len(table_lines),
        'table_lines_ratio': safe_divide(len(table_lines), total_lines),
        'table_header_separators_count': header_separators,
        'avg_pipes_per_table_line': safe_divide(pipe_count, len(table_lines)),
        # Float fallback keeps the value type the same on both paths.
        'estimated_avg_columns': safe_divide(pipe_count, (len(table_lines) * 2)) if table_lines else 0.0,
    }
# --- Funkcje analizujące strukturę linii ---
def analyze_line_structure(lines: List[str]) -> Dict[str, float]:
    """Summarise bullet-like, single-word and repeated-word lines.

    Args:
        lines: Lines to inspect; each one is stripped before analysis and
            lines that are blank after stripping are ignored.

    Returns:
        A dict containing:

        * ``lines_with_bullet`` - non-blank lines whose first character is a
          bullet marker.
        * ``ratio_of_bulletpoints`` - that count relative to the non-blank
          lines.
        * ``single_word_line_ratio`` - lines holding exactly one token,
          relative to all lines.
        * ``repeated_word_line_ratio`` - lines holding several tokens that
          are all identical, relative to all lines.

        Every value is zero when ``lines`` is empty.
    """
    line_total = len(lines)
    if line_total == 0:
        return {'lines_with_bullet': 0, 'ratio_of_bulletpoints': 0.0,
                'single_word_line_ratio': 0.0, 'repeated_word_line_ratio': 0.0}

    bullet_markers = {'•', '○', '‣', '-', '–', '—', '·', '⚪', '⚫', '▢', '■', '→', '★', '✓', '✕', '◇', '◆', '➤', '«', '»'}
    content_lines = [stripped for stripped in (raw.strip() for raw in lines) if stripped]

    bullet_lines = 0
    single_word_lines = 0
    repeated_word_lines = 0
    for content in content_lines:
        if content[0] in bullet_markers:
            bullet_lines += 1
        tokens = content.split()
        if len(tokens) == 1:
            single_word_lines += 1
        elif len(set(tokens)) == 1:
            # Several tokens, all the same word.
            repeated_word_lines += 1

    # Bullets are measured against non-blank lines; the other two ratios use
    # the full line count, blank lines included.
    return {
        'lines_with_bullet': bullet_lines,
        'ratio_of_bulletpoints': safe_divide(bullet_lines, len(content_lines)),
        'single_word_line_ratio': safe_divide(single_word_lines, line_total),
        'repeated_word_line_ratio': safe_divide(repeated_word_lines, line_total),
    }
# --- Główna funkcja agregująca ---
def calculate_all_structural_features(text: str, lines: List[str], sentences: List[str]) -> Dict[str, float]:
    """Aggregate all structural and formatting features into one dict.

    Runs the paragraph, formatting/link, Markdown, Markdown-table and
    line-structure analysers in that order and merges their results.

    Args:
        text: Raw text to analyse.
        lines: Lines passed to the table and line-structure analysers
            (presumably the lines of ``text`` - confirm with the caller).
        sentences: Sentences passed to the formatting analyser (presumably
            the sentences of ``text`` - confirm with the caller).

    Returns:
        One dict holding the entries of every analyser. If two analysers
        emit the same key, the one that runs later wins.
    """
    features: Dict[str, float] = {}
    features.update(analyze_paragraph_stats(text))
    features.update(analyze_formatting_and_links(text, sentences))
    features.update(analyze_markdown_features(text))
    features.update(analyze_markdown_table_structure(text, lines))
    features.update(analyze_line_structure(lines))
    return features