""" Moduł do ekstrakcji cech strukturalnych i formatowania tekstu. """ import re from collections import Counter from statistics import mean, variance from typing import Dict, List from ..utils import safe_divide from ..constants import MARKDOWN_PATTERNS # --- Funkcje analizujące strukturę paragrafów --- def analyze_paragraph_stats(text: str) -> Dict[str, float]: """Analizuje statystyki paragrafów.""" paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()] if not paragraphs: return {'avg_paragraph_length': 0.0, 'paragraph_length_variance': 0.0} para_lengths_in_words = [len(p.split()) for p in paragraphs] return { 'avg_paragraph_length': mean(para_lengths_in_words) if para_lengths_in_words else 0.0, 'paragraph_length_variance': variance(para_lengths_in_words) if len(para_lengths_in_words) > 1 else 0.0 } # --- Funkcje analizujące formatowanie i elementy nietekstowe --- def analyze_formatting_and_links(text: str, sentences: List[str]) -> Dict[str, float]: """Zlicza elementy formatowania (HTML, BBCode), linki, emotikony i slang.""" total_chars = len(text) words = text.split() html_tags = re.findall(r'<[^>]+>', text) bbcode_tags = re.findall(r'\[[^\]]+\]', text) slang_words_count = len(re.findall(r'\b(?:lol|omg|lmao|xd|wtf)\b', text.lower())) markup_len = sum(len(tag) for tag in html_tags) + sum(len(tag) for tag in bbcode_tags) incomplete_sentences_count = sum(1 for s in sentences if len(s.split()) < 3) return { 'html_tags': len(html_tags), 'bbcode_tags': len(bbcode_tags), 'urls': len(re.findall(r'https?://\S+|www\.\S+', text)), 'text_to_markup_ratio': safe_divide(total_chars - markup_len, total_chars), 'emoticons': len(re.findall(r'[:;=]-?[)(DPp]', text)), 'slang_words': slang_words_count, 'slang_words_ratio': safe_divide(slang_words_count, len(words)), 'excessive_chars': len(re.findall(r'(\.|,|-|_){4,}', text)), 'incomplete_sentences': incomplete_sentences_count, } def analyze_markdown_features(text: str) -> Dict[str, float]: """Analizuje użycie składni Markdown, w tym wskaźniki poszczególnych znaków.""" total_chars = len(text) if not total_chars: # Zwróć zera dla wszystkich cech, aby uniknąć błędów keys = [f'{name}_per_1000_chars_md' for name in MARKDOWN_PATTERNS.keys()] keys += [f'char_ratio_{ch}' for ch in ['#', '*', '-', '+', '[', ']', '(', ')', '`', '>', '_', '!']] keys += ['average_header_level_md', 'special_chars_ratio_md', 'lowercase_ratio_md', 'uppercase_ratio_md', 'digit_ratio_md', 'whitespace_ratio_md'] return {key: 0.0 for key in keys} features = {'average_header_level_md': 0.0} # Podstawowe elementy Markdown for name, pattern in MARKDOWN_PATTERNS.items(): count = len(pattern.findall(text)) # Zmieniamy nazwy kluczy, aby pasowały do column_order key_name_map = {'header': 'headers', 'bold': 'bold', 'italic': 'italic', 'unordered_list': 'unordered_list_items', 'ordered_list': 'ordered_list_items', 'link': 'links', 'image': 'images', 'inline_code': 'inline_code_fragments', 'code_block': 'code_blocks', 'blockquote': 'blockquotes', 'horizontal_rule': 'horizontal_rules'} features[f'{key_name_map[name]}_per_1000_chars_md'] = safe_divide(count * 1000, total_chars) # Średni poziom nagłówków headers = MARKDOWN_PATTERNS['header'].findall(text) if headers: header_levels = [h.count('#') for h in headers] features['average_header_level_md'] = safe_divide(sum(header_levels), len(header_levels)) # Cechy znakowe char_counts = Counter(text) special_chars_list = ['#', '*', '-', '+', '[', ']', '(', ')', '`', '>', '_', '!'] for ch in special_chars_list: features[f'char_ratio_{ch}'] = safe_divide(char_counts.get(ch, 0), total_chars) features['special_chars_ratio_md'] = safe_divide(sum(char_counts.get(ch, 0) for ch in special_chars_list), total_chars) features['lowercase_ratio_md'] = safe_divide(sum(1 for c in text if c.islower()), total_chars) features['uppercase_ratio_md'] = safe_divide(sum(1 for c in text if c.isupper()), total_chars) features['digit_ratio_md'] = safe_divide(sum(1 for c in text if c.isdigit()), total_chars) features['whitespace_ratio_md'] = safe_divide(sum(1 for c in text if c.isspace()), total_chars) return features def analyze_markdown_table_structure(text: str, lines: List[str]) -> Dict[str, float]: """Analizuje cechy związane z tabelami w Markdown.""" total_chars = len(text) total_lines = len(lines) if not total_chars: return {'table_pipe_count': 0, 'table_pipe_ratio': 0.0, 'table_pipe_per_1000_chars': 0.0, 'table_lines_count': 0, 'table_lines_ratio': 0.0, 'table_header_separators_count': 0, 'avg_pipes_per_table_line': 0.0, 'estimated_avg_columns': 0.0} pipe_count = text.count('|') table_lines = [line for line in lines if '|' in line] header_separators = len([line for line in table_lines if re.match(r'^[\|\-\:\s]+$', line.strip())]) return { 'table_pipe_count': pipe_count, 'table_pipe_ratio': safe_divide(pipe_count, total_chars), 'table_pipe_per_1000_chars': safe_divide(pipe_count * 1000, total_chars), 'table_lines_count': len(table_lines), 'table_lines_ratio': safe_divide(len(table_lines), total_lines), 'table_header_separators_count': header_separators, 'avg_pipes_per_table_line': safe_divide(pipe_count, len(table_lines)), 'estimated_avg_columns': safe_divide(pipe_count, (len(table_lines) * 2)) if table_lines else 0 } # --- Funkcje analizujące strukturę linii --- def analyze_line_structure(lines: List[str]) -> Dict[str, float]: """Analizuje linie pod kątem specyficznych struktur.""" total_lines = len(lines) non_empty_lines = [line.strip() for line in lines if line.strip()] if not total_lines: return {'lines_with_bullet': 0, 'ratio_of_bulletpoints': 0.0, 'single_word_line_ratio': 0.0, 'repeated_word_line_ratio': 0.0} bullets = {'•', '○', '‣', '-', '–', '—', '·', '⚪', '⚫', '▢', '■', '→', '★', '✓', '✕', '◇', '◆', '➤', '«', '»'} bullet_count = sum(1 for line in non_empty_lines if line and line[0] in bullets) return { 'lines_with_bullet': bullet_count, 'ratio_of_bulletpoints': safe_divide(bullet_count, len(non_empty_lines)), 'single_word_line_ratio': safe_divide(sum(1 for l in non_empty_lines if len(l.split()) == 1), total_lines), 'repeated_word_line_ratio': safe_divide(sum(1 for l in non_empty_lines if len(l.split()) > 1 and len(set(l.split())) == 1), total_lines) } # --- Główna funkcja agregująca --- def calculate_all_structural_features(text: str, lines: List[str], sentences: List[str]) -> Dict[str, float]: """Agreguje wszystkie cechy strukturalne i formatowania.""" features = {} features.update(analyze_paragraph_stats(text)) features.update(analyze_formatting_and_links(text, sentences)) features.update(analyze_markdown_features(text)) features.update(analyze_markdown_table_structure(text, lines)) features.update(analyze_line_structure(lines)) return features