Joblib
File size: 7,651 Bytes
5c8f9d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
"""

Module for extracting structural and formatting features from text.

"""
import re
from collections import Counter
from statistics import mean, variance
from typing import Dict, List

from ..utils import safe_divide
from ..constants import MARKDOWN_PATTERNS

# --- Functions analyzing paragraph structure ---

def analyze_paragraph_stats(text: str) -> Dict[str, float]:
    """Compute word-count statistics over the paragraphs of ``text``.

    Paragraphs are the pieces obtained by splitting on a blank line
    (``'\\n\\n'``); pieces that are empty after stripping are ignored.

    Returns:
        A dict with ``avg_paragraph_length`` (mean number of
        whitespace-separated words per paragraph) and
        ``paragraph_length_variance`` (sample variance of those counts,
        ``0.0`` when fewer than two paragraphs exist). Both are ``0.0``
        when the text contains no paragraphs.
    """
    word_counts = []
    for chunk in text.split('\n\n'):
        chunk = chunk.strip()
        if chunk:
            word_counts.append(len(chunk.split()))

    if not word_counts:
        return {'avg_paragraph_length': 0.0, 'paragraph_length_variance': 0.0}

    # Sample variance is undefined for a single observation, so fall back to 0.0.
    if len(word_counts) > 1:
        spread = variance(word_counts)
    else:
        spread = 0.0

    return {
        'avg_paragraph_length': mean(word_counts),
        'paragraph_length_variance': spread,
    }

# --- Functions analyzing formatting and non-text elements ---

def analyze_formatting_and_links(text: str, sentences: List[str]) -> Dict[str, float]:
    """Count markup, links, emoticons, slang and other informal-text signals.

    Args:
        text: The raw text to scan.
        sentences: The text already split into sentences by the caller.

    Returns:
        A dict with these keys:
        ``html_tags`` / ``bbcode_tags`` - number of ``<...>`` / ``[...]`` spans;
        ``urls`` - number of ``http(s)://`` or ``www.`` links;
        ``text_to_markup_ratio`` - share of characters not inside those spans;
        ``emoticons`` - number of simple ASCII emoticons such as ``:)``;
        ``slang_words`` / ``slang_words_ratio`` - hits from a fixed slang list,
        absolute and relative to the whitespace-separated word count;
        ``excessive_chars`` - runs of four or more of ``. , - _``;
        ``incomplete_sentences`` - sentences with fewer than three words.
    """
    char_total = len(text)
    word_total = len(text.split())

    html_matches = re.findall(r'<[^>]+>', text)
    bbcode_matches = re.findall(r'\[[^\]]+\]', text)
    # Characters consumed by markup; HTML and BBCode spans are simply added up.
    markup_chars = sum(map(len, html_matches)) + sum(map(len, bbcode_matches))

    # Slang matching is case-insensitive via lowercasing the whole text.
    slang_hits = len(re.findall(r'\b(?:lol|omg|lmao|xd|wtf)\b', text.lower()))

    short_sentences = 0
    for sentence in sentences:
        if len(sentence.split()) < 3:
            short_sentences += 1

    # safe_divide is a project helper; presumably it guards against a zero
    # denominator (empty text) - confirm in utils.
    stats = {}
    stats['html_tags'] = len(html_matches)
    stats['bbcode_tags'] = len(bbcode_matches)
    stats['urls'] = len(re.findall(r'https?://\S+|www\.\S+', text))
    stats['text_to_markup_ratio'] = safe_divide(char_total - markup_chars, char_total)
    stats['emoticons'] = len(re.findall(r'[:;=]-?[)(DPp]', text))
    stats['slang_words'] = slang_hits
    stats['slang_words_ratio'] = safe_divide(slang_hits, word_total)
    stats['excessive_chars'] = len(re.findall(r'(\.|,|-|_){4,}', text))
    stats['incomplete_sentences'] = short_sentences
    return stats

def analyze_markdown_features(text: str) -> Dict[str, float]:
    """Measure Markdown syntax usage and character-class ratios in ``text``.

    Returns:
        A dict with the same keys for empty and non-empty input:

        * ``average_header_level_md`` - mean number of ``#`` per header match
          (``0.0`` when no headers are found);
        * ``<stem>_per_1000_chars_md`` - one entry per pattern in
          ``MARKDOWN_PATTERNS``, match count scaled to 1000 characters;
        * ``char_ratio_<ch>`` - share of each Markdown-relevant character;
        * ``special_chars_ratio_md`` - combined share of those characters;
        * ``lowercase_ratio_md``, ``uppercase_ratio_md``, ``digit_ratio_md``,
          ``whitespace_ratio_md`` - share of each character class.

        For empty text every value is ``0.0``.
    """
    # Pattern name -> feature-name stem. The stems are the names downstream
    # consumers expect (per the original note: they must match column_order).
    key_name_map = {'header': 'headers', 'bold': 'bold', 'italic': 'italic',
                    'unordered_list': 'unordered_list_items', 'ordered_list': 'ordered_list_items',
                    'link': 'links', 'image': 'images', 'inline_code': 'inline_code_fragments',
                    'code_block': 'code_blocks', 'blockquote': 'blockquotes', 'horizontal_rule': 'horizontal_rules'}
    special_chars_list = ['#', '*', '-', '+', '[', ']', '(', ')', '`', '>', '_', '!']

    def rate_key(name: str) -> str:
        # Single source of truth for the per-1000-chars key so the empty and
        # non-empty branches cannot drift apart. A pattern name missing from
        # the map keeps its raw name instead of raising KeyError.
        return f'{key_name_map.get(name, name)}_per_1000_chars_md'

    total_chars = len(text)
    if not total_chars:
        # Return zeros under exactly the keys the non-empty branch produces.
        keys = ['average_header_level_md']
        keys += [rate_key(name) for name in MARKDOWN_PATTERNS]
        keys += [f'char_ratio_{ch}' for ch in special_chars_list]
        keys += ['special_chars_ratio_md', 'lowercase_ratio_md',
                 'uppercase_ratio_md', 'digit_ratio_md', 'whitespace_ratio_md']
        return {key: 0.0 for key in keys}

    features = {'average_header_level_md': 0.0}

    # Basic Markdown elements: match count per 1000 characters.
    for name, pattern in MARKDOWN_PATTERNS.items():
        count = len(pattern.findall(text))
        features[rate_key(name)] = safe_divide(count * 1000, total_chars)

    # Average header level: number of '#' in each header match.
    # NOTE(review): assumes the header pattern's findall() yields strings
    # (zero or one capture group) - confirm against constants.
    headers = MARKDOWN_PATTERNS['header'].findall(text)
    if headers:
        header_levels = [h.count('#') for h in headers]
        features['average_header_level_md'] = safe_divide(sum(header_levels), len(header_levels))

    # Character-level features.
    char_counts = Counter(text)
    for ch in special_chars_list:
        features[f'char_ratio_{ch}'] = safe_divide(char_counts.get(ch, 0), total_chars)

    features['special_chars_ratio_md'] = safe_divide(sum(char_counts.get(ch, 0) for ch in special_chars_list), total_chars)
    features['lowercase_ratio_md'] = safe_divide(sum(1 for c in text if c.islower()), total_chars)
    features['uppercase_ratio_md'] = safe_divide(sum(1 for c in text if c.isupper()), total_chars)
    features['digit_ratio_md'] = safe_divide(sum(1 for c in text if c.isdigit()), total_chars)
    features['whitespace_ratio_md'] = safe_divide(sum(1 for c in text if c.isspace()), total_chars)

    return features

def analyze_markdown_table_structure(text: str, lines: List[str]) -> Dict[str, float]:
    """Derive Markdown-table indicators from pipe (``|``) usage.

    Args:
        text: The full text.
        lines: The same text already split into lines by the caller.

    Returns:
        A dict with pipe counts and ratios, the number and share of lines
        containing a pipe, the number of such lines that consist only of
        ``| - :`` and whitespace (header separators), the average pipes per
        pipe-containing line, and a column estimate computed as
        pipes / (2 * pipe-containing lines). All values are zero for empty text.
    """
    char_total = len(text)
    line_total = len(lines)
    if char_total == 0:
        return {
            'table_pipe_count': 0,
            'table_pipe_ratio': 0.0,
            'table_pipe_per_1000_chars': 0.0,
            'table_lines_count': 0,
            'table_lines_ratio': 0.0,
            'table_header_separators_count': 0,
            'avg_pipes_per_table_line': 0.0,
            'estimated_avg_columns': 0.0,
        }

    pipe_total = text.count('|')

    # One pass over the lines: count those containing a pipe and, among them,
    # the ones made up solely of separator characters.
    pipe_rows = 0
    separator_rows = 0
    for row in lines:
        if '|' not in row:
            continue
        pipe_rows += 1
        if re.match(r'^[\|\-\:\s]+$', row.strip()):
            separator_rows += 1

    stats = {}
    stats['table_pipe_count'] = pipe_total
    stats['table_pipe_ratio'] = safe_divide(pipe_total, char_total)
    stats['table_pipe_per_1000_chars'] = safe_divide(pipe_total * 1000, char_total)
    stats['table_lines_count'] = pipe_rows
    stats['table_lines_ratio'] = safe_divide(pipe_rows, line_total)
    stats['table_header_separators_count'] = separator_rows
    stats['avg_pipes_per_table_line'] = safe_divide(pipe_total, pipe_rows)
    if pipe_rows:
        stats['estimated_avg_columns'] = safe_divide(pipe_total, (pipe_rows * 2))
    else:
        stats['estimated_avg_columns'] = 0
    return stats

# --- Functions analyzing line structure ---

def analyze_line_structure(lines: List[str]) -> Dict[str, float]:
    """Inspect individual lines for bullet markers and degenerate content.

    Args:
        lines: The text split into lines by the caller.

    Returns:
        A dict with:
        ``lines_with_bullet`` - non-empty lines whose first character (after
        stripping) is one of the recognised bullet glyphs;
        ``ratio_of_bulletpoints`` - that count relative to non-empty lines;
        ``single_word_line_ratio`` - lines holding exactly one word, relative
        to all lines (empty ones included);
        ``repeated_word_line_ratio`` - lines with several words that are all
        the same word, relative to all lines.
        All values are zero when ``lines`` is empty.
    """
    line_total = len(lines)
    if line_total == 0:
        return {'lines_with_bullet': 0, 'ratio_of_bulletpoints': 0.0,
                'single_word_line_ratio': 0.0, 'repeated_word_line_ratio': 0.0}

    bullet_glyphs = {'•', '○', '‣', '-', '–', '—', '·', '⚪', '⚫', '▢', '■', '→', '★', '✓', '✕', '◇', '◆', '➤', '«', '»'}

    content_total = 0      # lines that are non-empty after stripping
    bullet_hits = 0
    single_word_hits = 0
    repeated_word_hits = 0
    for raw in lines:
        stripped = raw.strip()
        if not stripped:
            continue
        content_total += 1
        if stripped[0] in bullet_glyphs:
            bullet_hits += 1
        tokens = stripped.split()
        if len(tokens) == 1:
            single_word_hits += 1
        elif len(set(tokens)) == 1:
            # More than one token, but every token is identical.
            repeated_word_hits += 1

    # Note the differing denominators: bullets are relative to non-empty
    # lines, the other two ratios to the total line count.
    return {
        'lines_with_bullet': bullet_hits,
        'ratio_of_bulletpoints': safe_divide(bullet_hits, content_total),
        'single_word_line_ratio': safe_divide(single_word_hits, line_total),
        'repeated_word_line_ratio': safe_divide(repeated_word_hits, line_total),
    }

# --- Main aggregating function ---

def calculate_all_structural_features(text: str, lines: List[str], sentences: List[str]) -> Dict[str, float]:
    """Collect every structural and formatting feature into one dict.

    Args:
        text: The full text.
        lines: The text split into lines by the caller.
        sentences: The text split into sentences by the caller.

    Returns:
        The merged output of the paragraph, formatting/link, Markdown,
        Markdown-table and line-structure analysers, in that order. If two
        analysers emitted the same key, the later one would win.
    """
    return {
        **analyze_paragraph_stats(text),
        **analyze_formatting_and_links(text, sentences),
        **analyze_markdown_features(text),
        **analyze_markdown_table_structure(text, lines),
        **analyze_line_structure(lines),
    }