File size: 7,651 Bytes

5c8f9d2

"""

Moduł do ekstrakcji cech strukturalnych i formatowania tekstu.

"""
import re
from collections import Counter
from statistics import mean, variance
from typing import Dict, List

from ..utils import safe_divide
from ..constants import MARKDOWN_PATTERNS

# --- Funkcje analizujące strukturę paragrafów ---

def analyze_paragraph_stats(text: str) -> Dict[str, float]:
    """Summarise paragraph lengths, measured in whitespace-separated words.

    Paragraphs are the chunks of ``text`` separated by a blank line (the
    exact two-character sequence ``'\\n\\n'``); chunks that are empty after
    stripping whitespace are ignored.

    Args:
        text: The document to analyse.

    Returns:
        A dict with two keys:

        * ``avg_paragraph_length`` - mean number of words per paragraph.
        * ``paragraph_length_variance`` - sample variance of those word
          counts, or ``0.0`` when there are fewer than two paragraphs.

        Both values are ``0.0`` when the text contains no paragraphs.
    """
    word_counts = []
    for chunk in text.split('\n\n'):
        paragraph = chunk.strip()
        if paragraph:
            word_counts.append(len(paragraph.split()))

    if not word_counts:
        return {'avg_paragraph_length': 0.0, 'paragraph_length_variance': 0.0}

    # The sample variance is undefined for a single observation, so it is
    # reported as 0.0 in that case instead of letting ``variance`` raise.
    spread = 0.0
    if len(word_counts) >= 2:
        spread = variance(word_counts)

    return {
        'avg_paragraph_length': mean(word_counts),
        'paragraph_length_variance': spread,
    }

# --- Funkcje analizujące formatowanie i elementy nietekstowe ---

def analyze_formatting_and_links(text: str, sentences: List[str]) -> Dict[str, float]:
    """Count markup, links, emoticons, slang and other informal-writing signals.

    Args:
        text: The raw document.
        sentences: The document's sentences, as split by the caller.

    Returns:
        A dict with the following keys:

        * ``html_tags`` / ``bbcode_tags`` - number of ``<...>`` and ``[...]``
          spans. The BBCode pattern matches any bracketed text.
        * ``urls`` - number of ``http(s)://`` or ``www.`` tokens.
        * ``text_to_markup_ratio`` - share of characters that are not part
          of the tags counted above.
        * ``emoticons`` - number of ASCII emoticons such as ``:)`` or ``;-P``.
        * ``slang_words`` / ``slang_words_ratio`` - case-insensitive hits for
          a fixed slang list, and the same count divided by the number of
          whitespace-separated words.
        * ``excessive_chars`` - number of runs of four or more characters
          drawn from ``.``, ``,``, ``-`` and ``_``.
        * ``incomplete_sentences`` - number of sentences with fewer than
          three words.

    Ratios go through ``safe_divide`` from ``..utils``; it presumably guards
    against a zero denominator - confirm against its definition.
    """
    char_total = len(text)
    word_total = len(text.split())

    html_matches = re.findall(r'<[^>]+>', text)
    bbcode_matches = re.findall(r'\[[^\]]+\]', text)
    # Characters consumed by markup, used to derive the plain-text share.
    markup_chars = sum(map(len, html_matches)) + sum(map(len, bbcode_matches))

    lowered = text.lower()
    slang_hits = len(re.findall(r'\b(?:lol|omg|lmao|xd|wtf)\b', lowered))

    # Sentences shorter than three words are treated as incomplete.
    short_sentences = len([s for s in sentences if len(s.split()) < 3])

    url_hits = re.findall(r'https?://\S+|www\.\S+', text)
    emoticon_hits = re.findall(r'[:;=]-?[)(DPp]', text)
    punctuation_runs = re.findall(r'(\.|,|-|_){4,}', text)

    result = {}
    result['html_tags'] = len(html_matches)
    result['bbcode_tags'] = len(bbcode_matches)
    result['urls'] = len(url_hits)
    result['text_to_markup_ratio'] = safe_divide(char_total - markup_chars, char_total)
    result['emoticons'] = len(emoticon_hits)
    result['slang_words'] = slang_hits
    result['slang_words_ratio'] = safe_divide(slang_hits, word_total)
    result['excessive_chars'] = len(punctuation_runs)
    result['incomplete_sentences'] = short_sentences
    return result

def analyze_markdown_features(text: str) -> Dict[str, float]:
    """Measure Markdown syntax usage and character-class ratios in ``text``.

    Args:
        text: The raw document.

    Returns:
        A dict with the same key set for empty and non-empty input:

        * ``average_header_level_md`` - mean number of ``#`` per header match
          (``0.0`` when there are no headers).
        * ``<element>_per_1000_chars_md`` - one entry per pattern in
          ``MARKDOWN_PATTERNS``: number of matches per 1000 characters.
        * ``char_ratio_<ch>`` - share of each Markdown-significant character.
        * ``special_chars_ratio_md`` - combined share of those characters.
        * ``lowercase_ratio_md``, ``uppercase_ratio_md``, ``digit_ratio_md``,
          ``whitespace_ratio_md`` - share of each character class.

        Every value is ``0.0`` for empty text.

    Raises:
        KeyError: If ``MARKDOWN_PATTERNS`` has no ``'header'`` entry and the
            text is non-empty.
    """
    # Maps MARKDOWN_PATTERNS keys to the stems used in the feature names.
    # Per the original author's note these are expected to match a
    # ``column_order`` defined elsewhere (not visible here).
    feature_stems = {
        'header': 'headers',
        'bold': 'bold',
        'italic': 'italic',
        'unordered_list': 'unordered_list_items',
        'ordered_list': 'ordered_list_items',
        'link': 'links',
        'image': 'images',
        'inline_code': 'inline_code_fragments',
        'code_block': 'code_blocks',
        'blockquote': 'blockquotes',
        'horizontal_rule': 'horizontal_rules',
    }
    special_chars_list = ['#', '*', '-', '+', '[', ']', '(', ')', '`', '>', '_', '!']

    def per_1000_key(pattern_name: str) -> str:
        # Unmapped pattern names fall back to the raw name, so a new entry
        # in MARKDOWN_PATTERNS still yields a feature instead of a KeyError.
        return f'{feature_stems.get(pattern_name, pattern_name)}_per_1000_chars_md'

    total_chars = len(text)
    if not total_chars:
        # Return zeros under exactly the keys the non-empty path produces.
        # The previous version used the raw pattern names here, so empty
        # input yielded a different key set from every other input.
        keys = ['average_header_level_md']
        keys += [per_1000_key(name) for name in MARKDOWN_PATTERNS]
        keys += [f'char_ratio_{ch}' for ch in special_chars_list]
        keys += ['special_chars_ratio_md', 'lowercase_ratio_md',
                 'uppercase_ratio_md', 'digit_ratio_md', 'whitespace_ratio_md']
        return {key: 0.0 for key in keys}

    features = {'average_header_level_md': 0.0}

    # Basic Markdown elements, normalised to occurrences per 1000 characters.
    for name, pattern in MARKDOWN_PATTERNS.items():
        count = len(pattern.findall(text))
        features[per_1000_key(name)] = safe_divide(count * 1000, total_chars)

    # Average header level.
    # NOTE(review): assumes each match of MARKDOWN_PATTERNS['header'] is a
    # string that contains the header's '#' run - confirm against the pattern.
    headers = MARKDOWN_PATTERNS['header'].findall(text)
    if headers:
        header_levels = [h.count('#') for h in headers]
        features['average_header_level_md'] = safe_divide(sum(header_levels), len(header_levels))

    # Character-level features.
    char_counts = Counter(text)
    for ch in special_chars_list:
        features[f'char_ratio_{ch}'] = safe_divide(char_counts.get(ch, 0), total_chars)

    special_total = sum(char_counts.get(ch, 0) for ch in special_chars_list)
    features['special_chars_ratio_md'] = safe_divide(special_total, total_chars)
    features['lowercase_ratio_md'] = safe_divide(sum(1 for c in text if c.islower()), total_chars)
    features['uppercase_ratio_md'] = safe_divide(sum(1 for c in text if c.isupper()), total_chars)
    features['digit_ratio_md'] = safe_divide(sum(1 for c in text if c.isdigit()), total_chars)
    features['whitespace_ratio_md'] = safe_divide(sum(1 for c in text if c.isspace()), total_chars)

    return features

def analyze_markdown_table_structure(text: str, lines: List[str]) -> Dict[str, float]:
    """Derive pipe-based heuristics describing Markdown tables in ``text``.

    Args:
        text: The full document; source of the character and pipe totals.
        lines: The document's lines (presumably ``text`` split by the
            caller - verify). A line containing ``|`` counts as a table line.

    Returns:
        A dict with the following keys:

        * ``table_pipe_count`` - number of ``|`` characters in ``text``.
        * ``table_pipe_ratio`` / ``table_pipe_per_1000_chars`` - that count
          relative to the text length.
        * ``table_lines_count`` / ``table_lines_ratio`` - number of table
          lines, and their share of all ``lines``.
        * ``table_header_separators_count`` - table lines that look like a
          delimiter row such as ``|---|:--:|``.
        * ``avg_pipes_per_table_line`` - pipes divided by table lines.
        * ``estimated_avg_columns`` - pipes divided by twice the number of
          table lines.

        All values are zero for empty text.
    """
    total_chars = len(text)
    if not total_chars:
        return {'table_pipe_count': 0, 'table_pipe_ratio': 0.0, 'table_pipe_per_1000_chars': 0.0,
                'table_lines_count': 0, 'table_lines_ratio': 0.0, 'table_header_separators_count': 0,
                'avg_pipes_per_table_line': 0.0, 'estimated_avg_columns': 0.0}

    total_lines = len(lines)
    pipe_count = text.count('|')
    table_lines = [line for line in lines if '|' in line]
    table_line_count = len(table_lines)

    # A delimiter row is built from hyphens, optionally flanked by colons.
    # Requiring at least one hyphen keeps rows such as "||" or "| |" from
    # being counted as header separators.
    header_separators = sum(
        1 for line in table_lines
        if '-' in line and re.match(r'^[\|\-\:\s]+$', line.strip())
    )

    return {
        'table_pipe_count': pipe_count,
        'table_pipe_ratio': safe_divide(pipe_count, total_chars),
        'table_pipe_per_1000_chars': safe_divide(pipe_count * 1000, total_chars),
        'table_lines_count': table_line_count,
        'table_lines_ratio': safe_divide(table_line_count, total_lines),
        'table_header_separators_count': header_separators,
        'avg_pipes_per_table_line': safe_divide(pipe_count, table_line_count),
        # Float fallback keeps the value type consistent with the other ratios.
        'estimated_avg_columns': safe_divide(pipe_count, table_line_count * 2) if table_lines else 0.0,
    }

# --- Funkcje analizujące strukturę linii ---

def analyze_line_structure(lines: List[str]) -> Dict[str, float]:
    """Inspect individual lines for bullet markers and degenerate content.

    Args:
        lines: The document's lines. Each line is stripped before it is
            inspected, and lines that are empty after stripping are skipped.

    Returns:
        A dict with the following keys:

        * ``lines_with_bullet`` - non-empty lines whose first character is
          one of the recognised bullet symbols.
        * ``ratio_of_bulletpoints`` - that count divided by the number of
          non-empty lines.
        * ``single_word_line_ratio`` - non-empty lines holding exactly one
          word, divided by the number of all lines (empty ones included).
        * ``repeated_word_line_ratio`` - lines holding one word repeated two
          or more times, divided by the number of all lines.

        All values are zero when ``lines`` is empty.
    """
    line_total = len(lines)
    if line_total == 0:
        return {'lines_with_bullet': 0, 'ratio_of_bulletpoints': 0.0,
                'single_word_line_ratio': 0.0, 'repeated_word_line_ratio': 0.0}

    bullet_markers = {'•', '○', '‣', '-', '–', '—', '·', '⚪', '⚫', '▢', '■', '→', '★', '✓', '✕', '◇', '◆', '➤', '«', '»'}
    stripped_lines = [content for content in (raw.strip() for raw in lines) if content]

    bulleted = 0
    single_word = 0
    repeated_word = 0
    for content in stripped_lines:
        if content[0] in bullet_markers:
            bulleted += 1
        tokens = content.split()
        if len(tokens) == 1:
            single_word += 1
        elif len(set(tokens)) == 1:
            # More than one token, all identical: the same word repeated.
            repeated_word += 1

    return {
        'lines_with_bullet': bulleted,
        'ratio_of_bulletpoints': safe_divide(bulleted, len(stripped_lines)),
        'single_word_line_ratio': safe_divide(single_word, line_total),
        'repeated_word_line_ratio': safe_divide(repeated_word, line_total),
    }

# --- Główna funkcja agregująca ---

def calculate_all_structural_features(text: str, lines: List[str], sentences: List[str]) -> Dict[str, float]:
    """Collect every structural and formatting feature into a single dict.

    Runs the paragraph, formatting/link, Markdown, Markdown-table and
    line-structure analysers in that order and merges their results. If two
    analysers emit the same key, the later one wins.

    Args:
        text: The raw document.
        lines: The document's lines, passed to the table and line analysers.
        sentences: The document's sentences, passed to the formatting analyser.

    Returns:
        The merged feature dict.
    """
    partial_results = (
        analyze_paragraph_stats(text),
        analyze_formatting_and_links(text, sentences),
        analyze_markdown_features(text),
        analyze_markdown_table_structure(text, lines),
        analyze_line_structure(lines),
    )

    combined = {}
    for part in partial_results:
        combined.update(part)
    return combined