# Joblib
# adgw's picture
# fix
# ef22613 verified
"""
Module for extracting structural and text-formatting features.
"""
import re
from collections import Counter
from statistics import mean, variance
from typing import Dict, List
from ..utils import safe_divide
from ..constants import MARKDOWN_PATTERNS
# --- Paragraph structure analysis functions ---
def analyze_paragraph_stats(text: str) -> Dict[str, float]:
    """Compute paragraph-length statistics for a text.

    A paragraph is any chunk delimited by a blank-line separator (two
    consecutive newline characters) that still contains something after
    stripping whitespace. Paragraph length is measured in
    whitespace-separated words.

    Args:
        text: The raw text to analyse.

    Returns:
        A dict with ``avg_paragraph_length`` (mean word count per paragraph)
        and ``paragraph_length_variance`` (sample variance of the word
        counts, ``0.0`` when there are fewer than two paragraphs). Both are
        ``0.0`` when the text holds no paragraphs.
    """
    # str.split() without arguments ignores surrounding whitespace, so the
    # chunks do not need to be stripped before counting their words.
    word_counts = [
        len(chunk.split())
        for chunk in text.split('\n\n')
        if chunk.strip()
    ]

    if not word_counts:
        return {'avg_paragraph_length': 0.0, 'paragraph_length_variance': 0.0}

    # statistics.variance needs at least two data points.
    if len(word_counts) > 1:
        spread = variance(word_counts)
    else:
        spread = 0.0

    return {
        'avg_paragraph_length': mean(word_counts),
        'paragraph_length_variance': spread,
    }
# --- Formatting and non-text element analysis functions ---
def analyze_formatting_and_links(text: str, sentences: List[str]) -> Dict[str, float]:
    """Count formatting markers, links, emoticons and slang in a text.

    Args:
        text: The raw text to analyse.
        sentences: The sentences of the text; only used to count the ones
            with fewer than three whitespace-separated words.

    Returns:
        A dict with the counts of HTML-like tags, BBCode-like bracket tags,
        URLs, emoticons, slang words, runs of four or more of the characters
        ``. , - _`` and short ("incomplete") sentences, plus two ratios
        computed through ``safe_divide``: the share of characters that are
        not inside a matched tag and the share of slang among the words.
    """
    def occurrences(pattern: str, haystack: str = text) -> int:
        # Number of non-overlapping matches of ``pattern`` in ``haystack``.
        return len(re.findall(pattern, haystack))

    char_total = len(text)
    word_total = len(text.split())

    html_matches = re.findall(r'<[^>]+>', text)
    bbcode_matches = re.findall(r'\[[^\]]+\]', text)

    # Slang is matched case-insensitively by lowering the text first.
    slang_total = occurrences(r'\b(?:lol|omg|lmao|xd|wtf)\b', text.lower())

    # Characters consumed by markup: every matched tag, brackets included.
    markup_chars = sum(map(len, html_matches)) + sum(map(len, bbcode_matches))

    short_sentences = [s for s in sentences if len(s.split()) < 3]

    result = {}
    result['html_tags'] = len(html_matches)
    result['bbcode_tags'] = len(bbcode_matches)
    result['urls'] = occurrences(r'https?://\S+|www\.\S+')
    result['text_to_markup_ratio'] = safe_divide(char_total - markup_chars, char_total)
    result['emoticons'] = occurrences(r'[:;=]-?[)(DPp]')
    result['slang_words'] = slang_total
    result['slang_words_ratio'] = safe_divide(slang_total, word_total)
    result['excessive_chars'] = occurrences(r'(\.|,|-|_){4,}')
    result['incomplete_sentences'] = len(short_sentences)
    return result
# Maps a MARKDOWN_PATTERNS name to the stem used in the emitted feature key,
# so that the keys line up with the expected output column names.
_MD_FEATURE_STEMS = {
    'header': 'headers',
    'bold': 'bold',
    'italic': 'italic',
    'unordered_list': 'unordered_list_items',
    'ordered_list': 'ordered_list_items',
    'link': 'links',
    'image': 'images',
    'inline_code': 'inline_code_fragments',
    'code_block': 'code_blocks',
    'blockquote': 'blockquotes',
    'horizontal_rule': 'horizontal_rules',
}

# Characters whose individual and combined share of the text is reported.
_MD_SPECIAL_CHARS = ('#', '*', '-', '+', '[', ']', '(', ')', '`', '>', '_', '!')

# Scalar features that do not depend on MARKDOWN_PATTERNS or on a character.
_MD_SCALAR_KEYS = (
    'average_header_level_md',
    'special_chars_ratio_md',
    'lowercase_ratio_md',
    'uppercase_ratio_md',
    'digit_ratio_md',
    'whitespace_ratio_md',
)


def _markdown_rate_key(pattern_name: str) -> str:
    """Return the feature key for a pattern's per-1000-characters rate."""
    # Unmapped pattern names fall back to the raw name instead of raising.
    stem = _MD_FEATURE_STEMS.get(pattern_name, pattern_name)
    return f'{stem}_per_1000_chars_md'


def analyze_markdown_features(text: str) -> Dict[str, float]:
    """Measure Markdown syntax usage and character-class ratios in a text.

    Args:
        text: The raw text to analyse.

    Returns:
        A dict containing, for every entry of ``MARKDOWN_PATTERNS``, the
        number of matches per 1000 characters; the average header level; the
        share of each character in ``_MD_SPECIAL_CHARS`` and their combined
        share; and the shares of lowercase, uppercase, digit and whitespace
        characters. For empty text every one of those features is ``0.0``,
        under exactly the same keys as for non-empty text.
    """
    total_chars = len(text)
    if not total_chars:
        # Build the same key set the non-empty path produces. The rate keys
        # must go through the same name mapping, otherwise empty input would
        # yield e.g. 'header_per_1000_chars_md' instead of
        # 'headers_per_1000_chars_md'.
        keys = [_markdown_rate_key(name) for name in MARKDOWN_PATTERNS]
        keys += [f'char_ratio_{ch}' for ch in _MD_SPECIAL_CHARS]
        keys += _MD_SCALAR_KEYS
        return dict.fromkeys(keys, 0.0)

    features = {'average_header_level_md': 0.0}

    # Basic Markdown elements: match count normalised per 1000 characters.
    for name, pattern in MARKDOWN_PATTERNS.items():
        count = len(pattern.findall(text))
        features[_markdown_rate_key(name)] = safe_divide(count * 1000, total_chars)

    # Average header level, taken as the number of '#' in each header match.
    # NOTE(review): assumes findall() on the header pattern yields strings
    # that include the '#' run - confirm against MARKDOWN_PATTERNS.
    headers = MARKDOWN_PATTERNS['header'].findall(text)
    if headers:
        header_levels = [h.count('#') for h in headers]
        features['average_header_level_md'] = safe_divide(sum(header_levels), len(header_levels))

    # Per-character and combined shares of the Markdown special characters.
    char_counts = Counter(text)
    for ch in _MD_SPECIAL_CHARS:
        features[f'char_ratio_{ch}'] = safe_divide(char_counts.get(ch, 0), total_chars)
    features['special_chars_ratio_md'] = safe_divide(
        sum(char_counts.get(ch, 0) for ch in _MD_SPECIAL_CHARS), total_chars
    )

    # Character-class shares.
    features['lowercase_ratio_md'] = safe_divide(sum(1 for c in text if c.islower()), total_chars)
    features['uppercase_ratio_md'] = safe_divide(sum(1 for c in text if c.isupper()), total_chars)
    features['digit_ratio_md'] = safe_divide(sum(1 for c in text if c.isdigit()), total_chars)
    features['whitespace_ratio_md'] = safe_divide(sum(1 for c in text if c.isspace()), total_chars)
    return features
def analyze_markdown_table_structure(text: str, lines: List[str]) -> Dict[str, float]:
    """Derive Markdown-table related features from pipe characters.

    Args:
        text: The raw text; pipes and the character total are taken from it.
        lines: The lines of the text; any line containing ``|`` is treated
            as a table line.

    Returns:
        A dict with the pipe count, pipe ratios (per character and per 1000
        characters), the number and share of table lines, the number of
        table lines made up solely of ``|``, ``-``, ``:`` and whitespace
        (header separators), the average number of pipes per table line and
        an estimated column count (pipes divided by twice the number of
        table lines). All values are zero for empty text.
    """
    char_total = len(text)
    line_total = len(lines)

    if char_total == 0:
        return {
            'table_pipe_count': 0,
            'table_pipe_ratio': 0.0,
            'table_pipe_per_1000_chars': 0.0,
            'table_lines_count': 0,
            'table_lines_ratio': 0.0,
            'table_header_separators_count': 0,
            'avg_pipes_per_table_line': 0.0,
            'estimated_avg_columns': 0.0,
        }

    pipes = text.count('|')
    rows_with_pipe = [row for row in lines if '|' in row]
    row_count = len(rows_with_pipe)

    # A separator row consists only of pipes, dashes, colons and whitespace.
    separator_rows = sum(
        1 for row in rows_with_pipe if re.match(r'^[\|\-\:\s]+$', row.strip())
    )

    if rows_with_pipe:
        column_estimate = safe_divide(pipes, (row_count * 2))
    else:
        column_estimate = 0

    stats = {}
    stats['table_pipe_count'] = pipes
    stats['table_pipe_ratio'] = safe_divide(pipes, char_total)
    stats['table_pipe_per_1000_chars'] = safe_divide(pipes * 1000, char_total)
    stats['table_lines_count'] = row_count
    stats['table_lines_ratio'] = safe_divide(row_count, line_total)
    stats['table_header_separators_count'] = separator_rows
    stats['avg_pipes_per_table_line'] = safe_divide(pipes, row_count)
    stats['estimated_avg_columns'] = column_estimate
    return stats
# --- Line structure analysis functions ---
def analyze_line_structure(lines: List[str]) -> Dict[str, float]:
    """Inspect lines for bullet points, single-word and repeated-word lines.

    Args:
        lines: The lines of the text, blank lines included.

    Returns:
        A dict with the number of non-blank lines starting with a bullet
        character, that number relative to the non-blank lines, and the
        shares (relative to all lines) of lines holding exactly one word and
        of multi-word lines that repeat a single word. All values are zero
        when ``lines`` is empty.
    """
    line_total = len(lines)
    stripped_lines = [entry for entry in (raw.strip() for raw in lines) if entry]

    if line_total == 0:
        return {
            'lines_with_bullet': 0,
            'ratio_of_bulletpoints': 0.0,
            'single_word_line_ratio': 0.0,
            'repeated_word_line_ratio': 0.0,
        }

    bullet_marks = {
        '•', '○', '‣', '-', '–', '—', '·', '⚪', '⚫', '▢',
        '■', '→', '★', '✓', '✕', '◇', '◆', '➤', '«', '»',
    }

    bulleted = 0
    single_word = 0
    repeated_word = 0
    for entry in stripped_lines:
        # Entries are non-empty after stripping, so indexing is safe.
        if entry[0] in bullet_marks:
            bulleted += 1

        tokens = entry.split()
        if len(tokens) == 1:
            single_word += 1
        elif len(set(tokens)) == 1:
            # Several words, all identical.
            repeated_word += 1

    # The bullet ratio is relative to non-blank lines; the other two ratios
    # are relative to all lines.
    return {
        'lines_with_bullet': bulleted,
        'ratio_of_bulletpoints': safe_divide(bulleted, len(stripped_lines)),
        'single_word_line_ratio': safe_divide(single_word, line_total),
        'repeated_word_line_ratio': safe_divide(repeated_word, line_total),
    }
# --- Main aggregating function ---
def calculate_all_structural_features(text: str, lines: List[str], sentences: List[str]) -> Dict[str, float]:
    """Aggregate every structural and formatting feature into one dict.

    Args:
        text: The raw text to analyse.
        lines: The lines of the text.
        sentences: The sentences of the text.

    Returns:
        The merged results of the paragraph, formatting, Markdown, Markdown
        table and line-structure analysers. On a key collision the later
        analyser in that order wins.
    """
    partial_results = (
        analyze_paragraph_stats(text),
        analyze_formatting_and_links(text, sentences),
        analyze_markdown_features(text),
        analyze_markdown_table_structure(text, lines),
        analyze_line_structure(lines),
    )

    merged = {}
    for partial in partial_results:
        merged.update(partial)
    return merged