quality_classifier_pl / text_analyzer /features /structural_features.py

fix

ef22613 verified 6 months ago

7.65 kB

	"""
	Moduł do ekstrakcji cech strukturalnych i formatowania tekstu.
	"""
	import re
	from collections import Counter
	from statistics import mean, variance
	from typing import Dict, List

	from ..utils import safe_divide
	from ..constants import MARKDOWN_PATTERNS

	# --- Funkcje analizujące strukturę paragrafów ---

	def analyze_paragraph_stats(text: str) -> Dict[str, float]:
	    """Compute word-count statistics over the paragraphs of *text*.

	    A paragraph is a chunk obtained by splitting the text on a blank line
	    (two consecutive newlines); chunks that are empty after stripping are
	    ignored.

	    Returns:
	        A dict with two entries:
	        ``avg_paragraph_length`` - mean number of whitespace-separated words
	        per paragraph;
	        ``paragraph_length_variance`` - sample variance of those word counts,
	        or 0.0 when there are fewer than two paragraphs.
	        Both entries are 0.0 when the text contains no paragraphs.
	    """
	    word_counts = []
	    for chunk in text.split('\n\n'):
	        stripped = chunk.strip()
	        if stripped:
	            word_counts.append(len(stripped.split()))

	    stats = {'avg_paragraph_length': 0.0, 'paragraph_length_variance': 0.0}
	    if not word_counts:
	        return stats

	    stats['avg_paragraph_length'] = mean(word_counts)
	    # statistics.variance needs at least two data points
	    if len(word_counts) >= 2:
	        stats['paragraph_length_variance'] = variance(word_counts)
	    return stats

	# --- Funkcje analizujące formatowanie i elementy nietekstowe ---

	def analyze_formatting_and_links(text: str, sentences: List[str]) -> Dict[str, float]:
	    """Count formatting elements (HTML, BBCode), links, emoticons and slang.

	    Args:
	        text: Raw text to analyse.
	        sentences: Sentence strings belonging to ``text``; only their word
	            counts are used here.

	    Returns:
	        A dict with:
	        ``html_tags`` / ``bbcode_tags`` - number of ``<...>`` / ``[...]`` spans;
	        ``urls`` - number of ``http(s)://`` or ``www.`` tokens;
	        ``text_to_markup_ratio`` - share of characters outside the counted tags;
	        ``emoticons`` - number of simple ASCII emoticons such as ``:)`` or ``;-D``;
	        ``slang_words`` / ``slang_words_ratio`` - slang hits, absolute and per
	        whitespace-separated word;
	        ``excessive_chars`` - runs of four or more of ``.``, ``,``, ``-``, ``_``;
	        ``incomplete_sentences`` - sentences with fewer than three words.

	    NOTE(review): ``safe_divide`` is assumed to return a neutral value for a
	    zero denominator (empty text) - confirm in ``utils``.
	    """
	    total_chars = len(text)
	    word_count = len(text.split())

	    html_tags = re.findall(r'<[^>]+>', text)
	    bbcode_tags = re.findall(r'\[[^\]]+\]', text)
	    markup_len = sum(map(len, html_tags)) + sum(map(len, bbcode_tags))

	    # The alternation must use a bare `|`; an escaped `\|` matches a literal
	    # pipe character and the pattern would never hit real slang words.
	    slang_words_count = len(re.findall(r'\b(?:lol|omg|lmao|xd|wtf)\b', text.lower()))
	    incomplete_sentences_count = sum(1 for s in sentences if len(s.split()) < 3)

	    return {
	        'html_tags': len(html_tags),
	        'bbcode_tags': len(bbcode_tags),
	        # Either an explicit scheme or a bare "www." prefix counts as a URL
	        'urls': len(re.findall(r'https?://\S+|www\.\S+', text)),
	        'text_to_markup_ratio': safe_divide(total_chars - markup_len, total_chars),
	        'emoticons': len(re.findall(r'[:;=]-?[)(DPp]', text)),
	        'slang_words': slang_words_count,
	        'slang_words_ratio': safe_divide(slang_words_count, word_count),
	        # A character class is the plain form of "(\.|,|-|_)" repeated 4+ times
	        'excessive_chars': len(re.findall(r'[.,\-_]{4,}', text)),
	        'incomplete_sentences': incomplete_sentences_count,
	    }

	def analyze_markdown_features(text: str) -> Dict[str, float]:
	    """Analyse Markdown syntax usage, including per-character ratios.

	    For every entry of ``MARKDOWN_PATTERNS`` the number of matches per 1000
	    characters is reported under ``<feature>_per_1000_chars_md``. The result
	    also contains the average header level, one ``char_ratio_<ch>`` entry per
	    Markdown-relevant punctuation character, and the ratios of special,
	    lowercase, uppercase, digit and whitespace characters.

	    The same set of keys is returned for empty and non-empty text; for empty
	    text every value is 0.0.

	    Args:
	        text: Raw text to analyse.

	    Returns:
	        Mapping of feature name to value.
	    """
	    # Maps MARKDOWN_PATTERNS names to the feature-name stems used in the output
	    # columns. Names missing from the map fall back to the pattern name itself.
	    key_name_map = {'header': 'headers', 'bold': 'bold', 'italic': 'italic',
	                    'unordered_list': 'unordered_list_items', 'ordered_list': 'ordered_list_items',
	                    'link': 'links', 'image': 'images', 'inline_code': 'inline_code_fragments',
	                    'code_block': 'code_blocks', 'blockquote': 'blockquotes',
	                    'horizontal_rule': 'horizontal_rules'}
	    rate_keys = {name: f'{key_name_map.get(name, name)}_per_1000_chars_md'
	                 for name in MARKDOWN_PATTERNS}
	    special_chars_list = ['#', '*', '-', '+', '[', ']', '(', ')', '`', '>', '_', '!']

	    total_chars = len(text)
	    if not total_chars:
	        # Return zeros under exactly the keys the non-empty path produces, so
	        # the feature schema does not depend on the input.
	        keys = ['average_header_level_md']
	        keys += list(rate_keys.values())
	        keys += [f'char_ratio_{ch}' for ch in special_chars_list]
	        keys += ['special_chars_ratio_md', 'lowercase_ratio_md',
	                 'uppercase_ratio_md', 'digit_ratio_md', 'whitespace_ratio_md']
	        return {key: 0.0 for key in keys}

	    features = {'average_header_level_md': 0.0}

	    # Basic Markdown elements, normalised per 1000 characters
	    for name, pattern in MARKDOWN_PATTERNS.items():
	        count = len(pattern.findall(text))
	        features[rate_keys[name]] = safe_divide(count * 1000, total_chars)

	    # Average header level.
	    # NOTE(review): assumes findall() yields the matched header text (or its
	    # '#' prefix) so that counting '#' gives the level - confirm against the
	    # 'header' pattern in constants.
	    headers = MARKDOWN_PATTERNS['header'].findall(text)
	    if headers:
	        header_levels = [h.count('#') for h in headers]
	        features['average_header_level_md'] = safe_divide(sum(header_levels), len(header_levels))

	    # Character-level features
	    char_counts = Counter(text)
	    for ch in special_chars_list:
	        features[f'char_ratio_{ch}'] = safe_divide(char_counts.get(ch, 0), total_chars)

	    features['special_chars_ratio_md'] = safe_divide(
	        sum(char_counts.get(ch, 0) for ch in special_chars_list), total_chars)

	    # Classify each distinct character once and weight it by its frequency
	    # instead of re-scanning the whole text for every character class.
	    def class_total(predicate) -> int:
	        return sum(n for c, n in char_counts.items() if predicate(c))

	    features['lowercase_ratio_md'] = safe_divide(class_total(str.islower), total_chars)
	    features['uppercase_ratio_md'] = safe_divide(class_total(str.isupper), total_chars)
	    features['digit_ratio_md'] = safe_divide(class_total(str.isdigit), total_chars)
	    features['whitespace_ratio_md'] = safe_divide(class_total(str.isspace), total_chars)

	    return features

	def analyze_markdown_table_structure(text: str, lines: List[str]) -> Dict[str, float]:
	    """Analyse features related to Markdown tables.

	    Args:
	        text: Raw text to analyse.
	        lines: The lines of ``text`` as supplied by the caller.

	    Returns:
	        A dict with:
	        ``table_pipe_count`` - number of ``|`` characters in ``text``;
	        ``table_pipe_ratio`` / ``table_pipe_per_1000_chars`` - that count
	        relative to the text length;
	        ``table_lines_count`` / ``table_lines_ratio`` - lines containing a
	        pipe, absolute and relative to all lines;
	        ``table_header_separators_count`` - pipe lines made up only of pipes,
	        dashes, colons and whitespace (e.g. ``|---|:--:|``);
	        ``avg_pipes_per_table_line`` - pipes per pipe-containing line;
	        ``estimated_avg_columns`` - pipes divided by twice the number of
	        pipe-containing lines.
	        All values are zero for empty text.
	    """
	    total_chars = len(text)
	    total_lines = len(lines)
	    if not total_chars:
	        return {'table_pipe_count': 0, 'table_pipe_ratio': 0.0, 'table_pipe_per_1000_chars': 0.0,
	                'table_lines_count': 0, 'table_lines_ratio': 0.0, 'table_header_separators_count': 0,
	                'avg_pipes_per_table_line': 0.0, 'estimated_avg_columns': 0.0}

	    # Count the bare pipe: the literal '\|' is the two-character string
	    # backslash + pipe and would miss ordinary Markdown table syntax.
	    pipe_count = text.count('|')
	    table_lines = [line for line in lines if '|' in line]
	    table_line_count = len(table_lines)
	    header_separators = sum(
	        1 for line in table_lines if re.fullmatch(r'[|\-:\s]+', line.strip())
	    )

	    return {
	        'table_pipe_count': pipe_count,
	        'table_pipe_ratio': safe_divide(pipe_count, total_chars),
	        'table_pipe_per_1000_chars': safe_divide(pipe_count * 1000, total_chars),
	        'table_lines_count': table_line_count,
	        'table_lines_ratio': safe_divide(table_line_count, total_lines),
	        'table_header_separators_count': header_separators,
	        'avg_pipes_per_table_line': safe_divide(pipe_count, table_line_count),
	        'estimated_avg_columns': safe_divide(pipe_count, table_line_count * 2) if table_lines else 0.0,
	    }

	# --- Funkcje analizujące strukturę linii ---

	def analyze_line_structure(lines: List[str]) -> Dict[str, float]:
	    """Inspect individual lines for bullet markers and degenerate content.

	    Args:
	        lines: Lines of the analysed text.

	    Returns:
	        A dict with:
	        ``lines_with_bullet`` - non-empty lines whose first non-whitespace
	        character is a bullet symbol;
	        ``ratio_of_bulletpoints`` - that count relative to non-empty lines;
	        ``single_word_line_ratio`` - lines holding exactly one word, relative
	        to all lines;
	        ``repeated_word_line_ratio`` - lines holding one word repeated two or
	        more times, relative to all lines.
	        All values are zero when ``lines`` is empty.
	    """
	    line_total = len(lines)
	    if line_total == 0:
	        return {'lines_with_bullet': 0, 'ratio_of_bulletpoints': 0.0,
	                'single_word_line_ratio': 0.0, 'repeated_word_line_ratio': 0.0}

	    bullet_marks = {'•', '○', '‣', '-', '–', '—', '·', '⚪', '⚫', '▢', '■', '→', '★', '✓', '✕', '◇', '◆', '➤', '«', '»'}

	    content_lines = 0
	    bullet_hits = 0
	    single_word_lines = 0
	    repeated_word_lines = 0
	    for raw_line in lines:
	        stripped = raw_line.strip()
	        if not stripped:
	            continue
	        content_lines += 1
	        if stripped[0] in bullet_marks:
	            bullet_hits += 1
	        tokens = stripped.split()
	        if len(tokens) == 1:
	            single_word_lines += 1
	        elif len(set(tokens)) == 1:
	            # Several tokens, all identical
	            repeated_word_lines += 1

	    return {
	        'lines_with_bullet': bullet_hits,
	        'ratio_of_bulletpoints': safe_divide(bullet_hits, content_lines),
	        'single_word_line_ratio': safe_divide(single_word_lines, line_total),
	        'repeated_word_line_ratio': safe_divide(repeated_word_lines, line_total)
	    }

	# --- Główna funkcja agregująca ---

	def calculate_all_structural_features(text: str, lines: List[str], sentences: List[str]) -> Dict[str, float]:
	    """Aggregate all structural and formatting features into one dict.

	    Args:
	        text: Raw text to analyse.
	        lines: Lines of ``text``.
	        sentences: Sentences of ``text``.

	    Returns:
	        The merged output of the paragraph, formatting, Markdown, table and
	        line analysers. On a key collision the later analyser wins.
	    """
	    partial_results = (
	        analyze_paragraph_stats(text),
	        analyze_formatting_and_links(text, sentences),
	        analyze_markdown_features(text),
	        analyze_markdown_table_structure(text, lines),
	        analyze_line_structure(lines),
	    )

	    merged = {}
	    for partial in partial_results:
	        merged.update(partial)
	    return merged