# src/utils.py
"""
Helper utilities: loading JSONL files, computing text statistics,
building a corpus summary, and saving the results.
"""

from __future__ import annotations

import json
from collections import Counter
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Any, Dict, Iterable, List, Tuple, Optional

import numpy as np


def load_jsonl(path: str, max_items: Optional[int] = None) -> List[Dict[str, Any]]:
    """Load records from a JSON Lines file.

    Blank lines and lines that are not valid JSON are skipped and do not
    count towards ``max_items``.

    Args:
        path: Path to a UTF-8 encoded JSONL file.
        max_items: Maximum number of parsed records to return. ``None``
            means no limit; zero or a negative value yields an empty list.

    Returns:
        The parsed records in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    items: List[Dict[str, Any]] = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Cap on successfully parsed records, not on raw lines read.
            if max_items is not None and len(items) >= max_items:
                break
            line = line.strip()
            if not line:
                continue
            try:
                items.append(json.loads(line))
            except json.JSONDecodeError:
                # Best-effort loading: malformed lines are skipped on purpose.
                continue
    return items


def calculate_text_statistics(texts: Iterable[str], top_k: int = 50) -> Dict[str, Any]:
    """Compute whitespace-token statistics over a collection of texts.

    Entries that are not strings, or that are empty / whitespace-only, are
    ignored. Tokens are produced by ``str.split()`` with no further
    normalisation (case and punctuation are kept as-is).

    Args:
        texts: Iterable of candidate texts.
        top_k: Number of most frequent tokens to report.

    Returns:
        A dict with the keys ``total_texts``, ``total_words``,
        ``unique_words``, ``avg_words_per_text`` (``0.0`` when there are no
        usable texts) and ``most_common_words`` (a list of
        ``(token, count)`` pairs).
    """
    total_texts = 0
    freq: Counter = Counter()
    for text in texts:
        if not isinstance(text, str) or not text.strip():
            continue
        total_texts += 1
        # Updating one Counter avoids holding every token in a list.
        freq.update(text.split())

    total_words = sum(freq.values())
    avg_words_per_text = (total_words / total_texts) if total_texts else 0.0

    return {
        "total_texts": total_texts,
        "total_words": total_words,
        "unique_words": len(freq),
        "avg_words_per_text": avg_words_per_text,
        "most_common_words": freq.most_common(top_k),
    }


@dataclass
class CorpusSummary:
    """Aggregate figures describing a corpus of articles.

    Attributes:
        total_articles: Number of article records counted.
        total_words: Total number of whitespace-separated tokens.
        avg_words_per_article: Mean token count per non-empty text.
        unique_words: Number of distinct tokens.
        categories: Mapping of category name to number of articles.
    """

    total_articles: int
    total_words: int
    avg_words_per_article: float
    unique_words: int
    categories: Dict[str, int]


def create_corpus_summary(articles: List[Dict[str, Any]]) -> CorpusSummary:
    """Build a :class:`CorpusSummary` from a list of article records.

    Only ``dict`` entries are considered; their ``"text"`` and
    ``"category"`` keys are read. ``total_articles`` counts every dict
    entry, while the word statistics (including the average) are computed
    over non-empty string texts only. Categories that are missing, empty,
    whitespace-only or not strings are left out of the category counts.

    Args:
        articles: Article records, typically as returned by ``load_jsonl``.

    Returns:
        The populated summary.
    """
    records = [a for a in articles if isinstance(a, dict)]
    texts = [a.get("text", "") for a in records]
    cats = [a.get("category", "") or "" for a in records]

    # The summary has no top-words field, so no frequency list is requested.
    stats = calculate_text_statistics(texts, top_k=0)
    categories_counter = Counter(
        c for c in cats if isinstance(c, str) and c.strip()
    )

    return CorpusSummary(
        total_articles=len(texts),
        total_words=stats["total_words"],
        avg_words_per_article=float(stats["avg_words_per_text"]),
        unique_words=stats["unique_words"],
        categories=dict(categories_counter),
    )


def save_corpus_summary(summary: CorpusSummary, out_path: str = "results/corpus_summary.json") -> None:
    """Write the summary to ``out_path`` as indented UTF-8 JSON.

    Missing parent directories are created. Non-ASCII characters are
    written verbatim rather than escaped.

    Args:
        summary: The summary to serialise.
        out_path: Destination file path.
    """
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(asdict(summary), f, ensure_ascii=False, indent=2)