Spaces:
Sleeping
Sleeping
# src/utils.py
"""
Вспомогательные утилиты: загрузка JSONL, вычисление статистики по текстам,
создание сводной информации о корпусе и сохранение результатов.
"""
from __future__ import annotations
import json
from collections import Counter
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Any, Dict, Iterable, List, Tuple, Optional
import numpy as np
def load_jsonl(path: str, max_items: Optional[int] = None) -> List[Dict[str, Any]]:
    """Read a JSONL file, skipping blank and malformed lines.

    Args:
        path: Path to the JSONL file (UTF-8).
        max_items: If given, stop after reading this many raw file lines
            (blank/malformed lines count toward the limit).

    Returns:
        A list of the successfully parsed JSON objects, in file order.
    """
    records: List[Dict[str, Any]] = []
    with open(path, "r", encoding="utf-8") as handle:
        for line_no, raw in enumerate(handle):
            if max_items is not None and line_no >= max_items:
                break
            stripped = raw.strip()
            if not stripped:
                continue
            # Best-effort: a broken line is skipped, not fatal.
            try:
                parsed = json.loads(stripped)
            except json.JSONDecodeError:
                continue
            records.append(parsed)
    return records
def calculate_text_statistics(texts: Iterable[str], top_k: int = 50) -> Dict[str, Any]:
    """Compute word-level statistics over a collection of texts.

    Non-string entries and whitespace-only strings are ignored. Words are
    produced by plain ``str.split`` (whitespace tokenization).

    Args:
        texts: Iterable of candidate text strings.
        top_k: Number of most frequent words to report (0 -> empty list).

    Returns:
        Dict with keys ``total_texts``, ``total_words``, ``unique_words``,
        ``avg_words_per_text`` and ``most_common_words``.
    """
    valid = [text for text in texts if isinstance(text, str) and text.strip()]
    all_words = [word for text in valid for word in text.split()]
    n_texts = len(valid)
    n_words = len(all_words)
    return {
        "total_texts": n_texts,
        "total_words": n_words,
        "unique_words": len(set(all_words)),
        # Guard against division by zero on an empty corpus.
        "avg_words_per_text": (n_words / n_texts) if n_texts else 0.0,
        "most_common_words": Counter(all_words).most_common(top_k),
    }
@dataclass
class CorpusSummary:
    """Aggregate statistics for an article corpus.

    Plain data container. The ``@dataclass`` decorator is required: this
    class is instantiated with keyword arguments and is serialized via
    ``dataclasses.asdict`` elsewhere in the module; without the decorator
    both of those calls raise TypeError at runtime.
    """
    total_articles: int            # number of articles in the corpus
    total_words: int               # total whitespace-split word count
    avg_words_per_article: float   # total_words / total_articles (0.0 if empty)
    unique_words: int              # distinct word count
    categories: Dict[str, int]    # category name -> number of articles
def create_corpus_summary(articles: List[Dict[str, Any]]) -> CorpusSummary:
    """Build a CorpusSummary from a list of raw article dicts.

    Non-dict entries are ignored. Word statistics come from each article's
    ``text`` field; category counts from its ``category`` field (empty or
    non-string categories are dropped).

    Args:
        articles: Raw article records.

    Returns:
        A populated CorpusSummary.
    """
    dict_articles = [entry for entry in articles if isinstance(entry, dict)]
    texts = [entry.get("text", "") for entry in dict_articles]
    # top_k=0: we only need the aggregate counts, not the frequency ranking.
    stats = calculate_text_statistics(texts, top_k=0)

    category_counts: Counter = Counter()
    for entry in dict_articles:
        category = entry.get("category", "") or ""
        if isinstance(category, str) and category.strip():
            category_counts[category] += 1

    return CorpusSummary(
        total_articles=len(texts),
        total_words=stats["total_words"],
        avg_words_per_article=float(stats["avg_words_per_text"]),
        unique_words=stats["unique_words"],
        categories=dict(category_counts),
    )
| def save_corpus_summary(summary: CorpusSummary, out_path: str = "results/corpus_summary.json") -> None: | |
| Path(Path(out_path).parent).mkdir(parents=True, exist_ok=True) | |
| with open(out_path, "w", encoding="utf-8") as f: | |
| json.dump(asdict(summary), f, ensure_ascii=False, indent=2) | |