Spaces:
Sleeping
Sleeping
File size: 2,842 Bytes
54ccdcb 83b4881 54ccdcb 83b4881 54ccdcb 83b4881 54ccdcb 83b4881 54ccdcb 83b4881 54ccdcb 83b4881 54ccdcb 83b4881 54ccdcb 83b4881 54ccdcb 83b4881 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# src/utils.py
"""
Вспомогательные утилиты: загрузка JSONL, вычисление статистики по текстам,
создание сводной информации о корпусе и сохранение результатов.
"""
from __future__ import annotations
import json
from collections import Counter
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Any, Dict, Iterable, List, Tuple, Optional
import numpy as np
def load_jsonl(path: str, max_items: Optional[int] = None) -> List[Dict[str, Any]]:
    """Load records from a JSON Lines file (one JSON object per line).

    Blank lines and lines that fail to parse are skipped silently — this is
    deliberate best-effort loading of possibly dirty corpora.

    Args:
        path: Path to the UTF-8 encoded ``.jsonl`` file.
        max_items: If given, stop after this many *parsed* records.

    Returns:
        Parsed JSON objects, in file order.
    """
    items: List[Dict[str, Any]] = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Fix: cap on the number of parsed records, not raw file lines,
            # so skipped blank/malformed lines don't eat into the budget.
            if max_items is not None and len(items) >= max_items:
                break
            line = line.strip()
            if not line:
                continue
            try:
                items.append(json.loads(line))
            except json.JSONDecodeError:
                # Best-effort: skip malformed lines rather than abort the load.
                continue
    return items
def calculate_text_statistics(texts: Iterable[str], top_k: int = 50) -> Dict[str, Any]:
    """Compute simple word-level statistics over a collection of texts.

    Non-string entries and blank/whitespace-only strings are ignored.
    Tokenization is plain whitespace splitting (``str.split``).

    Args:
        texts: Iterable of candidate text strings.
        top_k: How many entries to include in ``most_common_words``.

    Returns:
        Dict with ``total_texts``, ``total_words``, ``unique_words``,
        ``avg_words_per_text`` and ``most_common_words`` (list of
        ``(word, count)`` pairs, most frequent first).
    """
    valid = [t for t in texts if isinstance(t, str) and t.strip()]
    n_texts = len(valid)
    # One pass: the Counter carries both the vocabulary and the token total.
    word_counts = Counter(word for text in valid for word in text.split())
    n_words = sum(word_counts.values())
    return {
        "total_texts": n_texts,
        "total_words": n_words,
        "unique_words": len(word_counts),
        "avg_words_per_text": n_words / n_texts if n_texts else 0.0,
        "most_common_words": word_counts.most_common(top_k),
    }
@dataclass
class CorpusSummary:
    """Aggregate statistics for an article corpus.

    Produced by create_corpus_summary() and serialized to JSON by
    save_corpus_summary() via dataclasses.asdict().
    """
    total_articles: int  # count of dict entries, including those with empty text
    total_words: int  # whitespace-split token count over all non-empty texts
    # NOTE: denominator is the number of NON-EMPTY texts (see
    # calculate_text_statistics), not total_articles — they differ when
    # some articles have blank/missing "text".
    avg_words_per_article: float
    unique_words: int  # distinct tokens across the corpus
    categories: Dict[str, int]  # non-blank category label -> article count
def create_corpus_summary(articles: List[Dict[str, Any]]) -> CorpusSummary:
    """Build a CorpusSummary from a list of article dicts.

    Non-dict entries are ignored. Text is read from the ``"text"`` key and
    the category from ``"category"``; blank or non-string categories are
    not counted. ``top_k=0`` suppresses the unused most-common-words list.
    """
    dict_articles = [item for item in articles if isinstance(item, dict)]
    texts = [item.get("text", "") for item in dict_articles]
    stats = calculate_text_statistics(texts, top_k=0)

    label_counts: Counter = Counter()
    for item in dict_articles:
        label = item.get("category", "") or ""
        if isinstance(label, str) and label.strip():
            label_counts[label] += 1

    return CorpusSummary(
        total_articles=len(texts),
        total_words=stats["total_words"],
        avg_words_per_article=float(stats["avg_words_per_text"]),
        unique_words=stats["unique_words"],
        categories=dict(label_counts),
    )
def save_corpus_summary(summary: CorpusSummary, out_path: str = "results/corpus_summary.json") -> None:
    """Serialize a CorpusSummary to pretty-printed UTF-8 JSON on disk.

    The parent directory of ``out_path`` is created if it does not exist.
    """
    target = Path(out_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(asdict(summary), ensure_ascii=False, indent=2)
    target.write_text(payload, encoding="utf-8")
|