File size: 2,842 Bytes
54ccdcb
 
83b4881
 
54ccdcb
 
83b4881
 
54ccdcb
 
83b4881
 
 
54ccdcb
83b4881
54ccdcb
 
83b4881
 
 
54ccdcb
83b4881
54ccdcb
 
83b4881
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54ccdcb
83b4881
 
 
 
 
54ccdcb
 
 
83b4881
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# src/utils.py
"""
Вспомогательные утилиты: загрузка JSONL, вычисление статистики по текстам,
создание сводной информации о корпусе и сохранение результатов.
"""

from __future__ import annotations

import json
from collections import Counter
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Any, Dict, Iterable, List, Tuple, Optional

import numpy as np


def load_jsonl(path: str, max_items: Optional[int] = None) -> List[Dict[str, Any]]:
    """Load records from a JSONL (one JSON object per line) file.

    Blank lines and lines that fail to parse as JSON are skipped silently
    (best-effort loading). Note that ``max_items`` caps the number of *lines
    read from the file*, not the number of records returned, so skipped
    lines still count toward the limit.

    Args:
        path: Path to the JSONL file.
        max_items: If given, stop after reading this many lines.

    Returns:
        A list of the successfully parsed JSON objects, in file order.
    """
    records: List[Dict[str, Any]] = []
    with open(path, "r", encoding="utf-8") as handle:
        for line_no, raw in enumerate(handle):
            if max_items is not None and line_no >= max_items:
                break
            stripped = raw.strip()
            if not stripped:
                continue
            try:
                parsed = json.loads(stripped)
            except json.JSONDecodeError:
                # Malformed line: skip and keep going (best-effort contract).
                continue
            records.append(parsed)
    return records


def calculate_text_statistics(texts: Iterable[str], top_k: int = 50) -> Dict[str, Any]:
    """Compute word-level statistics over a collection of texts.

    Non-string entries and strings that are empty/whitespace-only are
    ignored. Words are produced by whitespace splitting (``str.split``).

    Args:
        texts: Iterable of candidate texts.
        top_k: How many of the most frequent words to report
            (``top_k=0`` yields an empty list).

    Returns:
        Dict with keys ``total_texts``, ``total_words``, ``unique_words``,
        ``avg_words_per_text`` and ``most_common_words`` (list of
        ``(word, count)`` pairs).
    """
    valid_texts = [text for text in texts if isinstance(text, str) and text.strip()]
    n_texts = len(valid_texts)

    # Accumulate counts incrementally instead of materializing one big word list.
    word_counts: Counter = Counter()
    for text in valid_texts:
        word_counts.update(text.split())

    n_words = sum(word_counts.values())
    return {
        "total_texts": n_texts,
        "total_words": n_words,
        "unique_words": len(word_counts),
        "avg_words_per_text": n_words / n_texts if n_texts else 0.0,
        "most_common_words": word_counts.most_common(top_k),
    }


@dataclass
class CorpusSummary:
    """Aggregate statistics for a corpus of articles.

    Produced by :func:`create_corpus_summary` and serialized to JSON by
    :func:`save_corpus_summary`.
    """

    total_articles: int            # number of article dicts seen (including those with empty text)
    total_words: int               # total whitespace-split word count across all texts
    avg_words_per_article: float   # total_words / number of non-empty texts (0.0 if none)
    unique_words: int              # distinct words across the whole corpus
    categories: Dict[str, int]     # category name -> article count (empty/blank categories omitted)


def create_corpus_summary(articles: List[Dict[str, Any]]) -> CorpusSummary:
    """Build a :class:`CorpusSummary` from a list of article dicts.

    Each article is expected to carry ``"text"`` and ``"category"`` keys;
    non-dict entries are ignored entirely, missing/None categories are
    treated as empty and excluded from the category counts.
    """
    dict_articles = [item for item in articles if isinstance(item, dict)]
    texts = [article.get("text", "") for article in dict_articles]
    raw_categories = [article.get("category", "") or "" for article in dict_articles]

    # top_k=0 -> skip computing the most-common-words list; it is unused here.
    stats = calculate_text_statistics(texts, top_k=0)
    category_counts = Counter(
        cat for cat in raw_categories if isinstance(cat, str) and cat.strip()
    )

    return CorpusSummary(
        total_articles=len(texts),
        total_words=stats["total_words"],
        avg_words_per_article=float(stats["avg_words_per_text"]),
        unique_words=stats["unique_words"],
        categories=dict(category_counts),
    )


def save_corpus_summary(summary: CorpusSummary, out_path: str = "results/corpus_summary.json") -> None:
    Path(Path(out_path).parent).mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(asdict(summary), f, ensure_ascii=False, indent=2)