Spaces:
Sleeping
Sleeping
# src/utils.py
"""
Вспомогательные утилиты: загрузка JSONL, вычисление статистики по текстам,
создание сводной информации о корпусе и сохранение результатов.
"""
from __future__ import annotations
import json
from collections import Counter
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Any, Dict, Iterable, List, Tuple, Optional
import numpy as np
def load_jsonl(path: str, max_items: Optional[int] = None) -> List[Dict[str, Any]]:
    """Read a JSONL file, skipping blank and malformed lines.

    Args:
        path: Path to the JSONL file (UTF-8).
        max_items: If given, stop after reading this many raw file lines
            (blank/malformed lines count toward the limit).

    Returns:
        A list of the successfully parsed JSON objects, in file order.
    """
    records: List[Dict[str, Any]] = []
    with open(path, "r", encoding="utf-8") as handle:
        for line_no, raw in enumerate(handle):
            if max_items is not None and line_no >= max_items:
                break
            stripped = raw.strip()
            if not stripped:
                continue
            # Best-effort: a broken line is skipped, not fatal.
            try:
                parsed = json.loads(stripped)
            except json.JSONDecodeError:
                continue
            records.append(parsed)
    return records
def calculate_text_statistics(texts: Iterable[str], top_k: int = 50) -> Dict[str, Any]:
    """Compute word-level statistics over a collection of texts.

    Non-string entries and whitespace-only strings are ignored. Words are
    produced by plain ``str.split`` (whitespace tokenization).

    Args:
        texts: Iterable of candidate text strings.
        top_k: Number of most frequent words to report (0 -> empty list).

    Returns:
        Dict with keys ``total_texts``, ``total_words``, ``unique_words``,
        ``avg_words_per_text`` and ``most_common_words``.
    """
    valid = [text for text in texts if isinstance(text, str) and text.strip()]
    all_words = [word for text in valid for word in text.split()]
    n_texts = len(valid)
    n_words = len(all_words)
    return {
        "total_texts": n_texts,
        "total_words": n_words,
        "unique_words": len(set(all_words)),
        # Guard against division by zero on an empty corpus.
        "avg_words_per_text": (n_words / n_texts) if n_texts else 0.0,
        "most_common_words": Counter(all_words).most_common(top_k),
    }
@dataclass
class CorpusSummary:
    """Aggregate statistics for an article corpus.

    Plain data container. The ``@dataclass`` decorator is required: this
    class is instantiated with keyword arguments and is serialized via
    ``dataclasses.asdict`` elsewhere in the module; without the decorator
    both of those calls raise TypeError at runtime.
    """
    total_articles: int            # number of articles in the corpus
    total_words: int               # total whitespace-split word count
    avg_words_per_article: float   # total_words / total_articles (0.0 if empty)
    unique_words: int              # distinct word count
    categories: Dict[str, int]    # category name -> number of articles
def create_corpus_summary(articles: List[Dict[str, Any]]) -> CorpusSummary:
    """Build a CorpusSummary from a list of raw article dicts.

    Non-dict entries are ignored. Word statistics come from each article's
    ``text`` field; category counts from its ``category`` field (empty or
    non-string categories are dropped).

    Args:
        articles: Raw article records.

    Returns:
        A populated CorpusSummary.
    """
    dict_articles = [entry for entry in articles if isinstance(entry, dict)]
    texts = [entry.get("text", "") for entry in dict_articles]
    # top_k=0: we only need the aggregate counts, not the frequency ranking.
    stats = calculate_text_statistics(texts, top_k=0)

    category_counts: Counter = Counter()
    for entry in dict_articles:
        category = entry.get("category", "") or ""
        if isinstance(category, str) and category.strip():
            category_counts[category] += 1

    return CorpusSummary(
        total_articles=len(texts),
        total_words=stats["total_words"],
        avg_words_per_article=float(stats["avg_words_per_text"]),
        unique_words=stats["unique_words"],
        categories=dict(category_counts),
    )
| def save_corpus_summary(summary: CorpusSummary, out_path: str = "results/corpus_summary.json") -> None: | |
| Path(Path(out_path).parent).mkdir(parents=True, exist_ok=True) | |
| with open(out_path, "w", encoding="utf-8") as f: | |
| json.dump(asdict(summary), f, ensure_ascii=False, indent=2) | |