Spaces:

AILaborant
/

tg_medium

Running

App Files Files Community

tg_medium / app_internal.py

AILaborant

Create app_internal.py

9f6c3b2 verified 30 days ago

raw

history blame contribute delete

40.4 kB

	import csv
	import json
	import re
	from collections import Counter
	from pathlib import Path

	import numpy as np
	import tensorflow as tf
	from tensorflow.keras.layers import Dense, Dropout, Embedding, LayerNormalization, MultiHeadAttention
	from tensorflow.keras.models import Model

	# Special vocabulary entries. A fresh tokenizer places them first, in this
	# order, so they receive ids 0-3.
	PAD_TOKEN = "<pad>"
	UNK_TOKEN = "<unk>"
	START_TOKEN = "<start>"
	END_TOKEN = "<end>"
	SPECIAL_TOKENS = [PAD_TOKEN, UNK_TOKEN, START_TOKEN, END_TOKEN]

	# First-pass splitter: runs of word characters, a handful of multi-character
	# operators, or any single non-word character (whitespace included).
	# The alternation bars must be unescaped; an escaped bar matches a literal
	# "|" and collapses the whole pattern into one literal alternative.
	# NOTE(review): the `/\*` and `\*/` alternatives are reconstructed from a
	# damaged copy of this line - confirm against the upstream file.
	TOKEN_PATTERN = re.compile(
	    r"\w+|==|!=|<=|>=|->|=>|::|\+\+|--|&&|\|\||//|/\*|\*/|[^\w]",
	    re.UNICODE,
	)
	TOKENIZER_VERSION = "regex_subword_v1"
	DEFAULT_TOKEN_CHUNK_SIZE = 2
	DEFAULT_WORD_CHUNK_SIZE = 4
	DEFAULT_CYRILLIC_CHUNK_SIZE = 2

	# Second-pass splitter applied to word-like pieces: Cyrillic runs, upper-case
	# acronyms, capitalised/lower-case words, digit runs, underscore runs, or a
	# single non-word character.
	IDENTIFIER_BOUNDARY_PATTERN = re.compile(
	    r"[А-Яа-яЁё]+|[A-Z]+(?=[A-Z][a-z]|[0-9_]|$)|[A-Z]?[a-z]+|[0-9]+|_+|[^\w]",
	    re.UNICODE,
	)

	# Single characters that are always added to a fitted vocabulary so that
	# common text can be spelled out character by character.
	COMMON_FALLBACK_TOKENS = list(
	    "abcdefghijklmnopqrstuvwxyz"
	    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	    "0123456789"
	    "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"
	    "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"
	    "_ \n\t.,!?;:()[]{}<>+-*/=%'\"`\\|&^~#$@"
	)

	# Dataset handling: accepted file suffixes and the record keys probed when
	# turning a structured record into training text.
	SUPPORTED_DATA_EXTENSIONS = {".txt", ".md", ".json", ".jsonl", ".ndjson", ".csv"}
	TEXT_FIELDS = ("text", "content", "completion", "response", "output", "answer")
	QUESTION_FIELDS = ("question", "prompt", "instruction", "input", "query")
	PROMPT_FIELDS = QUESTION_FIELDS
	ANSWER_FIELDS = ("answer", "completion", "response", "output")

	# Labels used to render question/answer pairs and to locate the answer span.
	QUESTION_LABEL = "Question:"
	ANSWER_LABEL = "Answer:"

	# Defaults for the generated Colab notebook.
	TG_DATASET_PRO_ID = "AILaborant/tg_dataset_pro"
	DEFAULT_COLAB_NOTEBOOK = "tg_dataset_pro_colab_train.ipynb"


	class ThreeChunkTokenizer:
	    """Regex subword tokenizer tuned for Russian prose and code-like text.

	    Text is first split with ``TOKEN_PATTERN``. Word-like pieces are split on
	    identifier boundaries and cut into fixed-size chunks. Once a vocabulary
	    has been fitted, any piece missing from it is re-segmented by greedy
	    longest-match against that vocabulary.
	    """

	    def __init__(
	        self,
	        vocab=None,
	        max_vocab_size=128000,
	        chunk_size=DEFAULT_TOKEN_CHUNK_SIZE,
	        word_chunk_size=DEFAULT_WORD_CHUNK_SIZE,
	        cyrillic_chunk_size=DEFAULT_CYRILLIC_CHUNK_SIZE,
	        tokenizer_version=TOKENIZER_VERSION,
	    ):
	        """Create a tokenizer, optionally restoring an existing vocabulary.

	        Args:
	            vocab: Iterable of tokens in id order; when empty or ``None`` the
	                vocabulary starts with only the special tokens.
	            max_vocab_size: Target upper bound used by ``fit``.
	            chunk_size: Chunk length for the ``fixed_chunk_legacy`` mode.
	            word_chunk_size: Chunk length for non-Cyrillic word segments.
	            cyrillic_chunk_size: Chunk length for Cyrillic segments.
	            tokenizer_version: Selects the tokenization scheme.
	        """
	        self.max_vocab_size = max_vocab_size
	        # Chunk sizes are clamped to at least 1 so slicing always advances.
	        self.chunk_size = max(1, int(chunk_size))
	        self.word_chunk_size = max(1, int(word_chunk_size))
	        self.cyrillic_chunk_size = max(1, int(cyrillic_chunk_size))
	        self.tokenizer_version = tokenizer_version
	        self.vocab = list(vocab) if vocab else list(SPECIAL_TOKENS)
	        self.token_to_id = {token: idx for idx, token in enumerate(self.vocab)}
	        self.max_token_length = max((len(token) for token in self.vocab), default=1)

	    def _is_fitted(self):
	        """Return True when the vocabulary holds more than the special tokens."""
	        return len(self.vocab) > len(SPECIAL_TOKENS)

	    def _is_cyrillic(self, text):
	        """Return True when ``text`` contains at least one Cyrillic letter."""
	        return any("а" <= char.lower() <= "я" or char in "Ёё" for char in text)

	    def _chunk_segment(self, segment):
	        """Cut one identifier segment into chunks.

	        Underscore runs become single characters, digit runs are cut into
	        groups of four, and everything else uses the Cyrillic or word chunk
	        size.
	        """
	        if not segment:
	            return []
	        if segment.startswith("_"):
	            return list(segment)
	        if segment.isdigit():
	            return [segment[i : i + 4] for i in range(0, len(segment), 4)]

	        chunk_size = self.cyrillic_chunk_size if self._is_cyrillic(segment) else self.word_chunk_size
	        return [segment[i : i + chunk_size] for i in range(0, len(segment), chunk_size)]

	    def _tokenize_word_piece(self, piece):
	        """Split a word-like piece on identifier boundaries, then chunk it."""
	        tokens = []
	        for segment in IDENTIFIER_BOUNDARY_PATTERN.findall(piece):
	            tokens.extend(self._chunk_segment(segment))
	        return tokens

	    def _tokenize_raw(self, text):
	        """Tokenize ``text`` without consulting the vocabulary."""
	        tokens = []
	        for piece in TOKEN_PATTERN.findall(text):
	            if piece.isalnum() or "_" in piece:
	                tokens.extend(self._tokenize_word_piece(piece))
	            else:
	                tokens.append(piece)
	        return tokens

	    def _tokenize_fixed_chunk_legacy(self, text):
	        """Tokenize with the legacy scheme: fixed ``chunk_size`` slices of words."""
	        tokens = []
	        for piece in TOKEN_PATTERN.findall(text):
	            if piece.isalnum() or "_" in piece:
	                tokens.extend(piece[i : i + self.chunk_size] for i in range(0, len(piece), self.chunk_size))
	            else:
	                tokens.append(piece)
	        return tokens

	    def _tokenize_with_vocab(self, token):
	        """Re-segment ``token`` by greedy longest-match against the vocabulary.

	        A character that starts no known token yields one ``UNK_TOKEN`` and
	        the scan advances by a single character.
	        """
	        if token in self.token_to_id:
	            return [token]

	        pieces = []
	        index = 0
	        while index < len(token):
	            match = None
	            # No vocabulary entry is longer than max_token_length, so longer
	            # candidates need not be tried.
	            max_end = min(len(token), index + self.max_token_length)
	            for end in range(max_end, index, -1):
	                candidate = token[index:end]
	                if candidate in self.token_to_id:
	                    match = candidate
	                    break

	            if match is None:
	                pieces.append(UNK_TOKEN)
	                index += 1
	            else:
	                pieces.append(match)
	                index += len(match)
	        return pieces

	    def tokenize(self, text):
	        """Return the list of string tokens for ``text``."""
	        if self.tokenizer_version == "fixed_chunk_legacy":
	            return self._tokenize_fixed_chunk_legacy(text)

	        tokens = self._tokenize_raw(text)
	        if not self._is_fitted():
	            return tokens

	        segmented = []
	        for token in tokens:
	            segmented.extend(self._tokenize_with_vocab(token))
	        return segmented

	    def detokenize(self, tokens):
	        """Concatenate ``tokens`` into text, dropping the special tokens."""
	        return "".join(token for token in tokens if token not in SPECIAL_TOKENS)

	    def fit(self, texts):
	        """Build the vocabulary from ``texts`` and return ``self``.

	        The vocabulary is: the special tokens, then every fallback character
	        and every character seen in ``texts`` (always kept, even if that
	        exceeds ``max_vocab_size``), then the most frequent raw tokens until
	        ``max_vocab_size`` is reached.
	        """
	        counts = Counter()
	        character_counts = Counter()
	        for text in texts:
	            counts.update(self._tokenize_raw(text))
	            character_counts.update(text)

	        required_tokens = []
	        seen = set(SPECIAL_TOKENS)
	        for token in COMMON_FALLBACK_TOKENS + [token for token, _ in character_counts.most_common()]:
	            if token not in seen:
	                required_tokens.append(token)
	                seen.add(token)

	        remaining = max(0, self.max_vocab_size - len(SPECIAL_TOKENS) - len(required_tokens))
	        learned_tokens = []
	        for token, _ in counts.most_common():
	            # Check the budget before adding so a zero budget adds nothing.
	            if len(learned_tokens) >= remaining:
	                break
	            if token in seen:
	                continue
	            learned_tokens.append(token)
	            seen.add(token)

	        self.vocab = list(SPECIAL_TOKENS) + required_tokens + learned_tokens
	        self.token_to_id = {token: idx for idx, token in enumerate(self.vocab)}
	        self.max_token_length = max((len(token) for token in self.vocab), default=1)
	        return self

	    def encode(self, text, add_boundaries=True, maxlen=None):
	        """Convert ``text`` to token ids.

	        Args:
	            text: Input string.
	            add_boundaries: Wrap the tokens in ``START_TOKEN``/``END_TOKEN``.
	            maxlen: When given, truncate to this length and right-pad with
	                the pad id so exactly ``maxlen`` ids are returned.
	        """
	        tokens = self.tokenize(text)
	        if add_boundaries:
	            tokens = [START_TOKEN] + tokens + [END_TOKEN]

	        unk_id = self.token_to_id[UNK_TOKEN]
	        ids = [self.token_to_id.get(token, unk_id) for token in tokens]
	        if maxlen is not None:
	            ids = ids[:maxlen]
	            ids.extend([self.token_to_id[PAD_TOKEN]] * (maxlen - len(ids)))
	        return ids

	    def decode_ids(self, ids):
	        """Convert token ids back to text, skipping ids outside the vocabulary."""
	        vocab_size = len(self.vocab)
	        # Negative ids are rejected too; otherwise they would silently index
	        # from the end of the vocabulary.
	        tokens = [self.vocab[int(idx)] for idx in ids if 0 <= int(idx) < vocab_size]
	        return self.detokenize(tokens)

	    def to_dict(self):
	        """Return a JSON-serialisable snapshot of the tokenizer settings."""
	        return {
	            "vocab": self.vocab,
	            "max_vocab_size": self.max_vocab_size,
	            "chunk_size": self.chunk_size,
	            "word_chunk_size": self.word_chunk_size,
	            "cyrillic_chunk_size": self.cyrillic_chunk_size,
	            "tokenizer_version": self.tokenizer_version,
	        }

	    @classmethod
	    def from_dict(cls, data):
	        """Rebuild a tokenizer from the mapping produced by ``to_dict``."""
	        # Older saved artifacts did not store this value and were trained with
	        # 3-character chunks, so keep those models' prompt tokenization stable.
	        return cls(
	            vocab=data["vocab"],
	            max_vocab_size=data.get("max_vocab_size", 128000),
	            chunk_size=data.get("chunk_size", 3),
	            word_chunk_size=data.get("word_chunk_size", data.get("chunk_size", 3)),
	            cyrillic_chunk_size=data.get("cyrillic_chunk_size", data.get("chunk_size", 3)),
	            tokenizer_version=data.get("tokenizer_version", "fixed_chunk_legacy"),
	        )


	@tf.keras.utils.register_keras_serializable()
	class TransformerBlock(tf.keras.layers.Layer):
	    """Causally masked self-attention followed by a two-layer feed-forward net.

	    Each sub-block applies dropout, a residual connection and layer
	    normalisation.
	    """

	    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.2, **kwargs):
	        super().__init__(**kwargs)
	        # Constructor arguments are kept so get_config() can reproduce them.
	        self.embed_dim, self.num_heads = embed_dim, num_heads
	        self.ff_dim, self.rate = ff_dim, rate

	        # Sub-layers are created in a fixed order: attention, feed-forward,
	        # the two normalisations, then the two dropouts.
	        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
	        feed_forward_layers = [
	            Dense(ff_dim, activation="relu"),
	            Dense(embed_dim),
	        ]
	        self.ffn = tf.keras.Sequential(feed_forward_layers)

	        self.layernorm1 = LayerNormalization(epsilon=1e-5)
	        self.layernorm2 = LayerNormalization(epsilon=1e-5)
	        self.dropout1 = Dropout(rate)
	        self.dropout2 = Dropout(rate)

	    def call(self, inputs, training=None):
	        """Apply the block to ``inputs`` and return a tensor of the same dtype path."""
	        length = tf.shape(inputs)[1]
	        # Lower-triangular boolean mask: position i may attend to j <= i only.
	        all_pairs = tf.ones((length, length), dtype=tf.bool)
	        lower_triangle = tf.linalg.band_part(all_pairs, -1, 0)

	        attended = self.att(inputs, inputs, attention_mask=lower_triangle)
	        attended = self.dropout1(attended, training=training)
	        # Cast back so the residual sum uses matching dtypes.
	        attended = tf.cast(attended, inputs.dtype)
	        residual = self.layernorm1(inputs + attended)

	        projected = self.ffn(residual)
	        projected = self.dropout2(projected, training=training)
	        projected = tf.cast(projected, residual.dtype)
	        return self.layernorm2(residual + projected)

	    def build(self, input_shape):
	        """Build the sub-layers for ``input_shape``."""
	        self.att.build(input_shape, input_shape)
	        for sublayer in (self.ffn, self.layernorm1, self.layernorm2):
	            sublayer.build(input_shape)
	        super().build(input_shape)

	    def get_config(self):
	        """Return the base layer config extended with the constructor arguments."""
	        return {
	            **super().get_config(),
	            "embed_dim": self.embed_dim,
	            "num_heads": self.num_heads,
	            "ff_dim": self.ff_dim,
	            "rate": self.rate,
	        }


	@tf.keras.utils.register_keras_serializable()
	class TokenAndPositionEmbedding(tf.keras.layers.Layer):
	    """Sum of a learned token embedding and a learned position embedding."""

	    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
	        super().__init__(**kwargs)
	        # Constructor arguments are kept so get_config() can reproduce them.
	        self.maxlen, self.vocab_size, self.embed_dim = maxlen, vocab_size, embed_dim

	        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
	        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

	    def call(self, x):
	        """Embed the token ids in ``x`` and add per-position embeddings."""
	        sequence_length = tf.shape(x)[-1]
	        position_ids = tf.range(start=0, limit=sequence_length, delta=1)
	        position_vectors = self.pos_emb(position_ids)
	        token_vectors = self.token_emb(x)
	        # Match dtypes before adding the two embeddings.
	        return token_vectors + tf.cast(position_vectors, token_vectors.dtype)

	    def build(self, input_shape):
	        """Build both embedding tables."""
	        self.token_emb.build(input_shape)
	        self.pos_emb.build((self.maxlen,))
	        super().build(input_shape)

	    def get_config(self):
	        """Return the base layer config extended with the constructor arguments."""
	        return {
	            **super().get_config(),
	            "maxlen": self.maxlen,
	            "vocab_size": self.vocab_size,
	            "embed_dim": self.embed_dim,
	        }


	def configure_tensorflow(use_mixed_precision=True, use_xla=True):
	    """Apply best-effort TensorFlow runtime settings and report what took effect.

	    Returns:
	        dict with ``gpu_count``, ``memory_growth``, ``mixed_precision`` and
	        ``xla`` entries; the three flags are True only if the call succeeded.
	    """
	    gpus = tf.config.list_physical_devices("GPU")

	    memory_growth_enabled = False
	    for gpu in gpus:
	        try:
	            tf.config.experimental.set_memory_growth(gpu, True)
	        except (RuntimeError, ValueError):
	            # Colab may initialize the GPU before training starts. In that case
	            # memory growth cannot be changed, but training can continue safely.
	            continue
	        memory_growth_enabled = True

	    mixed_precision_enabled = False
	    if use_mixed_precision and gpus:
	        try:
	            tf.keras.mixed_precision.set_global_policy("mixed_float16")
	        except ValueError:
	            pass
	        else:
	            mixed_precision_enabled = True

	    xla_enabled = False
	    if use_xla:
	        try:
	            tf.config.optimizer.set_jit(True)
	        except ValueError:
	            pass
	        else:
	            xla_enabled = True

	    return {
	        "gpu_count": len(gpus),
	        "memory_growth": memory_growth_enabled,
	        "mixed_precision": mixed_precision_enabled,
	        "xla": xla_enabled,
	    }


	def build_model(
	    vocab_size,
	    maxlen=100,
	    embed_dim=96,
	    num_heads=4,
	    ff_dim=192,
	    dropout=0.2,
	    num_layers=1,
	    jit_compile=True,
	):
	    """Build and compile the next-token prediction model.

	    The network is an embedding layer, at least one ``TransformerBlock``, and
	    a softmax projection over the vocabulary computed in float32.
	    """
	    block_count = max(1, int(num_layers))

	    inputs = tf.keras.Input(shape=(maxlen,), dtype=tf.int32)
	    hidden = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)(inputs)
	    for _ in range(block_count):
	        hidden = TransformerBlock(embed_dim, num_heads, ff_dim, dropout)(hidden)
	    outputs = Dense(vocab_size, activation="softmax", dtype="float32")(hidden)

	    model = Model(inputs=inputs, outputs=outputs)
	    # The accuracy metric is weighted, so it follows the per-token sample weights.
	    accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="answer_accuracy")
	    model.compile(
	        optimizer="adam",
	        loss="sparse_categorical_crossentropy",
	        weighted_metrics=[accuracy],
	        jit_compile=jit_compile,
	    )
	    return model


	def stringify_value(value):
	    """Render an arbitrary record value as stripped text.

	    ``None`` becomes an empty string, lists are joined with newlines after
	    dropping empty items, and dicts are delegated to ``record_to_text``.
	    """
	    if value is None:
	        return ""
	    if isinstance(value, dict):
	        return record_to_text(value)
	    if isinstance(value, list):
	        rendered = (stringify_value(item) for item in value)
	        return "\n".join(part for part in rendered if part)
	    if isinstance(value, (int, float, bool)):
	        return str(value)
	    text = value if isinstance(value, str) else str(value)
	    return text.strip()


	def messages_to_text(messages):
	    """Flatten a list of chat messages into newline-separated text.

	    Dict messages are rendered as ``role: content`` (or just the content when
	    no role is present); anything else is stringified as-is. Empty entries
	    are skipped.
	    """
	    lines = []
	    for message in messages:
	        if isinstance(message, dict):
	            role = stringify_value(message.get("role") or message.get("from") or message.get("speaker"))
	            content = stringify_value(message.get("content") or message.get("value") or message.get("text"))
	            if not content:
	                continue
	            lines.append(f"{role}: {content}" if role else content)
	        else:
	            text = stringify_value(message)
	            if text:
	                lines.append(text)
	    return "\n".join(lines).strip()


	def first_field(record, field_names):
	    """Return the first non-empty stringified value among ``field_names``.

	    Fields are probed in order; an empty string is returned when none of
	    them yields text.
	    """
	    # map() is lazy, so later fields are only stringified when needed.
	    for text in map(stringify_value, map(record.get, field_names)):
	        if text:
	            return text
	    return ""


	def format_question_answer(question, answer):
	    """Render a question/answer pair as two labelled lines."""
	    question_line = f"{QUESTION_LABEL} {question}"
	    answer_line = f"{ANSWER_LABEL} {answer}"
	    return "\n".join((question_line, answer_line))


	def format_question_prompt(question):
	    """Render a question followed by a bare answer label for the model to complete."""
	    question_line = f"{QUESTION_LABEL} {question}"
	    return "\n".join((question_line, ANSWER_LABEL))


	def record_to_text(record):
	    """Convert one dataset record into a training string.

	    Strategies are tried in order: a question/answer pair, a ``messages``
	    list, a single direct text field, and finally a ``key: value`` dump of
	    every non-empty field. Non-dict records are simply stringified.
	    """
	    if not isinstance(record, dict):
	        return stringify_value(record)

	    question = first_field(record, QUESTION_FIELDS)
	    answer = first_field(record, ANSWER_FIELDS)
	    if question and answer:
	        return format_question_answer(question, answer)

	    messages = record.get("messages")
	    if isinstance(messages, list):
	        conversation = messages_to_text(messages)
	        if conversation:
	            return conversation

	    direct_text = first_field(record, TEXT_FIELDS)
	    if direct_text:
	        return direct_text

	    # Last resort: dump every field that has some text.
	    rendered = ((key, stringify_value(value)) for key, value in record.items())
	    return "\n".join(f"{key}: {text}" for key, text in rendered if text).strip()


	def walk_json_records(data):
	    """Yield the leaf records contained in a parsed JSON document.

	    Lists are flattened recursively. A dict is treated as a container when
	    any of its well-known keys holds a list, in which case only those lists
	    are walked; otherwise the dict itself is a record. Other values are
	    yielded unchanged.
	    """
	    if isinstance(data, list):
	        for item in data:
	            yield from walk_json_records(item)
	        return

	    if not isinstance(data, dict):
	        yield data
	        return

	    container_keys = ("train", "validation", "test", "data", "examples", "rows", "records")
	    nested_lists = [data[key] for key in container_keys if isinstance(data.get(key), list)]
	    if not nested_lists:
	        yield data
	        return

	    for nested in nested_lists:
	        yield from walk_json_records(nested)


	def read_json_file(path):
	    """Read a ``.json`` dataset file and return one text per record.

	    Raises:
	        json.JSONDecodeError: when the file is neither a JSON document nor
	            something that looks like JSONL.
	    """
	    raw_text = path.read_text(encoding="utf-8-sig")
	    try:
	        data = json.loads(raw_text)
	    except json.JSONDecodeError as exc:
	        # Some datasets are JSONL but are still named .json. If parsing a full
	        # JSON document fails, try reading one JSON object per line.
	        if "Extra data" in exc.msg:
	            return read_jsonl_text(raw_text, path)
	        raise
	    return [record_to_text(record) for record in walk_json_records(data)]


	def read_jsonl_text(raw_text, path):
	    """Parse JSON-lines content and return one text per non-blank line.

	    Args:
	        raw_text: Full file content.
	        path: Only used to label error messages.

	    Raises:
	        ValueError: when a non-blank line is not valid JSON.
	    """
	    texts = []
	    for line_number, raw_line in enumerate(raw_text.splitlines(), start=1):
	        stripped = raw_line.strip()
	        if stripped:
	            try:
	                record = json.loads(stripped)
	            except json.JSONDecodeError as exc:
	                raise ValueError(f"Invalid JSON on line {line_number} in {path}") from exc
	            texts.append(record_to_text(record))
	    return texts


	def read_jsonl_file(path):
	    """Read a JSON-lines file (a UTF-8 BOM is tolerated) and return its texts."""
	    raw_text = path.read_text(encoding="utf-8-sig")
	    return read_jsonl_text(raw_text, path)


	def read_csv_file(path):
	    """Read a CSV file with a header row and return one text per data row."""
	    # utf-8-sig matches the JSON readers: a leading BOM (common in spreadsheet
	    # exports) would otherwise become part of the first column name and hide
	    # that column from the field lookups.
	    with path.open("r", encoding="utf-8-sig", newline="") as file:
	        rows = csv.DictReader(file)
	        return [record_to_text(row) for row in rows]


	def read_dataset_file(path):
	    """Load one dataset file, dispatching on its suffix.

	    Returns:
	        A list of training texts; empty when the suffix is not supported.
	    """
	    extension = path.suffix.lower()
	    if extension in {".txt", ".md"}:
	        # utf-8-sig keeps a leading BOM out of the training text, consistent
	        # with the JSON readers.
	        return [path.read_text(encoding="utf-8-sig")]
	    if extension == ".json":
	        return read_json_file(path)
	    if extension in {".jsonl", ".ndjson"}:
	        return read_jsonl_file(path)
	    if extension == ".csv":
	        return read_csv_file(path)
	    return []


	def read_training_texts(paths):
	    """Collect training texts from files and (recursively) from directories.

	    Raises:
	        ValueError: when no non-blank text was found.
	    """
	    collected = []
	    for raw_path in paths:
	        path = Path(raw_path)
	        if not path.is_dir():
	            collected.extend(read_dataset_file(path))
	            continue
	        # Sorted traversal keeps the order of texts reproducible.
	        for candidate in sorted(path.rglob("*")):
	            if candidate.is_file() and candidate.suffix.lower() in SUPPORTED_DATA_EXTENSIONS:
	                collected.extend(read_dataset_file(candidate))

	    usable = [text for text in collected if text and text.strip()]
	    if usable:
	        return usable

	    supported = ", ".join(sorted(SUPPORTED_DATA_EXTENSIONS))
	    raise ValueError(f"No training text found. Supported dataset files: {supported}")

	def estimate_dataset_defaults(paths):
	    """Scan the dataset and suggest training hyper-parameters.

	    Texts are tokenized with an unfitted tokenizer. Each text counts two
	    extra tokens for the start/end boundaries.

	    Returns:
	        dict with dataset statistics (``records``, ``tokens``,
	        ``unique_tokens``) and recommended training settings.

	    Raises:
	        ValueError: propagated from ``read_training_texts`` when no text is found.
	    """
	    texts = read_training_texts(paths)
	    # One unfitted tokenizer serves every text; building it per text is wasted work.
	    tokenizer = ThreeChunkTokenizer()
	    counter = Counter()
	    token_count = 0
	    sequence_lengths = []
	    for text in texts:
	        tokens = tokenizer.tokenize(text)
	        counter.update(tokens)
	        sequence_length = len(tokens) + 2
	        sequence_lengths.append(sequence_length)
	        token_count += sequence_length

	    unique_tokens = len(counter)
	    length_p95 = int(np.percentile(sequence_lengths, 95)) if sequence_lengths else 96
	    # Round the 95th-percentile length up to a multiple of 32, within [96, 256].
	    recommended_maxlen = min(256, max(96, ((length_p95 + 31) // 32) * 32))
	    recommended_vocab = min(32000, max(2048, unique_tokens + len(SPECIAL_TOKENS) + 512))
	    is_large = token_count >= 1_000_000
	    recommended_stride = 48 if is_large else 32
	    recommended_max_sequences = 60000 if is_large else 0

	    return {
	        "records": len(texts),
	        "tokens": token_count,
	        "unique_tokens": unique_tokens,
	        "epochs": 3,
	        "batch_size": 128,
	        "validation_split": 0.02,
	        "maxlen": recommended_maxlen,
	        "stride": recommended_stride,
	        "max_vocab_size": recommended_vocab,
	        "max_sequences": recommended_max_sequences,
	        "embed_dim": 96,
	        "num_heads": 4,
	        "num_layers": 2,
	        "ff_dim": 192,
	        "dropout": 0.15,
	        "answer_only_loss": True,
	    }


	def find_subsequence(values, pattern):
	    """Return the index of the first occurrence of ``pattern`` in ``values``.

	    Returns -1 when ``pattern`` is empty, longer than ``values``, or absent.
	    """
	    if not pattern or len(pattern) > len(values):
	        return -1
	    width = len(pattern)
	    hits = (
	        start
	        for start in range(len(values) - width + 1)
	        if values[start : start + width] == pattern
	    )
	    return next(hits, -1)


	def make_loss_mask(ids, tokenizer, answer_only_loss=True):
	    """Return a float32 per-token mask selecting the positions that carry loss.

	    With ``answer_only_loss`` the mask is 0 up to and including the first
	    occurrence of the encoded answer label and 1 afterwards. When the label
	    is absent, or answer-only loss is disabled, every position is 1.
	    """
	    length = len(ids)
	    if not answer_only_loss:
	        return np.ones(length, dtype="float32")

	    label_ids = tokenizer.encode(ANSWER_LABEL, add_boundaries=False)
	    label_at = find_subsequence(ids, label_ids)
	    if label_at < 0:
	        return np.ones(length, dtype="float32")

	    answer_start = label_at + len(label_ids)
	    return (np.arange(length) >= answer_start).astype("float32")


	def make_training_arrays(texts, tokenizer, maxlen, stride=48, max_sequences=0, answer_only_loss=True):
	    """Slice encoded texts into fixed-length next-token training windows.

	    Each window holds ``maxlen + 1`` ids: the first ``maxlen`` are the inputs
	    and the last ``maxlen`` are the targets.

	    Returns:
	        ``(inputs, targets, sample_weights)`` arrays with one row per window.
	        Collection stops early once ``max_sequences`` windows exist (0 means
	        no limit).
	    """
	    stride = max(1, int(stride))
	    max_sequences = max(0, int(max_sequences))
	    pad_id = tokenizer.token_to_id[PAD_TOKEN]
	    window = maxlen + 1
	    sequences, weights = [], []

	    def as_arrays():
	        # Inputs drop the final id of each window, targets drop the first.
	        data = np.asarray(sequences, dtype="int32")
	        sample_weights = np.asarray(weights, dtype="float32")
	        return data[:, :-1], data[:, 1:], sample_weights

	    for text in texts:
	        ids = tokenizer.encode(text, add_boundaries=True)
	        loss_mask = make_loss_mask(ids, tokenizer, answer_only_loss=answer_only_loss)
	        shortfall = window - len(ids)
	        if shortfall > 0:
	            # Short texts are right-padded; padding never contributes to the loss.
	            ids.extend([pad_id] * shortfall)
	            loss_mask = np.pad(loss_mask, (0, shortfall), constant_values=0.0)

	        last_start = max(0, len(ids) - window)
	        starts = list(range(0, last_start + 1, stride))
	        # Always include a window that reaches the end of the text.
	        if starts[-1] != last_start:
	            starts.append(last_start)

	        for start in starts:
	            stop = start + window
	            target_weights = loss_mask[start + 1 : stop]
	            # Windows with no weighted target would add nothing to the loss.
	            if answer_only_loss and not np.any(target_weights):
	                continue

	            sequences.append(ids[start:stop])
	            weights.append(target_weights)
	            if max_sequences and len(sequences) >= max_sequences:
	                return as_arrays()

	    if not sequences:
	        # Keep the output non-empty with one all-padding window.
	        sequences.append([pad_id] * window)
	        weights.append(np.ones(maxlen, dtype="float32"))

	    return as_arrays()


	def make_tf_datasets(x, y, sample_weights, batch_size, validation_split=0.0, shuffle_buffer=10000):
	    """Shuffle, split and batch the training arrays into ``tf.data`` datasets.

	    Args:
	        x, y, sample_weights: Arrays with one row per training window.
	        batch_size: Batch size for both datasets.
	        validation_split: Fraction of rows held out for validation, in [0, 1).
	        shuffle_buffer: Upper bound for the training shuffle buffer.

	    Returns:
	        ``(train_ds, val_ds, train_count, val_count)``; ``val_ds`` is ``None``
	        and ``val_count`` is 0 when nothing is held out.

	    Raises:
	        ValueError: when ``validation_split`` is outside [0, 1).
	    """
	    # A split of 1 or more would leave no training rows, and a negative split
	    # would slice from the wrong end, so reject both up front.
	    if not 0.0 <= validation_split < 1.0:
	        raise ValueError(f"validation_split must be in the range [0, 1), got {validation_split}")

	    # Fixed seed: the train/validation partition is reproducible across runs.
	    indices = np.random.default_rng(42).permutation(len(x))
	    x = x[indices]
	    y = y[indices]
	    sample_weights = sample_weights[indices]

	    val_count = int(len(x) * validation_split)
	    if validation_split > 0 and val_count == 0 and len(x) > 1:
	        val_count = 1
	    # Always keep at least one training row.
	    val_count = max(0, min(val_count, len(x) - 1))

	    if val_count:
	        train_x, val_x = x[:-val_count], x[-val_count:]
	        train_y, val_y = y[:-val_count], y[-val_count:]
	        train_weights, val_weights = sample_weights[:-val_count], sample_weights[-val_count:]
	    else:
	        train_x, train_y = x, y
	        train_weights = sample_weights
	        val_x, val_y = None, None
	        val_weights = None

	    train_ds = tf.data.Dataset.from_tensor_slices((train_x, train_y, train_weights))
	    train_ds = train_ds.shuffle(min(len(train_x), shuffle_buffer), reshuffle_each_iteration=True)
	    train_ds = train_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

	    val_ds = None
	    if val_x is not None:
	        val_ds = tf.data.Dataset.from_tensor_slices((val_x, val_y, val_weights))
	        val_ds = val_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

	    return train_ds, val_ds, len(train_x), 0 if val_x is None else len(val_x)


	def model_paths(model_dir):
	    """Return the ``(model, tokenizer, config)`` file paths inside ``model_dir``."""
	    root = Path(model_dir)
	    return tuple(root / name for name in ("model.keras", "tokenizer.json", "config.json"))


	def save_artifacts(model, tokenizer, model_dir, config):
	    """Write the model, tokenizer and config into ``model_dir``, creating it if needed."""
	    target_dir = Path(model_dir)
	    target_dir.mkdir(parents=True, exist_ok=True)
	    model_path, tokenizer_path, config_path = model_paths(target_dir)

	    model.save(model_path)
	    # Both JSON files use the same formatting and keep non-ASCII tokens readable.
	    for destination, payload in ((tokenizer_path, tokenizer.to_dict()), (config_path, config)):
	        serialized = json.dumps(payload, ensure_ascii=False, indent=2)
	        destination.write_text(serialized, encoding="utf-8")


	def load_artifacts(model_dir="tg-medium"):
	    """Load the saved model, tokenizer and config from ``model_dir``.

	    Returns:
	        ``(model, tokenizer, config)``; ``config`` is an empty dict when the
	        config file is missing.

	    Raises:
	        FileNotFoundError: when the model or tokenizer file does not exist.
	    """
	    model_path, tokenizer_path, config_path = model_paths(model_dir)
	    for label, required_path in (("Model", model_path), ("Tokenizer", tokenizer_path)):
	        if not required_path.exists():
	            raise FileNotFoundError(f"{label} file not found: {required_path}")

	    custom_layers = {
	        "TokenAndPositionEmbedding": TokenAndPositionEmbedding,
	        "TransformerBlock": TransformerBlock,
	    }
	    # compile=False: the model is loaded without its training configuration.
	    model = tf.keras.models.load_model(model_path, custom_objects=custom_layers, compile=False)

	    tokenizer_data = json.loads(tokenizer_path.read_text(encoding="utf-8"))
	    tokenizer = ThreeChunkTokenizer.from_dict(tokenizer_data)

	    config = {}
	    if config_path.exists():
	        config = json.loads(config_path.read_text(encoding="utf-8"))
	    return model, tokenizer, config


	def clean_answer_text(text):
	    """Trim generated text down to the answer itself.

	    Everything from the first question label or ``Prompt:`` marker onwards is
	    cut off, and one leading answer label is removed.
	    """
	    cleaned = text.strip()
	    for marker in (QUESTION_LABEL, "Prompt:"):
	        head, found, _ = cleaned.partition(marker)
	        if found:
	            cleaned = head.strip()

	    if cleaned.startswith(ANSWER_LABEL):
	        cleaned = cleaned[len(ANSWER_LABEL) :].strip()

	    return cleaned.strip()


	def _sample_next_id(predictions, temperature):
	    """Choose the next token id from a probability vector.

	    A non-positive temperature selects the most likely id; otherwise the
	    distribution is re-scaled by ``temperature`` and sampled.
	    """
	    if temperature <= 0:
	        return int(np.argmax(predictions))

	    # float64 plus the max-shift keeps exp() from underflowing to all zeros
	    # at small temperatures, which would turn the probabilities into NaN.
	    probabilities = np.asarray(predictions, dtype="float64")
	    logits = np.log(np.maximum(probabilities, 1e-9)) / temperature
	    weights = np.exp(logits - np.max(logits))
	    weights /= np.sum(weights)
	    return int(np.random.choice(len(weights), p=weights))


	def generate_text(model, tokenizer, prompt, maxlen=100, max_new_tokens=80, temperature=1.0, qa_mode=True):
	    """Generate a continuation for ``prompt`` one token at a time.

	    Args:
	        model: Model whose ``predict`` returns per-position token probabilities.
	        tokenizer: Tokenizer providing ``encode``, ``decode_ids`` and ``token_to_id``.
	        prompt: User text; wrapped in the question/answer template when ``qa_mode``.
	        maxlen: Number of ids fed to the model; older context is dropped.
	        max_new_tokens: Upper bound on generated tokens.
	        temperature: Sampling temperature; 0 or less means greedy decoding.
	        qa_mode: Apply the prompt template and clean the answer text.

	    Returns:
	        The decoded text, generation stopping early at the end token.
	    """
	    if qa_mode:
	        prompt = format_question_prompt(prompt)

	    context = [tokenizer.token_to_id[START_TOKEN]]
	    context.extend(tokenizer.encode(prompt, add_boundaries=False))
	    end_id = tokenizer.token_to_id[END_TOKEN]
	    pad_id = tokenizer.token_to_id[PAD_TOKEN]

	    generated = []
	    for _ in range(max_new_tokens):
	        input_ids = context[-maxlen:]
	        # The prediction for the next token sits at the last real position,
	        # before the right padding.
	        last_position = len(input_ids) - 1
	        padded = input_ids + [pad_id] * (maxlen - len(input_ids))
	        predictions = model.predict(np.asarray([padded], dtype="int32"), verbose=0)[0, last_position]

	        next_id = _sample_next_id(predictions, temperature)
	        if next_id == end_id:
	            break
	        context.append(next_id)
	        generated.append(next_id)

	    output = tokenizer.decode_ids(generated)
	    return clean_answer_text(output) if qa_mode else output


	def train_command(args):
	    """Train a model from the dataset paths in ``args`` and save the artifacts.

	    ``args`` must provide: ``data``, ``model_dir``, ``epochs``, ``batch_size``,
	    ``validation_split``, ``maxlen``, ``stride``, ``max_vocab_size``,
	    ``max_sequences``, ``embed_dim``, ``num_heads``, ``num_layers``,
	    ``ff_dim``, ``dropout``, ``answer_only_loss``, ``mixed_precision``, ``xla``.
	    """
	    tf_settings = configure_tensorflow(
	        use_mixed_precision=args.mixed_precision,
	        use_xla=args.xla,
	    )
	    texts = read_training_texts(args.data)
	    tokenizer = ThreeChunkTokenizer(max_vocab_size=args.max_vocab_size).fit(texts)
	    x, y, sample_weights = make_training_arrays(
	        texts,
	        tokenizer,
	        args.maxlen,
	        stride=args.stride,
	        max_sequences=args.max_sequences,
	        answer_only_loss=args.answer_only_loss,
	    )
	    train_ds, val_ds, train_count, val_count = make_tf_datasets(
	        x,
	        y,
	        sample_weights,
	        batch_size=args.batch_size,
	        validation_split=args.validation_split,
	    )

	    model = build_model(
	        vocab_size=len(tokenizer.vocab),
	        maxlen=args.maxlen,
	        embed_dim=args.embed_dim,
	        num_heads=args.num_heads,
	        ff_dim=args.ff_dim,
	        dropout=args.dropout,
	        num_layers=args.num_layers,
	        jit_compile=args.xla,
	    )
	    print(
	        f"Training on {train_count:,} sequences"
	        f"{f' with {val_count:,} validation sequences' if val_count else ''}."
	    )
	    # Plain " | " separators: a backslash before the bar is an invalid escape
	    # sequence and would be printed literally.
	    print(
	        f"Vocab: {len(tokenizer.vocab):,} | Batch: {args.batch_size} | "
	        f"Stride: {args.stride} | Blocks: {args.num_layers} | GPUs: {tf_settings['gpu_count']} | "
	        f"Answer-only loss: {args.answer_only_loss} | "
	        f"Memory growth: {tf_settings['memory_growth']} | "
	        f"Mixed precision: {tf_settings['mixed_precision']} | XLA: {tf_settings['xla']}"
	    )
	    # Saved next to the model so inference can restore the same settings.
	    config = {
	        "maxlen": args.maxlen,
	        "embed_dim": args.embed_dim,
	        "num_heads": args.num_heads,
	        "num_layers": args.num_layers,
	        "ff_dim": args.ff_dim,
	        "dropout": args.dropout,
	        "vocab_size": len(tokenizer.vocab),
	        "tokenizer_chunk_size": tokenizer.chunk_size,
	        "tokenizer_word_chunk_size": tokenizer.word_chunk_size,
	        "tokenizer_cyrillic_chunk_size": tokenizer.cyrillic_chunk_size,
	        "tokenizer_version": tokenizer.tokenizer_version,
	        "stride": args.stride,
	        "max_sequences": args.max_sequences,
	        "answer_only_loss": args.answer_only_loss,
	        "mixed_precision": tf_settings["mixed_precision"],
	        "xla": tf_settings["xla"],
	    }
	    model.fit(train_ds, epochs=args.epochs, validation_data=val_ds)

	    save_artifacts(model, tokenizer, args.model_dir, config)
	    print(f"Saved model, tokenizer, and config to {args.model_dir}")


	def chat_command(args):
	    """Run an interactive chat loop until the user exits or input ends."""
	    model, tokenizer, config = load_artifacts(args.model_dir)
	    # The saved context length wins over the fallback supplied in args.
	    maxlen = int(config.get("maxlen", args.maxlen))
	    exit_words = {"/exit", "/quit"}

	    print("Chat ready. Type /exit or /quit to stop.")
	    while True:
	        try:
	            prompt = input("you> ").strip()
	        except (EOFError, KeyboardInterrupt):
	            # Finish the prompt line before leaving.
	            print()
	            return
	        if prompt.lower() in exit_words:
	            return
	        response = generate_text(
	            model,
	            tokenizer,
	            prompt,
	            maxlen=maxlen,
	            max_new_tokens=args.max_new_tokens,
	            temperature=args.temperature,
	        )
	        print(f"bot> {response}")


	def generate_command(args):
	    """Load a saved model and print one generated response for ``args.prompt``."""
	    model, tokenizer, config = load_artifacts(args.model_dir)
	    # The saved context length wins over the fallback supplied in args.
	    maxlen = int(config.get("maxlen", args.maxlen))
	    response = generate_text(
	        model,
	        tokenizer,
	        args.prompt,
	        maxlen=maxlen,
	        max_new_tokens=args.max_new_tokens,
	        temperature=args.temperature,
	    )
	    print(response)


	def save_command(args):
	    """Copy saved artifacts from ``args.model_dir`` to ``args.output_dir`` via load + save."""
	    loaded_model, loaded_tokenizer, loaded_config = load_artifacts(args.model_dir)
	    save_artifacts(loaded_model, loaded_tokenizer, args.output_dir, loaded_config)
	    print(f"Saved copy to {args.output_dir}")


	def make_notebook_cell(cell_type, source):
	    """Build one notebook cell dict from a block of source text.

	    Args:
	        cell_type: ``"code"`` or ``"markdown"``.
	        source: Cell content; stored as a list of newline-terminated lines.

	    Returns:
	        A cell mapping. Code cells also carry the empty ``outputs`` list and
	        null ``execution_count`` that the nbformat 4 schema requires.
	    """
	    cell = {
	        "cell_type": cell_type,
	        "metadata": {},
	        "source": [line + "\n" for line in source.splitlines()],
	    }
	    if cell_type == "code":
	        # Required for code cells only; other cell types must not have them.
	        cell["execution_count"] = None
	        cell["outputs"] = []
	    return cell


	def build_colab_notebook(dataset_id=TG_DATASET_PRO_ID):
	    """Build a Colab notebook (as a dict) that trains on a Hugging Face dataset.

	    The notebook embeds this file's own source, writes it to ``app.py`` in the
	    Colab runtime, then loads ``dataset_id``, trains the model and saves the
	    artifacts. A final optional cell zips and downloads them.

	    Args:
	        dataset_id: Hugging Face dataset identifier inserted into the notebook.

	    Returns:
	        A JSON-serialisable notebook mapping.
	    """
	    # Local import: only this function needs it.
	    import textwrap

	    app_source = Path(__file__).read_text(encoding="utf-8")
	    # dedent() strips the indentation shared by every template line, so the
	    # generated cell starts at column 0 however this function is indented.
	    train_code = textwrap.dedent(f"""
	    DATASET_ID = {dataset_id!r}
	    SPLIT = "train"
	    MODEL_DIR = "/content/tg-dataset-pro-model"

	    # 0 means use the full dataset. Set a smaller number for a quick smoke test.
	    MAX_ROWS = 0

	    # These defaults are intentionally Colab-friendly. Increase MAX_SEQUENCES,
	    # EMBED_DIM, NUM_LAYERS, or EPOCHS if you have a stronger GPU/runtime.
	    EPOCHS = 3
	    BATCH_SIZE = 128
	    VALIDATION_SPLIT = 0.02
	    MAXLEN = 128
	    STRIDE = 64
	    MAX_VOCAB_SIZE = 32000
	    MAX_SEQUENCES = 120000
	    EMBED_DIM = 128
	    NUM_HEADS = 4
	    NUM_LAYERS = 2
	    FF_DIM = 256
	    DROPOUT = 0.15
	    ANSWER_ONLY_LOSS = True

	    from datasets import load_dataset
	    from app import (
	        ThreeChunkTokenizer,
	        build_model,
	        configure_tensorflow,
	        make_tf_datasets,
	        make_training_arrays,
	        record_to_text,
	        save_artifacts,
	    )

	    tf_settings = configure_tensorflow(use_mixed_precision=True, use_xla=True)
	    print("TensorFlow settings:", tf_settings)

	    raw_dataset = load_dataset(DATASET_ID, split=SPLIT)
	    texts = []
	    for index, record in enumerate(raw_dataset):
	        if MAX_ROWS and index >= MAX_ROWS:
	            break
	        text = record_to_text(record)
	        if text and text.strip():
	            texts.append(text)

	    if not texts:
	        raise ValueError("No text rows were found in the Hugging Face dataset.")

	    print(f"Loaded {{len(texts):,}} training rows from {{DATASET_ID}}.")

	    tokenizer = ThreeChunkTokenizer(max_vocab_size=MAX_VOCAB_SIZE).fit(texts)
	    x, y, sample_weights = make_training_arrays(
	        texts,
	        tokenizer,
	        MAXLEN,
	        stride=STRIDE,
	        max_sequences=MAX_SEQUENCES,
	        answer_only_loss=ANSWER_ONLY_LOSS,
	    )
	    train_ds, val_ds, train_count, val_count = make_tf_datasets(
	        x,
	        y,
	        sample_weights,
	        batch_size=BATCH_SIZE,
	        validation_split=VALIDATION_SPLIT,
	    )

	    model = build_model(
	        vocab_size=len(tokenizer.vocab),
	        maxlen=MAXLEN,
	        embed_dim=EMBED_DIM,
	        num_heads=NUM_HEADS,
	        ff_dim=FF_DIM,
	        dropout=DROPOUT,
	        num_layers=NUM_LAYERS,
	        jit_compile=True,
	    )

	    print(
	        f"Training on {{train_count:,}} sequences"
	        f"{{f' with {{val_count:,}} validation sequences' if val_count else ''}}."
	    )
	    print(f"Vocab: {{len(tokenizer.vocab):,}} | Maxlen: {{MAXLEN}} | Batch: {{BATCH_SIZE}}")

	    model.fit(train_ds, epochs=EPOCHS, validation_data=val_ds)

	    config = {{
	        "dataset_id": DATASET_ID,
	        "dataset_split": SPLIT,
	        "max_rows": MAX_ROWS,
	        "maxlen": MAXLEN,
	        "embed_dim": EMBED_DIM,
	        "num_heads": NUM_HEADS,
	        "num_layers": NUM_LAYERS,
	        "ff_dim": FF_DIM,
	        "dropout": DROPOUT,
	        "vocab_size": len(tokenizer.vocab),
	        "tokenizer_chunk_size": tokenizer.chunk_size,
	        "tokenizer_word_chunk_size": tokenizer.word_chunk_size,
	        "tokenizer_cyrillic_chunk_size": tokenizer.cyrillic_chunk_size,
	        "tokenizer_version": tokenizer.tokenizer_version,
	        "stride": STRIDE,
	        "max_sequences": MAX_SEQUENCES,
	        "answer_only_loss": ANSWER_ONLY_LOSS,
	        "mixed_precision": tf_settings["mixed_precision"],
	        "xla": tf_settings["xla"],
	    }}
	    save_artifacts(model, tokenizer, MODEL_DIR, config)
	    print(f"Saved model, tokenizer, and config to {{MODEL_DIR}}")
	    """)

	    # Not an f-string: the braces below belong to the notebook's own f-string.
	    zip_code = textwrap.dedent("""
	    import shutil
	    from google.colab import files

	    archive_path = shutil.make_archive(MODEL_DIR, "zip", MODEL_DIR)
	    print(f"Created archive: {archive_path}")
	    files.download(archive_path)
	    """)

	    return {
	        "nbformat": 4,
	        # Minor version 4: the cells built here carry no "id" field, which
	        # the 4.5 schema makes mandatory.
	        "nbformat_minor": 4,
	        "metadata": {
	            "accelerator": "GPU",
	            "kernelspec": {
	                "display_name": "Python 3",
	                "language": "python",
	                "name": "python3",
	            },
	            "language_info": {"name": "python"},
	            "colab": {"gpuType": "T4"},
	        },
	        "cells": [
	            make_notebook_cell(
	                "markdown",
	                f"# Train the tiny TG model on `{dataset_id}`\n\n"
	                "Run this notebook in Google Colab with a GPU runtime. "
	                "It downloads the Hugging Face dataset inside Colab, trains the model, "
	                "and saves `model.keras`, `tokenizer.json`, and `config.json`.",
	            ),
	            make_notebook_cell("code", "%pip -q install datasets huggingface_hub"),
	            make_notebook_cell(
	                "code",
	                "from pathlib import Path\n"
	                # repr() turns the whole source into a single-line string literal.
	                f"Path('app.py').write_text({app_source!r}, encoding='utf-8')\n"
	                "print('Wrote app.py into the Colab runtime.')",
	            ),
	            make_notebook_cell("code", train_code.strip()),
	            make_notebook_cell(
	                "markdown",
	                "Run the next cell after training if you want to download the saved artifacts as a zip.",
	            ),
	            make_notebook_cell("code", zip_code.strip()),
	        ],
	    }


	def write_colab_notebook(output_path=DEFAULT_COLAB_NOTEBOOK, dataset_id=TG_DATASET_PRO_ID):
	    """Generate the Colab notebook, write it to ``output_path`` and return that path."""
	    destination = Path(output_path)
	    serialized = json.dumps(
	        build_colab_notebook(dataset_id=dataset_id),
	        ensure_ascii=False,
	        indent=2,
	    )
	    destination.write_text(serialized, encoding="utf-8")
	    return destination


	def colab_command(args):
	    """Write the Colab notebook described by ``args`` and print usage hints."""
	    created = write_colab_notebook(args.output, dataset_id=args.dataset_id)
	    for message in (
	        f"Created Colab notebook: {created}",
	        "Upload it to Google Colab, choose a GPU runtime, then run all cells.",
	    ):
	        print(message)


	def ask_text(prompt, default=None, required=False):
	    """Prompt for a line of text.

	    An empty reply returns ``str(default)`` when a default exists, an empty
	    string when the value is optional, and otherwise asks again.
	    """
	    # The default, if any, is shown in brackets after the prompt.
	    label = prompt if default is None else f"{prompt} [{default}]"
	    while True:
	        answer = input(f"{label}: ").strip()
	        if answer:
	            return answer
	        if default is not None:
	            return str(default)
	        if not required:
	            return ""
	        print("Please enter a value.")


	def ask_int(prompt, default):
	    """Prompt until the reply (or the default) parses as an integer."""
	    while True:
	        reply = ask_text(prompt, default)
	        try:
	            number = int(reply)
	        except ValueError:
	            print("Please enter a whole number.")
	        else:
	            return number


	def ask_float(prompt, default):
	    """Prompt until the reply (or the default) parses as a float."""
	    while True:
	        reply = ask_text(prompt, default)
	        try:
	            number = float(reply)
	        except ValueError:
	            print("Please enter a number.")
	        else:
	            return number


	def ask_bool(prompt, default=True):
	    """Prompt for a yes/no answer and return it as a bool."""
	    truthy = {"y", "yes", "true", "1"}
	    falsy = {"n", "no", "false", "0"}
	    default_text = "y" if default else "n"
	    while True:
	        reply = ask_text(prompt, default_text).lower()
	        if reply in truthy:
	            return True
	        if reply in falsy:
	            return False
	        print("Please enter y or n.")


	class MenuArgs:
	    """Plain attribute container filled in by the interactive prompts."""


	def build_train_args_from_input():
	    """Interactively collect every setting needed by ``train_command``.

	    The dataset is scanned first so the prompts can be pre-filled with
	    recommended values; if the scan fails, generic defaults are used.
	    """
	    args = MenuArgs()
	    data = ask_text("Dataset path(s), separated by commas", required=True)
	    args.data = [part.strip() for part in data.split(",") if part.strip()]
	    try:
	        defaults = estimate_dataset_defaults(args.data)
	        print("\nDataset estimate:")
	        print(f"Records: {defaults['records']:,}")
	        print(f"Tokens: {defaults['tokens']:,}")
	        print(f"Unique tokenizer tokens: {defaults['unique_tokens']:,}")
	        print("Recommended fast settings for this dataset are pre-filled below.")
	    except Exception as exc:
	        # The estimate is only a convenience; any failure falls back to
	        # fixed defaults instead of aborting the prompts.
	        print(f"Could not estimate dataset defaults, using generic fast defaults: {exc}")
	        defaults = {
	            "epochs": 3,
	            "batch_size": 128,
	            "validation_split": 0.02,
	            "maxlen": 96,
	            "stride": 48,
	            "max_vocab_size": 32000,
	            "max_sequences": 60000,
	            "embed_dim": 96,
	            "num_heads": 4,
	            "num_layers": 2,
	            "ff_dim": 192,
	            "dropout": 0.15,
	            "answer_only_loss": True,
	        }

	    args.model_dir = ask_text("Model save directory", "tg-medium")

	    # (attribute / defaults key, prompt helper, prompt text), asked in this order.
	    questions = (
	        ("epochs", ask_int, "Epochs"),
	        ("batch_size", ask_int, "Batch size"),
	        ("validation_split", ask_float, "Validation split"),
	        ("maxlen", ask_int, "Context length / max sequence length"),
	        ("stride", ask_int, "Training stride, higher is faster"),
	        ("max_vocab_size", ask_int, "Max vocab size"),
	        ("max_sequences", ask_int, "Max training sequences, 0 = all"),
	        ("embed_dim", ask_int, "Embedding size"),
	        ("num_heads", ask_int, "Attention heads"),
	        ("num_layers", ask_int, "Transformer blocks"),
	        ("ff_dim", ask_int, "Feed-forward size"),
	        ("dropout", ask_float, "Dropout"),
	        ("answer_only_loss", ask_bool, "Train loss only on answer tokens"),
	    )
	    for name, ask, prompt in questions:
	        setattr(args, name, ask(prompt, defaults[name]))

	    args.mixed_precision = ask_bool("Use mixed precision on GPU", True)
	    args.xla = ask_bool("Use XLA/JIT compile", True)
	    return args


	def build_chat_args_from_input():
	    """Interactively collect the settings shared by chat and one-shot generation."""
	    chat_args = MenuArgs()
	    chat_args.model_dir = ask_text("Model directory to load", "tg-medium")
	    chat_args.maxlen = ask_int("Fallback context length if config is missing", 100)
	    chat_args.max_new_tokens = ask_int("Max new tokens per reply", 80)
	    chat_args.temperature = ask_float("Temperature, 0 for greedy", 0.8)
	    return chat_args


	def build_generate_args_from_input():
	    """Collect the chat settings plus the single prompt to answer."""
	    generate_args = build_chat_args_from_input()
	    generate_args.prompt = ask_text("Prompt", required=True)
	    return generate_args


	def build_save_args_from_input():
	    """Interactively collect the source and destination directories for a copy."""
	    save_args = MenuArgs()
	    save_args.model_dir = ask_text("Model directory to load", "tg-medium")
	    save_args.output_dir = ask_text("Output directory for saved copy", required=True)
	    return save_args


	def build_colab_args_from_input():
	    """Interactively collect the dataset id and output path for the Colab notebook."""
	    colab_args = MenuArgs()
	    colab_args.dataset_id = ask_text("Hugging Face dataset ID", TG_DATASET_PRO_ID)
	    colab_args.output = ask_text("Output notebook path", DEFAULT_COLAB_NOTEBOOK)
	    return colab_args


	def print_menu():
	    """Print the main menu, one option per line."""
	    menu_lines = (
	        "\nSmall LM CLI",
	        "1. Train model",
	        "2. Chat with model",
	        "3. Generate one response",
	        "4. Save/copy loaded model artifacts",
	        "5. Create Colab auto-train notebook",
	        "6. Exit",
	    )
	    for line in menu_lines:
	        print(line)


	def main():
	    """Run the interactive menu loop until the user chooses to exit.

	    Errors raised by a command are printed and the menu is shown again.
	    End of input or Ctrl+C at the menu prompt ends the program quietly.
	    """
	    while True:
	        print_menu()
	        try:
	            choice = input("Choose an option: ").strip().lower()
	        except (EOFError, KeyboardInterrupt):
	            # Without this, closed stdin or Ctrl+C at the menu would end in
	            # a traceback; the chat loop handles the same case this way.
	            print()
	            break
	        try:
	            if choice in {"1", "train"}:
	                train_command(build_train_args_from_input())
	            elif choice in {"2", "chat"}:
	                chat_command(build_chat_args_from_input())
	            elif choice in {"3", "generate"}:
	                generate_command(build_generate_args_from_input())
	            elif choice in {"4", "save", "copy"}:
	                save_command(build_save_args_from_input())
	            elif choice in {"5", "colab"}:
	                colab_command(build_colab_args_from_input())
	            elif choice in {"6", "exit", "quit", "q"}:
	                break
	            else:
	                print("Please choose 1, 2, 3, 4, 5, or 6.")
	        except Exception as exc:
	            # Top-level boundary: report the failure and return to the menu.
	            print(f"Error: {exc}")


	# Start the interactive menu only when this file is run as a script.
	if __name__ == "__main__":
	    main()