qox
/

knowforge-0.6B

Text Generation

compositional-reasoning

Model card Files Files and versions

knowforge-0.6B / inference.py

qox's picture

Initial upload: KnowForge-0.6B

616bac5 verified 5 days ago

history blame contribute delete

3.63 kB

	"""
	KnowForge-0.6B inference — standalone; needs only transformers and torch (plus accelerate for device_map="auto").

	CLI: python inference.py "ZELPH RULES: ... Question: ..."
	API: from inference import ask; result = ask("ZELPH RULES: ...")
	"""
	import re
	import sys
	from pathlib import Path
	from typing import Optional

	# Directory containing this file; _load_pipeline() reads the tokenizer and
	# model from this path.
	_MODEL_DIR = Path(__file__).parent
	# Cached text-generation pipeline; stays None until _load_pipeline() builds it.
	_pipeline = None # lazy singleton


	def _load_pipeline():
	    """Return the shared text-generation pipeline, building it on first call.

	    The tokenizer and model are read from ``_MODEL_DIR``. The model is loaded
	    as float16 with ``device_map="auto"`` and put in eval mode before being
	    wrapped in a ``text-generation`` pipeline. The result is stored in the
	    module-level ``_pipeline`` so later calls return the same object.

	    torch and transformers are imported inside the function, so they are only
	    needed once a pipeline is actually requested.
	    """
	    global _pipeline
	    if _pipeline is None:
	        import torch
	        from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

	        model_path = str(_MODEL_DIR)
	        tok = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
	        lm = AutoModelForCausalLM.from_pretrained(
	            model_path,
	            dtype=torch.float16,
	            device_map="auto",
	            trust_remote_code=True,
	        )
	        lm.eval()
	        _pipeline = pipeline("text-generation", model=lm, tokenizer=tok)
	    return _pipeline


	def _split_think(raw: str) -> tuple[str, str]:
	    """Split <think>...</think> from final answer. Returns (answer, reasoning).

	    Cases handled, in order:

	    1. At least one closed ``<think>...</think>`` block: reasoning is the
	       content of the first block; the answer is ``raw`` with every closed
	       block removed.
	    2. An opening ``<think>`` that is never closed (generation was cut off
	       during reasoning): the answer is empty and everything after the tag
	       is reasoning.
	    3. A closing ``</think>`` with no opening tag anywhere in ``raw``: text
	       before the tag is reasoning, text after it is the answer.
	    4. No think tags at all: the whole text is the answer.

	    Both returned strings are whitespace-stripped.
	    """
	    closed_match = re.search(r"<think>(.*?)</think>", raw, re.DOTALL)
	    if closed_match:
	        reasoning = closed_match.group(1).strip()
	        answer = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
	        return answer, reasoning

	    # Unclosed <think> block — generation was cut off during reasoning
	    open_match = re.search(r"<think>(.*)", raw, re.DOTALL)
	    if open_match:
	        return "", open_match.group(1).strip()

	    # Lone </think>: without this branch the tag and the reasoning before it
	    # would be returned verbatim as part of the answer.
	    before, closing_tag, after = raw.partition("</think>")
	    if closing_tag:
	        return after.strip(), before.strip()

	    return raw.strip(), ""


	def ask(
	    prompt: str,
	    max_new_tokens: int = 512,
	    temperature: float = 0.0,
	    do_sample: bool = False,
	) -> dict:
	    """
	    Run a KnowForge prompt through the model.

	    Args:
	        prompt: Full user-turn text, e.g. "ZELPH RULES: ... Question: ..."
	        max_new_tokens: Max tokens to generate.
	        temperature: Sampling temperature (ignored when do_sample=False).
	            With do_sample=True, a temperature of 0 falls back to greedy
	            decoding instead of being passed to the sampler.
	        do_sample: Enable sampling; set True + temperature>0 for stochastic output.

	    Returns:
	        {
	            "answer": str — text after </think> (or full output if no think block),
	            "reasoning": str — text inside <think>...</think>, empty string if absent,
	        }

	    Raises:
	        ValueError: If do_sample=True and temperature is negative.
	    """
	    # Validate before loading the model so a bad argument fails fast.
	    if do_sample and temperature < 0:
	        raise ValueError(
	            f"temperature must be >= 0 when do_sample=True, got {temperature!r}"
	        )
	    # A zero temperature is not a valid sampling temperature; treat it as the
	    # greedy limit so ask(prompt, do_sample=True) works with the default
	    # temperature=0.0 instead of erroring inside generation.
	    use_sampling = do_sample and temperature > 0

	    pipe = _load_pipeline()
	    messages = [
	        {
	            "role": "system",
	            "content": (
	                "You are given rules for a fictional system that does NOT exist in the "
	                "real world. Reason STRICTLY from the rules provided. Do NOT use any "
	                "outside knowledge. Show your reasoning inside <think>...</think> tags "
	                "before giving your final answer."
	            ),
	        },
	        {"role": "user", "content": prompt},
	    ]

	    gen_kwargs: dict = {
	        "max_new_tokens": max_new_tokens,
	        "do_sample": use_sampling,
	        # Reuse EOS as the padding token id for generation.
	        "pad_token_id": pipe.tokenizer.eos_token_id,
	    }
	    if use_sampling:
	        gen_kwargs["temperature"] = temperature

	    outputs = pipe(messages, **gen_kwargs)

	    # The last message of the first result's "generated_text" is taken as the
	    # model's reply.
	    raw = outputs[0]["generated_text"][-1]["content"]
	    answer, reasoning = _split_think(raw)
	    return {"answer": answer, "reasoning": reasoning}


	def _main():
	    """Command-line entry point.

	    Joins all command-line arguments with single spaces into one prompt,
	    passes it to ``ask``, and prints the answer followed by the reasoning
	    when there is any. With no arguments, prints a usage line and exits with
	    status 1.
	    """
	    cli_args = sys.argv[1:]
	    if not cli_args:
	        print("Usage: python inference.py \"ZELPH RULES: ... Question: ...\"")
	        sys.exit(1)

	    reply = ask(" ".join(cli_args))
	    print("Answer:", reply["answer"])

	    reasoning_text = reply["reasoning"]
	    if reasoning_text:
	        print("\nReasoning:")
	        print(reasoning_text)


	# Run the CLI only when this file is executed as a script; importing it
	# (e.g. `from inference import ask`) must not trigger a model run.
	if __name__ == "__main__":
	    _main()