| """ |
| KnowForge-0.6B inference — standalone, no extra deps beyond transformers. |
| |
| CLI: python inference.py "ZELPH RULES: ... Question: ..." |
| API: from inference import ask; result = ask("ZELPH RULES: ...") |
| """ |
| import re |
| import sys |
| from pathlib import Path |
| from typing import Optional |
|
|
# Directory containing this file; the tokenizer and model are loaded from here.
_MODEL_DIR = Path(__file__).parent
# Cached text-generation pipeline; stays None until _load_pipeline() builds it.
_pipeline = None


def _load_pipeline():
    """Return the shared text-generation pipeline, building it on first use.

    The tokenizer and the causal LM are both read from ``_MODEL_DIR`` with
    ``trust_remote_code=True``. The model is requested in float16 with
    ``device_map="auto"`` and put into eval mode before being wrapped in a
    ``"text-generation"`` pipeline. The result is stored in the module-level
    ``_pipeline`` so that subsequent calls return the same object.
    """
    global _pipeline
    if _pipeline is None:
        # Imported inside the function, so importing this module does not by
        # itself import torch/transformers.
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

        model_path = str(_MODEL_DIR)
        tok = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        lm = AutoModelForCausalLM.from_pretrained(
            model_path,
            dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
        lm.eval()
        _pipeline = pipeline("text-generation", model=lm, tokenizer=tok)
    return _pipeline
|
|
|
|
def _split_think(raw: str) -> tuple[str, str]:
    """Split <think>...</think> from final answer. Returns (answer, reasoning).

    Cases handled:

    * A closing ``</think>`` with no ``<think>`` before it (the opening tag
      was not part of the generated text): everything before the closing tag
      is reasoning, everything after it is processed as usual.
    * One or more complete ``<think>...</think>`` blocks: each block is
      removed from the answer and their contents are joined, in order, with a
      blank line between them.
    * Truncated output where ``</think>`` was never generated: everything
      after ``<think>`` is reasoning and the answer is the empty string.
    * No tags at all: the whole (stripped) text is the answer.

    Args:
        raw: Text generated by the model.

    Returns:
        ``(answer, reasoning)``, both stripped of surrounding whitespace.
    """
    open_tag, close_tag = "<think>", "</think>"
    block_pattern = r"<think>(.*?)</think>"

    reasoning_parts: list[str] = []
    text = raw
    found_block = False

    # Orphan closer: the first </think> has no <think> in front of it.
    # Without this, the reasoning and the literal tag would end up in the answer.
    close_at = text.find(close_tag)
    if close_at != -1 and open_tag not in text[:close_at]:
        reasoning_parts.append(text[:close_at])
        text = text[close_at + len(close_tag):]
        found_block = True

    # Complete blocks: keep the content of every block, not only the first,
    # because all of them are removed from the answer.
    closed_blocks = re.findall(block_pattern, text, re.DOTALL)
    if closed_blocks:
        reasoning_parts.extend(closed_blocks)
        text = re.sub(block_pattern, "", text, flags=re.DOTALL)
        found_block = True

    if found_block:
        cleaned = (part.strip() for part in reasoning_parts)
        reasoning = "\n\n".join(part for part in cleaned if part)
        return text.strip(), reasoning

    # Opened but never closed: generation stopped inside the reasoning.
    open_match = re.search(r"<think>(.*)", text, re.DOTALL)
    if open_match:
        return "", open_match.group(1).strip()

    return raw.strip(), ""
|
|
|
|
def ask(
    prompt: str,
    max_new_tokens: int = 512,
    temperature: float = 0.0,
    do_sample: bool = False,
) -> dict:
    """
    Run a KnowForge prompt through the model.

    The prompt is sent as the user turn of a two-message chat whose system
    message tells the model to reason only from the supplied rules and to put
    its reasoning inside <think>...</think> tags.

    Args:
        prompt: Full user-turn text, e.g. "ZELPH RULES: ... Question: ..."
        max_new_tokens: Max tokens to generate.
        temperature: Sampling temperature. Only used when do_sample=True and
            temperature > 0; otherwise decoding is greedy.
        do_sample: Enable sampling; set True + temperature>0 for stochastic output.

    Returns:
        {
            "answer":    str — text outside the think block (full output if there
                               is no think block; empty if generation stopped
                               inside an unclosed <think>),
            "reasoning": str — text inside <think>...</think>, empty string if absent,
        }
    """
    pipe = _load_pipeline()
    messages = [
        {
            "role": "system",
            "content": (
                "You are given rules for a fictional system that does NOT exist in the "
                "real world. Reason STRICTLY from the rules provided. Do NOT use any "
                "outside knowledge. Show your reasoning inside <think>...</think> tags "
                "before giving your final answer."
            ),
        },
        {"role": "user", "content": prompt},
    ]

    # Generation rejects a non-positive temperature when sampling is on, and
    # the default temperature here is 0.0, so ask(p, do_sample=True) alone
    # would raise. Treat temperature <= 0 as a request for greedy decoding.
    use_sampling = do_sample and temperature > 0

    gen_kwargs: dict = {
        "max_new_tokens": max_new_tokens,
        "do_sample": use_sampling,
        # Reuse EOS as the padding id.
        "pad_token_id": pipe.tokenizer.eos_token_id,
    }
    if use_sampling:
        gen_kwargs["temperature"] = temperature

    outputs = pipe(messages, **gen_kwargs)

    # For chat input, "generated_text" is the message list; the last entry is
    # read as the newly generated reply.
    raw = outputs[0]["generated_text"][-1]["content"]
    answer, reasoning = _split_think(raw)
    return {"answer": answer, "reasoning": reasoning}
|
|
|
|
def _main():
    """Command-line entry point.

    Joins all command-line arguments into a single prompt, runs it through
    ``ask`` and prints the answer, followed by the reasoning when there is any.

    Exits with status 1 after printing a usage line to stderr when no prompt
    is given or the prompt is only whitespace.
    """
    prompt = " ".join(sys.argv[1:])
    if not prompt.strip():
        # Usage goes to stderr so stdout carries nothing but model output,
        # and an empty prompt never triggers a model load.
        print(
            "Usage: python inference.py \"ZELPH RULES: ... Question: ...\"",
            file=sys.stderr,
        )
        sys.exit(1)
    result = ask(prompt)
    print("Answer:", result["answer"])
    if result["reasoning"]:
        print("\nReasoning:")
        print(result["reasoning"])


# Run the CLI only when executed as a script, not when imported for ask().
if __name__ == "__main__":
    _main()
|
|