Text Generation
Transformers
Diffusers
Safetensors
English
gpt_oss
phillnet-2
gpt-oss
multimodal
image-generation
video-generation
speech
audio
custom-code
conversational
custom_code
Instructions to use ayjays132/Phillnet-2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ayjays132/Phillnet-2 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="ayjays132/Phillnet-2", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```
```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("ayjays132/Phillnet-2", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("ayjays132/Phillnet-2", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use ayjays132/Phillnet-2 with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "ayjays132/Phillnet-2"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "ayjays132/Phillnet-2",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker
docker model run hf.co/ayjays132/Phillnet-2
- SGLang
How to use ayjays132/Phillnet-2 with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "ayjays132/Phillnet-2" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "ayjays132/Phillnet-2",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "ayjays132/Phillnet-2" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "ayjays132/Phillnet-2",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
- Docker Model Runner
How to use ayjays132/Phillnet-2 with Docker Model Runner:
docker model run hf.co/ayjays132/Phillnet-2
| import gc | |
| import json | |
| import os | |
| import re | |
| import time | |
| from pathlib import Path | |
| from typing import Any | |
| import torch | |
| from peft import PeftModel | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, logging as transformers_logging | |
| try: | |
| from .cache import GuidanceCache | |
| from .context import collect_project_context | |
| from .prompts import GUIDANCE_SYSTEM, build_guidance_prompt | |
| from .settings import GuidanceSettings | |
| from .tools import ToolSignals, run_tools | |
| except ImportError: | |
| from guidance_sidecar.cache import GuidanceCache | |
| from guidance_sidecar.context import collect_project_context | |
| from guidance_sidecar.prompts import GUIDANCE_SYSTEM, build_guidance_prompt | |
| from guidance_sidecar.settings import GuidanceSettings | |
| from guidance_sidecar.tools import ToolSignals, run_tools | |
def configure_runtime(settings: GuidanceSettings) -> None:
    """Redirect caches and temp files, and quiet the Hugging Face tooling.

    The temp directory is created if missing. Every Hugging Face cache
    variable is pointed at its own sub-directory (named after the lower-cased
    variable) of the resolved model cache directory, and the three common
    temp-dir variables are pointed at the resolved temp directory.

    Args:
        settings: Source of ``model_cache_dir`` and ``tmp_dir``; both are
            passed through ``settings.resolve`` before use.
    """
    model_cache = settings.resolve(settings.model_cache_dir)
    scratch = settings.resolve(settings.tmp_dir)
    scratch.mkdir(parents=True, exist_ok=True)

    hf_cache_vars = ("HF_HOME", "HF_HUB_CACHE", "TRANSFORMERS_CACHE", "HF_DATASETS_CACHE", "HF_XET_CACHE")
    environment = {name: str(model_cache / name.lower()) for name in hf_cache_vars}
    environment.update(dict.fromkeys(("TMP", "TEMP", "TMPDIR"), str(scratch)))
    environment.update(
        {
            "TOKENIZERS_PARALLELISM": "false",
            "TRANSFORMERS_VERBOSITY": "error",
            "HF_HUB_DISABLE_PROGRESS_BARS": "1",
        }
    )
    os.environ.update(environment)

    transformers_logging.set_verbosity_error()
    # Only call the progress-bar switch when this transformers build exposes it.
    if hasattr(transformers_logging, "disable_progress_bar"):
        transformers_logging.disable_progress_bar()
def preferred_torch_dtype() -> torch.dtype:
    """Choose the dtype used when loading model weights.

    Returns:
        ``torch.float32`` when CUDA is unavailable; otherwise
        ``torch.bfloat16`` if the CUDA device reports bf16 support, else
        ``torch.float16``.
    """
    if not torch.cuda.is_available():
        return torch.float32
    if torch.cuda.is_bf16_supported():
        return torch.bfloat16
    return torch.float16
def extract_json(text: str) -> dict[str, Any]:
    """Extract a JSON object from free-form model output.

    The first Markdown code fence (optionally tagged ``json``) is preferred
    when present; the text is then narrowed to the span between the first
    ``{`` and the last ``}``. If that does not parse, two light repairs are
    attempted before parsing again: inserting a comma between a value that
    ends a line and a key that starts the next one, and dropping trailing
    commas before ``}`` or ``]``.

    Args:
        text: Raw text that is expected to contain one JSON object.

    Returns:
        The decoded JSON object.

    Raises:
        json.JSONDecodeError: The text cannot be parsed even after repair.
        ValueError: The text parses, but to something other than an object
            (for example a list or a bare number).
    """
    candidate = text.strip()
    fenced = re.search(r"```(?:json)?\s*(.*?)```", candidate, re.DOTALL | re.IGNORECASE)
    if fenced:
        candidate = fenced.group(1).strip()

    start = candidate.find("{")
    end = candidate.rfind("}")
    if start >= 0 and end > start:
        candidate = candidate[start : end + 1]

    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        # Missing comma between `<value>\n"key":` pairs; values may end in a
        # bracket, brace, quote, digit or one of the JSON literals.
        repaired = re.sub(
            r'(\]|\}|"|\d|true|false|null)\s*\n\s*("[-_a-zA-Z0-9]+"\s*:)',
            r"\1,\n \2",
            candidate,
        )
        # Trailing comma before a closing brace/bracket.
        repaired = re.sub(r",\s*([}\]])", r"\1", repaired)
        parsed = json.loads(repaired)

    # Callers index the result by key, so anything but an object is a failure.
    if not isinstance(parsed, dict):
        raise ValueError(f"Expected a JSON object, got {type(parsed).__name__}")
    return parsed
def normalize_guidance(result: dict[str, Any]) -> dict[str, Any]:
    """Coerce a parsed guidance payload into the expected schema, in place.

    * ``route`` becomes the first allowed route name found as a substring of
      the lower-cased value, or ``"plan"`` when none matches.
    * ``confidence`` becomes a float clamped to ``[0.0, 1.0]``; values that
      cannot be converted, and NaN, become ``0.0``.
    * ``useful_context``, ``plan``, ``patch_rules``, ``risks`` and ``tests``
      become lists of non-blank strings (a lone string is wrapped in a list,
      any other non-list value is replaced by an empty list, ``None`` items
      are dropped).
    * ``next_action`` becomes a stripped string, with a default sentence when
      it is missing, ``None`` or blank.

    Args:
        result: Parsed guidance dictionary; it is modified in place.

    Returns:
        The same dictionary object, normalized.
    """
    import math

    allowed_routes = ("plan", "patch", "critique", "test", "fix")
    route = str(result.get("route", "plan")).lower()
    result["route"] = next((candidate for candidate in allowed_routes if candidate in route), "plan")

    try:
        confidence = float(result.get("confidence", 0.0))
    except (TypeError, ValueError, OverflowError):
        # OverflowError: JSON integers too large to fit in a float.
        confidence = 0.0
    if math.isnan(confidence):
        # min(1.0, nan) returns 1.0, which would report full confidence.
        confidence = 0.0
    result["confidence"] = max(0.0, min(1.0, confidence))

    for key in ("useful_context", "plan", "patch_rules", "risks", "tests"):
        value = result.get(key, [])
        if isinstance(value, str):
            value = [value]
        elif not isinstance(value, list):
            value = []
        # JSON null items would otherwise be rendered as the text "None".
        result[key] = [str(item) for item in value if item is not None and str(item).strip()]

    next_action = result.get("next_action")
    next_action_text = "" if next_action is None else str(next_action).strip()
    result["next_action"] = next_action_text or "Continue with the planned coding step."
    return result
def enrich_with_tools(result: dict[str, Any], signals: ToolSignals) -> dict[str, Any]:
    """Fold tool-derived signals into a guidance result, in place.

    The tool route hint replaces the route only when the result's route is
    the default ``"plan"`` and the hint is something else. Tool risks, tests
    and patch rules not already present are appended to the matching lists,
    each of which is then capped at eight entries. A ``tool_signals`` summary
    is attached.

    Args:
        result: Guidance dictionary; must already contain ``route``.
        signals: Tool output providing ``route_hint``, ``risk_flags``,
            ``suggested_tests``, ``patch_rules``, ``context_stats`` and
            ``calculations``.

    Returns:
        The same ``result`` dictionary.
    """
    if result["route"] == "plan" and signals.route_hint != "plan":
        result["route"] = signals.route_hint

    tool_lists = {
        "risks": signals.risk_flags,
        "tests": signals.suggested_tests,
        "patch_rules": signals.patch_rules,
    }
    for field, additions in tool_lists.items():
        current = result.get(field, [])
        unseen = [entry for entry in additions if entry not in current]
        result[field] = (current + unseen)[:8]

    result["tool_signals"] = dict(
        route_hint=signals.route_hint,
        context_stats=signals.context_stats,
        calculations=signals.calculations,
    )
    return result
class GuidanceEngine:
    """Run a causal language model as an internal coding-guidance sidecar.

    The model and tokenizer are loaded lazily on first use. After each call
    they may be released again, depending on ``settings.keep_loaded_seconds``:
    a negative value never unloads, ``0`` unloads after every call, and a
    positive value unloads once more than that many seconds have passed since
    the model was loaded (checked at the end of a call).
    """

    def __init__(
        self,
        settings: GuidanceSettings | None = None,
        model_name: str | None = None,
        adapter_dir: str | None = None,
        keep_loaded_seconds: int | None = None,
    ):
        """Build the effective settings, configure the runtime and open the cache.

        Args:
            settings: Base settings; ``GuidanceSettings()`` is used when omitted.
            model_name: Replaces ``settings.model_name`` when truthy.
            adapter_dir: Replaces ``settings.adapter_dir`` when not ``None``;
                an empty string therefore disables the adapter.
            keep_loaded_seconds: Replaces ``settings.keep_loaded_seconds``
                when not ``None``.
        """
        base = settings or GuidanceSettings()
        self.settings = GuidanceSettings(
            model_name=model_name or base.model_name,
            adapter_dir=adapter_dir if adapter_dir is not None else base.adapter_dir,
            cache_dir=base.cache_dir,
            model_cache_dir=base.model_cache_dir,
            tmp_dir=base.tmp_dir,
            max_context_chars=base.max_context_chars,
            max_input_tokens=base.max_input_tokens,
            max_new_tokens=base.max_new_tokens,
            keep_loaded_seconds=keep_loaded_seconds if keep_loaded_seconds is not None else base.keep_loaded_seconds,
            quantization=base.quantization,
            require_cuda=base.require_cuda,
            require_bf16=base.require_bf16,
        )
        configure_runtime(self.settings)
        self.cache = GuidanceCache(self.settings.resolve(self.settings.cache_dir))
        self.model = None
        self.tokenizer = None
        self.loaded_at = 0.0

    @classmethod
    def from_bundle(cls, bundle_dir: str | Path) -> "GuidanceEngine":
        """Create an engine from the settings stored in a bundle directory."""
        return cls(GuidanceSettings.from_bundle(bundle_dir))

    def _quantization_config(self) -> BitsAndBytesConfig | None:
        """Translate ``settings.quantization`` into a bitsandbytes config.

        Returns:
            ``None`` for ``"none"``, or an NF4 double-quantized 4-bit config
            for ``"4bit"``.

        Raises:
            ValueError: For any other quantization value.
        """
        if self.settings.quantization == "none":
            return None
        if self.settings.quantization != "4bit":
            raise ValueError(f"Unsupported quantization: {self.settings.quantization}")
        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )

    def _load(self) -> None:
        """Load the tokenizer and model (plus adapter, if configured) once.

        Does nothing when a model is already loaded.

        Raises:
            RuntimeError: CUDA is required but unavailable, or bf16 is
                required on a CUDA device that does not support it.
        """
        if self.model is not None:
            return
        if self.settings.require_cuda and not torch.cuda.is_available():
            raise RuntimeError("CUDA is required. Set GUIDANCE_REQUIRE_CUDA=0 for CPU-only non-fast mode.")
        if self.settings.require_bf16 and torch.cuda.is_available() and not torch.cuda.is_bf16_supported():
            raise RuntimeError("bf16 was requested, but this CUDA device does not support bf16.")

        # HF_HOME is set by configure_runtime(), which __init__ always calls.
        cache_dir = os.environ["HF_HOME"]
        dtype = preferred_torch_dtype()
        # NOTE: trust_remote_code executes Python shipped with the model repo;
        # only point model_name at sources you trust.
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.settings.model_name,
            trust_remote_code=True,
            cache_dir=cache_dir,
        )
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = AutoModelForCausalLM.from_pretrained(
            self.settings.model_name,
            quantization_config=self._quantization_config(),
            device_map="auto",
            torch_dtype=dtype,
            trust_remote_code=True,
            cache_dir=cache_dir,
            low_cpu_mem_usage=True,
        )
        if self.settings.adapter_dir:
            self.model = PeftModel.from_pretrained(self.model, self.settings.adapter_dir)
        self.model.eval()
        self.loaded_at = time.time()

    def unload(self) -> None:
        """Drop the model and tokenizer and release cached CUDA memory."""
        self.model = None
        self.tokenizer = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def _maybe_unload(self) -> None:
        """Unload the model according to ``settings.keep_loaded_seconds``."""
        keep = self.settings.keep_loaded_seconds
        if keep < 0:
            return
        if keep == 0 or time.time() - self.loaded_at > keep:
            self.unload()

    def _generate(self, messages: list[dict[str, str]], max_new_tokens: int) -> str:
        """Render chat messages, decode greedily and return only the new text.

        The model and tokenizer must already be loaded.

        Args:
            messages: Chat messages passed to the tokenizer's chat template.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The decoded continuation, without the prompt and special tokens.
        """
        prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=self.settings.max_input_tokens,
        ).to(self.model.device)
        with torch.no_grad():
            output = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                pad_token_id=self.tokenizer.eos_token_id,
                # Sampling knobs are cleared explicitly for greedy decoding;
                # presumably this avoids warnings from a sampling-oriented
                # generation config - TODO confirm.
                temperature=None,
                top_p=None,
                top_k=None,
            )
        prompt_length = inputs["input_ids"].shape[1]
        return self.tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    def advise(self, task: str, context: str = "", use_cache: bool = True) -> dict[str, Any]:
        """Produce normalized guidance for a task, using the cache when allowed.

        Args:
            task: Description of the coding task.
            context: Supporting context included in the prompt.
            use_cache: When true, return a previously stored result for the
                same model, adapter, task, context, token budget and tool
                signals if one exists.

        Returns:
            The guidance dictionary, enriched with tool signals and carrying
            ``cache_hit`` and (for fresh results) ``model``. If the model
            output is not a JSON object, a low-confidence fallback containing
            ``raw_output`` is returned instead.
        """
        signals = run_tools(task, context)
        payload = {
            "model": self.settings.model_name,
            "adapter": self.settings.adapter_dir,
            "task": task,
            "context": context,
            "max_new_tokens": self.settings.max_new_tokens,
            "tool_signals": signals.__dict__,
        }
        key = self.cache.key_for(payload)
        if use_cache:
            cached = self.cache.get(key)
            if cached:
                # Return a copy so the flag never leaks into the stored entry.
                return {**cached, "cache_hit": True}

        self._load()
        messages = [
            {"role": "system", "content": GUIDANCE_SYSTEM},
            {"role": "user", "content": build_guidance_prompt(task, context, signals.as_prompt_text())},
        ]
        raw = self._generate(messages, self.settings.max_new_tokens)
        try:
            result = extract_json(raw)
            # normalize_guidance() needs a mapping; treat anything else as a
            # parse failure instead of crashing further down.
            if not isinstance(result, dict):
                raise ValueError("Model output is not a JSON object.")
        except Exception:
            # Deliberate best-effort boundary: any parse failure degrades to a
            # fallback that carries the raw output for inspection.
            result = {
                "route": "plan",
                "confidence": 0.2,
                "useful_context": [],
                "plan": [],
                "patch_rules": ["Model returned invalid JSON; inspect raw_output."],
                "risks": ["Guidance parser failed."],
                "tests": [],
                "next_action": "Retry with less context.",
                "raw_output": raw,
            }
        result = enrich_with_tools(normalize_guidance(result), signals)
        result["cache_hit"] = False
        result["model"] = self.settings.model_name
        self.cache.put(key, result)
        self._maybe_unload()
        return result

    def generate_code(self, task: str, context: str = "", language: str = "python") -> str:
        """Generate code directly with the bundled coder sidecar.

        This is used by the outer multimodal model when a prompt explicitly asks
        for code-only output. The sidecar remains internal, but the returned text
        is final code rather than guidance JSON.

        Args:
            task: Description of the code to write.
            context: Optional supporting context; a placeholder sentence is
                sent when empty.
            language: Target language named in the prompt.

        Returns:
            The contents of the first Markdown code fence in the model output,
            or the whole stripped output when no fence is present.
        """
        self._load()
        messages = [
            {
                "role": "system",
                "content": (
                    "You are the internal coding sidecar for a larger model. "
                    "Return only source code. No prose, no markdown fences, no JSON. "
                    "Implement the requested behavior directly and minimally. "
                    "Do not add extra validation, restrictions, printing, examples, or wrapper code unless requested. "
                    "For text parsing tasks, handle values embedded inside surrounding text and preserve signs."
                ),
            },
            {
                "role": "user",
                "content": (
                    f"Language: {language}\n"
                    f"Task:\n{task}\n\n"
                    f"Relevant context:\n{context or 'No additional context.'}\n\n"
                    "Return a complete minimal implementation only."
                ),
            },
        ]
        # Code needs more room than guidance JSON, so allow at least 192 tokens.
        raw = self._generate(messages, max(self.settings.max_new_tokens, 192)).strip()
        match = re.search(r"```(?:\w+)?\s*(.*?)```", raw, flags=re.DOTALL)
        code = match.group(1).strip() if match else raw
        self._maybe_unload()
        return code

    def advise_project(self, task: str, root: str | Path = ".", use_cache: bool = True) -> dict[str, Any]:
        """Collect context from a project directory and call :meth:`advise`.

        Args:
            task: Description of the coding task.
            root: Project directory; resolved to an absolute path.
            use_cache: Forwarded to :meth:`advise`.

        Returns:
            The guidance dictionary produced by :meth:`advise`.
        """
        context = collect_project_context(Path(root).resolve(), self.settings.max_context_chars)
        return self.advise(task, context=context, use_cache=use_cache)