Text Generation
Transformers
Diffusers
Safetensors
English
gpt_oss
phillnet-2
gpt-oss
multimodal
image-generation
video-generation
speech
audio
custom-code
conversational
custom_code
Instructions to use ayjays132/Phillnet-2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ayjays132/Phillnet-2 with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="ayjays132/Phillnet-2", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("ayjays132/Phillnet-2", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("ayjays132/Phillnet-2", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use ayjays132/Phillnet-2 with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "ayjays132/Phillnet-2"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "ayjays132/Phillnet-2",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker
docker model run hf.co/ayjays132/Phillnet-2
- SGLang
How to use ayjays132/Phillnet-2 with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "ayjays132/Phillnet-2" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "ayjays132/Phillnet-2",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker images
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "ayjays132/Phillnet-2" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "ayjays132/Phillnet-2",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
- Docker Model Runner
How to use ayjays132/Phillnet-2 with Docker Model Runner:
docker model run hf.co/ayjays132/Phillnet-2
| import torch | |
| import os | |
| import types | |
| from .planning_latch import PlanningLatch | |
class AgenticLoopPro:
    """
    Advanced agentic scaffolding loop that efficiently utilizes KV cache
    across multiple reasoning steps, reducing computational overhead.

    ``iterative_reason`` drives the shared model through short reasoning
    passes and dispatches browser tools requested by the generated text.
    ``run_agentic_loop`` wraps that pass and asks the same model for one
    consolidated final answer.
    """

    # Returned by _load_success_traces when no usable pattern file is found.
    _DEFAULT_PATTERNS = "- Standard autonomous protocols active."
    # Goal keywords (substring match) that enable the [STRATEGY_LATCH] section.
    _MISSION_KEYWORDS = ("mission", "develop", "maintain", "project", "build")
    # Generation kwargs dropped before generating from a rebuilt prompt,
    # because they were sized for the caller's original input_ids.
    _LENGTH_BOUND_KWARGS = ("attention_mask", "position_ids", "past_key_values", "labels")
    # Word-set Jaccard overlap at or above which a step counts as a repeat.
    _REDUNDANCY_THRESHOLD = 0.70

    def __init__(self, tokenizer, model, show_thinking=True, **kwargs):
        """Bind the loop to a tokenizer/model pair.

        Args:
            tokenizer: Tokenizer used for decoding and prompt encoding; may be
                None, in which case the reasoning methods short-circuit.
            model: Model exposing ``generate_base``; may be None.
            show_thinking: Stored on the instance; not read inside this class.
            **kwargs: ``registry`` may be passed to inject a tool registry.
                Without it, ``model._tool_registry`` is used when present.
        """
        self.tokenizer = tokenizer
        self.model = model
        self.show_thinking = show_thinking
        self.logger = getattr(model, "logger", None) if model is not None else None
        self.planning_latch = (
            PlanningLatch(tokenizer, model)
            if tokenizer is not None and model is not None
            else None
        )
        # The registry must always exist as an attribute so tool dispatch in
        # iterative_reason never raises AttributeError.
        model_registry = getattr(model, "_tool_registry", None) if model is not None else None
        self.registry = kwargs.get("registry", model_registry)
        # Set by iterative_reason when the repetition guard ends the pass.
        self._last_reason_repeated = False

    def _log(self, event, *args, **kwargs):
        """Call ``self.logger.<event>(*args, **kwargs)`` if that hook exists.

        Does nothing when there is no logger or the logger lacks the hook, so
        call sites do not need their own ``hasattr`` guards.
        """
        if not self.logger:
            return
        hook = getattr(self.logger, event, None)
        if callable(hook):
            hook(*args, **kwargs)

    @staticmethod
    def _has_conclusion_signal(text):
        """Return True when *text* contains one of the conclusion markers."""
        lowered = text.lower()
        return "final answer" in lowered or "conclusion" in lowered or "<FINAL>" in text

    @staticmethod
    def _jaccard_similarity(previous_text, current_text):
        """Return the word-set Jaccard overlap, or None if either side is empty."""
        previous_words = set(previous_text.lower().split())
        current_words = set(current_text.lower().split())
        if not previous_words or not current_words:
            return None
        return len(previous_words & current_words) / len(previous_words | current_words)

    def _trim_completion(self, prompt_ids, output_ids):
        """Drop the prompt prefix from *output_ids*.

        Returns *output_ids* unchanged when it is None or not longer than the
        prompt; otherwise returns the columns after the prompt length.
        """
        if output_ids is None:
            return output_ids
        prompt_len = int(prompt_ids.shape[-1])
        if output_ids.shape[-1] <= prompt_len:
            return output_ids
        return output_ids[:, prompt_len:]

    def _apply_agentic_tags(self, token_ids, *, improve=False):
        """Wrap *token_ids* with the model's agentic control ids.

        Delegates to ``model._apply_control_ids`` when the model provides it,
        passing ``config.agentic_prefix_ids`` and, only when *improve* is
        true, ``config.agentic_improve_ids``. Otherwise returns *token_ids*
        unchanged.
        """
        if token_ids is None or self.model is None:
            return token_ids
        apply_control = getattr(self.model, "_apply_control_ids", None)
        if not callable(apply_control):
            return token_ids
        prefix_ids = getattr(self.model.config, "agentic_prefix_ids", None)
        suffix_ids = getattr(self.model.config, "agentic_improve_ids", None) if improve else None
        return apply_control(token_ids, prefix_ids=prefix_ids, suffix_ids=suffix_ids)

    def _load_success_traces(self):
        """High-Performance Expert Pattern Importer: Routes to dedicated memory store.

        Reads ``memory/tool_traces/elite_patterns.json`` located two directory
        levels above this file and renders each entry as
        ``- BEHAVIOR: example``. Entries that are not dicts, lack either key,
        or have a non-string ``behavior`` are skipped individually, so one bad
        entry no longer discards the rest. Returns the default line when the
        file is missing, unreadable, not a JSON list, or has no usable entry.
        """
        import json

        try:
            package_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            elite_path = os.path.join(package_root, "memory", "tool_traces", "elite_patterns.json")
            with open(elite_path, "r", encoding="utf-8") as f:
                patterns = json.load(f)
        except (NameError, OSError, ValueError):
            # Best-effort: a missing/unreadable/invalid file is not an error.
            return self._DEFAULT_PATTERNS
        if not isinstance(patterns, list):
            return self._DEFAULT_PATTERNS

        lines = []
        for entry in patterns:
            if not isinstance(entry, dict) or "behavior" not in entry or "example" not in entry:
                continue
            behavior = entry["behavior"]
            if not isinstance(behavior, str):
                continue
            lines.append(f"- {behavior.upper()}: {entry['example']}")
        return "\n".join(lines) if lines else self._DEFAULT_PATTERNS

    def _build_fused_final_prompt(self, user_goal="", context_summary="", reasoning_text=""):
        """
        Sovereign Unified Packet Builder (De-Jargonized & Performance Optimized).
        Aggregates professional observations and elite expert patterns.

        Args:
            user_goal: Passed to each scaffold's ``get_tidbit`` and scanned for
                mission keywords.
            context_summary: Accepted for interface compatibility; not used
                when building the prompt.
            reasoning_text: Notes from the reasoning pass, emitted under
                ``[OBSERVATIONS]`` when non-empty.

        Returns:
            The sections joined by blank lines, stripped, with one trailing
            newline.
        """
        sections = []
        for attr in ("identity_scaffold", "meta_scaffold", "epistemic_scaffold", "validator"):
            scaffold = getattr(self.model, attr, None)
            if scaffold and hasattr(scaffold, "get_tidbit"):
                tidbit = scaffold.get_tidbit(user_goal)
                if tidbit:
                    sections.append(tidbit.strip())

        # 1. Professional Observations
        if reasoning_text:
            sections.append("[OBSERVATIONS]\n" + reasoning_text.strip())

        # 2. Elite Expert Patterns (from dedicated memory store)
        expert_patterns = self._load_success_traces()
        sections.append(
            "[EXPERT_PATTERNS]\n"
            f"{expert_patterns}\n"
            "- DIRECT_TARGETING: Use visual labels (0-9) to resolve interactive elements instantly.\n"
            "- UNIFIED_SYNTHESIS: Provide the final result directly without process-redundancy.\n"
        )

        # 3. Strategy Latch (Mission Persistence)
        goal_lower = (user_goal or "").lower()
        if any(word in goal_lower for word in self._MISSION_KEYWORDS):
            sections.append(
                "[STRATEGY_LATCH]\n"
                "- Track long-term objective completion status.\n"
                "- Verify all tools serve the final goal.\n"
            )

        # 4. Unified Answer Guide (Synthesis Directive)
        sections.append(
            "[UNIFIED_ANSWER_GUIDE]\n"
            "- PROVIDE A DIRECT, PROFESSIONAL CONCLUSION.\n"
            "- Summarize all findings into a single, coherent response.\n"
            "- Eliminate background noise and internal log repetitions.\n"
        )
        sections.append("[FINAL_ANSWER]\nProvide your direct response to the user below:")
        return "\n\n".join(sections).strip() + "\n"

    def _update_progress_hud(self, step, total):
        """Best-effort push of step progress to the browser page overlay."""
        try:
            browser = self.registry._ensure_browser_tool() if self.registry else None
            if browser and browser._page:
                browser._page.evaluate(
                    "(payload) => window.__phillnetUpdateProgress && window.__phillnetUpdateProgress(payload.step, payload.total)",
                    {"step": step + 1, "total": total},
                )
        except Exception:
            # Purely cosmetic; a HUD failure must never interrupt reasoning.
            pass

    def _parse_browser_action(self, text):
        """Extract a browser action request from generated *text*.

        Structured tags are checked first: ``<BROWSER_OBSERVE>`` or
        ``<BROWSER_PLAN>`` (open a URL if one is present, else search), then
        ``<BROWSER_ACT>`` (click / type / scroll / press, else a generic
        agentic sequence). If no tag matches, plain-language
        "search for ..." and "navigate to / open url / go to ..." are tried.

        Returns:
            ``(action, query)``, or ``(None, None)`` when nothing matched.
        """
        import re

        tag_flags = re.DOTALL | re.IGNORECASE
        obs_match = re.search(r"<BROWSER_OBSERVE>(.*?)</BROWSER_OBSERVE>", text, tag_flags)
        plan_match = re.search(r"<BROWSER_PLAN>(.*?)</BROWSER_PLAN>", text, tag_flags)
        act_match = re.search(r"<BROWSER_ACT>(.*?)</BROWSER_ACT>", text, tag_flags)

        # --- Primary: structured scaffold tags ---
        if obs_match or plan_match:
            tag_content = (obs_match or plan_match).group(1).strip()
            url_in_tag = re.search(r'https?://\S+', tag_content)
            if url_in_tag:
                return "open", url_in_tag.group(0)
            return "search", tag_content

        if act_match:
            tag_content = act_match.group(1).strip()
            # e.g. "click(15)", "type('some text')", "scroll('down')", "press(Enter)"
            click_m = re.search(r'click\s*(?:\(?\s*(\d{1,4})\s*\)?|\(?\s*["\']\s*([^"\']+)\s*["\']\s*\)?)', tag_content, re.I)
            if click_m:
                label = click_m.group(1)
                return "click_label", int(label) if label else click_m.group(2)
            type_m = re.search(r'type\s*\(?\s*["\'](.*?)["\']\s*\)?', tag_content, re.I)
            if type_m:
                return "type_text", type_m.group(1)
            scroll_m = re.search(r'scroll\s*\(?\s*["\']?(up|down)["\']?\s*\)?', tag_content, re.I)
            if scroll_m:
                return "scroll", scroll_m.group(1)
            # Parentheses are excluded from the capture so an unquoted
            # "press(Enter)" yields "Enter" rather than "Enter)".
            key_m = re.search(r'press\s*\(?\s*["\']?([^"\'()]+)["\']?\s*\)?', tag_content, re.I)
            if key_m and key_m.group(1).strip():
                return "press_key", key_m.group(1).strip()
            # Fallback to smart autonomous sequence if tag is generic
            return "agentic_sequence", tag_content

        # --- Secondary: plain-language fallback ---
        q_match = re.search(r'search\s+for\s+["\']?([^"\'\n]{2,120})["\']?', text, re.I)
        if q_match:
            return "search", q_match.group(1).strip()
        u_match = re.search(
            r'(?:navigate\s+to|open\s+url|go to)\s+["\']?([^"\'\s]{5,255})["\']?',
            text, re.I,
        )
        if u_match:
            return "open", u_match.group(1).strip()
        return None, None

    def _run_browser_action(self, browser, action, query):
        """Invoke the *browser* method matching *action*; return its result."""
        if action == "search":
            return browser.search(query)
        if action == "open":
            return browser.open(query)
        if action == "click_label":
            if isinstance(query, int):
                return browser.click_label(int(query))
            return browser.click(text_target=query)
        if action == "type_text":
            return browser.type_text(text=query)
        if action == "scroll":
            return browser.scroll(direction=query)
        if action == "press_key":
            return browser.press_key(key=query)
        if action == "agentic_sequence":
            # Call the high-level autonomous workflow with the direct invoker.
            from Tools.browser_tools import run_browser_agentic_sequence
            return run_browser_agentic_sequence(query, browser.invoke_direct)
        return browser.search(query)

    def _ensure_image_processor(self):
        """Return the cached image processor, loading it on first use.

        Loads ``Qwen2VLImageProcessor`` from a ``vision_tokenizer`` directory
        two levels above this file when that directory exists. Returns None
        when it cannot be loaded.
        """
        if getattr(self, "processor", None) is None:
            try:
                from transformers import Qwen2VLImageProcessor
                root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
                v_dir = os.path.join(root_dir, "vision_tokenizer")
                if os.path.exists(v_dir):
                    self.processor = Qwen2VLImageProcessor.from_pretrained(v_dir)
            except Exception as exc:
                self._log("self_heal", f"image processor unavailable: {exc}", action="skip_vision")
        return getattr(self, "processor", None)

    def _append_screenshot_token(self, sequence, screenshot_path, device):
        """Multimodal Vision Bridge: append an image token for a screenshot.

        When *screenshot_path* exists and an image processor is available, the
        image is preprocessed, one image token id is appended to *sequence*,
        and, if ``model.model.visual`` is set, its output is stored on
        ``model.cached_image_embeds``. Failures are reported through the
        logger and the sequence built so far is returned.
        """
        if not (screenshot_path and os.path.exists(screenshot_path)):
            return sequence
        processor = self._ensure_image_processor()
        if processor is None:
            return sequence
        try:
            from PIL import Image
            # Context manager closes the file handle once pixels are copied.
            with Image.open(screenshot_path) as raw_image:
                img = raw_image.convert("RGB")
            # 151655 is the default used when the config has no image_token_id.
            image_pad = getattr(self.model.config, "image_token_id", 151655)
            v_inputs = processor(images=[img], return_tensors="pt")
            pixel_values = v_inputs.pixel_values.to(device)
            image_grid_thw = v_inputs.image_grid_thw.to(device)
            sequence = torch.cat([sequence, torch.tensor([image_pad], device=device)], dim=0)
            # Native cache injection: avoid full vision tower re-computation
            if hasattr(self.model.model, "visual") and self.model.model.visual is not None:
                image_embeds = self.model.model.visual(pixel_values, grid_thw=image_grid_thw)
                self.model.cached_image_embeds = image_embeds
        except Exception as exc:
            self._log("self_heal", f"vision bridge failed: {exc}", action="skip_vision")
        return sequence

    def _dispatch_tool_request(self, text, sequence, device):
        """Run the browser tool requested in *text* and splice in its output.

        Returns:
            ``(sequence, injected)``. *injected* is True once usable tool
            output arrived, which tells the caller to drop its KV cache.
        """
        action, query = self._parse_browser_action(text)
        if query is None or not action:
            return sequence, False
        tool_name = f"browser_{action}"
        self._log("tool_route_start", tool_name, "Playwright/Manus-Mode")
        browser = self.registry._ensure_browser_tool() if hasattr(self.registry, "_ensure_browser_tool") else None
        if not browser:
            return sequence, False

        injected = False
        try:
            result = self._run_browser_action(browser, action, query)
            self._log("tool_route_end", tool_name, "completed")
            if result and (result.get("summary") or result.get("text")):
                # Injecting tool output changes the sequence length
                # non-incrementally, so the carried KV cache is invalid.
                injected = True
                content = result.get("summary") or result.get("text", "")
                # Real newlines: the doubled backslashes used previously
                # inserted literal "\n" text into the prompt.
                tool_text = f"\n[TOOL_OUTPUT: {tool_name}]\n{content}\n"
                tool_ids = self.tokenizer(
                    tool_text,
                    return_tensors="pt",
                    add_special_tokens=False,
                ).input_ids.to(device)
                sequence = torch.cat([sequence, tool_ids[0]], dim=0)
                sequence = self._append_screenshot_token(sequence, result.get("screenshot_path"), device)
        except Exception as e:
            self._log("tool_route_end", tool_name, f"failed: {e}")
        return sequence, injected

    def iterative_reason(self, input_ids, max_steps=3, max_new_tokens=128, user_goal="", context_summary=""):
        """
        Performs iterative reasoning steps, caching KV states between steps
        to prevent re-computing the entire prefix context.

        Args:
            input_ids: Prompt token ids; row 0 of each generation is used.
            max_steps: Maximum number of reasoning passes.
            max_new_tokens: Token budget for the first step; later steps use
                half of it (never less than 1).
            user_goal: Accepted for interface compatibility; unused here.
            context_summary: Accepted for interface compatibility; unused here.

        Returns:
            The decoded text of each accepted step, or None when there is no
            tokenizer. ``self._last_reason_repeated`` records whether the
            repetition guard ended the pass.
        """
        if self.tokenizer is None:
            return None
        device = input_ids.device
        past_key_values = None
        current_ids = input_ids
        step_outputs = []
        # Track whether a repetition guard triggered (used by run_agentic_loop)
        self._last_reason_repeated = False

        for step in range(max_steps):
            self._update_progress_hud(step, max_steps)

            # Routine steps (step > 0) use fewer tokens; clamp to 1 so a tiny
            # budget never asks the model for zero new tokens.
            current_max_tokens = max_new_tokens if step == 0 else max(1, max_new_tokens // 2)
            outputs = self.model.generate_base(
                input_ids=current_ids,
                past_key_values=past_key_values,
                use_cache=True,
                use_guidance=False,
                max_new_tokens=current_max_tokens,
                return_dict_in_generate=True,
                output_scores=False,
                pad_token_id=self.tokenizer.eos_token_id,
                repetition_penalty=1.15,
                no_repeat_ngram_size=4,
            )
            # Carry the KV cache to the next reasoning step.
            past_key_values = getattr(outputs, "past_key_values", None)

            generated_sequences = outputs.sequences if hasattr(outputs, "sequences") else outputs
            generated_sequence = generated_sequences[0]
            new_tokens = generated_sequence[current_ids.shape[-1]:]
            if len(new_tokens) == 0:
                break
            text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)

            # Semantic guard: a near-duplicate of the previous step ends the
            # reasoning phase so the caller can move to final synthesis.
            if step_outputs:
                sim = self._jaccard_similarity(step_outputs[-1], text)
                if sim is not None and sim >= self._REDUNDANCY_THRESHOLD:
                    self._log("self_heal", f"redundancy detected (sim={sim:.2f})", action="skip_step")
                    self._last_reason_repeated = True
                    break
            step_outputs.append(text)

            if self.registry is not None:
                generated_sequence, injected = self._dispatch_tool_request(text, generated_sequence, device)
                if injected:
                    past_key_values = None

            # Prepare for next iteration: append new tokens and carry KV cache.
            current_ids = generated_sequence.unsqueeze(0)

            # Conclusion signal: stop early without waiting for similarity guard
            if self._has_conclusion_signal(text):
                break
        return step_outputs

    def _prepend_vision_block(self, input_ids, final_ids):
        """Re-inject the ``<|vision_start|>...<|vision_end|>`` span of *input_ids*.

        The span from the first vision-start token to the last vision-end
        token is prepended to *final_ids*. Returns *final_ids* unchanged when
        either token id is unknown to the tokenizer or absent from row 0.
        """
        vision_start = self.tokenizer.convert_tokens_to_ids("<|vision_start|>")
        vision_end = self.tokenizer.convert_tokens_to_ids("<|vision_end|>")
        # `None in tensor` raises, so bail out before the membership test.
        if vision_start is None or vision_end is None:
            return final_ids
        row = input_ids[0]
        if vision_start in row and vision_end in row:
            start_idx = (row == vision_start).nonzero(as_tuple=True)[0][0]
            end_idx = (row == vision_end).nonzero(as_tuple=True)[0][-1]
            vision_block = input_ids[:, start_idx:end_idx + 1]
            return torch.cat([vision_block, final_ids], dim=-1)
        return final_ids

    def _strip_length_bound_kwargs(self, kwargs):
        """Return a copy of *kwargs* without sequence-length-dependent entries."""
        gen_kwargs = kwargs.copy()
        for key in self._LENGTH_BOUND_KWARGS:
            gen_kwargs.pop(key, None)
        return gen_kwargs

    def _encode_final_prompt(self, input_ids, user_goal, context_summary, reasoning_text=""):
        """Build, tokenize and tag the final prompt, keeping any vision block."""
        final_prompt = self._build_fused_final_prompt(
            user_goal=user_goal,
            context_summary=context_summary,
            reasoning_text=reasoning_text,
        )
        final_ids = self.tokenizer(final_prompt, return_tensors="pt").input_ids.to(input_ids.device)
        final_ids = self._apply_agentic_tags(final_ids, improve=True)
        return self._prepend_vision_block(input_ids, final_ids)

    def _run_planning_phase(self, input_ids, user_goal, context_summary):
        """Formulate the planning latch and emit the related logger events."""
        self._log("phase_start", "Planning", detail="formulating concise latch")
        if self.planning_latch is not None:
            try:
                latched = self.planning_latch.formulate_and_latch(
                    user_goal or context_summary or self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
                )
                if latched:
                    self._log("latch_log", latched)
            except Exception:
                self._log("self_heal", "planning latch failed", action="falling back to fused final prompt")
        scaffold = getattr(self.model, "agentic_scaffold", None)
        if self.logger and hasattr(self.logger, "tool_catalog") and scaffold is not None:
            try:
                self.logger.tool_catalog(scaffold.get_tool_surface())
            except Exception:
                # The catalog is informational only.
                pass
        self._log("phase_end", "Planning", note="latched")

    def run_agentic_loop(self, input_ids, max_steps=8, max_new_tokens=128, user_goal="", context_summary="", **kwargs):
        """
        Shared-model agentic loop.
        Uses the existing backbone for a small iterative reasoning pass, then
        asks the same shared model for one final answer. This keeps the loop
        inside the live model instead of spawning another copy.

        Args:
            input_ids: Prompt token ids.
            max_steps: Reasoning passes; ``<= 1`` skips reasoning and goes
                through the planning latch straight to the final prompt.
            max_new_tokens: Budget for the final answer; the reasoning pass is
                capped at 96.
            user_goal: Goal text forwarded to planning and prompt building.
            context_summary: Forwarded to planning and prompt building.
            **kwargs: Extra generation arguments for ``generate_base``.

        Returns:
            Token ids of the answer: the trimmed final generation, the
            re-encoded last reasoning step when reasoning already concluded,
            or the raw ``generate_base`` output on the fallback paths.
        """
        if self.tokenizer is None:
            return self.model.generate_base(input_ids=input_ids, max_new_tokens=max_new_tokens, **kwargs)

        if max_steps <= 1:
            self._run_planning_phase(input_ids, user_goal, context_summary)
            final_ids = self._encode_final_prompt(input_ids, user_goal, context_summary)
            gen_kwargs = self._strip_length_bound_kwargs(kwargs)
            # Strong repetition guards for the consolidated answer.
            gen_kwargs.update({
                "repetition_penalty": 1.25,
                "no_repeat_ngram_size": 5,
                "temperature": 0.4,  # Lower temp for more deterministic, professional summary
                "use_guidance": False,
            })
            final_gen = self.model.generate_base(input_ids=final_ids, max_new_tokens=max_new_tokens, **gen_kwargs)
            # For a batched (>= 2-D) result return only the novel portion.
            if final_gen is not None and len(final_gen.shape) >= 2:
                return final_gen[:, final_ids.shape[-1]:]
            return final_gen

        reasoning_steps = self.iterative_reason(
            input_ids=input_ids,
            max_steps=max_steps,
            max_new_tokens=min(max_new_tokens, 96),
            user_goal=user_goal,
            context_summary=context_summary,
        ) or []
        if not reasoning_steps:
            self._log("self_heal", "iterative reasoning produced no usable notes", action="fallback to base generation")
            return self.model.generate_base(input_ids=input_ids, max_new_tokens=max_new_tokens, **kwargs)

        # Skip the final generation when reasoning already concluded: either
        # the last step carries a conclusion signal, or the repetition guard
        # fired and the last surviving step is taken as the answer.
        last_reasoning = reasoning_steps[-1].strip()
        concluded = self._has_conclusion_signal(last_reasoning) or getattr(self, "_last_reason_repeated", False)
        if concluded:
            # Re-encode so the result is a 2-D [1, seq] tensor.
            return self.tokenizer(
                last_reasoning,
                return_tensors="pt",
                add_special_tokens=False,
                truncation=True,
                max_length=max_new_tokens * 2,  # generous but bounded
            ).input_ids.to(input_ids.device)

        reasoning_text = "\n".join(step.strip() for step in reasoning_steps if step.strip())
        final_ids = self._encode_final_prompt(input_ids, user_goal, context_summary, reasoning_text)
        gen_kwargs = self._strip_length_bound_kwargs(kwargs)
        gen_kwargs.setdefault("use_guidance", False)
        output_ids = self.model.generate_base(input_ids=final_ids, max_new_tokens=max_new_tokens, **gen_kwargs)
        trimmed = self._trim_completion(final_ids, output_ids)
        return trimmed if trimmed is not None else output_ids