| """Template utilities for Jinja template processing. | |
| This module provides utilities for analyzing and processing Jinja chat templates, | |
| including content format detection and message processing. | |
| """ | |
| import logging | |
| import jinja2 | |
| import transformers.utils.chat_template_utils as hf_chat_utils | |
| from sglang.srt.utils import ImageData | |
| logger = logging.getLogger(__name__) | |
| # ============================================================================ | |
| # JINJA TEMPLATE CONTENT FORMAT DETECTION | |
| # ============================================================================ | |
| # | |
| # This adapts vLLM's approach for detecting chat template content format: | |
| # https://github.com/vllm-project/vllm/blob/02f0c7b220422792f5e53de2a7d51d2d3ff2df28/vllm/entrypoints/chat_utils.py#L296-L313 | |
| # - Analyzes Jinja template AST to detect content iteration patterns | |
| # - 'openai' format: templates with {%- for content in message['content'] -%} loops | |
| # - 'string' format: templates that expect simple string content | |
| # - Processes content accordingly to match template expectations | |
| def _is_var_access(node: jinja2.nodes.Node, varname: str) -> bool: | |
| """Check if node is a variable access like {{ varname }}""" | |
| if isinstance(node, jinja2.nodes.Name): | |
| return node.ctx == "load" and node.name == varname | |
| return False | |
| def _is_attr_access(node: jinja2.nodes.Node, varname: str, key: str) -> bool: | |
| """Check if node is an attribute access like {{ varname['key'] }} or {{ varname.key }}""" | |
| if isinstance(node, jinja2.nodes.Getitem): | |
| return ( | |
| _is_var_access(node.node, varname) | |
| and isinstance(node.arg, jinja2.nodes.Const) | |
| and node.arg.value == key | |
| ) | |
| if isinstance(node, jinja2.nodes.Getattr): | |
| return _is_var_access(node.node, varname) and node.attr == key | |
| return False | |
| def _is_var_or_elems_access( | |
| node: jinja2.nodes.Node, | |
| varname: str, | |
| key: str = None, | |
| ) -> bool: | |
| """Check if node accesses varname or varname[key] with filters/tests""" | |
| if isinstance(node, jinja2.nodes.Filter): | |
| return node.node is not None and _is_var_or_elems_access( | |
| node.node, varname, key | |
| ) | |
| if isinstance(node, jinja2.nodes.Test): | |
| return _is_var_or_elems_access(node.node, varname, key) | |
| if isinstance(node, jinja2.nodes.Getitem) and isinstance( | |
| node.arg, jinja2.nodes.Slice | |
| ): | |
| return _is_var_or_elems_access(node.node, varname, key) | |
| return _is_attr_access(node, varname, key) if key else _is_var_access(node, varname) | |
| def _try_extract_ast(chat_template: str): | |
| """Try to parse the Jinja template into an AST""" | |
| try: | |
| jinja_compiled = hf_chat_utils._compile_jinja_template(chat_template) | |
| return jinja_compiled.environment.parse(chat_template) | |
| except Exception as e: | |
| logger.debug(f"Error when compiling Jinja template: {e}") | |
| return None | |
| def detect_jinja_template_content_format(chat_template: str) -> str: | |
| """ | |
| Detect whether a chat template expects 'string' or 'openai' content format. | |
| - 'string': content is a simple string (like DeepSeek templates) | |
| - 'openai': content is a list of structured dicts (like Llama4 templates) | |
| Detection logic: | |
| - If template has loops like {%- for content in message['content'] -%} → 'openai' | |
| - Otherwise → 'string' | |
| """ | |
| # Shortcut for multimodal templates | |
| if any( | |
| keyword in chat_template for keyword in ["image", "audio", "video", "vision"] | |
| ): | |
| return "openai" | |
| jinja_ast = _try_extract_ast(chat_template) | |
| if jinja_ast is None: | |
| return "string" | |
| try: | |
| # Look for patterns like: {%- for content in message['content'] -%} | |
| for loop_ast in jinja_ast.find_all(jinja2.nodes.For): | |
| loop_iter = loop_ast.iter | |
| # Check if iterating over message['content'] or similar | |
| if _is_var_or_elems_access(loop_iter, "message", "content"): | |
| return "openai" # Found content iteration → openai format | |
| # Also check for patterns like: {%- for item in msg.content -%} or {%- for item in m.content -%} | |
| if _is_var_or_elems_access( | |
| loop_iter, "msg", "content" | |
| ) or _is_var_or_elems_access(loop_iter, "m", "content"): | |
| return "openai" # Found content iteration → openai format (glm4v) | |
| return "string" # No content loops found → string format | |
| except Exception as e: | |
| logger.debug(f"Error when parsing AST of Jinja template: {e}") | |
| return "string" | |
| def process_content_for_template_format( | |
| msg_dict: dict, | |
| content_format: str, | |
| image_data: list, | |
| video_data: list, | |
| audio_data: list, | |
| modalities: list, | |
| ) -> dict: | |
| """ | |
| Process message content based on detected template format. | |
| Args: | |
| msg_dict: Message dictionary with content | |
| content_format: 'string' or 'openai' (detected via AST analysis) | |
| image_data: List to append extracted image URLs | |
| video_data: List to append extracted video URLs | |
| audio_data: List to append extracted audio URLs | |
| modalities: List to append modalities | |
| Returns: | |
| Processed message dictionary | |
| """ | |
| if not isinstance(msg_dict.get("content"), list): | |
| # Already a string or None, no processing needed | |
| return {k: v for k, v in msg_dict.items() if v is not None} | |
| if content_format == "openai": | |
| # OpenAI format: preserve structured content list, normalize types | |
| processed_content_parts = [] | |
| for chunk in msg_dict["content"]: | |
| if isinstance(chunk, dict): | |
| chunk_type = chunk.get("type") | |
| if chunk_type == "image_url": | |
| image_data.append( | |
| ImageData( | |
| url=chunk["image_url"]["url"], | |
| detail=chunk["image_url"].get("detail", "auto"), | |
| ) | |
| ) | |
| if chunk.get("modalities"): | |
| modalities.append(chunk.get("modalities")) | |
| # Normalize to simple 'image' type for template compatibility | |
| processed_content_parts.append({"type": "image"}) | |
| elif chunk_type == "video_url": | |
| video_data.append(chunk["video_url"]["url"]) | |
| if chunk.get("modalities"): | |
| modalities.append(chunk.get("modalities")) | |
| # Normalize to simple 'video' type for template compatibility | |
| processed_content_parts.append({"type": "video"}) | |
| elif chunk_type == "audio_url": | |
| audio_data.append(chunk["audio_url"]["url"]) | |
| # Normalize to simple 'audio' type | |
| processed_content_parts.append({"type": "audio"}) | |
| else: | |
| # Keep other content as-is (text, etc.) | |
| processed_content_parts.append(chunk) | |
| new_msg = { | |
| k: v for k, v in msg_dict.items() if v is not None and k != "content" | |
| } | |
| new_msg["content"] = processed_content_parts | |
| return new_msg | |
| elif content_format == "string": | |
| # String format: flatten to text only (for templates like DeepSeek) | |
| text_parts = [] | |
| for chunk in msg_dict["content"]: | |
| if isinstance(chunk, dict) and chunk.get("type") == "text": | |
| text_parts.append(chunk["text"]) | |
| # Note: For string format, we ignore images/audio since the template | |
| # doesn't expect structured content - multimodal placeholders would | |
| # need to be inserted differently | |
| new_msg = msg_dict.copy() | |
| new_msg["content"] = " ".join(text_parts) if text_parts else "" | |
| new_msg = {k: v for k, v in new_msg.items() if v is not None} | |
| return new_msg | |
| else: | |
| raise ValueError(f"Invalid content format: {content_format}") | |
Xet Storage Details
- Size:
- 8.01 kB
- Xet hash:
- e82aa4e867bf2f41a2640204896898e890a0e4db1fbd2cf086b297e056b87669
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.