karroyan committed on
Commit df58226 · 1 Parent(s): 276cd53

feature(lxy): add readme and model

.gitattributes CHANGED
@@ -33,3 +33,18 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ model-00001-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
+ model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
+ model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
+ model-00004-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
+ config.json filter=lfs diff=lfs merge=lfs -text
+ model.safetensors.index.json filter=lfs diff=lfs merge=lfs -text
+ preprocessor_config.json filter=lfs diff=lfs merge=lfs -text
+ tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
+ vocab.json filter=lfs diff=lfs merge=lfs -text
+ added_tokens.json filter=lfs diff=lfs merge=lfs -text
+ generation_config.json filter=lfs diff=lfs merge=lfs -text
+ special_tokens_map.json filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ video_preprocessor_config.json filter=lfs diff=lfs merge=lfs -text
+ chat_template.jinja filter=lfs diff=lfs merge=lfs -text
Modelfile ADDED
@@ -0,0 +1,16 @@
# ollama modelfile auto-generated by llamafactory

FROM .

TEMPLATE """{{ if .System }}<|im_start|>system
{{ .System }}<|im_end|>
{{ end }}{{ range .Messages }}{{ if eq .Role "user" }}<|im_start|>user
{{ .Content }}<|im_end|>
<|im_start|>assistant
{{ else if eq .Role "assistant" }}{{ .Content }}<|im_end|>
{{ end }}{{ end }}"""

SYSTEM """You are a helpful assistant."""

PARAMETER stop "<|im_end|>"
PARAMETER num_ctx 4096
README.md ADDED
@@ -0,0 +1,259 @@
---
language:
- en
- zh
license: apache-2.0
base_model: Qwen/Qwen2.5-VL-7B-Instruct
tags:
- vision
- image-text-to-text
- multimodal
- meme-generation
- humor
- chain-of-thought
- qwen
pipeline_tag: image-text-to-text
library_name: vllm
---

# HUMOR-COT: Hierarchical Understanding and Meme Optimization with Chain-of-Thought

<div align="center">

**[Paper](https://arxiv.org/abs/2512.24555)** | **[Project Page](https://github.com/karroyan/MemeGenerator)**

</div>

## Model Summary

**HUMOR-COT** is a multimodal generative model that creates humorous, context-aware memes. It is fine-tuned from **Qwen2.5-VL-7B-Instruct** using a novel **Hierarchical Chain-of-Thought (CoT)** approach.

Unlike standard image captioning models, which map images directly to text, HUMOR-COT mimics the human creative process in two stages:

1. **Template-Level Reasoning:** Analyzes the image to infer latent intent, emotional tone, and layout.
2. **Context-Level Grounding:** Generates specific, humorous captions (punchlines) grounded in user-supplied keywords or contexts (see the sketch after this list).

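To make the hierarchy concrete, here is a minimal sketch of how the two stages could be laid out as chat turns. The prompt wording and message schema below are illustrative assumptions, not the training prompts (those live in `prompt/generate_meme.txt`):

```python
# Illustrative only: hypothetical prompt wording for the two-stage CoT flow.
def build_two_stage_messages(image_path: str, keywords: str) -> list[dict]:
    # Stage 1: template-level reasoning over the raw image.
    stage1 = {
        "role": "user",
        "content": [
            {"type": "image", "image": image_path},
            {"type": "text", "text": (
                "Analyze this meme template: infer its latent intent, "
                "emotional tone, and text-box layout."
            )},
        ],
    }
    # Stage 2: context-level grounding of punchlines in user keywords.
    stage2 = {
        "role": "user",
        "content": [
            {"type": "text", "text": (
                f"Using that analysis, write humorous captions about "
                f"'{keywords}'. Answer as box_1: ..., box_2: ..."
            )},
        ],
    }
    return [stage1, stage2]
```
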
This model represents the Supervised Fine-Tuning (SFT) stage of the HUMOR framework. It performs on par with GPT-4o and other strong VLMs in humor and readability, and achieves the highest human-likeness score (91.5%) among the models evaluated (see the results below).

## Uses

### Intended Use

* **Meme Generation:** Generating humorous captions for uploaded images based on specific topics or keywords.
* **Humor Understanding:** Analyzing the punchline mechanics of existing memes.
* **Creative Writing Assistance:** Brainstorming metaphorical associations for visual content.

### Out of Scope

* Generation of hate speech, violence, or harmful stereotypes (filtered during training, but guardrails are recommended for deployment).

## How to Get Started

The model is designed to be used with `vllm` for efficient inference. Below is a custom wrapper class that handles the hierarchical generation process.

### Prerequisites

You need to set up the following environment variables and files:

* `NLP_MODEL_PATH`: Path to your spaCy model (e.g., `en_core_web_sm`).
* `VLLM_MODEL_PATH`: Path to this model (a local directory or HF Hub ID).
* `prompt/generate_meme.txt`: A text file containing the system prompt for CoT generation.

### Inference Code

```python
import os
import json
import logging
import spacy
from vllm import LLM, SamplingParams
from transformers import AutoProcessor

# Note: Boxclipper and tag_config are custom dependencies from your codebase
# from utils import Boxclipper, tag_config

logger = logging.getLogger(__name__)


class HumorMemeGenerator:
    def __init__(self, input_path, input_path_update, mask_api: bool = False, use_gemini_generate: bool = False):
        """
        Initializes the HUMOR-COT generator.

        Args:
            input_path (str): Path to the initial dataset/config JSON.
            input_path_update (str): Path to the updated labels JSON.
            mask_api (bool): Whether to mask API calls (for internal tools).
            use_gemini_generate (bool): Toggle to use an external API instead of local vLLM.
        """
        self.mask_api = mask_api
        self.use_gemini_generate = use_gemini_generate

        # Load configurations
        with open(input_path, 'r') as f:
            self.input_data = json.load(f)

        with open(input_path_update, 'r') as f:
            self.input_data_update = json.load(f)

        # Environment configuration
        self.nlp_path = os.getenv('NLP_MODEL_PATH', 'en_core_web_sm')
        self.model_path = os.getenv('VLLM_MODEL_PATH', 'Your-HF-Org/HUMOR-COT')  # Default to HF path

        self.nlp = spacy.load(self.nlp_path)

        # Initialize internal classifiers/tools (placeholder for custom logic)
        # self.scene_theme_classifier = self._init_scene_theme_classifier()
        # self.boxclipper = Boxclipper(mask_api=self.mask_api)

        # Load the prompt template
        try:
            with open('prompt/generate_meme.txt', 'r') as prompt_file:
                self.PROMPT = prompt_file.read()
        except FileNotFoundError:
            logger.warning("Prompt file not found. Using default prompt.")
            self.PROMPT = "Generate a humorous meme caption based on the image..."

        # Initialize Qwen2.5-VL via vLLM
        if not self.use_gemini_generate:
            logger.info(f"Loading Qwen2.5-VL from {self.model_path}...")
            self.processor = AutoProcessor.from_pretrained(
                self.model_path,
                trust_remote_code=True
            )

            # vLLM configuration for multimodal inference
            self.llm = LLM(
                model=self.model_path,
                trust_remote_code=True,
                dtype="bfloat16",
                max_model_len=4096,
                max_num_seqs=5,
                mm_processor_kwargs={
                    "min_pixels": 28 * 28,
                    "max_pixels": 1280 * 28 * 28,
                    "fps": 1,
                },
                limit_mm_per_prompt={"image": 1},
                tensor_parallel_size=1,
                gpu_memory_utilization=0.3,
            )
        else:
            logger.info("Using external API (Gemini/GPT) for generation.")
            self.llm = None

    def inference(self, tag, keywords, question, image_path, modify, detections, history):
        """
        Internal inference method wrapping the vLLM generation call.
        (Logic adapted for standalone usage; `modify`, `detections`, and
        `history` are kept for API compatibility with the full pipeline.)
        """
        if self.llm is None:
            return "External API logic needed here", []

        # Construct the prompt using the CoT structure
        prompt_text = self.PROMPT.format(
            tag=tag,
            keywords=keywords,
            question=question
        )

        # Construct vLLM chat inputs
        # Note: Qwen2.5-VL requires specific token formatting
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": prompt_text}
            ]}
        ]

        # Prepare inputs using processor logic (simplified for vLLM);
        # the exact message schema depends on your vLLM version's Qwen2.5-VL support.
        outputs = self.llm.chat(messages=messages, sampling_params=SamplingParams(temperature=0.7, max_tokens=256))

        generated_text = outputs[0].outputs[0].text
        # Parse generated_text to extract the caption and bounding box (loc)
        # return text, loc
        return generated_text, []

    def text_generate(self, state, chose_image_path, initial_info):
        """
        Main entry point for generating meme text.

        Args:
            state: Object containing history and modification state.
            chose_image_path (dict): {'local_path': str, 'detections': ...}
            initial_info (dict): {'tag': str, 'Text Content Keywords': str, 'question': str, ...}
        """
        tag = initial_info.get('tag', '')
        keywords = initial_info.get('Text Content Keywords', '')
        question = initial_info.get('question', '') + '\n' + initial_info.get('answer', '')
        modify = state.modify

        # Call inference
        inference_result = self.inference(
            tag,
            keywords,
            question,
            chose_image_path['local_path'],
            modify,
            chose_image_path.get('detections'),
            state.history_text_loc_info
        )

        gemini_result = None

        # Handle the output tuple (external-API variants may return a third element)
        if len(inference_result) == 3:
            text, loc, gemini_result = inference_result
        else:
            text, loc = inference_result[:2]

        # Update state
        state.original_text_loc_info = {'text': text, 'loc': loc}

        if gemini_result:
            state.gemini_text_loc_info = {
                'text': gemini_result['text'],
                'loc': gemini_result['loc'],
                'image_path': gemini_result.get('image_path', chose_image_path['local_path'])
            }
        else:
            state.gemini_text_loc_info = None

        return text, loc
```
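
A minimal usage sketch for the wrapper above. All paths, the JSON files, and the `state` object here are hypothetical placeholders; the real pipeline supplies detections and history from its UI layer:

```python
from types import SimpleNamespace

# Hypothetical config files; the wrapper only requires that they parse as JSON.
generator = HumorMemeGenerator(
    input_path="data/templates.json",
    input_path_update="data/labels_update.json",
)

# Minimal stand-in for the pipeline's state object.
state = SimpleNamespace(modify=False, history_text_loc_info=[])

text, loc = generator.text_generate(
    state,
    chose_image_path={"local_path": "examples/cat.jpg", "detections": None},
    initial_info={
        "tag": "work-from-home",
        "Text Content Keywords": "Monday, coffee",
        "question": "Why is the cat staring at the laptop?",
        "answer": "",
    },
)
print(text)
```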

## Training Data & Methodology

The model was trained on a dataset of **3,713** high-quality, in-the-wild memes.

* **Data Processing:** We used a two-stage CoT synthesis pipeline (powered by Doubao-1.5-vision-pro) to reverse-engineer the "thought process" behind each meme.
* **Format:** The model is trained to output a reasoning trace followed by the final captions in the form `box_1: text, box_2: text` (a parsing sketch follows this list).
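
Since the exact output grammar is not published here, the following is a minimal parsing sketch for the `box_N: text` tail, assuming the reasoning trace precedes it and boxes are separated by commas or newlines (a hypothetical helper, not part of the released code):

```python
import re

def parse_meme_output(generated_text: str) -> dict[int, str]:
    """Extract box_N captions from a model response.

    Hypothetical helper: assumes the reasoning trace comes first and the
    final content contains entries like 'box_1: top text, box_2: bottom text'.
    Adjust the pattern to the grammar your checkpoint actually emits.
    """
    # Capture 'box_<n>:' up to the next 'box_<m>:' or the end of the string.
    pattern = re.compile(r"box_(\d+)\s*:\s*(.*?)(?=,?\s*box_\d+\s*:|$)", re.DOTALL)
    return {int(n): text.strip() for n, text in pattern.findall(generated_text)}

# parse_meme_output("The joke contrasts... box_1: me at 9am, box_2: me at 5pm")
# -> {1: 'me at 9am', 2: 'me at 5pm'}
```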

## Evaluation Results

Evaluation was conducted against strong baselines (Qwen2.5-7B-Instruct, GPT-4o) using both human evaluation and automated metrics.

| Model | Humor (0-5) | Readability (0-5) | Human-Likeness Score (%) |
| --- | --- | --- | --- |
| Qwen2.5-7B-Instruct (Base) | 2.39 | 3.35 | 75.7 |
| GPT-4o | **2.70** | **3.79** | 91.3 |
| **HUMOR-COT (Ours)** | 2.68 | 3.70 | **91.5** |

*HUMOR-COT significantly outperforms the base model, reaches parity with closed-source SOTA models on humor and readability, and posts the highest human-likeness score.*

## Citation

If you use this model in your research, please cite:

```bibtex
@article{li2025perception,
  title={From Perception to Punchline: Empowering VLM with the Art of In-the-wild Meme},
  author={Li, Xueyan and Xue, Yingyi and Jiang, Mengjie and Zhu, Qingzi and Niu, Yazhe},
  journal={arXiv preprint arXiv:2512.24555},
  year={2025}
}
```
added_tokens.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:58b54bbe36fc752f79a24a271ef66a0a0830054b4dfad94bde757d851968060b
size 605
chat_template.jinja ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a0bc6f6fc7a29a80017a433e8f03a1cc1236e838a944a2d034295a60c4f2fddb
size 1017
config.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:41769145a1ac36f13c54710617f00143672d1bbc0d76792beec4c07d2d9f38c8
size 3219
generation_config.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:319521f8c6ab944bb1e33d8879079b772b1d6dc8455be4ceecdf7e4c52688a52
size 214
merges.txt ADDED
The diff for this file is too large to render. See raw diff
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3809881f7c49314cb93194900c696f8ced3a0c658864d18c654406b26b708b28
size 4968243304
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:746487cc31287ced3eeba0694addbd35835c7a37ec77da784b2b3abc7b4d2d8c
size 4991495816
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bb159f9dc13963266af8fe514580f57cc4ecbdf624fbca705a7fb39b0e3d6b39
size 4932751040
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3cf368d6376058703680ca7701d5e5528991dc9e5449078a474d58afa3c5e264
size 1691924384
model.safetensors.index.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4be35310ddc165e46e88de4c3fec8c1210014b0b8717f4544d82cc740814ae0c
size 57655
preprocessor_config.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:276e1dbe46dd567fce6e587665266ede535f42ab08d46f3d7febea17cb37abcd
size 791
special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:76862e765266b85aa9459767e33cbaf13970f327a0e88d1c65846c2ddd3a1ecd
size 613
tokenizer.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8160131af9f1a4b44ace4fb7a707d6315f90efa6bcb4828a82972dfafed6a458
size 4756
video_preprocessor_config.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:09e98526bcd1b8584217418253badf2824ecf2815933b0583cdceb2e8f79ebb0
size 907
vocab.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910
size 2776833