Add library_name and pipeline_tag to metadata
#1
by nielsr HF Staff - opened
README.md CHANGED
@@ -1,11 +1,13 @@
 ---
-
+base_model:
+- Qwen/Qwen-Image-Edit-2511
 datasets:
 - internlm/EndoCoT-Data
 language:
 - en
-
-
+license: mit
+library_name: diffusers
+pipeline_tag: image-to-image
 ---
 
 <p align="center"> <img src="fig/banner.svg" alt="EndoCoT" width="900"/> </p>
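Note: the front-matter addition above can also be applied programmatically. A minimal sketch with `huggingface_hub.metadata_update`, which merges keys into the README's YAML block; the repo id is a placeholder for whichever model repo this PR targets:

```python
# Sketch only: apply this PR's metadata change via the Hub API.
# "your-org/EndoCoT" is a hypothetical repo id -- substitute the real one.
from huggingface_hub import metadata_update

metadata_update(
    "your-org/EndoCoT",  # placeholder repo id
    {
        "base_model": ["Qwen/Qwen-Image-Edit-2511"],
        "license": "mit",
        "library_name": "diffusers",       # lets the Hub show a Diffusers usage snippet
        "pipeline_tag": "image-to-image",  # lists the model under image-to-image
    },
    overwrite=True,  # replace existing values for these keys if present
)
```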
@@ -30,6 +32,8 @@ base_model:
 
 # EndoCoT: Scaling Endogenous Chain-of-Thought Reasoning in Diffusion Models
 
+This repository contains the official model checkpoints for **EndoCoT**, as presented in the paper [EndoCoT: Scaling Endogenous Chain-of-Thought Reasoning in Diffusion Models](https://huggingface.co/papers/2603.12252).
+
 ## 📝TODO
 
 - [x] Open source the training code
@@ -102,7 +106,7 @@ pip install -r requirements.txt
 
 1. Download the datasets & metadata.csv
 
-- You may find our training data at: [**EndoCoT dataset**](https://huggingface.co/datasets/
+- You may find our training data at: [**EndoCoT dataset**](https://huggingface.co/datasets/internlm/EndoCoT-Data)
 
 > Since the metadata uses relative paths, please ensure the dataset files are placed in the same directory as `metadata.csv`
 
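With the corrected link above pointing at `internlm/EndoCoT-Data`, the relative-path note in the blockquote can be satisfied by downloading the full dataset snapshot into a single directory. A minimal sketch, assuming the dataset repo keeps `metadata.csv` alongside the data files:

```python
# Sketch only: fetch the EndoCoT training data so the files land in the
# same directory as metadata.csv, as its relative paths expect.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="internlm/EndoCoT-Data",
    repo_type="dataset",
    local_dir="./EndoCoT-Data",  # metadata.csv and data files end up together here
)
print("Dataset downloaded to:", local_dir)
```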
@@ -130,7 +134,12 @@ def encode_prompt_edit(self, pipe: QwenImagePipeline, prompt, edit_image, is_fin
 
     drop_idx = 64
     if type(prompt[0])==str:
-        template = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
+        template = "<|im_start|>system
+Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
+<|im_start|>user
+<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>
+<|im_start|>assistant
+"
         txt = template.format(prompt[0])
         model_inputs = pipe.processor(text=txt, images=edit_image, padding=True, return_tensors="pt").to(pipe.device)
         embedding_layers = pipe.text_encoder.model.language_model.get_input_embeddings()
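For context on the hunk above: the template is an ordinary Python format string whose `{}` slot receives the user's edit instruction, and `drop_idx = 64` presumably trims the fixed template prefix from the encoder output. A standalone sketch of just the formatting step, with an invented instruction and no pipeline objects:

```python
# Sketch only: the template-formatting step from the diff above, in isolation.
# The edit instruction is invented for illustration.
template = (
    "<|im_start|>system\n"
    "Describe the key features of the input image (color, shape, size, texture, "
    "objects, background), then explain how the user's text instruction should "
    "alter or modify the image. Generate a new image that meets the user's "
    "requirements while maintaining consistency with the original input where "
    "appropriate.<|im_end|>\n"
    "<|im_start|>user\n"
    "<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n"
    "<|im_start|>assistant\n"
)
txt = template.format("Replace the red car with a blue bicycle.")
print(txt)  # the {} slot now holds the edit instruction
```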
@@ -194,10 +203,15 @@ def encode_prompt_edit(self, pipe: QwenImagePipeline, prompt, edit_image, is_fin
 
 ## 📖 Citation
 
-```
-
+```bibtex
+@article{dai2026endocot,
+  title={EndoCoT: Scaling Endogenous Chain-of-Thought Reasoning in Diffusion Models},
+  author={Dai, Xuanlang and Zhou, Yujie and Xing, Long and Bu, Jiazi and Wei, Xilin and Liu, Yuhong and Zhang, Beichen and Chen, Kai and Zang, Yuhang},
+  journal={arXiv preprint arXiv:2603.12252},
+  year={2026}
+}
 ```
 
 ## ⚖️ License
 
-
+
|