Jiaqi-hkust committed on
Commit 7f7273d · verified · 1 Parent(s): b4fe2c0

Upload folder using huggingface_hub

Files changed (1)
  1. app.py +250 -169
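Per the commit message, the folder was pushed with huggingface_hub. A minimal sketch of such an upload — the folder path and Space id below are illustrative assumptions, not values taken from this commit:

    from huggingface_hub import HfApi

    api = HfApi()  # assumes you are authenticated, e.g. via `huggingface-cli login`
    api.upload_folder(
        folder_path=".",                  # hypothetical local folder containing app.py
        repo_id="Jiaqi-hkust/Robust-R1",  # hypothetical Space id
        repo_type="space",
        commit_message="Upload folder using huggingface_hub",
    )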
app.py CHANGED
@@ -3,46 +3,28 @@ import os
  import torch
  from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
  from qwen_vl_utils import process_vision_info
- import copy

- # ==========================================
- # 1. Environment & device detection setup
- # ==========================================
- is_spaces = os.getenv("SPACE_ID") is not None
- spaces_available = False
- GPU = None
-
- if is_spaces:
-     try:
-         from spaces import GPU
-         spaces_available = True
-     except ImportError:
-         print("⚠️ spaces module not available, GPU detection may not work")
-
- def gpu_decorator(func):
-     if spaces_available and GPU is not None:
-         return GPU(func)
-     return func
-
- # ==========================================
- # 2. Constants & configuration
- # ==========================================
- MODEL_PATH = os.getenv("MODEL_PATH", "Jiaqi-hkust/Robust-R1-RL")
- PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))
-
- SYS_PROMPT = """First output the types of degradations in the image briefly in <TYPE> <TYPE_END> tags,
  and then output what effects these degradations have on the image in <INFLUENCE> <INFLUENCE_END> tags,
  then based on the strength of degradation, output an APPROPRIATE length for the reasoning process in <REASONING> <REASONING_END> tags,
  and then summarize the content of the reasoning and give the answer in <CONCLUSION> <CONCLUSION_END> tags,
  provide the user with the answer briefly in <ANSWER> <ANSWER_END>."""

- CUSTOM_CSS = """
- .gradio-container { font-family: 'Inter', sans-serif; }
- """

- # ==========================================
- # 3. Model handler class
- # ==========================================
  class ModelHandler:
      def __init__(self, model_path):
          self.model_path = model_path
@@ -52,21 +34,24 @@ class ModelHandler:

      def _load_model(self):
          try:
-             print(f"⏳ Loading model weights from {self.model_path}...")
              self.processor = AutoProcessor.from_pretrained(self.model_path)

-             use_flash_attention = False
              if torch.cuda.is_available():
                  device_capability = torch.cuda.get_device_capability()
-                 if device_capability[0] >= 8:
-                     use_flash_attention = True
-                     print(f"🔧 CUDA available with Ampere+, utilizing Flash Attention 2")

              self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                  self.model_path,
                  torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
                  device_map="auto",
-                 attn_implementation="flash_attention_2" if use_flash_attention else "sdpa",
                  trust_remote_code=True
              )
              print("✅ Model loaded successfully!")
@@ -74,155 +59,251 @@ class ModelHandler:
              print(f"❌ Model loading failed: {e}")
              raise e

  model_handler = None

  def get_model_handler():
      global model_handler
      if model_handler is None:
          model_handler = ModelHandler(MODEL_PATH)
      return model_handler

- # ==========================================
- # 4. Chat generation function
- # ==========================================
- @gpu_decorator
- def respond(message, history, temperature, max_tokens):
      """
-     message: dict -> {'text': str, 'files': list}
-     history: list of dicts -> OpenAI-style history
-     """
-     handler = get_model_handler()
-
-     # 1. Convert the current message
-     current_user_content = []
-     if message.get("files"):
-         for file_path in message["files"]:
-             current_user_content.append({"type": "image", "image": file_path})
-
-     user_text = message.get("text", "")
-     if user_text:
-         current_user_content.append({"type": "text", "text": user_text})

-     # 2. Build the full conversation (history + current message)
-     conversation = copy.deepcopy(history)
-     conversation.append({"role": "user", "content": current_user_content})

-     # 3. Inject the system prompt
-     last_content = conversation[-1]["content"]
-     sys_prompt_fmt = "\n" + " ".join(SYS_PROMPT.split())
-
-     text_injected = False
-     for item in last_content:
-         if item.get("type") == "text":
-             item["text"] += sys_prompt_fmt
-             text_injected = True
-             break
-     if not text_injected:
-         last_content.append({"type": "text", "text": sys_prompt_fmt})
-
-     # 4. Inference
-     text_prompt = handler.processor.apply_chat_template(
-         conversation, tokenize=False, add_generation_prompt=True
-     )
-     image_inputs, video_inputs = process_vision_info(conversation)
-
-     inputs = handler.processor(
-         text=[text_prompt],
-         images=image_inputs,
-         videos=video_inputs,
-         padding=True,
-         return_tensors="pt"
-     )
-     inputs = inputs.to(handler.model.device)
-
-     generation_kwargs = dict(
-         **inputs,
-         max_new_tokens=max_tokens,
-         temperature=temperature,
-         do_sample=True if temperature > 0 else False,
-     )
-
-     try:
-         with torch.no_grad():
-             generated_ids = handler.model.generate(**generation_kwargs)

-         input_length = inputs['input_ids'].shape[1]
-         generated_ids = generated_ids[0][input_length:]
-         generated_text = handler.processor.tokenizer.decode(
-             generated_ids,
-             skip_special_tokens=True
-         )

-         yield generated_text

-     except Exception as e:
-         import traceback
-         traceback.print_exc()
-         yield f"❌ Error: {str(e)}"
-
- # ==========================================
- # 5. Build the UI
- # ==========================================
-
- # [Key fix]: the Examples format must include the values of the Additional Inputs
- example_images_dir = os.path.join(PROJECT_DIR, "assets")
- examples_data = []
-
- if os.path.exists(example_images_dir):
-     raw_examples = [
-         ("What type of vehicles are the people riding?\n0. trucks\n1. wagons\n2. jeeps\n3. cars\n", "1.jpg"),
-         ("What is the giant fish in the air?\n0. blimp\n1. balloon\n2. kite\n3. sculpture\n", "2.jpg"),
-     ]
-     for text, filename in raw_examples:
-         path = os.path.join(example_images_dir, filename)
-         if os.path.exists(path):
-             # The format must be: [MessageDict, TemperatureValue, MaxTokensValue]
-             examples_data.append([
-                 {"text": text, "files": [path]},  # 1. message object
-                 0.6,   # 2. Temperature (maps to additional_inputs[0])
-                 1024   # 3. Max Tokens (maps to additional_inputs[1])
-             ])
-
- # Define the additional inputs
- additional_inputs = [
-     gr.Slider(minimum=0.01, maximum=1.0, value=0.6, step=0.05, label="Temperature"),
-     gr.Slider(minimum=128, maximum=4096, value=1024, step=128, label="Max New Tokens"),
- ]
-
- # Custom Chatbot component
- chatbot_component = gr.Chatbot(
-     label="Robust-R1 Chat",
-     avatar_images=(None, "https://api.dicebear.com/7.x/bottts/svg?seed=Qwen"),
-     height=650
- )
-
- # ChatInterface
- demo = gr.ChatInterface(
-     fn=respond,
-     chatbot=chatbot_component,
-     multimodal=True,
-     title="🤖 Robust-R1: Degradation-Aware Reasoning",
-     description="Upload an image and ask questions.",
-     additional_inputs=additional_inputs,
-     additional_inputs_accordion=gr.Accordion(label="⚙️ Generation Config", open=True),
-     examples=examples_data,  # the format here is now [[msg, 0.6, 1024], ...]
-     cache_examples=False
- )

  if __name__ == "__main__":
-     launch_kwargs = {
-         "theme": gr.themes.Soft(),
-         "css": CUSTOM_CSS,
-         "allowed_paths": [PROJECT_DIR]
-     }

      if is_spaces:
-         print(f"🚀 Running on Hugging Face Spaces")
-         demo.launch(**launch_kwargs)
      else:
-         print(f"🚀 Running locally")
          demo.launch(
              server_name="0.0.0.0",
              server_port=7860,
-             **launch_kwargs
-         )
  import torch
  from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
  from qwen_vl_utils import process_vision_info
+ import html

+ sys_prompt = """First output the types of degradations in the image briefly in <TYPE> <TYPE_END> tags,
  and then output what effects these degradations have on the image in <INFLUENCE> <INFLUENCE_END> tags,
  then based on the strength of degradation, output an APPROPRIATE length for the reasoning process in <REASONING> <REASONING_END> tags,
  and then summarize the content of the reasoning and give the answer in <CONCLUSION> <CONCLUSION_END> tags,
  provide the user with the answer briefly in <ANSWER> <ANSWER_END>."""

+ project_dir = os.path.dirname(os.path.abspath(__file__))
+
+ is_spaces = os.getenv("SPACE_ID") is not None
+ if not is_spaces:
+     temp_dir = os.path.join(project_dir, ".gradio_temp")
+     os.makedirs(temp_dir, exist_ok=True)
+     os.environ["GRADIO_TEMP_DIR"] = temp_dir
+
+ MODEL_PATH = os.getenv("MODEL_PATH", "Jiaqi-hkust/Robust-R1-RL")
+
+ print(f"==========================================")
+ print(f"Initializing application...")
+ print(f"==========================================")

  class ModelHandler:
      def __init__(self, model_path):
          self.model_path = model_path

      def _load_model(self):
          try:
+             print(f"⏳ Loading model weights, this may take a few minutes...")
+
              self.processor = AutoProcessor.from_pretrained(self.model_path)

              if torch.cuda.is_available():
                  device_capability = torch.cuda.get_device_capability()
+                 use_flash_attention = device_capability[0] >= 8
+                 print(f"🔧 CUDA available, device capability: {device_capability}")
+             else:
+                 use_flash_attention = False
+                 print(f"🔧 Using CPU or non-CUDA device")

              self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                  self.model_path,
                  torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
                  device_map="auto",
+                 # attn_implementation="flash_attention_2" if use_flash_attention else "eager",
+                 attn_implementation="sdpa",
                  trust_remote_code=True
              )
              print("✅ Model loaded successfully!")

              print(f"❌ Model loading failed: {e}")
              raise e

+     def predict(self, message_dict, history, temperature, max_tokens):
+         text = message_dict.get("text", "")
+         files = message_dict.get("files", [])
+
+         messages = []
+
+         if history:
+             print(f"Processing {len(history)} previous messages from history")
+             for msg in history:
+                 role = msg.get("role", "")
+                 content = msg.get("content", "")
+
+                 if role == "user":
+                     user_content = []
+
+                     if isinstance(content, list):
+                         for item in content:
+                             if isinstance(item, str):
+                                 if os.path.exists(item) or any(item.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']):
+                                     user_content.append({"type": "image", "image": item})
+                                 else:
+                                     user_content.append({"type": "text", "text": item})
+                             elif isinstance(item, dict):
+                                 user_content.append(item)
+                     elif isinstance(content, str):
+                         if content:
+                             user_content.append({"type": "text", "text": content})
+
+                     if user_content:
+                         messages.append({"role": "user", "content": user_content})
+
+                 elif role == "assistant":
+                     if isinstance(content, str) and content:
+                         messages.append({"role": "assistant", "content": content})
+
+         current_content = []
+         if files:
+             for file_path in files:
+                 current_content.append({"type": "image", "image": file_path})
+
+         if text:
+             sys_prompt_formatted = " ".join(sys_prompt.split())
+             full_text = f"{text}\n{sys_prompt_formatted}"
+             current_content.append({"type": "text", "text": full_text})
+
+         if current_content:
+             messages.append({"role": "user", "content": current_content})
+
+         print(f"Total messages for model: {len(messages)}")
+         print(f"Message roles: {[m['role'] for m in messages]}")
+
+         text_prompt = self.processor.apply_chat_template(
+             messages, tokenize=False, add_generation_prompt=True
+         )
+
+         image_inputs, video_inputs = process_vision_info(messages)
+
+         inputs = self.processor(
+             text=[text_prompt],
+             images=image_inputs,
+             videos=video_inputs,
+             padding=True,
+             return_tensors="pt"
+         )
+
+         inputs = inputs.to(self.model.device)
+
+         generation_kwargs = dict(
+             **inputs,
+             max_new_tokens=max_tokens,
+             temperature=temperature,
+             do_sample=True if temperature > 0 else False,
+         )
+
+         try:
+             print("Starting model generation...")
+             with torch.no_grad():
+                 generated_ids = self.model.generate(**generation_kwargs)
+
+             input_length = inputs['input_ids'].shape[1]
+             generated_ids = generated_ids[0][input_length:]
+
+             print(f"Input length: {input_length}, Generated token count: {len(generated_ids)}")
+
+             generated_text = self.processor.tokenizer.decode(
+                 generated_ids,
+                 skip_special_tokens=True
+             )
+
+             print(f"Generation completed. Output length: {len(generated_text)}, Content preview: {repr(generated_text[:200])}")
+
+             if generated_text and generated_text.strip():
+                 print(f"Yielding generated text: {generated_text[:100]}...")
+                 yield generated_text
+             else:
+                 warning_msg = "⚠️ No output generated. The model may not have produced any response."
+                 print(warning_msg)
+                 yield warning_msg
+
+         except Exception as e:
+             import traceback
+             error_details = traceback.format_exc()
+             print(f"Error in model.generate: {error_details}")
+             yield f"❌ Generation error: {str(e)}"
+             return
+
  model_handler = None

  def get_model_handler():
+     """Get model handler with lazy loading"""
      global model_handler
      if model_handler is None:
+         print("🔄 Initializing model handler...")
          model_handler = ModelHandler(MODEL_PATH)
      return model_handler

+ def create_chat_ui():
+     custom_css = """
+     .gradio-container { font-family: 'Inter', sans-serif; }
+     #chatbot { height: 650px !important; overflow-y: auto; }
      """
+
+     with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Robust-R1") as demo:
+
+         with gr.Row():
+             gr.Markdown("# 🤖 Robust-R1: Degradation-Aware Reasoning for Robust Visual Understanding")
+
+         with gr.Row():
+             with gr.Column(scale=4):
+                 chatbot = gr.Chatbot(
+                     elem_id="chatbot",
+                     label="Chat",
+                     type="messages",
+                     avatar_images=(None, "https://api.dicebear.com/7.x/bottts/svg?seed=Qwen"),
+                     height=650
+                 )
+
+                 chat_input = gr.MultimodalTextbox(
+                     interactive=True,
+                     file_types=["image"],
+                     placeholder="Enter your question or upload an image...",
+                     show_label=False
+                 )
+
+             with gr.Column(scale=1):
+                 with gr.Group():
+                     gr.Markdown("### ⚙️ Generation Config")
+                     temperature = gr.Slider(
+                         minimum=0.01, maximum=1.0, value=0.6, step=0.05,
+                         label="Temperature"
+                     )
+                     max_tokens = gr.Slider(
+                         minimum=128, maximum=4096, value=1024, step=128,
+                         label="Max New Tokens"
+                     )
+
+                 clear_btn = gr.Button("🗑️ Clear Context", variant="stop")
+
+                 gr.Markdown("---")
+                 gr.Markdown("### 📚 Examples")
+                 gr.Markdown("Click the examples below to quickly fill the input box and start a conversation")
+
+                 example_images_dir = os.path.join(project_dir, "assets")
+
+                 examples_config = [
+                     ("What type of vehicles are the people riding?\n0. trucks\n1. wagons\n2. jeeps\n3. cars\n", os.path.join(example_images_dir, "1.jpg")),
+                     ("What is the giant fish in the air?\n0. blimp\n1. balloon\n2. kite\n3. sculpture\n", os.path.join(example_images_dir, "2.jpg")),
+                 ]
+
+                 example_data = []
+                 for text, img_path in examples_config:
+                     if os.path.exists(img_path):
+                         example_data.append({"text": text, "files": [img_path]})
+
+                 if example_data:
+                     gr.Examples(
+                         examples=example_data,
+                         inputs=chat_input,
+                         label="",
+                         examples_per_page=3
+                     )
+                 else:
+                     gr.Markdown("*No example images available, please upload images manually for testing*")
+
+         async def respond(user_msg, history, temp, tokens):
+             text = user_msg.get("text", "").strip()
+             files = user_msg.get("files", [])
+             user_content = list(files)
+             if text: user_content.append(text)
+
+             if not files and text: user_message = {"role": "user", "content": text}
+             else: user_message = {"role": "user", "content": user_content}
+
+             history.append(user_message)
+             yield history, gr.MultimodalTextbox(value=None, interactive=False)
+
+             history.append({"role": "assistant", "content": ""})
+
+             try:
+                 previous_history = history[:-2] if len(history) >= 2 else []
+
+                 handler = get_model_handler()
+                 generated_text = ""
+                 for chunk in handler.predict(user_msg, previous_history, temp, tokens):
+                     generated_text = chunk
+
+                     safe_text = generated_text.replace("<", "&lt;").replace(">", "&gt;")
+
+                     history[-1]["content"] = safe_text
+                     yield history, gr.MultimodalTextbox(interactive=False)
+
+             except Exception as e:
+                 import traceback
+                 traceback.print_exc()
+                 history[-1]["content"] = f"❌ Inference error: {str(e)}"
+                 yield history, gr.MultimodalTextbox(interactive=True)
+
+             yield history, gr.MultimodalTextbox(value=None, interactive=True)
+
+         chat_input.submit(
+             respond,
+             inputs=[chat_input, chatbot, temperature, max_tokens],
+             outputs=[chatbot, chat_input]
+         )
+
+         def clear_history(): return [], None
+         clear_btn.click(clear_history, outputs=[chatbot, chat_input])
+
+     return demo

  if __name__ == "__main__":
+     demo = create_chat_ui()

      if is_spaces:
+         print(f"🚀 Running on Hugging Face Spaces: {os.getenv('SPACE_ID')}")
+         demo.launch(
+             show_error=True,
+             allowed_paths=[project_dir] if project_dir else None
+         )
      else:
+         print(f"🚀 Service is starting, please visit: http://localhost:7860")
          demo.launch(
              server_name="0.0.0.0",
              server_port=7860,
+             share=False,
+             show_error=True,
+             allowed_paths=[project_dir]
+         )
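
A side note on the new respond() handler: the file adds "import html", yet the handler escapes angle brackets by hand so that the model's <TYPE>-style tags render literally in the Chatbot instead of being parsed as HTML. A roughly equivalent stdlib call, shown here only as a sketch (note that html.escape also escapes "&", which the manual replace does not):

    import html

    raw = "<TYPE> motion blur <TYPE_END>"
    manual = raw.replace("<", "&lt;").replace(">", "&gt;")  # as in respond()
    escaped = html.escape(raw, quote=False)                 # stdlib; additionally escapes "&"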