Jiaqi-hkust committed
Commit dfe5589 · verified · 1 Parent(s): 16db65e

Upload folder using huggingface_hub

Files changed (2):
  1. app.py +142 -221
  2. requirements.txt +1 -1
app.py CHANGED
@@ -3,8 +3,11 @@ import os
  import torch
  from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
  from qwen_vl_utils import process_vision_info
+ import copy

- # Import the spaces module for GPU detection
+ # ==========================================
+ # 1. Environment & Detection Setup
+ # ==========================================
  is_spaces = os.getenv("SPACE_ID") is not None
  spaces_available = False
  GPU = None
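
Aside: this hunk only shows the flags; the guarded import that sets them sits between the hunks, and the next hunk shows just its except branch. It presumably looks like the sketch below. The success-path details are assumptions not shown in the diff, though spaces.GPU is the real ZeroGPU decorator that gpu_decorator applies:

if is_spaces:
    try:
        import spaces                # Hugging Face ZeroGPU helper (assumed import site)
        GPU = spaces.GPU             # decorator that requests a GPU for the wrapped call
        spaces_available = True
    except ImportError:
        print("⚠️ spaces module not available, GPU detection may not work")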
@@ -16,39 +19,33 @@ if is_spaces:
      except ImportError:
          print("⚠️ spaces module not available, GPU detection may not work")

- # Create a conditional decorator
  def gpu_decorator(func):
      """Apply the GPU decorator conditionally"""
      if spaces_available and GPU is not None:
          return GPU(func)
      return func

+ # ==========================================
+ # 2. Constants & Configuration
+ # ==========================================
+ MODEL_PATH = os.getenv("MODEL_PATH", "Jiaqi-hkust/Robust-R1-RL")
+ PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))
+
  # System prompt
- sys_prompt = """First output the types of degradations in image briefly in <TYPE> <TYPE_END> tags,
+ SYS_PROMPT = """First output the types of degradations in image briefly in <TYPE> <TYPE_END> tags,
  and then output what effects do these degradation have on the image in <INFLUENCE> <INFLUENCE_END> tags,
  then based on the strength of degradation, output an APPROPRIATE length for the reasoning process in <REASONING> <REASONING_END> tags,
  and then summarize the content of reasoning and the give the answer in <CONCLUSION> <CONCLUSION_END> tags,
  provides the user with the answer briefly in <ANSWER> <ANSWER_END>."""

- project_dir = os.path.dirname(os.path.abspath(__file__))
-
- if not is_spaces:
-     temp_dir = os.path.join(project_dir, ".gradio_temp")
-     os.makedirs(temp_dir, exist_ok=True)
-     os.environ["GRADIO_TEMP_DIR"] = temp_dir
-
- MODEL_PATH = os.getenv("MODEL_PATH", "Jiaqi-hkust/Robust-R1-RL")
-
- # Define CSS (moved to module scope for easier management)
+ # CSS styles
  CUSTOM_CSS = """
  .gradio-container { font-family: 'Inter', sans-serif; }
- #chatbot { height: 650px !important; overflow-y: auto; }
  """

- print(f"==========================================")
- print(f"Initializing application (Gradio {gr.__version__})...")
- print(f"==========================================")
-
+ # ==========================================
+ # 3. Model Handler
+ # ==========================================
  class ModelHandler:
      def __init__(self, model_path):
          self.model_path = model_path
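
For context: SYS_PROMPT (unchanged here apart from the rename) defines a tag protocol, with <TYPE>, <INFLUENCE>, <REASONING>, <CONCLUSION>, and <ANSWER> sections each closed by a matching *_END tag. The commit never parses these tags; a hypothetical helper for pulling one section out of the model output could look like this sketch:

import re

def extract_section(text: str, tag: str) -> str | None:
    """Hypothetical helper (not part of the commit): return the text between
    <TAG> and <TAG_END>, e.g. extract_section(output, "ANSWER")."""
    match = re.search(rf"<{tag}>(.*?)<{tag}_END>", text, re.DOTALL)
    return match.group(1).strip() if match else None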
@@ -58,8 +55,7 @@ class ModelHandler:

      def _load_model(self):
          try:
-             print(f"⏳ Loading model weights, this may take a few minutes...")
-
+             print(f"⏳ Loading model weights from {self.model_path}...")
              self.processor = AutoProcessor.from_pretrained(self.model_path)

              # Auto-detect Flash Attention support
@@ -69,8 +65,6 @@
              if device_capability[0] >= 8:
                  use_flash_attention = True
                  print(f"🔧 CUDA available with Ampere+, utilizing Flash Attention 2")
-             else:
-                 print(f"🔧 Using CPU or non-CUDA device")

              self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                  self.model_path,
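
Aside: the from_pretrained call is truncated at the hunk boundary. Given the use_flash_attention flag computed above, the remaining keyword arguments presumably select the attention backend along these lines; torch_dtype and device_map are assumptions, not shown in the diff:

# Sketch of the truncated load call (kwargs beyond model_path are assumed)
self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    self.model_path,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,  # assumed
    attn_implementation="flash_attention_2" if use_flash_attention else "sdpa",  # assumed
    device_map="auto",  # assumed
)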
@@ -84,234 +78,161 @@ class ModelHandler:
              print(f"❌ Model loading failed: {e}")
              raise e

-     def predict(self, messages, temperature, max_tokens):
-         # Deep-copy the messages to avoid mutating the UI history
-         import copy
-         messages_payload = copy.deepcopy(messages)
-
-         # Append the system prompt
-         if messages_payload and messages_payload[-1]["role"] == "user":
-             content = messages_payload[-1]["content"]
-             sys_prompt_fmt = "\n" + " ".join(sys_prompt.split())
-
-             if isinstance(content, list):
-                 text_found = False
-                 for item in content:
-                     if item.get("type") == "text":
-                         item["text"] += sys_prompt_fmt
-                         text_found = True
-                         break
-                 if not text_found:
-                     content.append({"type": "text", "text": sys_prompt_fmt})
-             elif isinstance(content, str):
-                 messages_payload[-1]["content"] += sys_prompt_fmt
-
-         text_prompt = self.processor.apply_chat_template(
-             messages_payload, tokenize=False, add_generation_prompt=True
-         )
-         image_inputs, video_inputs = process_vision_info(messages_payload)
-
-         inputs = self.processor(
-             text=[text_prompt],
-             images=image_inputs,
-             videos=video_inputs,
-             padding=True,
-             return_tensors="pt"
-         )
-
-         inputs = inputs.to(self.model.device)
-
-         generation_kwargs = dict(
-             **inputs,
-             max_new_tokens=max_tokens,
-             temperature=temperature,
-             do_sample=True if temperature > 0 else False,
-         )
-
-         try:
-             print("Starting model generation...")
-             with torch.no_grad():
-                 generated_ids = self.model.generate(**generation_kwargs)
-
-             input_length = inputs['input_ids'].shape[1]
-             generated_ids = generated_ids[0][input_length:]
-
-             generated_text = self.processor.tokenizer.decode(
-                 generated_ids,
-                 skip_special_tokens=True
-             )
-             print(f"Generated text: {generated_text}")
-             if generated_text:
-                 yield generated_text
-             else:
-                 yield "⚠️ No output generated."
-
-         except Exception as e:
-             import traceback
-             error_details = traceback.format_exc()
-             print(f"Error in model.generate: {error_details}")
-             yield f"❌ Generation error: {str(e)}"
-             return
-
  model_handler = None

  def get_model_handler():
-     """Get model handler with lazy loading"""
+     """Lazily load the model handler"""
      global model_handler
      if model_handler is None:
-         print("🔄 Initializing model handler...")
          model_handler = ModelHandler(MODEL_PATH)
      return model_handler

+ # ==========================================
+ # 4. Chat Generation Function
+ # ==========================================
  @gpu_decorator
- def respond(user_msg, history, temp, tokens):
+ def respond(message, history, temperature, max_tokens):
      """
-     Response function for a Chatbot with type="messages"
+     Generation function matching the gr.ChatInterface contract
+     message: dict (when multimodal=True) -> {'text': str, 'files': list}
+     history: list of dicts -> OpenAI-style history
      """
+     handler = get_model_handler()

-     # 1. Build the current user's message content
-     user_content = []
+     # 1. Build the current user message (converted to OpenAI/Qwen format)
+     # message['files'] holds a list of file paths
+     current_user_content = []

-     files = user_msg.get("files", [])
-     for f in files:
-         user_content.append({"type": "image", "image": f})
+     # Handle images
+     if message.get("files"):
+         for file_path in message["files"]:
+             current_user_content.append({"type": "image", "image": file_path})
+
+     # Handle text
+     user_text = message.get("text", "")
+     if user_text:
+         current_user_content.append({"type": "text", "text": user_text})

-     text = user_msg.get("text", "")
-     if text:
-         user_content.append({"type": "text", "text": text})
-
-     if not user_content:
-         yield history, gr.MultimodalTextbox(value=None, interactive=True)
-         return
-
-     # 2. Append the user message to the history
-     history.append({
-         "role": "user",
-         "content": user_content
-     })
+     # 2. Build the full conversation (history + current message)
+     # Note: ChatInterface's history holds the earlier turns, not the current one
+     conversation = copy.deepcopy(history)
+     conversation.append({"role": "user", "content": current_user_content})
+
+     # 3. Inject the system prompt (appended to the last user message's text)
+     # Keeps the original logic: concatenate the prompt onto the last message
+     last_content = conversation[-1]["content"]
+     sys_prompt_fmt = "\n" + " ".join(SYS_PROMPT.split())

-     # Update the UI immediately
-     yield history, gr.MultimodalTextbox(value=None, interactive=False)
-
-     # 3. Call the model
+     text_injected = False
+     for item in last_content:
+         if item.get("type") == "text":
+             item["text"] += sys_prompt_fmt
+             text_injected = True
+             break
+     if not text_injected:
+         last_content.append({"type": "text", "text": sys_prompt_fmt})
+
+     # 4. Preprocess the inputs
+     text_prompt = handler.processor.apply_chat_template(
+         conversation, tokenize=False, add_generation_prompt=True
+     )
+     image_inputs, video_inputs = process_vision_info(conversation)
+
+     inputs = handler.processor(
+         text=[text_prompt],
+         images=image_inputs,
+         videos=video_inputs,
+         padding=True,
+         return_tensors="pt"
+     )
+     inputs = inputs.to(handler.model.device)
+
+     # 5. Generation parameters
+     generation_kwargs = dict(
+         **inputs,
+         max_new_tokens=max_tokens,
+         temperature=temperature,
+         do_sample=True if temperature > 0 else False,
+     )
+
+     # 6. Streamed generation (yielding the response)
      try:
-         handler = get_model_handler()
+         input_length = inputs['input_ids'].shape[1]
+         # Note: for simplicity this calls the non-streaming generate and simulates streaming
+         # True token-level streaming would require TextIteratorStreamer,
+         # but the full result is fetched first to keep the original logic stable

-         history.append({"role": "assistant", "content": ""})
+         with torch.no_grad():
+             generated_ids = handler.model.generate(**generation_kwargs)

-         full_response = ""
-         # Pass history[:-1] so the empty assistant message doesn't break the template
-         for chunk in handler.predict(history[:-1], temp, tokens):
-             full_response += chunk
-             history[-1]["content"] = full_response
-             yield history, gr.MultimodalTextbox(interactive=False)
+         generated_ids = generated_ids[0][input_length:]
+         generated_text = handler.processor.tokenizer.decode(
+             generated_ids,
+             skip_special_tokens=True
+         )
+
+         # Simple simulated streaming (or just return the text directly)
+         yield generated_text

      except Exception as e:
          import traceback
          traceback.print_exc()
-         # If the error hit before the assistant message was added, append one
-         if not history or history[-1].get("role") != "assistant":
-             history.append({"role": "assistant", "content": ""})
-         history[-1]["content"] = f"❌ Error: {str(e)}"
-         yield history, gr.MultimodalTextbox(interactive=True)
-
-     # Re-enable the input box
-     yield history, gr.MultimodalTextbox(interactive=True)
-
- def create_chat_ui():
-     # [Fix 1]: do not pass the css argument here
-     with gr.Blocks(title="Robust-R1") as demo:
-
-         with gr.Row():
-             gr.Markdown("# 🤖 Robust-R1: Degradation-Aware Reasoning")
-
-         with gr.Row():
-             with gr.Column(scale=4):
-                 # Chatbot set to type="messages"
-                 chatbot = gr.Chatbot(
-                     elem_id="chatbot",
-                     label="Chat",
-                     avatar_images=(None, "https://api.dicebear.com/7.x/bottts/svg?seed=Qwen"),
-                     height=650,
-                     type="messages"
-                 )
-
-                 chat_input = gr.MultimodalTextbox(
-                     interactive=True,
-                     file_types=["image"],
-                     placeholder="Enter your question or upload an image...",
-                     show_label=False
-                 )
-
-             with gr.Column(scale=1):
-                 with gr.Group():
-                     gr.Markdown("### ⚙️ Generation Config")
-                     temperature = gr.Slider(
-                         minimum=0.01, maximum=1.0, value=0.6, step=0.05,
-                         label="Temperature"
-                     )
-                     max_tokens = gr.Slider(
-                         minimum=128, maximum=4096, value=1024, step=128,
-                         label="Max New Tokens"
-                     )
-
-                     clear_btn = gr.Button("🗑️ Clear Context", variant="stop")
-
-                 gr.Markdown("---")
-                 gr.Markdown("### 📚 Examples")
-
-                 example_images_dir = os.path.join(project_dir, "assets")
-
-                 examples_config = [
-                     ("What type of vehicles are the people riding?\n0. trucks\n1. wagons\n2. jeeps\n3. cars\n", os.path.join(example_images_dir, "1.jpg")),
-                     ("What is the giant fish in the air?\n0. blimp\n1. balloon\n2. kite\n3. sculpture\n", os.path.join(example_images_dir, "2.jpg")),
-                 ]
-
-                 example_data = []
-                 for text, img_path in examples_config:
-                     if os.path.exists(img_path):
-                         example_data.append({"text": text, "files": [img_path]})
-
-                 if example_data:
-                     gr.Examples(
-                         examples=example_data,
-                         inputs=chat_input,
-                         label="",
-                         examples_per_page=3
-                     )
-                 else:
-                     gr.Markdown("*No example images available.*")
-
-         chat_input.submit(
-             respond,
-             inputs=[chat_input, chatbot, temperature, max_tokens],
-             outputs=[chatbot, chat_input]
-         )
-
-         clear_btn.click(lambda: ([], None), outputs=[chatbot, chat_input])
-
-     return demo
-
+         yield f"❌ Generation error: {str(e)}"
+
+ # ==========================================
+ # 5. Build the UI (ChatInterface)
+ # ==========================================
+
+ # Prepare the examples data
+ example_images_dir = os.path.join(PROJECT_DIR, "assets")
+ examples_data = []
+
+ # Define the example sources
+ raw_examples = [
+     ("What type of vehicles are the people riding?\n0. trucks\n1. wagons\n2. jeeps\n3. cars\n", "1.jpg"),
+     ("What is the giant fish in the air?\n0. blimp\n1. balloon\n2. kite\n3. sculpture\n", "2.jpg"),
+ ]
+
+ for text, filename in raw_examples:
+     path = os.path.join(example_images_dir, filename)
+     # ChatInterface multimodal example format: {"text": str, "files": [list]}
+     if os.path.exists(path):
+         examples_data.append({"text": text, "files": [path]})
+
+ # Define the additional input components (generation config)
+ additional_inputs = [
+     gr.Slider(minimum=0.01, maximum=1.0, value=0.6, step=0.05, label="Temperature"),
+     gr.Slider(minimum=128, maximum=4096, value=1024, step=128, label="Max New Tokens"),
+ ]
+
+ # Create the interface
+ demo = gr.ChatInterface(
+     fn=respond,
+     type="messages",  # use standard OpenAI-format history
+     multimodal=True,  # enable multimodal uploads
+     title="🤖 Robust-R1: Degradation-Aware Reasoning",
+     description="Upload an image and ask questions. The model considers image degradations during reasoning.",
+     additional_inputs=additional_inputs,  # config sliders
+     additional_inputs_accordion=gr.Accordion(label="⚙️ Generation Config", open=True),  # config panel
+     examples=examples_data,  # example prompts
+     cache_examples=False,  # enable or disable as needed
+     theme=gr.themes.Soft(),
+     css=CUSTOM_CSS
+ )
+
+ # ==========================================
+ # 6. Launch
+ # ==========================================
  if __name__ == "__main__":
-     demo = create_chat_ui()
-
      if is_spaces:
          print(f"🚀 Running on Hugging Face Spaces: {os.getenv('SPACE_ID')}")
-         # [Fix 2]: pass the CSS in launch
          demo.launch(
-             theme=gr.themes.Soft(),
-             css=CUSTOM_CSS,
-             show_error=True,
-             allowed_paths=[project_dir] if project_dir else None
+             allowed_paths=[PROJECT_DIR]  # allow access to local image assets
          )
      else:
-         print(f"🚀 Service is starting, please visit: http://localhost:7860")
+         print(f"🚀 Service is starting at http://localhost:7860")
          demo.launch(
-             theme=gr.themes.Soft(),
-             css=CUSTOM_CSS,
              server_name="0.0.0.0",
              server_port=7860,
-             share=False,
-             show_error=True,
-             allowed_paths=[project_dir]
+             allowed_paths=[PROJECT_DIR]
          )
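
Note on streaming: the rewritten respond yields the finished text in a single chunk, and its comments point at TextIteratorStreamer for true token-level streaming. A minimal sketch of that variant, assuming the same handler and generation_kwargs as in the new code:

from threading import Thread
from transformers import TextIteratorStreamer

def stream_tokens(handler, generation_kwargs):
    """Sketch: token-level streaming variant of the generation step."""
    streamer = TextIteratorStreamer(
        handler.processor.tokenizer,
        skip_prompt=True,            # do not re-emit the prompt tokens
        skip_special_tokens=True,
    )
    generation_kwargs["streamer"] = streamer
    # generate() blocks, so run it on a worker thread and drain the streamer
    thread = Thread(target=handler.model.generate, kwargs=generation_kwargs)
    thread.start()
    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial                # ChatInterface re-renders the growing message
    thread.join()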
 
requirements.txt CHANGED
@@ -1,4 +1,4 @@
- gradio>=6.0.0
+ gradio>=6.1.0
  torch>=2.0.0
  torchvision>=0.15.0
  transformers>=4.37.0