Spaces:

VOIDER
/

VisualQuality-R1-7B

Build error

App Files Files Community

VOIDER committed on Jan 7

Commit

1c355ce

verified ·

1 Parent(s): 6d24361

Update app.py

Browse files

Files changed (1) hide show

app.py +285 -186

app.py CHANGED Viewed

@@ -1,200 +1,299 @@
-import os
-import sys
-import subprocess
-# --- LIBRARY CHECK AND INSTALLATION ---
-# Try to import llama-cpp-python; if the import fails, pip-install it and
-# import again.
-try:
-    from llama_cpp import Llama, LlamaChatCompletionHandler
-    print("Библиотека llama-cpp-python найдена.")
-except ImportError:
-    print("Установка llama-cpp-python (CPU)...")
-    # Force-install 0.3.16 or newer, using the CPU wheel index as an extra index
-    subprocess.check_call([
-        sys.executable, "-m", "pip", "install",
-        "llama-cpp-python>=0.3.16",
-        "--extra-index-url", "https://abetlen.github.io/llama-cpp-python/whl/cpu"
-    ])
-    from llama_cpp import Llama, LlamaChatCompletionHandler
 import gradio as gr
-from huggingface_hub import hf_hub_download
-import base64
-import io
 import re
-from PIL import Image
-# Configuration: Hub repository and GGUF file passed to hf_hub_download
-REPO_ID = "mradermacher/VisualQuality-R1-7B-GGUF"
-MODEL_FILENAME = "VisualQuality-R1-7B.Q8_0.gguf"
-# === MAIN FIX: A CUSTOM HANDLER FOR QWEN2-VL ===
-# We do not depend on the built-in classes; we write our own instead.
-class CustomQwen2VLHandler(LlamaChatCompletionHandler):
-    """Chat handler that assembles the prompt text by hand.
-
-    Calling an instance walks the chat messages, wraps each one in
-    ``<|im_start|>{role}`` / ``<|im_end|>`` markers, replaces every
-    ``image_url`` part with the vision tag sequence, and returns a
-    ``(prompt, images)`` tuple where ``images`` is a list of the image bytes
-    decoded from base64 data URLs.
-    """
-    def __init__(self, clip_model_path=None, verbose=False):
-        # Both values are only stored; clip_model_path is not read again in this class.
-        self.clip_model_path = clip_model_path
-        self.verbose = verbose
-    def __call__(self, llama: Llama, messages, functions=None, function_call=None, tools=None, tool_choice=None, **kwargs):
-        # 1. Build the prompt manually with the correct tags
-        prompt = ""
-        images = []
-        for message in messages:
-            role = message["role"]
-            content = message["content"]
-            # Start of the message
-            prompt += f"<|im_start|>{role}\n"
-            if isinstance(content, str):
-                prompt += content
-            elif isinstance(content, list):
-                for part in content:
-                    if part["type"] == "text":
-                        prompt += part["text"]
-                    elif part["type"] == "image_url":
-                        # Tags for Qwen2-VL: Vision Start -> Pad -> Vision End
-                        prompt += "<|vision_start|><|image_pad|><|vision_end|>"
-                        # Extract the bytes from base64 to pass to the C++ layer
-                        try:
-                            image_url = part["image_url"]["url"]
-                            # URLs without a "base64," marker are skipped: the tag
-                            # is still in the prompt but no image bytes are added.
-                            if "base64," in image_url:
-                                base64_data = image_url.split("base64,")[1]
-                                image_bytes = base64.b64decode(base64_data)
-                                images.append(image_bytes)
-                        except Exception as e:
-                            # Decoding problems are only printed; processing continues.
-                            print(f"Ошибка декодирования картинки: {e}")
-            # End of the message
-            prompt += "<|im_end|>\n"
-        # Add the trigger for the assistant's reply
-        prompt += "<|im_start|>assistant\n"
-        if self.verbose:
-            print(f"=== SENDED PROMPT ({len(prompt)} chars) ===")
-            print(prompt[:200] + "..." if len(prompt) > 200 else prompt)
-            print(f"=== IMAGES: {len(images)} ===")
-        # Return the (prompt, images) tuple.
-        # NOTE(review): the original comment claims llama.cpp understands this
-        # tuple — confirm against the llama-cpp-python chat handler contract.
-        return prompt, images
-# Module-level cache for the loaded model; filled on the first load_model() call.
-llm = None
-def load_model():
-    """Download the GGUF file and create the global ``Llama`` instance once.
-
-    Later calls return the already-created instance. Any exception raised
-    while downloading or constructing the model is printed and re-raised.
-    """
-    global llm
-    if llm is None:
-        print(f"Загрузка модели {MODEL_FILENAME}...")
-        try:
-            model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME)
-            # Initialize OUR custom handler
-            # clip_model_path points at the same file (since this is an all-in-one GGUF)
-            chat_handler = CustomQwen2VLHandler(clip_model_path=model_path, verbose=True)
-            llm = Llama(
-                model_path=model_path,
-                n_ctx=8192,           # Context (images are large, room is needed)
-                n_gpu_layers=0,       # CPU
-                verbose=True,
-                chat_handler=chat_handler, # <-- IMPORTANT: use our class
-                n_batch=512,
-                logits_all=True
-            )
-            print("Модель успешно загружена с CustomQwen2VLHandler!")
-        except Exception as e:
-            print(f"Ошибка загрузки: {e}")
-            raise e
-    return llm
-def process_image(image):
-    """Return ``image`` as a base64-encoded JPEG string (quality 90, RGB).
-
-    Images whose longest side exceeds 1024 px are shrunk first.
-    """
-    # Resize to at most 1024px so as not to overload CPU memory and the context
-    max_dim = 1024
-    if max(image.size) > max_dim:
-        # NOTE(review): Pillow's thumbnail() resizes in place, so the caller's
-        # image object is presumably modified too — confirm.
-        image.thumbnail((max_dim, max_dim), Image.Resampling.LANCZOS)
-    buffered = io.BytesIO()
-    image = image.convert("RGB")
-    image.save(buffered, format="JPEG", quality=90)
-    return base64.b64encode(buffered.getvalue()).decode('utf-8')
-def evaluate_image(image, progress=gr.Progress()):
-    """Stream the model's quality assessment of ``image``.
-
-    Generator that yields ``(response_text, status_or_score)`` pairs: the
-    growing response with a "thinking" status while chunks arrive, then the
-    full response with the number found inside ``<answer>`` tags (or a
-    "not found" message). Any exception is printed and yielded as an error pair.
-    """
-    if image is None:
-        # NOTE(review): this function is a generator, so the value of this
-        # `return` is not yielded to the consumer — the message is never
-        # delivered as an output.
-        return "Пожалуйста, загрузите изображение.", ""
-    try:
-        progress(0.1, desc="Загрузка модели...")
-        model = load_model()
-        progress(0.2, desc="Обработка...")
-        base64_img = process_image(image)
-        img_url = f"data:image/jpeg;base64,{base64_img}"
-        system_prompt = "You are doing the image quality assessment task."
-        user_prompt = (
-            "What is your overall rating on the quality of this picture? "
-            "The rating should be a float between 1 and 5, rounded to two decimal places, "
-            "with 1 representing very poor quality and 5 representing excellent quality. "
-            "Please only output the final answer with only one score in <answer> </answer> tags."
-        )
-        # The image part precedes the text part in the user message.
-        messages = [
-            {"role": "system", "content": system_prompt},
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image_url", "image_url": {"url": img_url}},
-                    {"type": "text", "text": user_prompt}
-                ]
-            }
-        ]
-        full_response = ""
-        print("Начинаю генерацию...")
-        # Start streaming
-        stream = model.create_chat_completion(
-            messages=messages,
-            max_tokens=1024,
-            temperature=0.6,
-            stream=True
         )
-        for chunk in stream:
-            if "choices" in chunk:
-                delta = chunk["choices"][0]["delta"]
-                if "content" in delta and delta["content"]:
-                    content = delta["content"]
-                    full_response += content
-                    yield full_response, "Думаю..."
-        # Look for the score
-        score_match = re.search(r'<answer>\s*([\d\.]+)\s*</answer>', full_response)
-        final_score = score_match.group(1) if score_match else "Оценка не найдена"
-        yield full_response, final_score
-    except Exception as e:
-        err_msg = f"Произошла ошибка: {str(e)}"
-        print(err_msg)
-        yield err_msg, "Error"
-# Interface: image input and button on the left, score label and reasoning text on the right
-with gr.Blocks(title="VisualQuality-R1 (Custom Handler)") as demo:
-    gr.Markdown("# 👁️ VisualQuality-R1 (Qwen2-VL)")
-    gr.Markdown("Оценка качества изображений на CPU с кастомным обработчиком.")
-    with gr.Row():
-        with gr.Column():
-            input_img = gr.Image(type="pil", label="Изображение")
-            run_btn = gr.Button("Оценить", variant="primary")
-        with gr.Column():
-            output_score = gr.Label(label="Оценка")
-            output_text = gr.Textbox(label="CoT (Рассуждения)", lines=15)
-    # Output order matches the (text, score) pairs yielded by evaluate_image
-    run_btn.click(evaluate_image, inputs=[input_img], outputs=[output_text, output_score])
 if __name__ == "__main__":
-    demo.queue().launch()

 import gradio as gr
+import torch
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig, TextIteratorStreamer
+from qwen_vl_utils import process_vision_info
+from threading import Thread
 import re
+import random
+import spaces
+# Constants
+MODEL_PATH = "TianheWu/VisualQuality-R1-7B"
+# Prompts
+PROMPT = (
+    "You are doing the image quality assessment task. Here is the question: "
+    "What is your overall rating on the quality of this picture? The rating should be a float between 1 and 5, "
+    "rounded to two decimal places, with 1 representing very poor quality and 5 representing excellent quality."
+)
+# Both templates are filled with str.format(Question=...)
+QUESTION_TEMPLATE_THINKING = "{Question} First output the thinking process in <think> </think> tags and then output the final answer with only one score in <answer> </answer> tags."
+QUESTION_TEMPLATE_NO_THINKING = "{Question} Please only output the final answer with only one score in <answer> </answer> tags."
+# 8-bit quantization configuration
+quantization_config = BitsAndBytesConfig(
+    load_in_8bit=True,
+    llm_int8_threshold=6.0,
+    llm_int8_has_fp16_weight=False,
+)
+# The model and processor are loaded at import time, so they exist as module
+# globals before any request handler runs.
+print("Loading model...")
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_PATH,
+    quantization_config=quantization_config,
+    device_map="auto",
+    trust_remote_code=True,
+    torch_dtype=torch.float16,
+)
+model.eval()
+processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
+# NOTE(review): left padding is presumably chosen for batched generation with a
+# decoder-only model — confirm.
+processor.tokenizer.padding_side = "left"
+print("Model loaded successfully!")
+def extract_score(text):
+    """Parse a quality score out of model output.
+
+    The content of the last ``<answer>...</answer>`` pair is searched for the
+    first number; when no such pair exists the whole text is searched
+    instead. The number is clamped to the range 1.0-5.0.
+
+    Returns the clamped float, or ``None`` when no number is found or an
+    error occurs (the error is printed).
+    """
+    try:
+        tagged = re.findall(r'<answer>(.*?)</answer>', text, re.DOTALL)
+        # Prefer the last tagged answer; otherwise fall back to the full text
+        candidate = (tagged[-1] if tagged else text).strip()
+        number = re.search(r'\d+(\.\d+)?', candidate)
+        if number is not None:
+            # Clamp to the 1..5 scale
+            return max(1.0, min(float(number.group()), 5.0))
+    except Exception as e:
+        print(f"Error extracting score: {e}")
+    return None
+def extract_thinking(text):
+    """Return the stripped content of the last ``<think>...</think>`` pair, or ``None`` if there is none."""
+    blocks = re.findall(r'<think>(.*?)</think>', text, re.DOTALL)
+    return blocks[-1].strip() if blocks else None
+@spaces.GPU(duration=120)
+def score_image_streaming(image, use_thinking=True):
+    """Stream a quality assessment of ``image`` while the model generates it.
+
+    Generator used as the Gradio click handler. Every yield is a 3-tuple
+    ``(raw_text, thinking_text, score_markdown)``.
+
+    Args:
+        image: The uploaded image; ``None`` yields a prompt to upload one.
+        use_thinking: Selects the template that asks for ``<think>`` reasoning
+            (and a larger token budget) instead of the answer-only template.
+
+    Raises:
+        gr.Error: If generation fails in the worker thread.
+    """
+    if image is None:
+        yield "❌ Please upload an image first.", "", ""
+        return
+    # Pick the prompt template
+    if use_thinking:
+        question_template = QUESTION_TEMPLATE_THINKING
+    else:
+        question_template = QUESTION_TEMPLATE_NO_THINKING
+    # Build the chat message (image first, then the question)
+    message = [
+        {
+            "role": "user",
+            "content": [
+                {'type': 'image', 'image': image},
+                {"type": "text", "text": question_template.format(Question=PROMPT)}
+            ],
+        }
+    ]
+    batch_messages = [message]
+    # Prepare the model inputs
+    text = [processor.apply_chat_template(
+        msg, tokenize=False, add_generation_prompt=True, add_vision_id=True
+    ) for msg in batch_messages]
+    image_inputs, video_inputs = process_vision_info(batch_messages)
+    inputs = processor(
+        text=text,
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+    inputs = inputs.to(model.device)
+    # Set up streaming
+    streamer = TextIteratorStreamer(
+        processor.tokenizer,
+        skip_prompt=True,
+        skip_special_tokens=True
+    )
+    generation_kwargs = dict(
+        **inputs,
+        streamer=streamer,
+        max_new_tokens=2048 if use_thinking else 256,
+        do_sample=True,
+        top_k=50,
+        top_p=0.95,
+        temperature=0.7,
+        use_cache=True,
+    )
+    generation_errors = []
+
+    def _generate():
+        """Run generation; on failure, end the stream so the consumer loop cannot block forever."""
+        try:
+            model.generate(**generation_kwargs)
+        except Exception as exc:
+            generation_errors.append(exc)
+            # Without this the streamer never receives its stop signal and
+            # the `for new_text in streamer` loop below would hang.
+            streamer.end()
+
+    # Run generation in a separate thread so this generator can consume the stream
+    thread = Thread(target=_generate)
+    thread.start()
+    # Stream the output
+    generated_text = ""
+    current_thinking = ""
+    current_score = ""
+    for new_text in streamer:
+        generated_text += new_text
+        # Extract the reasoning (if any)
+        thinking = extract_thinking(generated_text)
+        if thinking:
+            current_thinking = thinking
+        # Show a provisional score only once the answer tag has been closed.
+        # Before that, extract_score() falls back to the first number anywhere
+        # in the partial text (for example a digit inside the reasoning),
+        # which would be displayed as if it were the rating.
+        if "</answer>" in generated_text:
+            score = extract_score(generated_text)
+            if score is not None:
+                current_score = f"⭐ **Quality Score: {score:.2f} / 5.00**"
+        yield generated_text, current_thinking, current_score
+    thread.join()
+    if generation_errors:
+        raise gr.Error(f"Generation failed: {generation_errors[0]}") from generation_errors[0]
+    # Final extraction (keeps the whole-text fallback for outputs without tags)
+    final_score = extract_score(generated_text)
+    final_thinking = extract_thinking(generated_text) if use_thinking else ""
+    if final_score is not None:
+        score_display = f"⭐ **Quality Score: {final_score:.2f} / 5.00**\n\n📊 **For Leaderboard:** `{final_score:.2f}`"
+    else:
+        score_display = "❌ Could not extract score. Please try again."
+    yield generated_text, final_thinking or "", score_display
+def create_interface():
+    """Build the Gradio Blocks interface and return it.
+
+    The layout has two columns: inputs (image, thinking-mode checkbox, submit
+    button, instructions) on the left and outputs (score, thinking process,
+    full model output) on the right, followed by an example image and a
+    citation section. The submit button is wired to score_image_streaming.
+    """
+    with gr.Blocks(
+        title="VisualQuality-R1: Image Quality Assessment",
+        theme=gr.themes.Soft(),
+        css="""
+        .score-box {
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            border-radius: 10px;
+            padding: 20px;
+            color: white;
+            text-align: center;
+            font-size: 1.2em;
+        }
+        .thinking-box {
+            background-color: #f0f4f8;
+            border-left: 4px solid #667eea;
+            padding: 15px;
+            border-radius: 5px;
+            font-style: italic;
+        }
+        """
+    ) as demo:
+        gr.Markdown("""
+        # 🎨 VisualQuality-R1: Image Quality Assessment
+        **Reasoning-Induced Image Quality Assessment via Reinforcement Learning to Rank**
+        Upload an image to get a quality score (1-5) with detailed reasoning.
+        [![Paper](https://img.shields.io/badge/arXiv-Paper-red)](https://arxiv.org/abs/2505.14460)
+        [![Model](https://img.shields.io/badge/🤗-Model-yellow)](https://huggingface.co/TianheWu/VisualQuality-R1-7B)
+        """)
+        with gr.Row():
+            # Left column: inputs and usage instructions
+            with gr.Column(scale=1):
+                image_input = gr.Image(
+                    label="📷 Upload Image",
+                    type="pil",
+                    height=400
+                )
+                thinking_checkbox = gr.Checkbox(
+                    label="🧠 Enable Thinking Mode (detailed reasoning)",
+                    value=True
+                )
+                submit_btn = gr.Button(
+                    "🔍 Analyze Image Quality",
+                    variant="primary",
+                    size="lg"
+                )
+                gr.Markdown("""
+                ### 📖 Instructions:
+                1. Upload an image
+                2. Enable/disable thinking mode
+                3. Click "Analyze Image Quality"
+                4. Wait for the score and reasoning
+                ### 📊 Score Scale:
+                - **1.0**: Very poor quality
+                - **2.0**: Poor quality
+                - **3.0**: Fair quality
+                - **4.0**: Good quality
+                - **5.0**: Excellent quality
+                """)
+            # Right column: outputs
+            with gr.Column(scale=1):
+                score_output = gr.Markdown(
+                    label="Quality Score",
+                    value="*Upload an image to see the score*"
+                )
+                thinking_output = gr.Textbox(
+                    label="🧠 Thinking Process",
+                    lines=8,
+                    max_lines=15,
+                    placeholder="Reasoning will appear here when thinking mode is enabled...",
+                    interactive=False
+                )
+                raw_output = gr.Textbox(
+                    label="📝 Full Model Output",
+                    lines=10,
+                    max_lines=20,
+                    placeholder="Full model response will appear here...",
+                    interactive=False
+                )
+        # Examples
+        gr.Markdown("### 📸 Example Images")
+        gr.Examples(
+            examples=[
+                ["https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/300px-PNG_transparency_demonstration_1.png"],
+            ],
+            inputs=[image_input],
+            label="Click to try"
         )
+        # Event handling: the outputs order matches the
+        # (raw text, thinking, score) tuples yielded by score_image_streaming
+        submit_btn.click(
+            fn=score_image_streaming,
+            inputs=[image_input, thinking_checkbox],
+            outputs=[raw_output, thinking_output, score_output],
+        )
+        gr.Markdown("""
+        ---
+        ### 📚 Citation
+        ```bibtex
+        @article{wu2025visualquality,
+          title={{VisualQuality-R1}: Reasoning-Induced Image Quality Assessment via Reinforcement Learning to Rank},
+          author={Wu, Tianhe and Zou, Jian and Liang, Jie and Zhang, Lei and Ma, Kede},
+          journal={arXiv preprint arXiv:2505.14460},
+          year={2025}
+        }
+        ```
+        """)
+    return demo
 if __name__ == "__main__":
+    # Build the UI, enable the request queue with max_size=10, then launch the app
+    demo = create_interface()
+    demo.queue(max_size=10)
+    demo.launch()