artificialguybr committed
Commit 46b916c · 1 Parent(s): c21abe0

Refactor qwen-vl space for dynamic Qwen2.5-VL model selection

Files changed (2)
  1. app.py +293 -145
  2. requirements.txt +5 -13
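The core change: instead of hardcoding `Qwen/Qwen-VL-Chat-Int4` with a GPTQ/exllama config, the Space now builds its model dropdown at startup from the Hugging Face Hub API, keeping only Qwen2.5-VL checkpoints updated since 2025-03-01 that fit on an 80GB GPU. A minimal standalone sketch of that startup query (it mirrors `_fetch_model_catalog` in the diff below; the printed format is illustrative only):

```python
# Preview which checkpoints would populate the dropdown, using the same
# Hub API query that _fetch_model_catalog sends at startup.
import requests

resp = requests.get(
    "https://huggingface.co/api/models",
    params={"author": "Qwen", "search": "Qwen2.5-VL", "full": "true", "limit": 200},
    timeout=60,
)
resp.raise_for_status()
for item in resp.json():
    if item.get("pipeline_tag") == "image-text-to-text":
        print(item["id"], (item.get("lastModified") or "")[:10])
```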
app.py CHANGED
@@ -1,156 +1,304 @@
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, TextStreamer
 import torch
 from PIL import Image
-import re
-import requests
-from io import BytesIO
-import copy
-import secrets
-from pathlib import Path
-
-tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat-Int4", trust_remote_code=True)
-config = AutoConfig.from_pretrained("Qwen/Qwen-VL-Chat-Int4", trust_remote_code=True, torch_dtype=torch.float16)
-#config.quantization_config["use_exllama"] = True
-config.quantization_config["disable_exllama"] = False
-config.quantization_config["exllama_config"] = {"version":2}
-model = AutoModelForCausalLM.from_config(config, trust_remote_code=True, torch_dtype=torch.float16)
-
-BOX_TAG_PATTERN = r"<box>([\s\S]*?)</box>"
-PUNCTUATION = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
-
-def _parse_text(text):
-    lines = text.split("\n")
-    lines = [line for line in lines if line != ""]
-    count = 0
-    for i, line in enumerate(lines):
-        if "```" in line:
-            count += 1
-            items = line.split("`")
-            if count % 2 == 1:
-                lines[i] = f'<pre><code class="language-{items[-1]}">'
-            else:
-                lines[i] = f"<br></code></pre>"
-        else:
-            if i > 0:
-                if count % 2 == 1:
-                    line = line.replace("`", r"\`")
-                    line = line.replace("<", "&lt;")
-                    line = line.replace(">", "&gt;")
-                    line = line.replace(" ", "&nbsp;")
-                    line = line.replace("*", "&ast;")
-                    line = line.replace("_", "&lowbar;")
-                    line = line.replace("-", "&#45;")
-                    line = line.replace(".", "&#46;")
-                    line = line.replace("!", "&#33;")
-                    line = line.replace("(", "&#40;")
-                    line = line.replace(")", "&#41;")
-                    line = line.replace("$", "&#36;")
-                lines[i] = "<br>" + line
-    text = "".join(lines)
-    return text
-
-def predict(_chatbot, task_history):
-    chat_query = _chatbot[-1][0]
-    query = task_history[-1][0]
-    history_cp = copy.deepcopy(task_history)
-    full_response = ""
-
-    history_filter = []
-    pic_idx = 1
-    pre = ""
-    for i, (q, a) in enumerate(history_cp):
-        if isinstance(q, (tuple, list)):
-            q = f'Picture {pic_idx}: <img>{q[0]}</img>'
-            pre += q + '\n'
-            pic_idx += 1
-        else:
-            pre += q
-        history_filter.append((pre, a))
-        pre = ""
-    history, message = history_filter[:-1], history_filter[-1][0]
-    response, history = model.chat(tokenizer, message, history=history)
-    image = tokenizer.draw_bbox_on_latest_picture(response, history)
     if image is not None:
-        temp_dir = secrets.token_hex(20)
-        temp_dir = Path("/tmp") / temp_dir
-        temp_dir.mkdir(exist_ok=True, parents=True)
-        name = f"tmp{secrets.token_hex(5)}.jpg"
-        filename = temp_dir / name
-        image.save(str(filename))
-        _chatbot[-1] = (_parse_text(chat_query), (str(filename),))
-        chat_response = response.replace("<ref>", "")
-        chat_response = chat_response.replace(r"</ref>", "")
-        chat_response = re.sub(BOX_TAG_PATTERN, "", chat_response)
-        if chat_response != "":
-            _chatbot.append((None, chat_response))
-    else:
-        _chatbot[-1] = (_parse_text(chat_query), response)
-        full_response = _parse_text(response)
-    task_history[-1] = (query, full_response)
-    return _chatbot
-
-def add_text(history, task_history, text):
-    task_text = text
-    if len(text) >= 2 and text[-1] in PUNCTUATION and text[-2] not in PUNCTUATION:
-        task_text = text[:-1]
-    history = history + [(_parse_text(text), None)]
-    task_history = task_history + [(task_text, None)]
-    return history, task_history, ""
-
-def add_file(history, task_history, file):
-    history = history + [((file.name,), None)]
-    task_history = task_history + [((file.name,), None)]
-    return history, task_history
-
-def reset_user_input():
-    return gr.update(value="")
-
-def reset_state(task_history):
-    task_history.clear()
-    return []
-
-def regenerate(_chatbot, task_history):
-    print("Regenerate clicked")
-    print("Before:", task_history, _chatbot)
-    if not task_history:
-        return _chatbot
-    item = task_history[-1]
-    if item[1] is None:
-        return _chatbot
-    task_history[-1] = (item[0], None)
-    chatbot_item = _chatbot.pop(-1)
-    if chatbot_item[0] is None:
-        _chatbot[-1] = (_chatbot[-1][0], None)
     else:
-        _chatbot.append((chatbot_item[0], None))
-    print("After:", task_history, _chatbot)
-    return predict(_chatbot, task_history)

-css = '''
-.gradio-container{max-width:800px !important}
-'''

-with gr.Blocks(css=css) as demo:
-    gr.Markdown("# Qwen-VL-Chat Bot")
-    gr.Markdown("## Qwen-VL: A Multimodal Large Vision Language Model by Alibaba Cloud **Space by [@Artificialguybr](https://twitter.com/artificialguybr). Test the [QwenLLM-14B](https://huggingface.co/spaces/artificialguybr/qwen-14b-chat-demo) here for free!</center>")
-    chatbot = gr.Chatbot(label='Qwen-VL-Chat', elem_classes="control-height", height=520)
-    query = gr.Textbox(lines=2, label='Input')
-    task_history = gr.State([])

     with gr.Row():
-        addfile_btn = gr.UploadButton("📁 Upload", file_types=["image"])
-        submit_btn = gr.Button("🚀 Submit")
-        regen_btn = gr.Button("🤔️ Regenerate")
-        empty_bin = gr.Button("🧹 Clear History")
-
-    gr.Markdown("### Key Features:\n- **Strong Performance**: Surpasses existing LVLMs on multiple English benchmarks including Zero-shot Captioning and VQA.\n- **Multi-lingual Support**: Supports English, Chinese, and multi-lingual conversation.\n- **High Resolution**: Utilizes 448*448 resolution for fine-grained recognition and understanding.")
-    submit_btn.click(add_text, [chatbot, task_history, query], [chatbot, task_history]).then(
-        predict, [chatbot, task_history], [chatbot], show_progress=True
     )
-    submit_btn.click(reset_user_input, [], [query])
-    empty_bin.click(reset_state, [task_history], [chatbot], show_progress=True)
-    regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True)
-    addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True)

-demo.launch()
+from __future__ import annotations
+
+import gc
+import re
+from typing import Any
+
 import gradio as gr
+import requests
+import spaces
 import torch
 from PIL import Image
+from transformers import AutoModelForImageTextToText, AutoProcessor
+
+
+HF_MODELS_API = "https://huggingface.co/api/models"
+MIN_UPDATED_DATE = "2025-03-01"
+ORG = "Qwen"
+SEARCH_TERM = "Qwen2.5-VL"
+
+DEFAULT_MODELS = [
+    {
+        "id": "Qwen/Qwen2.5-VL-3B-Instruct",
+        "updated": "2025-04-06",
+        "fit_note": "Best speed/quality for most tasks on 80GB.",
+    },
+    {
+        "id": "Qwen/Qwen2.5-VL-7B-Instruct",
+        "updated": "2025-04-06",
+        "fit_note": "Higher quality, still comfortable on 80GB.",
+    },
+    {
+        "id": "Qwen/Qwen2.5-VL-32B-Instruct-AWQ",
+        "updated": "2025-04-06",
+        "fit_note": "Strong quality with 4-bit AWQ quantization.",
+    },
+    {
+        "id": "Qwen/Qwen2.5-VL-72B-Instruct-AWQ",
+        "updated": "2025-03-07",
+        "fit_note": "Largest option; can fit on 80GB but heavier/less headroom.",
+    },
+]
+
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+LOADED_MODEL_ID: str | None = None
+LOADED_MODEL: AutoModelForImageTextToText | None = None
+LOADED_PROCESSOR: AutoProcessor | None = None
+
+
+def _parse_param_billions(model_id: str) -> int:
+    match = re.search(r"-(\d+)B-", model_id)
+    if not match:
+        return 0
+    return int(match.group(1))
+
+
+def _fits_80gb(model_id: str, tags: list[str]) -> bool:
+    params_b = _parse_param_billions(model_id)
+    lower_id = model_id.lower()
+    lower_tags = " ".join(str(tag).lower() for tag in tags)
+
+    if params_b == 0:
+        return False
+    if params_b <= 32:
+        return True
+    if params_b <= 72 and ("awq" in lower_id or "awq" in lower_tags):
+        return True
+    return False
+
+
+def _fetch_model_catalog() -> list[dict[str, str]]:
+    params = {
+        "author": ORG,
+        "search": SEARCH_TERM,
+        "full": "true",
+        "limit": 200,
+    }
+    response = requests.get(HF_MODELS_API, params=params, timeout=60)
+    response.raise_for_status()
+    models = response.json()
+
+    selected: list[dict[str, str]] = []
+    for item in models:
+        model_id = item.get("id", "")
+        pipeline = item.get("pipeline_tag")
+        updated = (item.get("lastModified") or "")[:10]
+        tags = item.get("tags") or []
+
+        if pipeline != "image-text-to-text":
+            continue
+        if not model_id.startswith("Qwen/Qwen2.5-VL"):
+            continue
+        if not updated or updated < MIN_UPDATED_DATE:
+            continue
+        if not _fits_80gb(model_id, tags):
+            continue
+        if "gguf" in model_id.lower():
+            continue
+
+        selected.append(
+            {
+                "id": model_id,
+                "updated": updated,
+                "fit_note": "Auto-selected by VRAM fit heuristic for 80GB.",
+            }
+        )
+
+    selected.sort(key=lambda x: (_parse_param_billions(x["id"]), x["id"]))
+    return selected
+
+
+def get_model_catalog() -> list[dict[str, str]]:
+    try:
+        models = _fetch_model_catalog()
+        if models:
+            return models
+    except Exception:
+        pass
+    return DEFAULT_MODELS
+
+
+MODEL_CATALOG = get_model_catalog()
+MODEL_LABELS = {
+    item["id"]: f"{item['id']} | updated {item['updated']}"
+    for item in MODEL_CATALOG
+}
+
+
+def _dtype_for_model(model_id: str) -> torch.dtype:
+    if DEVICE != "cuda":
+        return torch.float32
+    if "awq" in model_id.lower():
+        return torch.float16
+    return torch.bfloat16
+
+
+def unload_current_model() -> None:
+    global LOADED_MODEL, LOADED_PROCESSOR, LOADED_MODEL_ID
+    LOADED_MODEL = None
+    LOADED_PROCESSOR = None
+    LOADED_MODEL_ID = None
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+
+def _first_model_device(model: AutoModelForImageTextToText) -> torch.device:
+    try:
+        return next(model.parameters()).device
+    except StopIteration:
+        return torch.device("cuda:0" if DEVICE == "cuda" else "cpu")
+
+
+def load_model(model_id: str) -> tuple[AutoModelForImageTextToText, AutoProcessor]:
+    global LOADED_MODEL, LOADED_PROCESSOR, LOADED_MODEL_ID
+    if LOADED_MODEL is not None and LOADED_PROCESSOR is not None and LOADED_MODEL_ID == model_id:
+        return LOADED_MODEL, LOADED_PROCESSOR
+
+    unload_current_model()
+    dtype = _dtype_for_model(model_id)
+
+    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+    model = AutoModelForImageTextToText.from_pretrained(
+        model_id,
+        trust_remote_code=True,
+        torch_dtype=dtype,
+        device_map="auto" if DEVICE == "cuda" else None,
+    )
+    model.eval()
+
+    LOADED_MODEL_ID = model_id
+    LOADED_MODEL = model
+    LOADED_PROCESSOR = processor
+    return model, processor
+
+
+def format_model_status(model_id: str) -> str:
+    entry = next((item for item in MODEL_CATALOG if item["id"] == model_id), None)
+    if entry is None:
+        return f"**Model:** `{model_id}`"
+    return (
+        f"**Model:** `{entry['id']}`\n"
+        f"- Updated: **{entry['updated']}**\n"
+        f"- 80GB fit note: {entry['fit_note']}"
+    )
+
+
+def _build_messages(prompt: str, image: Image.Image | None) -> list[dict[str, Any]]:
+    content: list[dict[str, Any]] = []
     if image is not None:
+        content.append({"type": "image", "image": image})
+    content.append({"type": "text", "text": prompt})
+    return [{"role": "user", "content": content}]
+
+
+@spaces.GPU(duration=120)
+def run_vl(
+    model_id: str,
+    image: Image.Image | None,
+    prompt: str,
+    max_new_tokens: int,
+    temperature: float,
+    top_p: float,
+) -> tuple[str, str]:
+    if not prompt or not prompt.strip():
+        raise gr.Error("Prompt is required.")
+    if image is None:
+        raise gr.Error("Upload an image first.")
+
+    model, processor = load_model(model_id)
+    messages = _build_messages(prompt.strip(), image)
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    inputs = processor(text=[text], images=[image], return_tensors="pt")
+
+    model_device = _first_model_device(model)
+    inputs = {k: (v.to(model_device) if torch.is_tensor(v) else v) for k, v in inputs.items()}
+
+    generate_kwargs: dict[str, Any] = {
+        "max_new_tokens": int(max_new_tokens),
+        "top_p": float(top_p),
+    }
+    if temperature > 0:
+        generate_kwargs["do_sample"] = True
+        generate_kwargs["temperature"] = float(temperature)
     else:
+        generate_kwargs["do_sample"] = False
+
+    with torch.inference_mode():
+        output_ids = model.generate(**inputs, **generate_kwargs)

+    prompt_len = inputs["input_ids"].shape[1]
+    completion_ids = output_ids[:, prompt_len:]
+    answer = processor.batch_decode(
+        completion_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )[0].strip()

+    return answer, format_model_status(model_id)
+
+
+def on_model_change(model_id: str) -> str:
+    return format_model_status(model_id)
+
+
+default_model_id = MODEL_CATALOG[0]["id"]
+
+with gr.Blocks() as demo:
+    gr.Markdown("# Qwen2.5-VL Multi-Model Playground")
+    gr.Markdown(
+        "Select a Qwen VL model, upload an image, and ask questions or run extraction. "
+        f"Criterion applied: Qwen2.5-VL models updated on or after {MIN_UPDATED_DATE} that fit on 80GB."
+    )
+
+    with gr.Row():
+        model_id = gr.Dropdown(
+            label="Model",
+            choices=[(MODEL_LABELS[item["id"]], item["id"]) for item in MODEL_CATALOG],
+            value=default_model_id,
+        )
+        model_status = gr.Markdown(value=format_model_status(default_model_id))

     with gr.Row():
+        image_input = gr.Image(type="pil", label="Image")
+        answer_output = gr.Textbox(lines=16, label="Answer")
+
+    prompt = gr.Textbox(
+        lines=3,
+        label="Prompt",
+        placeholder="Describe this image in detail. / Extract all text. / What's happening?",
+    )
+
+    with gr.Accordion("Advanced generation settings", open=False):
+        with gr.Row():
+            max_new_tokens = gr.Slider(
+                label="Max new tokens", minimum=32, maximum=2048, value=512, step=32
+            )
+            temperature = gr.Slider(
+                label="Temperature", minimum=0.0, maximum=1.5, value=0.2, step=0.05
+            )
+            top_p = gr.Slider(label="Top-p", minimum=0.1, maximum=1.0, value=0.9, step=0.05)
+
+    with gr.Row():
+        run_btn = gr.Button("Run", variant="primary")
+        clear_btn = gr.Button("Clear")
+        unload_btn = gr.Button("Unload current model")
+
+    run_btn.click(
+        fn=run_vl,
+        inputs=[model_id, image_input, prompt, max_new_tokens, temperature, top_p],
+        outputs=[answer_output, model_status],
+    )
+    model_id.change(fn=on_model_change, inputs=[model_id], outputs=[model_status])
+    clear_btn.click(
+        fn=lambda selected_model: (None, "", "", format_model_status(selected_model)),
+        inputs=[model_id],
+        outputs=[image_input, prompt, answer_output, model_status],
+    )
+    unload_btn.click(
+        fn=lambda: (unload_current_model() or "Model unloaded from memory."),
+        outputs=[answer_output],
     )

+demo.queue(max_size=10)
+demo.launch()
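The new inference path can be smoke-tested without launching the UI. A sketch under two assumptions: that app.py is importable as a module named `app` (hypothetical; the Space only runs it as a script), and that `@spaces.GPU` degrades to a no-op outside a ZeroGPU Space, which is its documented behavior:

```python
# Hypothetical smoke test for run_vl outside Gradio; requires a GPU with
# enough memory for the selected checkpoint.
from PIL import Image

import app  # assumes app.py is on the import path

img = Image.new("RGB", (448, 448), "white")  # placeholder input image
answer, status = app.run_vl(
    model_id="Qwen/Qwen2.5-VL-3B-Instruct",
    image=img,
    prompt="Describe this image.",
    max_new_tokens=64,
    temperature=0.0,  # exercises the greedy (do_sample=False) branch
    top_p=0.9,
)
print(status)
print(answer)
```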
requirements.txt CHANGED
@@ -1,16 +1,8 @@
-transformers
 Pillow
 requests
-accelerate
-tiktoken
-einops
-transformers_stream_generator==0.0.4
-scipy
 torchvision
-pillow
-tensorboard
-matplotlib
-bitsandbytes
-optimum
-auto-gptq
-torch
+accelerate
+gradio
 Pillow
 requests
+spaces
+torch
 torchvision
+transformers>=4.52.0
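The trimmed list drops the legacy Qwen-VL dependencies (`auto-gptq`, `optimum`, `bitsandbytes`, `tiktoken`, `einops`, `transformers_stream_generator`) in favor of stock `transformers` with a `>=4.52.0` floor, presumably for `AutoModelForImageTextToText` support on Qwen2.5-VL. A quick environment check (a sketch; `packaging` is installed as a transformers dependency):

```python
# Verify the installed stack satisfies the new requirements floor.
from packaging import version

import torch
import transformers

assert version.parse(transformers.__version__) >= version.parse("4.52.0"), (
    f"transformers {transformers.__version__} is below the 4.52.0 floor"
)
print("torch", torch.__version__, "| CUDA available:", torch.cuda.is_available())
```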