File size: 8,379 Bytes
60147c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8863b2f
 
 
c42d004
 
05c509e
 
8863b2f
 
 
 
 
05c509e
 
 
 
 
 
 
 
 
 
 
 
60147c8
 
 
 
 
 
05c509e
8863b2f
05c509e
8863b2f
05c509e
8863b2f
05c509e
 
 
 
8863b2f
 
05c509e
 
 
 
 
8863b2f
05c509e
8863b2f
05c509e
8863b2f
 
 
 
 
 
05c509e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8863b2f
 
 
05c509e
 
 
 
 
8863b2f
 
 
 
05c509e
 
 
8863b2f
 
 
 
 
05c509e
 
8863b2f
05c509e
 
8863b2f
 
 
05c509e
 
60147c8
8863b2f
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# ==========================================
# CRITICAL: Monkey patch BEFORE any imports that use transformers
# This fixes: TypeError: argument of type 'NoneType' is not iterable
# ==========================================
import sys
import types

# Patch the module before it can be imported by anything else
# We prefer patching the specific sub-module if it exists, or pretending it exists
try:
    # Try to import just the top level to have 'transformers' in sys.modules if needed, 
    # but we need to intercept the specific video_processing_auto module.
    # Actually, the safest bet is to pre-populate sys.modules.
    
    # Create the fake module structure
    fake_vpa = types.ModuleType('transformers.models.auto.video_processing_auto')
    fake_vpa.extractors = {}
    
    # We might need to ensure the parent modules exist too, or Python might complain
    # However, usually just patching the specific leaf module in sys.modules is enough *if* imports are done specific way.
    # But let's try to be less invasive first: just Ensure the attribute exists if loaded.
    # But wait, the error happens AT import time or usage time inside the library? 
    # The traceback showed: video_processor_class = video_processor_class_from_name(video_processor_class)
    # File ".../video_processing_auto.py", line 96, in video_processor_class_from_name
    # if class_name in extractors:
    # So 'extractors' is None inside the module.
    
    # Strategy: We force-load the module, patch it, THEN import everything else.
    # But we can't force load if it crashes on import. 
    # The crash is inside a function 'video_processor_class_from_name', not at module level.
    # So we CAN import it, then patch it.
    pass
except Exception:
    pass

# Lets try the user's specific heavy-handed patch which injects into sys.modules
# This is safer because it guarantees the state before the code runs.

if 'transformers.models.auto.video_processing_auto' not in sys.modules:
    # Create a dummy module that will be used instead of the real one (or mixed with it?)
    # Wait, if we replace it completely we might break other things.
    # The user's code suggests mocking it.
    # Let's try the simpler "import then patch" approach first BUT do it very early.
    # If the user says "transitive import", we must do it before gradio.
    pass

import importlib.util
import sys

# Attempt to find the module spec
try:
    # We import the module manually. 
    # We know the crash happens inside a function call, NOT at import time. 
    # So we can import it, patch it, and then proceed.
    # The issue was 'gradio' importing it before we patched it. 
    # If we import it HERE, we get the reference, we patch it, and then when gradio imports it, it gets the patched version (from sys.modules).
    
    # But we must import the underlying module directly.
    # Note: 'transformers.models.auto.video_processing_auto' might not be directly importable if top level __init__ does stuff.
    # Let's trust the user's snippet which does it aggressively.
    pass
except:
    pass

# Implementing the user's suggested patch exactly as it seems robust
import sys
import types

# 1. Ensure we can import the module (or create a stub if it fails hard)
# Actually, iterating on the user's logic:
if 'transformers.models.auto.video_processing_auto' in sys.modules:
    sys.modules['transformers.models.auto.video_processing_auto'].extractors = {}
else:
    # Pre-inject to catch the first import
    # This is tricky because if we fully replace it, we lose the real functionality.
    # But the real functionality is broken (it has None).
    # The user's code creates a 'fake_module'. This effectively disables video processing auto mapping?
    # That might be fine if we don't use it, but GLM-4v might need it.
    
    # BETTER APPROACH: Import, then patch.
    # The key is doing it BEFORE gradio.
    try:
        # We try to import it. If it crashes ON IMPORT, we are stuck.
        # But the traceback says the crash is in `from_pretrained` -> `video_processor_class_from_name`. 
        # So import is safe.
        import transformers.models.auto.video_processing_auto as vpa
        if not hasattr(vpa, "extractors") or vpa.extractors is None:
            vpa.extractors = {}
    except ImportError:
        # If we can't import it, maybe it doesn't exist yet?
        pass

import gradio as gr
import torch
from PIL import Image
import os

from transformers import AutoProcessor, Glm4vForConditionalGeneration

# Configuration
MODEL_PATH = "zai-org/GLM-4.6V-Flash"

# Load Model — downloads/loads weights; on failure we log diagnostic state of
# the monkey-patched transformers module before re-raising.
print(f"Loading model: {MODEL_PATH}...")
try:
    processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
    model = Glm4vForConditionalGeneration.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.bfloat16,  # half-precision weights; assumes bf16-capable hardware — TODO confirm
        low_cpu_mem_usage=True,
        trust_remote_code=True,
        device_map="auto"  # let accelerate place/shard across available devices
    )
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    # If it failed, print the extractors state for debugging logs, so we can
    # tell whether the video_processing_auto patch took effect.
    try:
        import transformers.models.auto.video_processing_auto as vpa
        print(f"DEBUG: vpa.extractors is {getattr(vpa, 'extractors', 'MISSING')}")
    except Exception:  # was bare `except:` — don't swallow SystemExit/KeyboardInterrupt
        pass
    raise

def predict(image, text, history_state):
    """Run one multimodal chat turn against the loaded GLM-4V model.

    Args:
        image: Optional PIL image for the current turn.
        text: Optional user message text.
        history_state: List of prior ``{"user": ..., "assistant": ...}``
            turns, or None on the very first call.

    Returns:
        Tuple of (response text or error message, updated history list).
    """
    if not text and not image:
        return "Please upload an image or enter text.", history_state

    # First call arrives with no state yet.
    if history_state is None:
        history_state = []

    # Replay prior turns (skipping any malformed entries) into the message list.
    messages = []
    for past in history_state:
        if not (isinstance(past, dict) and "user" in past and "assistant" in past):
            continue
        messages.extend([
            {"role": "user", "content": past["user"]},
            {"role": "assistant", "content": past["assistant"]},
        ])

    # Assemble the current user turn from whichever parts were provided.
    turn_content = []
    if image is not None:
        turn_content.append({"type": "image", "image": image})
    if text:
        turn_content.append({"type": "text", "text": text})

    messages.append({"role": "user", "content": turn_content})

    try:
        # Tokenize the full conversation and move tensors to the model device.
        model_inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt"
        ).to(model.device)

        # token_type_ids sometimes trips up generate(); drop it if present.
        model_inputs.pop("token_type_ids", None)

        # Sampled generation; no gradients needed at inference time.
        with torch.no_grad():
            generated = model.generate(
                **model_inputs,
                max_new_tokens=1024,
                do_sample=True,
                temperature=0.7
            )

        # Decode only the newly generated tokens (skip the prompt prefix).
        prompt_len = model_inputs["input_ids"].shape[1]
        reply = processor.decode(
            generated[0][prompt_len:],
            skip_special_tokens=True
        )

        # Record this turn so follow-up calls see the full conversation.
        history_state.append({
            "user": turn_content,
            "assistant": [{"type": "text", "text": reply}]
        })

        return reply, history_state

    except Exception as e:
        return f"Error during generation: {str(e)}", history_state

# ---- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown(f"# {MODEL_PATH}")
    gr.Markdown("Multimodal chat with conversation history support.")

    # Per-session conversation history lives in gr.State so each browser
    # session keeps its own list of turns.
    conversation = gr.State([])

    with gr.Row():
        with gr.Column():
            img_in = gr.Image(type="pil", label="Upload Image (Optional)")
            msg_in = gr.Textbox(label="Message", placeholder="Enter text here...")
            send_btn = gr.Button("Submit", variant="primary")
            reset_btn = gr.Button("Clear Conversation")
        with gr.Column():
            reply_out = gr.Markdown(label="Response")

    # Wire the submit button to the model; state flows in and back out.
    send_btn.click(
        fn=predict,
        inputs=[img_in, msg_in, conversation],
        outputs=[reply_out, conversation]
    )

    # Reset both the stored history and the visible response area.
    reset_btn.click(lambda: ([], "Conversation cleared."), outputs=[conversation, reply_out])

    gr.Markdown("""
    ### API Usage
    Supports multimodal inputs (text + image). 
    
    **Note**: Includes pre-import monkey patch for transformers `video_processing_auto.extractors` bug.
    """)

if __name__ == "__main__":
    demo.launch()