File size: 8,379 Bytes
60147c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8863b2f
 
 
c42d004
 
05c509e
 
8863b2f
 
 
 
 
05c509e
 
 
 
 
 
 
 
 
 
 
 
60147c8
 
 
 
 
 
05c509e
8863b2f
05c509e
8863b2f
05c509e
8863b2f
05c509e
 
 
 
8863b2f
 
05c509e
 
 
 
 
8863b2f
05c509e
8863b2f
05c509e
8863b2f
 
 
 
 
 
05c509e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8863b2f
 
 
05c509e
 
 
 
 
8863b2f
 
 
 
05c509e
 
 
8863b2f
 
 
 
 
05c509e
 
8863b2f
05c509e
 
8863b2f
 
 
05c509e
 
60147c8
8863b2f
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# ==========================================
# CRITICAL: Monkey patch BEFORE any imports that use transformers
# This fixes: TypeError: argument of type 'NoneType' is not iterable
# ==========================================
import sys
import types

# Patch the module before it can be imported by anything else
# We prefer patching the specific sub-module if it exists, or pretending it exists
try:
    # Try to import just the top level to have 'transformers' in sys.modules if needed, 
    # but we need to intercept the specific video_processing_auto module.
    # Actually, the safest bet is to pre-populate sys.modules.
    
    # Create the fake module structure
    fake_vpa = types.ModuleType('transformers.models.auto.video_processing_auto')
    fake_vpa.extractors = {}
    
    # We might need to ensure the parent modules exist too, or Python might complain
    # However, usually just patching the specific leaf module in sys.modules is enough *if* imports are done specific way.
    # But let's try to be less invasive first: just Ensure the attribute exists if loaded.
    # But wait, the error happens AT import time or usage time inside the library? 
    # The traceback showed: video_processor_class = video_processor_class_from_name(video_processor_class)
    # File ".../video_processing_auto.py", line 96, in video_processor_class_from_name
    # if class_name in extractors:
    # So 'extractors' is None inside the module.
    
    # Strategy: We force-load the module, patch it, THEN import everything else.
    # But we can't force load if it crashes on import. 
    # The crash is inside a function 'video_processor_class_from_name', not at module level.
    # So we CAN import it, then patch it.
    pass
except Exception:
    pass

# Lets try the user's specific heavy-handed patch which injects into sys.modules
# This is safer because it guarantees the state before the code runs.

if 'transformers.models.auto.video_processing_auto' not in sys.modules:
    # Create a dummy module that will be used instead of the real one (or mixed with it?)
    # Wait, if we replace it completely we might break other things.
    # The user's code suggests mocking it.
    # Let's try the simpler "import then patch" approach first BUT do it very early.
    # If the user says "transitive import", we must do it before gradio.
    pass

import importlib.util
import sys

# Attempt to find the module spec
try:
    # We import the module manually. 
    # We know the crash happens inside a function call, NOT at import time. 
    # So we can import it, patch it, and then proceed.
    # The issue was 'gradio' importing it before we patched it. 
    # If we import it HERE, we get the reference, we patch it, and then when gradio imports it, it gets the patched version (from sys.modules).
    
    # But we must import the underlying module directly.
    # Note: 'transformers.models.auto.video_processing_auto' might not be directly importable if top level __init__ does stuff.
    # Let's trust the user's snippet which does it aggressively.
    pass
except:
    pass

# Implementing the user's suggested patch exactly as it seems robust
import sys
import types

# 1. Ensure we can import the module (or create a stub if it fails hard)
# Actually, iterating on the user's logic:
if 'transformers.models.auto.video_processing_auto' in sys.modules:
    sys.modules['transformers.models.auto.video_processing_auto'].extractors = {}
else:
    # Pre-inject to catch the first import
    # This is tricky because if we fully replace it, we lose the real functionality.
    # But the real functionality is broken (it has None).
    # The user's code creates a 'fake_module'. This effectively disables video processing auto mapping?
    # That might be fine if we don't use it, but GLM-4v might need it.
    
    # BETTER APPROACH: Import, then patch.
    # The key is doing it BEFORE gradio.
    try:
        # We try to import it. If it crashes ON IMPORT, we are stuck.
        # But the traceback says the crash is in `from_pretrained` -> `video_processor_class_from_name`. 
        # So import is safe.
        import transformers.models.auto.video_processing_auto as vpa
        if not hasattr(vpa, "extractors") or vpa.extractors is None:
            vpa.extractors = {}
    except ImportError:
        # If we can't import it, maybe it doesn't exist yet?
        pass

import gradio as gr
import torch
from PIL import Image
import os

from transformers import AutoProcessor, Glm4vForConditionalGeneration

# Configuration
MODEL_PATH = "zai-org/GLM-4.6V-Flash"

# Load Model — downloads/loads weights; on failure we log diagnostic state of
# the monkey-patched transformers module before re-raising.
print(f"Loading model: {MODEL_PATH}...")
try:
    processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
    model = Glm4vForConditionalGeneration.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.bfloat16,  # half-precision weights; assumes bf16-capable hardware — TODO confirm
        low_cpu_mem_usage=True,
        trust_remote_code=True,
        device_map="auto"  # let accelerate place/shard across available devices
    )
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    # If it failed, print the extractors state for debugging logs, so we can
    # tell whether the video_processing_auto patch took effect.
    try:
        import transformers.models.auto.video_processing_auto as vpa
        print(f"DEBUG: vpa.extractors is {getattr(vpa, 'extractors', 'MISSING')}")
    except Exception:  # was bare `except:` — don't swallow SystemExit/KeyboardInterrupt
        pass
    raise

def predict(image, text, history_state):
    """Run one multimodal chat turn against the loaded GLM-4V model.

    Args:
        image: Optional PIL image for the current turn.
        text: Optional user message text.
        history_state: List of prior ``{"user": ..., "assistant": ...}``
            turns, or None on the very first call.

    Returns:
        Tuple of (response text or error message, updated history list).
    """
    if not text and not image:
        return "Please upload an image or enter text.", history_state

    # First call arrives with no state yet.
    if history_state is None:
        history_state = []

    # Replay prior turns (skipping any malformed entries) into the message list.
    messages = []
    for past in history_state:
        if not (isinstance(past, dict) and "user" in past and "assistant" in past):
            continue
        messages.extend([
            {"role": "user", "content": past["user"]},
            {"role": "assistant", "content": past["assistant"]},
        ])

    # Assemble the current user turn from whichever parts were provided.
    turn_content = []
    if image is not None:
        turn_content.append({"type": "image", "image": image})
    if text:
        turn_content.append({"type": "text", "text": text})

    messages.append({"role": "user", "content": turn_content})

    try:
        # Tokenize the full conversation and move tensors to the model device.
        model_inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt"
        ).to(model.device)

        # token_type_ids sometimes trips up generate(); drop it if present.
        model_inputs.pop("token_type_ids", None)

        # Sampled generation; no gradients needed at inference time.
        with torch.no_grad():
            generated = model.generate(
                **model_inputs,
                max_new_tokens=1024,
                do_sample=True,
                temperature=0.7
            )

        # Decode only the newly generated tokens (skip the prompt prefix).
        prompt_len = model_inputs["input_ids"].shape[1]
        reply = processor.decode(
            generated[0][prompt_len:],
            skip_special_tokens=True
        )

        # Record this turn so follow-up calls see the full conversation.
        history_state.append({
            "user": turn_content,
            "assistant": [{"type": "text", "text": reply}]
        })

        return reply, history_state

    except Exception as e:
        return f"Error during generation: {str(e)}", history_state

# ---- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown(f"# {MODEL_PATH}")
    gr.Markdown("Multimodal chat with conversation history support.")

    # Per-session conversation history lives in gr.State so each browser
    # session keeps its own list of turns.
    conversation = gr.State([])

    with gr.Row():
        with gr.Column():
            img_in = gr.Image(type="pil", label="Upload Image (Optional)")
            msg_in = gr.Textbox(label="Message", placeholder="Enter text here...")
            send_btn = gr.Button("Submit", variant="primary")
            reset_btn = gr.Button("Clear Conversation")
        with gr.Column():
            reply_out = gr.Markdown(label="Response")

    # Wire the submit button to the model; state flows in and back out.
    send_btn.click(
        fn=predict,
        inputs=[img_in, msg_in, conversation],
        outputs=[reply_out, conversation]
    )

    # Reset both the stored history and the visible response area.
    reset_btn.click(lambda: ([], "Conversation cleared."), outputs=[conversation, reply_out])

    gr.Markdown("""
    ### API Usage
    Supports multimodal inputs (text + image). 
    
    **Note**: Includes pre-import monkey patch for transformers `video_processing_auto.extractors` bug.
    """)

if __name__ == "__main__":
    demo.launch()