# ==========================================================================
# CRITICAL: monkey-patch BEFORE any import that transitively pulls in the
# broken transformers module (gradio does, via its chat components).
#
# Fixes: TypeError: argument of type 'NoneType' is not iterable
# raised inside transformers' video_processing_auto.video_processor_class_from_name
# ("if class_name in extractors:") when the module-level `extractors`
# mapping is None.
#
# The crash happens at *call* time (inside from_pretrained), not at import
# time, so the safe strategy is: import the sub-module directly, repair the
# broken attribute, and only then import everything else. Because the patched
# module is cached in sys.modules, every later importer sees the fixed copy.
# ==========================================================================
import sys

try:
    import transformers.models.auto.video_processing_auto as _vpa

    # Only touch it when it is actually broken (missing or None).
    if getattr(_vpa, "extractors", None) is None:
        _vpa.extractors = {}
except ImportError:
    # transformers not installed, or the module path changed in a newer
    # release — nothing to patch in either case.
    pass

import os

import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, Glm4vForConditionalGeneration

# Configuration
MODEL_PATH = "zai-org/GLM-4.6V-Flash"

# Load Model (at import time: the Gradio app cannot serve without it, so a
# failure here should abort startup — hence the re-raise below).
print(f"Loading model: {MODEL_PATH}...")
try:
    processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
    model = Glm4vForConditionalGeneration.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
        device_map="auto",
    )
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    # If it failed, print the extractors state for debugging logs — this
    # tells us whether the monkey patch above took effect.
    try:
        import transformers.models.auto.video_processing_auto as vpa
        print(f"DEBUG: vpa.extractors is {getattr(vpa, 'extractors', 'MISSING')}")
    except ImportError:
        pass
    raise


def predict(image, text, history_state):
    """Run one multimodal chat turn against the model.

    Args:
        image: Optional PIL image uploaded by the user (``gr.Image(type="pil")``).
        text: The user's text message (may be empty if an image is given).
        history_state: Conversation so far as a list of
            ``{"user": <content list>, "assistant": <content list>}`` dicts;
            ``None`` on the very first call.

    Returns:
        Tuple of (response markdown string, updated ``history_state``).
        Generation errors are reported as the response string rather than
        raised, so the UI keeps working.
    """
    if not text and not image:
        return "Please upload an image or enter text.", history_state

    # Initialize history if None (first run)
    if history_state is None:
        history_state = []

    # Replay prior turns so the model sees the full conversation.
    messages = []
    for turn in history_state:
        if isinstance(turn, dict) and "user" in turn and "assistant" in turn:
            messages.append({"role": "user", "content": turn["user"]})
            messages.append({"role": "assistant", "content": turn["assistant"]})

    # Current turn: content is a list of typed parts (image and/or text).
    content = []
    if image is not None:
        content.append({"type": "image", "image": image})
    if text:
        content.append({"type": "text", "text": text})
    messages.append({"role": "user", "content": content})

    try:
        # Tokenize the chat template and move tensors onto the model device.
        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)

        # Remove token_type_ids if present (sometimes causes issues)
        if "token_type_ids" in inputs:
            inputs.pop("token_type_ids")

        # Generate without building autograd graphs.
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=1024,
                do_sample=True,
                temperature=0.7,
            )

        # Decode only the newly generated tokens (strip the prompt prefix).
        output_text = processor.decode(
            generated_ids[0][inputs["input_ids"].shape[1]:],
            skip_special_tokens=True,
        )

        # Persist this turn so follow-up questions have context.
        history_state.append({
            "user": content,
            "assistant": [{"type": "text", "text": output_text}],
        })

        return output_text, history_state
    except Exception as e:
        return f"Error during generation: {str(e)}", history_state


# Create Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown(f"# {MODEL_PATH}")
    gr.Markdown("Multimodal chat with conversation history support.")

    # Per-session conversation history; starts empty.
    state = gr.State([])

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image (Optional)")
            text_input = gr.Textbox(label="Message", placeholder="Enter text here...")
            submit_btn = gr.Button("Submit", variant="primary")
            clear_btn = gr.Button("Clear Conversation")
        with gr.Column():
            output = gr.Markdown(label="Response")

    submit_btn.click(
        fn=predict,
        inputs=[image_input, text_input, state],
        outputs=[output, state],
    )
    # Reset history and tell the user; returns one value per output component.
    clear_btn.click(lambda: ([], "Conversation cleared."), outputs=[state, output])

    gr.Markdown("""
    ### API Usage
    Supports multimodal inputs (text + image).
    **Note**: Includes pre-import monkey patch for transformers `video_processing_auto.extractors` bug.
    """)

if __name__ == "__main__":
    demo.launch()