# Hugging Face Spaces app. (The original paste was prefixed with the Spaces
# UI banner "Spaces: Runtime error" — log text, not part of the program.)
# ==========================================
# CRITICAL: Monkey patch BEFORE any imports that use transformers
# This fixes: TypeError: argument of type 'NoneType' is not iterable
#
# On some transformers builds, the module-level `extractors` mapping in
# transformers/models/auto/video_processing_auto.py is None, so
# `video_processor_class_from_name` crashes on `if class_name in extractors:`.
# ==========================================
import sys
import types  # kept for compatibility; the rest of the file may rely on it

# The crash happens inside a *function* of the module, not at module import
# time, so it is safe to import the module here and patch it in place. Doing
# this before the `gradio`/`transformers` top-level imports below guarantees
# that every later importer receives the patched module from sys.modules.
try:
    import transformers.models.auto.video_processing_auto as _vpa

    # Only repair a missing/None mapping. Never clobber a dict that a healthy
    # transformers install already populated (the original code reset it to {}
    # unconditionally when the module was pre-imported, wiping valid entries).
    if getattr(_vpa, "extractors", None) is None:
        _vpa.extractors = {}
except ImportError:
    # transformers is not installed (or its module layout changed):
    # nothing to patch — the real import below will surface the error.
    pass
| import gradio as gr | |
| import torch | |
| from PIL import Image | |
| import os | |
| from transformers import AutoProcessor, Glm4vForConditionalGeneration | |
# Configuration: model repository on the Hugging Face Hub.
MODEL_PATH = "zai-org/GLM-4.6V-Flash"

# Load Model at module level so `processor` / `model` are available to the
# Gradio handlers below. Failures are logged with extra context, then re-raised
# so the Space shows the real error instead of starting half-initialized.
print(f"Loading model: {MODEL_PATH}...")
try:
    processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
    model = Glm4vForConditionalGeneration.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.bfloat16,  # bf16 halves memory vs fp32
        low_cpu_mem_usage=True,
        trust_remote_code=True,
        device_map="auto",           # shard across whatever devices exist
    )
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    # If it failed, print the extractors state for debugging logs
    # (helps confirm whether the pre-import patch above took effect).
    try:
        import transformers.models.auto.video_processing_auto as vpa
        print(f"DEBUG: vpa.extractors is {getattr(vpa, 'extractors', 'MISSING')}")
    except ImportError:
        # Was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt;
        # an ImportError is the only failure this debug re-import can produce.
        pass
    raise
def predict(image, text, history_state):
    """Run one multimodal chat turn against the loaded GLM-4V model.

    Args:
        image: Optional PIL image for the current turn (may be None).
        text: User message text (may be empty).
        history_state: Gradio session state — a list of prior turns shaped
            {"user": <content list>, "assistant": <content list>}, or None
            on the very first call.

    Returns:
        (reply_text, updated_history) — on failure, reply_text carries the
        error message and the history is returned unchanged.
    """
    if not text and not image:
        return "Please upload an image or enter text.", history_state

    # Gradio hands us None before the state has ever been set.
    history = history_state if history_state is not None else []

    # Replay prior turns so the model sees the whole conversation.
    messages = []
    for turn in history:
        if isinstance(turn, dict) and "user" in turn and "assistant" in turn:
            messages.append({"role": "user", "content": turn["user"]})
            messages.append({"role": "assistant", "content": turn["assistant"]})

    # Assemble the current user turn: image part first, then text part.
    content = []
    if image is not None:
        content.append({"type": "image", "image": image})
    if text:
        content.append({"type": "text", "text": text})
    messages.append({"role": "user", "content": content})

    try:
        model_inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)

        # token_type_ids sometimes trips generate(); drop it when present.
        model_inputs.pop("token_type_ids", None)

        with torch.no_grad():
            generated = model.generate(
                **model_inputs,
                max_new_tokens=1024,
                do_sample=True,
                temperature=0.7,
            )

        # Decode only the newly generated tokens, skipping the prompt prefix.
        prompt_len = model_inputs["input_ids"].shape[1]
        output_text = processor.decode(
            generated[0][prompt_len:],
            skip_special_tokens=True,
        )

        # Record the completed turn in the session history (in place).
        history.append({
            "user": content,
            "assistant": [{"type": "text", "text": output_text}],
        })
        return output_text, history
    except Exception as e:
        return f"Error during generation: {str(e)}", history
# ---- Gradio interface ----------------------------------------------------
# Component creation order matters inside gr.Blocks (it defines the layout),
# so the structure below mirrors the conversation flow top-to-bottom.
with gr.Blocks() as demo:
    gr.Markdown(f"# {MODEL_PATH}")
    gr.Markdown("Multimodal chat with conversation history support.")

    # Per-session conversation history; starts as an empty list.
    chat_state = gr.State([])

    with gr.Row():
        with gr.Column():
            image_box = gr.Image(type="pil", label="Upload Image (Optional)")
            message_box = gr.Textbox(label="Message", placeholder="Enter text here...")
            send_button = gr.Button("Submit", variant="primary")
            reset_button = gr.Button("Clear Conversation")
        with gr.Column():
            response_view = gr.Markdown(label="Response")

    send_button.click(
        fn=predict,
        inputs=[image_box, message_box, chat_state],
        outputs=[response_view, chat_state],
    )

    def _clear_conversation():
        # Fresh history plus a confirmation message for the response pane.
        return [], "Conversation cleared."

    reset_button.click(_clear_conversation, outputs=[chat_state, response_view])

    gr.Markdown("""
### API Usage
Supports multimodal inputs (text + image).
**Note**: Includes pre-import monkey patch for transformers `video_processing_auto.extractors` bug.
""")

if __name__ == "__main__":
    demo.launch()