# glm_api/app.py — GLM-4.6V-Flash multimodal Gradio demo
# (commit 60147c8: "Fix: Robust monkey patch before Gradio import")
# ==========================================
# CRITICAL: monkey patch BEFORE any imports that use transformers
# (gradio imports transformers transitively).
# Fixes: TypeError: argument of type 'NoneType' is not iterable
# raised inside video_processor_class_from_name() when the module-level
# `extractors` mapping in transformers.models.auto.video_processing_auto
# is None. The crash happens at call time, not at import time, so the
# sub-module can be imported safely and patched in place; any later
# importer (gradio, transformers itself) then receives the patched
# module straight from sys.modules.
# ==========================================
import sys
import types


def ensure_video_extractors_patch():
    """Ensure video_processing_auto.extractors is a dict, never None.

    Fetches the buggy transformers sub-module from sys.modules (or imports
    it if not yet loaded) and replaces a missing/None `extractors`
    attribute with an empty dict. Does nothing if transformers is absent
    or the module layout changed (ImportError) — there is nothing to
    patch in that case.
    """
    module_name = 'transformers.models.auto.video_processing_auto'
    try:
        vpa = sys.modules.get(module_name)
        if vpa is None:
            import importlib
            vpa = importlib.import_module(module_name)
    except ImportError:
        # transformers not installed or sub-module moved: nothing to patch.
        return
    if getattr(vpa, 'extractors', None) is None:
        vpa.extractors = {}


# Apply the patch now, before the gradio/transformers imports below run.
ensure_video_extractors_patch()
import gradio as gr
import torch
from PIL import Image
import os
from transformers import AutoProcessor, Glm4vForConditionalGeneration
# Configuration
MODEL_PATH = "zai-org/GLM-4.6V-Flash"

# Load model and processor at import time so the Space fails fast if the
# weights or the monkey patch are broken.
print(f"Loading model: {MODEL_PATH}...")
try:
    processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
    model = Glm4vForConditionalGeneration.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.bfloat16,  # bf16 halves memory vs fp32
        low_cpu_mem_usage=True,
        trust_remote_code=True,
        device_map="auto",  # shard across available devices automatically
    )
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    # Dump the monkey-patched module state to the logs to diagnose whether
    # the failure is the known `extractors is None` transformers bug.
    try:
        import transformers.models.auto.video_processing_auto as vpa
        print(f"DEBUG: vpa.extractors is {getattr(vpa, 'extractors', 'MISSING')}")
    except ImportError:
        # Narrowed from a bare `except:` that could swallow SystemExit /
        # KeyboardInterrupt; only a failed import is expected here.
        pass
    raise
def predict(image, text, history_state):
    """Run one multimodal chat turn against the GLM-4V model.

    Args:
        image: Optional PIL image uploaded by the user.
        text: Optional user message string.
        history_state: List of prior turns (dicts with "user"/"assistant"
            content lists), or None on the very first call.

    Returns:
        Tuple of (response_markdown, updated_history_state). On failure the
        response carries the error message and the history is unchanged.
    """
    # Normalize state first so EVERY return path hands Gradio a list.
    # (Robustness fix: the empty-input early return used to leak None
    # back into gr.State on the first call.)
    if history_state is None:
        history_state = []

    if not text and not image:
        return "Please upload an image or enter text.", history_state

    # Rebuild the full message list from stored turns; skip malformed entries.
    messages = []
    for turn in history_state:
        if isinstance(turn, dict) and "user" in turn and "assistant" in turn:
            messages.append({"role": "user", "content": turn["user"]})
            messages.append({"role": "assistant", "content": turn["assistant"]})

    # Assemble the current user turn from the provided image and/or text.
    content = []
    if image is not None:
        content.append({"type": "image", "image": image})
    if text:
        content.append({"type": "text", "text": text})
    messages.append({"role": "user", "content": content})

    try:
        # Tokenize the whole conversation with the model's chat template.
        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)

        # Some processor versions emit token_type_ids that generate() rejects.
        if "token_type_ids" in inputs:
            inputs.pop("token_type_ids")

        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=1024,
                do_sample=True,
                temperature=0.7,
            )

        # Decode only the newly generated tail, dropping the prompt tokens.
        output_text = processor.decode(
            generated_ids[0][inputs["input_ids"].shape[1]:],
            skip_special_tokens=True,
        )

        # Persist this turn so follow-up questions keep context.
        history_state.append({
            "user": content,
            "assistant": [{"type": "text", "text": output_text}],
        })
        return output_text, history_state
    except Exception as e:
        # Surface generation failures in the UI instead of crashing the app.
        return f"Error during generation: {str(e)}", history_state
# Create Gradio Interface
# Layout: image + text inputs and action buttons on the left, markdown
# response on the right. Conversation history lives in a gr.State list of
# turn dicts that predict() reads and extends.
with gr.Blocks() as demo:
    gr.Markdown(f"# {MODEL_PATH}")
    gr.Markdown("Multimodal chat with conversation history support.")
    # Proper state initialization
    # Starts as an empty list; predict() returns the updated history back
    # into this component on every submit.
    state = gr.State([])
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image (Optional)")
            text_input = gr.Textbox(label="Message", placeholder="Enter text here...")
            submit_btn = gr.Button("Submit", variant="primary")
            clear_btn = gr.Button("Clear Conversation")
        with gr.Column():
            output = gr.Markdown(label="Response")
    # Wire the submit button: predict() consumes (image, text, history) and
    # produces (response, updated history).
    submit_btn.click(
        fn=predict,
        inputs=[image_input, text_input, state],
        outputs=[output, state]
    )
    # Clearing resets both the stored history and the visible response.
    clear_btn.click(lambda: ([], "Conversation cleared."), outputs=[state, output])
    gr.Markdown("""
### API Usage
Supports multimodal inputs (text + image).
**Note**: Includes pre-import monkey patch for transformers `video_processing_auto.extractors` bug.
""")

# Launch the app only when run as a script (HF Spaces executes this file).
if __name__ == "__main__":
    demo.launch()