Spaces:

malusama
/

M2-Encoder-0.4B-Space

Sleeping

App Files Files Community

malusama committed 30 days ago

Commit

a44eea8

verified ·

1 Parent(s): 1fde89f

Load tokenizer and image processor directly from snapshot

Browse files

Files changed (1) hide show

app.py +30 -8

app.py CHANGED Viewed

@@ -1,11 +1,13 @@
 from functools import lru_cache
 import json
 import os
 import torch
 from huggingface_hub import snapshot_download
 from PIL import Image
-from transformers import AutoModel, AutoProcessor
 os.environ["HF_ENDPOINT"] = "https://huggingface.co"
@@ -22,17 +24,22 @@ def load_components():
         repo_id=MODEL_ID,
         revision=MODEL_REVISION,
     )
     model = AutoModel.from_pretrained(
         model_dir,
         trust_remote_code=True,
     )
-    processor = AutoProcessor.from_pretrained(
-        model_dir,
-        trust_remote_code=True,
     )
     model.to(DEVICE)
     model.eval()
-    return model, processor
 def parse_labels(text: str):
@@ -52,10 +59,25 @@ def run_demo(image: Image.Image, candidate_text: str):
     if not labels:
         raise ValueError("Please enter at least one label.")
-    model, processor = load_components()
     with torch.no_grad():
-        text_inputs = processor(text=labels, return_tensors="pt")
-        image_inputs = processor(images=image.convert("RGB"), return_tensors="pt")
         text_outputs = model(**text_inputs)
         image_outputs = model(**image_inputs)

 from functools import lru_cache
+import importlib
 import json
 import os
+import sys
 import torch
 from huggingface_hub import snapshot_download
 from PIL import Image
+from transformers import AutoModel
 os.environ["HF_ENDPOINT"] = "https://huggingface.co"
         repo_id=MODEL_ID,
         revision=MODEL_REVISION,
     )
+    if model_dir not in sys.path:
+        sys.path.insert(0, model_dir)
     model = AutoModel.from_pretrained(
         model_dir,
         trust_remote_code=True,
     )
+    tokenizer = importlib.import_module("tokenization_glm").GLMChineseTokenizer(
+        vocab_file=os.path.join(model_dir, "sp.model")
     )
+    image_processor = importlib.import_module(
+        "image_processing_m2_encoder"
+    ).M2EncoderImageProcessor.from_pretrained(model_dir)
     model.to(DEVICE)
     model.eval()
+    return model, tokenizer, image_processor
 def parse_labels(text: str):
     if not labels:
         raise ValueError("Please enter at least one label.")
+    model, tokenizer, image_processor = load_components()
     with torch.no_grad():
+        text_inputs = tokenizer(
+            labels,
+            padding="max_length",
+            truncation=True,
+            max_length=52,
+            return_special_tokens_mask=True,
+            return_tensors="pt",
+        )
+        image_inputs = image_processor(image.convert("RGB"), return_tensors="pt")
+        text_inputs = {
+            key: value.to(DEVICE) if hasattr(value, "to") else value
+            for key, value in text_inputs.items()
+        }
+        image_inputs = {
+            key: value.to(DEVICE) if hasattr(value, "to") else value
+            for key, value in image_inputs.items()
+        }
         text_outputs = model(**text_inputs)
         image_outputs = model(**image_inputs)