from gliner import GLiNER
import torch


class EndpointHandler:
    """Hugging Face Inference Endpoint handler for a GLiNER NER model.

    Loads the model once at startup and serves zero-shot entity
    extraction requests of the form ``{"text": ..., "labels": [...]}``.
    """

    def __init__(self, path=""):
        """Load the GLiNER model from *path* and move it to the best device.

        GLiNER.from_pretrained does not accept ``device_map``, so the model
        is loaded on CPU first and moved explicitly afterwards.
        """
        # Fall back to CPU so the endpoint still boots on GPU-less hardware
        # (the original hard-coded "cuda" and crashed without a GPU).
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = GLiNER.from_pretrained(path)
        self.model = self.model.to(self.device)
        # eval() switches off training-only behavior (dropout, batchnorm
        # updates); it is NOT a lock.
        self.model.eval()

    def __call__(self, data):
        """Run entity prediction for one request.

        Accepts either ``{"text": ..., "labels": [...]}`` directly or the
        same payload wrapped under an ``"inputs"`` key (as the Hugging Face
        inference toolkit sends it).

        Returns ``{"entities": [...]}`` on success, or
        ``{"error": ...}`` when text/labels are missing or malformed.
        """
        # Unwrap the Hugging Face 'inputs' envelope if present.
        if isinstance(data, dict) and "inputs" in data:
            data = data["inputs"]

        # Guard non-dict payloads instead of raising AttributeError on .get().
        if not isinstance(data, dict):
            return {"error": "Please provide 'text' and 'labels'"}

        text = data.get("text", "")
        labels = data.get("labels", [])
        if not text or not labels:
            return {"error": "Please provide 'text' and 'labels'"}

        # inference_mode skips autograd bookkeeping — faster and lighter
        # for pure inference.
        with torch.inference_mode():
            entities = self.model.predict_entities(text, labels)
        return {"entities": entities}