AlsuGibadullina commited on
Commit
e1fbc11
·
verified ·
1 Parent(s): f435cf6

Create backends.py

Browse files
Files changed (1) hide show
  1. src/backends.py +130 -0
src/backends.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import time
import json
import requests
from dataclasses import dataclass
from typing import Optional, Dict, Any, Protocol

from huggingface_hub import InferenceClient

# Local backend (optional): transformers/torch may be absent in a slim,
# API-only deployment. Import failures are swallowed here and the names are
# set to None sentinels, so that merely importing this module never fails;
# LocalTransformersBackend checks the sentinels and raises at construction
# time instead.
try:
    from transformers import AutoTokenizer, AutoModelForCausalLM
    import torch
except Exception:
    AutoTokenizer = None
    AutoModelForCausalLM = None
    torch = None
19
+
20
class LLMBackend(Protocol):
    """Structural (duck-typed) interface every text-generation backend satisfies."""

    def generate(self, prompt: str, *, system: Optional[str], params: Dict[str, Any]) -> str:
        """Return generated text for *prompt*.

        Args:
            prompt: The user prompt to complete.
            system: Optional system instruction (sent as the system role, or
                prepended to the prompt, depending on the backend).
            params: Sampling options; implementations read keys such as
                "temperature", "max_new_tokens", "top_p", "repetition_penalty".
        """
        ...
23
+
24
+
25
@dataclass
class HFInferenceAPIBackend:
    """
    Backend that calls the HF Inference API via huggingface_hub.InferenceClient.

    Works well on Spaces for large models if you provide HF_TOKEN in Secrets.

    Attributes:
        model_id: Hugging Face Hub model repo id to query.
        token: API token; when omitted, falls back to the HF_TOKEN or
            HUGGINGFACEHUB_API_TOKEN environment variables.
        timeout_s: Per-request timeout in seconds.
    """
    model_id: str
    token: Optional[str] = None
    timeout_s: int = 120

    def __post_init__(self):
        # Resolve the token from the environment when not passed explicitly.
        self.token = self.token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
        self.client = InferenceClient(model=self.model_id, token=self.token, timeout=self.timeout_s)

    def generate(self, prompt: str, *, system: Optional[str], params: Dict[str, Any]) -> str:
        """Generate text for *prompt*, preferring the chat endpoint.

        Tries chat.completions first (for chat-tuned models) and falls back to
        plain text_generation when the chat call fails for any reason.
        NOTE: repetition_penalty is only honored on the text_generation
        fallback path; the chat API does not accept that parameter.
        """
        temperature = float(params.get("temperature", 0.2))
        max_new_tokens = int(params.get("max_new_tokens", 600))
        top_p = float(params.get("top_p", 0.95))
        repetition_penalty = float(params.get("repetition_penalty", 1.05))

        # Deliberately broad except: any chat failure (model has no chat
        # template, endpoint mismatch, transient error) triggers the
        # text_generation fallback; a genuine outage will surface there.
        try:
            messages = []
            if system:
                messages.append({"role": "system", "content": system})
            messages.append({"role": "user", "content": prompt})

            resp = self.client.chat.completions.create(
                model=self.model_id,
                messages=messages,
                temperature=temperature,
                max_tokens=max_new_tokens,
                top_p=top_p,
            )
            content = resp.choices[0].message.content
            # The API may return None content; honor the declared -> str
            # contract instead of leaking None to callers.
            return content if content is not None else ""
        except Exception:
            # Fallback: raw text generation with the system text prepended.
            out = self.client.text_generation(
                prompt=(f"{system}\n\n{prompt}" if system else prompt),
                temperature=temperature,
                max_new_tokens=max_new_tokens,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                do_sample=True,
                return_full_text=False,
            )
            return out
75
+
76
+
77
@dataclass
class LocalTransformersBackend:
    """
    Loads the model locally in the Space container.

    Use only small models unless you have a GPU Space and enough memory.

    Attributes:
        model_id: Hugging Face Hub model repo id to load.
        device: torch device string; defaults to "cpu".

    Raises:
        RuntimeError: at construction when transformers/torch could not be
            imported (the module-level optional-import sentinels are None).
    """
    model_id: str
    device: str = "cpu"

    def __post_init__(self):
        if AutoTokenizer is None or AutoModelForCausalLM is None:
            raise RuntimeError("transformers/torch not available in this environment.")

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, use_fast=True)
        self.model = AutoModelForCausalLM.from_pretrained(self.model_id)
        if torch is not None:
            self.model.to(self.device)
        # Inference only: switch off dropout (and any other train-mode
        # behavior) so sampling is driven purely by the generation params.
        self.model.eval()

    def generate(self, prompt: str, *, system: Optional[str], params: Dict[str, Any]) -> str:
        """Generate a completion for *prompt* with the locally loaded model.

        The system text, when given, is prepended to the prompt since a plain
        causal LM has no chat roles.
        """
        temperature = float(params.get("temperature", 0.2))
        max_new_tokens = int(params.get("max_new_tokens", 600))
        top_p = float(params.get("top_p", 0.95))
        repetition_penalty = float(params.get("repetition_penalty", 1.05))

        full_prompt = (f"{system}\n\n{prompt}" if system else prompt)

        inputs = self.tokenizer(full_prompt, return_tensors="pt")
        if torch is not None:
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            output_ids = self.model.generate(
                **inputs,
                do_sample=True,
                temperature=temperature,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                max_new_tokens=max_new_tokens,
            )

        # Decode only the newly generated tokens. Slicing by the input length
        # is more robust than stripping the decoded prompt as a string prefix,
        # which breaks whenever the tokenizer inserts special tokens (e.g. a
        # BOS token) so the decoded output no longer starts with the prompt.
        prompt_len = inputs["input_ids"].shape[1]
        text = self.tokenizer.decode(output_ids[0][prompt_len:], skip_special_tokens=True)
        return text.lstrip()
122
+
123
+
124
def make_backend(backend_type: str, model_id: str) -> LLMBackend:
    """Build the backend matching *backend_type*.

    Args:
        backend_type: Either "hf_inference_api" or "local_transformers".
        model_id: Hugging Face Hub model repo id handed to the backend.

    Returns:
        A constructed backend implementing the LLMBackend protocol.

    Raises:
        ValueError: when *backend_type* names no known backend.
    """
    factories = {
        "hf_inference_api": lambda: HFInferenceAPIBackend(model_id=model_id),
        # auto-device for local; keep cpu by default
        "local_transformers": lambda: LocalTransformersBackend(model_id=model_id, device="cpu"),
    }
    factory = factories.get(backend_type)
    if factory is None:
        raise ValueError(f"Unknown backend: {backend_type}")
    return factory()