workbykait committed on
Commit 6f59ec0 · verified · 1 Parent(s): 53f9d55

Update inference.py

Files changed (1)
  1. inference.py  +34 -51
inference.py CHANGED
@@ -7,59 +7,42 @@ import gc
 
 def generate_response(model_cfg, prompt, max_new_tokens=512, temperature=0.7):
     model_id = model_cfg["id"]
-    provider = model_cfg.get("provider", None)  # optional override
-
-    client = InferenceClient(model=model_id, provider=provider)
-
-    try:
-        # Prefer chat/completions style — much more reliable in 2026
-        messages = [{"role": "user", "content": prompt}]
-
-        completion = client.chat_completion(
-            messages=messages,
-            max_tokens=max_new_tokens,
-            temperature=temperature,
-            stream=False
-        )
-        return completion.choices[0].message.content.strip()
-
-    except AttributeError:
-        # Fallback to text_generation if chat_completion not available
+    primary_provider = model_cfg.get("provider")
+
+    # Try order: primary → groq → nebius → featherless-ai → default (HF)
+    providers_to_try = [primary_provider, "groq", "nebius", "featherless-ai", None]
+
+    for prov in [p for p in providers_to_try if p is not None or p == primary_provider]:
         try:
-            output = client.text_generation(
-                prompt,
-                max_new_tokens=max_new_tokens,
+            client = InferenceClient(model=model_id, provider=prov)
+            messages = [{"role": "user", "content": prompt}]
+            completion = client.chat.completions.create(
+                messages=messages,
+                max_tokens=max_new_tokens,
                 temperature=temperature,
-                details=False
+                stream=False
             )
-            return output.generated_text.strip() if hasattr(output, "generated_text") else output
-        except Exception as e_text:
-            raise RuntimeError(f"Both chat_completion and text_generation failed: {e_text}")
-
-    except Exception as e:
-        raise RuntimeError(
-            f"Generation failed for {model_id} (provider={provider}): {str(e)}\n"
-            "Try changing provider in models_config.py or use a different model."
-        )
-
-# Keep local quantized fallback only if you have GPU hardware
-# (comment out if running on CPU-only Space)
-def local_generate_fallback(model_cfg, prompt, max_new_tokens=512):
-    if not model_cfg.get("quantized", False):
-        return None
-
-    try:
-        bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
-        tokenizer = AutoTokenizer.from_pretrained(model_cfg["id"])
-        model = AutoModelForCausalLM.from_pretrained(
-            model_cfg["id"], quantization_config=bnb_config, device_map="auto"
-        )
-        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, temperature=0.7)
-        resp = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        del model, tokenizer, inputs, outputs
-        gc.collect()
-        torch.cuda.empty_cache() if torch.cuda.is_available() else None
-        return resp[len(prompt):].strip()
-    except Exception as e:
-        return f"[Local fallback failed: {str(e)}]"
+            return completion.choices[0].message.content.strip()
+
+        except Exception as chat_err:
+            print(f"Chat completion failed (provider={prov}): {chat_err}")
+            # Fallback to legacy text_generation
+            try:
+                output = client.text_generation(
+                    prompt,
+                    max_new_tokens=max_new_tokens,
+                    temperature=temperature,
+                    details=False
+                )
+                return output if isinstance(output, str) else output.generated_text
+            except Exception as text_err:
+                print(f"Text generation also failed (provider={prov}): {text_err}")
+                continue
+
+    raise RuntimeError(
+        f"Generation failed for {model_id} after trying providers: {providers_to_try}\n"
+        "Check model card for supported providers or try different models."
+    )
+
+# Optional local quantized fallback (only if GPU hardware available)
+# ... (keep your existing local code if needed)
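
A note on the new provider loop: the filter [p for p in providers_to_try if p is not None or p == primary_provider] skips the trailing None entry (the default Hugging Face routing) whenever model_cfg sets a provider, and tries the primary twice when it already appears in the hard-coded list. Below is a minimal sketch of one way to produce the order the comment describes, assuming the same model_cfg shape as above; build_provider_order is a hypothetical helper, not part of this commit.

# Hypothetical helper (not in this commit): build the order
# primary → groq → nebius → featherless-ai → default (None = HF routing)
# without duplicates and without dropping the default fallback.
def build_provider_order(model_cfg):
    primary = model_cfg.get("provider")  # optional override, may be None
    candidates = ["groq", "nebius", "featherless-ai", None]
    if primary is not None:
        candidates.insert(0, primary)
    ordered = []
    for provider in candidates:
        if provider not in ordered:  # keep the first occurrence only
            ordered.append(provider)
    return ordered

# Example: build_provider_order({"id": "some-model", "provider": "nebius"})
# -> ["nebius", "groq", "featherless-ai", None]

The loop could then read "for prov in build_provider_order(model_cfg):" with the body left unchanged.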