Update app.py
app.py
CHANGED
@@ -41,8 +41,13 @@ CONFIG = {
 # hosted model) which can generate longer outputs faster than a CPU-bound local
 # model. Set `HF_INFERENCE_MODEL` to choose the remote model (instruction-tuned
 # model recommended).
+#
+# PHI models are excellent lightweight instruction-following models:
+# - microsoft/phi-2 (2.7B parameters, free inference)
+# - microsoft/Phi-3-mini-4k-instruct (3.8B parameters, recommended)
+# - microsoft/Phi-3-mini-128k-instruct (3.8B with longer context)
 USE_REMOTE_LLM = False
-REMOTE_LLM_MODEL = os.environ.get("HF_INFERENCE_MODEL", "
+REMOTE_LLM_MODEL = os.environ.get("HF_INFERENCE_MODEL", "microsoft/Phi-3-mini-4k-instruct")

 # Prefer the environment variable, but also allow a local token file for users
 # who don't know how to set env vars. Create a file named `hf_token.txt` in the
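For reference, the configuration convention these comments describe can be reproduced outside the Space; the `load_hf_token` helper and the token-file location below are illustrative assumptions, not code from app.py:

```python
import os
from typing import Optional

# Hypothetical helper mirroring the described convention: prefer the
# HF_INFERENCE_API_KEY env var, fall back to a local hf_token.txt file.
def load_hf_token(path: str = "hf_token.txt") -> Optional[str]:
    token = os.environ.get("HF_INFERENCE_API_KEY")
    if token:
        return token.strip()
    try:
        with open(path, encoding="utf-8") as f:
            return f.read().strip()
    except OSError:
        return None

# Model selection works the same way as the REMOTE_LLM_MODEL line above.
model = os.environ.get("HF_INFERENCE_MODEL", "microsoft/Phi-3-mini-4k-instruct")
```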
@@ -69,13 +74,17 @@ def initialize_llm():
     # If a remote HF Inference API key is provided, we won't instantiate a local
     # heavy model; instead generation will be performed via the HTTP API.
     global USE_REMOTE_LLM, REMOTE_LLM_MODEL
+    # For Hugging Face Spaces deployment: prefer remote PHI inference
+    # This avoids memory issues on CPU-only spaces and provides better performance
     if USE_REMOTE_LLM:
-        logger.info(f"Using remote Hugging Face Inference model: {REMOTE_LLM_MODEL}")
+        logger.info(f"Using remote Hugging Face Inference with PHI model: {REMOTE_LLM_MODEL}")
+        logger.info("PHI models are optimized for instruction-following and long-form generation")
         CONFIG["llm_model"] = REMOTE_LLM_MODEL
-        CONFIG["model_type"] = "
+        CONFIG["model_type"] = "remote_phi"
         return None

-
+    # Final fallback: attempt to initialize the free local T5 model (as before)
+    logger.info("Initializing FREE local language model (fallback to T5)...")
     model_name = "google/flan-t5-large"

     try:
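The local fallback branch continues into a `try:` block that this hunk cuts off. With the standard transformers API, a minimal sketch of such a Flan-T5 fallback looks like this (app.py's exact arguments are not visible in the diff):

```python
from transformers import pipeline

def initialize_local_t5(model_name: str = "google/flan-t5-large"):
    # "text2text-generation" is the pipeline task for seq2seq models like Flan-T5;
    # device placement, dtype, and generation defaults are left to the library here.
    return pipeline("text2text-generation", model=model_name)

# Usage: generator = initialize_local_t5(); generator("Summarize: ...")
```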
@@ -106,58 +115,82 @@ def remote_generate(prompt: str, max_new_tokens: int = 512, temperature: float =
     """Call the Hugging Face Inference API for remote generation. Requires
     `HF_INFERENCE_API_KEY` env var to be set and a model name in
     `REMOTE_LLM_MODEL`.
+
+    PHI models work best with clear instruction formatting. This function
+    handles both the standard HF Inference API and PHI-specific response parsing.
     """
     if not HF_INFERENCE_API_KEY:
         raise Exception("HF_INFERENCE_API_KEY not set for remote generation")

-    #
-
-
-
+    # Use the HF Inference API endpoint (not router for better PHI compatibility)
+    api_url = f"https://api-inference.huggingface.co/models/{REMOTE_LLM_MODEL}"
+    headers = {"Authorization": f"Bearer {HF_INFERENCE_API_KEY}"}
+
+    # PHI models prefer simple parameters; avoid return_full_text which can cause issues
     payload = {
         "inputs": prompt,
         "parameters": {
             "max_new_tokens": max_new_tokens,
             "temperature": temperature,
             "top_p": top_p,
-            "
+            "do_sample": True,
+            "repetition_penalty": 1.1
         }
     }

-    logger.info(f"Remote inference
+    logger.info(f"Remote PHI inference to {REMOTE_LLM_MODEL} (tokens={max_new_tokens}, temp={temperature})")
     try:
-        r = requests.post(
+        r = requests.post(api_url, headers=headers, json=payload, timeout=90)
     except Exception as e:
-        logger.error(f"Remote
-
+        logger.error(f"Remote request failed: {e}")
+        return ""
+
+    if r.status_code == 503:
+        logger.warning("Model loading (503), retrying in 5s...")
+        import time
+        time.sleep(5)
         try:
-
-
-
-            logger.error(f"Legacy endpoint request failed: {e2}")
+            r = requests.post(api_url, headers=headers, json=payload, timeout=90)
+        except Exception as e:
+            logger.error(f"Retry failed: {e}")
             return ""

     if r.status_code != 200:
-        logger.error(f"Remote inference error {r.status_code}: {r.text[:
+        logger.error(f"Remote inference error {r.status_code}: {r.text[:300]}")
         return ""

     result = r.json()
+
+    # Handle error responses
     if isinstance(result, dict) and result.get("error"):
         logger.error(f"Remote inference returned error: {result.get('error')}")
         return ""

-    #
+    # Parse the generated text from various response formats
+    generated_text = ""
+
     if isinstance(result, list) and result:
-        #
+        # HF Inference API returns [{"generated_text": "..."}]
         first = result[0]
         if isinstance(first, dict):
-
-
-
-
-
+            generated_text = first.get("generated_text", "")
+        else:
+            generated_text = str(first)
+    elif isinstance(result, dict) and "generated_text" in result:
+        generated_text = result["generated_text"]
+    else:
+        generated_text = str(result)

-        return
+    # Clean up: PHI may return the prompt + completion, extract only new text
+    generated_text = generated_text.strip()
+
+    # If the response contains the original prompt, extract only the new completion
+    if prompt in generated_text:
+        # Find where the prompt ends and new generation begins
+        prompt_end = generated_text.find(prompt) + len(prompt)
+        generated_text = generated_text[prompt_end:].strip()
+
+    return generated_text

 def initialize_embeddings():
     logger.info("Initializing embeddings model...")
|