hamxaameer committed
Commit 25c4058 · verified · 1 Parent(s): 3c74f4e

Update app.py

Files changed (1)
app.py +138 -74
app.py CHANGED
@@ -36,18 +36,12 @@ CONFIG = {
     "max_tokens": 350,
 }
 
-# Remote inference config (optional). If `HF_INFERENCE_API_KEY` is set in the
-# environment, the app will prefer calling the Hugging Face Inference API (remote
-# hosted model) which can generate longer outputs faster than a CPU-bound local
-# model. Set `HF_INFERENCE_MODEL` to choose the remote model (instruction-tuned
-# model recommended).
-#
-# PHI models are excellent lightweight instruction-following models:
-# - microsoft/phi-2 (2.7B parameters, free inference)
-# - microsoft/Phi-3-mini-4k-instruct (3.8B parameters, recommended)
-# - microsoft/Phi-3-mini-128k-instruct (3.8B with longer context)
+# Local PHI model configuration for Hugging Face Spaces
+# PHI-2 is optimal for CPU deployment: 2.7B parameters, excellent quality
+# Can be swapped with Phi-3-mini-4k-instruct if more memory is available
+LOCAL_PHI_MODEL = os.environ.get("LOCAL_PHI_MODEL", "microsoft/phi-2")
+USE_8BIT_QUANTIZATION = True  # Reduces memory usage by ~50%
 USE_REMOTE_LLM = False
-REMOTE_LLM_MODEL = os.environ.get("HF_INFERENCE_MODEL", "microsoft/Phi-3-mini-4k-instruct")
 
 # Prefer the environment variable, but also allow a local token file for users
 # who don't know how to set env vars. Create a file named `hf_token.txt` in the
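Note: `LOCAL_PHI_MODEL` is read from the environment with a phi-2 default, so the model can be swapped per Space without editing app.py. A minimal sketch of that override (illustrative only; the variable name comes from the hunk above):

    import os

    # When unset, the lookup falls back to the default wired into the commit.
    os.environ["LOCAL_PHI_MODEL"] = "microsoft/Phi-3-mini-4k-instruct"
    model_id = os.environ.get("LOCAL_PHI_MODEL", "microsoft/phi-2")
    print(model_id)  # microsoft/Phi-3-mini-4k-instruct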
@@ -71,44 +65,92 @@ if HF_INFERENCE_API_KEY:
 # ============================================================================
 
 def initialize_llm():
-    # If a remote HF Inference API key is provided, we won't instantiate a local
-    # heavy model; instead generation will be performed via the HTTP API.
-    global USE_REMOTE_LLM, REMOTE_LLM_MODEL
-    # For Hugging Face Spaces deployment: prefer remote PHI inference
-    # This avoids memory issues on CPU-only spaces and provides better performance
-    if USE_REMOTE_LLM:
-        logger.info(f"🔄 Using remote Hugging Face Inference with PHI model: {REMOTE_LLM_MODEL}")
-        logger.info(f" ✅ PHI models are optimized for instruction-following and long-form generation")
-        CONFIG["llm_model"] = REMOTE_LLM_MODEL
-        CONFIG["model_type"] = "remote_phi"
-        return None
-
-    # Final fallback: attempt to initialize the free local T5 model (as before)
-    logger.info("🔄 Initializing FREE local language model (fallback to T5)...")
-    model_name = "google/flan-t5-large"
-
+    """Initialize PHI model locally with CPU optimizations for Hugging Face Spaces.
+
+    Uses efficient techniques:
+    - 8-bit quantization to reduce memory by ~50%
+    - CPU-optimized loading with device_map
+    - Lazy loading and minimal memory footprint
+    """
+    global LOCAL_PHI_MODEL, USE_8BIT_QUANTIZATION
+
+    logger.info(f"🔄 Initializing local PHI model: {LOCAL_PHI_MODEL}")
+    logger.info(" Using CPU-optimized configuration for Hugging Face Spaces")
+
     try:
-        logger.info(f" Loading {model_name}...")
-        device = 0 if torch.cuda.is_available() else -1
-
-        model_kwargs = {"low_cpu_mem_usage": True}
-
+        from transformers import AutoTokenizer, AutoModelForCausalLM
+
+        # Check if we have GPU (unlikely on free Spaces, but check anyway)
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.info(f" Target device: {device}")
+
+        # Load tokenizer (lightweight)
+        logger.info(" Loading tokenizer...")
+        tokenizer = AutoTokenizer.from_pretrained(
+            LOCAL_PHI_MODEL,
+            trust_remote_code=True,
+            use_fast=True
+        )
+
+        # Set padding token if not present (PHI models need this)
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+
+        # Configure model loading for CPU efficiency
+        model_kwargs = {
+            "trust_remote_code": True,
+            "low_cpu_mem_usage": True,
+            "torch_dtype": torch.float32,  # CPU works best with float32
+        }
+
+        # Try to use 8-bit quantization if available (requires bitsandbytes)
+        if USE_8BIT_QUANTIZATION and device == "cpu":
+            try:
+                logger.info(" Attempting 8-bit quantization for memory efficiency...")
+                model_kwargs["load_in_8bit"] = True
+            except Exception as quant_error:
+                logger.warning(f" 8-bit quantization unavailable: {quant_error}")
+                logger.info(" Falling back to float32 (will use more memory)")
+
+        # Load the model
+        logger.info(" Loading PHI model (this may take 30-60 seconds)...")
+        model = AutoModelForCausalLM.from_pretrained(
+            LOCAL_PHI_MODEL,
+            **model_kwargs
+        )
+
+        # Move to eval mode to disable dropout and save memory
+        model.eval()
+
+        # Create pipeline for generation
+        logger.info(" Creating text-generation pipeline...")
         llm_client = pipeline(
-            "text2text-generation",
-            model=model_name,
-            device=device,
-            model_kwargs=model_kwargs
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            device=0 if device == "cuda" else -1,
+            max_new_tokens=512,
+            pad_token_id=tokenizer.eos_token_id
         )
-
-        CONFIG["llm_model"] = model_name
-        CONFIG["model_type"] = "t5"
-        logger.info(f"✅ LLM initialized: {model_name}")
-        logger.info(f" Device: {'GPU' if device == 0 else 'CPU'}")
+
+        CONFIG["llm_model"] = LOCAL_PHI_MODEL
+        CONFIG["model_type"] = "phi_local"
+
+        logger.info(f"✅ PHI model initialized successfully: {LOCAL_PHI_MODEL}")
+        logger.info(f" Model size: ~2.7B parameters (PHI-2) or ~3.8B (PHI-3)")
+        logger.info(f" Memory optimization: {'8-bit quantization' if USE_8BIT_QUANTIZATION else 'float32'}")
+
         return llm_client
-
+
+    except ImportError as ie:
+        logger.error(f"❌ Missing required library: {ie}")
+        logger.info(" Install with: pip install transformers accelerate bitsandbytes")
+        raise
     except Exception as e:
-        logger.error(f"❌ Failed to load model: {str(e)}")
-        raise Exception(f"Failed to initialize LLM: {str(e)}")
+        logger.error(f"❌ Failed to load PHI model: {str(e)}")
+        logger.info(" This may be due to insufficient memory on the Space")
+        logger.info(" Try using a smaller model or enabling 8-bit quantization")
+        raise Exception(f"Failed to initialize PHI LLM: {str(e)}")
 
 
 def remote_generate(prompt: str, max_new_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.9) -> str:
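For context, a `text-generation` pipeline like the one built in this hunk returns a list of dicts whose `generated_text` field holds the prompt followed by the completion. A minimal, self-contained sketch of calling such a pipeline (model name, prompt, and parameter values are illustrative, not taken from app.py):

    from transformers import pipeline

    generator = pipeline("text-generation", model="microsoft/phi-2", trust_remote_code=True)
    out = generator(
        "Instruct: Explain retrieval-augmented generation in one sentence.\nOutput:",
        max_new_tokens=64,
        do_sample=True,
        temperature=0.7,
        pad_token_id=generator.tokenizer.eos_token_id,
    )
    print(out[0]["generated_text"])  # prompt + completion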
@@ -472,14 +514,30 @@ Draft:
 Answer:
 """
 
-    logger.info(" → Polishing scaffold with LLM")
+    logger.info(" → Polishing scaffold with PHI model")
     try:
-        if USE_REMOTE_LLM:
-            polished = remote_generate(polish_prompt, max_new_tokens=600, temperature=0.72, top_p=0.92)
+        out = llm_client(
+            polish_prompt,
+            max_new_tokens=600,
+            temperature=0.72,
+            top_p=0.92,
+            do_sample=True,
+            repetition_penalty=1.1,
+            pad_token_id=llm_client.tokenizer.eos_token_id
+        )
+
+        # Extract and clean the polished text
+        if isinstance(out, list) and out:
+            polished = out[0].get('generated_text', '') if isinstance(out[0], dict) else str(out[0])
+        else:
+            polished = str(out)
+
+        # Remove prompt echo if present
+        if polish_prompt in polished:
+            polished = polished[len(polish_prompt):].strip()
         else:
-            out = llm_client(polish_prompt, max_new_tokens=600, temperature=0.72, top_p=0.92, do_sample=True, num_beams=1)
-            polished = out[0].get('generated_text', '') if isinstance(out, list) and out else str(out)
             polished = polished.strip()
+
     except Exception as e:
         logger.error(f" ✗ Polishing error: {e}")
         return None
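The prompt-echo removal above is needed because a causal text-generation pipeline returns the prompt followed by the completion, so the new text has to be sliced off. The same idiom in isolation (hypothetical helper, not defined in app.py):

    def strip_prompt_echo(prompt: str, generated: str) -> str:
        # Keep only the completion when the model echoes the prompt back.
        if generated.startswith(prompt):
            return generated[len(prompt):].strip()
        return generated.strip()

    prompt = "Draft: short summary\nAnswer:"
    output = prompt + " The polished answer text."
    print(strip_prompt_echo(prompt, output))  # "The polished answer text."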
@@ -555,9 +613,9 @@ def generate_llm_answer(
     llm_client,
     attempt: int = 1
 ) -> Optional[str]:
-    # Allow operation when using remote inference (no local llm_client).
-    if not llm_client and not USE_REMOTE_LLM:
-        logger.error(" → LLM client not initialized and remote inference disabled")
+    # Ensure we have a local PHI model loaded
+    if not llm_client:
+        logger.error(" → PHI model not initialized")
         return None
 
     query_lower = query.lower()
@@ -600,27 +658,36 @@
     max_iterations = 4
 
     def call_model(prompt, max_new_tokens, temperature, top_p, repetition_penalty):
-        logger.info(f" → Model call (temp={temperature}, max_new_tokens={max_new_tokens})")
+        logger.info(f" → PHI model call (temp={temperature}, max_new_tokens={max_new_tokens})")
         try:
-            if USE_REMOTE_LLM:
-                # Use remote Hugging Face Inference API
-                return remote_generate(prompt, max_new_tokens, temperature, top_p)
-
+            # Call local PHI model (causal LM)
             out = llm_client(
                 prompt,
                 max_new_tokens=max_new_tokens,
                 temperature=temperature,
                 top_p=top_p,
                 do_sample=True,
-                num_beams=1,
                 repetition_penalty=repetition_penalty,
-                early_stopping=False
+                num_return_sequences=1,
+                pad_token_id=llm_client.tokenizer.eos_token_id,
+                eos_token_id=llm_client.tokenizer.eos_token_id
             )
+
+            # Extract generated text from pipeline output
             if isinstance(out, list) and out:
-                return out[0].get('generated_text', '') if isinstance(out[0], dict) else str(out[0])
-            return str(out)
+                generated = out[0].get('generated_text', '') if isinstance(out[0], dict) else str(out[0])
+            else:
+                generated = str(out)
+
+            # PHI models return prompt + completion, extract only new text
+            if prompt in generated:
+                # Remove the prompt from the output
+                generated = generated[len(prompt):].strip()
+
+            return generated
+
         except Exception as e:
-            logger.error(f" ✗ Model call error: {e}")
+            logger.error(f" ✗ PHI model call error: {e}")
             return ''
 
     # Build initial prompt
@@ -771,18 +838,15 @@ def generate_answer_langchain(
 
     if not llm_answer:
         logger.error(f" ✗ All 2 LLM attempts failed")
-        # Next attempt: if remote LLM is available, build a short scaffold from
-        # retrieved documents and ask the remote model to polish/expand it. This
-        # is more reliable than single-shot long generation on some models.
-        if USE_REMOTE_LLM:
-            try:
-                logger.info(" → Attempting scaffold-and-polish using remote LLM")
-                polished = scaffold_and_polish(query, retrieved_docs, llm_client)
-                if polished:
-                    logger.info(" ✅ Scaffold-and-polish produced an answer")
-                    return polished
-            except Exception as e:
-                logger.error(f" ✗ Scaffold-and-polish error: {e}")
+        # Try scaffold-and-polish as a fallback strategy
+        try:
+            logger.info(" → Attempting scaffold-and-polish using PHI model")
+            polished = scaffold_and_polish(query, retrieved_docs, llm_client)
+            if polished:
+                logger.info(" ✅ Scaffold-and-polish produced an answer")
+                return polished
+        except Exception as e:
+            logger.error(f" ✗ Scaffold-and-polish error: {e}")
 
     # Final fallback: extractive templated answer (guaranteed deterministic)
     try:
 