Update app.py
app.py
CHANGED
@@ -61,11 +61,11 @@ CONFIG = {
     "max_tokens": 600,  # Allow natural length responses
 }

-# Local
-#
-#
-
-USE_8BIT_QUANTIZATION = False #
+# Local LLM configuration for Hugging Face Spaces
+# TinyLlama: 1.1B parameters, fast on CPU, reliable generation
+# Alternative: google/flan-t5-base (smaller, faster)
+LOCAL_LLM_MODEL = os.environ.get("LOCAL_LLM_MODEL", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+USE_8BIT_QUANTIZATION = False  # Not needed for TinyLlama
 USE_REMOTE_LLM = False

 # Natural flow mode: No word limits, let model decide length
@@ -95,16 +95,13 @@ if HF_INFERENCE_API_KEY:
 # ============================================================================

 def initialize_llm():
-    """Initialize
+    """Initialize TinyLlama model locally with CPU optimizations.

-
-    - 8-bit quantization to reduce memory by ~50%
-    - CPU-optimized loading with device_map
-    - Lazy loading and minimal memory footprint
+    TinyLlama is fast, reliable, and works well on CPU without device issues.
     """
-    global
+    global LOCAL_LLM_MODEL, USE_8BIT_QUANTIZATION

-    logger.info(f"π Initializing local
+    logger.info(f"π Initializing local LLM: {LOCAL_LLM_MODEL}")
     logger.info("   Using CPU-optimized configuration for Hugging Face Spaces")

     try:
@@ -114,37 +111,33 @@ def initialize_llm():
         device = "cuda" if torch.cuda.is_available() else "cpu"
         logger.info(f"   Target device: {device}")

-        # Load tokenizer
+        # Load tokenizer
         logger.info("   Loading tokenizer...")
         tokenizer = AutoTokenizer.from_pretrained(
-
-            trust_remote_code=True,
-            use_fast=True
+            LOCAL_LLM_MODEL,
+            trust_remote_code=True
         )

-        # Configure tokenizer
+        # Configure tokenizer
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
         if tokenizer.pad_token_id is None:
             tokenizer.pad_token_id = tokenizer.eos_token_id

-        logger.info(f"   Tokenizer
+        logger.info(f"   Tokenizer ready: {len(tokenizer)} tokens")
-
-        # Configure model loading for CPU efficiency (NO quantization)
-        model_kwargs = {
-            "trust_remote_code": True,
-            "low_cpu_mem_usage": True,
-            "torch_dtype": torch.float32,  # CPU works best with float32
-            "device_map": "auto",  # Let transformers handle device placement
-        }

-        # Load
-        logger.info("   Loading
+        # Load model - simple CPU configuration
+        logger.info("   Loading model (20-40 seconds)...")
         model = AutoModelForCausalLM.from_pretrained(
-
-
+            LOCAL_LLM_MODEL,
+            trust_remote_code=True,
+            torch_dtype=torch.float32,
+            low_cpu_mem_usage=True
         )

+        # Move to CPU explicitly
+        model = model.to('cpu')
+
         # Apply advanced optimizations for faster inference
         if hasattr(model, 'config'):
             # Reduce attention heads computation for speed
@@ -163,64 +156,70 @@ def initialize_llm():
         logger.info("   Configuring direct model inference (faster than pipeline)...")

         # Create a simple wrapper that mimics pipeline interface
-        class
+        class FastLLMGenerator:
             def __init__(self, model, tokenizer):
                 self.model = model
                 self.tokenizer = tokenizer

             def __call__(self, prompt, max_new_tokens=150, temperature=0.7, top_p=0.9,
                          do_sample=True, repetition_penalty=1.1, **kwargs):
-                """Direct generation - faster
+                """Direct generation - faster and more reliable"""
                 try:
                     # Tokenize
-                    inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=
-                    input_ids = inputs["input_ids"]
+                    inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=400)
+                    input_ids = inputs["input_ids"].to('cpu')
+                    attention_mask = inputs.get("attention_mask", None)
+                    if attention_mask is not None:
+                        attention_mask = attention_mask.to('cpu')

                     # Generate
                     with torch.no_grad():
                         outputs = self.model.generate(
                             input_ids,
+                            attention_mask=attention_mask,
                             max_new_tokens=max_new_tokens,
-                            temperature=temperature,
-                            top_p=top_p,
+                            temperature=temperature if do_sample else 1.0,
+                            top_p=top_p if do_sample else 1.0,
                             do_sample=do_sample,
                             repetition_penalty=repetition_penalty,
-                            pad_token_id=self.tokenizer.
-                            eos_token_id=self.tokenizer.eos_token_id
-                            early_stopping=True
+                            pad_token_id=self.tokenizer.pad_token_id,
+                            eos_token_id=self.tokenizer.eos_token_id
                         )

                     # Decode only the new tokens
                     generated_ids = outputs[0][input_ids.shape[1]:]
                     generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True)

-                    return [{"generated_text": generated_text}]
+                    return [{"generated_text": generated_text.strip()}]

                 except Exception as e:
                     logger.error(f"Generation error: {e}")
+                    import traceback
+                    logger.error(traceback.format_exc())
                     return [{"generated_text": ""}]

-        llm_client =
+        llm_client = FastLLMGenerator(model, tokenizer)
         llm_client.tokenizer = tokenizer  # Add tokenizer reference for compatibility

-        CONFIG["llm_model"] =
-        CONFIG["model_type"] = "
+        CONFIG["llm_model"] = LOCAL_LLM_MODEL
+        CONFIG["model_type"] = "tinyllama_local"

-        logger.info(f"β
-        logger.info(f"   Model size:
-        logger.info(f"
+        logger.info(f"β LLM initialized successfully: {LOCAL_LLM_MODEL}")
+        logger.info(f"   Model size: 1.1B parameters")
+        logger.info(f"   Expected speed: 5-15 seconds per response on CPU")

         return llm_client

     except ImportError as ie:
         logger.error(f"β Missing required library: {ie}")
-        logger.info("   Install with: pip install transformers
+        logger.info("   Install with: pip install transformers torch")
         raise
     except Exception as e:
-        logger.error(f"β Failed to load
-        logger.info("   This may be due to insufficient memory
-
-
+        logger.error(f"β Failed to load LLM: {str(e)}")
+        logger.info("   This may be due to insufficient memory")
+        import traceback
+        logger.error(traceback.format_exc())
+        raise Exception(f"Failed to initialize LLM: {str(e)}")


 def remote_generate(prompt: str, max_new_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.9) -> str:
@@ -715,7 +714,7 @@ def generate_llm_answer(
     # Ultra-simple prompt
     formatted_prompt = f"{prompt}\n\nAnswer:"

-    logger.info(f" β
+    logger.info(f" β Generating with TinyLlama (max_tokens={max_new_tokens})")

     # MINIMAL settings - most restrictive for speed
     out = llm_client(
@@ -744,7 +743,7 @@ def generate_llm_answer(
     gen_thread.join(timeout=45)  # 45 second timeout

     if gen_thread.is_alive():
-        logger.error("   β
+        logger.error("   β TIMEOUT after 45s - model may be too slow")
         return ''

     if result_container['error']:
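Review notes follow. First, the loading path: the hunks above split it into fragments, so here is the complete sequence this commit ends up with, as a reconstructed sketch rather than the literal file contents. It assumes the transformers and torch imports app.py already has, and uses the default model name from the config hunk.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # LOCAL_LLM_MODEL default

# Tokenizer: TinyLlama ships no pad token, so eos is reused as pad
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Model: float32 is the safe dtype on CPU; low_cpu_mem_usage streams weights in
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
).to("cpu")
model.eval()  # inference only; not in the diff, but harmless and conventional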
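Second, the call convention. FastLLMGenerator is deliberately call-compatible with a transformers text-generation pipeline: prompt in, one-element list of dicts out. Because the wrapper decodes only the new tokens, "generated_text" holds just the continuation, never the echoed prompt. A hypothetical caller (the prompt text here is invented for illustration):

# Returns [{"generated_text": "..."}], mirroring the pipeline interface
out = llm_client(
    "Question: What is retrieval-augmented generation?\n\nAnswer:",
    max_new_tokens=150,
    temperature=0.7,
    do_sample=True,
)
answer = out[0]["generated_text"]  # continuation only, prompt stripped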
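Third, two easy-to-miss details in the generate() call. Passing attention_mask explicitly matters here because the pad token is aliased to eos, so generate() would otherwise warn that it cannot tell padding from content. And temperature/top_p are neutralized to 1.0 when do_sample=False, since recent transformers releases warn when sampling knobs accompany greedy decoding. A minimal sketch, reusing model and tokenizer from the first note:

enc = tokenizer("Hello, world", return_tensors="pt")
with torch.no_grad():
    out_ids = model.generate(
        enc["input_ids"],
        attention_mask=enc["attention_mask"],  # explicit because pad == eos
        max_new_tokens=20,
        do_sample=False,                       # greedy: no temperature/top_p
        pad_token_id=tokenizer.pad_token_id,
    )
new_tokens = out_ids[0][enc["input_ids"].shape[1]:]  # drop the echoed prompt
print(tokenizer.decode(new_tokens, skip_special_tokens=True))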
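Finally, the timeout hunk. The code around gen_thread.join(timeout=45) is not part of this diff, but the names it references imply the usual watchdog pattern: generation runs in a worker thread that writes into a shared dict, and on timeout the thread is abandoned rather than killed (Python has no safe way to kill a thread). A sketch under those assumptions; _worker and the "text" key are hypothetical, while gen_thread, result_container['error'], and the 45 second timeout come from the diff:

import threading

result_container = {"text": "", "error": None}

def _worker():  # hypothetical helper name
    try:
        out = llm_client(formatted_prompt, max_new_tokens=150)
        result_container["text"] = out[0]["generated_text"]
    except Exception as e:
        result_container["error"] = str(e)

gen_thread = threading.Thread(target=_worker, daemon=True)
gen_thread.start()
gen_thread.join(timeout=45)  # 45 second timeout, as in the diff

if gen_thread.is_alive():
    answer = ""  # timed out; the daemon thread finishes (or not) in the background
elif result_container["error"]:
    answer = ""
else:
    answer = result_container["text"]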