Update app.py
app.py
CHANGED
@@ -65,11 +65,11 @@ CONFIG = {
 # PHI-2 is optimal for CPU deployment: 2.7B parameters, excellent quality
 # Can be swapped with Phi-3-mini-4k-instruct if more memory is available
 LOCAL_PHI_MODEL = os.environ.get("LOCAL_PHI_MODEL", "microsoft/phi-2")
-USE_8BIT_QUANTIZATION = True
+USE_8BIT_QUANTIZATION = False # DISABLED: causes hanging on CPU
 USE_REMOTE_LLM = False
 
 # Natural flow mode: No word limits, let model decide length
-MAX_CONTEXT_LENGTH = 1200
+MAX_CONTEXT_LENGTH = 400 # Reduced for faster generation
 USE_CACHING = True # Cache model outputs for repeated patterns
 ENABLE_FAST_MODE = False # Allow natural completion, no artificial limits
 
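An aside on the first toggle above: if 8-bit loading is ever re-enabled, it is safer to probe for support up front than to let model loading stall on a CPU-only box. A minimal sketch, assuming bitsandbytes as the quantization backend (the can_use_8bit helper is hypothetical, not part of app.py):

import importlib.util

import torch

def can_use_8bit() -> bool:
    # bitsandbytes' 8-bit kernels target CUDA; require both the package
    # and a visible GPU before opting in, instead of failing mid-load.
    has_bnb = importlib.util.find_spec("bitsandbytes") is not None
    return has_bnb and torch.cuda.is_available()

Gating USE_8BIT_QUANTIZATION on a check like this would make the flag self-correcting rather than hand-maintained.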
@@ -122,26 +122,22 @@ def initialize_llm():
         use_fast=True
     )
 
-    #
+    # Configure tokenizer for PHI models
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
+    if tokenizer.pad_token_id is None:
+        tokenizer.pad_token_id = tokenizer.eos_token_id
 
-
+    logger.info(f" Tokenizer configured: vocab_size={len(tokenizer)}, eos_token={tokenizer.eos_token}")
+
+    # Configure model loading for CPU efficiency (NO quantization)
     model_kwargs = {
         "trust_remote_code": True,
         "low_cpu_mem_usage": True,
         "torch_dtype": torch.float32, # CPU works best with float32
+        "device_map": "auto", # Let transformers handle device placement
     }
 
-    # Try to use 8-bit quantization if available (requires bitsandbytes)
-    if USE_8BIT_QUANTIZATION and device == "cpu":
-        try:
-            logger.info(" Attempting 8-bit quantization for memory efficiency...")
-            model_kwargs["load_in_8bit"] = True
-        except Exception as quant_error:
-            logger.warning(f" 8-bit quantization unavailable: {quant_error}")
-            logger.info(" Falling back to float32 (will use more memory)")
-
     # Load the model with optimization
     logger.info(" Loading PHI model (this may take 30-60 seconds)...")
     model = AutoModelForCausalLM.from_pretrained(
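Condensed out of the diff, the load path now amounts to roughly the following. A sketch, assuming the accelerate package is installed (device_map="auto" depends on it):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # PHI-2 ships without a pad token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    low_cpu_mem_usage=True,     # stream weights in, don't materialize twice
    torch_dtype=torch.float32,  # full precision; the safe default on CPU
    device_map="auto",          # let accelerate pick CPU/GPU placement
)
model.eval()  # inference only: disables dropout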
@@ -159,24 +155,19 @@ def initialize_llm():
     # Move to eval mode to disable dropout and save memory
     model.eval()
 
-    #
-    try:
-        if hasattr(torch, 'compile') and not USE_8BIT_QUANTIZATION:
-            logger.info(" Applying torch.compile for faster inference...")
-            model = torch.compile(model, mode="reduce-overhead")
-    except Exception as compile_error:
-        logger.info(f" Torch compile not available or failed: {compile_error}")
+    # Skip torch.compile - can cause issues on Hugging Face Spaces
+    logger.info(" Model ready for inference")
 
     # Create pipeline for generation
-    # NOTE: When using accelerate/quantization, do NOT specify device parameter
     logger.info(" Creating text-generation pipeline...")
     llm_client = pipeline(
         "text-generation",
         model=model,
         tokenizer=tokenizer,
-        max_new_tokens=
+        max_new_tokens=200, # Reduced for faster generation
         pad_token_id=tokenizer.eos_token_id,
-
+        eos_token_id=tokenizer.eos_token_id,
+        device=0 if device == "cuda" else -1 # -1 for CPU
     )
 
     CONFIG["llm_model"] = LOCAL_PHI_MODEL
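One caveat, echoing the NOTE deleted in this hunk: transformers objects to an explicit device= argument for a model that accelerate has already dispatched via device_map="auto" (such models carry an hf_device_map attribute). A defensive sketch of the pipeline call under that constraint (the guard is hypothetical, not in this commit; model, tokenizer, and device are the variables from the surrounding code):

from transformers import pipeline

pipeline_kwargs = {}
if not hasattr(model, "hf_device_map"):
    # Only steer placement by hand when accelerate has not done it already.
    pipeline_kwargs["device"] = 0 if device == "cuda" else -1

llm_client = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,
    pad_token_id=tokenizer.eos_token_id,
    **pipeline_kwargs,
)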
@@ -681,80 +672,71 @@ def generate_llm_answer(
     max_iterations = 0 # Single-shot only for speed
 
     def call_model(prompt, max_new_tokens, temperature, top_p, repetition_penalty):
-
-        logger.info(f" → Prompt length: {len(prompt)} chars")
+        """Optimized for PHI-2 - fast generation on CPU"""
         try:
-            #
+            # Simple direct prompt - no fancy formatting
+            formatted_prompt = f"{prompt}\n\nAnswer:"
+
+            logger.info(f" → Calling PHI-2 (tokens={max_new_tokens}, temp={temperature})")
+            logger.info(f" → Formatted prompt length: {len(formatted_prompt)} chars")
+
+            # Call PHI-2 with MINIMAL settings for speed
             out = llm_client(
-
+                formatted_prompt,
                 max_new_tokens=max_new_tokens,
                 temperature=temperature,
                 top_p=top_p,
                 do_sample=True,
                 repetition_penalty=repetition_penalty,
                 num_return_sequences=1,
-
-                eos_token_id=llm_client.tokenizer.eos_token_id,
-                truncation=True,
-                return_full_text=False # Only return new generation, not prompt
+                return_full_text=False
             )
 
-            logger.info(f" →
-
-            # Extract generated text from pipeline output
-            if isinstance(out, list) and len(out) > 0:
-                first_item = out[0]
-                if isinstance(first_item, dict):
-                    generated = first_item.get('generated_text', '')
-                else:
-                    generated = str(first_item)
-            else:
-                generated = str(out) if out else ''
+            logger.info(f" → Generation completed")
 
-
+            # Extract text quickly
+            if not out or not isinstance(out, list) or len(out) == 0:
+                logger.warning(" ✗ Empty output")
+                return ''
 
-
-            if generated and prompt in generated:
-                prompt_end = generated.find(prompt) + len(prompt)
-                generated = generated[prompt_end:].strip()
+            generated = out[0].get('generated_text', '') if isinstance(out[0], dict) else str(out[0])
 
-            #
-
-            generated = generated
+            # Quick cleanup
+            for remove in [formatted_prompt, 'Answer:', 'Response:', 'Output:']:
+                generated = generated.replace(remove, '')
 
-
+            generated = generated.strip()
+            word_count = len(generated.split())
 
-
+            logger.info(f" ✅ Generated {word_count} words")
+            return generated
 
         except Exception as e:
-            logger.error(f" ✗
+            logger.error(f" ✗ Error: {e}")
             import traceback
-            logger.error(
+            logger.error(traceback.format_exc())
             return ''
 
-    #
-
+    # PHI-2 optimized: VERY short prompt for fast generation
+    # Long prompts cause slow/hanging generation on CPU
+    base_prompt = f"""Question: {query}
 
-
+Context: {context_text[:400]}
 
-
-{context_text[:1200]}
+Answer with fashion advice:"""
 
-
-
-Answer:"""
-
-    # Natural generation parameters: quality over speed, no artificial limits
+    # PHI-2 generation parameters: SPEED OPTIMIZED for CPU
+    # Shorter outputs = faster generation on Hugging Face Spaces
     if attempt == 1:
-        temperature = 0.
-        max_new_tokens =
-        top_p = 0.
-        repetition_penalty = 1.
+        temperature = 0.7
+        max_new_tokens = 200 # Reduced for faster generation
+        top_p = 0.9
+        repetition_penalty = 1.15 # Higher to prevent loops
     else:
-        temperature = 0.
-        max_new_tokens =
-        top_p = 0.
-        repetition_penalty = 1.
+        temperature = 0.75
+        max_new_tokens = 250
+        top_p = 0.92
+        repetition_penalty = 1.2
 
     logger.info(f" → Starting generation with prompt: {base_prompt[:200]}...")
     initial_output = call_model(base_prompt, max_new_tokens, temperature, top_p, repetition_penalty)
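For reference, the pipeline output that the rewritten call_model unpacks has a fixed shape. A standalone sketch with an illustrative prompt (not from the app):

out = llm_client(
    "Question: What pairs well with a navy blazer?\n\nAnswer:",
    max_new_tokens=200,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.15,
    num_return_sequences=1,
    return_full_text=False,  # return only the completion, not the prompt
)
# With num_return_sequences=1 the result is a one-element list of dicts:
# [{'generated_text': ' A crisp white shirt, grey trousers, ...'}]
completion = out[0]["generated_text"].strip()

Note that return_full_text=False already strips the prompt from the output, which is why the rewritten call_model no longer searches for the prompt inside the generated text the way the deleted code did.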
@@ -787,6 +769,11 @@ Answer:"""
     if word_count >= 20:
         logger.info(f" ⚠️ Short but acceptable response ({word_count} words)")
         return response
+
+    # Ultra permissive: accept ANYTHING with 10+ words to show something
+    if word_count >= 10:
+        logger.info(f" ⚠️ Very short response ({word_count} words) but accepting")
+        return response
 
     # Otherwise, try iterative continuation to build up to the target
     accumulated = response
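Spelled out, the acceptance logic after this change is a cascade of word-count tiers; schematically (a control-flow sketch, not verbatim app code):

word_count = len(response.split())
if word_count >= 20:
    return response   # short but acceptable
if word_count >= 10:
    return response   # very short, accepted so the UI still shows something
# under 10 words: fall through to iterative continuation
accumulated = response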