hamxaameer committed
Commit 4b54bb9 · verified · 1 Parent(s): 7a3d769

Update app.py

Files changed (1)
  1. app.py +129 -72
app.py CHANGED
@@ -158,17 +158,50 @@ def initialize_llm():
     # Skip torch.compile - can cause issues on Hugging Face Spaces
     logger.info(" Model ready for inference")
 
-    # Create pipeline for generation
-    # CRITICAL: Do NOT specify device when using device_map="auto"
-    logger.info(" Creating text-generation pipeline...")
-    llm_client = pipeline(
-        "text-generation",
-        model=model,
-        tokenizer=tokenizer,
-        max_new_tokens=200,  # Reduced for faster generation
-        pad_token_id=tokenizer.eos_token_id,
-        eos_token_id=tokenizer.eos_token_id
-    )
+    # Store model and tokenizer directly for faster inference
+    # We'll use direct generation instead of pipeline
+    logger.info(" Configuring direct model inference (faster than pipeline)...")
+
+    # Create a simple wrapper that mimics pipeline interface
+    class FastPHIGenerator:
+        def __init__(self, model, tokenizer):
+            self.model = model
+            self.tokenizer = tokenizer
+
+        def __call__(self, prompt, max_new_tokens=150, temperature=0.7, top_p=0.9,
+                     do_sample=True, repetition_penalty=1.1, **kwargs):
+            """Direct generation - faster than pipeline"""
+            try:
+                # Tokenize
+                inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
+                input_ids = inputs["input_ids"]
+
+                # Generate
+                with torch.no_grad():
+                    outputs = self.model.generate(
+                        input_ids,
+                        max_new_tokens=max_new_tokens,
+                        temperature=temperature,
+                        top_p=top_p,
+                        do_sample=do_sample,
+                        repetition_penalty=repetition_penalty,
+                        pad_token_id=self.tokenizer.eos_token_id,
+                        eos_token_id=self.tokenizer.eos_token_id,
+                        early_stopping=True
+                    )
+
+                # Decode only the new tokens
+                generated_ids = outputs[0][input_ids.shape[1]:]
+                generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
+
+                return [{"generated_text": generated_text}]
+
+            except Exception as e:
+                logger.error(f"Generation error: {e}")
+                return [{"generated_text": ""}]
+
+    llm_client = FastPHIGenerator(model, tokenizer)
+    llm_client.tokenizer = tokenizer  # Add tokenizer reference for compatibility
 
     CONFIG["llm_model"] = LOCAL_PHI_MODEL
     CONFIG["model_type"] = "phi_local"
@@ -655,12 +688,12 @@ def generate_llm_answer(
     scored_docs.sort(key=lambda x: x[1], reverse=True)
     top_docs = [doc[0] for doc in scored_docs[:8]]
 
-    # Natural flow: use rich context from top documents
+    # Minimal context for speed
     context_parts = []
-    for doc in top_docs[:6]:  # Use 6 best documents
+    for doc in top_docs[:3]:  # Only 3 best documents
         content = doc.page_content.strip()
-        if len(content) > 500:  # Keep more content
-            content = content[:500] + "..."
+        if len(content) > 200:  # Much shorter snippets
+            content = content[:200] + "..."
         context_parts.append(content)
 
     context_text = "\n\n".join(context_parts)
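Note: the tighter budget here feeds the prompt cap `{context_text[:300]}` introduced in the next hunk, and the wrapper tokenizes with `truncation=True, max_length=512`, so the full prompt stays inside the window. A rough sanity check, assuming the same `tokenizer`, `query`, and `context_text` objects are in scope (illustrative only, not part of this commit):

    # Illustrative budget check: 3 snippets x 200 chars plus the question
    # should tokenize well below the wrapper's max_length=512 cap.
    prompt = f"Q: {query}\n\n{context_text[:300]}\n\nA:"
    n_tokens = len(tokenizer(prompt)["input_ids"])
    assert n_tokens < 512, f"prompt unexpectedly long: {n_tokens} tokens"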
@@ -672,71 +705,90 @@
     max_iterations = 0  # Single-shot only for speed
 
     def call_model(prompt, max_new_tokens, temperature, top_p, repetition_penalty):
-        """Optimized for PHI-2 - fast generation on CPU"""
-        try:
-            # Simple direct prompt - no fancy formatting
-            formatted_prompt = f"{prompt}\n\nAnswer:"
-
-            logger.info(f" → Calling PHI-2 (tokens={max_new_tokens}, temp={temperature})")
-            logger.info(f" → Formatted prompt length: {len(formatted_prompt)} chars")
-
-            # Call PHI-2 with MINIMAL settings for speed
-            out = llm_client(
-                formatted_prompt,
-                max_new_tokens=max_new_tokens,
-                temperature=temperature,
-                top_p=top_p,
-                do_sample=True,
-                repetition_penalty=repetition_penalty,
-                num_return_sequences=1,
-                return_full_text=False
-            )
-
-            logger.info(f" → Generation completed")
-
-            # Extract text quickly
-            if not out or not isinstance(out, list) or len(out) == 0:
-                logger.warning(" ✗ Empty output")
-                return ''
-
-            generated = out[0].get('generated_text', '') if isinstance(out[0], dict) else str(out[0])
-
-            # Quick cleanup
-            for remove in [formatted_prompt, 'Answer:', 'Response:', 'Output:']:
-                generated = generated.replace(remove, '')
-
-            generated = generated.strip()
-            word_count = len(generated.split())
-
-            logger.info(f" ✅ Generated {word_count} words")
-            return generated
-
-        except Exception as e:
-            logger.error(f" ✗ Error: {e}")
-            import traceback
-            logger.error(traceback.format_exc())
+        """Optimized for PHI-2 with timeout protection"""
+        import threading
+
+        result_container = {'output': None, 'error': None}
+
+        def generate_with_timeout():
+            try:
+                # Ultra-simple prompt
+                formatted_prompt = f"{prompt}\n\nAnswer:"
+
+                logger.info(f" → PHI-2 generating (max_tokens={max_new_tokens})")
+
+                # MINIMAL settings - most restrictive for speed
+                out = llm_client(
+                    formatted_prompt,
+                    max_new_tokens=max_new_tokens,
+                    temperature=temperature,
+                    top_p=top_p,
+                    do_sample=False,  # Greedy decoding for speed
+                    repetition_penalty=repetition_penalty,
+                    num_return_sequences=1,
+                    return_full_text=False,
+                    early_stopping=True
+                )
+
+                result_container['output'] = out
+                logger.info(f" ✓ Generation done")
+
+            except Exception as e:
+                result_container['error'] = str(e)
+                logger.error(f" ✗ Generation error: {e}")
+
+        # Run generation in thread with timeout
+        gen_thread = threading.Thread(target=generate_with_timeout)
+        gen_thread.daemon = True
+        gen_thread.start()
+        gen_thread.join(timeout=45)  # 45 second timeout
+
+        if gen_thread.is_alive():
+            logger.error(" ✗ Generation TIMEOUT after 45s")
+            return ''
+
+        if result_container['error']:
+            logger.error(f" ✗ Error: {result_container['error']}")
+            return ''
+
+        out = result_container['output']
+
+        # Extract text quickly
+        if not out or not isinstance(out, list) or len(out) == 0:
+            logger.warning(" ✗ Empty output")
             return ''
+
+        generated = out[0].get('generated_text', '') if isinstance(out[0], dict) else str(out[0])
+
+        # Quick cleanup
+        formatted_prompt = f"{prompt}\n\nAnswer:"
+        for remove in [formatted_prompt, 'Answer:', 'Response:', 'Output:']:
+            generated = generated.replace(remove, '')
+
+        generated = generated.strip()
+        word_count = len(generated.split())
+
+        logger.info(f" ✅ Generated {word_count} words")
+        return generated
 
-    # PHI-2 optimized: VERY short prompt for fast generation
-    # Long prompts cause slow/hanging generation on CPU
-    base_prompt = f"""Question: {query}
+    # ULTRA-SHORT prompt for speed
+    base_prompt = f"""Q: {query}
 
-Context: {context_text[:400]}
+{context_text[:300]}
 
-Answer with fashion advice:"""
+A:"""
 
-    # PHI-2 generation parameters: SPEED OPTIMIZED for CPU
-    # Shorter outputs = faster generation on Hugging Face Spaces
+    # AGGRESSIVE speed optimization
     if attempt == 1:
-        temperature = 0.7
-        max_new_tokens = 200  # Reduced for faster generation
-        top_p = 0.9
-        repetition_penalty = 1.15  # Higher to prevent loops
-    else:
-        temperature = 0.75
-        max_new_tokens = 250
-        top_p = 0.92
-        repetition_penalty = 1.2
+        temperature = 0.6  # Lower = faster
+        max_new_tokens = 150  # Much shorter
+        top_p = 0.85
+        repetition_penalty = 1.2
+    else:
+        temperature = 0.7
+        max_new_tokens = 180
+        top_p = 0.9
+        repetition_penalty = 1.25
 
     logger.info(f" → Starting generation with prompt: {base_prompt[:200]}...")
     initial_output = call_model(base_prompt, max_new_tokens, temperature, top_p, repetition_penalty)
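Note: the timeout guard above is the standard daemon-thread pattern. `Thread.join(timeout=...)` bounds how long the caller waits, but it does not kill the worker: a timed-out `model.generate` keeps running in the background until the process exits. The same idea as a reusable helper (a sketch for illustration, not part of this commit):

    import threading

    def run_with_timeout(fn, timeout_s=45):
        """Run fn() in a daemon thread; return its result, or None on timeout."""
        box = {}

        def worker():
            try:
                box["value"] = fn()
            except Exception as e:  # surface worker errors to the caller
                box["error"] = e

        t = threading.Thread(target=worker, daemon=True)
        t.start()
        t.join(timeout=timeout_s)
        if t.is_alive():  # worker still running; stop waiting (thread is NOT killed)
            return None
        if "error" in box:
            raise box["error"]
        return box.get("value")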
@@ -774,6 +826,11 @@ Answer with fashion advice:"""
     if word_count >= 10:
         logger.info(f" ⚠️ Very short response ({word_count} words) but accepting")
         return response
+
+    # EMERGENCY: accept even 5+ words if that's all we get
+    if word_count >= 5:
+        logger.info(f" ⚠️ EMERGENCY: Accepting tiny response ({word_count} words)")
+        return response
 
     # Otherwise, try iterative continuation to build up to the target
     accumulated = response
 