Update app.py
app.py
CHANGED
```diff
@@ -333,25 +333,25 @@ def generate_llm_answer(

     context_text = "\n\n".join(context_parts)

-    # Progressive parameters -
+    # Progressive parameters - optimized for SPEED (shorter = faster)
     if attempt == 1:
         temperature = 0.7
-        max_new_tokens =
+        max_new_tokens = 250  # Faster generation
         top_p = 0.9
         repetition_penalty = 1.2
     elif attempt == 2:
         temperature = 0.75
-        max_new_tokens =
+        max_new_tokens = 300
         top_p = 0.92
         repetition_penalty = 1.25
     elif attempt == 3:
         temperature = 0.8
-        max_new_tokens =
+        max_new_tokens = 350
         top_p = 0.94
         repetition_penalty = 1.3
     else:
         temperature = 0.85
-        max_new_tokens =
+        max_new_tokens = 400
         top_p = 0.95
         repetition_penalty = 1.35
```
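The hunk above caps each retry at a progressively larger token budget while also loosening the sampling parameters, so early attempts finish quickly and later attempts get more room. The same escalation schedule can be written as a lookup table; the sketch below is an equivalent refactoring built only from the values visible in the diff, not code from the commit.

```python
# Refactoring sketch (not from the commit): the per-attempt generation
# settings as a lookup table. Unknown attempt numbers fall back to the
# most permissive entry, matching the `else` branch above.
ATTEMPT_PARAMS = {
    1: dict(temperature=0.70, max_new_tokens=250, top_p=0.90, repetition_penalty=1.20),
    2: dict(temperature=0.75, max_new_tokens=300, top_p=0.92, repetition_penalty=1.25),
    3: dict(temperature=0.80, max_new_tokens=350, top_p=0.94, repetition_penalty=1.30),
}
FALLBACK = dict(temperature=0.85, max_new_tokens=400, top_p=0.95, repetition_penalty=1.35)

params = ATTEMPT_PARAMS.get(attempt, FALLBACK)
```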
```diff
@@ -370,17 +370,16 @@ Provide detailed fashion advice (200-400 words):"""
     try:
         logger.info(f" → Calling {CONFIG['llm_model']} (temp={temperature}, tokens={max_new_tokens})...")

-        # T5 optimized
+        # T5 optimized for SPEED on CPU - use greedy decoding (num_beams=1)
         output = llm_client(
             user_prompt,
             max_new_tokens=max_new_tokens,
-            min_new_tokens=
+            min_new_tokens=80,  # Lower minimum for faster completion
             temperature=temperature,
             top_p=top_p,
             do_sample=True,
-            num_beams=
+            num_beams=1,  # Greedy decoding for 4x faster speed on CPU
             repetition_penalty=repetition_penalty,
-            length_penalty=1.2,  # Encourage longer responses
             early_stopping=True,
             no_repeat_ngram_size=3,
             truncation=True  # CRITICAL: Truncate input if too long
```
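Two notes on this call. First, a caveat on the new comment: with `do_sample=True`, `num_beams=1` gives multinomial sampling rather than strict greedy decoding; the speedup over beam search comes from tracking a single hypothesis, and `early_stopping` only affects beam search. Second, the call signature suggests `llm_client` is a Hugging Face `text2text-generation` pipeline. Below is a minimal sketch of how such a client is typically constructed; the model name is an assumption, not taken from this commit.

```python
# Minimal sketch, assuming llm_client is a transformers text2text pipeline.
# The model name is illustrative; the app reads it from CONFIG['llm_model'].
from transformers import pipeline

llm_client = pipeline(
    "text2text-generation",        # T5-family models are encoder-decoder
    model="google/flan-t5-base",   # assumed model, not from the commit
    device=-1,                     # CPU
)

result = llm_client(
    "Suggest an outfit for a summer wedding.",
    max_new_tokens=250,
    min_new_tokens=80,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    num_beams=1,
    truncation=True,               # truncate over-long inputs, as in the diff
)
print(result[0]["generated_text"])
```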
```diff
@@ -394,8 +393,8 @@ Provide detailed fashion advice (200-400 words):"""
         return None

     # Validation - accept responses with meaningful content
-    if len(response) <
-        logger.warning(f" ⚠ Response too short: {len(response)} chars (need
+    if len(response) < 80:
+        logger.warning(f" ⚠ Response too short: {len(response)} chars (need 80+)")
         return None

     # Check for apologies/refusals
```
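The validation gate now rejects anything under 80 characters before the apology/refusal check that follows in the unchanged code. The body of that check sits outside this hunk; here is a hypothetical sketch of the whole validation step, with illustrative refusal phrases that are not from this commit.

```python
# Hypothetical validation sketch. The 80-char floor comes from the diff;
# the refusal markers are illustrative guesses, not the app's actual list.
REFUSAL_MARKERS = ("i'm sorry", "i cannot", "i apologize", "as an ai")

def is_valid_response(response: str) -> bool:
    text = response.strip()
    if len(text) < 80:                      # mirrors the length gate above
        return False
    lowered = text.lower()
    return not any(marker in lowered for marker in REFUSAL_MARKERS)
```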
```diff
@@ -439,17 +438,17 @@ def generate_answer_langchain(
     if not retrieved_docs:
         return "I couldn't find relevant information to answer your question."

-    # Step 2: Try LLM generation (
+    # Step 2: Try LLM generation (2 attempts for speed)
     llm_answer = None
-    for attempt in range(1,
-        logger.info(f"\n 🤖 LLM Generation Attempt {attempt}/
+    for attempt in range(1, 3):
+        logger.info(f"\n 🤖 LLM Generation Attempt {attempt}/2")
         llm_answer = generate_llm_answer(query, retrieved_docs, llm_client, attempt)

         if llm_answer:
             logger.info(f" ✅ LLM answer generated successfully")
             break
         else:
-            logger.warning(f" ⚠ Attempt {attempt}/
+            logger.warning(f" ⚠ Attempt {attempt}/2 failed, retrying...")

     # Step 3: If all attempts fail, return error
     if not llm_answer:
```
```diff
@@ -488,10 +487,10 @@ def fashion_chatbot(message: str, history: List[List[str]]):
     # Show generating indicator
     yield f"🔄 Generating answer ({len(retrieved_docs)} sources found)..."

-    # Generate answer with
+    # Generate answer with 2 quick attempts
     llm_answer = None
-    for attempt in range(1,
-        logger.info(f"\n 🤖 LLM Generation Attempt {attempt}/
+    for attempt in range(1, 3):
+        logger.info(f"\n 🤖 LLM Generation Attempt {attempt}/2")
         llm_answer = generate_llm_answer(message.strip(), retrieved_docs, llm_client, attempt)

         if llm_answer:
```
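`fashion_chatbot` is a generator: each `yield` replaces the message currently shown in the chat UI, which is how the "Generating answer" status appears and is then overwritten by the final answer. A minimal sketch of that pattern wired into Gradio; the function and message text here are illustrative.

```python
# Minimal sketch of the yield-based progress pattern, assuming a Gradio
# ChatInterface front end; names and messages are illustrative.
import gradio as gr

def chatbot_fn(message, history):
    yield "🔄 Generating answer..."        # interim status shown in the chat
    answer = "Final styling advice goes here."
    yield answer                           # the last yield becomes the reply

demo = gr.ChatInterface(fn=chatbot_fn)
# demo.launch()
```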