hamxaameer committed
Commit 0e156ba · verified · 1 Parent(s): 486261d

Update app.py

Files changed (1)
  1. app.py +121 -54
app.py CHANGED
@@ -273,85 +273,152 @@ def generate_llm_answer(
273
274       context_text = "\n\n".join(context_parts)
275
276  -    # Request a long, natural-flowing answer (target 400-700 words).
277  -    # We keep context trimmed to avoid exceeding input limits, but ask the model
278  -    # to produce an extended answer. Use sampling for natural flow on CPU.
279       if attempt == 1:
280           temperature = 0.70
281  -        max_new_tokens = 1000
282           top_p = 0.92
283           repetition_penalty = 1.1
284       else:
285           temperature = 0.82
286  -        max_new_tokens = 1200
287           top_p = 0.95
288           repetition_penalty = 1.15
289
290  -    user_prompt = f"""Please write a clear, natural-flowing, well-structured fashion answer using the context below.
291
292  - Question: {query}
293
294  - Context (use where helpful):
295  - {context_text[:1400]}
296
297  - Requirements:
298  - - Write between 400 and 700 words (aim for natural paragraphs, not a list of short sentences).
299  - - Use the provided context where relevant and add practical, actionable advice.
300  - - Keep a friendly, expert tone and avoid hedging phrases like "I can't" or "I don't know".
301
302  - Answer:
303    """
304
305  -    try:
306  -        logger.info(f" → Calling {CONFIG.get('llm_model', 'LLM')} (temp={temperature}, max_new_tokens={max_new_tokens})...")
307  -
308  -        # Use max_new_tokens for the generated output length (safer than max_length here).
309  -        output = llm_client(
310  -            user_prompt,
311  -            max_new_tokens=max_new_tokens,
312  -            temperature=temperature,
313  -            top_p=top_p,
314  -            do_sample=True,
315  -            num_beams=1,
316  -            repetition_penalty=repetition_penalty,
317  -            early_stopping=False
318  -        )
319
320  -        # transformers text2text pipeline returns a list of dicts; key is often 'generated_text'
321  -        response = output[0].get('generated_text', '') if isinstance(output, list) and output else str(output).strip()
322  -        response = response.strip()
323
324  -        if not response:
325  -            logger.warning(f" ✗ Empty response (attempt {attempt})")
326  -            return None
327
328  -        # Word-count validation: ensure long-form output (target 400-700 words)
329  -        words = response.split()
330  -        word_count = len(words)
331
332  -        if word_count < 380:
333  -            logger.warning(f" ✗ Response too short ({word_count} words) — expected ~400-700")
334  -            return None
335
336  -        # If excessively long, truncate to 700 words for safety and readability
337  -        if word_count > 750:
338  -            logger.info(f" ⚠️ Response very long ({word_count} words). Truncating to 700 words.")
339  -            response = ' '.join(words[:700]) + '...'
340  -            word_count = 700
341
342  -        # Filter out apology/hedging starts
343  -        apology_phrases = ["i cannot", "i can't", "i'm sorry", "i apologize", "i don't have"]
344  -        if any(phrase in response.lower()[:200] for phrase in apology_phrases):
345  -            logger.warning(f" ✗ Apology/hedging detected in response start")
346  -            return None
347
348  -        logger.info(f" ✅ Generated long-form answer ({word_count} words, {len(response)} chars)")
349  -        return response
350
351  -    except Exception as e:
352  -        logger.error(f" ✗ Generation error: {e}")
353           return None
354
355   def generate_answer_langchain(
356       query: str,
357       vectorstore,

273
274       context_text = "\n\n".join(context_parts)
275
276  +    # Primary strategy: try a single long-form generation first. If that fails
277  +    # (too short or truncated), fall back to an iterative multi-pass generator
278  +    # that appends continuation chunks until we reach the target word count.
279  +
280  +    target_min_words = 400
281  +    target_max_words = 700
282  +    chunk_target_words = 200
283  +    max_iterations = 4
284  +
285  +    def call_model(prompt, max_new_tokens, temperature, top_p, repetition_penalty):
286  +        logger.info(f" → Model call (temp={temperature}, max_new_tokens={max_new_tokens})")
287  +        try:
288  +            out = llm_client(
289  +                prompt,
290  +                max_new_tokens=max_new_tokens,
291  +                temperature=temperature,
292  +                top_p=top_p,
293  +                do_sample=True,
294  +                num_beams=1,
295  +                repetition_penalty=repetition_penalty,
296  +                early_stopping=False
297  +            )
298  +            if isinstance(out, list) and out:
299  +                return out[0].get('generated_text', '') if isinstance(out[0], dict) else str(out[0])
300  +            return str(out)
301  +        except Exception as e:
302  +            logger.error(f" ✗ Model call error: {e}")
303  +            return ''
304  +
305  +    # Build initial prompt
306  +    base_prompt = f"""Please write a clear, natural-flowing, well-structured fashion answer using the context below.
307  +
308  + Question: {query}
309  +
310  + Context (use where helpful):
311  + {context_text[:1200]}
312  +
313  + Requirements:
314  + - Aim for a long-form answer ~{target_min_words}-{target_max_words} words, structured in paragraphs.
315  + - Use the provided context where relevant and add practical, actionable advice.
316  + - Keep a friendly, expert tone and avoid hedging phrases like "I can't" or "I don't know".
317  +
318  + Answer:
319  + """
320  +
321  +    # First attempt: single call with a medium-large token budget (may be limited by model)
322       if attempt == 1:
323           temperature = 0.70
324  +        max_new_tokens = 600
325           top_p = 0.92
326           repetition_penalty = 1.1
327       else:
328           temperature = 0.82
329  +        max_new_tokens = 800
330           top_p = 0.95
331           repetition_penalty = 1.15
332
333  +    initial_output = call_model(base_prompt, max_new_tokens, temperature, top_p, repetition_penalty)
334  +    response = (initial_output or '').strip()
335
336  +    # Basic sanity checks
337  +    if not response:
338  +        logger.warning(" ✗ Empty initial response")
339  +        response = ''
340
341  +    words = response.split()
342  +    word_count = len(words)
343
344  +    # If single-shot succeeded, validate length and return
345  +    if word_count >= target_min_words:
346  +        if word_count > target_max_words:
347  +            response = ' '.join(words[:target_max_words]) + '...'
348  +            word_count = target_max_words
349  +        logger.info(f" ✅ Single-shot generated {word_count} words")
350  +        return response
351
352  +    # Otherwise, try iterative continuation to build up to the target
353  +    accumulated = response
354  +    prev_word_count = word_count
355  +
356  +    for i in range(max_iterations):
357  +        remaining = max(0, target_min_words - len(accumulated.split()))
358  +        if remaining <= 0:
359  +            break
360  +
361  +        # Ask the model to continue without repeating previous content
362  +        continue_prompt = f"""Continue the previous answer in the same tone and style to add about {min(chunk_target_words, remaining)} words.
363  +
364  + Do not repeat sentences already present. Keep paragraphs natural and connected.
365  +
366  + Previous answer:
367  + {accumulated}
368  +
369  + Continue:
370    """
371
372  +        # Slightly higher temperature on continuations to encourage richer text
373  +        cont_output = call_model(continue_prompt, max_new_tokens=450, temperature=0.78, top_p=0.93, repetition_penalty=1.08)
374  +        cont_text = (cont_output or '').strip()
375
376  +        if not cont_text:
377  +            logger.warning(f" ✗ Continuation {i+1} returned empty — stopping")
378  +            break
379
380  +        # Avoid trivial repeats: if continuation repeats the accumulated text, stop
381  +        if cont_text in accumulated or accumulated.endswith(cont_text[:50]):
382  +            logger.warning(f" ✗ Continuation {i+1} appears repetitive — stopping")
383  +            break
384
385  +        # Append and normalize spacing
386  +        accumulated = accumulated.rstrip() + '\n\n' + cont_text
387
388  +        current_word_count = len(accumulated.split())
389  +        logger.info(f" → After continuation {i+1}, words={current_word_count}")
390
391  +        # Stop early if we've reached or exceeded the minimum target
392  +        if current_word_count >= target_min_words:
393  +            break
394
395  +        # Safety: if no progress, break
396  +        if current_word_count == prev_word_count:
397  +            logger.warning(" ✗ No progress from continuation — stopping")
398  +            break
399  +        prev_word_count = current_word_count
400
401  +    final_words = accumulated.split()
402  +    final_count = len(final_words)
403
404  +    if final_count < target_min_words:
405  +        logger.warning(f" ✗ Final answer too short ({final_count} words) after continuations")
406           return None
407
408  +    if final_count > target_max_words:
409  +        logger.info(f" ⚠️ Final answer long ({final_count} words). Truncating to {target_max_words} words.")
410  +        accumulated = ' '.join(final_words[:target_max_words]) + '...'
411  +        final_count = target_max_words
412  +
413  +    # Final check for apology/hedging at start
414  +    apology_phrases = ["i cannot", "i can't", "i'm sorry", "i apologize", "i don't have"]
415  +    if any(phrase in accumulated.lower()[:200] for phrase in apology_phrases):
416  +        logger.warning(" ✗ Apology/hedging detected in final answer")
417  +        return None
418  +
419  +    logger.info(f" ✅ Built long-form answer ({final_count} words)")
420  +    return accumulated
421  +
422   def generate_answer_langchain(
423       query: str,
424       vectorstore,
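
For reference, the core pattern this commit introduces (one single-shot generation, then continuation passes until a minimum word count is reached) can be exercised on its own. The sketch below is a minimal, hypothetical standalone version: it assumes llm_client is a Hugging Face text2text-generation pipeline, and the model name, word targets, and the generate_long_answer helper are illustrative rather than part of this commit.

    from transformers import pipeline

    # Hypothetical stand-in for the app's llm_client (model choice is an assumption).
    llm_client = pipeline("text2text-generation", model="google/flan-t5-base")

    def generate_long_answer(question: str, context: str,
                             min_words: int = 400, max_iterations: int = 4) -> str:
        # Single-shot attempt first, mirroring the commit's "primary strategy".
        prompt = (
            "Answer the question using the context.\n\n"
            f"Question: {question}\n\nContext: {context}\n\nAnswer:"
        )
        answer = llm_client(prompt, max_new_tokens=600, do_sample=True,
                            temperature=0.7, top_p=0.92)[0]["generated_text"].strip()

        # Fall back to continuation passes until the minimum word count is reached
        # or the model stops producing new text.
        for _ in range(max_iterations):
            if len(answer.split()) >= min_words:
                break
            cont_prompt = (
                "Continue the answer below in the same tone without repeating it.\n\n"
                f"{answer}\n\nContinue:"
            )
            cont = llm_client(cont_prompt, max_new_tokens=450, do_sample=True,
                              temperature=0.78, top_p=0.93)[0]["generated_text"].strip()
            if not cont or cont in answer:
                break
            answer = answer.rstrip() + "\n\n" + cont
        return answer

    print(generate_long_answer("How should I style a denim jacket?",
                               "Denim pairs well with neutral layers."))

The committed version layers repetition checks, apology filtering, truncation to the word-count ceiling, and logging on top of this core loop.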