hamxaameer committed
Commit d85f59c · verified · 1 Parent(s): 7eb2f2d

Update app.py

Files changed (1)
  1. app.py +73 -110
app.py CHANGED
@@ -40,9 +40,9 @@ CONFIG = {
     "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
     "llm_model": None,
     "vector_store_path": ".",
-    "top_k": 8,  # Minimal retrieval for speed
-    "temperature": 0.85,  # Higher for faster sampling
-    "max_tokens": 280,  # Aggressive reduction
+    "top_k": 12,  # Rich retrieval for quality
+    "temperature": 0.75,  # Balanced for natural flow
+    "max_tokens": 600,  # Allow natural length responses
 }

 # Local PHI model configuration for Hugging Face Spaces
@@ -52,11 +52,10 @@ LOCAL_PHI_MODEL = os.environ.get("LOCAL_PHI_MODEL", "microsoft/phi-2")
 USE_8BIT_QUANTIZATION = True  # Reduces memory usage by ~50%
 USE_REMOTE_LLM = False

-# Advanced optimization settings for FAST generation
-MAX_CONTEXT_LENGTH = 500  # Minimal context for speed
-TARGET_ANSWER_WORDS = 220  # Shorter answers = faster generation
+# Natural flow mode: No word limits, let model decide length
+MAX_CONTEXT_LENGTH = 800  # Rich context for quality
 USE_CACHING = True  # Cache model outputs for repeated patterns
-ENABLE_FAST_MODE = True  # Skip iterative generation, use single-shot only
+ENABLE_FAST_MODE = False  # Allow natural completion, no artificial limits

 # Prefer the environment variable, but also allow a local token file for users
 # who don't know how to set env vars. Create a file named `hf_token.txt` in the
@@ -159,7 +158,7 @@ def initialize_llm():
         "text-generation",
         model=model,
         tokenizer=tokenizer,
-        max_new_tokens=280,  # Default optimized value
+        max_new_tokens=600,  # Allow natural length responses
         pad_token_id=tokenizer.eos_token_id,
         batch_size=1  # Single batch for optimal CPU performance
     )
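Note: the pipeline above consumes the new max_new_tokens=600 default. For orientation, here is a minimal sketch of how initialize_llm() could assemble a phi-2 pipeline with USE_8BIT_QUANTIZATION honored; this is an assumed reconstruction, not the commit's exact code (BitsAndBytesConfig requires the bitsandbytes package, and device_map="auto" requires accelerate):

    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

    tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/phi-2",
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # USE_8BIT_QUANTIZATION
        device_map="auto",
    )
    llm_client = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=600,  # matches the new default in this hunk
        pad_token_id=tokenizer.eos_token_id,
        batch_size=1,  # single batch for CPU Spaces
    )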
@@ -461,28 +460,10 @@ def generate_extractive_answer(query: str, retrieved_docs: List[Document]) -> Op
     # Combine and refine spacing
     answer = "\n\n".join(parts)

-    # Post-process: ensure target length (approximately 400-700 words)
+    # Natural length - no artificial padding or truncation
     words = answer.split()
     word_count = len(words)
-
-    # If too short, append templated practical paragraphs built from keywords
-    if word_count < 380:
-        logger.info(f" → Extractive answer short ({word_count} words). Appending templated paragraphs.")
-        extra_paragraphs = []
-        extra_paragraphs.append("A reliable strategy is to build around versatile, neutral pieces: a well-fitted blazer, tailored trousers, a versatile dress, and quality shoes. These items can be mixed and matched for many occasions.")
-        extra_paragraphs.append("Focus on fit and fabric: ensure key items are well-tailored, prioritize breathable fabrics for comfort, and choose merino or wool blends for colder seasons to layer effectively.")
-        extra_paragraphs.append("Layering is essential for transitional weather; combine a lightweight sweater under a jacket, and carry a scarf for added warmth and visual interest.")
-        extra_paragraphs.append("Accessories like belts, a structured bag, and minimal jewelry can elevate basic outfits without extra effort. Neutral colors increase versatility and pair well with bolder accents.")
-        answer += "\n\n" + "\n\n".join(extra_paragraphs)
-        words = answer.split()
-        word_count = len(words)
-
-    # If still too long, truncate gracefully
-    if word_count > 750:
-        words = words[:700]
-        answer = " ".join(words) + '...'
-        word_count = 700
-
+
     logger.info(f" ✅ Extractive answer ready ({word_count} words)")
     return answer

@@ -531,10 +512,12 @@ def scaffold_and_polish(query: str, retrieved_docs: List[Document], llm_client)
         logger.warning(" ✗ Scaffold empty after selection")
         return None

-    # Craft polish prompt - optimized for speed
-    polish_prompt = f"""Expand this draft to ~280 words with practical fashion advice for: {query}
+    # Craft polish prompt - natural expansion with no limits
+    polish_prompt = f"""Expand this draft into a complete, detailed fashion answer for: {query}

-Draft: {scaffold[:400]}
+Draft: {scaffold}
+
+Write a comprehensive, natural answer with practical advice and specific recommendations.

 Enhanced answer:
 """
@@ -543,9 +526,9 @@
     try:
         out = llm_client(
            polish_prompt,
-           max_new_tokens=400,  # Reduced for speed
+           max_new_tokens=600,  # Allow natural expansion
            temperature=0.75,
-           top_p=0.90,
+           top_p=0.92,
            do_sample=True,
            repetition_penalty=1.1,
            pad_token_id=llm_client.tokenizer.eos_token_id
@@ -573,32 +556,29 @@ Enhanced answer:

     final_words = polished.split()
     fw = len(final_words)
-    if fw < 200:
+
+    # No artificial limits - accept natural length
+    if fw < 50:
         logger.warning(f" ✗ Polished output too short ({fw} words)")
         return None
-    if fw > 380:
-        polished = ' '.join(final_words[:350]) + '...'
-
-    logger.info(f" ✅ Polished answer ready ({len(polished.split())} words)")
+
+    # Keep full response, no truncation
+    logger.info(f" ✅ Polished answer ready ({fw} words)")
     return polished


 def retrieve_knowledge_langchain(
     query: str,
     vectorstore,
-    top_k: int = 8
+    top_k: int = 12
 ) -> Tuple[List[Document], float]:
     logger.info(f"🔍 Retrieving knowledge for: '{query}'")

-    # Fast mode: single query only (no variants)
-    global ENABLE_FAST_MODE
-    if ENABLE_FAST_MODE:
-        query_variants = [query]
-    else:
-        query_variants = [
-            query,
-            f"fashion advice clothing outfit style for {query}",
-        ]
+    # Natural mode: use query variants for better context
+    query_variants = [
+        query,
+        f"fashion advice clothing outfit style for {query}",
+    ]


     all_docs = []
@@ -668,28 +648,21 @@ def generate_llm_answer(
     scored_docs.sort(key=lambda x: x[1], reverse=True)
     top_docs = [doc[0] for doc in scored_docs[:8]]

-    # Ultra-fast context preparation: only use top 4 docs, very short snippets
+    # Natural flow: use rich context from top documents
     context_parts = []
-    for doc in top_docs[:4]:  # Reduced from 8 to 4
+    for doc in top_docs[:6]:  # Use 6 best documents
         content = doc.page_content.strip()
-        if len(content) > 200:  # Reduced from 300 to 200
-            content = content[:200] + "..."
+        if len(content) > 500:  # Keep more content
+            content = content[:500] + "..."
         context_parts.append(content)

-    context_text = "\n".join(context_parts)  # Single newline instead of double
+    context_text = "\n\n".join(context_parts)

-    # Ultra-fast mode: minimal words, no iterations
-    global ENABLE_FAST_MODE
-    if ENABLE_FAST_MODE:
-        target_min_words = 180  # Much shorter
-        target_max_words = 280
-        chunk_target_words = 0  # No continuations
-        max_iterations = 0  # No iterations
-    else:
-        target_min_words = 250
-        target_max_words = 350
-        chunk_target_words = 120
-        max_iterations = 2
+    # NO WORD LIMITS: Let the model decide natural completion length
+    target_min_words = 100  # Very low minimum - accept any reasonable output
+    target_max_words = 999999  # No maximum - let model complete naturally
+    chunk_target_words = 0  # Not used in natural mode
+    max_iterations = 0  # Single-shot only for speed

     def call_model(prompt, max_new_tokens, temperature, top_p, repetition_penalty):
         logger.info(f" → PHI model call (temp={temperature}, max_new_tokens={max_new_tokens})")
@@ -727,23 +700,28 @@
             logger.error(f" ✗ PHI model call error: {e}")
             return ''

-    # Ultra-compact prompt for maximum speed
-    base_prompt = f"""Q: {query}
+    # Natural prompt: let the model generate complete, flowing responses
+    base_prompt = f"""You are a fashion expert. Provide a detailed, helpful answer to this question using the context provided.

-Context: {context_text[:400]}
+Question: {query}

-A:"""
+Context:
+{context_text[:1200]}

-    # Aggressive speed optimization: fewer tokens, higher temperature for faster sampling
+Write a natural, complete answer with practical fashion advice. Include specific recommendations, styling tips, and any relevant details.
+
+Answer:"""
+
+    # Natural generation parameters: quality over speed, no artificial limits
     if attempt == 1:
-        temperature = 0.85  # Higher = faster sampling
-        max_new_tokens = 280  # Reduced significantly
-        top_p = 0.88
+        temperature = 0.75  # Balanced creativity
+        max_new_tokens = 600  # Allow longer natural responses
+        top_p = 0.92
         repetition_penalty = 1.08
     else:
-        temperature = 0.90
-        max_new_tokens = 320
-        top_p = 0.90
+        temperature = 0.80
+        max_new_tokens = 700  # Even longer if needed
+        top_p = 0.93
         repetition_penalty = 1.10

     initial_output = call_model(base_prompt, max_new_tokens, temperature, top_p, repetition_penalty)
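Note: with these parameters, a single generation call looks roughly like the sketch below; it assumes the transformers text-generation pipeline's default behavior of returning the prompt plus the completion, so the prompt is stripped afterwards:

    out = llm_client(
        base_prompt,
        max_new_tokens=600,
        temperature=0.75,
        top_p=0.92,
        do_sample=True,
        repetition_penalty=1.08,
        pad_token_id=llm_client.tokenizer.eos_token_id,
    )
    generated = out[0]["generated_text"]
    response = generated[len(base_prompt):].strip()  # keep only the newly generated text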
@@ -757,31 +735,18 @@ A:"""
     words = response.split()
     word_count = len(words)

-    # Fast mode: accept shorter answers immediately
-    if ENABLE_FAST_MODE and word_count >= 150:
-        if word_count > target_max_words:
-            response = ' '.join(words[:target_max_words]) + '...'
-            word_count = target_max_words
-        logger.info(f" ✅ Fast-mode generated {word_count} words")
+    # Natural mode: accept ANY response length - let model decide
+    # No truncation, no artificial limits
+    if word_count >= target_min_words:
+        # Accept the full natural response without cutting
+        logger.info(f" ✅ Generated {word_count} words naturally")
         return response

-    # If single-shot succeeded, validate length and return
-    if word_count >= target_min_words:
-        if word_count > target_max_words:
-            response = ' '.join(words[:target_max_words]) + '...'
-            word_count = target_max_words
-        logger.info(f" ✅ Single-shot generated {word_count} words")
+    # Even if short, accept it if it has substance (50+ words)
+    if word_count >= 50:
+        logger.info(f" ✅ Accepted natural response ({word_count} words)")
         return response

-    # Skip iterations in fast mode
-    if ENABLE_FAST_MODE or max_iterations == 0:
-        if word_count >= 120:  # Accept even shorter in fast mode
-            logger.info(f" ✅ Fast-mode accepted {word_count} words")
-            return response
-        # If too short, return None to trigger fallback
-        logger.warning(f" ✗ Output too short ({word_count} words), trying fallback")
-        return None
-
     # Otherwise, try iterative continuation to build up to the target
     accumulated = response
     prev_word_count = word_count
@@ -867,9 +832,8 @@ def generate_answer_langchain(
     if not retrieved_docs:
         return "I couldn't find relevant information to answer your question."

-    # Fast mode: single attempt only
-    global ENABLE_FAST_MODE
-    max_attempts = 1 if ENABLE_FAST_MODE else 2
+    # Natural mode: allow 2 attempts for quality
+    max_attempts = 2

     llm_answer = None
     for attempt in range(1, max_attempts + 1):
@@ -886,16 +850,15 @@
     if not llm_answer:
         logger.error(f" ✗ All {max_attempts} LLM attempts failed")

-        # In fast mode, skip scaffold-and-polish and go straight to extractive
-        if not ENABLE_FAST_MODE:
-            try:
-                logger.info(" → Attempting scaffold-and-polish using PHI model")
-                polished = scaffold_and_polish(query, retrieved_docs, llm_client)
-                if polished:
-                    logger.info(" ✅ Scaffold-and-polish produced an answer")
-                    return polished
-            except Exception as e:
-                logger.error(f" ✗ Scaffold-and-polish error: {e}")
+        # Try scaffold-and-polish as fallback
+        try:
+            logger.info(" → Attempting scaffold-and-polish using PHI model")
+            polished = scaffold_and_polish(query, retrieved_docs, llm_client)
+            if polished:
+                logger.info(" ✅ Scaffold-and-polish produced an answer")
+                return polished
+        except Exception as e:
+            logger.error(f" ✗ Scaffold-and-polish error: {e}")

     # Final fallback: extractive templated answer (guaranteed deterministic & FAST)
     try:
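Note: after this change the answer path degrades through three tiers: direct PHI generation (two attempts), scaffold-and-polish, then the extractive fallback. A condensed sketch of that control flow; the signatures are illustrative, and the real functions take more arguments:

    def answer_with_fallbacks(query, retrieved_docs, llm_client):
        # Tier 1: up to two direct PHI generation attempts
        for attempt in (1, 2):
            answer = generate_llm_answer(query, retrieved_docs, llm_client)
            if answer:
                return answer
        # Tier 2: scaffold-and-polish retrieved text with the PHI model
        polished = scaffold_and_polish(query, retrieved_docs, llm_client)
        if polished:
            return polished
        # Tier 3: deterministic extractive answer
        return generate_extractive_answer(query, retrieved_docs)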
 