hamxaameer committed on
Commit
7e05a3e
·
verified ·
1 Parent(s): e1490b6

Update app.py

Files changed (1)
  1. app.py +168 -264
app.py CHANGED
@@ -62,8 +62,8 @@ CONFIG = {
62
  }
63
 
64
  # LLM Configuration - LOCAL ONLY
65
- # Using GPT-2 Medium: 355M params, better quality, still fast on CPU
66
- LOCAL_LLM_MODEL = os.environ.get("LOCAL_LLM_MODEL", "gpt2-medium")
67
  USE_8BIT_QUANTIZATION = False
68
  USE_REMOTE_LLM = False # LOCAL ONLY
69
 
@@ -94,18 +94,18 @@ if HF_INFERENCE_API_KEY:
94
  # ============================================================================
95
 
96
  def initialize_llm():
97
- """Initialize GPT-2 Medium for local CPU generation.
98
 
99
- GPT-2 Medium is 355M params - much better quality than DistilGPT2,
100
- still fast enough for CPU inference (5-10 seconds per response).
101
  """
102
  global LOCAL_LLM_MODEL
103
 
104
- logger.info(f"🔄 Initializing GPT-2 Medium: {LOCAL_LLM_MODEL}")
105
- logger.info(" Better quality for natural fashion advice")
106
 
107
  try:
108
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
109
 
110
  device = "cuda" if torch.cuda.is_available() else "cpu"
111
  logger.info(f" Device: {device}")
@@ -113,15 +113,11 @@ def initialize_llm():
113
  # Load tokenizer
114
  logger.info(" Loading tokenizer...")
115
  tokenizer = AutoTokenizer.from_pretrained(LOCAL_LLM_MODEL)
116
-
117
- # Set pad token
118
- if tokenizer.pad_token is None:
119
- tokenizer.pad_token = tokenizer.eos_token
120
-
121
  logger.info(" Tokenizer ready")
 
122
  # Load model
123
- logger.info(" Loading GPT-2 Medium (15-20 seconds)...")
124
- model = AutoModelForCausalLM.from_pretrained(
125
  LOCAL_LLM_MODEL,
126
  torch_dtype=torch.float32
127
  )
@@ -130,23 +126,20 @@ def initialize_llm():
130
  model.eval()
131
  logger.info(" Model ready")
132
 
133
- # Use pipeline for simplicity
134
- logger.info(" Creating generation pipeline...")
135
- llm_client = pipeline(
136
- "text-generation",
137
- model=model,
138
- tokenizer=tokenizer,
139
- device=0 if device == "cuda" else -1,
140
- max_new_tokens=150
141
- )
142
 
143
  CONFIG["llm_model"] = LOCAL_LLM_MODEL
144
- CONFIG["model_type"] = "gpt2_medium_local"
145
 
146
- logger.info(f"✅ GPT-2 Medium initialized: {LOCAL_LLM_MODEL}")
147
- logger.info(f" Size: 355M parameters (4x larger than DistilGPT2)")
148
- logger.info(f" Quality: Much better for natural fashion advice")
149
- logger.info(f" Speed: 5-10 seconds per response")
150
 
151
  return llm_client
152
 
@@ -355,87 +348,84 @@ def load_vector_store(embeddings):
355
  # ============================================================================
356
 
357
  def generate_extractive_answer(query: str, retrieved_docs: List[Document]) -> Optional[str]:
358
- """Build a long-form answer from retrieved documents using extractive
359
- selection + templated transitions. This avoids calling the LLM when it
360
- repeatedly fails or returns very short outputs.
361
  """
362
- logger.info(f"🔧 Running extractive fallback for: '{query}'")
363
 
364
- # Collect text and split into sentences
365
  import re
366
 
367
- all_text = "\n\n".join([d.page_content for d in retrieved_docs])
368
- # Basic sentence split (keeps punctuation)
369
  sentences = re.split(r'(?<=[.!?])\s+', all_text)
370
- sentences = [s.strip() for s in sentences if len(s.strip()) > 30]
371
 
372
  if not sentences:
373
- logger.warning(" ✗ No sentences found in retrieved documents for extractive fallback")
374
  return None
375
 
376
- # Scoring: keyword overlap with query and fashion terms
377
  query_tokens = set(re.findall(r"\w+", query.lower()))
378
- fashion_keywords = set(["outfit","wear","wardrobe","style","colors","color","layer","layering",
379
- "blazer","trousers","dress","shirt","shoes","boots","sweater","jacket",
380
- "care","wash","dry","clean","wool","cotton","silk","linen","fit","tailor",
381
- "versatile","neutral","accessory","belt","bag","occasion","season","fall"])
382
- keywords = query_tokens.union(fashion_keywords)
383
-
384
  scored = []
385
- for s in sentences:
386
  s_tokens = set(re.findall(r"\w+", s.lower()))
387
- score = len(s_tokens & keywords)
388
- # length bonus to prefer richer sentences
389
- score += min(3, len(s.split()) // 20)
390
  scored.append((score, s))
391
 
392
  scored.sort(key=lambda x: x[0], reverse=True)
393
- top_sentences = [s for _, s in scored[:60]]
394
-
395
- # Build structured sections using top sentences + templates
396
- def pick(n, start=0):
397
- return top_sentences[start:start+n]
398
-
399
- intro = []
400
- intro.extend(pick(2, 0))
401
- key_items = pick(8, 2)
402
- styling = pick(8, 10)
403
- care = pick(6, 18)
404
- conclusion = pick(4, 24)
405
-
406
- # Add handcrafted, helpful transitions to improve flow
407
- template_intro = f"Here's a detailed answer to '{query}'. I'll cover essential wardrobe items, styling tips, and care advice so you can apply these suggestions practically."
408
-
409
- # Ensure care advice includes the user's specific care example if present or add it
410
- care_text = "\n\n".join(care)
411
- if "dry clean" not in care_text.lower() and "hand wash" not in care_text.lower():
412
- care_text += "\n\nDry clean or hand wash in cold water with wool-specific detergent. Never wring out wool - gently squeeze excess water and lay flat to dry on a towel."
413
-
414
- parts = []
415
- parts.append(template_intro)
416
- if intro:
417
- parts.append(" ".join(intro))
418
- if key_items:
419
- parts.append("Key wardrobe items to prioritize:")
420
- parts.append(" ".join(key_items))
421
- if styling:
422
- parts.append("Practical styling tips:")
423
- parts.append(" ".join(styling))
424
- if care_text:
425
- parts.append("Care & maintenance:")
426
- parts.append(care_text)
427
- if conclusion:
428
- parts.append("Wrapping up:")
429
- parts.append(" ".join(conclusion))
430
-
431
- # Combine and refine spacing
432
- answer = "\n\n".join(parts)
433
-
434
- # Natural length - no artificial padding or truncation
435
- words = answer.split()
436
- word_count = len(words)
437
-
438
- logger.info(f" ✅ Extractive answer ready ({word_count} words)")
439
  return answer
440
 
441
 
@@ -594,16 +584,22 @@ def generate_llm_answer(
594
  llm_client,
595
  attempt: int = 1
596
  ) -> Optional[str]:
597
- # Ensure we have a local PHI model loaded
598
  if not llm_client:
599
- logger.error(" → PHI model not initialized")
600
  return None
601
 
602
  query_lower = query.lower()
603
  query_words = set(query_lower.split())
604
 
605
  scored_docs = []
606
- for doc in retrieved_docs[:20]:
607
  content = doc.page_content.lower()
608
  doc_words = set(content.split())
609
  overlap = len(query_words.intersection(doc_words))
@@ -617,180 +613,73 @@ def generate_llm_answer(
617
  scored_docs.append((doc, overlap))
618
 
619
  scored_docs.sort(key=lambda x: x[1], reverse=True)
620
- top_docs = [doc[0] for doc in scored_docs[:8]]
621
 
622
- # Minimal context for speed
623
  context_parts = []
624
- for doc in top_docs[:3]: # Only 3 best documents
625
  content = doc.page_content.strip()
626
- if len(content) > 200: # Much shorter snippets
627
- content = content[:200] + "..."
628
  context_parts.append(content)
629
 
630
  context_text = "\n\n".join(context_parts)
631
 
632
- # NO WORD LIMITS: Let the model decide natural completion length
633
- target_min_words = 100 # Very low minimum - accept any reasonable output
634
- target_max_words = 999999 # No maximum - let model complete naturally
635
- chunk_target_words = 0 # Not used in natural mode
636
- max_iterations = 0 # Single-shot only for speed
637
-
638
- def call_model(prompt, max_new_tokens, temperature):
639
- """Generate with GPT-2 Medium - better quality"""
640
- try:
641
- logger.info(f" → Generating (max_tokens={max_new_tokens})")
642
-
643
- out = llm_client(
644
- prompt,
645
- max_new_tokens=max_new_tokens,
646
- temperature=temperature,
647
- do_sample=True,
648
- return_full_text=False,
649
- repetition_penalty=1.15, # Moderate penalty for better flow
650
- top_k=50,
651
- top_p=0.92,
652
- pad_token_id=llm_client.tokenizer.eos_token_id,
653
- eos_token_id=llm_client.tokenizer.eos_token_id
654
- )
655
-
656
- if not out or not isinstance(out, list) or len(out) == 0:
657
- return ''
658
-
659
- generated = out[0].get('generated_text', '').strip()
660
-
661
- word_count = len(generated.split())
662
- logger.info(f" ✅ Generated {word_count} words")
663
- return generated
664
-
665
- except Exception as e:
666
- logger.error(f" ✗ Error: {e}")
667
- return ''
668
 
669
- # Natural prompt for GPT-2 Medium
670
- base_prompt = f"""Question: {query}
671
 
672
- Based on fashion knowledge:
673
- {context_text[:400]}
674
 
675
- Fashion Advice: For this question, I recommend"""
676
 
677
- # GPT-2 Medium parameters - balanced for quality
678
- if attempt == 1:
679
- max_new_tokens = 200
680
- temperature = 0.7
681
- else:
682
- max_new_tokens = 250
683
- temperature = 0.75
684
-
685
- logger.info(f" → Starting generation with prompt: {base_prompt[:200]}...")
686
- initial_output = call_model(base_prompt, max_new_tokens, temperature)
687
- response = (initial_output or '').strip()
688
-
689
- # Basic sanity checks
690
- if not response:
691
- logger.warning(" ✗ Empty initial response - model may not be generating")
692
- logger.warning(f" ✗ Prompt was: {base_prompt[:300]}")
693
- response = ''
694
-
695
- words = response.split()
696
- word_count = len(words)
697
-
698
- logger.info(f" → Initial response: {word_count} words")
699
-
700
- # Natural mode: accept ANY response length - let model decide
701
- # No truncation, no artificial limits
702
- if word_count >= target_min_words:
703
- # Accept the full natural response without cutting
704
- logger.info(f" ✅ Generated {word_count} words naturally")
705
- return response
706
-
707
- # Even if short, accept it if it has substance (50+ words)
708
- if word_count >= 50:
709
- logger.info(f" ✅ Accepted natural response ({word_count} words)")
710
- return response
711
-
712
- # Very permissive: accept anything with 20+ words
713
- if word_count >= 20:
714
- logger.info(f" ⚠️ Short but acceptable response ({word_count} words)")
715
- return response
716
-
717
- # Ultra permissive: accept ANYTHING with 10+ words to show something
718
- if word_count >= 10:
719
- logger.info(f" ⚠️ Very short response ({word_count} words) but accepting")
720
- return response
721
-
722
- # EMERGENCY: accept even 5+ words if that's all we get
723
- if word_count >= 5:
724
- logger.info(f" ⚠️ EMERGENCY: Accepting tiny response ({word_count} words)")
725
  return response
726
-
727
- # Otherwise, try iterative continuation to build up to the target
728
- accumulated = response
729
- prev_word_count = word_count
730
-
731
- for i in range(max_iterations):
732
- remaining = max(0, target_min_words - len(accumulated.split()))
733
- if remaining <= 0:
734
- break
735
-
736
- # Ask the model to continue without repeating previous content
737
- continue_prompt = f"""Add {min(chunk_target_words, remaining)} more words to complete this answer:
738
-
739
- {accumulated[-400:]}
740
-
741
- Continue naturally:
742
- """
743
-
744
- # Optimized continuation parameters for speed
745
- cont_output = call_model(continue_prompt, max_new_tokens=250, temperature=0.80, top_p=0.90, repetition_penalty=1.10)
746
- cont_text = (cont_output or '').strip()
747
-
748
- if not cont_text:
749
- logger.warning(f" ✗ Continuation {i+1} returned empty — stopping")
750
- break
751
-
752
- # Avoid trivial repeats: if continuation repeats the accumulated text, stop
753
- if cont_text in accumulated or accumulated.endswith(cont_text[:50]):
754
- logger.warning(f" ✗ Continuation {i+1} appears repetitive — stopping")
755
- break
756
-
757
- # Append and normalize spacing
758
- accumulated = accumulated.rstrip() + '\n\n' + cont_text
759
-
760
- current_word_count = len(accumulated.split())
761
- logger.info(f" → After continuation {i+1}, words={current_word_count}")
762
-
763
- # Stop early if we've reached or exceeded the minimum target
764
- if current_word_count >= target_min_words:
765
- break
766
-
767
- # Safety: if no progress, break
768
- if current_word_count == prev_word_count:
769
- logger.warning(" ✗ No progress from continuation — stopping")
770
- break
771
- prev_word_count = current_word_count
772
-
773
- final_words = accumulated.split()
774
- final_count = len(final_words)
775
-
776
- if final_count < target_min_words:
777
- logger.warning(f" ✗ Final answer too short ({final_count} words) after continuations")
778
- return None
779
-
780
- if final_count > target_max_words:
781
- logger.info(f" ⚠️ Final answer long ({final_count} words). Truncating to {target_max_words} words.")
782
- accumulated = ' '.join(final_words[:target_max_words]) + '...'
783
- final_count = target_max_words
784
-
785
- # Final check for apology/hedging at start
786
- apology_phrases = ["i cannot", "i can't", "i'm sorry", "i apologize", "i don't have"]
787
- if any(phrase in accumulated.lower()[:200] for phrase in apology_phrases):
788
- logger.warning(" ✗ Apology/hedging detected in final answer")
789
  return None
790
 
791
- logger.info(f" ✅ Built long-form answer ({final_count} words)")
792
- return accumulated
793
-
794
  def generate_answer_langchain(
795
  query: str,
796
  vectorstore,
@@ -809,8 +698,18 @@ def generate_answer_langchain(
809
  if not retrieved_docs:
810
  return "I couldn't find relevant information to answer your question."
811
 
812
- # Use extractive answer as PRIMARY - it's high quality and reliable
813
- logger.info(" → Using extractive answer generator (proven high quality)")
814
  try:
815
  extractive_answer = generate_extractive_answer(query, retrieved_docs)
816
  if extractive_answer:
@@ -845,12 +744,17 @@ def fashion_chatbot(message: str, history: List[List[str]]):
845
 
846
  yield f"💭 Generating fashion advice ({len(retrieved_docs)} sources found)..."
847
 
848
- # Use extractive answer - high quality and reliable
849
- logger.info(" → Generating extractive answer")
850
- llm_answer = generate_extractive_answer(message.strip(), retrieved_docs)
851
 
852
  if not llm_answer:
853
- logger.error(f" ✗ Extractive answer generation failed")
854
  yield "I apologize, but I'm having trouble generating a response. Please try rephrasing your question."
855
  return
856
 
 
62
  }
63
 
64
  # LLM Configuration - LOCAL ONLY
65
+ # Using Flan-T5 Base: 250M params, instruction-tuned, fast and high quality
66
+ LOCAL_LLM_MODEL = os.environ.get("LOCAL_LLM_MODEL", "google/flan-t5-base")
67
  USE_8BIT_QUANTIZATION = False
68
  USE_REMOTE_LLM = False # LOCAL ONLY
69
 
 
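Note: because the config still reads the LOCAL_LLM_MODEL environment variable, the new default can be swapped for local testing without editing app.py. A minimal sketch; the smaller checkpoint named here is only an illustration, not part of this commit:

    import os

    # Hypothetical override; must be set before app.py builds its CONFIG.
    # Any checkpoint loadable by AutoModelForSeq2SeqLM should work here.
    os.environ["LOCAL_LLM_MODEL"] = "google/flan-t5-small"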
94
  # ============================================================================
95
 
96
  def initialize_llm():
97
+ """Initialize Flan-T5 Base for local CPU generation.
98
 
99
+ Flan-T5 is instruction-tuned, produces high-quality answers,
100
+ and is fast on CPU (3-5 seconds per response).
101
  """
102
  global LOCAL_LLM_MODEL
103
 
104
+ logger.info(f"🔄 Initializing Flan-T5 Base: {LOCAL_LLM_MODEL}")
105
+ logger.info(" Instruction-tuned for high-quality Q&A")
106
 
107
  try:
108
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
109
 
110
  device = "cuda" if torch.cuda.is_available() else "cpu"
111
  logger.info(f" Device: {device}")
 
113
  # Load tokenizer
114
  logger.info(" Loading tokenizer...")
115
  tokenizer = AutoTokenizer.from_pretrained(LOCAL_LLM_MODEL)
116
  logger.info(" Tokenizer ready")
117
+
118
  # Load model
119
+ logger.info(" Loading Flan-T5 Base (10-15 seconds)...")
120
+ model = AutoModelForSeq2SeqLM.from_pretrained(
121
  LOCAL_LLM_MODEL,
122
  torch_dtype=torch.float32
123
  )
 
126
  model.eval()
127
  logger.info(" Model ready")
128
 
129
+ # Store model and tokenizer for custom generation
130
+ llm_client = {
131
+ 'model': model,
132
+ 'tokenizer': tokenizer,
133
+ 'device': device
134
+ }
135
 
136
  CONFIG["llm_model"] = LOCAL_LLM_MODEL
137
+ CONFIG["model_type"] = "flan_t5_base_local"
138
 
139
+ logger.info(f"✅ Flan-T5 Base initialized: {LOCAL_LLM_MODEL}")
140
+ logger.info(f" Size: 250M parameters (instruction-tuned)")
141
+ logger.info(f" Quality: Excellent for fashion Q&A")
142
+ logger.info(f" Speed: 3-5 seconds per 200 words")
143
 
144
  return llm_client
145
 
 
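Note on the API change in this hunk: the old path wrapped a causal LM in a transformers text-generation pipeline, while the new code loads a seq2seq model and drives generate() directly through the returned dict client. A minimal standalone sketch of that pattern, assuming only the transformers and torch packages; the prompt text is illustrative:

    import torch
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

    # Same load pattern as initialize_llm(), run outside the app as a quick smoke test.
    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
    model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
    model.eval()

    prompt = "Answer this fashion question: what goes well with a navy blazer?"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=128)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))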
348
  # ============================================================================
349
 
350
  def generate_extractive_answer(query: str, retrieved_docs: List[Document]) -> Optional[str]:
351
+ """Build a focused, intelligent answer from retrieved documents.
352
+ Filters out product catalogs and provides concise, relevant fashion advice.
 
353
  """
354
+ logger.info(f"🔧 Generating smart extractive answer for: '{query}'")
355
 
 
356
  import re
357
 
358
+ all_text = "\n\n".join([d.page_content for d in retrieved_docs[:10]]) # Top 10 docs only
 
359
  sentences = re.split(r'(?<=[.!?])\s+', all_text)
360
+ sentences = [s.strip() for s in sentences if len(s.strip()) > 40]
361
 
362
  if not sentences:
363
+ logger.warning(" ✗ No sentences found")
364
  return None
365
 
366
+ # Filter out product catalog noise
367
+ filtered_sentences = []
368
+ for s in sentences:
369
+ # Skip sentences that are clearly product listings
370
+ if re.search(r'Category:|Season:|Usage:|Color:|Price:|SKU:', s, re.IGNORECASE):
371
+ continue
372
+ # Skip sentences with brand names followed by product codes
373
+ if re.search(r'(Men|Women|Kids|Boys|Girls)\s+[A-Z][a-z]+\s+[A-Z]', s):
374
+ continue
375
+ # Keep only advice/guidance sentences
376
+ if any(word in s.lower() for word in ['wear', 'pair', 'choose', 'opt', 'works', 'complement',
377
+ 'match', 'combine', 'style', 'look', 'consider', 'add']):
378
+ filtered_sentences.append(s)
379
+
380
+ if not filtered_sentences:
381
+ # Fallback: use all sentences if filtering was too aggressive
382
+ filtered_sentences = [s for s in sentences if len(s.split()) > 10][:15]
383
+
384
+ # Score by relevance to query
385
  query_tokens = set(re.findall(r"\w+", query.lower()))
386
+
387
  scored = []
388
+ for s in filtered_sentences:
389
  s_tokens = set(re.findall(r"\w+", s.lower()))
390
+ score = len(s_tokens & query_tokens)
391
+ # Bonus for sentence length (prefer substantial advice)
392
+ score += min(2, len(s.split()) // 30)
393
  scored.append((score, s))
394
 
395
  scored.sort(key=lambda x: x[0], reverse=True)
396
+
397
+ # Take top 5-8 most relevant sentences
398
+ top_sentences = [s for _, s in scored[:8] if s]
399
+
400
+ if not top_sentences:
401
+ return None
402
+
403
+ # Build concise answer
404
+ answer_parts = []
405
+
406
+ # Add 3-5 best sentences with natural flow
407
+ for i, sentence in enumerate(top_sentences[:5]):
408
+ answer_parts.append(sentence)
409
+
410
+ answer = " ".join(answer_parts)
411
+
412
+ # Clean up any remaining noise
413
+ answer = re.sub(r'\s+', ' ', answer).strip()
414
+
415
+ word_count = len(answer.split())
416
+
417
+ # Ensure answer is substantial but not too long (100-200 words ideal)
418
+ if word_count < 50:
419
+ logger.warning(f" ⚠️ Answer too short ({word_count} words)")
420
+ return None
421
+
422
+ if word_count > 250:
423
+ # Trim to ~200 words
424
+ words = answer.split()[:200]
425
+ answer = " ".join(words) + "..."
426
+ word_count = 200
427
+
428
+ logger.info(f" ✅ Smart answer ready ({word_count} words)")
429
  return answer
430
 
431
 
 
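Note: a quick sketch of how the catalog-noise filter in the new extractive path behaves, using the same regex and advice-keyword check; the two sample sentences are made up for illustration:

    import re

    advice_words = ['wear', 'pair', 'choose', 'opt', 'works', 'complement',
                    'match', 'combine', 'style', 'look', 'consider', 'add']

    samples = [
        "Category: Apparel Season: Fall Usage: Casual Color: Navy",        # catalog row, dropped
        "Pair a navy blazer with grey trousers and brown leather shoes.",  # advice, kept
    ]

    kept = []
    for s in samples:
        if re.search(r'Category:|Season:|Usage:|Color:|Price:|SKU:', s, re.IGNORECASE):
            continue  # product-listing noise
        if any(word in s.lower() for word in advice_words):
            kept.append(s)
    print(kept)  # only the styling-advice sentence survives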
584
  llm_client,
585
  attempt: int = 1
586
  ) -> Optional[str]:
587
+ """Generate answer using Flan-T5 Base - instruction-tuned for Q&A."""
588
  if not llm_client:
589
+ logger.error(" → Flan-T5 model not initialized")
590
  return None
591
 
592
+ # Extract model components
593
+ model = llm_client['model']
594
+ tokenizer = llm_client['tokenizer']
595
+ device = llm_client['device']
596
+
597
+ # Select best documents
598
  query_lower = query.lower()
599
  query_words = set(query_lower.split())
600
 
601
  scored_docs = []
602
+ for doc in retrieved_docs[:15]:
603
  content = doc.page_content.lower()
604
  doc_words = set(content.split())
605
  overlap = len(query_words.intersection(doc_words))
 
613
  scored_docs.append((doc, overlap))
614
 
615
  scored_docs.sort(key=lambda x: x[1], reverse=True)
616
+ top_docs = [doc[0] for doc in scored_docs[:5]]
617
 
618
+ # Build rich context (Flan-T5 can handle more context)
619
  context_parts = []
620
+ for doc in top_docs:
621
  content = doc.page_content.strip()
622
+ if len(content) > 300:
623
+ content = content[:300] + "..."
624
  context_parts.append(content)
625
 
626
  context_text = "\n\n".join(context_parts)
627
 
628
+ # Flan-T5 instruction prompt - direct and clear
629
+ prompt = f"""Answer this fashion question with specific, practical advice (150-200 words):
630
 
631
+ Question: {query}
 
632
 
633
+ Fashion Knowledge:
634
+ {context_text[:600]}
635
 
636
+ Provide detailed fashion advice:"""
637
 
638
+ try:
639
+ logger.info(f" → Generating with Flan-T5 (target: 200 words)")
640
+
641
+ # Tokenize input
642
+ inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
643
+ inputs = {k: v.to(device) for k, v in inputs.items()}
644
+
645
+ # Generate with Flan-T5 optimized parameters
646
+ with torch.no_grad():
647
+ outputs = model.generate(
648
+ **inputs,
649
+ max_new_tokens=250, # ~200 words
650
+ min_length=120, # Ensure substantial answers
651
+ temperature=0.8, # Balanced creativity
652
+ top_p=0.9,
653
+ do_sample=True,
654
+ repetition_penalty=1.2,
655
+ no_repeat_ngram_size=3,
656
+ early_stopping=False
657
+ )
658
+
659
+ # Decode output
660
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
661
+
662
+ word_count = len(response.split())
663
+ logger.info(f" ✅ Generated {word_count} words with Flan-T5")
664
+
665
+ # Validate quality
666
+ if word_count < 50:
667
+ logger.warning(f" ⚠️ Response too short ({word_count} words)")
668
+ return None
669
+
670
+ # Check for generic/irrelevant content
671
+ if any(phrase in response.lower() for phrase in ["i cannot", "i can't", "i'm sorry", "as an ai"]):
672
+ logger.warning(" ⚠️ Generic response detected")
673
+ return None
674
+
675
  return response
676
+
677
+ except Exception as e:
678
+ logger.error(f" ✗ Flan-T5 generation error: {e}")
679
+ import traceback
680
+ logger.error(traceback.format_exc())
681
  return None
682
 
683
  def generate_answer_langchain(
684
  query: str,
685
  vectorstore,
 
698
  if not retrieved_docs:
699
  return "I couldn't find relevant information to answer your question."
700
 
701
+ # Try Flan-T5 first (instruction-tuned, high quality)
702
+ logger.info(" → Attempting Flan-T5 generation (primary method)")
703
+ try:
704
+ llm_answer = generate_llm_answer(query, retrieved_docs, llm_client, attempt=1)
705
+ if llm_answer:
706
+ logger.info(f" ✅ Flan-T5 answer generated successfully")
707
+ return llm_answer
708
+ except Exception as e:
709
+ logger.error(f" ✗ Flan-T5 error: {e}")
710
+
711
+ # Fallback to extractive if Flan-T5 fails
712
+ logger.info(" → Fallback: Using extractive answer generator")
713
  try:
714
  extractive_answer = generate_extractive_answer(query, retrieved_docs)
715
  if extractive_answer:
 
744
 
745
  yield f"💭 Generating fashion advice ({len(retrieved_docs)} sources found)..."
746
 
747
+ # Try Flan-T5 first (fast and high quality)
748
+ logger.info(" → Generating with Flan-T5")
749
+ llm_answer = generate_llm_answer(message.strip(), retrieved_docs, llm_client, attempt=1)
750
+
751
+ # Fallback to extractive if needed
752
+ if not llm_answer:
753
+ logger.info(" → Fallback: Using extractive answer")
754
+ llm_answer = generate_extractive_answer(message.strip(), retrieved_docs)
755
 
756
  if not llm_answer:
757
+ logger.error(f" ✗ All generation methods failed")
758
  yield "I apologize, but I'm having trouble generating a response. Please try rephrasing your question."
759
  return
760