Update app.py
app.py CHANGED
@@ -44,9 +44,8 @@ def initialize_llm():
     logger.info("🚀 Initializing FREE local language model...")
 
     BACKUP_MODELS = [
-        "
-        "google/flan-t5-large",  # Backup - 780M,
-        "google/flan-t5-base",   # Fallback - 250M, fast
+        "google/flan-t5-base",   # Primary - 250M, very fast on CPU
+        "google/flan-t5-large",  # Backup - 780M, slower but better
     ]
 
     for model_name in BACKUP_MODELS:
@@ -54,15 +53,20 @@ def initialize_llm():
         logger.info(f"   Trying {model_name}...")
         device = 0 if torch.cuda.is_available() else -1
 
+        # Use text2text-generation for T5 models (not text-generation)
+        task = "text2text-generation" if "t5" in model_name.lower() else "text-generation"
+
         llm_client = pipeline(
-            "text-generation",
+            task,
             model=model_name,
             device=device,
-            max_length=
+            max_length=300,
             truncation=True,
+            model_kwargs={"low_cpu_mem_usage": True, "use_cache": True}  # Optimize for speed
         )
 
         CONFIG["llm_model"] = model_name
+        CONFIG["model_type"] = "t5" if "t5" in model_name.lower() else "instruct"
         logger.info(f"✅ FREE LLM initialized: {model_name}")
         logger.info(f"   Device: {'GPU' if device == 0 else 'CPU'}")
         return llm_client
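The substantive fix here is the task routing: T5 checkpoints are encoder-decoder models, so they must be loaded under the `text2text-generation` task rather than the causal `text-generation` task. A minimal sketch of the same fallback-initialization pattern, assuming only `torch` and `transformers`; the logging and error handling are illustrative, not the app's exact code:

```python
import logging

import torch
from transformers import pipeline

logger = logging.getLogger(__name__)

BACKUP_MODELS = [
    "google/flan-t5-base",   # small and CPU-friendly
    "google/flan-t5-large",  # larger fallback
]

def initialize_llm():
    device = 0 if torch.cuda.is_available() else -1
    for model_name in BACKUP_MODELS:
        # Encoder-decoder (T5-family) checkpoints need text2text-generation;
        # decoder-only checkpoints use text-generation.
        task = "text2text-generation" if "t5" in model_name.lower() else "text-generation"
        try:
            return pipeline(task, model=model_name, device=device)
        except Exception as exc:
            # Fall through to the next model in the list.
            logger.warning("Could not load %s (%s); trying next model", model_name, exc)
    raise RuntimeError("No model in BACKUP_MODELS could be initialized")
```

Trying the small checkpoint first keeps cold-start time low on CPU, which matches the reordering in the first hunk (base promoted to primary, large demoted to backup).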
@@ -352,8 +356,13 @@ def generate_llm_answer(
     top_p = 0.97
     repetition_penalty = 1.25
 
-    # Create prompt
-    user_prompt = f"""[INST] Question: {query}
+    # Create prompt based on model type
+    if CONFIG.get("model_type") == "t5":
+        # T5 needs simple input-output format
+        user_prompt = f"Question: {query}\n\nContext: {context_text[:800]}\n\nProvide a helpful fashion answer:"
+    else:
+        # Instruct models use INST format
+        user_prompt = f"""[INST] Question: {query}
 
 Fashion Knowledge:
 {context_text}
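The branch exists because the two model families expect different prompt shapes: Flan-T5 was instruction-tuned on plain question-plus-context text and has a small (512-token) encoder window, which the `context_text[:800]` character cap roughly approximates, while Llama/Mistral-style instruct checkpoints expect `[INST] ... [/INST]` tags. A standalone sketch of the same branching; the `build_prompt` helper is hypothetical, not a function in app.py:

```python
def build_prompt(query: str, context_text: str, model_type: str) -> str:
    """Return a prompt matched to the model family (sketch)."""
    if model_type == "t5":
        # Plain instruction format; truncate context to respect T5's
        # comparatively small encoder window.
        return (
            f"Question: {query}\n\n"
            f"Context: {context_text[:800]}\n\n"
            "Provide a helpful fashion answer:"
        )
    # Instruct-tuned chat models (Llama-2/Mistral style) use [INST] tags.
    return f"[INST] Question: {query}\n\nFashion Knowledge:\n{context_text}\n[/INST]"
```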
@@ -363,17 +372,30 @@ Answer the question using the knowledge above. Be specific and helpful (100-250
     try:
         logger.info(f"   → Calling {CONFIG['llm_model']} (temp={temperature}, tokens={max_tokens})...")
 
-        # Call pipeline
-        output = llm_client(
-            user_prompt,
-            max_new_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-            repetition_penalty=repetition_penalty,
-            do_sample=True,
-            return_full_text=False,
-            pad_token_id=llm_client.tokenizer.eos_token_id
-        )
+        # Call pipeline with model-specific parameters
+        if CONFIG.get("model_type") == "t5":
+            # T5 uses max_length not max_new_tokens
+            output = llm_client(
+                user_prompt,
+                max_length=150,  # Even shorter for faster response
+                temperature=0.7,  # Lower temp for consistency
+                top_p=0.9,
+                do_sample=True,
+                num_beams=1,  # Disable beam search for speed
+                early_stopping=True
+            )
+        else:
+            # Other models use max_new_tokens
+            output = llm_client(
+                user_prompt,
+                max_new_tokens=max_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                repetition_penalty=repetition_penalty,
+                do_sample=True,
+                return_full_text=False,
+                pad_token_id=llm_client.tokenizer.eos_token_id
+            )
 
         # Extract generated text
         response = output[0]['generated_text'].strip()
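The two call signatures are needed because the length knobs mean different things per task: under `text2text-generation` the decoder starts from scratch, so `max_length` bounds the generated answer itself, while under `text-generation` `max_length` would count the prompt too, so `max_new_tokens` is the safe limit and `return_full_text=False` strips the echoed prompt from the output. (One nit: `early_stopping` only affects beam search, so it is a no-op alongside `num_beams=1`.) A tiny runnable illustration; `distilgpt2` is just a convenient small causal model for the demo, not one the app uses:

```python
from transformers import pipeline

# Seq2seq: max_length caps the generated answer itself.
t5 = pipeline("text2text-generation", model="google/flan-t5-base")
print(t5("Answer: what colors suit a summer wedding?",
         max_length=60)[0]["generated_text"])

# Causal: max_new_tokens caps tokens appended after the prompt;
# return_full_text=False drops the prompt from the returned string.
gpt = pipeline("text-generation", model="distilgpt2")
print(gpt("A summer wedding outfit should",
          max_new_tokens=40,
          return_full_text=False)[0]["generated_text"])
```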
@@ -469,26 +491,60 @@ def generate_answer_langchain(
 # GRADIO INTERFACE
 # ============================================================================
 
-def fashion_chatbot(message: str, history: List[List[str]])
+def fashion_chatbot(message: str, history: List[List[str]]):
     """
-    Chatbot function for Gradio interface
+    Chatbot function for Gradio interface with streaming
     """
     try:
         if not message or not message.strip():
-
+            yield "Please ask a fashion-related question!"
+            return
+
+            # Show typing indicator
+        yield "🔍 Searching fashion knowledge base..."
 
-        #
-
+        # Retrieve documents
+        retrieved_docs, confidence = retrieve_knowledge_langchain(
             message.strip(),
             vectorstore,
-
+            top_k=CONFIG["top_k"]
         )
 
-
+        if not retrieved_docs:
+            yield "I couldn't find relevant information to answer your question."
+            return
+
+        # Update status
+        yield f"📝 Generating answer (found {len(retrieved_docs)} relevant sources)..."
+
+        # Generate answer with multiple attempts
+        llm_answer = None
+        for attempt in range(1, 5):
+            logger.info(f"\n   🤖 LLM Generation Attempt {attempt}/4")
+            llm_answer = generate_llm_answer(message.strip(), retrieved_docs, llm_client, attempt)
+
+            if llm_answer:
+                break
+
+        # Fallback if needed
+        if not llm_answer:
+            logger.error(f"   ❌ All LLM attempts failed - using fallback")
+            llm_answer = synthesize_direct_answer(message.strip(), retrieved_docs)
+
+        # Stream the answer word by word for natural flow
+        words = llm_answer.split()
+        displayed_text = ""
+
+        for i, word in enumerate(words):
+            displayed_text += word + " "
+
+            # Yield every 2-3 words for smooth streaming
+            if i % 2 == 0 or i == len(words) - 1:
+                yield displayed_text.strip()
 
     except Exception as e:
         logger.error(f"Error in chatbot: {e}")
-
+        yield f"Sorry, I encountered an error: {str(e)}"
 
 # ============================================================================
 # INITIALIZE AND LAUNCH
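`gr.ChatInterface` streams automatically when `fn` is a generator: each `yield` replaces the bot message in place, which is what makes the transient status lines and the word-by-word answer work. A toy, self-contained version of the pattern (the echo logic is illustrative):

```python
import time

import gradio as gr

def echo_stream(message, history):
    yield "🔍 Thinking..."           # transient status line, replaced below
    words = f"You said: {message}".split()
    shown = ""
    for i, word in enumerate(words):
        shown += word + " "
        # Yield cumulative text (not a delta) every other word.
        if i % 2 == 0 or i == len(words) - 1:
            time.sleep(0.05)         # pacing so the stream is visible
            yield shown.strip()

demo = gr.ChatInterface(fn=echo_stream)

if __name__ == "__main__":
    demo.launch()
```

Note that each yield must carry the full text so far; yielding only the new words would make the displayed message shrink to the latest fragment.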
@@ -519,7 +575,7 @@ def startup():
 # Initialize on startup
 startup()
 
-# Create Gradio interface
+# Create Gradio interface with streaming enabled
 demo = gr.ChatInterface(
     fn=fashion_chatbot,
     title="👗 Fashion Advisor - RAG System",
@@ -542,6 +598,10 @@ I can help with:
         "How to dress for a summer wedding?",
         "What's the best outfit for a university presentation?",
     ],
+    cache_examples=False,  # Don't cache for fresh responses
+    retry_btn="🔄 Retry",
+    undo_btn="↩️ Undo",
+    clear_btn="🗑️ Clear",
 )
 
 # Launch
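One portability caveat: `retry_btn`, `undo_btn`, and `clear_btn` are accepted by `gr.ChatInterface` in Gradio 3.x/4.x but were dropped in Gradio 5, so this launch code implies a pinned Gradio version. A defensive construction sketch (purely illustrative, not part of the commit) that filters kwargs against whatever version is installed:

```python
import inspect

import gradio as gr

chat_kwargs = dict(
    fn=fashion_chatbot,  # the streaming generator defined above
    title="👗 Fashion Advisor - RAG System",
    cache_examples=False,
    retry_btn="🔄 Retry",
    undo_btn="↩️ Undo",
    clear_btn="🗑️ Clear",
)

# Keep only kwargs this Gradio version's ChatInterface actually accepts.
supported = set(inspect.signature(gr.ChatInterface.__init__).parameters)
demo = gr.ChatInterface(**{k: v for k, v in chat_kwargs.items() if k in supported})
```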