Update app.py
app.py
CHANGED
@@ -23,6 +23,15 @@ from langchain.schema import Document
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# Optimize PyTorch for CPU inference
+torch.set_num_threads(4) # Limit threads for better CPU performance
+torch.set_grad_enabled(False) # Disable gradients (inference only)
+
+# Suppress specific warnings
+import warnings
+warnings.filterwarnings("ignore", message="MatMul8bitLt")
+warnings.filterwarnings("ignore", message="torch_dtype")
+
 # ============================================================================
 # CONFIGURATION
 # ============================================================================
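The thread cap and the global autograd switch are the settings here that matter most for CPU latency. As a minimal sketch (an illustration, not part of this commit), the cap could be derived from the host instead of hard-coding 4:

    import os
    import torch

    # Assumption: stay at or below 4 threads, but never above the cores actually available;
    # os.cpu_count() can return None in restricted containers, hence the fallback to 1.
    torch.set_num_threads(min(4, os.cpu_count() or 1))
    torch.set_grad_enabled(False)  # inference only, so autograd bookkeeping is pure overhead
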
@@ -31,9 +40,9 @@ CONFIG = {
 "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
 "llm_model": None,
 "vector_store_path": ".",
-"top_k":
+"top_k": 10, # Reduced for faster retrieval
 "temperature": 0.75,
-"max_tokens":
+"max_tokens": 300, # Reduced for faster generation
 }
 
 # Local PHI model configuration for Hugging Face Spaces
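CONFIG["top_k"] is presumably the number of chunks the retriever passes on to generation, so lowering it trims both lookup time and prompt length. The loader is not shown in this diff; a rough sketch of how the value would typically be consumed, assuming a LangChain FAISS store (names and the query string are illustrative):

    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.vectorstores import FAISS

    embeddings = HuggingFaceEmbeddings(model_name=CONFIG["embedding_model"])
    # Newer LangChain releases may additionally require allow_dangerous_deserialization=True
    store = FAISS.load_local(CONFIG["vector_store_path"], embeddings)
    docs = store.similarity_search("capsule wardrobe basics", k=CONFIG["top_k"])
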
@@ -43,6 +52,10 @@ LOCAL_PHI_MODEL = os.environ.get("LOCAL_PHI_MODEL", "microsoft/phi-2")
 USE_8BIT_QUANTIZATION = True # Reduces memory usage by ~50%
 USE_REMOTE_LLM = False
 
+# Generation optimization for speed
+MAX_CONTEXT_LENGTH = 800 # Reduce context to speed up generation
+TARGET_ANSWER_WORDS = 280 # Shorter target for faster responses
+
 # Prefer the environment variable, but also allow a local token file for users
 # who don't know how to set env vars. Create a file named `hf_token.txt` in the
 # project root containing only the token (no newline is necessary). DO NOT
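The model loader sits outside this diff, but the call sites (llm_client(prompt, max_new_tokens=..., pad_token_id=llm_client.tokenizer.eos_token_id)) look like a transformers text-generation pipeline. A minimal sketch of how such a client could be built when USE_8BIT_QUANTIZATION is on, assuming transformers plus bitsandbytes are available (a guess at the pattern, not the app's actual loader):

    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

    tokenizer = AutoTokenizer.from_pretrained(LOCAL_PHI_MODEL)
    model = AutoModelForCausalLM.from_pretrained(
        LOCAL_PHI_MODEL,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # likely source of the MatMul8bitLt warning filtered above
        device_map="auto",
    )
    llm_client = pipeline("text-generation", model=model, tokenizer=tokenizer)
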
@@ -500,27 +513,21 @@ def scaffold_and_polish(query: str, retrieved_docs: List[Document], llm_client)
 logger.warning(" β Scaffold empty after selection")
 return None
 
-# Craft polish prompt
-polish_prompt = f"""
+# Craft polish prompt - optimized for speed
+polish_prompt = f"""Expand this draft to ~280 words with practical fashion advice for: {query}
 
-
-- Keep paragraphs natural and connected.
-- Preserve factual content from the draft and avoid inventing unsupported facts.
-- Use a friendly, expert tone and provide practical, actionable advice.
+Draft: {scaffold[:400]}
 
-
-{scaffold}
-
-Answer:
+Enhanced answer:
 """
 
 logger.info(" β Polishing scaffold with PHI model")
 try:
 out = llm_client(
 polish_prompt,
-max_new_tokens=
-temperature=0.
-top_p=0.
+max_new_tokens=400, # Reduced for speed
+temperature=0.75,
+top_p=0.90,
 do_sample=True,
 repetition_penalty=1.1,
 pad_token_id=llm_client.tokenizer.eos_token_id
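How out is unpacked is not part of this hunk. If llm_client is indeed a transformers text-generation pipeline, the usual pattern (stated here as an assumption about the surrounding code) is a list of dicts whose "generated_text" still contains the prompt, so the polished text has to be sliced off the end:

    raw = out[0]["generated_text"]
    # Drop the echoed prompt; alternatively, pass return_full_text=False in the pipeline call
    polished = raw[len(polish_prompt):].strip() if raw.startswith(polish_prompt) else raw.strip()
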
@@ -548,11 +555,11 @@ Answer:
 
 final_words = polished.split()
 fw = len(final_words)
-if fw <
+if fw < 200:
 logger.warning(f" β Polished output too short ({fw} words)")
 return None
-if fw >
-polished = ' '.join(final_words[:
+if fw > 380:
+polished = ' '.join(final_words[:350]) + '...'
 
 logger.info(f" β Polished answer ready ({len(polished.split())} words)")
 return polished
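The upper clamp cuts at word 350 regardless of sentence boundaries, which is cheap but can end the answer mid-thought. An alternative worth noting (an illustration, not what this commit does) trims back to the last full stop inside the kept span:

    trimmed = ' '.join(final_words[:350])
    # Prefer ending on a complete sentence when the kept span contains one
    if '.' in trimmed:
        trimmed = trimmed[:trimmed.rfind('.') + 1]
    polished = trimmed
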
@@ -641,21 +648,18 @@ def generate_llm_answer(
 context_parts = []
 for doc in top_docs:
 content = doc.page_content.strip()
-if len(content) >
-content = content[:
+if len(content) > 300:
+content = content[:300] + "..."
 context_parts.append(content)
 
 context_text = "\n\n".join(context_parts)
 
-#
-#
-
-
-
-
-target_max_words = 420
-chunk_target_words = 140
-max_iterations = 4
+# Optimized for speed: shorter context, shorter target, fewer iterations
+# This significantly reduces generation time on CPU
+target_min_words = 250
+target_max_words = 350
+chunk_target_words = 120
+max_iterations = 2
 
 def call_model(prompt, max_new_tokens, temperature, top_p, repetition_penalty):
 logger.info(f" β PHI model call (temp={temperature}, max_new_tokens={max_new_tokens})")
@@ -690,33 +694,30 @@ generate_llm_answer(
 logger.error(f" β PHI model call error: {e}")
 return ''
 
-# Build initial prompt
-base_prompt = f"""
+# Build initial prompt - optimized for speed with shorter context
+base_prompt = f"""Answer this fashion question with practical advice in ~{target_min_words} words.
 
 Question: {query}
 
-
-{context_text[:
+Key information:
+{context_text[:600]}
 
-
-- Aim for a long-form answer ~{target_min_words}-{target_max_words} words, structured in paragraphs.
-- Use the provided context where relevant and add practical, actionable advice.
-- Keep a friendly, expert tone and avoid hedging phrases like "I can't" or "I don't know".
+Provide a clear, helpful answer with specific recommendations.
 
 Answer:
 """
 
-#
+# Optimized parameters for faster CPU generation
 if attempt == 1:
-temperature = 0.
-max_new_tokens =
-top_p = 0.
+temperature = 0.75
+max_new_tokens = 400 # Reduced for speed
+top_p = 0.90
 repetition_penalty = 1.1
 else:
-temperature = 0.
-max_new_tokens =
-top_p = 0.
-repetition_penalty = 1.
+temperature = 0.85
+max_new_tokens = 500
+top_p = 0.92
+repetition_penalty = 1.12
 
 initial_output = call_model(base_prompt, max_new_tokens, temperature, top_p, repetition_penalty)
 response = (initial_output or '').strip()
@@ -747,18 +748,15 @@ Answer:
 break
 
 # Ask the model to continue without repeating previous content
-continue_prompt = f"""
-
-Do not repeat sentences already present. Keep paragraphs natural and connected.
+continue_prompt = f"""Add {min(chunk_target_words, remaining)} more words to complete this answer:
 
-
-{accumulated}
+{accumulated[-400:]}
 
-Continue:
+Continue naturally:
 """
 
-#
-cont_output = call_model(continue_prompt, max_new_tokens=
+# Optimized continuation parameters for speed
+cont_output = call_model(continue_prompt, max_new_tokens=250, temperature=0.80, top_p=0.90, repetition_penalty=1.10)
 cont_text = (cont_output or '').strip()
 
 if not cont_text:
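accumulated and remaining are defined outside this hunk; the names suggest the usual stitch-and-stop loop around the continuation call. A self-contained sketch of that shape (grow_answer and generate_more are illustrative names, not code from the app):

    def grow_answer(first_pass, generate_more, target_min_words=250, max_iterations=2):
        # Append continuations until the word budget is met or the model stops producing text
        accumulated = first_pass
        for _ in range(max_iterations):
            remaining = target_min_words - len(accumulated.split())
            if remaining <= 0:
                break
            cont_text = generate_more(accumulated, remaining).strip()
            if not cont_text:
                break
            accumulated += "\n\n" + cont_text
        return accumulated
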
@@ -903,12 +901,13 @@ def fashion_chatbot(message: str, history: List[List[str]]):
 words = llm_answer.split()
 displayed_text = ""
 
+# Faster streaming for better UX
 for i, word in enumerate(words):
 displayed_text += word + " "
 
-if i %
+if i % 5 == 0 or i == len(words) - 1:
 yield displayed_text.strip()
-time.sleep(0.
+time.sleep(0.02) # Reduced delay
 
 except Exception as e:
 logger.error(f"Error in chatbot: {e}")
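Because fashion_chatbot yields progressively longer strings, it behaves as a streaming generator. The UI wiring is not part of this diff; a minimal sketch of how such a generator is typically hooked up in Gradio (an assumption, with the title purely illustrative):

    import gradio as gr

    demo = gr.ChatInterface(fn=fashion_chatbot, title="Fashion Assistant")
    demo.launch()
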