Update app.py
app.py (CHANGED)
@@ -14,6 +14,7 @@ import pickle
 import torch
 from transformers import pipeline
 from sentence_transformers import SentenceTransformer
+import requests
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.schema import Document
@@ -35,37 +36,116 @@ CONFIG = {
     "max_tokens": 350,
 }
 
+# Remote inference config (optional). If `HF_INFERENCE_API_KEY` is set in the
+# environment, the app will prefer calling the Hugging Face Inference API (remote
+# hosted model) which can generate longer outputs faster than a CPU-bound local
+# model. Set `HF_INFERENCE_MODEL` to choose the remote model (instruction-tuned
+# model recommended).
+USE_REMOTE_LLM = False
+REMOTE_LLM_MODEL = os.environ.get("HF_INFERENCE_MODEL", "tiiuae/falcon-7b-instruct")
+
+# Prefer the environment variable, but also allow a local token file for users
+# who don't know how to set env vars. Create a file named `hf_token.txt` in the
+# project root containing only the token (no newline is necessary). DO NOT
+# commit that file to version control. A .gitignore entry will be added.
+HF_INFERENCE_API_KEY = os.environ.get("HF_INFERENCE_API_KEY")
+if not HF_INFERENCE_API_KEY:
+    try:
+        token_path = Path("hf_token.txt")
+        if token_path.exists():
+            HF_INFERENCE_API_KEY = token_path.read_text(encoding="utf-8").strip()
+            logger.info("Loaded HF token from hf_token.txt (ensure this file is private and not committed)")
+    except Exception:
+        logger.warning("Could not read hf_token.txt for HF token")
+
+if HF_INFERENCE_API_KEY:
+    USE_REMOTE_LLM = True
+
 # ============================================================================
 # INITIALIZE MODELS
 # ============================================================================
 
 def initialize_llm():
+    # If a remote HF Inference API key is provided, we won't instantiate a local
+    # heavy model; instead generation will be performed via the HTTP API.
+    global USE_REMOTE_LLM, REMOTE_LLM_MODEL
+    if USE_REMOTE_LLM:
+        logger.info(f"Using remote Hugging Face Inference model: {REMOTE_LLM_MODEL}")
+        CONFIG["llm_model"] = REMOTE_LLM_MODEL
+        CONFIG["model_type"] = "remote"
+        return None
+
     logger.info("Initializing FREE local language model...")
     model_name = "google/flan-t5-large"
-
+
     try:
         logger.info(f" Loading {model_name}...")
         device = 0 if torch.cuda.is_available() else -1
-
+
         model_kwargs = {"low_cpu_mem_usage": True}
-
+
         llm_client = pipeline(
             "text2text-generation",
             model=model_name,
             device=device,
             model_kwargs=model_kwargs
         )
-
+
         CONFIG["llm_model"] = model_name
         CONFIG["model_type"] = "t5"
         logger.info(f"LLM initialized: {model_name}")
         logger.info(f" Device: {'GPU' if device == 0 else 'CPU'}")
         return llm_client
-
+
     except Exception as e:
         logger.error(f"Failed to load model: {str(e)}")
         raise Exception(f"Failed to initialize LLM: {str(e)}")
 
+
+def remote_generate(prompt: str, max_new_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.9) -> str:
+    """Call the Hugging Face Inference API for remote generation. Requires
+    `HF_INFERENCE_API_KEY` env var to be set and a model name in
+    `REMOTE_LLM_MODEL`.
+    """
+    if not HF_INFERENCE_API_KEY:
+        raise Exception("HF_INFERENCE_API_KEY not set for remote generation")
+
+    url = f"https://api-inference.huggingface.co/models/{REMOTE_LLM_MODEL}"
+    headers = {"Authorization": f"Bearer {HF_INFERENCE_API_KEY}"}
+    payload = {
+        "inputs": prompt,
+        "parameters": {
+            "max_new_tokens": max_new_tokens,
+            "temperature": temperature,
+            "top_p": top_p,
+            "return_full_text": False
+        }
+    }
+
+    logger.info(f"Remote inference request to {REMOTE_LLM_MODEL} (tokens={max_new_tokens}, temp={temperature})")
+    r = requests.post(url, headers=headers, json=payload, timeout=60)
+    if r.status_code != 200:
+        logger.error(f"Remote inference error {r.status_code}: {r.text[:200]}")
+        return ""
+
+    result = r.json()
+    if isinstance(result, dict) and result.get("error"):
+        logger.error(f"Remote inference returned error: {result.get('error')}")
+        return ""
+
+    # The HF Inference API can return a list of generated outputs or text
+    if isinstance(result, list) and result:
+        # entries may be strings or dicts like {"generated_text": "..."}
+        first = result[0]
+        if isinstance(first, dict):
+            return first.get("generated_text", "").strip()
+        return str(first).strip()
+
+    if isinstance(result, dict) and "generated_text" in result:
+        return result["generated_text"].strip()
+
+    return str(result).strip()
+
 def initialize_embeddings():
     logger.info("Initializing embeddings model...")
 
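For reference, a minimal usage sketch of the remote path added in this hunk. It assumes the names defined in app.py (`remote_generate`, `HF_INFERENCE_API_KEY`), that the token was exported or `hf_token.txt` created before startup, and a hypothetical `app` import path; none of this is part of the commit.

    # Illustrative only: exercise the remote Hugging Face Inference path.
    # Assumes HF_INFERENCE_API_KEY was set before app.py was imported.
    from app import remote_generate  # hypothetical import path for this Space

    reply = remote_generate("Suggest a fall layering outfit.", max_new_tokens=200, temperature=0.7)
    if reply:
        print(reply)
    else:
        # remote_generate returns an empty string on HTTP or API errors
        print("Remote call failed; check the token and model name.")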
@@ -185,6 +265,109 @@ def load_vector_store(embeddings):
 # RAG PIPELINE FUNCTIONS
 # ============================================================================
 
+def generate_extractive_answer(query: str, retrieved_docs: List[Document]) -> Optional[str]:
+    """Build a long-form answer from retrieved documents using extractive
+    selection + templated transitions. This avoids calling the LLM when it
+    repeatedly fails or returns very short outputs.
+    """
+    logger.info(f"Running extractive fallback for: '{query}'")
+
+    # Collect text and split into sentences
+    import re
+
+    all_text = "\n\n".join([d.page_content for d in retrieved_docs])
+    # Basic sentence split (keeps punctuation)
+    sentences = re.split(r'(?<=[.!?])\s+', all_text)
+    sentences = [s.strip() for s in sentences if len(s.strip()) > 30]
+
+    if not sentences:
+        logger.warning("No sentences found in retrieved documents for extractive fallback")
+        return None
+
+    # Scoring: keyword overlap with query and fashion terms
+    query_tokens = set(re.findall(r"\w+", query.lower()))
+    fashion_keywords = set(["outfit","wear","wardrobe","style","colors","color","layer","layering",
+                            "blazer","trousers","dress","shirt","shoes","boots","sweater","jacket",
+                            "care","wash","dry","clean","wool","cotton","silk","linen","fit","tailor",
+                            "versatile","neutral","accessory","belt","bag","occasion","season","fall"])
+    keywords = query_tokens.union(fashion_keywords)
+
+    scored = []
+    for s in sentences:
+        s_tokens = set(re.findall(r"\w+", s.lower()))
+        score = len(s_tokens & keywords)
+        # length bonus to prefer richer sentences
+        score += min(3, len(s.split()) // 20)
+        scored.append((score, s))
+
+    scored.sort(key=lambda x: x[0], reverse=True)
+    top_sentences = [s for _, s in scored[:60]]
+
+    # Build structured sections using top sentences + templates
+    def pick(n, start=0):
+        return top_sentences[start:start+n]
+
+    intro = []
+    intro.extend(pick(2, 0))
+    key_items = pick(8, 2)
+    styling = pick(8, 10)
+    care = pick(6, 18)
+    conclusion = pick(4, 24)
+
+    # Add handcrafted, helpful transitions to improve flow
+    template_intro = f"Here's a detailed answer to '{query}'. I'll cover essential wardrobe items, styling tips, and care advice so you can apply these suggestions practically."
+
+    # Ensure care advice includes the user's specific care example if present or add it
+    care_text = "\n\n".join(care)
+    if "dry clean" not in care_text.lower() and "hand wash" not in care_text.lower():
+        care_text += "\n\nDry clean or hand wash in cold water with wool-specific detergent. Never wring out wool - gently squeeze excess water and lay flat to dry on a towel."
+
+    parts = []
+    parts.append(template_intro)
+    if intro:
+        parts.append(" ".join(intro))
+    if key_items:
+        parts.append("Key wardrobe items to prioritize:")
+        parts.append(" ".join(key_items))
+    if styling:
+        parts.append("Practical styling tips:")
+        parts.append(" ".join(styling))
+    if care_text:
+        parts.append("Care & maintenance:")
+        parts.append(care_text)
+    if conclusion:
+        parts.append("Wrapping up:")
+        parts.append(" ".join(conclusion))
+
+    # Combine and refine spacing
+    answer = "\n\n".join(parts)
+
+    # Post-process: ensure target length (approximately 400-700 words)
+    words = answer.split()
+    word_count = len(words)
+
+    # If too short, append templated practical paragraphs built from keywords
+    if word_count < 380:
+        logger.info(f"Extractive answer short ({word_count} words). Appending templated paragraphs.")
+        extra_paragraphs = []
+        extra_paragraphs.append("A reliable strategy is to build around versatile, neutral pieces: a well-fitted blazer, tailored trousers, a versatile dress, and quality shoes. These items can be mixed and matched for many occasions.")
+        extra_paragraphs.append("Focus on fit and fabric: ensure key items are well-tailored, prioritize breathable fabrics for comfort, and choose merino or wool blends for colder seasons to layer effectively.")
+        extra_paragraphs.append("Layering is essential for transitional weather; combine a lightweight sweater under a jacket, and carry a scarf for added warmth and visual interest.")
+        extra_paragraphs.append("Accessories like belts, a structured bag, and minimal jewelry can elevate basic outfits without extra effort. Neutral colors increase versatility and pair well with bolder accents.")
+        answer += "\n\n" + "\n\n".join(extra_paragraphs)
+        words = answer.split()
+        word_count = len(words)
+
+    # If still too long, truncate gracefully
+    if word_count > 750:
+        words = words[:700]
+        answer = " ".join(words) + '...'
+        word_count = 700
+
+    logger.info(f"Extractive answer ready ({word_count} words)")
+    return answer
+
+
 def retrieve_knowledge_langchain(
     query: str,
     vectorstore,
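A small smoke test for the extractive fallback defined in this hunk; the `app` import path and the toy documents are illustrative assumptions, not part of the commit.

    # Illustrative only: run the extractive fallback on two toy documents.
    from langchain.schema import Document
    from app import generate_extractive_answer  # hypothetical import path

    docs = [
        Document(page_content="A well-fitted blazer anchors a capsule wardrobe. "
                              "Neutral colors make it easy to layer over shirts or knitwear."),
        Document(page_content="Wool should be dry cleaned or hand washed in cold water. "
                              "Lay flat to dry so the garment keeps its shape."),
    ]
    answer = generate_extractive_answer("How do I build a fall wardrobe?", docs)
    print(answer if answer else "No usable sentences found.")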
@@ -277,14 +460,19 @@ def generate_llm_answer(
     # (too short or truncated), fall back to an iterative multi-pass generator
     # that appends continuation chunks until we reach the target word count.
 
-
-
-
+    # Adjusted targets for faster generation and user's request: aim ~350 words
+    target_min_words = 320
+    target_max_words = 420
+    chunk_target_words = 140
     max_iterations = 4
 
     def call_model(prompt, max_new_tokens, temperature, top_p, repetition_penalty):
         logger.info(f"Model call (temp={temperature}, max_new_tokens={max_new_tokens})")
         try:
+            if USE_REMOTE_LLM:
+                # Use remote Hugging Face Inference API
+                return remote_generate(prompt, max_new_tokens, temperature, top_p)
+
             out = llm_client(
                 prompt,
                 max_new_tokens=max_new_tokens,
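The multi-pass generator referenced in the comments above is not shown in this hunk; the sketch below is one way `call_model` and the new word targets could drive such a loop. It is an assumption-based illustration, not the app's actual implementation.

    # Sketch only: iterative continuation until the word target is reached.
    def multi_pass_generate(prompt: str) -> str:
        answer = ""
        for _ in range(max_iterations):
            if len(answer.split()) >= target_min_words:
                break
            # Ask for another chunk, feeding back what has been written so far.
            chunk = call_model(
                prompt + "\n\nContinue the answer:\n" + answer,
                max_new_tokens=chunk_target_words * 2,  # rough token budget per pass
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.15,
            )
            if not chunk:
                break
            answer = (answer + " " + chunk).strip()
        # Trim to the upper bound so answers stay near the ~350-word target.
        return " ".join(answer.split()[:target_max_words])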
@@ -450,6 +638,18 @@ def generate_answer_langchain(
 
     if not llm_answer:
         logger.error(f"All 2 LLM attempts failed")
+        # Fallback: use an extractive + template-based generator to produce a long,
+        # natural-flowing answer without using the LLM. This helps when the model
+        # repeatedly returns very short outputs or errors.
+        try:
+            logger.info("Using extractive fallback generator")
+            fallback = generate_extractive_answer(query, retrieved_docs)
+            if fallback:
+                logger.info("Extractive fallback produced an answer")
+                return fallback
+        except Exception as e:
+            logger.error(f"Extractive fallback error: {e}")
+
         return "I apologize, but I'm having trouble generating a response. Please try rephrasing your question or ask something else."
 
     return llm_answer
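One downstream implication worth noting (an inference from the code above, not stated in the diff): when the remote path is active, `initialize_llm()` returns None, so callers should branch on `USE_REMOTE_LLM` rather than on the client object. A hedged sketch:

    # Illustrative only: startup-time dispatch between remote and local generation.
    llm_client = initialize_llm()
    if USE_REMOTE_LLM:
        draft = remote_generate("Say hello.", max_new_tokens=32)
    else:
        # text2text-generation pipelines return a list of dicts with "generated_text"
        draft = llm_client("Say hello.", max_new_tokens=32)[0]["generated_text"]
    print(draft)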