hamxaameer committed
Commit d85f59c · verified · 1 Parent(s): 7eb2f2d

Update app.py

Files changed (1)
  1. app.py +73 -110
app.py CHANGED
@@ -40,9 +40,9 @@ CONFIG = {
     "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
     "llm_model": None,
     "vector_store_path": ".",
-    "top_k": 8,  # Minimal retrieval for speed
-    "temperature": 0.85,  # Higher for faster sampling
-    "max_tokens": 280,  # Aggressive reduction
+    "top_k": 12,  # Rich retrieval for quality
+    "temperature": 0.75,  # Balanced for natural flow
+    "max_tokens": 600,  # Allow natural length responses
 }

 # Local PHI model configuration for Hugging Face Spaces
@@ -52,11 +52,10 @@ LOCAL_PHI_MODEL = os.environ.get("LOCAL_PHI_MODEL", "microsoft/phi-2")
 USE_8BIT_QUANTIZATION = True  # Reduces memory usage by ~50%
 USE_REMOTE_LLM = False

-# Advanced optimization settings for FAST generation
-MAX_CONTEXT_LENGTH = 500  # Minimal context for speed
-TARGET_ANSWER_WORDS = 220  # Shorter answers = faster generation
+# Natural flow mode: No word limits, let model decide length
+MAX_CONTEXT_LENGTH = 800  # Rich context for quality
 USE_CACHING = True  # Cache model outputs for repeated patterns
-ENABLE_FAST_MODE = True  # Skip iterative generation, use single-shot only
+ENABLE_FAST_MODE = False  # Allow natural completion, no artificial limits

 # Prefer the environment variable, but also allow a local token file for users
 # who don't know how to set env vars. Create a file named `hf_token.txt` in the
@@ -159,7 +158,7 @@ def initialize_llm():
         "text-generation",
         model=model,
         tokenizer=tokenizer,
-        max_new_tokens=280,  # Default optimized value
+        max_new_tokens=600,  # Allow natural length responses
         pad_token_id=tokenizer.eos_token_id,
         batch_size=1  # Single batch for optimal CPU performance
     )
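Note: the pipeline above consumes the new max_new_tokens=600 default. For orientation, here is a minimal sketch of how initialize_llm() could assemble a phi-2 pipeline with USE_8BIT_QUANTIZATION honored; this is an assumed reconstruction, not the commit's exact code (BitsAndBytesConfig requires the bitsandbytes package, and device_map="auto" requires accelerate):

    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

    tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/phi-2",
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # USE_8BIT_QUANTIZATION
        device_map="auto",
    )
    llm_client = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=600,  # matches the new default in this hunk
        pad_token_id=tokenizer.eos_token_id,
        batch_size=1,  # single batch for CPU Spaces
    )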
@@ -461,28 +460,10 @@ def generate_extractive_answer(query: str, retrieved_docs: List[Document]) -> Op
     # Combine and refine spacing
     answer = "\n\n".join(parts)

-    # Post-process: ensure target length (approximately 400-700 words)
+    # Natural length - no artificial padding or truncation
     words = answer.split()
     word_count = len(words)
-
-    # If too short, append templated practical paragraphs built from keywords
-    if word_count < 380:
-        logger.info(f" → Extractive answer short ({word_count} words). Appending templated paragraphs.")
-        extra_paragraphs = []
-        extra_paragraphs.append("A reliable strategy is to build around versatile, neutral pieces: a well-fitted blazer, tailored trousers, a versatile dress, and quality shoes. These items can be mixed and matched for many occasions.")
-        extra_paragraphs.append("Focus on fit and fabric: ensure key items are well-tailored, prioritize breathable fabrics for comfort, and choose merino or wool blends for colder seasons to layer effectively.")
-        extra_paragraphs.append("Layering is essential for transitional weather; combine a lightweight sweater under a jacket, and carry a scarf for added warmth and visual interest.")
-        extra_paragraphs.append("Accessories like belts, a structured bag, and minimal jewelry can elevate basic outfits without extra effort. Neutral colors increase versatility and pair well with bolder accents.")
-        answer += "\n\n" + "\n\n".join(extra_paragraphs)
-        words = answer.split()
-        word_count = len(words)
-
-    # If still too long, truncate gracefully
-    if word_count > 750:
-        words = words[:700]
-        answer = " ".join(words) + '...'
-        word_count = 700
-
+
     logger.info(f" ✅ Extractive answer ready ({word_count} words)")
     return answer

@@ -531,10 +512,12 @@ def scaffold_and_polish(query: str, retrieved_docs: List[Document], llm_client)
         logger.warning(" ✗ Scaffold empty after selection")
         return None

-    # Craft polish prompt - optimized for speed
-    polish_prompt = f"""Expand this draft to ~280 words with practical fashion advice for: {query}
+    # Craft polish prompt - natural expansion with no limits
+    polish_prompt = f"""Expand this draft into a complete, detailed fashion answer for: {query}

-Draft: {scaffold[:400]}
+Draft: {scaffold}
+
+Write a comprehensive, natural answer with practical advice and specific recommendations.

 Enhanced answer:
 """
@@ -543,9 +526,9 @@
     try:
         out = llm_client(
            polish_prompt,
-           max_new_tokens=400,  # Reduced for speed
+           max_new_tokens=600,  # Allow natural expansion
            temperature=0.75,
-           top_p=0.90,
+           top_p=0.92,
            do_sample=True,
            repetition_penalty=1.1,
            pad_token_id=llm_client.tokenizer.eos_token_id
@@ -573,32 +556,29 @@ Enhanced answer:

     final_words = polished.split()
     fw = len(final_words)
-    if fw < 200:
+
+    # No artificial limits - accept natural length
+    if fw < 50:
         logger.warning(f" ✗ Polished output too short ({fw} words)")
         return None
-    if fw > 380:
-        polished = ' '.join(final_words[:350]) + '...'
-
-    logger.info(f" ✅ Polished answer ready ({len(polished.split())} words)")
+
+    # Keep full response, no truncation
+    logger.info(f" ✅ Polished answer ready ({fw} words)")
     return polished


 def retrieve_knowledge_langchain(
     query: str,
     vectorstore,
-    top_k: int = 8
+    top_k: int = 12
 ) -> Tuple[List[Document], float]:
     logger.info(f"🔍 Retrieving knowledge for: '{query}'")

-    # Fast mode: single query only (no variants)
-    global ENABLE_FAST_MODE
-    if ENABLE_FAST_MODE:
-        query_variants = [query]
-    else:
-        query_variants = [
-            query,
-            f"fashion advice clothing outfit style for {query}",
-        ]
+    # Natural mode: use query variants for better context
+    query_variants = [
+        query,
+        f"fashion advice clothing outfit style for {query}",
+    ]


     all_docs = []
@@ -668,28 +648,21 @@ def generate_llm_answer(
     scored_docs.sort(key=lambda x: x[1], reverse=True)
     top_docs = [doc[0] for doc in scored_docs[:8]]

-    # Ultra-fast context preparation: only use top 4 docs, very short snippets
+    # Natural flow: use rich context from top documents
     context_parts = []
-    for doc in top_docs[:4]:  # Reduced from 8 to 4
+    for doc in top_docs[:6]:  # Use 6 best documents
         content = doc.page_content.strip()
-        if len(content) > 200:  # Reduced from 300 to 200
-            content = content[:200] + "..."
+        if len(content) > 500:  # Keep more content
+            content = content[:500] + "..."
         context_parts.append(content)

-    context_text = "\n".join(context_parts)  # Single newline instead of double
+    context_text = "\n\n".join(context_parts)

-    # Ultra-fast mode: minimal words, no iterations
-    global ENABLE_FAST_MODE
-    if ENABLE_FAST_MODE:
-        target_min_words = 180  # Much shorter
-        target_max_words = 280
-        chunk_target_words = 0  # No continuations
-        max_iterations = 0  # No iterations
-    else:
-        target_min_words = 250
-        target_max_words = 350
-        chunk_target_words = 120
-        max_iterations = 2
+    # NO WORD LIMITS: Let the model decide natural completion length
+    target_min_words = 100  # Very low minimum - accept any reasonable output
+    target_max_words = 999999  # No maximum - let model complete naturally
+    chunk_target_words = 0  # Not used in natural mode
+    max_iterations = 0  # Single-shot only for speed

     def call_model(prompt, max_new_tokens, temperature, top_p, repetition_penalty):
         logger.info(f" → PHI model call (temp={temperature}, max_new_tokens={max_new_tokens})")
@@ -727,23 +700,28 @@
             logger.error(f" ✗ PHI model call error: {e}")
             return ''

-    # Ultra-compact prompt for maximum speed
-    base_prompt = f"""Q: {query}
+    # Natural prompt: let the model generate complete, flowing responses
+    base_prompt = f"""You are a fashion expert. Provide a detailed, helpful answer to this question using the context provided.

-Context: {context_text[:400]}
+Question: {query}

-A:"""
+Context:
+{context_text[:1200]}

-    # Aggressive speed optimization: fewer tokens, higher temperature for faster sampling
+Write a natural, complete answer with practical fashion advice. Include specific recommendations, styling tips, and any relevant details.
+
+Answer:"""
+
+    # Natural generation parameters: quality over speed, no artificial limits
     if attempt == 1:
-        temperature = 0.85  # Higher = faster sampling
-        max_new_tokens = 280  # Reduced significantly
-        top_p = 0.88
+        temperature = 0.75  # Balanced creativity
+        max_new_tokens = 600  # Allow longer natural responses
+        top_p = 0.92
         repetition_penalty = 1.08
     else:
-        temperature = 0.90
-        max_new_tokens = 320
-        top_p = 0.90
+        temperature = 0.80
+        max_new_tokens = 700  # Even longer if needed
+        top_p = 0.93
         repetition_penalty = 1.10

     initial_output = call_model(base_prompt, max_new_tokens, temperature, top_p, repetition_penalty)
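Note: with these parameters, a single generation call looks roughly like the sketch below; it assumes the transformers text-generation pipeline's default behavior of returning the prompt plus the completion, so the prompt is stripped afterwards:

    out = llm_client(
        base_prompt,
        max_new_tokens=600,
        temperature=0.75,
        top_p=0.92,
        do_sample=True,
        repetition_penalty=1.08,
        pad_token_id=llm_client.tokenizer.eos_token_id,
    )
    generated = out[0]["generated_text"]
    response = generated[len(base_prompt):].strip()  # keep only the newly generated text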
@@ -757,31 +735,18 @@ A:"""
     words = response.split()
     word_count = len(words)

-    # Fast mode: accept shorter answers immediately
-    if ENABLE_FAST_MODE and word_count >= 150:
-        if word_count > target_max_words:
-            response = ' '.join(words[:target_max_words]) + '...'
-            word_count = target_max_words
-        logger.info(f" ✅ Fast-mode generated {word_count} words")
+    # Natural mode: accept ANY response length - let model decide
+    # No truncation, no artificial limits
+    if word_count >= target_min_words:
+        # Accept the full natural response without cutting
+        logger.info(f" ✅ Generated {word_count} words naturally")
         return response

-    # If single-shot succeeded, validate length and return
-    if word_count >= target_min_words:
-        if word_count > target_max_words:
-            response = ' '.join(words[:target_max_words]) + '...'
-            word_count = target_max_words
-        logger.info(f" ✅ Single-shot generated {word_count} words")
+    # Even if short, accept it if it has substance (50+ words)
+    if word_count >= 50:
+        logger.info(f" ✅ Accepted natural response ({word_count} words)")
         return response

-    # Skip iterations in fast mode
-    if ENABLE_FAST_MODE or max_iterations == 0:
-        if word_count >= 120:  # Accept even shorter in fast mode
-            logger.info(f" ✅ Fast-mode accepted {word_count} words")
-            return response
-        # If too short, return None to trigger fallback
-        logger.warning(f" ✗ Output too short ({word_count} words), trying fallback")
-        return None
-
     # Otherwise, try iterative continuation to build up to the target
     accumulated = response
     prev_word_count = word_count
@@ -867,9 +832,8 @@ def generate_answer_langchain(
     if not retrieved_docs:
         return "I couldn't find relevant information to answer your question."

-    # Fast mode: single attempt only
-    global ENABLE_FAST_MODE
-    max_attempts = 1 if ENABLE_FAST_MODE else 2
+    # Natural mode: allow 2 attempts for quality
+    max_attempts = 2

     llm_answer = None
     for attempt in range(1, max_attempts + 1):
@@ -886,16 +850,15 @@
     if not llm_answer:
         logger.error(f" ✗ All {max_attempts} LLM attempts failed")

-        # In fast mode, skip scaffold-and-polish and go straight to extractive
-        if not ENABLE_FAST_MODE:
-            try:
-                logger.info(" → Attempting scaffold-and-polish using PHI model")
-                polished = scaffold_and_polish(query, retrieved_docs, llm_client)
-                if polished:
-                    logger.info(" ✅ Scaffold-and-polish produced an answer")
-                    return polished
-            except Exception as e:
-                logger.error(f" ✗ Scaffold-and-polish error: {e}")
+        # Try scaffold-and-polish as fallback
+        try:
+            logger.info(" → Attempting scaffold-and-polish using PHI model")
+            polished = scaffold_and_polish(query, retrieved_docs, llm_client)
+            if polished:
+                logger.info(" ✅ Scaffold-and-polish produced an answer")
+                return polished
+        except Exception as e:
+            logger.error(f" ✗ Scaffold-and-polish error: {e}")

     # Final fallback: extractive templated answer (guaranteed deterministic & FAST)
     try:
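Note: after this change the answer path degrades through three tiers: direct PHI generation (two attempts), scaffold-and-polish, then the extractive fallback. A condensed sketch of that control flow; the signatures are illustrative, and the real functions take more arguments:

    def answer_with_fallbacks(query, retrieved_docs, llm_client):
        # Tier 1: up to two direct PHI generation attempts
        for attempt in (1, 2):
            answer = generate_llm_answer(query, retrieved_docs, llm_client)
            if answer:
                return answer
        # Tier 2: scaffold-and-polish retrieved text with the PHI model
        polished = scaffold_and_polish(query, retrieved_docs, llm_client)
        if polished:
            return polished
        # Tier 3: deterministic extractive answer
        return generate_extractive_answer(query, retrieved_docs)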
 