hamxaameer committed on
Commit
2e58050
·
verified ·
1 Parent(s): 7980cb3

Update app.py

Files changed (1)
  1. app.py +54 -55
app.py CHANGED
@@ -23,6 +23,15 @@ from langchain.schema import Document
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# Optimize PyTorch for CPU inference
+torch.set_num_threads(4)  # Limit threads for better CPU performance
+torch.set_grad_enabled(False)  # Disable gradients (inference only)
+
+# Suppress specific warnings
+import warnings
+warnings.filterwarnings("ignore", message="MatMul8bitLt")
+warnings.filterwarnings("ignore", message="torch_dtype")
+
 # ============================================================================
 # CONFIGURATION
 # ============================================================================
@@ -31,9 +40,9 @@ CONFIG = {
     "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
     "llm_model": None,
     "vector_store_path": ".",
-    "top_k": 15,
+    "top_k": 10,  # Reduced for faster retrieval
     "temperature": 0.75,
-    "max_tokens": 350,
+    "max_tokens": 300,  # Reduced for faster generation
 }
 
 # Local PHI model configuration for Hugging Face Spaces
@@ -43,6 +52,10 @@ LOCAL_PHI_MODEL = os.environ.get("LOCAL_PHI_MODEL", "microsoft/phi-2")
 USE_8BIT_QUANTIZATION = True  # Reduces memory usage by ~50%
 USE_REMOTE_LLM = False
 
+# Generation optimization for speed
+MAX_CONTEXT_LENGTH = 800  # Reduce context to speed up generation
+TARGET_ANSWER_WORDS = 280  # Shorter target for faster responses
+
 # Prefer the environment variable, but also allow a local token file for users
 # who don't know how to set env vars. Create a file named `hf_token.txt` in the
 # project root containing only the token (no newline is necessary). DO NOT
@@ -500,27 +513,21 @@ def scaffold_and_polish(query: str, retrieved_docs: List[Document], llm_client)
         logger.warning(" ✗ Scaffold empty after selection")
         return None
 
-    # Craft polish prompt
-    polish_prompt = f"""Please rewrite and expand the draft below into a clear, expert, natural-flowing answer of about 320-420 words to the question: {query}
+    # Craft polish prompt - optimized for speed
+    polish_prompt = f"""Expand this draft to ~280 words with practical fashion advice for: {query}
 
-Requirements:
-- Keep paragraphs natural and connected.
-- Preserve factual content from the draft and avoid inventing unsupported facts.
-- Use a friendly, expert tone and provide practical, actionable advice.
+Draft: {scaffold[:400]}
 
-Draft:
-{scaffold}
-
-Answer:
+Enhanced answer:
 """
 
     logger.info(" → Polishing scaffold with PHI model")
     try:
         out = llm_client(
             polish_prompt,
-            max_new_tokens=600,
-            temperature=0.72,
-            top_p=0.92,
+            max_new_tokens=400,  # Reduced for speed
+            temperature=0.75,
+            top_p=0.90,
             do_sample=True,
             repetition_penalty=1.1,
             pad_token_id=llm_client.tokenizer.eos_token_id
@@ -548,11 +555,11 @@ Answer:
 
         final_words = polished.split()
         fw = len(final_words)
-        if fw < 300:
+        if fw < 200:
            logger.warning(f" ✗ Polished output too short ({fw} words)")
            return None
-        if fw > 460:
-            polished = ' '.join(final_words[:420]) + '...'
+        if fw > 380:
+            polished = ' '.join(final_words[:350]) + '...'
 
        logger.info(f" ✅ Polished answer ready ({len(polished.split())} words)")
        return polished
@@ -641,21 +648,18 @@ def generate_llm_answer(
    context_parts = []
    for doc in top_docs:
        content = doc.page_content.strip()
-        if len(content) > 400:
-            content = content[:400] + "..."
+        if len(content) > 300:
+            content = content[:300] + "..."
        context_parts.append(content)
 
    context_text = "\n\n".join(context_parts)
 
-    # Primary strategy: try a single long-form generation first. If that fails
-    # (too short or truncated), fall back to an iterative multi-pass generator
-    # that appends continuation chunks until we reach the target word count.
-
-    # Adjusted targets for faster generation and user's request: aim ~350 words
-    target_min_words = 320
-    target_max_words = 420
-    chunk_target_words = 140
-    max_iterations = 4
+    # Optimized for speed: shorter context, shorter target, fewer iterations
+    # This significantly reduces generation time on CPU
+    target_min_words = 250
+    target_max_words = 350
+    chunk_target_words = 120
+    max_iterations = 2
 
    def call_model(prompt, max_new_tokens, temperature, top_p, repetition_penalty):
        logger.info(f" → PHI model call (temp={temperature}, max_new_tokens={max_new_tokens})")
@@ -690,33 +694,30 @@ def generate_llm_answer(
            logger.error(f" ✗ PHI model call error: {e}")
            return ''
 
-    # Build initial prompt
-    base_prompt = f"""Please write a clear, natural-flowing, well-structured fashion answer using the context below.
+    # Build initial prompt - optimized for speed with shorter context
+    base_prompt = f"""Answer this fashion question with practical advice in ~{target_min_words} words.
 
 Question: {query}
 
-Context (use where helpful):
-{context_text[:1200]}
+Key information:
+{context_text[:600]}
 
-Requirements:
-- Aim for a long-form answer ~{target_min_words}-{target_max_words} words, structured in paragraphs.
-- Use the provided context where relevant and add practical, actionable advice.
-- Keep a friendly, expert tone and avoid hedging phrases like "I can't" or "I don't know".
+Provide a clear, helpful answer with specific recommendations.
 
 Answer:
 """
 
-    # First attempt: single call with a medium-large token budget (may be limited by model)
+    # Optimized parameters for faster CPU generation
    if attempt == 1:
-        temperature = 0.70
-        max_new_tokens = 600
-        top_p = 0.92
+        temperature = 0.75
+        max_new_tokens = 400  # Reduced for speed
+        top_p = 0.90
        repetition_penalty = 1.1
    else:
-        temperature = 0.82
-        max_new_tokens = 800
-        top_p = 0.95
-        repetition_penalty = 1.15
+        temperature = 0.85
+        max_new_tokens = 500
+        top_p = 0.92
+        repetition_penalty = 1.12
 
    initial_output = call_model(base_prompt, max_new_tokens, temperature, top_p, repetition_penalty)
    response = (initial_output or '').strip()
@@ -747,18 +748,15 @@ Answer:
            break
 
        # Ask the model to continue without repeating previous content
-        continue_prompt = f"""Continue the previous answer in the same tone and style to add about {min(chunk_target_words, remaining)} words.
-
-Do not repeat sentences already present. Keep paragraphs natural and connected.
+        continue_prompt = f"""Add {min(chunk_target_words, remaining)} more words to complete this answer:
 
-Previous answer:
-{accumulated}
+{accumulated[-400:]}
 
-Continue:
+Continue naturally:
 """
 
-        # Slightly higher temperature on continuations to encourage richer text
-        cont_output = call_model(continue_prompt, max_new_tokens=450, temperature=0.78, top_p=0.93, repetition_penalty=1.08)
+        # Optimized continuation parameters for speed
+        cont_output = call_model(continue_prompt, max_new_tokens=250, temperature=0.80, top_p=0.90, repetition_penalty=1.10)
        cont_text = (cont_output or '').strip()
 
        if not cont_text:
@@ -903,12 +901,13 @@ def fashion_chatbot(message: str, history: List[List[str]]):
        words = llm_answer.split()
        displayed_text = ""
 
+        # Faster streaming for better UX
        for i, word in enumerate(words):
            displayed_text += word + " "
 
-            if i % 3 == 0 or i == len(words) - 1:
+            if i % 5 == 0 or i == len(words) - 1:
                yield displayed_text.strip()
-                time.sleep(0.05)
+                time.sleep(0.02)  # Reduced delay
 
    except Exception as e:
        logger.error(f"Error in chatbot: {e}")