Update app.py
app.py
CHANGED
@@ -40,9 +40,9 @@ CONFIG = {
     "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
     "llm_model": None,
     "vector_store_path": ".",
-    "top_k":
-    "temperature": 0.
-    "max_tokens":
+    "top_k": 12,  # Rich retrieval for quality
+    "temperature": 0.75,  # Balanced for natural flow
+    "max_tokens": 600,  # Allow natural length responses
 }
 
 # Local PHI model configuration for Hugging Face Spaces
@@ -52,11 +52,10 @@ LOCAL_PHI_MODEL = os.environ.get("LOCAL_PHI_MODEL", "microsoft/phi-2")
 USE_8BIT_QUANTIZATION = True  # Reduces memory usage by ~50%
 USE_REMOTE_LLM = False
 
-#
-MAX_CONTEXT_LENGTH =
-TARGET_ANSWER_WORDS = 220  # Shorter answers = faster generation
+# Natural flow mode: No word limits, let model decide length
+MAX_CONTEXT_LENGTH = 800  # Rich context for quality
 USE_CACHING = True  # Cache model outputs for repeated patterns
-ENABLE_FAST_MODE =
+ENABLE_FAST_MODE = False  # Allow natural completion, no artificial limits
 
 # Prefer the environment variable, but also allow a local token file for users
 # who don't know how to set env vars. Create a file named `hf_token.txt` in the
@@ -159,7 +158,7 @@ def initialize_llm():
         "text-generation",
         model=model,
         tokenizer=tokenizer,
-        max_new_tokens=
+        max_new_tokens=600,  # Allow natural length responses
         pad_token_id=tokenizer.eos_token_id,
         batch_size=1  # Single batch for optimal CPU performance
     )
@@ -461,28 +460,10 @@ def generate_extractive_answer(query: str, retrieved_docs: List[Document]) -> Op
     # Combine and refine spacing
     answer = "\n\n".join(parts)
 
-    #
+    # Natural length - no artificial padding or truncation
     words = answer.split()
     word_count = len(words)
-
-    # If too short, append templated practical paragraphs built from keywords
-    if word_count < 380:
-        logger.info(f" ⚠️ Extractive answer short ({word_count} words). Appending templated paragraphs.")
-        extra_paragraphs = []
-        extra_paragraphs.append("A reliable strategy is to build around versatile, neutral pieces: a well-fitted blazer, tailored trousers, a versatile dress, and quality shoes. These items can be mixed and matched for many occasions.")
-        extra_paragraphs.append("Focus on fit and fabric: ensure key items are well-tailored, prioritize breathable fabrics for comfort, and choose merino or wool blends for colder seasons to layer effectively.")
-        extra_paragraphs.append("Layering is essential for transitional weather; combine a lightweight sweater under a jacket, and carry a scarf for added warmth and visual interest.")
-        extra_paragraphs.append("Accessories like belts, a structured bag, and minimal jewelry can elevate basic outfits without extra effort. Neutral colors increase versatility and pair well with bolder accents.")
-        answer += "\n\n" + "\n\n".join(extra_paragraphs)
-        words = answer.split()
-        word_count = len(words)
-
-    # If still too long, truncate gracefully
-    if word_count > 750:
-        words = words[:700]
-        answer = " ".join(words) + '...'
-        word_count = 700
-
+
     logger.info(f" ✅ Extractive answer ready ({word_count} words)")
     return answer
 
@@ -531,10 +512,12 @@ def scaffold_and_polish(query: str, retrieved_docs: List[Document], llm_client)
         logger.warning(" ⚠️ Scaffold empty after selection")
         return None
 
-    # Craft polish prompt -
-    polish_prompt = f"""Expand this draft
+    # Craft polish prompt - natural expansion with no limits
+    polish_prompt = f"""Expand this draft into a complete, detailed fashion answer for: {query}
 
-Draft: {scaffold
+Draft: {scaffold}
+
+Write a comprehensive, natural answer with practical advice and specific recommendations.
 
 Enhanced answer:
 """
@@ -543,9 +526,9 @@ Enhanced answer:
     try:
         out = llm_client(
             polish_prompt,
-            max_new_tokens=
+            max_new_tokens=600,  # Allow natural expansion
             temperature=0.75,
-            top_p=0.
+            top_p=0.92,
             do_sample=True,
             repetition_penalty=1.1,
             pad_token_id=llm_client.tokenizer.eos_token_id
@@ -573,32 +556,29 @@ Enhanced answer:
 
     final_words = polished.split()
     fw = len(final_words)
-
+
+    # No artificial limits - accept natural length
+    if fw < 50:
         logger.warning(f" ⚠️ Polished output too short ({fw} words)")
         return None
-
-
-
-    logger.info(f" ✅ Polished answer ready ({len(polished.split())} words)")
+
+    # Keep full response, no truncation
+    logger.info(f" ✅ Polished answer ready ({fw} words)")
     return polished
 
 
 def retrieve_knowledge_langchain(
     query: str,
     vectorstore,
-    top_k: int =
+    top_k: int = 12
 ) -> Tuple[List[Document], float]:
     logger.info(f"🔍 Retrieving knowledge for: '{query}'")
 
-    #
-
-
-
-
-    query_variants = [
-        query,
-        f"fashion advice clothing outfit style for {query}",
-    ]
+    # Natural mode: use query variants for better context
+    query_variants = [
+        query,
+        f"fashion advice clothing outfit style for {query}",
+    ]
 
     all_docs = []
 
@@ -668,28 +648,21 @@ def generate_llm_answer(
     scored_docs.sort(key=lambda x: x[1], reverse=True)
     top_docs = [doc[0] for doc in scored_docs[:8]]
 
-    #
+    # Natural flow: use rich context from top documents
     context_parts = []
-    for doc in top_docs[:
+    for doc in top_docs[:6]:  # Use 6 best documents
         content = doc.page_content.strip()
-        if len(content) >
-            content = content[:
+        if len(content) > 500:  # Keep more content
+            content = content[:500] + "..."
         context_parts.append(content)
 
-    context_text = "\n".join(context_parts)
+    context_text = "\n\n".join(context_parts)
 
-    #
-
-
-
-
-        chunk_target_words = 0  # No continuations
-        max_iterations = 0  # No iterations
-    else:
-        target_min_words = 250
-        target_max_words = 350
-        chunk_target_words = 120
-        max_iterations = 2
+    # NO WORD LIMITS: Let the model decide natural completion length
+    target_min_words = 100  # Very low minimum - accept any reasonable output
+    target_max_words = 999999  # No maximum - let model complete naturally
+    chunk_target_words = 0  # Not used in natural mode
+    max_iterations = 0  # Single-shot only for speed
 
     def call_model(prompt, max_new_tokens, temperature, top_p, repetition_penalty):
         logger.info(f" → PHI model call (temp={temperature}, max_new_tokens={max_new_tokens})")
@@ -727,23 +700,28 @@ def generate_llm_answer(
             logger.error(f" ❌ PHI model call error: {e}")
             return ''
 
-    #
-    base_prompt = f"""
+    # Natural prompt: let the model generate complete, flowing responses
+    base_prompt = f"""You are a fashion expert. Provide a detailed, helpful answer to this question using the context provided.
 
-
+Question: {query}
 
-
+Context:
+{context_text[:1200]}
 
-
+Write a natural, complete answer with practical fashion advice. Include specific recommendations, styling tips, and any relevant details.
+
+Answer:"""
+
+    # Natural generation parameters: quality over speed, no artificial limits
     if attempt == 1:
-        temperature = 0.
-        max_new_tokens =
-        top_p = 0.
+        temperature = 0.75  # Balanced creativity
+        max_new_tokens = 600  # Allow longer natural responses
+        top_p = 0.92
         repetition_penalty = 1.08
     else:
-        temperature = 0.
-        max_new_tokens =
-        top_p = 0.
+        temperature = 0.80
+        max_new_tokens = 700  # Even longer if needed
+        top_p = 0.93
         repetition_penalty = 1.10
 
     initial_output = call_model(base_prompt, max_new_tokens, temperature, top_p, repetition_penalty)
@@ -757,31 +735,18 @@ A:"""
     words = response.split()
     word_count = len(words)
 
-    #
-
-
-
-
-        logger.info(f" ✅ Fast-mode generated {word_count} words")
+    # Natural mode: accept ANY response length - let model decide
+    # No truncation, no artificial limits
+    if word_count >= target_min_words:
+        # Accept the full natural response without cutting
+        logger.info(f" ✅ Generated {word_count} words naturally")
         return response
 
-    #
-    if word_count >=
-
-        response = ' '.join(words[:target_max_words]) + '...'
-        word_count = target_max_words
-        logger.info(f" ✅ Single-shot generated {word_count} words")
+    # Even if short, accept it if it has substance (50+ words)
+    if word_count >= 50:
+        logger.info(f" ✅ Accepted natural response ({word_count} words)")
         return response
 
-    # Skip iterations in fast mode
-    if ENABLE_FAST_MODE or max_iterations == 0:
-        if word_count >= 120:  # Accept even shorter in fast mode
-            logger.info(f" ✅ Fast-mode accepted {word_count} words")
-            return response
-        # If too short, return None to trigger fallback
-        logger.warning(f" ⚠️ Output too short ({word_count} words), trying fallback")
-        return None
-
     # Otherwise, try iterative continuation to build up to the target
     accumulated = response
     prev_word_count = word_count
@@ -867,9 +832,8 @@ def generate_answer_langchain(
     if not retrieved_docs:
         return "I couldn't find relevant information to answer your question."
 
-    #
-
-    max_attempts = 1 if ENABLE_FAST_MODE else 2
+    # Natural mode: allow 2 attempts for quality
+    max_attempts = 2
 
     llm_answer = None
     for attempt in range(1, max_attempts + 1):
@@ -886,16 +850,15 @@
     if not llm_answer:
         logger.error(f" ❌ All {max_attempts} LLM attempts failed")
 
-        #
-
-
-
-
-
-
-
-
-            logger.error(f" ❌ Scaffold-and-polish error: {e}")
+        # Try scaffold-and-polish as fallback
+        try:
+            logger.info(" → Attempting scaffold-and-polish using PHI model")
+            polished = scaffold_and_polish(query, retrieved_docs, llm_client)
+            if polished:
+                logger.info(" ✅ Scaffold-and-polish produced an answer")
+                return polished
+        except Exception as e:
+            logger.error(f" ❌ Scaffold-and-polish error: {e}")
 
         # Final fallback: extractive templated answer (guaranteed deterministic & FAST)
         try: