Update app.py
app.py
CHANGED
@@ -23,6 +23,15 @@ from langchain.schema import Document
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# Optimize PyTorch for CPU inference
+torch.set_num_threads(4) # Limit threads for better CPU performance
+torch.set_grad_enabled(False) # Disable gradients (inference only)
+
+# Suppress specific warnings
+import warnings
+warnings.filterwarnings("ignore", message="MatMul8bitLt")
+warnings.filterwarnings("ignore", message="torch_dtype")
+
 # ============================================================================
 # CONFIGURATION
 # ============================================================================
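The thread cap and the global autograd switch are the settings here that matter most for CPU latency. As a minimal sketch (an illustration, not part of this commit), the cap could be derived from the host instead of hard-coding 4:

    import os
    import torch

    # Assumption: stay at or below 4 threads, but never above the cores actually available;
    # os.cpu_count() can return None in restricted containers, hence the fallback to 1.
    torch.set_num_threads(min(4, os.cpu_count() or 1))
    torch.set_grad_enabled(False)  # inference only, so autograd bookkeeping is pure overhead
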
@@ -31,9 +40,9 @@ CONFIG = {
 "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
 "llm_model": None,
 "vector_store_path": ".",
-"top_k":
+"top_k": 10, # Reduced for faster retrieval
 "temperature": 0.75,
-"max_tokens":
+"max_tokens": 300, # Reduced for faster generation
 }
 
 # Local PHI model configuration for Hugging Face Spaces
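CONFIG["top_k"] is presumably the number of chunks the retriever passes on to generation, so lowering it trims both lookup time and prompt length. The loader is not shown in this diff; a rough sketch of how the value would typically be consumed, assuming a LangChain FAISS store (names and the query string are illustrative):

    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.vectorstores import FAISS

    embeddings = HuggingFaceEmbeddings(model_name=CONFIG["embedding_model"])
    # Newer LangChain releases may additionally require allow_dangerous_deserialization=True
    store = FAISS.load_local(CONFIG["vector_store_path"], embeddings)
    docs = store.similarity_search("capsule wardrobe basics", k=CONFIG["top_k"])
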
@@ -43,6 +52,10 @@ LOCAL_PHI_MODEL = os.environ.get("LOCAL_PHI_MODEL", "microsoft/phi-2")
 USE_8BIT_QUANTIZATION = True # Reduces memory usage by ~50%
 USE_REMOTE_LLM = False
 
+# Generation optimization for speed
+MAX_CONTEXT_LENGTH = 800 # Reduce context to speed up generation
+TARGET_ANSWER_WORDS = 280 # Shorter target for faster responses
+
 # Prefer the environment variable, but also allow a local token file for users
 # who don't know how to set env vars. Create a file named `hf_token.txt` in the
 # project root containing only the token (no newline is necessary). DO NOT
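The model loader sits outside this diff, but the call sites (llm_client(prompt, max_new_tokens=..., pad_token_id=llm_client.tokenizer.eos_token_id)) look like a transformers text-generation pipeline. A minimal sketch of how such a client could be built when USE_8BIT_QUANTIZATION is on, assuming transformers plus bitsandbytes are available (a guess at the pattern, not the app's actual loader):

    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

    tokenizer = AutoTokenizer.from_pretrained(LOCAL_PHI_MODEL)
    model = AutoModelForCausalLM.from_pretrained(
        LOCAL_PHI_MODEL,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # likely source of the MatMul8bitLt warning filtered above
        device_map="auto",
    )
    llm_client = pipeline("text-generation", model=model, tokenizer=tokenizer)
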
@@ -500,27 +513,21 @@ def scaffold_and_polish(query: str, retrieved_docs: List[Document], llm_client)
 logger.warning(" β Scaffold empty after selection")
 return None
 
-# Craft polish prompt
-polish_prompt = f"""
+# Craft polish prompt - optimized for speed
+polish_prompt = f"""Expand this draft to ~280 words with practical fashion advice for: {query}
 
-
-- Keep paragraphs natural and connected.
-- Preserve factual content from the draft and avoid inventing unsupported facts.
-- Use a friendly, expert tone and provide practical, actionable advice.
+Draft: {scaffold[:400]}
 
-
-{scaffold}
-
-Answer:
+Enhanced answer:
 """
 
 logger.info(" β Polishing scaffold with PHI model")
 try:
 out = llm_client(
 polish_prompt,
-max_new_tokens=
-temperature=0.
-top_p=0.
+max_new_tokens=400, # Reduced for speed
+temperature=0.75,
+top_p=0.90,
 do_sample=True,
 repetition_penalty=1.1,
 pad_token_id=llm_client.tokenizer.eos_token_id
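How out is unpacked is not part of this hunk. If llm_client is indeed a transformers text-generation pipeline, the usual pattern (stated here as an assumption about the surrounding code) is a list of dicts whose "generated_text" still contains the prompt, so the polished text has to be sliced off the end:

    raw = out[0]["generated_text"]
    # Drop the echoed prompt; alternatively, pass return_full_text=False in the pipeline call
    polished = raw[len(polish_prompt):].strip() if raw.startswith(polish_prompt) else raw.strip()
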
@@ -548,11 +555,11 @@ Answer:
 
 final_words = polished.split()
 fw = len(final_words)
-if fw <
+if fw < 200:
 logger.warning(f" β Polished output too short ({fw} words)")
 return None
-if fw >
-polished = ' '.join(final_words[:
+if fw > 380:
+polished = ' '.join(final_words[:350]) + '...'
 
 logger.info(f" β Polished answer ready ({len(polished.split())} words)")
 return polished
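The upper clamp cuts at word 350 regardless of sentence boundaries, which is cheap but can end the answer mid-thought. An alternative worth noting (an illustration, not what this commit does) trims back to the last full stop inside the kept span:

    trimmed = ' '.join(final_words[:350])
    # Prefer ending on a complete sentence when the kept span contains one
    if '.' in trimmed:
        trimmed = trimmed[:trimmed.rfind('.') + 1]
    polished = trimmed
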
@@ -641,21 +648,18 @@ def generate_llm_answer(
 context_parts = []
 for doc in top_docs:
 content = doc.page_content.strip()
-if len(content) >
-content = content[:
+if len(content) > 300:
+content = content[:300] + "..."
 context_parts.append(content)
 
 context_text = "\n\n".join(context_parts)
 
-#
-#
-
-
-
-
-target_max_words = 420
-chunk_target_words = 140
-max_iterations = 4
+# Optimized for speed: shorter context, shorter target, fewer iterations
+# This significantly reduces generation time on CPU
+target_min_words = 250
+target_max_words = 350
+chunk_target_words = 120
+max_iterations = 2
 
 def call_model(prompt, max_new_tokens, temperature, top_p, repetition_penalty):
 logger.info(f" β PHI model call (temp={temperature}, max_new_tokens={max_new_tokens})")
@@ -690,33 +694,30 @@ generate_llm_answer(
 logger.error(f" β PHI model call error: {e}")
 return ''
 
-# Build initial prompt
-base_prompt = f"""
+# Build initial prompt - optimized for speed with shorter context
+base_prompt = f"""Answer this fashion question with practical advice in ~{target_min_words} words.
 
 Question: {query}
 
-
-{context_text[:
+Key information:
+{context_text[:600]}
 
-
-- Aim for a long-form answer ~{target_min_words}-{target_max_words} words, structured in paragraphs.
-- Use the provided context where relevant and add practical, actionable advice.
-- Keep a friendly, expert tone and avoid hedging phrases like "I can't" or "I don't know".
+Provide a clear, helpful answer with specific recommendations.
 
 Answer:
 """
 
-#
+# Optimized parameters for faster CPU generation
 if attempt == 1:
-temperature = 0.
-max_new_tokens =
-top_p = 0.
+temperature = 0.75
+max_new_tokens = 400 # Reduced for speed
+top_p = 0.90
 repetition_penalty = 1.1
 else:
-temperature = 0.
-max_new_tokens =
-top_p = 0.
-repetition_penalty = 1.
+temperature = 0.85
+max_new_tokens = 500
+top_p = 0.92
+repetition_penalty = 1.12
 
 initial_output = call_model(base_prompt, max_new_tokens, temperature, top_p, repetition_penalty)
 response = (initial_output or '').strip()
@@ -747,18 +748,15 @@ Answer:
 break
 
 # Ask the model to continue without repeating previous content
-continue_prompt = f"""
-
-Do not repeat sentences already present. Keep paragraphs natural and connected.
+continue_prompt = f"""Add {min(chunk_target_words, remaining)} more words to complete this answer:
 
-
-{accumulated}
+{accumulated[-400:]}
 
-Continue:
+Continue naturally:
 """
 
-#
-cont_output = call_model(continue_prompt, max_new_tokens=
+# Optimized continuation parameters for speed
+cont_output = call_model(continue_prompt, max_new_tokens=250, temperature=0.80, top_p=0.90, repetition_penalty=1.10)
 cont_text = (cont_output or '').strip()
 
 if not cont_text:
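accumulated and remaining are defined outside this hunk; the names suggest the usual stitch-and-stop loop around the continuation call. A self-contained sketch of that shape (grow_answer and generate_more are illustrative names, not code from the app):

    def grow_answer(first_pass, generate_more, target_min_words=250, max_iterations=2):
        # Append continuations until the word budget is met or the model stops producing text
        accumulated = first_pass
        for _ in range(max_iterations):
            remaining = target_min_words - len(accumulated.split())
            if remaining <= 0:
                break
            cont_text = generate_more(accumulated, remaining).strip()
            if not cont_text:
                break
            accumulated += "\n\n" + cont_text
        return accumulated
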
@@ -903,12 +901,13 @@ def fashion_chatbot(message: str, history: List[List[str]]):
 words = llm_answer.split()
 displayed_text = ""
 
+# Faster streaming for better UX
 for i, word in enumerate(words):
 displayed_text += word + " "
 
-if i %
+if i % 5 == 0 or i == len(words) - 1:
 yield displayed_text.strip()
-time.sleep(0.
+time.sleep(0.02) # Reduced delay
 
 except Exception as e:
 logger.error(f"Error in chatbot: {e}")
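Because fashion_chatbot yields progressively longer strings, it behaves as a streaming generator. The UI wiring is not part of this diff; a minimal sketch of how such a generator is typically hooked up in Gradio (an assumption, with the title purely illustrative):

    import gradio as gr

    demo = gr.ChatInterface(fn=fashion_chatbot, title="Fashion Assistant")
    demo.launch()
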