Update app.py
app.py
CHANGED
```diff
@@ -333,25 +333,25 @@ def generate_llm_answer(

     context_text = "\n\n".join(context_parts)

-    # Progressive parameters -
+    # Progressive parameters - optimized for SPEED (shorter = faster)
     if attempt == 1:
         temperature = 0.7
-        max_new_tokens =
+        max_new_tokens = 250  # Faster generation
         top_p = 0.9
         repetition_penalty = 1.2
     elif attempt == 2:
         temperature = 0.75
-        max_new_tokens =
+        max_new_tokens = 300
         top_p = 0.92
         repetition_penalty = 1.25
     elif attempt == 3:
         temperature = 0.8
-        max_new_tokens =
+        max_new_tokens = 350
         top_p = 0.94
         repetition_penalty = 1.3
     else:
         temperature = 0.85
-        max_new_tokens =
+        max_new_tokens = 400
         top_p = 0.95
         repetition_penalty = 1.35
```
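The hunk above caps each retry at a progressively larger token budget while also loosening the sampling parameters, so early attempts finish quickly and later attempts get more room. The same escalation schedule can be written as a lookup table; the sketch below is an equivalent refactoring built only from the values visible in the diff, not code from the commit.

```python
# Refactoring sketch (not from the commit): the per-attempt generation
# settings as a lookup table. Unknown attempt numbers fall back to the
# most permissive entry, matching the `else` branch above.
ATTEMPT_PARAMS = {
    1: dict(temperature=0.70, max_new_tokens=250, top_p=0.90, repetition_penalty=1.20),
    2: dict(temperature=0.75, max_new_tokens=300, top_p=0.92, repetition_penalty=1.25),
    3: dict(temperature=0.80, max_new_tokens=350, top_p=0.94, repetition_penalty=1.30),
}
FALLBACK = dict(temperature=0.85, max_new_tokens=400, top_p=0.95, repetition_penalty=1.35)

params = ATTEMPT_PARAMS.get(attempt, FALLBACK)
```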
```diff
@@ -370,17 +370,16 @@ Provide detailed fashion advice (200-400 words):"""
     try:
         logger.info(f" → Calling {CONFIG['llm_model']} (temp={temperature}, tokens={max_new_tokens})...")

-        # T5 optimized
+        # T5 optimized for SPEED on CPU - use greedy decoding (num_beams=1)
         output = llm_client(
             user_prompt,
             max_new_tokens=max_new_tokens,
-            min_new_tokens=
+            min_new_tokens=80,  # Lower minimum for faster completion
             temperature=temperature,
             top_p=top_p,
             do_sample=True,
-            num_beams=
+            num_beams=1,  # Greedy decoding for 4x faster speed on CPU
             repetition_penalty=repetition_penalty,
-            length_penalty=1.2,  # Encourage longer responses
             early_stopping=True,
             no_repeat_ngram_size=3,
             truncation=True  # CRITICAL: Truncate input if too long
```
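Two notes on this call. First, a caveat on the new comment: with `do_sample=True`, `num_beams=1` gives multinomial sampling rather than strict greedy decoding; the speedup over beam search comes from tracking a single hypothesis, and `early_stopping` only affects beam search. Second, the call signature suggests `llm_client` is a Hugging Face `text2text-generation` pipeline. Below is a minimal sketch of how such a client is typically constructed; the model name is an assumption, not taken from this commit.

```python
# Minimal sketch, assuming llm_client is a transformers text2text pipeline.
# The model name is illustrative; the app reads it from CONFIG['llm_model'].
from transformers import pipeline

llm_client = pipeline(
    "text2text-generation",        # T5-family models are encoder-decoder
    model="google/flan-t5-base",   # assumed model, not from the commit
    device=-1,                     # CPU
)

result = llm_client(
    "Suggest an outfit for a summer wedding.",
    max_new_tokens=250,
    min_new_tokens=80,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    num_beams=1,
    truncation=True,               # truncate over-long inputs, as in the diff
)
print(result[0]["generated_text"])
```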
```diff
@@ -394,8 +393,8 @@ Provide detailed fashion advice (200-400 words):"""
         return None

     # Validation - accept responses with meaningful content
-    if len(response) <
-        logger.warning(f" ⚠ Response too short: {len(response)} chars (need
+    if len(response) < 80:
+        logger.warning(f" ⚠ Response too short: {len(response)} chars (need 80+)")
         return None

     # Check for apologies/refusals
```
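The validation gate now rejects anything under 80 characters before the apology/refusal check that follows in the unchanged code. The body of that check sits outside this hunk; here is a hypothetical sketch of the whole validation step, with illustrative refusal phrases that are not from this commit.

```python
# Hypothetical validation sketch. The 80-char floor comes from the diff;
# the refusal markers are illustrative guesses, not the app's actual list.
REFUSAL_MARKERS = ("i'm sorry", "i cannot", "i apologize", "as an ai")

def is_valid_response(response: str) -> bool:
    text = response.strip()
    if len(text) < 80:                      # mirrors the length gate above
        return False
    lowered = text.lower()
    return not any(marker in lowered for marker in REFUSAL_MARKERS)
```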
```diff
@@ -439,17 +438,17 @@ def generate_answer_langchain(
     if not retrieved_docs:
         return "I couldn't find relevant information to answer your question."

-    # Step 2: Try LLM generation (
+    # Step 2: Try LLM generation (2 attempts for speed)
     llm_answer = None
-    for attempt in range(1,
-        logger.info(f"\n 🤖 LLM Generation Attempt {attempt}/
+    for attempt in range(1, 3):
+        logger.info(f"\n 🤖 LLM Generation Attempt {attempt}/2")
         llm_answer = generate_llm_answer(query, retrieved_docs, llm_client, attempt)

         if llm_answer:
             logger.info(f" ✅ LLM answer generated successfully")
             break
         else:
-            logger.warning(f" ⚠ Attempt {attempt}/
+            logger.warning(f" ⚠ Attempt {attempt}/2 failed, retrying...")

     # Step 3: If all attempts fail, return error
     if not llm_answer:
```
```diff
@@ -488,10 +487,10 @@ def fashion_chatbot(message: str, history: List[List[str]]):
     # Show generating indicator
     yield f"🔄 Generating answer ({len(retrieved_docs)} sources found)..."

-    # Generate answer with
+    # Generate answer with 2 quick attempts
     llm_answer = None
-    for attempt in range(1,
-        logger.info(f"\n 🤖 LLM Generation Attempt {attempt}/
+    for attempt in range(1, 3):
+        logger.info(f"\n 🤖 LLM Generation Attempt {attempt}/2")
         llm_answer = generate_llm_answer(message.strip(), retrieved_docs, llm_client, attempt)

         if llm_answer:
```
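`fashion_chatbot` is a generator: each `yield` replaces the message currently shown in the chat UI, which is how the "Generating answer" status appears and is then overwritten by the final answer. A minimal sketch of that pattern wired into Gradio; the function and message text here are illustrative.

```python
# Minimal sketch of the yield-based progress pattern, assuming a Gradio
# ChatInterface front end; names and messages are illustrative.
import gradio as gr

def chatbot_fn(message, history):
    yield "🔄 Generating answer..."        # interim status shown in the chat
    answer = "Final styling advice goes here."
    yield answer                           # the last yield becomes the reply

demo = gr.ChatInterface(fn=chatbot_fn)
# demo.launch()
```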