hamxaameer committed (verified)
Commit 24467e0 Β· 1 Parent(s): 068ed2b

Update app.py

Files changed (1):
  1. app.py +17 -18

app.py CHANGED
@@ -333,25 +333,25 @@ def generate_llm_answer(
 
     context_text = "\n\n".join(context_parts)
 
-    # Progressive parameters - balanced for T5's capabilities
+    # Progressive parameters - optimized for SPEED (shorter = faster)
     if attempt == 1:
         temperature = 0.7
-        max_new_tokens = 350  # Realistic length for T5
+        max_new_tokens = 250  # Faster generation
         top_p = 0.9
         repetition_penalty = 1.2
     elif attempt == 2:
         temperature = 0.75
-        max_new_tokens = 400
+        max_new_tokens = 300
         top_p = 0.92
         repetition_penalty = 1.25
     elif attempt == 3:
         temperature = 0.8
-        max_new_tokens = 450
+        max_new_tokens = 350
         top_p = 0.94
         repetition_penalty = 1.3
     else:
         temperature = 0.85
-        max_new_tokens = 500
+        max_new_tokens = 400
         top_p = 0.95
         repetition_penalty = 1.35
 
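This hunk tightens the per-attempt token budget (350/400/450/500 β†’ 250/300/350/400) while keeping the same temperature/top_p escalation across retries. As a reading aid, the if/elif ladder collapses into a lookup table; this is an illustrative refactor, not code from the commit, and the names are hypothetical (values copied from the "+" side above):

```python
# Hypothetical condensation of the per-attempt schedule; not from app.py.
SCHEDULE = {
    1: dict(temperature=0.70, max_new_tokens=250, top_p=0.90, repetition_penalty=1.20),
    2: dict(temperature=0.75, max_new_tokens=300, top_p=0.92, repetition_penalty=1.25),
    3: dict(temperature=0.80, max_new_tokens=350, top_p=0.94, repetition_penalty=1.30),
}
FALLBACK = dict(temperature=0.85, max_new_tokens=400, top_p=0.95, repetition_penalty=1.35)

def params_for(attempt: int) -> dict:
    """Sampling parameters for a given retry attempt (looser on each retry)."""
    return SCHEDULE.get(attempt, FALLBACK)
```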
 
@@ -370,17 +370,16 @@ Provide detailed fashion advice (200-400 words):"""
     try:
         logger.info(f"   β†’ Calling {CONFIG['llm_model']} (temp={temperature}, tokens={max_new_tokens})...")
 
-        # T5 optimized parameters - CRITICAL: truncate input to stay under 512 tokens
+        # T5 optimized for SPEED on CPU - use greedy decoding (num_beams=1)
         output = llm_client(
             user_prompt,
             max_new_tokens=max_new_tokens,
-            min_new_tokens=100,  # Ensure minimum length generation
+            min_new_tokens=80,  # Lower minimum for faster completion
             temperature=temperature,
             top_p=top_p,
             do_sample=True,
-            num_beams=4,  # Higher beams for better quality
+            num_beams=1,  # Greedy decoding for 4x faster speed on CPU
             repetition_penalty=repetition_penalty,
-            length_penalty=1.2,  # Encourage longer responses
             early_stopping=True,
             no_repeat_ngram_size=3,
             truncation=True  # CRITICAL: Truncate input if too long
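The speed win here comes from num_beams=4 β†’ 1 plus dropping length_penalty. A minimal, self-contained sketch of the same call pattern, assuming a transformers text2text-generation pipeline; the model name is a placeholder, not necessarily what this Space loads:

```python
from transformers import pipeline

# Placeholder model; CONFIG["llm_model"] in app.py may differ.
llm_client = pipeline("text2text-generation", model="google/flan-t5-base")

output = llm_client(
    "Suggest an outfit for a summer wedding.",
    max_new_tokens=250,
    min_new_tokens=80,       # don't stop before a usable answer
    do_sample=True,          # multinomial sampling: temperature/top_p apply
    temperature=0.7,
    top_p=0.9,
    num_beams=1,             # single decode pass, no beam-search overhead
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
    truncation=True,         # clip over-long prompts to the encoder's limit
)
print(output[0]["generated_text"])
```

Strictly speaking, with do_sample=True this is sampling rather than greedy decoding; the speed-up comes from num_beams=1 removing the beam-search overhead. The retained early_stopping=True and the dropped length_penalty both only affect beam search, so neither has any effect once num_beams=1.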
@@ -394,8 +393,8 @@ Provide detailed fashion advice (200-400 words):"""
             return None
 
         # Validation - accept responses with meaningful content
-        if len(response) < 100:
-            logger.warning(f"   βœ— Response too short: {len(response)} chars (need 100+)")
+        if len(response) < 80:
+            logger.warning(f"   βœ— Response too short: {len(response)} chars (need 80+)")
             return None
 
         # Check for apologies/refusals
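The acceptance threshold drops from 100 to 80 characters, matching the lower min_new_tokens. The length check and the refusal scan that follows could be folded into one predicate; a sketch, where the refusal markers are examples (the commit's actual list is not shown in this hunk):

```python
def is_acceptable(response: str, min_chars: int = 80) -> bool:
    """Reject answers that are too short or that read like a refusal."""
    if len(response) < min_chars:
        return False
    refusal_markers = ("i'm sorry", "i cannot", "as an ai")  # example markers
    return not any(marker in response.lower() for marker in refusal_markers)
```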
@@ -439,17 +438,17 @@ def generate_answer_langchain(
     if not retrieved_docs:
         return "I couldn't find relevant information to answer your question."
 
-    # Step 2: Try LLM generation (4 attempts)
+    # Step 2: Try LLM generation (2 attempts for speed)
     llm_answer = None
-    for attempt in range(1, 5):
-        logger.info(f"\n   πŸ€– LLM Generation Attempt {attempt}/4")
+    for attempt in range(1, 3):
+        logger.info(f"\n   πŸ€– LLM Generation Attempt {attempt}/2")
         llm_answer = generate_llm_answer(query, retrieved_docs, llm_client, attempt)
 
         if llm_answer:
             logger.info(f"   βœ… LLM answer generated successfully")
             break
         else:
-            logger.warning(f"   β†’ Attempt {attempt}/4 failed, retrying...")
+            logger.warning(f"   β†’ Attempt {attempt}/2 failed, retrying...")
 
     # Step 3: If all attempts fail, return error
     if not llm_answer:
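Halving the attempts (4 β†’ 2) bounds worst-case latency at the cost of fewer chances to recover from a bad sample. The loop generalizes to a small helper; a hypothetical refactor, not part of the commit:

```python
from typing import Callable, Optional

def retry_generate(gen: Callable[[int], Optional[str]],
                   max_attempts: int = 2) -> Optional[str]:
    """Call gen(attempt) with escalating attempt numbers until it returns text."""
    for attempt in range(1, max_attempts + 1):
        answer = gen(attempt)
        if answer:
            return answer
    return None

# Usage, mirroring the call in generate_answer_langchain:
# llm_answer = retry_generate(
#     lambda a: generate_llm_answer(query, retrieved_docs, llm_client, a)
# )
```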
@@ -488,10 +487,10 @@ def fashion_chatbot(message: str, history: List[List[str]]):
     # Show generating indicator
     yield f"πŸ’­ Generating answer ({len(retrieved_docs)} sources found)..."
 
-    # Generate answer with multiple attempts
+    # Generate answer with 2 quick attempts
     llm_answer = None
-    for attempt in range(1, 5):
-        logger.info(f"\n   πŸ€– LLM Generation Attempt {attempt}/4")
+    for attempt in range(1, 3):
+        logger.info(f"\n   πŸ€– LLM Generation Attempt {attempt}/2")
         llm_answer = generate_llm_answer(message.strip(), retrieved_docs, llm_client, attempt)
 
         if llm_answer:
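fashion_chatbot gets the same 2-attempt loop. Because it is a generator, each yield updates the displayed chat message (assuming the app wires it into Gradio's ChatInterface, which the (message, history) signature suggests), so the "Generating answer..." status is replaced by the final answer. A stripped-down sketch of that pattern, with retrieval and generation stubbed out:

```python
from typing import List

def fashion_chatbot(message: str, history: List[List[str]]):
    retrieved_docs = ["doc-1", "doc-2"]  # stand-in for the real retriever
    yield f"πŸ’­ Generating answer ({len(retrieved_docs)} sources found)..."
    answer = "Try a linen blazer over a light shirt..."  # stand-in for the LLM output
    yield answer  # in a Gradio chat generator, this replaces the status message
```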