Update app.py
app.py CHANGED

@@ -322,71 +322,68 @@ def generate_llm_answer(
     scored_docs.sort(key=lambda x: x[1], reverse=True)
     top_docs = [doc[0] for doc in scored_docs[:8]]
 
-    # Build context
+    # Build context - keep it SHORT to stay under 512 tokens
     context_parts = []
-    for doc in top_docs:
+    for doc in top_docs[:5]:  # Only use top 5 docs
         content = doc.page_content.strip()
-
-
+        # Keep each doc snippet under 150 chars
+        if len(content) > 150:
+            content = content[:150] + "..."
         context_parts.append(content)
 
     context_text = "\n\n".join(context_parts)
 
-    # Progressive parameters
+    # Progressive parameters - balanced for T5's capabilities
     if attempt == 1:
-        temperature = 0.
-        max_new_tokens =
-        top_p = 0.
-        repetition_penalty = 1.
+        temperature = 0.7
+        max_new_tokens = 350  # Realistic length for T5
+        top_p = 0.9
+        repetition_penalty = 1.2
     elif attempt == 2:
+        temperature = 0.75
+        max_new_tokens = 400
+        top_p = 0.92
+        repetition_penalty = 1.25
+    elif attempt == 3:
+        temperature = 0.8
+        max_new_tokens = 450
+        top_p = 0.94
+        repetition_penalty = 1.3
+    else:
         temperature = 0.85
         max_new_tokens = 500
-        top_p = 0.94
-        repetition_penalty = 1.18
-    elif attempt == 3:
-        temperature = 0.9
-        max_new_tokens = 550
         top_p = 0.95
-        repetition_penalty = 1.
-    else:
-        temperature = 0.95
-        max_new_tokens = 600
-        top_p = 0.96
-        repetition_penalty = 1.22
+        repetition_penalty = 1.35
 
-    # Create
+    # Create COMPACT T5 prompt to stay under 512 tokens (critical!)
     model_type = CONFIG.get("model_type", "t5")
 
-    # T5 format -
-
-
-Question: {query}
-
-Fashion Knowledge Base:
-{context_text[:2000]}
+    # T5 format - simple and effective to minimize tokens
+    # Keep prompt minimal to leave room for generation
+    user_prompt = f"""Fashion Question: {query}
 
-
-
-- Practical styling tips and combinations
-- Why these suggestions work
-- Additional helpful considerations
+Relevant Fashion Tips:
+{context_text[:600]}
 
-Answer:"""
+Provide detailed fashion advice (200-400 words):"""
 
     try:
         logger.info(f" β Calling {CONFIG['llm_model']} (temp={temperature}, tokens={max_new_tokens})...")
 
-        # T5 optimized parameters
+        # T5 optimized parameters - CRITICAL: truncate input to stay under 512 tokens
         output = llm_client(
             user_prompt,
-            max_new_tokens=max_new_tokens,
+            max_new_tokens=max_new_tokens,
+            min_new_tokens=100,  # Ensure minimum length generation
             temperature=temperature,
             top_p=top_p,
             do_sample=True,
-            num_beams=
+            num_beams=4,  # Higher beams for better quality
             repetition_penalty=repetition_penalty,
+            length_penalty=1.2,  # Encourage longer responses
             early_stopping=True,
-            no_repeat_ngram_size=3
+            no_repeat_ngram_size=3,
+            truncation=True  # CRITICAL: Truncate input if too long
        )
 
         # Extract generated text

@@ -396,9 +393,9 @@ Answer:"""
             logger.warning(f" β Empty response (attempt {attempt})")
             return None
 
-        # Validation - accept
-        if len(response) <
-            logger.warning(f" β Response too short: {len(response)} chars (need
+        # Validation - accept responses with meaningful content
+        if len(response) < 100:
+            logger.warning(f" β Response too short: {len(response)} chars (need 100+)")
             return None
 
         # Check for apologies/refusals
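A note on the 512-token ceiling these comments keep citing: T5-family encoders (e.g. flan-t5) accept at most 512 input tokens, and with `truncation=True` anything past that is silently dropped. If you want to confirm the compact prompt actually fits rather than relying on truncation, a sketch like the following works; the checkpoint name and the `fits_t5_budget` helper are illustrative assumptions, not code from app.py.

```python
from transformers import AutoTokenizer

# Hypothetical checkpoint purely for illustration; the app reads its
# model name from CONFIG["llm_model"].
MODEL_NAME = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def fits_t5_budget(prompt: str, limit: int = 512) -> bool:
    """Return True if the prompt encodes to at most `limit` tokens."""
    return len(tokenizer.encode(prompt)) <= limit

# ~600 chars of context plus the short template should land well under
# 512 tokens (English text runs roughly 4 characters per token).
prompt = (
    "Fashion Question: What goes with a navy blazer?\n\n"
    "Relevant Fashion Tips:\nNavy pairs well with grey, white, and tan.\n\n"
    "Provide detailed fashion advice (200-400 words):"
)
print(fits_t5_budget(prompt))  # True
```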
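One subtlety in the new generation kwargs: in Hugging Face transformers, `do_sample=True` combined with `num_beams=4` puts generation into beam-search multinomial sampling rather than plain sampling, and `early_stopping=True` only takes effect in that beam regime. A minimal standalone reproduction of the same kwarg combination (the small checkpoint is an assumption for the demo):

```python
from transformers import pipeline

# Assumed small checkpoint so the demo is self-contained.
generator = pipeline("text2text-generation", model="google/flan-t5-small")

out = generator(
    "Suggest an outfit for a rainy autumn day.",
    max_new_tokens=64,
    do_sample=True,        # sampling...
    num_beams=4,           # ...plus beams -> beam-sample decoding
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
    early_stopping=True,   # only meaningful when num_beams > 1
    truncation=True,       # clip over-long inputs at the encoder limit
)
print(out[0]["generated_text"])
```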
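The `attempt` argument implies a retry loop upstream of this function. That caller isn't part of the diff, so the following is a hypothetical sketch of the shape it presumably takes; `answer_query`, the keyword signature, and the four-attempt cap are invented here to match the four parameter tiers.

```python
def answer_query(query: str, max_attempts: int = 4) -> str:
    """Hypothetical caller: retry with progressively looser sampling.

    Assumes generate_llm_answer from app.py is in scope and returns
    None on failure; each failed attempt bumps `attempt`, which selects
    the next temperature/top_p/repetition_penalty tier.
    """
    for attempt in range(1, max_attempts + 1):
        answer = generate_llm_answer(query, attempt=attempt)
        if answer:
            return answer
    return "Sorry, I couldn't come up with styling advice for that one."
```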