hamxaameer committed · Commit bd2bde4 · verified · 1 Parent(s): 4096b3f

Update app.py

Files changed (1):
  app.py  +56 -94
app.py CHANGED
@@ -43,47 +43,38 @@ def initialize_llm():
     """Initialize free local LLM with transformers pipeline"""
     logger.info("🔄 Initializing FREE local language model...")
 
-    BACKUP_MODELS = [
-        "microsoft/phi-2",  # 2.7B - Best quality that fits in 16GB
-    ]
-
-    for model_name in BACKUP_MODELS:
-        try:
-            logger.info(f"   Trying {model_name}...")
-            device = 0 if torch.cuda.is_available() else -1
-
-            # Phi-2 configuration
-            task = "text-generation"
-            model_type = "phi"
-
-            # Optimized for memory efficiency
-            model_kwargs = {
-                "low_cpu_mem_usage": True,
-                "torch_dtype": torch.float32,  # Use float32 for CPU
-                "trust_remote_code": True  # Required for Phi-2
-            }
-
-            llm_client = pipeline(
-                task,
-                model=model_name,
-                device=device,
-                max_new_tokens=300,
-                truncation=True,
-                model_kwargs=model_kwargs
-            )
-
-            CONFIG["llm_model"] = model_name
-            CONFIG["model_type"] = model_type
-            logger.info(f"✅ FREE LLM initialized: {model_name}")
-            logger.info(f"   Device: {'GPU' if device == 0 else 'CPU'}")
-            return llm_client
-
-        except Exception as e:
-            logger.warning(f"⚠️ Failed {model_name}: {str(e)[:100]}")
-            continue
-
-    logger.error("⚠️ All models failed - will use fallback generation")
-    return None
+    # Use FLAN-T5-Large - reliable, fast, and proven to work
+    model_name = "google/flan-t5-large"
+
+    try:
+        logger.info(f"   Loading {model_name}...")
+        device = 0 if torch.cuda.is_available() else -1
+
+        # T5 configuration
+        task = "text2text-generation"
+        model_type = "t5"
+
+        # Optimized for speed and quality
+        model_kwargs = {
+            "low_cpu_mem_usage": True,
+        }
+
+        llm_client = pipeline(
+            task,
+            model=model_name,
+            device=device,
+            model_kwargs=model_kwargs
+        )
+
+        CONFIG["llm_model"] = model_name
+        CONFIG["model_type"] = model_type
+        logger.info(f"✅ LLM initialized: {model_name}")
+        logger.info(f"   Device: {'GPU' if device == 0 else 'CPU'}")
+        return llm_client
+
+    except Exception as e:
+        logger.error(f"❌ Failed to load model: {str(e)}")
+        raise Exception(f"Failed to initialize LLM: {str(e)}")
 
 
 def initialize_embeddings():
     """Initialize sentence transformer embeddings"""
@@ -363,32 +354,31 @@ def generate_llm_answer(
     top_p = 0.97
     repetition_penalty = 1.25
 
-    # Create prompt for Phi-2
-    model_type = CONFIG.get("model_type", "phi")
+    # Create optimized T5 prompt
+    model_type = CONFIG.get("model_type", "t5")
 
-    # Phi-2 optimized format (simple and effective)
-    user_prompt = f"""Instruct: You are a professional fashion advisor. Use the fashion knowledge below to answer the question with specific, detailed advice.
-
-Fashion Knowledge:
-{context_text[:1200]}
+    # T5 format - simple and effective for good answers
+    user_prompt = f"""Answer this fashion question with detailed, specific advice using the context provided.
 
 Question: {query}
 
-Output: """
+Fashion Context:
+{context_text[:1500]}
+
+Provide a complete, detailed answer (150-250 words):"""
 
     try:
         logger.info(f"   → Calling {CONFIG['llm_model']} (temp={temperature}, tokens={max_tokens})...")
 
-        # Phi-2 optimized parameters
+        # T5 optimized parameters for quality and speed
         output = llm_client(
             user_prompt,
-            max_new_tokens=min(max_tokens, 250),  # Cap for speed
-            temperature=0.7,  # Balanced
-            top_p=0.9,
-            repetition_penalty=1.15,
+            max_length=300,  # Good length for detailed answers
+            temperature=0.75,  # Balanced creativity
+            top_p=0.92,
             do_sample=True,
-            return_full_text=False,
-            pad_token_id=50256  # Phi-2 pad token
+            num_beams=2,  # Light beam search for quality
+            early_stopping=True
         )
 
         # Extract generated text
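
Two details of the new call are worth noting: for an encoder-decoder model, max_length bounds the generated sequence rather than prompt plus completion, and combining do_sample=True with num_beams=2 selects transformers' beam-sample mode (multinomial sampling within each beam). The hunk ends just before the extraction step; with a text2text pipeline it would plausibly look like this hypothetical helper (the actual body is not shown in this commit):

# Hypothetical helper matching the "# Extract generated text" step above;
# the real function body is not visible in this diff.
from typing import Optional

def extract_answer(output) -> Optional[str]:
    """Pull the generated string out of a text2text pipeline result."""
    if not output:
        return None
    text = output[0].get("generated_text", "").strip()
    return text or None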
@@ -417,34 +407,7 @@ Output: """
         return None
 
 def synthesize_direct_answer(
-    query: str,
-    retrieved_docs: List[Document]
-) -> str:
-    """
-    Enhanced fallback: Combine multiple documents intelligently
-    """
-    logger.info("   → Using enhanced fallback synthesis")
-
-    if not retrieved_docs:
-        return "I don't have enough information to answer that question accurately. Please try rephrasing your question."
-
-    # Combine top 3 most relevant documents
-    top_docs = retrieved_docs[:3]
-    combined_content = []
-
-    for i, doc in enumerate(top_docs, 1):
-        content = doc.page_content.strip()
-        if len(content) > 200:
-            content = content[:200]
-        combined_content.append(f"{content}")
-
-    answer = " ".join(combined_content)
-
-    # Add context-aware prefix
-    answer = f"Based on fashion guidelines: {answer}"
-
-    return answer
-
+# Removed synthetic fallback - only use LLM
 def generate_answer_langchain(
     query: str,
     vectorstore,
@@ -507,12 +470,12 @@ def fashion_chatbot(message: str, history: List[List[str]]):
         message.strip(),
         vectorstore,
         top_k=CONFIG["top_k"]
-    )
-
-    if not retrieved_docs:
-        yield "I couldn't find relevant information to answer your question."
-        return
-
+    # Step 3: If all attempts fail, return error
+    if not llm_answer:
+        logger.error(f"   ✗ All 4 LLM attempts failed")
+        return "I apologize, but I'm having trouble generating a response. Please try rephrasing your question or ask something else."
+
+    return llm_answer
     # Show generating indicator
     yield f"💭 Generating answer ({len(retrieved_docs)} sources found)..."
 
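
Read together with the earlier hunks, the commit replaces the old "no relevant documents" early-exit with a single guard on the LLM's answer. A minimal sketch of the control flow these additions describe, written against app.py's module scope; retrieve_documents is an assumed name for the retrieval call whose arguments appear above, and generate_llm_answer's signature is simplified:

# Hypothetical wiring of the new error path (assumed names marked).
def answer_query(message: str) -> str:
    # Steps 1-2: retrieve context, then ask the LLM (assumed signatures)
    retrieved_docs = retrieve_documents(            # assumed function name
        message.strip(), vectorstore, top_k=CONFIG["top_k"]
    )
    llm_answer = generate_llm_answer(message.strip(), retrieved_docs)
    # Step 3: if all attempts fail, return an apology instead of a
    # synthetic fallback (the extractive fallback was removed in this commit)
    if not llm_answer:
        logger.error("   ✗ All LLM attempts failed")
        return ("I apologize, but I'm having trouble generating a response. "
                "Please try rephrasing your question or ask something else.")
    return llm_answer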
@@ -552,12 +515,11 @@ def fashion_chatbot(message: str, history: List[List[str]]):
 # ============================================================================
 # INITIALIZE AND LAUNCH
 # ============================================================================
-
-# Global variables
-llm_client = None
-embeddings = None
-vectorstore = None
-
+# If LLM fails, show error
+if not llm_answer:
+    logger.error(f"   ✗ All LLM attempts failed")
+    yield "I apologize, but I'm having trouble generating a response. Please try rephrasing your question."
+    return
 
 def startup():
     """Initialize all models and load vector store"""
     global llm_client, embeddings, vectorstore
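
The final hunk drops the module-level llm_client/embeddings/vectorstore placeholders. Python's global statement does not require a name to exist before it is rebound, so startup() still works; the declarations only made the shared state explicit. For reference, the removed pattern, sketched with the two initializers this diff defines (the vector-store loader name is hypothetical):

# Module-level state rebound by startup(); the None placeholders are
# optional in Python but document the shared state up front.
llm_client = None
embeddings = None
vectorstore = None

def startup():
    """Initialize all models and load vector store"""
    global llm_client, embeddings, vectorstore
    llm_client = initialize_llm()                 # defined in this file
    embeddings = initialize_embeddings()          # defined in this file
    vectorstore = load_vectorstore(embeddings)    # hypothetical loader name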
 