hamxaameer committed
Commit 4096b3f · verified · 1 Parent(s): 5a8f7f6

Update app.py

Files changed (1)
  1. app.py +25 -81
app.py CHANGED
@@ -44,9 +44,7 @@ def initialize_llm():
     logger.info("🔄 Initializing FREE local language model...")
 
     BACKUP_MODELS = [
-        "HuggingFaceH4/zephyr-7b-beta",          # Primary - 7B, excellent quality
-        "mistralai/Mistral-7B-Instruct-v0.2",    # Backup - 7B, very good
-        "google/flan-t5-xl",                     # Fallback - 3B, reliable
+        "microsoft/phi-2",                       # 2.7B - Best quality that fits in 16GB
     ]
 
     for model_name in BACKUP_MODELS:
@@ -54,24 +52,15 @@ def initialize_llm():
             logger.info(f" Trying {model_name}...")
             device = 0 if torch.cuda.is_available() else -1
 
-            # Determine task and model type
-            if "t5" in model_name.lower():
-                task = "text2text-generation"
-                model_type = "t5"
-            elif "zephyr" in model_name.lower():
-                task = "text-generation"
-                model_type = "zephyr"
-            elif "mistral" in model_name.lower():
-                task = "text-generation"
-                model_type = "mistral"
-            else:
-                task = "text-generation"
-                model_type = "instruct"
+            # Phi-2 configuration
+            task = "text-generation"
+            model_type = "phi"
 
-            # Model-specific kwargs for optimization
+            # Optimized for memory efficiency
             model_kwargs = {
                 "low_cpu_mem_usage": True,
-                "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32
+                "torch_dtype": torch.float32,    # Use float32 for CPU
+                "trust_remote_code": True        # Required for Phi-2
             }
 
             llm_client = pipeline(
@@ -374,78 +363,33 @@ def generate_llm_answer(
     top_p = 0.97
     repetition_penalty = 1.25
 
-    # Create prompt based on model type
-    model_type = CONFIG.get("model_type", "instruct")
-
-    if model_type == "t5":
-        # T5 needs simple format
-        user_prompt = f"Answer this fashion question using the context:\n\nQuestion: {query}\n\nContext: {context_text[:1000]}\n\nAnswer:"
-    elif model_type == "zephyr":
-        # Zephyr chat format
-        user_prompt = f"""<|system|>
-You are a professional fashion advisor. Use the provided fashion knowledge to give specific, detailed advice.</|system|>
-<|user|>
-Fashion Knowledge:
-{context_text[:1500]}
-
-Question: {query}
-
-Provide a detailed, specific answer (150-250 words) based on the fashion knowledge above.</|user|>
-<|assistant|>"""
-    elif model_type == "mistral":
-        # Mistral instruct format
-        user_prompt = f"""[INST] You are a fashion expert. Use the following fashion knowledge to answer the question with specific, practical advice.
+    # Create prompt for Phi-2
+    model_type = CONFIG.get("model_type", "phi")
+
+    # Phi-2 optimized format (simple and effective)
+    user_prompt = f"""Instruct: You are a professional fashion advisor. Use the fashion knowledge below to answer the question with specific, detailed advice.
 
 Fashion Knowledge:
-{context_text[:1500]}
+{context_text[:1200]}
 
 Question: {query}
 
-Provide a detailed answer (150-250 words). [/INST]"""
-    else:
-        # Generic instruct format
-        user_prompt = f"""[INST] Question: {query}
-
-Fashion Knowledge:
-{context_text}
-
-Answer the question using the knowledge above. Be specific and helpful (150-250 words). [/INST]"""
+Output: """
 
     try:
         logger.info(f" → Calling {CONFIG['llm_model']} (temp={temperature}, tokens={max_tokens})...")
 
-        # Call pipeline with model-specific parameters
-        if model_type == "t5":
-            # T5 uses max_length
-            output = llm_client(
-                user_prompt,
-                max_length=200,
-                temperature=0.7,
-                top_p=0.9,
-                do_sample=True
-            )
-        elif model_type in ["zephyr", "mistral"]:
-            # Modern instruct models - optimized for quality
-            output = llm_client(
-                user_prompt,
-                max_new_tokens=250,          # Good length for detailed answers
-                temperature=0.7,             # Balanced creativity
-                top_p=0.9,
-                repetition_penalty=1.1,
-                do_sample=True,
-                return_full_text=False
-            )
-        else:
-            # Other models
-            output = llm_client(
-                user_prompt,
-                max_new_tokens=max_tokens,
-                temperature=temperature,
-                top_p=top_p,
-                repetition_penalty=repetition_penalty,
-                do_sample=True,
-                return_full_text=False
-            )
+        # Phi-2 optimized parameters
+        output = llm_client(
+            user_prompt,
+            max_new_tokens=min(max_tokens, 250),   # Cap for speed
+            temperature=0.7,                       # Balanced
+            top_p=0.9,
+            repetition_penalty=1.15,
+            do_sample=True,
+            return_full_text=False,
+            pad_token_id=50256                     # Phi-2 pad token
+        )
 
         # Extract generated text
         response = output[0]['generated_text'].strip()
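
For context, here is a minimal, self-contained sketch of what the model setup amounts to after this commit: microsoft/phi-2 loaded through a transformers text-generation pipeline with the same kwargs shown in the diff. It is an illustration only, not the full initialize_llm() from app.py (the fallback loop, logging, and CONFIG handling are omitted).

```python
# Hedged sketch of the post-commit setup; assumes transformers and torch are installed.
import torch
from transformers import pipeline

device = 0 if torch.cuda.is_available() else -1   # GPU if available, else CPU

llm_client = pipeline(
    "text-generation",
    model="microsoft/phi-2",            # single 2.7B model replaces the 7B fallback list
    device=device,
    model_kwargs={
        "low_cpu_mem_usage": True,
        "torch_dtype": torch.float32,   # float32 for CPU, per the commit
        "trust_remote_code": True,      # the commit marks this as required for Phi-2
    },
)
```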
 
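And a matching sketch of the new Phi-2 prompt format and generation call from generate_llm_answer, reusing llm_client from the snippet above; the sample context_text and query are placeholders, not values from app.py.

```python
# Illustrative inputs; in app.py these come from retrieval and the user's question.
context_text = "Linen, seersucker and cotton poplin breathe well in humid weather."
query = "What fabrics work best for a summer wedding outfit?"

# Phi-2's simple "Instruct: ... Output:" prompt used by the commit.
user_prompt = f"""Instruct: You are a professional fashion advisor. Use the fashion knowledge below to answer the question with specific, detailed advice.

Fashion Knowledge:
{context_text[:1200]}

Question: {query}

Output: """

output = llm_client(
    user_prompt,
    max_new_tokens=250,          # capped for speed, as in the diff
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.15,
    do_sample=True,
    return_full_text=False,      # return only the completion, not the prompt
    pad_token_id=50256,          # pad token id the commit uses for Phi-2
)
print(output[0]["generated_text"].strip())
```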