hamxaameer committed
Commit 9af7bbc · verified · 1 Parent(s): 5eeadbf

Update app.py

Files changed (1)
  1. app.py +451 -440
app.py CHANGED
@@ -1,440 +1,451 @@
- """
- Fashion Advisor RAG - Hugging Face Deployment
- Complete RAG system with FAISS vector store and local LLM
- """
-
- import gradio as gr
- import logging
- import os
- from pathlib import Path
- from typing import List, Tuple, Dict, Optional
- import pickle
-
- # Core ML libraries
- import torch
- from transformers import pipeline
- from sentence_transformers import SentenceTransformer
- from langchain_community.vectorstores import FAISS
- from langchain_community.embeddings import HuggingFaceEmbeddings
- from langchain.schema import Document
-
- # Setup logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- # ============================================================================
- # CONFIGURATION
- # ============================================================================
-
- CONFIG = {
-     "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
-     "llm_model": None, # Will be set during initialization
-     "vector_store_path": "./faiss_vectorstore",
-     "top_k": 15,
-     "temperature": 0.75,
-     "max_tokens": 350,
- }
-
- # ============================================================================
- # INITIALIZE MODELS
- # ============================================================================
-
- def initialize_llm():
-     """Initialize free local LLM with transformers pipeline"""
-     logger.info("🔄 Initializing FREE local language model...")
-
-     BACKUP_MODELS = [
-         "microsoft/Phi-3-mini-4k-instruct", # Primary - 3.8B, very efficient
-         "google/flan-t5-large", # Backup - 780M, good quality
-         "google/flan-t5-base", # Fallback - 250M, fast
-     ]
-
-     for model_name in BACKUP_MODELS:
-         try:
-             logger.info(f" Trying {model_name}...")
-             device = 0 if torch.cuda.is_available() else -1
-
-             llm_client = pipeline(
-                 "text-generation",
-                 model=model_name,
-                 device=device,
-                 max_length=512,
-                 truncation=True,
-             )
-
-             CONFIG["llm_model"] = model_name
-             logger.info(f"✅ FREE LLM initialized: {model_name}")
-             logger.info(f" Device: {'GPU' if device == 0 else 'CPU'}")
-             return llm_client
-
-         except Exception as e:
-             logger.warning(f"⚠️ Failed {model_name}: {str(e)[:100]}")
-             continue
-
-     logger.error("⚠️ All models failed - will use fallback generation")
-     return None
-
- def initialize_embeddings():
-     """Initialize sentence transformer embeddings"""
-     logger.info("🔄 Initializing embeddings model...")
-
-     embeddings = HuggingFaceEmbeddings(
-         model_name=CONFIG["embedding_model"],
-         model_kwargs={'device': 'cpu'},
-         encode_kwargs={'normalize_embeddings': True}
-     )
-
-     logger.info(f"✅ Embeddings initialized: {CONFIG['embedding_model']}")
-     return embeddings
-
- def load_vector_store(embeddings):
-     """Load FAISS vector store"""
-     logger.info("🔄 Loading FAISS vector store...")
-
-     vector_store_path = CONFIG["vector_store_path"]
-
-     if not os.path.exists(vector_store_path):
-         logger.error(f"❌ Vector store not found at {vector_store_path}")
-         raise FileNotFoundError(f"Vector store directory not found: {vector_store_path}")
-
-     vectorstore = FAISS.load_local(
-         vector_store_path,
-         embeddings,
-         allow_dangerous_deserialization=True
-     )
-
-     logger.info(f"✅ FAISS vector store loaded from {vector_store_path}")
-     return vectorstore
-
- # ============================================================================
- # RAG PIPELINE FUNCTIONS
- # ============================================================================
-
- def retrieve_knowledge_langchain(
-     query: str,
-     vectorstore,
-     top_k: int = 15
- ) -> Tuple[List[Document], float]:
-     """
-     Retrieve relevant documents using LangChain FAISS with query expansion
-     """
-     logger.info(f"🔍 Retrieving knowledge for: '{query}'")
-
-     # Create query variants for better coverage
-     query_variants = [
-         query, # Original
-         f"fashion advice clothing outfit style for {query}", # Semantic expansion
-     ]
-
-     all_docs = []
-
-     # Retrieve for each variant
-     for variant in query_variants:
-         try:
-             docs_and_scores = vectorstore.similarity_search_with_score(variant, k=top_k)
-
-             for doc, score in docs_and_scores:
-                 similarity = 1.0 / (1.0 + score)
-                 doc.metadata['similarity'] = similarity
-                 doc.metadata['query_variant'] = variant
-                 all_docs.append(doc)
-
-         except Exception as e:
-             logger.error(f"Retrieval error for variant '{variant}': {e}")
-
-     # Deduplicate by content
-     unique_docs = {}
-     for doc in all_docs:
-         content_key = doc.page_content[:100]
-         if content_key not in unique_docs:
-             unique_docs[content_key] = doc
-         else:
-             # Keep document with higher similarity
-             if doc.metadata.get('similarity', 0) > unique_docs[content_key].metadata.get('similarity', 0):
-                 unique_docs[content_key] = doc
-
-     final_docs = list(unique_docs.values())
-
-     # Sort by similarity
-     final_docs.sort(key=lambda x: x.metadata.get('similarity', 0), reverse=True)
-
-     # Calculate confidence
-     if final_docs:
-         avg_similarity = sum(d.metadata.get('similarity', 0) for d in final_docs) / len(final_docs)
-         confidence = min(avg_similarity, 1.0)
-     else:
-         confidence = 0.0
-
-     logger.info(f"✅ Retrieved {len(final_docs)} unique documents (confidence: {confidence:.2f})")
-
-     return final_docs, confidence
-
- def generate_llm_answer(
-     query: str,
-     retrieved_docs: List[Document],
-     llm_client,
-     attempt: int = 1
- ) -> Optional[str]:
-     """
-     Generate answer using local LLM with retrieved context
-     """
-     if not llm_client:
-         logger.error(" → LLM client not initialized")
-         return None
-
-     # Build focused context
-     query_lower = query.lower()
-     query_words = set(query_lower.split())
-
-     # Score documents by relevance
-     scored_docs = []
-     for doc in retrieved_docs[:20]:
-         content = doc.page_content.lower()
-         doc_words = set(content.split())
-         overlap = len(query_words.intersection(doc_words))
-
-         # Boost for verified/curated
-         if doc.metadata.get('verified', False):
-             overlap += 10
-
-         # Boost for longer content
-         if len(doc.page_content) > 200:
-             overlap += 3
-
-         scored_docs.append((doc, overlap))
-
-     # Sort and take top 8
-     scored_docs.sort(key=lambda x: x[1], reverse=True)
-     top_docs = [doc[0] for doc in scored_docs[:8]]
-
-     # Build context
-     context_parts = []
-     for doc in top_docs:
-         content = doc.page_content.strip()
-         if len(content) > 400:
-             content = content[:400] + "..."
-         context_parts.append(content)
-
-     context_text = "\n\n".join(context_parts)
-
-     # Progressive parameters based on attempt
-     if attempt == 1:
-         temperature = 0.75
-         max_tokens = 350
-         top_p = 0.92
-         repetition_penalty = 1.1
-     elif attempt == 2:
-         temperature = 0.85
-         max_tokens = 450
-         top_p = 0.94
-         repetition_penalty = 1.15
-     elif attempt == 3:
-         temperature = 0.92
-         max_tokens = 550
-         top_p = 0.96
-         repetition_penalty = 1.2
-     else:
-         temperature = 1.0
-         max_tokens = 600
-         top_p = 0.97
-         repetition_penalty = 1.25
-
-     # Create prompt
-     user_prompt = f"""[INST] Question: {query}
-
- Fashion Knowledge:
- {context_text}
-
- Answer the question using the knowledge above. Be specific and helpful (100-250 words). [/INST]"""
-
-     try:
-         logger.info(f" → Calling {CONFIG['llm_model']} (temp={temperature}, tokens={max_tokens})...")
-
-         # Call pipeline
-         output = llm_client(
-             user_prompt,
-             max_new_tokens=max_tokens,
-             temperature=temperature,
-             top_p=top_p,
-             repetition_penalty=repetition_penalty,
-             do_sample=True,
-             return_full_text=False,
-             pad_token_id=llm_client.tokenizer.eos_token_id
-         )
-
-         # Extract generated text
-         response = output[0]['generated_text'].strip()
-
-         if not response:
-             logger.warning(f" ✗ Empty response (attempt {attempt})")
-             return None
-
-         # Minimal validation
-         if len(response) < 20:
-             logger.warning(f" ✗ Response too short: {len(response)} chars")
-             return None
-
-         # Check for apologies/refusals
-         apology_phrases = ["i cannot", "i can't", "i'm sorry", "i apologize", "i don't have"]
-         if any(phrase in response.lower()[:100] for phrase in apology_phrases):
-             logger.warning(f" ✗ Apology detected")
-             return None
-
-         logger.info(f" ✅ Generated answer ({len(response)} chars)")
-         return response
-
-     except Exception as e:
-         logger.error(f" ✗ Generation error: {e}")
-         return None
-
- def synthesize_direct_answer(
-     query: str,
-     retrieved_docs: List[Document]
- ) -> str:
-     """
-     Fallback: Synthesize answer directly from most relevant documents
-     """
-     logger.info(" → Using fallback: direct synthesis")
-
-     if not retrieved_docs:
-         return "I don't have enough information to answer that question accurately."
-
-     # Get most relevant document
-     best_doc = retrieved_docs[0]
-     content = best_doc.page_content.strip()
-
-     # Create answer from top document
-     if len(content) > 500:
-         answer = content[:500] + "..."
-     else:
-         answer = content
-
-     return answer
-
- def generate_answer_langchain(
-     query: str,
-     vectorstore,
-     llm_client
- ) -> str:
-     """
-     Main RAG pipeline: Retrieve → Generate → Fallback
-     """
-     logger.info(f"\n{'='*80}")
-     logger.info(f"Processing query: '{query}'")
-     logger.info(f"{'='*80}")
-
-     # Step 1: Retrieve documents
-     retrieved_docs, confidence = retrieve_knowledge_langchain(
-         query,
-         vectorstore,
-         top_k=CONFIG["top_k"]
-     )
-
-     if not retrieved_docs:
-         return "I couldn't find relevant information to answer your question."
-
-     # Step 2: Try LLM generation (4 attempts)
-     llm_answer = None
-     for attempt in range(1, 5):
-         logger.info(f"\n 🤖 LLM Generation Attempt {attempt}/4")
-         llm_answer = generate_llm_answer(query, retrieved_docs, llm_client, attempt)
-
-         if llm_answer:
-             logger.info(f" ✅ LLM answer generated successfully")
-             break
-         else:
-             logger.warning(f" → Attempt {attempt}/4 failed, retrying...")
-
-     # Step 3: Fallback if all attempts fail
-     if not llm_answer:
-         logger.error(f" ✗ All 4 LLM attempts failed - using fallback")
-         llm_answer = synthesize_direct_answer(query, retrieved_docs)
-
-     return llm_answer
-
- # ============================================================================
- # GRADIO INTERFACE
- # ============================================================================
-
- def fashion_chatbot(message: str, history: List[List[str]]) -> str:
-     """
-     Chatbot function for Gradio interface
-     """
-     try:
-         if not message or not message.strip():
-             return "Please ask a fashion-related question!"
-
-         # Generate answer using RAG pipeline
-         answer = generate_answer_langchain(
-             message.strip(),
-             vectorstore,
-             llm_client
-         )
-
-         return answer
-
-     except Exception as e:
-         logger.error(f"Error in chatbot: {e}")
-         return f"Sorry, I encountered an error: {str(e)}"
-
- # ============================================================================
- # INITIALIZE AND LAUNCH
- # ============================================================================
-
- # Global variables
- llm_client = None
- embeddings = None
- vectorstore = None
-
- def startup():
-     """Initialize all models and load vector store"""
-     global llm_client, embeddings, vectorstore
-
-     logger.info("🚀 Starting Fashion Advisor RAG...")
-
-     # Initialize embeddings
-     embeddings = initialize_embeddings()
-
-     # Load vector store
-     vectorstore = load_vector_store(embeddings)
-
-     # Initialize LLM
-     llm_client = initialize_llm()
-
-     logger.info("✅ All components initialized successfully!")
-
- # Initialize on startup
- startup()
-
- # Create Gradio interface
- demo = gr.ChatInterface(
-     fn=fashion_chatbot,
-     title="👗 Fashion Advisor - RAG System",
-     description="""
-     **Ask me anything about fashion!** 🌟
-
-     I can help with:
-     - Outfit recommendations for occasions
-     - Color combinations and styling
-     - Seasonal fashion advice
-     - Body type and fit guidance
-     - Wardrobe essentials
-
-     *Powered by RAG with FAISS vector search and local LLM*
-     """,
-     examples=[
-         "What should I wear to a business meeting?",
-         "What colors go well with navy blue?",
-         "What are essential wardrobe items for fall?",
-         "How to dress for a summer wedding?",
-         "What's the best outfit for a university presentation?",
-     ],
-     theme=gr.themes.Soft(),
-     retry_btn=None,
-     undo_btn="Delete Previous",
-     clear_btn="Clear Chat",
- )
-
- # Launch
- if __name__ == "__main__":
-     demo.launch()
+ """
+ Fashion Advisor RAG - Hugging Face Deployment
+ Complete RAG system with FAISS vector store and local LLM
+ """
+
+ import gradio as gr
+ import logging
+ import os
+ from pathlib import Path
+ from typing import List, Tuple, Dict, Optional
+ import pickle
+
+ # Core ML libraries
+ import torch
+ from transformers import pipeline
+ from sentence_transformers import SentenceTransformer
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain.schema import Document
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # ============================================================================
+ # CONFIGURATION
+ # ============================================================================
+
+ CONFIG = {
+     "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
+     "llm_model": None, # Will be set during initialization
+     "vector_store_path": ".", # Root directory (files are in root on HF Spaces)
+     "top_k": 15,
+     "temperature": 0.75,
+     "max_tokens": 350,
+ }
+
+ # ============================================================================
+ # INITIALIZE MODELS
+ # ============================================================================
+
+ def initialize_llm():
+     """Initialize free local LLM with transformers pipeline"""
+     logger.info("🔄 Initializing FREE local language model...")
+
+     BACKUP_MODELS = [
+         "microsoft/Phi-3-mini-4k-instruct", # Primary - 3.8B, very efficient
+         "google/flan-t5-large", # Backup - 780M, good quality
+         "google/flan-t5-base", # Fallback - 250M, fast
+     ]
+
+     for model_name in BACKUP_MODELS:
+         try:
+             logger.info(f" Trying {model_name}...")
+             device = 0 if torch.cuda.is_available() else -1
+
+             llm_client = pipeline(
+                 "text-generation",
+                 model=model_name,
+                 device=device,
+                 max_length=512,
+                 truncation=True,
+             )
+
+             CONFIG["llm_model"] = model_name
+             logger.info(f"✅ FREE LLM initialized: {model_name}")
+             logger.info(f" Device: {'GPU' if device == 0 else 'CPU'}")
+             return llm_client
+
+         except Exception as e:
+             logger.warning(f"⚠️ Failed {model_name}: {str(e)[:100]}")
+             continue
+
+     logger.error("⚠️ All models failed - will use fallback generation")
+     return None
+
+ def initialize_embeddings():
+     """Initialize sentence transformer embeddings"""
+     logger.info("🔄 Initializing embeddings model...")
+
+     embeddings = HuggingFaceEmbeddings(
+         model_name=CONFIG["embedding_model"],
+         model_kwargs={'device': 'cpu'},
+         encode_kwargs={'normalize_embeddings': True}
+     )
+
+     logger.info(f"✅ Embeddings initialized: {CONFIG['embedding_model']}")
+     return embeddings
+
+ def load_vector_store(embeddings):
+     """Load FAISS vector store"""
+     logger.info("🔄 Loading FAISS vector store...")
+
+     vector_store_path = CONFIG["vector_store_path"]
+
+     # Check for required FAISS files
+     index_file = os.path.join(vector_store_path, "index.faiss")
+     pkl_file = os.path.join(vector_store_path, "index.pkl")
+
+     if not os.path.exists(index_file):
+         logger.error(f"❌ index.faiss not found at {index_file}")
+         raise FileNotFoundError(f"FAISS index file not found: {index_file}")
+
+     if not os.path.exists(pkl_file):
+         logger.error(f"❌ index.pkl not found at {pkl_file}")
+         raise FileNotFoundError(f"FAISS metadata file not found: {pkl_file}")
+
+     logger.info(f"✅ Found index.faiss ({os.path.getsize(index_file)/1024/1024:.2f} MB)")
+     logger.info(f"✅ Found index.pkl ({os.path.getsize(pkl_file)/1024:.2f} KB)")
+
+     vectorstore = FAISS.load_local(
+         vector_store_path,
+         embeddings,
+         allow_dangerous_deserialization=True
+     )
+
+     logger.info(f"✅ FAISS vector store loaded successfully")
+     return vectorstore
+
+ # ============================================================================
+ # RAG PIPELINE FUNCTIONS
+ # ============================================================================
+
+ def retrieve_knowledge_langchain(
+     query: str,
+     vectorstore,
+     top_k: int = 15
+ ) -> Tuple[List[Document], float]:
+     """
+     Retrieve relevant documents using LangChain FAISS with query expansion
+     """
+     logger.info(f"🔍 Retrieving knowledge for: '{query}'")
+
+     # Create query variants for better coverage
+     query_variants = [
+         query, # Original
+         f"fashion advice clothing outfit style for {query}", # Semantic expansion
+     ]
+
+     all_docs = []
+
+     # Retrieve for each variant
+     for variant in query_variants:
+         try:
+             docs_and_scores = vectorstore.similarity_search_with_score(variant, k=top_k)
+
+             for doc, score in docs_and_scores:
+                 similarity = 1.0 / (1.0 + score)
+                 doc.metadata['similarity'] = similarity
+                 doc.metadata['query_variant'] = variant
+                 all_docs.append(doc)
+
+         except Exception as e:
+             logger.error(f"Retrieval error for variant '{variant}': {e}")
+
+     # Deduplicate by content
+     unique_docs = {}
+     for doc in all_docs:
+         content_key = doc.page_content[:100]
+         if content_key not in unique_docs:
+             unique_docs[content_key] = doc
+         else:
+             # Keep document with higher similarity
+             if doc.metadata.get('similarity', 0) > unique_docs[content_key].metadata.get('similarity', 0):
+                 unique_docs[content_key] = doc
+
+     final_docs = list(unique_docs.values())
+
+     # Sort by similarity
+     final_docs.sort(key=lambda x: x.metadata.get('similarity', 0), reverse=True)
+
+     # Calculate confidence
+     if final_docs:
+         avg_similarity = sum(d.metadata.get('similarity', 0) for d in final_docs) / len(final_docs)
+         confidence = min(avg_similarity, 1.0)
+     else:
+         confidence = 0.0
+
+     logger.info(f"✅ Retrieved {len(final_docs)} unique documents (confidence: {confidence:.2f})")
+
+     return final_docs, confidence
+
+ def generate_llm_answer(
+     query: str,
+     retrieved_docs: List[Document],
+     llm_client,
+     attempt: int = 1
+ ) -> Optional[str]:
+     """
+     Generate answer using local LLM with retrieved context
+     """
+     if not llm_client:
+         logger.error(" → LLM client not initialized")
+         return None
+
+     # Build focused context
+     query_lower = query.lower()
+     query_words = set(query_lower.split())
+
+     # Score documents by relevance
+     scored_docs = []
+     for doc in retrieved_docs[:20]:
+         content = doc.page_content.lower()
+         doc_words = set(content.split())
+         overlap = len(query_words.intersection(doc_words))
+
+         # Boost for verified/curated
+         if doc.metadata.get('verified', False):
+             overlap += 10
+
+         # Boost for longer content
+         if len(doc.page_content) > 200:
+             overlap += 3
+
+         scored_docs.append((doc, overlap))
+
+     # Sort and take top 8
+     scored_docs.sort(key=lambda x: x[1], reverse=True)
+     top_docs = [doc[0] for doc in scored_docs[:8]]
+
+     # Build context
+     context_parts = []
+     for doc in top_docs:
+         content = doc.page_content.strip()
+         if len(content) > 400:
+             content = content[:400] + "..."
+         context_parts.append(content)
+
+     context_text = "\n\n".join(context_parts)
+
+     # Progressive parameters based on attempt
+     if attempt == 1:
+         temperature = 0.75
+         max_tokens = 350
+         top_p = 0.92
+         repetition_penalty = 1.1
+     elif attempt == 2:
+         temperature = 0.85
+         max_tokens = 450
+         top_p = 0.94
+         repetition_penalty = 1.15
+     elif attempt == 3:
+         temperature = 0.92
+         max_tokens = 550
+         top_p = 0.96
+         repetition_penalty = 1.2
+     else:
+         temperature = 1.0
+         max_tokens = 600
+         top_p = 0.97
+         repetition_penalty = 1.25
+
+     # Create prompt
+     user_prompt = f"""[INST] Question: {query}
+
+ Fashion Knowledge:
+ {context_text}
+
+ Answer the question using the knowledge above. Be specific and helpful (100-250 words). [/INST]"""
+
+     try:
+         logger.info(f" → Calling {CONFIG['llm_model']} (temp={temperature}, tokens={max_tokens})...")
+
+         # Call pipeline
+         output = llm_client(
+             user_prompt,
+             max_new_tokens=max_tokens,
+             temperature=temperature,
+             top_p=top_p,
+             repetition_penalty=repetition_penalty,
+             do_sample=True,
+             return_full_text=False,
+             pad_token_id=llm_client.tokenizer.eos_token_id
+         )
+
+         # Extract generated text
+         response = output[0]['generated_text'].strip()
+
+         if not response:
+             logger.warning(f" ✗ Empty response (attempt {attempt})")
+             return None
+
+         # Minimal validation
+         if len(response) < 20:
+             logger.warning(f" ✗ Response too short: {len(response)} chars")
+             return None
+
+         # Check for apologies/refusals
+         apology_phrases = ["i cannot", "i can't", "i'm sorry", "i apologize", "i don't have"]
+         if any(phrase in response.lower()[:100] for phrase in apology_phrases):
+             logger.warning(f" ✗ Apology detected")
+             return None
+
+         logger.info(f" ✅ Generated answer ({len(response)} chars)")
+         return response
+
+     except Exception as e:
+         logger.error(f" ✗ Generation error: {e}")
+         return None
+
+ def synthesize_direct_answer(
+     query: str,
+     retrieved_docs: List[Document]
+ ) -> str:
+     """
+     Fallback: Synthesize answer directly from most relevant documents
+     """
+     logger.info(" → Using fallback: direct synthesis")
+
+     if not retrieved_docs:
+         return "I don't have enough information to answer that question accurately."
+
+     # Get most relevant document
+     best_doc = retrieved_docs[0]
+     content = best_doc.page_content.strip()
+
+     # Create answer from top document
+     if len(content) > 500:
+         answer = content[:500] + "..."
+     else:
+         answer = content
+
+     return answer
+
+ def generate_answer_langchain(
+     query: str,
+     vectorstore,
+     llm_client
+ ) -> str:
+     """
+     Main RAG pipeline: Retrieve → Generate → Fallback
+     """
+     logger.info(f"\n{'='*80}")
+     logger.info(f"Processing query: '{query}'")
+     logger.info(f"{'='*80}")
+
+     # Step 1: Retrieve documents
+     retrieved_docs, confidence = retrieve_knowledge_langchain(
+         query,
+         vectorstore,
+         top_k=CONFIG["top_k"]
+     )
+
+     if not retrieved_docs:
+         return "I couldn't find relevant information to answer your question."
+
+     # Step 2: Try LLM generation (4 attempts)
+     llm_answer = None
+     for attempt in range(1, 5):
+         logger.info(f"\n 🤖 LLM Generation Attempt {attempt}/4")
+         llm_answer = generate_llm_answer(query, retrieved_docs, llm_client, attempt)
+
+         if llm_answer:
+             logger.info(f" ✅ LLM answer generated successfully")
+             break
+         else:
+             logger.warning(f" → Attempt {attempt}/4 failed, retrying...")
+
+     # Step 3: Fallback if all attempts fail
+     if not llm_answer:
+         logger.error(f" ✗ All 4 LLM attempts failed - using fallback")
+         llm_answer = synthesize_direct_answer(query, retrieved_docs)
+
+     return llm_answer
+
+ # ============================================================================
+ # GRADIO INTERFACE
+ # ============================================================================
+
+ def fashion_chatbot(message: str, history: List[List[str]]) -> str:
+     """
+     Chatbot function for Gradio interface
+     """
+     try:
+         if not message or not message.strip():
+             return "Please ask a fashion-related question!"
+
+         # Generate answer using RAG pipeline
+         answer = generate_answer_langchain(
+             message.strip(),
+             vectorstore,
+             llm_client
+         )
+
+         return answer
+
+     except Exception as e:
+         logger.error(f"Error in chatbot: {e}")
+         return f"Sorry, I encountered an error: {str(e)}"
+
+ # ============================================================================
+ # INITIALIZE AND LAUNCH
+ # ============================================================================
+
+ # Global variables
+ llm_client = None
+ embeddings = None
+ vectorstore = None
+
+ def startup():
+     """Initialize all models and load vector store"""
+     global llm_client, embeddings, vectorstore
+
+     logger.info("🚀 Starting Fashion Advisor RAG...")
+
+     # Initialize embeddings
+     embeddings = initialize_embeddings()
+
+     # Load vector store
+     vectorstore = load_vector_store(embeddings)
+
+     # Initialize LLM
+     llm_client = initialize_llm()
+
+     logger.info("✅ All components initialized successfully!")
+
+ # Initialize on startup
+ startup()
+
+ # Create Gradio interface
+ demo = gr.ChatInterface(
+     fn=fashion_chatbot,
+     title="👗 Fashion Advisor - RAG System",
+     description="""
+     **Ask me anything about fashion!** 🌟
+
+     I can help with:
+     - Outfit recommendations for occasions
+     - Color combinations and styling
+     - Seasonal fashion advice
+     - Body type and fit guidance
+     - Wardrobe essentials
+
+     *Powered by RAG with FAISS vector search and local LLM*
+     """,
+     examples=[
+         "What should I wear to a business meeting?",
+         "What colors go well with navy blue?",
+         "What are essential wardrobe items for fall?",
+         "How to dress for a summer wedding?",
+         "What's the best outfit for a university presentation?",
+     ],
+     theme=gr.themes.Soft(),
+     retry_btn=None,
+     undo_btn="Delete Previous",
+     clear_btn="Clear Chat",
+ )
+
+ # Launch
+ if __name__ == "__main__":
+     demo.launch()
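The updated loader expects index.faiss and index.pkl to sit at the Space root, next to app.py (vector_store_path is now "."). Below is a minimal sketch, not part of this commit, of how a compatible store could be built ahead of time with the same embedding model; the script name and sample documents are placeholder assumptions. LangChain's FAISS.save_local writes exactly those two files by default.

# build_vectorstore.py - hypothetical helper, not part of this commit.
# Builds a FAISS store whose files (index.faiss, index.pkl) land in the repo root,
# matching vector_store_path = "." in the updated CONFIG.
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

# Placeholder documents; the real knowledge base would come from the project's data.
docs = [
    Document(page_content="Navy blue pairs well with white, camel, and soft grey."),
    Document(page_content="For business meetings, a tailored blazer with neutral trousers is a safe choice."),
]

vectorstore = FAISS.from_documents(docs, embeddings)
vectorstore.save_local(".")  # writes ./index.faiss and ./index.pkl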