pradeepsengarr commited on
Commit
a2bdf76
Β·
verified Β·
1 Parent(s): 345418e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +729 -230
app.py CHANGED
@@ -2,313 +2,812 @@ import gradio as gr
2
  import requests
3
  import os
4
  import tempfile
 
 
5
  from PyPDF2 import PdfReader
6
  from sentence_transformers import SentenceTransformer
7
  import numpy as np
8
  from sklearn.metrics.pairwise import cosine_similarity
 
 
 
 
 
 
 
9
 
10
- TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY")
11
- SERPER_API_KEY = os.environ.get("SERPER_API_KEY")
12
- model = SentenceTransformer("all-MiniLM-L6-v2")
13
- doc_chunks = []
14
- doc_embeddings = []
15
 
16
- # --- Extract text from PDF ---
17
- def extract_pdf_text(file_obj):
18
- reader = PdfReader(file_obj)
19
- return "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- # --- Break into small chunks ---
22
- def split_into_chunks(text, chunk_size=300):
23
- words = text.split()
24
- return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
- # --- Embed all chunks and cache ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  def process_uploaded_file(file):
28
- global doc_chunks, doc_embeddings
29
  if file is None:
30
- return "⚠️ No file selected", gr.update(visible=False)
31
 
32
  try:
33
- text = extract_pdf_text(file)
34
- doc_chunks = split_into_chunks(text)
35
- doc_embeddings = model.encode(doc_chunks)
36
- status = f"βœ… Successfully processed {len(doc_chunks)} chunks from your document!"
37
- return status, gr.update(visible=True, value=f"πŸ“„ Document loaded: {len(doc_chunks)} chunks ready")
 
 
 
 
 
 
 
 
 
38
  except Exception as e:
39
- return f"❌ Error processing file: {str(e)}", gr.update(visible=False)
40
-
41
- # --- RAG from file ---
42
- def retrieve_relevant_chunks(query):
43
- query_emb = model.encode([query])
44
- sims = cosine_similarity(query_emb, doc_embeddings)[0]
45
- top_indices = np.argsort(sims)[::-1][:3]
46
- return "\n\n".join([doc_chunks[i] for i in top_indices])
47
-
48
- # --- Together LLM call ---
49
- def call_together_llm(context, question):
50
- url = "https://api.together.xyz/v1/chat/completions"
51
- headers = {
52
- "Authorization": f"Bearer {TOGETHER_API_KEY}",
53
- "Content-Type": "application/json"
54
- }
55
- messages = [
56
- {"role": "system", "content": "You are a helpful assistant answering from the given context. Provide detailed, accurate responses based on the context provided."},
57
- {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"}
58
- ]
59
- data = {
60
- "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
61
- "messages": messages,
62
- "temperature": 0.7,
63
- "max_tokens": 512
64
- }
65
- response = requests.post(url, headers=headers, json=data)
66
- return response.json()["choices"][0]["message"]["content"]
67
-
68
- # --- Web search via Serper ---
69
- def web_search(query):
70
- url = "https://google.serper.dev/search"
71
- headers = {"X-API-KEY": SERPER_API_KEY}
72
- payload = {"q": query}
73
- response = requests.post(url, json=payload, headers=headers)
74
- data = response.json()
75
- results = data.get("organic", [])
76
- return "\n".join([f"{r['title']} - {r['link']}\n{r['snippet']}" for r in results[:3]])
77
-
78
- # --- Main Chat Logic ---
79
- def answer_question(question, source, history):
80
  if not question.strip():
81
  return history, ""
82
 
 
 
 
83
  try:
84
- # Add user question to history
85
- history = history + [[question, None]]
86
-
87
  if source == "🌐 Web Search":
88
- context = web_search(question)
89
- source_info = "🌐 **Source:** Web Search"
90
- elif source == "πŸ“„ Uploaded File":
91
- if not doc_chunks:
 
 
 
92
  answer = "❌ Please upload a PDF document first to use this feature."
93
  history[-1][1] = answer
94
  return history, ""
95
- context = retrieve_relevant_chunks(question)
96
- source_info = "πŸ“„ **Source:** Uploaded Document"
 
 
 
 
97
  else:
98
  answer = "❌ Please select a valid knowledge source."
99
  history[-1][1] = answer
100
  return history, ""
101
 
102
- # Get answer from LLM
103
- answer = call_together_llm(context, question)
104
- formatted_answer = f"{source_info}\n\n{answer}"
 
105
 
106
- # Update history with answer
107
- history[-1][1] = formatted_answer
108
 
 
 
 
 
 
 
 
 
 
 
 
109
  return history, ""
110
 
111
  except Exception as e:
112
- error_msg = f"❌ **Error:** {str(e)}\n\nPlease check your API keys and try again."
 
 
 
 
 
 
 
 
113
  history[-1][1] = error_msg
114
  return history, ""
115
 
116
- # --- Clear chat history ---
117
  def clear_chat():
 
118
  return []
119
 
120
- # --- Custom CSS ---
121
- custom_css = """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  .gradio-container {
123
- max-width: 1200px !important;
124
  margin: auto !important;
 
125
  }
126
 
127
- .header-text {
 
 
 
 
 
 
 
 
 
 
 
 
128
  text-align: center;
129
- background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
130
- -webkit-background-clip: text;
131
- -webkit-text-fill-color: transparent;
132
- font-size: 2.5em;
133
- font-weight: bold;
134
- margin-bottom: 10px;
135
  }
136
 
137
- .subtitle-text {
 
 
138
  text-align: center;
139
- color: #666;
140
- font-size: 1.2em;
141
- margin-bottom: 30px;
142
  }
143
 
144
- .source-radio .wrap {
145
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
 
146
  border-radius: 15px;
147
- padding: 15px;
148
- margin: 10px 0;
 
 
149
  }
150
 
151
- .source-radio label {
152
- color: white !important;
153
- font-weight: 600;
 
 
 
 
 
 
 
 
 
 
 
 
154
  }
155
 
156
- .upload-area {
157
- border: 2px dashed #667eea;
 
 
 
 
 
 
 
 
158
  border-radius: 15px;
159
- padding: 20px;
160
  text-align: center;
161
- background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
162
  transition: all 0.3s ease;
 
163
  }
164
 
165
- .upload-area:hover {
166
- border-color: #764ba2;
167
- transform: translateY(-2px);
 
168
  }
169
 
170
- .chat-container {
171
- border-radius: 15px;
172
- box-shadow: 0 8px 32px rgba(0, 0, 0, 0.1);
173
- background: white;
174
- padding: 20px;
175
- margin: 20px 0;
 
 
176
  }
177
 
178
- .status-box {
179
- background: linear-gradient(135deg, #84fab0 0%, #8fd3f4 100%);
180
- border-radius: 10px;
181
- padding: 15px;
182
- margin: 10px 0;
183
  border: none;
 
 
184
  color: #2d3748;
185
  font-weight: 500;
186
  }
187
 
188
- .footer-text {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  text-align: center;
190
- color: #888;
191
- font-size: 0.9em;
192
- margin-top: 30px;
193
- padding: 20px;
194
- border-top: 1px solid #eee;
195
  }
196
- """
197
 
198
- # --- Enhanced Gradio UI ---
199
- with gr.Blocks(css=custom_css, theme=gr.themes.Soft(), title="πŸ€– RAG Chatbot") as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
- # Header
202
- gr.HTML("""
203
- <div class="header-text">πŸ€– Intelligent RAG Chatbot</div>
204
- <div class="subtitle-text">Ask questions from web or upload your documents for AI-powered answers</div>
205
- """)
206
 
207
- with gr.Row():
208
- with gr.Column(scale=1):
209
- # Knowledge Source Selection
210
- gr.Markdown("### 🎯 **Choose Your Knowledge Source**")
211
- source_choice = gr.Radio(
212
- ["🌐 Web Search", "πŸ“„ Uploaded File"],
213
- label="Select Knowledge Source",
214
- value="🌐 Web Search",
215
- elem_classes=["source-radio"]
216
- )
217
-
218
- # File Upload Section
219
- gr.Markdown("### πŸ“ **Document Upload**")
220
- file_input = gr.File(
221
- label="Upload PDF Document",
222
- file_types=[".pdf"],
223
- elem_classes=["upload-area"]
224
- )
225
-
226
- file_status = gr.Textbox(
227
- label="πŸ“Š Processing Status",
228
- interactive=False,
229
- elem_classes=["status-box"]
230
- )
231
-
232
- document_info = gr.Textbox(
233
- label="πŸ“„ Document Info",
234
- visible=False,
235
- interactive=False,
236
- elem_classes=["status-box"]
237
- )
238
-
239
- with gr.Column(scale=2):
240
- # Chat Interface
241
- gr.Markdown("### πŸ’¬ **Chat Interface**")
242
-
243
- chatbot = gr.Chatbot(
244
- label="Conversation",
245
- height=500,
246
- elem_classes=["chat-container"],
247
- bubble_full_width=False,
248
- show_label=False
249
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
- with gr.Row():
252
- question_input = gr.Textbox(
253
- label="Ask your question",
254
- placeholder="Type your question here... (Press Enter to send)",
255
- lines=2,
256
- scale=4
 
 
 
 
 
257
  )
258
 
259
- with gr.Column(scale=1, min_width=100):
260
- send_btn = gr.Button("πŸš€ Send", variant="primary", size="lg")
261
- clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary", size="lg")
262
-
263
- # Advanced Settings (Collapsible)
264
- with gr.Accordion("βš™οΈ Advanced Settings", open=False):
265
- gr.Markdown("""
266
- - 🌐 **Web Search**: Get real-time information from the internet
267
- - πŸ“„ **Document Upload**: Upload PDF files and ask questions about their content
268
- - πŸ€– **AI-Powered**: Uses Mixtral-8x7B model for intelligent responses
269
- - πŸ” **Semantic Search**: Advanced embedding-based document retrieval
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
- # Footer
273
- gr.HTML("""
274
- <div class="footer-text">
275
- πŸš€ Powered by Together AI & Serper API |
276
- πŸ“š Built with Sentence Transformers & Gradio |
277
- πŸ’‘ Enhanced RAG System
278
- </div>
279
- """)
280
-
281
- # Event Handlers
282
- file_input.change(
283
- fn=process_uploaded_file,
284
- inputs=file_input,
285
- outputs=[file_status, document_info]
286
- )
287
-
288
- # Send message on button click or Enter key
289
- question_input.submit(
290
- fn=answer_question,
291
- inputs=[question_input, source_choice, chatbot],
292
- outputs=[chatbot, question_input]
293
- )
294
-
295
- send_btn.click(
296
- fn=answer_question,
297
- inputs=[question_input, source_choice, chatbot],
298
- outputs=[chatbot, question_input]
299
- )
300
-
301
- clear_btn.click(
302
- fn=clear_chat,
303
- inputs=[],
304
- outputs=[chatbot]
305
- )
306
 
307
- # Launch the app
308
  if __name__ == "__main__":
 
309
  demo.launch(
310
  share=True,
311
- server_name="0.0.0.0",
312
- server_port=7860,
313
- show_error=True
314
- )
 
2
  import requests
3
  import os
4
  import tempfile
5
+ import asyncio
6
+ import aiohttp
7
  from PyPDF2 import PdfReader
8
  from sentence_transformers import SentenceTransformer
9
  import numpy as np
10
  from sklearn.metrics.pairwise import cosine_similarity
11
+ import logging
12
+ from typing import List, Dict, Tuple, Optional
13
+ import json
14
+ from datetime import datetime
15
+ import hashlib
16
+ import pickle
17
+ from pathlib import Path
18
 
19
+ # Configure logging
20
+ logging.basicConfig(level=logging.INFO)
21
+ logger = logging.getLogger(__name__)
 
 
22
 
23
+ # Configuration
24
+ class Config:
25
+ TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY")
26
+ SERPER_API_KEY = os.environ.get("SERPER_API_KEY")
27
+ MODEL_NAME = "all-MiniLM-L6-v2"
28
+ CHUNK_SIZE = 400
29
+ CHUNK_OVERLAP = 50
30
+ MAX_TOKENS = 1024
31
+ TEMPERATURE = 0.7
32
+ TOP_K_CHUNKS = 5
33
+ CACHE_DIR = Path("./cache")
34
+
35
+ def __init__(self):
36
+ self.CACHE_DIR.mkdir(exist_ok=True)
37
+
38
+ config = Config()
39
+
40
+ class DocumentProcessor:
41
+ """Advanced document processing with caching and optimization"""
42
+
43
+ def __init__(self):
44
+ self.model = SentenceTransformer(config.MODEL_NAME)
45
+ self.doc_chunks = []
46
+ self.doc_embeddings = []
47
+ self.document_metadata = {}
48
+
49
+ def extract_text_from_pdf(self, file_obj) -> str:
50
+ """Extract text from PDF with error handling"""
51
+ try:
52
+ reader = PdfReader(file_obj)
53
+ text_parts = []
54
+
55
+ for page_num, page in enumerate(reader.pages):
56
+ page_text = page.extract_text()
57
+ if page_text.strip():
58
+ text_parts.append(f"[Page {page_num + 1}] {page_text}")
59
+
60
+ full_text = "\n".join(text_parts)
61
+ logger.info(f"Extracted {len(full_text)} characters from PDF")
62
+ return full_text
63
+
64
+ except Exception as e:
65
+ logger.error(f"PDF extraction error: {str(e)}")
66
+ raise ValueError(f"Failed to process PDF: {str(e)}")
67
+
68
+ def create_intelligent_chunks(self, text: str) -> List[str]:
69
+ """Create overlapping chunks with sentence boundary awareness"""
70
+ sentences = text.split('. ')
71
+ chunks = []
72
+ current_chunk = ""
73
+
74
+ for sentence in sentences:
75
+ test_chunk = current_chunk + sentence + ". "
76
+
77
+ if len(test_chunk.split()) <= config.CHUNK_SIZE:
78
+ current_chunk = test_chunk
79
+ else:
80
+ if current_chunk:
81
+ chunks.append(current_chunk.strip())
82
+ current_chunk = sentence + ". "
83
+
84
+ if current_chunk:
85
+ chunks.append(current_chunk.strip())
86
+
87
+ # Add overlap between chunks
88
+ overlapped_chunks = []
89
+ for i, chunk in enumerate(chunks):
90
+ overlapped_chunks.append(chunk)
91
+
92
+ # Add overlapping chunk if not the last one
93
+ if i < len(chunks) - 1:
94
+ overlap_words = chunk.split()[-config.CHUNK_OVERLAP:]
95
+ next_words = chunks[i + 1].split()[:config.CHUNK_OVERLAP]
96
+ overlap_chunk = " ".join(overlap_words + next_words)
97
+ overlapped_chunks.append(overlap_chunk)
98
+
99
+ return overlapped_chunks
100
+
101
+ def generate_document_hash(self, file_obj) -> str:
102
+ """Generate hash for document caching"""
103
+ file_obj.seek(0)
104
+ content = file_obj.read()
105
+ file_obj.seek(0)
106
+ return hashlib.md5(content).hexdigest()
107
+
108
+ def load_cached_embeddings(self, doc_hash: str) -> Optional[Tuple[List[str], np.ndarray]]:
109
+ """Load cached embeddings if available"""
110
+ cache_file = config.CACHE_DIR / f"{doc_hash}.pkl"
111
+ if cache_file.exists():
112
+ try:
113
+ with open(cache_file, 'rb') as f:
114
+ return pickle.load(f)
115
+ except Exception as e:
116
+ logger.warning(f"Failed to load cache: {e}")
117
+ return None
118
+
119
+ def save_embeddings_to_cache(self, doc_hash: str, chunks: List[str], embeddings: np.ndarray):
120
+ """Save embeddings to cache"""
121
+ cache_file = config.CACHE_DIR / f"{doc_hash}.pkl"
122
+ try:
123
+ with open(cache_file, 'wb') as f:
124
+ pickle.dump((chunks, embeddings), f)
125
+ except Exception as e:
126
+ logger.warning(f"Failed to save cache: {e}")
127
+
128
+ def process_document(self, file_obj) -> Tuple[str, bool]:
129
+ """Process uploaded document with caching"""
130
+ try:
131
+ doc_hash = self.generate_document_hash(file_obj)
132
+
133
+ # Try to load from cache first
134
+ cached_data = self.load_cached_embeddings(doc_hash)
135
+ if cached_data:
136
+ self.doc_chunks, self.doc_embeddings = cached_data
137
+ logger.info(f"Loaded {len(self.doc_chunks)} chunks from cache")
138
+ return f"βœ… Successfully loaded {len(self.doc_chunks)} chunks from cache!", True
139
+
140
+ # Process document
141
+ text = self.extract_text_from_pdf(file_obj)
142
+ self.doc_chunks = self.create_intelligent_chunks(text)
143
+
144
+ # Generate embeddings
145
+ logger.info("Generating embeddings...")
146
+ self.doc_embeddings = self.model.encode(
147
+ self.doc_chunks,
148
+ batch_size=32,
149
+ show_progress_bar=True,
150
+ convert_to_numpy=True
151
+ )
152
+
153
+ # Save to cache
154
+ self.save_embeddings_to_cache(doc_hash, self.doc_chunks, self.doc_embeddings)
155
+
156
+ # Store metadata
157
+ self.document_metadata = {
158
+ 'hash': doc_hash,
159
+ 'chunks_count': len(self.doc_chunks),
160
+ 'processed_at': datetime.now().isoformat(),
161
+ 'total_characters': len(text)
162
+ }
163
+
164
+ return f"βœ… Successfully processed {len(self.doc_chunks)} chunks from your document!", True
165
+
166
+ except Exception as e:
167
+ logger.error(f"Document processing error: {str(e)}")
168
+ return f"❌ Error processing document: {str(e)}", False
169
+
170
+ def retrieve_relevant_chunks(self, query: str, top_k: int = None) -> Tuple[str, List[float]]:
171
+ """Retrieve most relevant chunks with similarity scores"""
172
+ if not self.doc_chunks:
173
+ return "", []
174
+
175
+ top_k = top_k or config.TOP_K_CHUNKS
176
+ query_embedding = self.model.encode([query])
177
+
178
+ similarities = cosine_similarity(query_embedding, self.doc_embeddings)[0]
179
+ top_indices = np.argsort(similarities)[::-1][:top_k]
180
+
181
+ relevant_chunks = []
182
+ scores = []
183
+
184
+ for idx in top_indices:
185
+ if similarities[idx] > 0.1: # Minimum similarity threshold
186
+ relevant_chunks.append(self.doc_chunks[idx])
187
+ scores.append(similarities[idx])
188
+
189
+ context = "\n\n---\n\n".join(relevant_chunks)
190
+ return context, scores
191
 
192
+ class LLMService:
193
+ """Enhanced LLM service with multiple providers and error handling"""
194
+
195
+ @staticmethod
196
+ async def call_together_ai_async(context: str, question: str, system_prompt: str = None) -> str:
197
+ """Async call to Together AI API"""
198
+ url = "https://api.together.xyz/v1/chat/completions"
199
+ headers = {
200
+ "Authorization": f"Bearer {config.TOGETHER_API_KEY}",
201
+ "Content-Type": "application/json"
202
+ }
203
+
204
+ system_msg = system_prompt or """You are an intelligent AI assistant specializing in document analysis and web research.
205
+ Provide comprehensive, accurate, and well-structured responses based on the given context.
206
+ Use bullet points, numbered lists, and clear formatting when appropriate.
207
+ If the context doesn't contain enough information, clearly state what's missing."""
208
+
209
+ messages = [
210
+ {"role": "system", "content": system_msg},
211
+ {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}\n\nPlease provide a detailed and helpful response."}
212
+ ]
213
+
214
+ data = {
215
+ "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
216
+ "messages": messages,
217
+ "temperature": config.TEMPERATURE,
218
+ "max_tokens": config.MAX_TOKENS,
219
+ "top_p": 0.9,
220
+ "repetition_penalty": 1.1
221
+ }
222
+
223
+ async with aiohttp.ClientSession() as session:
224
+ async with session.post(url, headers=headers, json=data) as response:
225
+ if response.status == 200:
226
+ result = await response.json()
227
+ return result["choices"][0]["message"]["content"]
228
+ else:
229
+ raise Exception(f"API call failed with status {response.status}")
230
+
231
+ @staticmethod
232
+ def call_together_ai_sync(context: str, question: str, system_prompt: str = None) -> str:
233
+ """Synchronous wrapper for Together AI API"""
234
+ try:
235
+ loop = asyncio.new_event_loop()
236
+ asyncio.set_event_loop(loop)
237
+ return loop.run_until_complete(
238
+ LLMService.call_together_ai_async(context, question, system_prompt)
239
+ )
240
+ except Exception as e:
241
+ logger.error(f"LLM API error: {str(e)}")
242
+ return f"❌ Sorry, I encountered an error while generating the response: {str(e)}"
243
 
244
+ class WebSearchService:
245
+ """Enhanced web search with multiple sources and caching"""
246
+
247
+ @staticmethod
248
+ def search_web(query: str, num_results: int = 5) -> str:
249
+ """Enhanced web search with better formatting"""
250
+ try:
251
+ url = "https://google.serper.dev/search"
252
+ headers = {"X-API-KEY": config.SERPER_API_KEY}
253
+ payload = {
254
+ "q": query,
255
+ "num": num_results,
256
+ "type": "search"
257
+ }
258
+
259
+ response = requests.post(url, json=payload, headers=headers, timeout=10)
260
+ response.raise_for_status()
261
+
262
+ data = response.json()
263
+ results = data.get("organic", [])
264
+
265
+ if not results:
266
+ return "No search results found for your query."
267
+
268
+ formatted_results = []
269
+ for i, result in enumerate(results[:num_results], 1):
270
+ title = result.get('title', 'No title')
271
+ link = result.get('link', '')
272
+ snippet = result.get('snippet', 'No description available')
273
+
274
+ formatted_results.append(f"""
275
+ **Result {i}: {title}**
276
+ URL: {link}
277
+ Summary: {snippet}
278
+ """)
279
+
280
+ return "\n".join(formatted_results)
281
+
282
+ except Exception as e:
283
+ logger.error(f"Web search error: {str(e)}")
284
+ return f"❌ Search failed: {str(e)}"
285
+
286
+ # Global instances
287
+ doc_processor = DocumentProcessor()
288
+ llm_service = LLMService()
289
+ search_service = WebSearchService()
290
+
291
+ # Enhanced UI Functions
292
  def process_uploaded_file(file):
293
+ """Process uploaded file with enhanced feedback"""
294
  if file is None:
295
+ return "⚠️ No file selected", gr.update(visible=False), gr.update(visible=False)
296
 
297
  try:
298
+ status, success = doc_processor.process_document(file)
299
+
300
+ if success:
301
+ metadata = doc_processor.document_metadata
302
+ info_text = f"""πŸ“„ **Document Successfully Loaded**
303
+ πŸ“Š Chunks: {metadata.get('chunks_count', 'N/A')}
304
+ πŸ“ Characters: {metadata.get('total_characters', 'N/A'):,}
305
+ ⏰ Processed: {metadata.get('processed_at', 'N/A')[:19]}
306
+ πŸ” Ready for questions!"""
307
+
308
+ return status, gr.update(visible=True, value=info_text), gr.update(visible=True)
309
+ else:
310
+ return status, gr.update(visible=False), gr.update(visible=False)
311
+
312
  except Exception as e:
313
+ error_msg = f"❌ Processing Error: {str(e)}"
314
+ return error_msg, gr.update(visible=False), gr.update(visible=False)
315
+
316
+ def answer_question(question: str, source: str, history: List[List[str]], use_advanced: bool = False):
317
+ """Enhanced question answering with better context and formatting"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  if not question.strip():
319
  return history, ""
320
 
321
+ # Add user question to history
322
+ history = history + [[question, None]]
323
+
324
  try:
 
 
 
325
  if source == "🌐 Web Search":
326
+ context = search_service.search_web(question, num_results=5)
327
+ source_info = "🌐 **Source:** Real-time Web Search"
328
+ system_prompt = """You are a web research assistant. Analyze the search results and provide a comprehensive answer.
329
+ Cite specific sources when possible and organize information clearly."""
330
+
331
+ elif source == "πŸ“„ Uploaded Document":
332
+ if not doc_processor.doc_chunks:
333
  answer = "❌ Please upload a PDF document first to use this feature."
334
  history[-1][1] = answer
335
  return history, ""
336
+
337
+ context, similarity_scores = doc_processor.retrieve_relevant_chunks(question)
338
+ source_info = f"πŸ“„ **Source:** Uploaded Document ({len(similarity_scores)} relevant sections found)"
339
+ system_prompt = """You are a document analysis assistant. Based on the provided document excerpts,
340
+ give a detailed and accurate answer. If information is incomplete, clearly state what's missing."""
341
+
342
  else:
343
  answer = "❌ Please select a valid knowledge source."
344
  history[-1][1] = answer
345
  return history, ""
346
 
347
+ if not context.strip():
348
+ answer = "❌ No relevant information found for your question."
349
+ history[-1][1] = answer
350
+ return history, ""
351
 
352
+ # Generate response using LLM
353
+ llm_response = llm_service.call_together_ai_sync(context, question, system_prompt)
354
 
355
+ # Format final answer
356
+ timestamp = datetime.now().strftime("%H:%M:%S")
357
+ formatted_answer = f"""{source_info}
358
+ ⏰ **Generated at:** {timestamp}
359
+
360
+ {llm_response}
361
+
362
+ ---
363
+ πŸ’‘ *Tip: Try asking follow-up questions for more details!*"""
364
+
365
+ history[-1][1] = formatted_answer
366
  return history, ""
367
 
368
  except Exception as e:
369
+ error_msg = f"""❌ **Error Occurred**
370
+ πŸ” **Details:** {str(e)}
371
+ πŸ’‘ **Suggestion:** Please check your API keys and try again.
372
+
373
+ If the problem persists, try:
374
+ - Rephrasing your question
375
+ - Checking your internet connection
376
+ - Ensuring API keys are properly configured"""
377
+
378
  history[-1][1] = error_msg
379
  return history, ""
380
 
 
381
  def clear_chat():
382
+ """Clear chat history"""
383
  return []
384
 
385
+ def get_sample_questions(source):
386
+ """Provide sample questions based on source"""
387
+ if source == "🌐 Web Search":
388
+ return [
389
+ "What are the latest developments in AI technology?",
390
+ "Current weather in major cities",
391
+ "Recent news about renewable energy",
392
+ "What's trending in technology today?"
393
+ ]
394
+ else:
395
+ return [
396
+ "What is the main topic of this document?",
397
+ "Summarize the key points",
398
+ "What are the conclusions?",
399
+ "Explain the methodology used"
400
+ ]
401
+
402
+ # Enhanced CSS with modern design
403
+ enhanced_css = """
404
+ /* Global Styles */
405
  .gradio-container {
406
+ max-width: 1400px !important;
407
  margin: auto !important;
408
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
409
  }
410
 
411
+ /* Header Styles */
412
+ .main-header {
413
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
414
+ padding: 2rem;
415
+ border-radius: 20px;
416
+ margin-bottom: 2rem;
417
+ box-shadow: 0 10px 30px rgba(102, 126, 234, 0.3);
418
+ }
419
+
420
+ .header-title {
421
+ color: white;
422
+ font-size: 3rem;
423
+ font-weight: 800;
424
  text-align: center;
425
+ margin-bottom: 0.5rem;
426
+ text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
 
 
 
 
427
  }
428
 
429
+ .header-subtitle {
430
+ color: rgba(255,255,255,0.9);
431
+ font-size: 1.3rem;
432
  text-align: center;
433
+ font-weight: 300;
 
 
434
  }
435
 
436
+ /* Card Styles */
437
+ .control-card {
438
+ background: white;
439
  border-radius: 15px;
440
+ padding: 1.5rem;
441
+ box-shadow: 0 5px 20px rgba(0,0,0,0.1);
442
+ border: 1px solid #e2e8f0;
443
+ margin-bottom: 1rem;
444
  }
445
 
446
+ .chat-card {
447
+ background: white;
448
+ border-radius: 15px;
449
+ padding: 1.5rem;
450
+ box-shadow: 0 5px 20px rgba(0,0,0,0.1);
451
+ border: 1px solid #e2e8f0;
452
+ min-height: 600px;
453
+ }
454
+
455
+ /* Source Selection */
456
+ .source-selector {
457
+ background: linear-gradient(135deg, #84fab0 0%, #8fd3f4 100%);
458
+ border-radius: 12px;
459
+ padding: 1rem;
460
+ margin: 1rem 0;
461
  }
462
 
463
+ .source-selector label {
464
+ color: #2d3748 !important;
465
+ font-weight: 600 !important;
466
+ font-size: 1.1rem !important;
467
+ }
468
+
469
+ /* File Upload */
470
+ .upload-zone {
471
+ background: linear-gradient(135deg, #ffecd2 0%, #fcb69f 100%);
472
+ border: 3px dashed #ff8a65;
473
  border-radius: 15px;
474
+ padding: 2rem;
475
  text-align: center;
 
476
  transition: all 0.3s ease;
477
+ cursor: pointer;
478
  }
479
 
480
+ .upload-zone:hover {
481
+ transform: translateY(-3px);
482
+ box-shadow: 0 8px 25px rgba(255, 138, 101, 0.3);
483
+ border-color: #ff7043;
484
  }
485
 
486
+ /* Status Boxes */
487
+ .status-success {
488
+ background: linear-gradient(135deg, #84fab0 0%, #8fd3f4 100%);
489
+ border: none;
490
+ border-radius: 12px;
491
+ padding: 1rem;
492
+ color: #2d3748;
493
+ font-weight: 500;
494
  }
495
 
496
+ .status-info {
497
+ background: linear-gradient(135deg, #a8edea 0%, #fed6e3 100%);
 
 
 
498
  border: none;
499
+ border-radius: 12px;
500
+ padding: 1rem;
501
  color: #2d3748;
502
  font-weight: 500;
503
  }
504
 
505
+ /* Chat Interface */
506
+ .chat-container {
507
+ background: #f8fafc;
508
+ border-radius: 12px;
509
+ border: 1px solid #e2e8f0;
510
+ min-height: 500px;
511
+ }
512
+
513
+ /* Input Styles */
514
+ .question-input {
515
+ border-radius: 12px;
516
+ border: 2px solid #cbd5e0;
517
+ padding: 1rem;
518
+ font-size: 1rem;
519
+ transition: all 0.3s ease;
520
+ }
521
+
522
+ .question-input:focus {
523
+ border-color: #667eea;
524
+ box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1);
525
+ }
526
+
527
+ /* Button Styles */
528
+ .btn-primary {
529
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
530
+ border: none;
531
+ border-radius: 12px;
532
+ padding: 0.75rem 1.5rem;
533
+ font-weight: 600;
534
+ color: white;
535
+ transition: all 0.3s ease;
536
+ }
537
+
538
+ .btn-primary:hover {
539
+ transform: translateY(-2px);
540
+ box-shadow: 0 8px 25px rgba(102, 126, 234, 0.4);
541
+ }
542
+
543
+ .btn-secondary {
544
+ background: linear-gradient(135deg, #ffecd2 0%, #fcb69f 100%);
545
+ border: none;
546
+ border-radius: 12px;
547
+ padding: 0.75rem 1.5rem;
548
+ font-weight: 600;
549
+ color: #2d3748;
550
+ transition: all 0.3s ease;
551
+ }
552
+
553
+ .btn-secondary:hover {
554
+ transform: translateY(-2px);
555
+ box-shadow: 0 8px 25px rgba(252, 182, 159, 0.4);
556
+ }
557
+
558
+ /* Advanced Settings */
559
+ .advanced-panel {
560
+ background: linear-gradient(135deg, #e0c3fc 0%, #9bb5ff 100%);
561
+ border-radius: 12px;
562
+ padding: 1.5rem;
563
+ margin: 1rem 0;
564
+ }
565
+
566
+ /* Footer */
567
+ .footer-info {
568
+ background: #2d3748;
569
+ color: white;
570
+ padding: 2rem;
571
+ border-radius: 15px;
572
  text-align: center;
573
+ margin-top: 2rem;
 
 
 
 
574
  }
 
575
 
576
+ /* Animations */
577
+ @keyframes fadeIn {
578
+ from { opacity: 0; transform: translateY(20px); }
579
+ to { opacity: 1; transform: translateY(0); }
580
+ }
581
+
582
+ .animate-in {
583
+ animation: fadeIn 0.6s ease-out;
584
+ }
585
+
586
+ /* Responsive Design */
587
+ @media (max-width: 768px) {
588
+ .header-title {
589
+ font-size: 2rem;
590
+ }
591
 
592
+ .header-subtitle {
593
+ font-size: 1rem;
594
+ }
 
 
595
 
596
+ .control-card, .chat-card {
597
+ padding: 1rem;
598
+ }
599
+ }
600
+ """
601
+
602
+ # Build Enhanced Gradio Interface
603
+ def create_enhanced_interface():
604
+ with gr.Blocks(
605
+ css=enhanced_css,
606
+ theme=gr.themes.Soft(
607
+ primary_hue="blue",
608
+ secondary_hue="purple",
609
+ neutral_hue="slate"
610
+ ),
611
+ title="πŸ€– Advanced RAG Chatbot"
612
+ ) as demo:
613
+
614
+ # Header Section
615
+ gr.HTML("""
616
+ <div class="main-header animate-in">
617
+ <div class="header-title">πŸ€– Advanced RAG Intelligence System</div>
618
+ <div class="header-subtitle">
619
+ Next-generation AI assistant powered by advanced retrieval-augmented generation
620
+ </div>
621
+ </div>
622
+ """)
623
+
624
+ with gr.Row():
625
+ # Left Panel - Controls
626
+ with gr.Column(scale=1, elem_classes=["control-card"]):
627
+
628
+ # Knowledge Source Selection
629
+ gr.HTML("<h3 style='color: #4a5568; margin-bottom: 1rem;'>🎯 Knowledge Source</h3>")
630
+ source_choice = gr.Radio(
631
+ ["🌐 Web Search", "πŸ“„ Uploaded Document"],
632
+ label="Select Your Information Source",
633
+ value="🌐 Web Search",
634
+ elem_classes=["source-selector"]
635
+ )
636
+
637
+ # Document Upload Section
638
+ gr.HTML("<h3 style='color: #4a5568; margin: 2rem 0 1rem 0;'>πŸ“ Document Processing</h3>")
639
+
640
+ file_input = gr.File(
641
+ label="Upload PDF Document",
642
+ file_types=[".pdf"],
643
+ elem_classes=["upload-zone"]
644
+ )
645
+
646
+ file_status = gr.Textbox(
647
+ label="Processing Status",
648
+ interactive=False,
649
+ elem_classes=["status-success"],
650
+ visible=True
651
+ )
652
+
653
+ document_info = gr.Textbox(
654
+ label="Document Information",
655
+ interactive=False,
656
+ elem_classes=["status-info"],
657
+ visible=False,
658
+ lines=6
659
+ )
660
+
661
+ # Quick Actions
662
+ gr.HTML("<h3 style='color: #4a5568; margin: 2rem 0 1rem 0;'>⚑ Quick Actions</h3>")
663
+
664
+ sample_questions_display = gr.HTML("""
665
+ <div style='background: #f7fafc; padding: 1rem; border-radius: 8px; border-left: 4px solid #667eea;'>
666
+ <strong>πŸ’‘ Sample Questions for Web Search:</strong><br>
667
+ β€’ What are the latest AI breakthroughs?<br>
668
+ β€’ Current tech industry trends<br>
669
+ β€’ Recent scientific discoveries<br>
670
+ β€’ Today's market updates
671
+ </div>
672
+ """)
673
 
674
+ # Right Panel - Chat Interface
675
+ with gr.Column(scale=2, elem_classes=["chat-card"]):
676
+ gr.HTML("<h3 style='color: #4a5568; margin-bottom: 1rem;'>πŸ’¬ Intelligent Conversation</h3>")
677
+
678
+ chatbot = gr.Chatbot(
679
+ label="AI Assistant",
680
+ height=500,
681
+ elem_classes=["chat-container"],
682
+ bubble_full_width=False,
683
+ show_label=False,
684
+ avatar_images=("πŸ‘€", "πŸ€–")
685
  )
686
 
687
+ with gr.Row():
688
+ question_input = gr.Textbox(
689
+ label="Your Question",
690
+ placeholder="Ask me anything... (Press Enter or click Send)",
691
+ lines=2,
692
+ scale=4,
693
+ elem_classes=["question-input"]
694
+ )
695
+
696
+ with gr.Column(scale=1, min_width=120):
697
+ send_btn = gr.Button(
698
+ "πŸš€ Send",
699
+ variant="primary",
700
+ size="lg",
701
+ elem_classes=["btn-primary"]
702
+ )
703
+ clear_btn = gr.Button(
704
+ "πŸ—‘οΈ Clear",
705
+ variant="secondary",
706
+ size="lg",
707
+ elem_classes=["btn-secondary"]
708
+ )
709
+
710
+ # Advanced Settings Panel
711
+ with gr.Accordion("βš™οΈ Advanced Configuration", open=False, elem_classes=["advanced-panel"]):
712
+ with gr.Row():
713
+ with gr.Column():
714
+ gr.HTML("""
715
+ <div style='background: white; padding: 1.5rem; border-radius: 12px; margin: 1rem 0;'>
716
+ <h4>πŸ”§ System Features</h4>
717
+ <ul style='line-height: 1.8;'>
718
+ <li><strong>🌐 Real-time Web Search:</strong> Live internet data retrieval</li>
719
+ <li><strong>πŸ“„ Document Intelligence:</strong> Advanced PDF processing with semantic chunking</li>
720
+ <li><strong>🧠 Neural Embeddings:</strong> Sentence-BERT powered similarity matching</li>
721
+ <li><strong>⚑ Smart Caching:</strong> Optimized performance with intelligent storage</li>
722
+ </ul>
723
+ </div>
724
+ """)
725
+
726
+ with gr.Column():
727
+ gr.HTML("""
728
+ <div style='background: white; padding: 1.5rem; border-radius: 12px; margin: 1rem 0;'>
729
+ <h4>πŸ€– AI Capabilities</h4>
730
+ <ul style='line-height: 1.8;'>
731
+ <li><strong>Language Model:</strong> Mixtral-8x7B-Instruct</li>
732
+ <li><strong>Context Understanding:</strong> Advanced semantic retrieval</li>
733
+ <li><strong>Multi-source Fusion:</strong> Combined web + document insights</li>
734
+ <li><strong>Error Recovery:</strong> Robust fallback mechanisms</li>
735
+ </ul>
736
+ </div>
737
+ """)
738
+
739
+ # Footer with Credits
740
+ gr.HTML("""
741
+ <div class="footer-info">
742
+ <h4>πŸš€ Technical Architecture</h4>
743
+ <p>Built with cutting-edge AI technologies: Together AI β€’ Serper API β€’ Sentence Transformers β€’ Advanced RAG Pipeline</p>
744
+ <p style='margin-top: 1rem; opacity: 0.8;'>
745
+ πŸ’‘ Engineered for optimal performance and user experience β€’
746
+ πŸ”’ Secure and scalable architecture β€’
747
+ 🎯 Production-ready implementation
748
+ </p>
749
+ </div>
750
  """)
751
+
752
+ # Event Handlers with Enhanced Logic
753
+ file_input.change(
754
+ fn=process_uploaded_file,
755
+ inputs=[file_input],
756
+ outputs=[file_status, document_info, gr.update()]
757
+ )
758
+
759
+ question_input.submit(
760
+ fn=answer_question,
761
+ inputs=[question_input, source_choice, chatbot],
762
+ outputs=[chatbot, question_input]
763
+ )
764
+
765
+ send_btn.click(
766
+ fn=answer_question,
767
+ inputs=[question_input, source_choice, chatbot],
768
+ outputs=[chatbot, question_input]
769
+ )
770
+
771
+ clear_btn.click(
772
+ fn=clear_chat,
773
+ inputs=[],
774
+ outputs=[chatbot]
775
+ )
776
+
777
+ # Dynamic sample questions update
778
+ def update_sample_questions(source):
779
+ if source == "🌐 Web Search":
780
+ return gr.HTML("""
781
+ <div style='background: #f0fff4; padding: 1rem; border-radius: 8px; border-left: 4px solid #48bb78;'>
782
+ <strong>πŸ’‘ Sample Questions for Web Search:</strong><br>
783
+ β€’ What are the latest AI breakthroughs?<br>
784
+ β€’ Current cryptocurrency market trends<br>
785
+ β€’ Recent climate change developments<br>
786
+ β€’ Today's technology news
787
+ </div>
788
+ """)
789
+ else:
790
+ return gr.HTML("""
791
+ <div style='background: #fef5e7; padding: 1rem; border-radius: 8px; border-left: 4px solid #ed8936;'>
792
+ <strong>πŸ’‘ Sample Questions for Documents:</strong><br>
793
+ β€’ Summarize the main findings<br>
794
+ β€’ What methodology was used?<br>
795
+ β€’ List the key conclusions<br>
796
+ β€’ Explain the technical details
797
+ </div>
798
+ """)
799
+
800
+ source_choice.change(
801
+ fn=update_sample_questions,
802
+ inputs=[source_choice],
803
+ outputs=[sample_questions_display]
804
+ )
805
 
806
+ return demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
807
 
808
+ # Launch Application
809
  if __name__ == "__main__":
810
+ demo = create_enhanced_interface()
811
  demo.launch(
812
  share=True,
813
+ server_name="0