Rabbit-Innotech committed on
Commit
2ed0c6d
·
verified ·
1 Parent(s): 5d9d36a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +329 -270
app.py CHANGED
@@ -1,35 +1,54 @@
1
  import os
2
- from langchain_groq import ChatGroq
3
- from langchain.prompts import ChatPromptTemplate, PromptTemplate
4
- from langchain.output_parsers import ResponseSchema, StructuredOutputParser
5
- from urllib.parse import urljoin, urlparse
6
  import requests
7
  from io import BytesIO
8
- from langchain_chroma import Chroma
9
- import requests
 
 
10
  from bs4 import BeautifulSoup
11
- from langchain_core.prompts import ChatPromptTemplate
12
- import gradio as gr
13
  from PyPDF2 import PdfReader
14
- from langchain_huggingface import HuggingFaceEmbeddings
 
 
 
15
  from langchain_core.output_parsers import StrOutputParser
16
  from langchain_core.runnables import RunnablePassthrough
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- # Simple session management
19
class SessionManager:
    """In-memory store of per-session chat history."""

    def __init__(self):
        # Maps session_id -> list of {"user": ..., "ai": ...} dicts.
        self.sessions = {}
22
 
23
    def get_or_create_session(self, session_id):
        # Lazily create an empty history list for unknown session ids.
        if session_id not in self.sessions:
            self.sessions[session_id] = []
        return self.sessions[session_id]
27
 
28
    def add_interaction(self, session_id, user_message, ai_response):
        # Record one user/AI exchange in this session's history.
        session = self.get_or_create_session(session_id)
        session.append({"user": user_message, "ai": ai_response})
31
 
32
- def get_history(self, session_id, max_turns=5):
 
33
  session = self.get_or_create_session(session_id)
34
  recent_history = session[-max_turns:] if len(session) > max_turns else session
35
 
@@ -40,160 +59,172 @@ class SessionManager:
40
 
41
  return history_text.strip()
42
 
43
# Initialize session manager
session_manager = SessionManager()

# Groq API key is read from the 'GBV' environment variable (None if unset).
groq_api_key= os.environ.get('GBV')

# Sentence-embedding model used by the Chroma vector store below.
embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
49
-
50
def scrape_websites(base_urls):
    """Scrape each base URL plus its same-domain links; return {url: cleaned text}."""
    try:
        visited_links = set()  # To avoid revisiting the same link
        content_by_url = {}  # Store content from each URL

        for base_url in base_urls:
            if not base_url.strip():
                continue  # Skip empty or invalid URLs

            print(f"Scraping base URL: {base_url}")
            html_content = fetch_page_content(base_url)
            if html_content:
                cleaned_content = clean_body_content(html_content)
                content_by_url[base_url] = cleaned_content
                visited_links.add(base_url)

                # Extract and process all internal links (one level deep).
                soup = BeautifulSoup(html_content, "html.parser")
                links = extract_internal_links(base_url, soup)

                for link in links:
                    if link not in visited_links:
                        print(f"Scraping link: {link}")
                        page_content = fetch_page_content(link)
                        if page_content:
                            cleaned_content = clean_body_content(page_content)
                            content_by_url[link] = cleaned_content
                            visited_links.add(link)

                        # If the link is a PDF file, extract its content.
                        # NOTE(review): a PDF link is fetched twice (HTML then
                        # binary) and the PDF text overwrites the HTML text.
                        if link.lower().endswith('.pdf'):
                            print(f"Extracting PDF content from: {link}")
                            pdf_content = extract_pdf_text(link)
                            if pdf_content:
                                content_by_url[link] = pdf_content

        return content_by_url

    except Exception as e:
        # Broad catch: scraping is best-effort; an empty dict signals failure.
        print(f"Error during scraping: {e}")
        return {}
91
-
92
-
93
def fetch_page_content(url):
    """Return the page body for *url* as text, or None on any request error."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        # Network/HTTP failures are logged and swallowed; caller checks for None.
        print(f"Error fetching {url}: {e}")
        return None
101
-
102
-
103
def extract_internal_links(base_url, soup):
    """Collect absolute URLs from <a href> tags that stay on base_url's host."""
    links = set()
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        # Resolve relative hrefs against the page URL.
        full_url = urljoin(base_url, href)
        if is_internal_link(base_url, full_url):
            links.add(full_url)
    return links
111
-
112
-
113
def is_internal_link(base_url, link_url):
    """True when *link_url* points at the same network host as *base_url*."""
    return urlparse(base_url).netloc == urlparse(link_url).netloc
117
-
118
-
119
def extract_pdf_text(pdf_url):
    """Download *pdf_url* and return its concatenated page text, or None.

    Fixes: adds a request timeout (the original `requests.get` had none and
    could hang indefinitely) and guards against PyPDF2's `extract_text()`
    returning None for image-only pages, which would raise TypeError on `+=`.
    """
    try:
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
        with BytesIO(response.content) as file:
            reader = PdfReader(file)
            pdf_text = ""
            for page in reader.pages:
                # extract_text() may yield None on pages without a text layer.
                pdf_text += page.extract_text() or ""

        return pdf_text if pdf_text else None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching PDF {pdf_url}: {e}")
        return None
    except Exception as e:
        # PyPDF2 parsing errors (corrupt/encrypted PDFs) land here.
        print(f"Error reading PDF {pdf_url}: {e}")
        return None
136
-
137
-
138
def clean_body_content(html_content):
    """Strip <script>/<style> from HTML and return whitespace-normalized text."""
    soup = BeautifulSoup(html_content, "html.parser")

    # Remove non-content elements before extracting visible text.
    for script_or_style in soup(["script", "style"]):
        script_or_style.extract()

    # One stripped line per text fragment; blank lines removed.
    cleaned_content = soup.get_text(separator="\n")
    cleaned_content = "\n".join(
        line.strip() for line in cleaned_content.splitlines() if line.strip()
    )
    return cleaned_content
151
-
152
-
153
if __name__ == "__main__":
    # Seed URL(s) for the knowledge base.
    website = ["https://haguruka.org.rw/country/social-cohesion-and-reconciliation/"

    ]
    all_content = scrape_websites(website)

    # Flatten {url: content} into (url, content) tuples.
    temp_list = []
    for url, content in all_content.items():
        temp_list.append((url, content))

    processed_texts = []

    # Normalize each scraped item to a "url: ..., content: ..." string.
    # NOTE(review): temp_list only ever holds tuples here, so the str/other
    # branches look like dead code kept for safety.
    for element in temp_list:
        if isinstance(element, tuple):
            url, content = element
            processed_texts.append(f"url: {url}, content: {content}")
        elif isinstance(element, str):
            processed_texts.append(element)
        else:
            processed_texts.append(str(element))

    def chunk_string(s, chunk_size=1000):
        # Fixed-size character chunks for embedding.
        return [s[i:i+chunk_size] for i in range(0, len(s), chunk_size)]

    chunked_texts = []

    for text in processed_texts:
        chunked_texts.extend(chunk_string(text))

    # Chroma vector store persisted in the working directory.
    vectorstore = Chroma(
        collection_name="GBVR_Dataset",
        embedding_function=embed_model,
        persist_directory="./",
    )

    # NOTE(review): the result of .get().keys() is discarded — leftover probe?
    vectorstore.get().keys()

    vectorstore.add_texts(chunked_texts)
194
 
195
- # Updated template to include conversation history
196
- template = ("""
 
 
197
  You are a friendly, intelligent, and conversational AI assistant designed to provide accurate, engaging, and human-like responses based on the given context. Your goal is to extract relevant details from the provided context: {context} and assist the user effectively. Follow these guidelines:
198
 
199
  1. **Warm & Natural Interaction**
@@ -232,129 +263,157 @@ template = ("""
232
  **Context:** {context}
233
  **User's Question:** {question}
234
  **Your Response:**
235
- """)
236
-
237
-
238
# Prompt, retriever, and LLM wiring for the RAG chain below.
rag_prompt = PromptTemplate.from_template(template)

retriever = vectorstore.as_retriever()

# NOTE(review): depends on vectorstore/groq_api_key defined earlier; fails at
# import time if the scraping section above did not run — confirm intent.
llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=groq_api_key)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
 
244
# Dictionary to store user sessions with session IDs
# NOTE(review): appears unused — rag_chain uses session_manager instead.
user_sessions = {}
246
 
247
- # Define the RAG chain with session history
248
def rag_chain(question, session_id="default"):
    """Answer *question* with retrieved context and record the turn."""
    # Get conversation history if available
    conversation_history = session_manager.get_history(session_id)

    # Get context from retriever
    context_docs = retriever.invoke(question)
    context = "\n".join(doc.page_content for doc in context_docs)

    # Create prompt with history
    # NOTE(review): the visible template only declares {context} and
    # {question}; confirm it also declares {conversation_history}, otherwise
    # this keyword has no effect (or errors, depending on template handling).
    prompt = rag_prompt.format(
        context=context,
        question=question,
        conversation_history=conversation_history
    )

    # Generate response
    response = llm.invoke(prompt).content

    # Store the interaction
    session_manager.add_interaction(session_id, question, response)

    return response
270
-
271
- # Define the RAG memory stream function
272
def rag_memory_stream(message, history):
    """Gradio streaming callback: yields the answer one word at a time."""
    # Generate a session ID based on the first message if not exists
    session_id = None
    for msg in history:
        if msg[0]:  # If there's a user message
            # Use first few characters of first message as simple session ID.
            # NOTE(review): hash() varies between processes (PYTHONHASHSEED);
            # stable only within a single run, which in-memory sessions allow.
            session_id = hash(msg[0][:20]) if session_id is None else session_id
            break

    # Default session ID if history is empty
    if session_id is None:
        session_id = "default_session"

    # Process the message and get response
    response = rag_chain(message, str(session_id))

    # Stream the response word by word (progressively longer prefixes).
    partial_text = ""
    words = response.split(' ')
    for word in words:
        partial_text += word + " "
        yield partial_text.strip()
294
-
295
- # Title with emojis
296
- title = "GBVR Chatbot"
297
-
298
- # Custom CSS for styling the interface
299
- custom_css = """
300
- /* Custom CSS for styling the interface */
301
- body {
302
- font-family: "Arial", serif;
303
- }
304
-
305
- .gradio-container {
306
- font-family: "Times New Roman", serif;
307
- }
308
-
309
- .gr-button {
310
- background-color: #007bff; /* Blue button */
311
- color: white;
312
- border: none;
313
- border-radius: 5px;
314
- font-size: 16px;
315
- padding: 10px 20px;
316
- cursor: pointer;
317
- }
318
-
319
- .gr-textbox:focus, .gr-button:focus {
320
- outline: none; /* Remove outline focus for a cleaner look */
321
- }
322
-
323
- /* Specific CSS for the welcome message */
324
- .gradio-description {
325
- font-size: 20px; /* Set font size for the welcome message */
326
- font-family: "Arial", sans-serif;
327
- text-align: center; /* Optional: Center-align the text */
328
- padding: 20px; /* Optional: Add padding around the welcome message */
329
- }
330
-
331
- """
332
-
333
- # Generate a simple welcome message using the LLM
334
def generate_welcome_message():
    """Ask the LLM for a short, supportive greeting for the chat UI."""
    welcome_prompt = """
    Create a short, simple welcome message for a chatbot about Gender-Based Violence Resources in Rwanda.
    Keep it under 3 sentences, use simple language, and include one emoji.
    Make it warm and supportive but direct and easy to read.
    """

    # Get the welcome message from the LLM (one network call per invocation).
    welcome_message = llm.invoke(welcome_prompt).content
    return welcome_message
344
-
345
# Create simple welcome message (triggers one LLM call at import time).
welcome_msg = generate_welcome_message()

# Create the Chat Interface with welcome message
demo = gr.ChatInterface(
    fn=rag_memory_stream,
    title=title,
    fill_height=True,
    theme="soft",
    css=custom_css, # Apply the custom CSS
    description=welcome_msg
)

# Launch the app
# NOTE(review): share=True opens a public tunnel — confirm that is intended.
if __name__ == "__main__":
    demo.launch(share=True, inbrowser=True, debug=True)
 
1
import os
from io import BytesIO
from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
from urllib.parse import urljoin, urlparse

# Libraries for web scraping and text processing
import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader

# LangChain imports
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Gradio import for the user interface
import gradio as gr
21
+
22
# Configuration settings
GROQ_API_KEY = os.environ.get('GBV')  # Groq API key from the 'GBV' env var (None if unset)
EMBED_MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"  # sentence-embedding model
LLM_MODEL_NAME = "llama-3.3-70b-versatile"  # Groq-hosted chat model
CHUNK_SIZE = 1000  # characters per embedding chunk
VECTOR_DB_COLLECTION = "GBVR_Dataset"
VECTOR_DB_PERSIST_DIR = "./"  # Chroma persists in the working directory
DEFAULT_SESSION_ID = "default_session"
MAX_HISTORY_TURNS = 5  # conversation turns included in the prompt
31
+
32
 
 
33
class SessionManager:
    """Manages chat sessions and conversation history."""

    def __init__(self):
        # Maps session_id -> list of {"user": ..., "ai": ...} dicts.
        self.sessions = {}
38
 
39
+ def get_or_create_session(self, session_id: str) -> List[Dict[str, str]]:
40
+ """Get existing session or create a new one."""
41
  if session_id not in self.sessions:
42
  self.sessions[session_id] = []
43
  return self.sessions[session_id]
44
 
45
+ def add_interaction(self, session_id: str, user_message: str, ai_response: str) -> None:
46
+ """Add user-AI interaction to the session history."""
47
  session = self.get_or_create_session(session_id)
48
  session.append({"user": user_message, "ai": ai_response})
49
 
50
+ def get_history(self, session_id: str, max_turns: int = MAX_HISTORY_TURNS) -> str:
51
+ """Get formatted conversation history."""
52
  session = self.get_or_create_session(session_id)
53
  recent_history = session[-max_turns:] if len(session) > max_turns else session
54
 
 
59
 
60
  return history_text.strip()
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
class WebScraper:
    """Static helpers that crawl a site and return cleaned text per URL."""

    @staticmethod
    def fetch_page_content(url: str) -> Optional[str]:
        """Fetch the body of *url* as text; None on any request failure."""
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None

    @staticmethod
    def extract_internal_links(base_url: str, soup: "BeautifulSoup") -> Set[str]:
        """Collect absolute same-host URLs from the page's <a href> tags."""
        links = set()
        for anchor in soup.find_all("a", href=True):
            # Resolve relative hrefs against the page URL.
            full_url = urljoin(base_url, anchor["href"])
            if WebScraper.is_internal_link(base_url, full_url):
                links.add(full_url)
        return links

    @staticmethod
    def is_internal_link(base_url: str, link_url: str) -> bool:
        """Check whether *link_url* is on the same network host as *base_url*."""
        base_netloc = urlparse(base_url).netloc
        link_netloc = urlparse(link_url).netloc
        return base_netloc == link_netloc

    @staticmethod
    def extract_pdf_text(pdf_url: str) -> Optional[str]:
        """Download a PDF and return its concatenated page text, or None.

        Fixes: adds a request timeout (the original call could hang forever)
        and tolerates extract_text() returning None on image-only pages.
        """
        try:
            response = requests.get(pdf_url, timeout=30)
            response.raise_for_status()
            with BytesIO(response.content) as file:
                reader = PdfReader(file)
                pdf_text = ""
                for page in reader.pages:
                    # extract_text() may yield None for pages without a text layer.
                    pdf_text += page.extract_text() or ""
            return pdf_text if pdf_text else None
        except requests.exceptions.RequestException as e:
            print(f"Error fetching PDF {pdf_url}: {e}")
            return None
        except Exception as e:
            # PyPDF2 parsing errors (corrupt/encrypted PDFs) land here.
            print(f"Error reading PDF {pdf_url}: {e}")
            return None

    @staticmethod
    def clean_body_content(html_content: str) -> str:
        """Strip <script>/<style> and return whitespace-normalized page text."""
        soup = BeautifulSoup(html_content, "html.parser")

        # Remove non-content elements before text extraction.
        for script_or_style in soup(["script", "style"]):
            script_or_style.extract()

        # One stripped line per fragment; blank lines dropped.
        cleaned_content = soup.get_text(separator="\n")
        cleaned_content = "\n".join(
            line.strip() for line in cleaned_content.splitlines() if line.strip()
        )
        return cleaned_content

    @classmethod
    def scrape_websites(cls, base_urls: List[str]) -> Dict[str, str]:
        """Scrape each base URL and its internal links; return {url: text}.

        Fix: PDF links are fetched once via extract_pdf_text instead of first
        being fetched as HTML and then overwritten by the PDF extraction.
        """
        try:
            visited_links = set()
            content_by_url = {}

            for base_url in base_urls:
                if not base_url.strip():
                    continue  # skip blank entries

                print(f"Scraping base URL: {base_url}")
                html_content = cls.fetch_page_content(base_url)
                if not html_content:
                    continue

                content_by_url[base_url] = cls.clean_body_content(html_content)
                visited_links.add(base_url)

                # Follow same-host links one level deep.
                soup = BeautifulSoup(html_content, "html.parser")
                for link in cls.extract_internal_links(base_url, soup):
                    if link in visited_links:
                        continue

                    if link.lower().endswith('.pdf'):
                        # Fetch PDFs directly as binary; an HTML fetch is wasted work.
                        print(f"Extracting PDF content from: {link}")
                        pdf_content = cls.extract_pdf_text(link)
                        if pdf_content:
                            content_by_url[link] = pdf_content
                            visited_links.add(link)
                        continue

                    print(f"Scraping link: {link}")
                    page_content = cls.fetch_page_content(link)
                    if page_content:
                        content_by_url[link] = cls.clean_body_content(page_content)
                        visited_links.add(link)

            return content_by_url

        except Exception as e:
            # Scraping is best-effort; an empty dict signals total failure.
            print(f"Error during scraping: {e}")
            return {}
173
 
 
 
174
 
175
class TextProcessor:
    """Handles text processing and chunking.

    Improvements: append-loops replaced with comprehensions, and the
    CHUNK_SIZE default is resolved lazily (``None`` sentinel) so the class
    compiles independently of module initialization order. Behavior and call
    signatures remain backward-compatible.
    """

    @staticmethod
    def process_content_tuples(content_tuples: List[Tuple[str, str]]) -> List[str]:
        """Format each (url, content) tuple as a single "url: ..., content: ..." string."""
        return [f"url: {url}, content: {content}" for url, content in content_tuples]

    @staticmethod
    def chunk_string(text: str, chunk_size: Optional[int] = None) -> List[str]:
        """Split *text* into fixed-size character chunks (default CHUNK_SIZE)."""
        if chunk_size is None:
            chunk_size = CHUNK_SIZE
        return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    @classmethod
    def chunk_texts(cls, texts: List[str], chunk_size: Optional[int] = None) -> List[str]:
        """Chunk every text in *texts* and return one flat list of chunks."""
        return [chunk for text in texts for chunk in cls.chunk_string(text, chunk_size)]
202
 
 
 
 
 
 
203
 
204
class VectorStore:
    """Manages vector embeddings and retrieval via a persisted Chroma store.

    Generalization: the embedding model name is now an optional constructor
    argument (defaulting to the module-level EMBED_MODEL_NAME), so alternate
    models can be used without editing the class. Existing callers are
    unaffected.
    """

    def __init__(self, collection_name: str, persist_directory: str,
                 embed_model_name: Optional[str] = None):
        # Resolve the default lazily to avoid a definition-time dependency
        # on the module constant.
        if embed_model_name is None:
            embed_model_name = EMBED_MODEL_NAME
        self.embed_model = HuggingFaceEmbeddings(model_name=embed_model_name)
        self.vectorstore = Chroma(
            collection_name=collection_name,
            embedding_function=self.embed_model,
            persist_directory=persist_directory,
        )

    def add_texts(self, texts: List[str]) -> None:
        """Embed and add *texts* to the vector store."""
        self.vectorstore.add_texts(texts)

    def get_retriever(self):
        """Return a LangChain retriever backed by this vector store."""
        return self.vectorstore.as_retriever()
222
 
 
223
 
224
+ class ChatbotRAG:
225
+ """Manages the Retrieval-Augmented Generation (RAG) chatbot."""
226
+
227
+ PROMPT_TEMPLATE = """
228
  You are a friendly, intelligent, and conversational AI assistant designed to provide accurate, engaging, and human-like responses based on the given context. Your goal is to extract relevant details from the provided context: {context} and assist the user effectively. Follow these guidelines:
229
 
230
  1. **Warm & Natural Interaction**
 
263
  **Context:** {context}
264
  **User's Question:** {question}
265
  **Your Response:**
266
+ """
267
+
268
+ def __init__(self, api_key: str, model_name: str):
269
+ self.llm = ChatGroq(model=model_name, api_key=api_key)
270
+ self.rag_prompt = PromptTemplate.from_template(self.PROMPT_TEMPLATE)
271
+ self.session_manager = SessionManager()
272
+
273
+ def generate_welcome_message(self) -> str:
274
+ """Generate a welcome message for the chatbot interface."""
275
+ welcome_prompt = """
276
+ Generate a short, simple welcome message for a chatbot about Gender-Based Violence Resources in Rwanda.
277
+ Keep it under 3 sentences, and use simple language.
278
+ Make it warm and supportive but direct and easy to read.
279
+ """
280
+
281
+ welcome_message = self.llm.invoke(welcome_prompt).content
282
+ return welcome_message
283
+
284
+ def process_query(self, question: str, retriever, session_id: str = DEFAULT_SESSION_ID) -> str:
285
+ """Process a user query using RAG and maintain session history."""
286
+ # Get conversation history if available
287
+ conversation_history = self.session_manager.get_history(session_id)
288
+
289
+ # Get context from retriever
290
+ context_docs = retriever.invoke(question)
291
+ context = "\n".join(doc.page_content for doc in context_docs)
292
+
293
+ # Create prompt with history
294
+ prompt = self.rag_prompt.format(
295
+ context=context,
296
+ question=question,
297
+ conversation_history=conversation_history
298
+ )
299
+
300
+ # Generate response
301
+ response = self.llm.invoke(prompt).content
302
+
303
+ # Store the interaction
304
+ self.session_manager.add_interaction(session_id, question, response)
305
+
306
+ return response
307
+
308
+ def streaming_response(self, message: str, history) -> str:
309
+ """Stream the response word by word for the Gradio interface."""
310
+ # Generate a session ID based on the first message if not exists
311
+ session_id = None
312
+ for msg in history:
313
+ if msg[0]: # If there's a user message
314
+ session_id = hash(msg[0][:20]) if session_id is None else session_id
315
+ break
316
+
317
+ # Default session ID if history is empty
318
+ if session_id is None:
319
+ session_id = DEFAULT_SESSION_ID
320
+
321
+ # Process the message and get response
322
+ response = self.process_query(message, self.retriever, str(session_id))
323
+
324
+ # Stream the response word by word
325
+ partial_text = ""
326
+ words = response.split(' ')
327
+ for word in words:
328
+ partial_text += word + " "
329
+ yield partial_text.strip()
330
 
 
 
331
 
332
class ChatbotUI:
    """Manages the Gradio UI for the chatbot."""

    # Stylesheet injected into the Gradio app.
    # NOTE(review): .gr-* class selectors depend on Gradio's internal markup
    # and may not take effect across Gradio versions — verify visually.
    CUSTOM_CSS = """
    /* Custom CSS for styling the interface */
    body {
        font-family: "Arial", serif;
    }

    .gradio-container {
        font-family: "Times New Roman", serif;
    }

    .gr-button {
        background-color: #007bff; /* Blue button */
        color: white;
        border: none;
        border-radius: 5px;
        font-size: 16px;
        padding: 10px 20px;
        cursor: pointer;
    }

    .gr-textbox:focus, .gr-button:focus {
        outline: none; /* Remove outline focus for a cleaner look */
    }

    /* Specific CSS for the welcome message */
    .gradio-description {
        font-size: 30px; /* Set font size for the welcome message */
        font-family: "Arial", sans-serif;
        text-align: center; /* Optional: Center-align the text */
        padding: 20px; /* Optional: Add padding around the welcome message */
    }
    """

    def __init__(self, chatbot_rag):
        # RAG backend whose streaming_response drives the chat widget.
        self.chatbot_rag = chatbot_rag
        self.title = "GBVR Chatbot"
        # NOTE(review): generate_welcome_message() invokes the LLM at
        # construction time — one network call per UI instance.
        self.welcome_msg = chatbot_rag.generate_welcome_message()

    def create_interface(self):
        """Create and configure the Gradio interface."""
        demo = gr.ChatInterface(
            fn=self.chatbot_rag.streaming_response,
            title=self.title,
            fill_height=True,
            theme="soft",
            css=self.CUSTOM_CSS,
            description=self.welcome_msg
        )
        return demo
384
+
385
+
386
def main():
    """Scrape the sources, build the vector index, and launch the chat UI."""
    # Knowledge sources for the chatbot.
    target_urls = ["https://haguruka.org.rw/country/social-cohesion-and-reconciliation/"]

    # Scrape, format, and chunk the content for embedding.
    scraped = WebScraper.scrape_websites(target_urls)
    url_content_pairs = list(scraped.items())
    formatted_texts = TextProcessor.process_content_tuples(url_content_pairs)
    chunks = TextProcessor.chunk_texts(formatted_texts)

    # Index the chunks and expose a retriever over them.
    store = VectorStore(VECTOR_DB_COLLECTION, VECTOR_DB_PERSIST_DIR)
    store.add_texts(chunks)

    # Wire the RAG pipeline; streaming_response reads .retriever off the instance.
    bot = ChatbotRAG(GROQ_API_KEY, LLM_MODEL_NAME)
    bot.retriever = store.get_retriever()

    # Build and launch the Gradio interface.
    demo = ChatbotUI(bot).create_interface()
    demo.launch(share=True, inbrowser=True, debug=True)
416
+
417
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418
# Script entry point: build the index and launch the Gradio app.
if __name__ == "__main__":
    main()