Rabbit-Innotech committed on
Commit
007ac45
·
verified ·
1 Parent(s): 2ed0c6d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +282 -294
app.py CHANGED
@@ -1,54 +1,34 @@
1
  import os
 
 
 
 
2
  import requests
3
  from io import BytesIO
4
- from urllib.parse import urljoin, urlparse
5
- from typing import Dict, List, Set, Tuple, Optional, Union
6
-
7
- # Libraries for web scraping and text processing
8
  from bs4 import BeautifulSoup
 
 
9
  from PyPDF2 import PdfReader
10
-
11
- # LangChain imports
12
- from langchain_groq import ChatGroq
13
- from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
14
  from langchain_core.output_parsers import StrOutputParser
15
  from langchain_core.runnables import RunnablePassthrough
16
- from langchain_chroma import Chroma
17
- from langchain_huggingface import HuggingFaceEmbeddings
18
-
19
- # Gradio import for the user interface
20
- import gradio as gr
21
-
22
- # Configuration settings
23
- GROQ_API_KEY = os.environ.get('GBV')
24
- EMBED_MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"
25
- LLM_MODEL_NAME = "llama-3.3-70b-versatile"
26
- CHUNK_SIZE = 1000
27
- VECTOR_DB_COLLECTION = "GBVR_Dataset"
28
- VECTOR_DB_PERSIST_DIR = "./"
29
- DEFAULT_SESSION_ID = "default_session"
30
- MAX_HISTORY_TURNS = 5
31
-
32
 
 
33
  class SessionManager:
34
- """Manages chat sessions and conversation history."""
35
-
36
  def __init__(self):
37
  self.sessions = {}
38
 
39
- def get_or_create_session(self, session_id: str) -> List[Dict[str, str]]:
40
- """Get existing session or create a new one."""
41
  if session_id not in self.sessions:
42
  self.sessions[session_id] = []
43
  return self.sessions[session_id]
44
 
45
- def add_interaction(self, session_id: str, user_message: str, ai_response: str) -> None:
46
- """Add user-AI interaction to the session history."""
47
  session = self.get_or_create_session(session_id)
48
  session.append({"user": user_message, "ai": ai_response})
49
 
50
- def get_history(self, session_id: str, max_turns: int = MAX_HISTORY_TURNS) -> str:
51
- """Get formatted conversation history."""
52
  session = self.get_or_create_session(session_id)
53
  recent_history = session[-max_turns:] if len(session) > max_turns else session
54
 
@@ -59,172 +39,174 @@ class SessionManager:
59
 
60
  return history_text.strip()
61
 
 
 
62
 
63
class WebScraper:
    """Handles web scraping operations: page fetching, internal-link
    discovery, PDF text extraction, and HTML cleanup."""

    @staticmethod
    def fetch_page_content(url: str) -> Optional[str]:
        """Fetch HTML content from a URL.

        Returns the response body as text, or None on any request error
        (connection failure, timeout, or non-2xx status).
        """
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None

    @staticmethod
    def extract_internal_links(base_url: str, soup: BeautifulSoup) -> Set[str]:
        """Extract internal links from a page.

        Resolves every <a href> against *base_url* and keeps only URLs on
        the same domain as *base_url*.
        """
        links = set()
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            full_url = urljoin(base_url, href)
            if WebScraper.is_internal_link(base_url, full_url):
                links.add(full_url)
        return links

    @staticmethod
    def is_internal_link(base_url: str, link_url: str) -> bool:
        """Check if a link is internal to the base domain (same netloc)."""
        base_netloc = urlparse(base_url).netloc
        link_netloc = urlparse(link_url).netloc
        return base_netloc == link_netloc

    @staticmethod
    def extract_pdf_text(pdf_url: str) -> Optional[str]:
        """Extract text from a PDF URL.

        Downloads the PDF and concatenates the text of every page.
        Returns None on download failure, parse failure, or empty output.
        """
        try:
            response = requests.get(pdf_url)
            response.raise_for_status()
            with BytesIO(response.content) as file:
                reader = PdfReader(file)
                pdf_text = ""
                for page in reader.pages:
                    pdf_text += page.extract_text()
            return pdf_text if pdf_text else None
        except requests.exceptions.RequestException as e:
            print(f"Error fetching PDF {pdf_url}: {e}")
            return None
        except Exception as e:
            # PyPDF2 parse errors (or a page returning None text) land here.
            print(f"Error reading PDF {pdf_url}: {e}")
            return None

    @staticmethod
    def clean_body_content(html_content: str) -> str:
        """Clean HTML content by removing scripts and styles.

        Returns the page's visible text, one stripped, non-empty line
        per row.
        """
        soup = BeautifulSoup(html_content, "html.parser")

        # Remove script and style elements
        for script_or_style in soup(["script", "style"]):
            script_or_style.extract()

        # Extract and clean text
        cleaned_content = soup.get_text(separator="\n")
        cleaned_content = "\n".join(
            line.strip() for line in cleaned_content.splitlines() if line.strip()
        )
        return cleaned_content

    @classmethod
    def scrape_websites(cls, base_urls: List[str]) -> Dict[str, str]:
        """Scrape content from a list of base URLs and their internal links.

        Returns a dict mapping every visited URL to its extracted content.
        On any unexpected error, partial work is discarded and an empty
        dict is returned (deliberate best-effort behaviour).
        """
        try:
            visited_links = set()
            content_by_url = {}

            for base_url in base_urls:
                if not base_url.strip():
                    continue

                print(f"Scraping base URL: {base_url}")
                html_content = cls.fetch_page_content(base_url)
                if html_content:
                    cleaned_content = cls.clean_body_content(html_content)
                    content_by_url[base_url] = cleaned_content
                    visited_links.add(base_url)

                    # Process internal links
                    soup = BeautifulSoup(html_content, "html.parser")
                    links = cls.extract_internal_links(base_url, soup)

                    for link in links:
                        if link not in visited_links:
                            print(f"Scraping link: {link}")
                            page_content = cls.fetch_page_content(link)
                            if page_content:
                                cleaned_content = cls.clean_body_content(page_content)
                                content_by_url[link] = cleaned_content
                                visited_links.add(link)

                            # Handle PDF links: extracted PDF text replaces
                            # the HTML scrape of the same URL.
                            if link.lower().endswith('.pdf'):
                                print(f"Extracting PDF content from: {link}")
                                pdf_content = cls.extract_pdf_text(link)
                                if pdf_content:
                                    content_by_url[link] = pdf_content

            return content_by_url

        except Exception as e:
            print(f"Error during scraping: {e}")
            return {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
 
175
class TextProcessor:
    """Utility methods that turn scraped content into embedding-ready chunks."""

    @staticmethod
    def process_content_tuples(content_tuples: List[Tuple[str, str]]) -> List[str]:
        """Render each (url, content) pair as one string tagged with its source URL."""
        return [f"url: {url}, content: {content}" for url, content in content_tuples]

    @staticmethod
    def chunk_string(text: str, chunk_size: int = CHUNK_SIZE) -> List[str]:
        """Cut *text* into consecutive pieces of at most *chunk_size* characters."""
        return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

    @classmethod
    def chunk_texts(cls, texts: List[str], chunk_size: int = CHUNK_SIZE) -> List[str]:
        """Chunk every string in *texts*, concatenating results in input order."""
        return [
            piece
            for entry in texts
            for piece in cls.chunk_string(entry, chunk_size)
        ]
 
 
 
 
202
 
203
 
204
class VectorStore:
    """Manages vector embeddings and retrieval.

    Thin wrapper around a Chroma collection using the module-level
    HuggingFace embedding model (EMBED_MODEL_NAME).
    """

    def __init__(self, collection_name: str, persist_directory: str):
        # Embedding model is fixed by the module constant; loading it here
        # downloads weights on first use.
        self.embed_model = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)
        self.vectorstore = Chroma(
            collection_name=collection_name,
            embedding_function=self.embed_model,
            persist_directory=persist_directory,
        )

    def add_texts(self, texts: List[str]) -> None:
        """Add texts to the vector store (embeds and persists them)."""
        self.vectorstore.add_texts(texts)

    def get_retriever(self):
        """Get a retriever from the vector store (default search settings)."""
        return self.vectorstore.as_retriever()
 
 
 
 
 
 
 
222
 
223
 
224
- class ChatbotRAG:
225
- """Manages the Retrieval-Augmented Generation (RAG) chatbot."""
226
-
227
- PROMPT_TEMPLATE = """
228
  You are a friendly, intelligent, and conversational AI assistant designed to provide accurate, engaging, and human-like responses based on the given context. Your goal is to extract relevant details from the provided context: {context} and assist the user effectively. Follow these guidelines:
229
 
230
  1. **Warm & Natural Interaction**
@@ -263,76 +245,104 @@ class ChatbotRAG:
263
  **Context:** {context}
264
  **User's Question:** {question}
265
  **Your Response:**
 
 
 
 
 
 
266
  """
 
 
 
 
267
 
268
- def __init__(self, api_key: str, model_name: str):
269
- self.llm = ChatGroq(model=model_name, api_key=api_key)
270
- self.rag_prompt = PromptTemplate.from_template(self.PROMPT_TEMPLATE)
271
- self.session_manager = SessionManager()
272
 
273
- def generate_welcome_message(self) -> str:
274
- """Generate a welcome message for the chatbot interface."""
275
- welcome_prompt = """
276
- Generate a short, simple welcome message for a chatbot about Gender-Based Violence Resources in Rwanda.
277
- Keep it under 3 sentences, and use simple language.
278
- Make it warm and supportive but direct and easy to read.
279
- """
280
-
281
- welcome_message = self.llm.invoke(welcome_prompt).content
282
- return welcome_message
283
 
284
- def process_query(self, question: str, retriever, session_id: str = DEFAULT_SESSION_ID) -> str:
285
- """Process a user query using RAG and maintain session history."""
286
- # Get conversation history if available
287
- conversation_history = self.session_manager.get_history(session_id)
288
-
289
- # Get context from retriever
290
- context_docs = retriever.invoke(question)
291
- context = "\n".join(doc.page_content for doc in context_docs)
292
-
293
- # Create prompt with history
294
- prompt = self.rag_prompt.format(
295
- context=context,
296
- question=question,
297
- conversation_history=conversation_history
298
- )
299
-
300
- # Generate response
301
- response = self.llm.invoke(prompt).content
302
-
303
- # Store the interaction
304
- self.session_manager.add_interaction(session_id, question, response)
305
-
306
- return response
307
 
308
- def streaming_response(self, message: str, history) -> str:
309
- """Stream the response word by word for the Gradio interface."""
310
- # Generate a session ID based on the first message if not exists
311
- session_id = None
312
- for msg in history:
313
- if msg[0]: # If there's a user message
314
- session_id = hash(msg[0][:20]) if session_id is None else session_id
315
- break
316
-
317
- # Default session ID if history is empty
318
- if session_id is None:
319
- session_id = DEFAULT_SESSION_ID
320
-
321
- # Process the message and get response
322
- response = self.process_query(message, self.retriever, str(session_id))
323
-
324
- # Stream the response word by word
325
- partial_text = ""
326
- words = response.split(' ')
327
- for word in words:
328
- partial_text += word + " "
329
- yield partial_text.strip()
330
 
331
 
332
- class ChatbotUI:
333
- """Manages the Gradio UI for the chatbot."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
 
335
- CUSTOM_CSS = """
 
336
  /* Custom CSS for styling the interface */
337
  body {
338
  font-family: "Arial", serif;
@@ -365,55 +375,33 @@ class ChatbotUI:
365
  }
366
  """
367
 
368
- def __init__(self, chatbot_rag):
369
- self.chatbot_rag = chatbot_rag
370
- self.title = "GBVR Chatbot"
371
- self.welcome_msg = chatbot_rag.generate_welcome_message()
372
-
373
- def create_interface(self):
374
- """Create and configure the Gradio interface."""
375
- demo = gr.ChatInterface(
376
- fn=self.chatbot_rag.streaming_response,
377
- title=self.title,
378
- fill_height=True,
379
- theme="soft",
380
- css=self.CUSTOM_CSS,
381
- description=self.welcome_msg
382
- )
383
- return demo
384
-
385
-
386
def main():
    """Main function to initialize and run the chatbot.

    Pipeline: scrape -> chunk -> embed/store -> wire up RAG -> launch UI.
    """
    # Define target websites to scrape
    websites = ["https://haguruka.org.rw/country/social-cohesion-and-reconciliation/"]

    # Scrape website content (dict: url -> cleaned text)
    content_by_url = WebScraper.scrape_websites(websites)

    # Process content into (url, content) tuples
    content_tuples = [(url, content) for url, content in content_by_url.items()]

    # Process and chunk texts for embedding
    processed_texts = TextProcessor.process_content_tuples(content_tuples)
    chunked_texts = TextProcessor.chunk_texts(processed_texts)

    # Initialize vector store and load the chunked documents
    vector_store = VectorStore(VECTOR_DB_COLLECTION, VECTOR_DB_PERSIST_DIR)
    vector_store.add_texts(chunked_texts)
    retriever = vector_store.get_retriever()

    # Initialize chatbot RAG
    chatbot_rag = ChatbotRAG(GROQ_API_KEY, LLM_MODEL_NAME)
    # NOTE(review): retriever is attached as an ad-hoc attribute rather than
    # a constructor argument; streaming_response reads self.retriever.
    chatbot_rag.retriever = retriever

    # Initialize UI
    ui = ChatbotUI(chatbot_rag)
    demo = ui.create_interface()

    # Launch the app (share=True exposes a public Gradio link)
    demo.launch(share=True, inbrowser=True, debug=True)
416
 
417
 
418
  if __name__ == "__main__":
419
- main()
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ from langchain_groq import ChatGroq
3
+ from langchain.prompts import ChatPromptTemplate, PromptTemplate
4
+ from langchain.output_parsers import ResponseSchema, StructuredOutputParser
5
+ from urllib.parse import urljoin, urlparse
6
  import requests
7
  from io import BytesIO
8
+ from langchain_chroma import Chroma
 
 
 
9
  from bs4 import BeautifulSoup
10
+ from langchain_core.prompts import ChatPromptTemplate
11
+ import gradio as gr
12
  from PyPDF2 import PdfReader
13
+ from langchain_huggingface import HuggingFaceEmbeddings
 
 
 
14
  from langchain_core.output_parsers import StrOutputParser
15
  from langchain_core.runnables import RunnablePassthrough
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
+ # Simple session management
18
  class SessionManager:
 
 
19
  def __init__(self):
20
  self.sessions = {}
21
 
22
+ def get_or_create_session(self, session_id):
 
23
  if session_id not in self.sessions:
24
  self.sessions[session_id] = []
25
  return self.sessions[session_id]
26
 
27
+ def add_interaction(self, session_id, user_message, ai_response):
 
28
  session = self.get_or_create_session(session_id)
29
  session.append({"user": user_message, "ai": ai_response})
30
 
31
+ def get_history(self, session_id, max_turns=5):
 
32
  session = self.get_or_create_session(session_id)
33
  recent_history = session[-max_turns:] if len(session) > max_turns else session
34
 
 
39
 
40
  return history_text.strip()
41
 
42
# Initialize session manager — single shared instance used by all chats
session_manager = SessionManager()

# Get API key from environment variable
# NOTE(review): the env var is named 'GBV', not e.g. 'GROQ_API_KEY' — confirm
groq_api_key = os.environ.get('GBV')

# Initialize embedding model used for every vector-store operation
embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
50
+
51
def scrape_websites(base_urls):
    """
    Crawl each base URL plus its same-domain links and collect cleaned text.

    Returns a dict mapping every visited URL to its extracted content
    (visible text for HTML pages; extracted text for PDF links).
    """
    visited_links = set()   # guards against re-fetching the same URL
    content_by_url = {}

    for base_url in base_urls:
        # Ignore blank entries in the URL list.
        if not base_url.strip():
            continue

        print(f"Scraping base URL: {base_url}")
        html_content = fetch_page_content(base_url)
        if not html_content:
            continue

        content_by_url[base_url] = clean_body_content(html_content)
        visited_links.add(base_url)

        # Walk every internal link discovered on the base page.
        soup = BeautifulSoup(html_content, "html.parser")
        for link in extract_internal_links(base_url, soup):
            if link in visited_links:
                continue

            print(f"Scraping link: {link}")
            page_content = fetch_page_content(link)
            if page_content:
                content_by_url[link] = clean_body_content(page_content)
                visited_links.add(link)

            # PDF links: extracted PDF text replaces any HTML scrape
            # stored for the same URL.
            if link.lower().endswith('.pdf'):
                print(f"Extracting PDF content from: {link}")
                pdf_content = extract_pdf_text(link)
                if pdf_content:
                    content_by_url[link] = pdf_content

    return content_by_url
90
+
91
+
92
def fetch_page_content(url):
    """
    GET *url* and return the response body as text; None on any request error
    (connection failure, timeout, or non-2xx status).
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
    return response.text
103
 
104
 
105
def extract_internal_links(base_url, soup):
    """
    Collect the set of absolute, same-domain URLs linked from *soup*.

    Each <a href> is resolved against *base_url*; only links whose domain
    matches the base URL's are kept.
    """
    candidates = (
        urljoin(base_url, anchor["href"])
        for anchor in soup.find_all("a", href=True)
    )
    return {url for url in candidates if is_internal_link(base_url, url)}
116
+
117
+
118
def is_internal_link(base_url, link_url):
    """
    Return True when *link_url* shares its network location (domain)
    with *base_url*.
    """
    return urlparse(base_url).netloc == urlparse(link_url).netloc
125
+
126
+
127
def extract_pdf_text(pdf_url):
    """
    Download a PDF and return the concatenated text of all its pages.

    Returns None when the download fails, the PDF cannot be parsed,
    or the extracted text is empty.
    """
    try:
        response = requests.get(pdf_url)
        response.raise_for_status()
        with BytesIO(response.content) as file:
            reader = PdfReader(file)
            pdf_text = "".join(page.extract_text() for page in reader.pages)
        return pdf_text if pdf_text else None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching PDF {pdf_url}: {e}")
        return None
    except Exception as e:
        # Parse errors (or a page yielding None text) end up here.
        print(f"Error reading PDF {pdf_url}: {e}")
        return None
147
+
148
+
149
def clean_body_content(html_content):
    """
    Strip scripts and styles from HTML and return its visible text,
    one whitespace-trimmed, non-empty line per row.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Drop elements whose contents are never user-visible text.
    for tag in soup(["script", "style"]):
        tag.extract()

    raw_text = soup.get_text(separator="\n")
    stripped_lines = [line.strip() for line in raw_text.splitlines()]
    return "\n".join(line for line in stripped_lines if line)
165
+
166
+
167
def chunk_string(s, chunk_size=1000):
    """
    Split a string into consecutive chunks of at most *chunk_size* characters.

    Args:
        s: The text to split.
        chunk_size: Maximum length of each chunk; must be a positive integer.

    Returns:
        A list of substrings covering *s* in order; the final chunk may be
        shorter than *chunk_size*. An empty string yields an empty list.

    Raises:
        ValueError: If chunk_size is not positive. (The previous version
            silently returned [] for a negative size — dropping the input —
            and raised an opaque range() error for zero.)
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    return [s[i:i + chunk_size] for i in range(0, len(s), chunk_size)]
172
 
173
 
174
def process_and_load_content(website_urls):
    """
    Scrape the given sites, chunk their text, and load it into Chroma.

    Args:
        website_urls: Base URLs to crawl (internal links are followed too).

    Returns:
        The populated Chroma vector store (collection "GBVR_Dataset",
        persisted to the current directory), using the module-level
        embedding model.
    """
    # Scrape websites: dict of url -> cleaned text.
    all_content = scrape_websites(website_urls)

    # Prefix each document with its source URL so retrieved chunks carry
    # provenance. (Replaces the old temp_list copy + append loops.)
    processed_texts = [
        f"url: {url}, content: {content}" for url, content in all_content.items()
    ]

    # Split into fixed-size chunks for embedding.
    chunked_texts = [
        chunk for text in processed_texts for chunk in chunk_string(text)
    ]

    # Create and populate the vector store.
    vectorstore = Chroma(
        collection_name="GBVR_Dataset",
        embedding_function=embed_model,
        persist_directory="./",
    )
    vectorstore.add_texts(chunked_texts)

    return vectorstore
206
 
207
 
208
+ # RAG prompt template
209
+ rag_prompt_template = """
 
 
210
  You are a friendly, intelligent, and conversational AI assistant designed to provide accurate, engaging, and human-like responses based on the given context. Your goal is to extract relevant details from the provided context: {context} and assist the user effectively. Follow these guidelines:
211
 
212
  1. **Warm & Natural Interaction**
 
245
  **Context:** {context}
246
  **User's Question:** {question}
247
  **Your Response:**
248
+ """
249
+
250
+ # Create prompt template
251
+ rag_prompt = PromptTemplate.from_template(rag_prompt_template)
252
+
253
def init_rag_components(vectorstore):
    """
    Build the two halves of the RAG pipeline.

    Returns:
        (retriever, llm): a retriever over *vectorstore* and the Groq chat
        model configured with the module-level API key.
    """
    llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=groq_api_key)
    return vectorstore.as_retriever(), llm
264
+
265
+
266
def rag_chain(question, session_id="default", retriever=None, llm=None):
    """
    Answer *question* via retrieval-augmented generation, with session memory.

    Args:
        question: The user's query.
        session_id: Key under which this conversation's history is stored.
        retriever: LangChain retriever supplying context documents. Required.
        llm: Chat model used to generate the answer. Required.

    Returns:
        The model's response text.

    Raises:
        ValueError: If retriever or llm is None. (Previously the None
            defaults crashed with an opaque AttributeError on .invoke.)
    """
    if retriever is None or llm is None:
        raise ValueError("rag_chain requires both a retriever and an llm")

    # Prior turns for this session, formatted by the SessionManager.
    conversation_history = session_manager.get_history(session_id)

    # Retrieve supporting documents and flatten them into one context string.
    context_docs = retriever.invoke(question)
    context = "\n".join(doc.page_content for doc in context_docs)

    # Fill the RAG prompt. NOTE(review): the visible template uses {context}
    # and {question}; conversation_history is passed as well — confirm the
    # template actually contains a {conversation_history} placeholder.
    prompt = rag_prompt.format(
        context=context,
        question=question,
        conversation_history=conversation_history
    )

    # Generate response
    response = llm.invoke(prompt).content

    # Persist this turn so follow-up questions see it as history.
    session_manager.add_interaction(session_id, question, response)

    return response
 
 
 
 
 
 
 
 
291
 
292
 
293
def generate_welcome_message(llm):
    """
    Ask the LLM for a brief, warm greeting shown at the top of the chat UI.
    """
    prompt = """
    Generate a short, simple welcome message for a chatbot about Gender-Based Violence Resources in Rwanda.
    Keep it under 3 sentences, and use simple language.
    Make it warm and supportive but direct and easy to read.
    """

    return llm.invoke(prompt).content
305
+
306
+
307
def rag_memory_stream(message, history, retriever, llm):
    """
    Yield the chatbot's reply incrementally, word by word, for Gradio.

    A session ID derived from the first user message in *history* lets the
    SessionManager accumulate per-conversation context.
    """
    # Derive a stable session ID from the first user turn, if any.
    # NOTE(review): assumes history entries index as (user, ai) pairs — the
    # tuple-style Gradio history format; confirm against the Gradio version.
    session_id = None
    for turn in history:
        if turn[0]:
            session_id = hash(turn[0][:20])
            break

    # Fall back to a fixed ID when the conversation is brand new.
    if session_id is None:
        session_id = "default_session"

    response = rag_chain(message, str(session_id), retriever, llm)

    # Re-emit the growing reply one word at a time.
    partial_text = ""
    for word in response.split(' '):
        partial_text += word + " "
        yield partial_text.strip()
332
+
333
+
334
+ def create_ui(retriever, llm):
335
+ """
336
+ Create the Gradio UI for the chatbot
337
+ """
338
+ # Title
339
+ title = "GBVR Chatbot"
340
+
341
+ # Generate welcome message
342
+ welcome_msg = generate_welcome_message(llm)
343
 
344
+ # Custom CSS for styling
345
+ custom_css = """
346
  /* Custom CSS for styling the interface */
347
  body {
348
  font-family: "Arial", serif;
 
375
  }
376
  """
377
 
378
+ # Create a wrapper function for rag_memory_stream that includes retriever and llm
379
+ def wrapped_rag_memory_stream(message, history):
380
+ return rag_memory_stream(message, history, retriever, llm)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
 
382
+ # Create the Chat Interface
383
+ demo = gr.ChatInterface(
384
+ fn=wrapped_rag_memory_stream,
385
+ title=title,
386
+ fill_height=True,
387
+ theme="soft",
388
+ css=custom_css,
389
+ description=welcome_msg
390
+ )
391
 
392
+ return demo
 
 
 
 
 
 
 
 
 
393
 
394
 
395
def main():
    """Build the vector store, wire up the RAG components, and launch the UI."""
    # Define target websites
    websites = ["https://haguruka.org.rw/country/social-cohesion-and-reconciliation/"]

    # Process content and create vector store
    vectorstore = process_and_load_content(websites)

    # Initialize RAG components
    retriever, llm = init_rag_components(vectorstore)

    # Create and launch UI (share=True exposes a public Gradio link)
    demo = create_ui(retriever, llm)
    demo.launch(share=True, inbrowser=True, debug=True)


if __name__ == "__main__":
    main()