Rabbit-Innotech committed on
Commit
768a211
·
verified ·
1 Parent(s): 007ac45

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +155 -202
app.py CHANGED
@@ -6,6 +6,7 @@ from urllib.parse import urljoin, urlparse
6
  import requests
7
  from io import BytesIO
8
  from langchain_chroma import Chroma
 
9
  from bs4 import BeautifulSoup
10
  from langchain_core.prompts import ChatPromptTemplate
11
  import gradio as gr
@@ -42,57 +43,54 @@ class SessionManager:
42
  # Initialize session manager
43
  session_manager = SessionManager()
44
 
45
- # Get API key from environment variable
46
- groq_api_key = os.environ.get('GBV')
47
 
48
- # Initialize embedding model
49
  embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
50
 
51
  def scrape_websites(base_urls):
52
- """
53
- Scrape content from given URLs and their internal links
54
- """
55
- visited_links = set() # To avoid revisiting the same link
56
- content_by_url = {} # Store content from each URL
57
-
58
- for base_url in base_urls:
59
- if not base_url.strip():
60
- continue # Skip empty URLs
61
-
62
- print(f"Scraping base URL: {base_url}")
63
- html_content = fetch_page_content(base_url)
64
- if html_content:
65
- cleaned_content = clean_body_content(html_content)
66
- content_by_url[base_url] = cleaned_content
67
- visited_links.add(base_url)
68
-
69
- # Extract and process internal links
70
- soup = BeautifulSoup(html_content, "html.parser")
71
- links = extract_internal_links(base_url, soup)
72
-
73
- for link in links:
74
- if link not in visited_links:
75
- print(f"Scraping link: {link}")
76
- page_content = fetch_page_content(link)
77
- if page_content:
78
- cleaned_content = clean_body_content(page_content)
79
- content_by_url[link] = cleaned_content
80
- visited_links.add(link)
81
-
82
- # Handle PDF files
83
- if link.lower().endswith('.pdf'):
84
- print(f"Extracting PDF content from: {link}")
85
- pdf_content = extract_pdf_text(link)
86
- if pdf_content:
87
- content_by_url[link] = pdf_content
88
-
89
- return content_by_url
 
 
90
 
91
 
92
  def fetch_page_content(url):
93
- """
94
- Fetch HTML content from a URL
95
- """
96
  try:
97
  response = requests.get(url, timeout=10)
98
  response.raise_for_status()
@@ -103,9 +101,6 @@ def fetch_page_content(url):
103
 
104
 
105
  def extract_internal_links(base_url, soup):
106
- """
107
- Extract all internal links from a BeautifulSoup object
108
- """
109
  links = set()
110
  for anchor in soup.find_all("a", href=True):
111
  href = anchor["href"]
@@ -116,18 +111,12 @@ def extract_internal_links(base_url, soup):
116
 
117
 
118
  def is_internal_link(base_url, link_url):
119
- """
120
- Check if a URL belongs to the same domain as the base URL
121
- """
122
  base_netloc = urlparse(base_url).netloc
123
  link_netloc = urlparse(link_url).netloc
124
  return base_netloc == link_netloc
125
 
126
 
127
  def extract_pdf_text(pdf_url):
128
- """
129
- Extract text content from a PDF file
130
- """
131
  try:
132
  response = requests.get(pdf_url)
133
  response.raise_for_status()
@@ -147,16 +136,13 @@ def extract_pdf_text(pdf_url):
147
 
148
 
149
  def clean_body_content(html_content):
150
- """
151
- Extract and clean text content from HTML
152
- """
153
  soup = BeautifulSoup(html_content, "html.parser")
 
154
 
155
- # Remove script and style elements
156
  for script_or_style in soup(["script", "style"]):
157
  script_or_style.extract()
 
158
 
159
- # Extract text and clean
160
  cleaned_content = soup.get_text(separator="\n")
161
  cleaned_content = "\n".join(
162
  line.strip() for line in cleaned_content.splitlines() if line.strip()
@@ -164,49 +150,50 @@ def clean_body_content(html_content):
164
  return cleaned_content
165
 
166
 
167
def chunk_string(s, chunk_size=1000):
    """
    Split a string into consecutive chunks of at most ``chunk_size`` characters.

    Args:
        s: The string to split.
        chunk_size: Maximum length of each chunk; must be at least 1.

    Returns:
        List of substrings in original order. Every chunk except possibly the
        last has exactly ``chunk_size`` characters; an empty string yields an
        empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. A negative step would
            otherwise make ``range`` empty and silently drop all of the text.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    return [s[i:i + chunk_size] for i in range(0, len(s), chunk_size)]
172
-
173
 
174
def process_and_load_content(website_urls):
    """
    Scrape the given websites and load their text into a Chroma vector store.

    Args:
        website_urls: Iterable of base URLs handed to ``scrape_websites``.

    Returns:
        The ``Chroma`` store for collection ``GBVR_Dataset``, persisted in the
        current directory and embedded with the module-level ``embed_model``.
    """
    all_content = scrape_websites(website_urls)

    # Each document is prefixed with its source URL, then cut into fixed-size
    # chunks (so only the first chunk of a document carries the URL prefix).
    chunked_texts = []
    for url, content in all_content.items():
        chunked_texts.extend(chunk_string(f"url: {url}, content: {content}"))

    vectorstore = Chroma(
        collection_name="GBVR_Dataset",
        embedding_function=embed_model,
        persist_directory="./",
    )

    # Nothing scraped means nothing to add.
    # NOTE(review): some Chroma versions reject an empty batch - confirm.
    if chunked_texts:
        vectorstore.add_texts(chunked_texts)

    return vectorstore
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
 
207
 
208
- # RAG prompt template
209
- rag_prompt_template = """
210
  You are a friendly, intelligent, and conversational AI assistant designed to provide accurate, engaging, and human-like responses based on the given context. Your goal is to extract relevant details from the provided context: {context} and assist the user effectively. Follow these guidelines:
211
 
212
  1. **Warm & Natural Interaction**
@@ -245,29 +232,21 @@ rag_prompt_template = """
245
  **Context:** {context}
246
  **User's Question:** {question}
247
  **Your Response:**
248
- """
249
 
250
- # Create prompt template
251
- rag_prompt = PromptTemplate.from_template(rag_prompt_template)
252
 
253
def init_rag_components(vectorstore):
    """
    Build the two runtime components of the RAG pipeline.

    Args:
        vectorstore: Vector store exposing ``as_retriever()``.

    Returns:
        Tuple ``(retriever, llm)``: the store's default retriever and a
        ``ChatGroq`` client for ``llama-3.3-70b-versatile`` created with the
        module-level ``groq_api_key``.
    """
    retriever = vectorstore.as_retriever()
    llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=groq_api_key)
    return retriever, llm
264
 
 
265
 
266
- def rag_chain(question, session_id="default", retriever=None, llm=None):
267
- """
268
- Process a query through the RAG pipeline
269
- """
270
- # Get conversation history
 
 
 
271
  conversation_history = session_manager.get_history(session_id)
272
 
273
  # Get context from retriever
@@ -289,30 +268,13 @@ def rag_chain(question, session_id="default", retriever=None, llm=None):
289
 
290
  return response
291
 
292
-
293
def generate_welcome_message(llm):
    """
    Ask the language model for a short welcome message for the chatbot.

    Args:
        llm: Object with an ``invoke(prompt)`` method whose result exposes the
            generated text as ``.content``.

    Returns:
        The ``.content`` of the model's reply to the welcome prompt.
    """
    welcome_prompt = """
    Generate a short, simple welcome message for a chatbot about Gender-Based Violence Resources in Rwanda.
    Keep it under 3 sentences, and use simple language.
    Make it warm and supportive but direct and easy to read.
    """

    welcome_message = llm.invoke(welcome_prompt).content
    return welcome_message
305
-
306
-
307
- def rag_memory_stream(message, history, retriever, llm):
308
- """
309
- Stream responses for the Gradio interface
310
- """
311
- # Generate a session ID based on the first message
312
  session_id = None
313
  for msg in history:
314
  if msg[0]: # If there's a user message
315
- # Use hash of first message as session ID
316
  session_id = hash(msg[0][:20]) if session_id is None else session_id
317
  break
318
 
@@ -321,7 +283,7 @@ def rag_memory_stream(message, history, retriever, llm):
321
  session_id = "default_session"
322
 
323
  # Process the message and get response
324
- response = rag_chain(message, str(session_id), retriever, llm)
325
 
326
  # Stream the response word by word
327
  partial_text = ""
@@ -330,78 +292,69 @@ def rag_memory_stream(message, history, retriever, llm):
330
  partial_text += word + " "
331
  yield partial_text.strip()
332
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
 
334
def create_ui(retriever, llm):
    """
    Build the Gradio chat interface for the chatbot.

    Args:
        retriever: Retriever forwarded to ``rag_memory_stream`` on every turn.
        llm: Language model used for the welcome message and forwarded to
            ``rag_memory_stream``.

    Returns:
        A ``gr.ChatInterface`` ready to be launched.
    """
    title = "GBVR Chatbot"

    # The welcome text is generated once, at UI construction time.
    welcome_msg = generate_welcome_message(llm)

    custom_css = """
    /* Custom CSS for styling the interface */
    body {
        font-family: "Arial", serif;
    }

    .gradio-container {
        font-family: "Times New Roman", serif;
    }

    .gr-button {
        background-color: #007bff; /* Blue button */
        color: white;
        border: none;
        border-radius: 5px;
        font-size: 16px;
        padding: 10px 20px;
        cursor: pointer;
    }

    .gr-textbox:focus, .gr-button:focus {
        outline: none; /* Remove outline focus for a cleaner look */
    }

    /* Specific CSS for the welcome message */
    .gradio-description {
        font-size: 30px; /* Set font size for the welcome message */
        font-family: "Arial", sans-serif;
        text-align: center; /* Optional: Center-align the text */
        padding: 20px; /* Optional: Add padding around the welcome message */
    }
    """

    def wrapped_rag_memory_stream(message, history):
        # Must itself be a generator function: a plain function that merely
        # returns the generator object is not detected as a streaming handler.
        yield from rag_memory_stream(message, history, retriever, llm)

    demo = gr.ChatInterface(
        fn=wrapped_rag_memory_stream,
        title=title,
        fill_height=True,
        theme="soft",
        css=custom_css,
        description=welcome_msg,
    )

    return demo
393
 
 
 
 
 
 
 
 
 
 
394
 
 
395
  if __name__ == "__main__":
396
- # Define target websites
397
- websites = ["https://haguruka.org.rw/country/social-cohesion-and-reconciliation/"]
398
-
399
- # Process content and create vector store
400
- vectorstore = process_and_load_content(websites)
401
-
402
- # Initialize RAG components
403
- retriever, llm = init_rag_components(vectorstore)
404
-
405
- # Create and launch UI
406
- demo = create_ui(retriever, llm)
407
  demo.launch(share=True, inbrowser=True, debug=True)
 
6
  import requests
7
  from io import BytesIO
8
  from langchain_chroma import Chroma
9
+ import requests
10
  from bs4 import BeautifulSoup
11
  from langchain_core.prompts import ChatPromptTemplate
12
  import gradio as gr
 
43
  # Initialize session manager
44
  session_manager = SessionManager()
45
 
46
+ groq_api_key= os.environ.get('GBV')
 
47
 
 
48
  embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
49
 
50
  def scrape_websites(base_urls):
51
+ try:
52
+ visited_links = set() # To avoid revisiting the same link
53
+ content_by_url = {} # Store content from each URL
54
+
55
+ for base_url in base_urls:
56
+ if not base_url.strip():
57
+ continue # Skip empty or invalid URLs
58
+
59
+ print(f"Scraping base URL: {base_url}")
60
+ html_content = fetch_page_content(base_url)
61
+ if html_content:
62
+ cleaned_content = clean_body_content(html_content)
63
+ content_by_url[base_url] = cleaned_content
64
+ visited_links.add(base_url)
65
+
66
+ # Extract and process all internal links
67
+ soup = BeautifulSoup(html_content, "html.parser")
68
+ links = extract_internal_links(base_url, soup)
69
+
70
+ for link in links:
71
+ if link not in visited_links:
72
+ print(f"Scraping link: {link}")
73
+ page_content = fetch_page_content(link)
74
+ if page_content:
75
+ cleaned_content = clean_body_content(page_content)
76
+ content_by_url[link] = cleaned_content
77
+ visited_links.add(link)
78
+
79
+ # If the link is a PDF file, extract its content
80
+ if link.lower().endswith('.pdf'):
81
+ print(f"Extracting PDF content from: {link}")
82
+ pdf_content = extract_pdf_text(link)
83
+ if pdf_content:
84
+ content_by_url[link] = pdf_content
85
+
86
+ return content_by_url
87
+
88
+ except Exception as e:
89
+ print(f"Error during scraping: {e}")
90
+ return {}
91
 
92
 
93
  def fetch_page_content(url):
 
 
 
94
  try:
95
  response = requests.get(url, timeout=10)
96
  response.raise_for_status()
 
101
 
102
 
103
  def extract_internal_links(base_url, soup):
 
 
 
104
  links = set()
105
  for anchor in soup.find_all("a", href=True):
106
  href = anchor["href"]
 
111
 
112
 
113
  def is_internal_link(base_url, link_url):
 
 
 
114
  base_netloc = urlparse(base_url).netloc
115
  link_netloc = urlparse(link_url).netloc
116
  return base_netloc == link_netloc
117
 
118
 
119
  def extract_pdf_text(pdf_url):
 
 
 
120
  try:
121
  response = requests.get(pdf_url)
122
  response.raise_for_status()
 
136
 
137
 
138
  def clean_body_content(html_content):
 
 
 
139
  soup = BeautifulSoup(html_content, "html.parser")
140
+
141
 
 
142
  for script_or_style in soup(["script", "style"]):
143
  script_or_style.extract()
144
+
145
 
 
146
  cleaned_content = soup.get_text(separator="\n")
147
  cleaned_content = "\n".join(
148
  line.strip() for line in cleaned_content.splitlines() if line.strip()
 
150
  return cleaned_content
151
 
152
 
153
+ if __name__ == "__main__":
154
+ website = ["https://haguruka.org.rw/country/social-cohesion-and-reconciliation/"
155
+
156
+ ]
157
+ all_content = scrape_websites(website)
 
158
 
 
 
 
 
 
 
 
 
159
  temp_list = []
160
  for url, content in all_content.items():
161
  temp_list.append((url, content))
162
+
163
 
164
+ processed_texts = []
165
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
+ for element in temp_list:
168
+ if isinstance(element, tuple):
169
+ url, content = element
170
+ processed_texts.append(f"url: {url}, content: {content}")
171
+ elif isinstance(element, str):
172
+ processed_texts.append(element)
173
+ else:
174
+ processed_texts.append(str(element))
175
+
176
def chunk_string(s, chunk_size=1000):
    """
    Split a string into consecutive chunks of at most ``chunk_size`` characters.

    Args:
        s: The string to split.
        chunk_size: Maximum length of each chunk; must be at least 1.

    Returns:
        List of substrings in original order; an empty string yields an
        empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. A negative step would
            otherwise make ``range`` empty and silently drop all of the text.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    return [s[i:i + chunk_size] for i in range(0, len(s), chunk_size)]
178
+
179
+ chunked_texts = []
180
+
181
+ for text in processed_texts:
182
+ chunked_texts.extend(chunk_string(text))
183
+
184
+
185
+ vectorstore = Chroma(
186
+ collection_name="GBVR_Datst",
187
+ embedding_function=embed_model,
188
+ persist_directory="./",
189
+ )
190
+
191
+ vectorstore.get().keys()
192
 
193
+ vectorstore.add_texts(chunked_texts)
194
 
195
+ # Updated template to include conversation history
196
+ template = ("""
197
  You are a friendly, intelligent, and conversational AI assistant designed to provide accurate, engaging, and human-like responses based on the given context. Your goal is to extract relevant details from the provided context: {context} and assist the user effectively. Follow these guidelines:
198
 
199
  1. **Warm & Natural Interaction**
 
232
  **Context:** {context}
233
  **User's Question:** {question}
234
  **Your Response:**
235
+ """)
236
 
 
 
237
 
238
+ rag_prompt = PromptTemplate.from_template(template)
 
 
 
 
 
 
 
 
 
 
239
 
240
+ retriever = vectorstore.as_retriever()
241
 
242
+ llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=groq_api_key)
243
+
244
+ # Dictionary to store user sessions with session IDs
245
+ user_sessions = {}
246
+
247
+ # Define the RAG chain with session history
248
+ def rag_chain(question, session_id="default"):
249
+ # Get conversation history if available
250
  conversation_history = session_manager.get_history(session_id)
251
 
252
  # Get context from retriever
 
268
 
269
  return response
270
 
271
+ # Define the RAG memory stream function
272
+ def rag_memory_stream(message, history):
273
+ # Generate a session ID based on the first message if not exists
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  session_id = None
275
  for msg in history:
276
  if msg[0]: # If there's a user message
277
+ # Use first few characters of first message as simple session ID
278
  session_id = hash(msg[0][:20]) if session_id is None else session_id
279
  break
280
 
 
283
  session_id = "default_session"
284
 
285
  # Process the message and get response
286
+ response = rag_chain(message, str(session_id))
287
 
288
  # Stream the response word by word
289
  partial_text = ""
 
292
  partial_text += word + " "
293
  yield partial_text.strip()
294
 
295
+ # Title shown at the top of the chat interface
296
+ title = "GBVR Chatbot"
297
+
298
+ # Custom CSS for styling the interface
299
+ custom_css = """
300
+ /* Custom CSS for styling the interface */
301
+ body {
302
+ font-family: "Arial", serif;
303
+ }
304
+
305
+ .gradio-container {
306
+ font-family: "Times New Roman", serif;
307
+ }
308
+
309
+ .gr-button {
310
+ background-color: #007bff; /* Blue button */
311
+ color: white;
312
+ border: none;
313
+ border-radius: 5px;
314
+ font-size: 16px;
315
+ padding: 10px 20px;
316
+ cursor: pointer;
317
+ }
318
+
319
+ .gr-textbox:focus, .gr-button:focus {
320
+ outline: none; /* Remove outline focus for a cleaner look */
321
+ }
322
+
323
+ /* Specific CSS for the welcome message */
324
+ .gradio-description {
325
+ font-size: 30px; /* Set font size for the welcome message */
326
+ font-family: "Arial", sans-serif;
327
+ text-align: center; /* Optional: Center-align the text */
328
+ padding: 20px; /* Optional: Add padding around the welcome message */
329
+ }
330
 
331
+ """
332
+
333
+ # Generate a simple welcome message using the LLM
334
def generate_welcome_message():
    """
    Ask the module-level ``llm`` for a short welcome message for the chatbot.

    Returns:
        The ``.content`` of the model's reply to the welcome prompt.
    """
    welcome_prompt = """
    Generate a short, simple welcome message for a chatbot about Gender-Based Violence Resources in Rwanda.
    Keep it under 3 sentences, and use simple language.
    Make it warm and supportive but direct and easy to read.
    """

    # Get the welcome message from the LLM
    welcome_message = llm.invoke(welcome_prompt).content
    return welcome_message
344
+
345
+ # Create simple welcome message
346
+ welcome_msg = generate_welcome_message()
 
 
 
 
 
 
 
 
 
347
 
348
+ # Create the Chat Interface with welcome message
349
+ demo = gr.ChatInterface(
350
+ fn=rag_memory_stream,
351
+ title=title,
352
+ fill_height=True,
353
+ theme="soft",
354
+ css=custom_css, # Apply the custom CSS
355
+ description=welcome_msg
356
+ )
357
 
358
+ # Launch the app
359
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
 
 
360
  demo.launch(share=True, inbrowser=True, debug=True)