Rabbit-Innotech committed on
Commit
27a0883
·
verified ·
1 Parent(s): 64498f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +340 -48
app.py CHANGED
@@ -1,64 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
 
 
 
3
 
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
 
 
9
 
10
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream a chat completion from the zephyr-7b-beta InferenceClient.

    NOTE(review): this is the pre-commit version that this diff removes.

    Args:
        message: the new user turn.
        history: prior (user, assistant) pairs; empty strings are skipped.
        system_message: text for the leading system role message.
        max_tokens / temperature / top_p: sampling controls forwarded to the API.

    Yields:
        The cumulative response text after each streamed token.
    """
    messages = [{"role": "system", "content": system_message}]

    # Replay prior turns; falsy (empty) sides of a pair are omitted.
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    response = ""

    # Streaming loop: note `message` is rebound here, shadowing the parameter.
    for message in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = message.choices[0].delta.content

        response += token
        yield response
41
 
 
 
 
 
 
 
 
 
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""


# Pre-commit UI (removed by this diff): a plain ChatInterface over `respond`
# exposing system-message and sampling controls as extra inputs.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)


if __name__ == "__main__":
    demo.launch()
 
1
+ import os
2
+ from langchain_groq import ChatGroq
3
+ from langchain.prompts import ChatPromptTemplate, PromptTemplate
4
+ from langchain.output_parsers import ResponseSchema, StructuredOutputParser
5
+ from urllib.parse import urljoin, urlparse
6
+ import requests
7
+ from io import BytesIO
8
+ from langchain_chroma import Chroma
9
+ import requests
10
+ from bs4 import BeautifulSoup
11
+ from langchain_core.prompts import ChatPromptTemplate
12
  import gradio as gr
13
+ from PyPDF2 import PdfReader
14
+ from langchain_huggingface import HuggingFaceEmbeddings
15
+ from langchain_core.output_parsers import StrOutputParser
16
+ from langchain_core.runnables import RunnablePassthrough
17
 
18
# Simple session management
class SessionManager:
    """In-memory, per-process store of chat history keyed by session id.

    Each session is a list of {"user": ..., "ai": ...} dicts. Nothing is
    persisted; state lives only for the lifetime of the process.
    """

    def __init__(self):
        # session_id -> list of {"user": str, "ai": str} interaction dicts
        self.sessions = {}

    def get_or_create_session(self, session_id):
        """Return the interaction list for *session_id*, creating it if absent."""
        # setdefault collapses the lookup-or-create into a single operation.
        return self.sessions.setdefault(session_id, [])

    def add_interaction(self, session_id, user_message, ai_response):
        """Append one user/AI exchange to the session's history."""
        self.get_or_create_session(session_id).append(
            {"user": user_message, "ai": ai_response}
        )

    def get_history(self, session_id, max_turns=5):
        """Render the last *max_turns* exchanges as a prompt-ready transcript.

        Returns "" for a new or empty session.
        """
        # Negative slicing already handles len(session) <= max_turns, and
        # str.join replaces the original quadratic `+=` string building.
        recent = self.get_or_create_session(session_id)[-max_turns:]
        return "\n\n".join(
            f"User: {turn['user']}\nAssistant: {turn['ai']}" for turn in recent
        )
 
43
# Initialize session manager
session_manager = SessionManager()

# Groq API key; read from the env var named 'GBV' (None if unset —
# ChatGroq construction below will then fail at request time).
groq_api_key= os.environ.get('GBV')

# Sentence-embedding model used by the Chroma vector store. Downloads the
# model on first run; presumably CPU inference — TODO confirm.
embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
 
 
 
49
 
50
def scrape_websites(base_urls):
    """Crawl each URL in *base_urls* one level deep.

    Fetches every base page, then every same-domain link found on it.
    PDF links go through extract_pdf_text; HTML pages through
    clean_body_content.

    Args:
        base_urls: iterable of URL strings; blank entries are skipped.

    Returns:
        dict mapping each successfully fetched URL to its extracted text,
        or {} if an unexpected error aborts the crawl.
    """
    try:
        visited_links = set()   # avoid revisiting the same link
        content_by_url = {}     # url -> cleaned HTML text or PDF text

        for base_url in base_urls:
            if not base_url.strip():
                continue  # Skip empty or invalid URLs

            print(f"Scraping base URL: {base_url}")
            html_content = fetch_page_content(base_url)
            if not html_content:
                # Fix: previously a failed base fetch fell through to
                # BeautifulSoup(None), which raised and discarded ALL
                # results via the broad except below.
                continue

            content_by_url[base_url] = clean_body_content(html_content)
            visited_links.add(base_url)

            # Extract and process all internal (same-domain) links.
            soup = BeautifulSoup(html_content, "html.parser")
            for link in extract_internal_links(base_url, soup):
                if link in visited_links:
                    continue
                visited_links.add(link)

                # Fix: detect PDFs *before* fetching. The original fetched
                # every link as HTML first (storing mis-decoded PDF bytes),
                # then re-downloaded .pdf links a second time for extraction.
                if link.lower().endswith('.pdf'):
                    print(f"Extracting PDF content from: {link}")
                    pdf_content = extract_pdf_text(link)
                    if pdf_content:
                        content_by_url[link] = pdf_content
                    continue

                print(f"Scraping link: {link}")
                page_content = fetch_page_content(link)
                if page_content:
                    content_by_url[link] = clean_body_content(page_content)

        return content_by_url

    except Exception as e:
        # Best-effort crawl: report and return an empty mapping.
        print(f"Error during scraping: {e}")
        return {}
91
+
92
+
93
def fetch_page_content(url):
    """Download *url* and return its body as text, or None on any request error."""
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
    except requests.exceptions.RequestException as exc:
        # Network and HTTP-status failures are reported, not raised.
        print(f"Error fetching {url}: {exc}")
        return None
    return resp.text
101
+
102
+
103
def extract_internal_links(base_url, soup):
    """Return the set of absolute, same-domain URLs linked from *soup*.

    Relative hrefs are resolved against *base_url* before the domain check.
    """
    absolute_urls = (
        urljoin(base_url, anchor["href"])
        for anchor in soup.find_all("a", href=True)
    )
    return {url for url in absolute_urls if is_internal_link(base_url, url)}
111
+
112
+
113
def is_internal_link(base_url, link_url):
    """True when *link_url* shares *base_url*'s network location (host:port)."""
    return urlparse(base_url).netloc == urlparse(link_url).netloc
117
+
118
+
119
def extract_pdf_text(pdf_url):
    """Download a PDF and return its concatenated page text.

    Returns None when the download fails, the PDF cannot be parsed, or it
    yields no text at all.
    """
    try:
        # Fix: added timeout for consistency with fetch_page_content; the
        # original could hang indefinitely on a stalled server.
        response = requests.get(pdf_url, timeout=10)
        response.raise_for_status()
        with BytesIO(response.content) as file:
            reader = PdfReader(file)
            # Fix: extract_text() may return None for image-only pages; the
            # original `+=` then raised TypeError, silently masked by the
            # broad except below.
            pdf_text = "".join(page.extract_text() or "" for page in reader.pages)

        return pdf_text if pdf_text else None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching PDF {pdf_url}: {e}")
        return None
    except Exception as e:
        print(f"Error reading PDF {pdf_url}: {e}")
        return None
136
+
137
+
138
def clean_body_content(html_content):
    """Strip scripts/styles from HTML and return its visible text.

    Each non-blank line of the rendered text appears once, trimmed, joined
    by single newlines.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Remove non-visible elements before extracting text.
    for hidden in soup(["script", "style"]):
        hidden.extract()

    raw_text = soup.get_text(separator="\n")
    kept_lines = (line.strip() for line in raw_text.splitlines())
    return "\n".join(line for line in kept_lines if line)
151
+
152
+
153
if __name__ == "__main__":
    # Ingestion script: scrape the source site and normalize to strings.
    # NOTE(review): the diff rendering strips indentation — exactly which of
    # the statements below sit inside this guard cannot be confirmed here.
    website = ["https://haguruka.org.rw/"

    ]
    # One-level crawl -> {url: extracted_text}
    all_content = scrape_websites(website)

    # Flatten the mapping into (url, content) tuples.
    temp_list = []
    for url, content in all_content.items():
        temp_list.append((url, content))

    processed_texts = []

    # Normalize every element to a plain string for chunking. Given how
    # temp_list is built above, only the tuple branch can actually occur;
    # the str/else branches are defensive.
    for element in temp_list:
        if isinstance(element, tuple):
            url, content = element
            processed_texts.append(f"url: {url}, content: {content}")
        elif isinstance(element, str):
            processed_texts.append(element)
        else:
            processed_texts.append(str(element))
175
+
176
def chunk_string(s, chunk_size=1000):
    """Split *s* into consecutive pieces of at most *chunk_size* characters."""
    pieces = []
    start = 0
    while start < len(s):
        pieces.append(s[start:start + chunk_size])
        start += chunk_size
    return pieces
178
+
179
# NOTE(review): may sit inside the __main__ guard above — the diff rendering
# strips indentation; it demonstrably follows the scraping step.
chunked_texts = []

# Split each document string into <=1000-char pieces for embedding.
for text in processed_texts:
    chunked_texts.extend(chunk_string(text))

# Persistent Chroma collection embedded with the HuggingFace model above.
vectorstore = Chroma(
    collection_name="GBVR_Dataset",
    embedding_function=embed_model,
    persist_directory="./",
)

# Removed dead statement `vectorstore.get().keys()`: its result was
# discarded, and it forced a full fetch of the collection for nothing.
vectorstore.add_texts(chunked_texts)
194
+
195
# Updated template to include conversation history
# Prompt for the RAG chain; placeholders {context}, {question} and
# {conversation_history} are filled in rag_chain below. {context} appears
# several times and is substituted at each occurrence.
# NOTE(review): the emoji below appear mojibake-mangled by the diff
# rendering; reproduce/verify against the original file bytes.
template = ("""
You are a friendly, intelligent, and conversational AI assistant designed to provide accurate, engaging, and human-like responses based on the given context. Your goal is to extract relevant details from the provided context: {context} and assist the user effectively. Follow these guidelines:

1. **Warm & Natural Interaction**
- If the user greets you (e.g., "Hello," "Hi," "Good morning"), respond warmly and acknowledge them.
- Example responses:
- "😊 Good morning! How can I assist you today?"
- "Hello! What can I do for you? πŸš€"

2. **Precise Information Extraction**
- Provide only the relevant details from the given context: {context}.
- Do not generate extra content or assumptions beyond the provided information.

3. **Conversational & Engaging Tone**
- Keep responses friendly, natural, and engaging.
- Use occasional emojis (e.g., 😊, πŸš€) to make interactions more lively.

4. **Awareness of Real-Time Context**
- If necessary, acknowledge the current date and time to show awareness of real-world updates.

5. **Handling Missing Information**
- If no relevant information exists in the context, respond politely:
- "I don't have that information at the moment, but I'm happy to help with something else! 😊"

6. **Personalized Interaction**
- Use the conversation history to provide more personalized and contextually relevant responses.
- Previous conversation history: {conversation_history}

7. **Direct, Concise Responses**
- If the user requests specific data, provide only the requested details without unnecessary explanations unless asked.

8. **Extracting Relevant Links**
- If the user asks for a link related to their request `{question}`, extract the most relevant URL from `{context}` and provide it directly.
- Example response:
- "Here is the link you requested: [URL]"

**Context:** {context}
**User's Question:** {question}
**Your Response:**
""")


# Compile the template once at import time.
rag_prompt = PromptTemplate.from_template(template)

# Default retriever over the Chroma collection built above.
retriever = vectorstore.as_retriever()

# Groq-hosted Llama 3.3 70B; API key read from the GBV env var above.
llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=groq_api_key)

# Dictionary to store user sessions with session IDs
# NOTE(review): appears unused — session state actually lives in
# session_manager above; confirm before removing.
user_sessions = {}
+
247
# Define the RAG chain with session history
def rag_chain(question, session_id="default"):
    """Answer *question* via retrieval-augmented generation.

    Threads the session's prior turns into the prompt and records the new
    exchange in session_manager before returning the model's reply text.
    """
    # Prior turns for this session ("" when the session is new).
    history_text = session_manager.get_history(session_id)

    # Retrieve supporting documents and collapse them into one context string.
    docs = retriever.invoke(question)
    retrieved_context = "\n".join(d.page_content for d in docs)

    filled_prompt = rag_prompt.format(
        context=retrieved_context,
        question=question,
        conversation_history=history_text,
    )

    answer = llm.invoke(filled_prompt).content

    # Persist this exchange so follow-up questions see it.
    session_manager.add_interaction(session_id, question, answer)

    return answer
+
271
# Define the RAG memory stream function
def rag_memory_stream(message, history):
    """Gradio streaming handler: answer *message*, yielding it word by word.

    A lightweight session id is derived from the first non-empty user turn
    in *history* (hash of its first 20 characters), so an ongoing chat keeps
    its memory within this process; an empty history falls back to a shared
    default session.
    """
    session_id = None
    for turn in history:
        if turn[0]:
            # First non-empty user message keys the session.
            session_id = hash(turn[0][:20])
            break

    # Default session ID if history is empty.
    if session_id is None:
        session_id = "default_session"

    # Full answer is computed up front; streaming below is cosmetic.
    response = rag_chain(message, str(session_id))

    # Re-emit the answer incrementally so the UI shows it word by word.
    streamed = ""
    for word in response.split(' '):
        streamed += word + " "
        yield streamed.strip()
+
295
# Title with emojis
# NOTE(review): despite the comment, the title text contains no emoji.
title = "GBVR Chatbot"

# Custom CSS for styling the interface; passed to gr.ChatInterface(css=...).
# NOTE(review): the diff rendering strips indentation inside this string;
# the whitespace below is reconstructed, not verified.
custom_css = """
body {
    font-family: "Arial", serif;
}
.gradio-container {
    font-family: "Times New Roman", serif;
}
.gr-button {
    background-color: #007bff; /* Blue button */
    color: white;
    border: none;
    border-radius: 5px;
    font-size: 16px;
    padding: 10px 20px;
    cursor: pointer;
}
.gr-textbox:focus, .gr-button:focus {
    outline: none; /* Remove outline focus for a cleaner look */
}
"""
+
320
# Generate a dynamic welcome message using the LLM
def generate_welcome_message():
    """Ask the LLM for a short, supportive greeting used as the chat description."""
    welcome_prompt = """
    Generate a warm, friendly welcome message for a chatbot that focuses on helping users
    find information about Gender-Based Violence Resources in Rwanda. The message should:

    1. Introduce the chatbot's purpose clearly
    2. Be empathetic and supportive given the sensitive nature of the topic
    3. Encourage the user to ask questions
    4. Include 1-2 examples of questions they could ask
    5. Use a warm, friendly tone with 1-2 appropriate emojis
    6. Be concise (3-5 sentences)

    Your welcome message:
    """

    # One LLM round-trip; the reply text itself is the welcome message.
    return llm.invoke(welcome_prompt).content
+
340
# Create dynamic welcome message
# NOTE(review): this costs one LLM API call at startup, before the UI exists.
welcome_msg = generate_welcome_message()

# Create the Chat Interface with welcome message
demo = gr.ChatInterface(
    fn=rag_memory_stream,  # generator fn -> Gradio streams the yields
    title=title,
    fill_height=True,
    theme="soft",
    css=custom_css,  # Apply the custom CSS
    examples=["What services does Haguruka offer?", "How can I report a case of GBV?"],
    description=welcome_msg
)

# Launch the app
if __name__ == "__main__":
    # share/inbrowser/debug are local-run conveniences; presumably ignored
    # or redundant when hosted on Spaces — TODO confirm.
    demo.launch(share=True, inbrowser=True, debug=True)