File size: 11,917 Bytes
27a0883
007ac45
 
 
 
27a0883
 
007ac45
768a211
27a0883
007ac45
 
27a0883
007ac45
27a0883
 
65a884e
007ac45
27a0883
 
 
 
007ac45
27a0883
 
 
 
007ac45
27a0883
 
 
007ac45
27a0883
 
 
 
 
 
 
 
 
65a884e
007ac45
 
27a0883
768a211
007ac45
 
 
 
768a211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
007ac45
 
 
 
 
 
 
 
 
 
27a0883
 
007ac45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
768a211
2ed0c6d
007ac45
 
768a211
2ed0c6d
007ac45
 
 
 
 
 
 
768a211
 
 
 
 
27a0883
007ac45
 
 
768a211
2ed0c6d
768a211
 
007ac45
768a211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27a0883
768a211
27a0883
768a211
 
27a0883
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
768a211
007ac45
 
768a211
007ac45
768a211
007ac45
768a211
 
 
 
 
 
 
 
007ac45
2ed0c6d
007ac45
 
 
2ed0c6d
007ac45
 
 
 
 
 
 
 
 
 
 
 
 
 
27a0883
768a211
 
 
007ac45
 
 
768a211
007ac45
 
 
 
 
 
 
 
768a211
007ac45
 
 
 
 
 
 
 
768a211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
007ac45
768a211
 
 
 
 
 
 
 
2ed0c6d
27a0883
768a211
 
 
 
 
 
2ed0c6d
768a211
 
 
 
 
 
 
 
 
2ed0c6d
768a211
65a884e
007ac45
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
import os
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from urllib.parse import urljoin, urlparse
import requests
from io import BytesIO
from langchain_chroma import Chroma
import requests
from bs4 import BeautifulSoup
from langchain_core.prompts import ChatPromptTemplate
import gradio as gr
from PyPDF2 import PdfReader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Simple session management
# Lightweight in-memory store for per-session chat history.
class SessionManager:
    """Keeps a mapping of session_id -> list of {user, ai} interaction dicts."""

    def __init__(self):
        # session_id -> list of {"user": ..., "ai": ...} dicts
        self.sessions = {}

    def get_or_create_session(self, session_id):
        """Return the interaction list for session_id, creating it if absent."""
        return self.sessions.setdefault(session_id, [])

    def add_interaction(self, session_id, user_message, ai_response):
        """Append one user/assistant exchange to the session's history."""
        self.get_or_create_session(session_id).append(
            {"user": user_message, "ai": ai_response}
        )

    def get_history(self, session_id, max_turns=5):
        """Render the most recent max_turns exchanges as a plain-text transcript."""
        recent = self.get_or_create_session(session_id)[-max_turns:]
        parts = []
        for turn in recent:
            parts.append(f"User: {turn['user']}\n")
            parts.append(f"Assistant: {turn['ai']}\n\n")
        return "".join(parts).strip()

# Initialize session manager
# Module-level singleton holding per-session chat history (in memory only;
# lost on restart).
session_manager = SessionManager()

# Groq API key read from the GBV environment variable; None when unset,
# in which case the ChatGroq client below cannot authenticate.
groq_api_key= os.environ.get('GBV')

# Sentence-embedding model backing the Chroma vector store
# (downloads the HuggingFace model weights on first use).
embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")

def scrape_websites(base_urls):
    """Crawl each base URL plus its same-domain links and return their text.

    Args:
        base_urls: iterable of starting URLs; blank entries are skipped.

    Returns:
        dict mapping each visited URL to its extracted plain-text content
        (HTML pages are cleaned, PDFs have their text extracted).
        Returns {} when an unexpected error aborts the crawl.
    """
    try:
        visited_links = set()   # avoid fetching the same URL twice
        content_by_url = {}     # url -> extracted text

        for base_url in base_urls:
            if not base_url.strip():
                continue  # skip empty or whitespace-only entries

            print(f"Scraping base URL: {base_url}")
            html_content = fetch_page_content(base_url)
            if html_content:
                content_by_url[base_url] = clean_body_content(html_content)
                visited_links.add(base_url)

                # Follow every same-domain link found on the base page
                # (one level deep: links on sub-pages are not followed).
                soup = BeautifulSoup(html_content, "html.parser")
                for link in extract_internal_links(base_url, soup):
                    if link in visited_links:
                        continue

                    if link.lower().endswith('.pdf'):
                        # PDF: download once and extract its text directly.
                        # (Previously the link was first fetched and cleaned
                        # as HTML, then re-downloaded as a PDF — the first
                        # fetch was wasted and its result overwritten.)
                        print(f"Extracting PDF content from: {link}")
                        pdf_content = extract_pdf_text(link)
                        if pdf_content:
                            content_by_url[link] = pdf_content
                            visited_links.add(link)
                    else:
                        print(f"Scraping link: {link}")
                        page_content = fetch_page_content(link)
                        if page_content:
                            content_by_url[link] = clean_body_content(page_content)
                            visited_links.add(link)

        return content_by_url

    except Exception as e:
        # Best-effort crawl: report the failure and return nothing
        # rather than propagate.
        print(f"Error during scraping: {e}")
        return {}


def fetch_page_content(url):
    """GET a URL and return the response body as text, or None on failure."""
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Network errors and non-2xx statuses are logged, not raised.
        print(f"Error fetching {url}: {e}")
        return None
    return resp.text


def extract_internal_links(base_url, soup):
    """Return the set of same-domain absolute URLs linked from the page."""
    internal = set()
    for anchor in soup.find_all("a", href=True):
        # Resolve relative hrefs against the page URL before filtering.
        absolute = urljoin(base_url, anchor["href"])
        if not is_internal_link(base_url, absolute):
            continue
        internal.add(absolute)
    return internal


def is_internal_link(base_url, link_url):
    """True when link_url points at the same network host as base_url."""
    # Compare only the netloc (host[:port]); scheme and path are ignored,
    # so http vs https on the same host still counts as internal.
    return urlparse(link_url).netloc == urlparse(base_url).netloc


def extract_pdf_text(pdf_url):
    """Download a PDF and return its concatenated page text, or None.

    Returns None when the download fails, the PDF cannot be parsed, or no
    text could be extracted from any page.
    """
    try:
        # timeout added for consistency with fetch_page_content; the
        # original call could hang indefinitely on an unresponsive host.
        response = requests.get(pdf_url, timeout=10)
        response.raise_for_status()
        with BytesIO(response.content) as file:
            reader = PdfReader(file)
            # extract_text() can return None (e.g. image-only pages);
            # coerce to "" so one such page doesn't abort the whole
            # document with a TypeError.
            pdf_text = "".join(page.extract_text() or "" for page in reader.pages)

        return pdf_text if pdf_text else None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching PDF {pdf_url}: {e}")
        return None
    except Exception as e:
        print(f"Error reading PDF {pdf_url}: {e}")
        return None


def clean_body_content(html_content):
    """Strip scripts/styles from HTML and return its visible text.

    Output contains one non-empty, whitespace-trimmed line per line of text.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Drop non-visible content before extracting text.
    for tag in soup(["script", "style"]):
        tag.extract()

    raw_text = soup.get_text(separator="\n")
    stripped_lines = (line.strip() for line in raw_text.splitlines())
    return "\n".join(line for line in stripped_lines if line)


# (url, content) pairs scraped from the target site. Defined at module level
# because the processing loop below runs unconditionally on import; in the
# original code temp_list existed only inside the __main__ guard, so merely
# importing this file raised NameError. It is only populated when the file
# is run as a script.
temp_list = []

if __name__ == "__main__":
    website = ["https://haguruka.org.rw/country/social-cohesion-and-reconciliation/"
              
               ] 
    all_content = scrape_websites(website)

    for url, content in all_content.items():
        temp_list.append((url, content)) 

# Flatten the scraped data into "url: ..., content: ..." strings so each
# document carries its source URL into the vector store.
processed_texts = []

for element in temp_list:
    if isinstance(element, tuple):
        url, content = element  
        processed_texts.append(f"url: {url}, content: {content}")
    elif isinstance(element, str):
        processed_texts.append(element)
    else:
        # Defensive fallback for unexpected entry types.
        processed_texts.append(str(element))

def chunk_string(s, chunk_size=1000):
    """Split s into consecutive pieces of at most chunk_size characters."""
    chunks = []
    start = 0
    while start < len(s):
        chunks.append(s[start:start + chunk_size])
        start += chunk_size
    return chunks

# Every processed document, split into <=1000-char chunks for embedding.
chunked_texts = [chunk for text in processed_texts for chunk in chunk_string(text)]


# Persistent Chroma collection holding the embedded website chunks.
vectorstore = Chroma(
    collection_name="GBVR_Datst",
    embedding_function=embed_model,
    persist_directory="./",
)

# NOTE(review): the original code called `vectorstore.get().keys()` here and
# discarded the result; the no-op expression statement has been removed.

# Embed and index all scraped chunks (runs at import time).
vectorstore.add_texts(chunked_texts)

# Updated template to include conversation history.
# Placeholders filled by rag_chain(): {context} (retrieved documents,
# referenced several times), {question} (user query), and
# {conversation_history} (recent turns from SessionManager).
# NOTE(review): some emoji in the template render as mojibake (e.g. the
# rocket) — presumably an encoding slip in the original; left byte-identical.
template = ("""
    You are a friendly, intelligent, and conversational AI assistant designed to provide accurate, engaging, and human-like responses based on the given context. Your goal is to extract relevant details from the provided context: {context} and assist the user effectively. Follow these guidelines:

    1. **Warm & Natural Interaction**  
       - If the user greets you (e.g., "Hello," "Hi," "Good morning"), respond warmly and acknowledge them.  
       - Example responses:  
         - "😊 Good morning! How can I assist you today?"  
         - "Hello! What can I do for you? πŸš€"  

    2. **Precise Information Extraction**  
       - Provide only the relevant details from the given context: {context}.  
       - Do not generate extra content or assumptions beyond the provided information.  

    3. **Conversational & Engaging Tone**  
       - Keep responses friendly, natural, and engaging.  
       - Use occasional emojis (e.g., 😊, πŸš€) to make interactions more lively.  

    4. **Awareness of Real-Time Context**  
       - If necessary, acknowledge the current date and time to show awareness of real-world updates.  

    5. **Handling Missing Information**  
       - If no relevant information exists in the context, respond politely:  
         - "I don't have that information at the moment, but I'm happy to help with something else! 😊"  

    6. **Personalized Interaction**  
       - Use the conversation history to provide more personalized and contextually relevant responses.
       - Previous conversation history: {conversation_history}

    7. **Direct, Concise Responses**  
       - If the user requests specific data, provide only the requested details without unnecessary explanations unless asked.  

    8. **Extracting Relevant Links**  
       - If the user asks for a link related to their request `{question}`, extract the most relevant URL from `{context}` and provide it directly.  
       - Example response:  
         - "Here is the link you requested: [URL]"  

    **Context:** {context}  
    **User's Question:** {question}  
    **Your Response:**  
""")


# Wrap the raw template so it can be .format()-ed with named variables.
rag_prompt = PromptTemplate.from_template(template)

# Default similarity-search retriever over the scraped-content vector store.
retriever = vectorstore.as_retriever()

# Groq-hosted Llama 3.3 70B chat model; authenticates via the GBV env var.
llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=groq_api_key)

# Dictionary to store user sessions with session IDs
# NOTE(review): appears unused — rag_chain uses session_manager instead.
user_sessions = {}

# Define the RAG chain with session history
def rag_chain(question, session_id="default"):
    """Answer a question via retrieval-augmented generation.

    Retrieves relevant chunks, fills the prompt template with context and
    recent session history, queries the LLM, and records the exchange.
    """
    history = session_manager.get_history(session_id)

    # Join the retrieved documents into one context string for the prompt.
    docs = retriever.invoke(question)
    context = "\n".join(doc.page_content for doc in docs)

    filled_prompt = rag_prompt.format(
        context=context,
        question=question,
        conversation_history=history,
    )

    answer = llm.invoke(filled_prompt).content

    # Persist this turn so later calls see it in the conversation history.
    session_manager.add_interaction(session_id, question, answer)

    return answer

# Define the RAG memory stream function
def rag_memory_stream(message, history):
    """Gradio chat handler: answer `message`, streaming the reply word by word.

    A crude session ID is derived from the first user message found in
    `history`; a fresh conversation falls back to a fixed default ID.
    """
    # NOTE(review): hash() of a str is randomized per process, so the derived
    # session ID is only stable within a single server run — confirm this is
    # acceptable for session continuity.
    session_id = None
    for past in history:
        if past[0]:  # first entry that contains a user message
            if session_id is None:
                session_id = hash(past[0][:20])
            break

    if session_id is None:
        session_id = "default_session"

    response = rag_chain(message, str(session_id))

    # Yield progressively longer prefixes so the UI shows a typing effect.
    partial = ""
    for token in response.split(' '):
        partial += token + " "
        yield partial.strip()

# Title with emojis
# Shown as the heading of the Gradio ChatInterface below.
title = "GBVR Chatbot"

# Custom CSS for styling the interface
# NOTE(review): body declares "Arial", serif while .gradio-container uses
# "Times New Roman" — possibly intentional, but worth confirming.
custom_css = """
/* Custom CSS for styling the interface */
body {
    font-family: "Arial", serif;
}

.gradio-container {
    font-family: "Times New Roman", serif;
}

.gr-button {
    background-color: #007bff; /* Blue button */
    color: white;
    border: none;
    border-radius: 5px;
    font-size: 16px;
    padding: 10px 20px;
    cursor: pointer;
}

.gr-textbox:focus, .gr-button:focus {
    outline: none; /* Remove outline focus for a cleaner look */
}

/* Specific CSS for the welcome message */
.gradio-description {
    font-size: 30px; /* Set font size for the welcome message */
    font-family: "Arial", sans-serif;
    text-align: center; /* Optional: Center-align the text */
    padding: 20px; /* Optional: Add padding around the welcome message */
}

"""

# Generate a simple welcome message using the LLM
def generate_welcome_message():
    """Ask the LLM for a short, supportive greeting and return its text."""
    prompt = """
    Generate a short, simple welcome message for a chatbot about Gender-Based Violence Resources in Rwanda.
    Keep it under 3 sentences, and use simple language.
    Make it warm and supportive but direct and easy to read.
    """
    # Single one-shot completion; no retrieval or history involved.
    return llm.invoke(prompt).content

# Create simple welcome message
# NOTE(review): this calls the LLM at import time, so loading the module
# requires a valid Groq API key and network access — confirm intended.
welcome_msg = generate_welcome_message()

# Create the Chat Interface with welcome message
demo = gr.ChatInterface(
    fn=rag_memory_stream,  # streaming handler defined above
    title=title,
    fill_height=True,
    theme="soft",
    css=custom_css, # Apply the custom CSS
    description=welcome_msg  # LLM-generated greeting shown in the UI header
)

# Launch the app
# share=True exposes a public Gradio tunnel; debug=True blocks until exit.
if __name__ == "__main__":
    demo.launch(share=True, inbrowser=True, debug=True)