# NOTE(review): removed non-Python page residue that had been pasted above the
# imports (Hugging Face Space header: "Spaces", "Runtime error", file size,
# commit hashes, and a line-number gutter). It was not code and made the file
# unparseable.
import os
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from urllib.parse import urljoin, urlparse
import requests
from io import BytesIO
from langchain_chroma import Chroma
import requests
from bs4 import BeautifulSoup
from langchain_core.prompts import ChatPromptTemplate
import gradio as gr
from PyPDF2 import PdfReader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
# Simple session management
class SessionManager:
    """In-memory store of per-session chat history.

    Each session is a list of ``{"user": ..., "ai": ...}`` dicts keyed by an
    opaque session id. State lives only in this process: it is not persisted
    across restarts and no locking is done (single-process Gradio use).
    """

    def __init__(self):
        # session_id -> list of {"user": str, "ai": str} interaction dicts
        self.sessions = {}

    def get_or_create_session(self, session_id):
        """Return the interaction list for *session_id*, creating it if new."""
        return self.sessions.setdefault(session_id, [])

    def add_interaction(self, session_id, user_message, ai_response):
        """Append one user/assistant exchange to the session's history."""
        self.get_or_create_session(session_id).append(
            {"user": user_message, "ai": ai_response}
        )

    def get_history(self, session_id, max_turns=5):
        """Format the last *max_turns* exchanges as a plain-text transcript.

        Returns "" for a fresh/empty session.
        """
        # A negative-index slice already handles sessions shorter than
        # max_turns, so no explicit length check is needed; join() avoids
        # quadratic string concatenation.
        recent = self.get_or_create_session(session_id)[-max_turns:]
        parts = []
        for turn in recent:
            parts.append(f"User: {turn['user']}\n")
            parts.append(f"Assistant: {turn['ai']}\n\n")
        return "".join(parts).strip()
# Initialize session manager
# Module-level singleton holding all chat histories for this process.
session_manager = SessionManager()
# Groq API key is read from the 'GBV' environment variable; None when unset
# (ChatGroq construction below would then fail at call time).
groq_api_key= os.environ.get('GBV')
# Embedding model backing the Chroma vector store; the HuggingFace model is
# downloaded on first use.
embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
def scrape_websites(base_urls):
    """Scrape each base URL and every same-domain link found on it.

    Returns a dict mapping each visited URL to its extracted text: HTML pages
    are reduced to visible text via clean_body_content(); links ending in
    ``.pdf`` are text-extracted via extract_pdf_text(). Returns {} if an
    unexpected error aborts the crawl.
    """
    try:
        visited_links = set()   # avoid revisiting the same link
        content_by_url = {}     # url -> extracted text

        for base_url in base_urls:
            if not base_url.strip():
                continue  # skip empty or whitespace-only entries
            print(f"Scraping base URL: {base_url}")
            html_content = fetch_page_content(base_url)
            if not html_content:
                continue
            content_by_url[base_url] = clean_body_content(html_content)
            visited_links.add(base_url)

            # Walk all internal (same-domain) links on the base page.
            soup = BeautifulSoup(html_content, "html.parser")
            for link in extract_internal_links(base_url, soup):
                if link in visited_links:
                    continue
                if link.lower().endswith('.pdf'):
                    # Fix: fetch PDFs once through the PDF extractor. The
                    # original first downloaded the PDF as if it were HTML,
                    # stored binary junk, then fetched it a second time.
                    print(f"Extracting PDF content from: {link}")
                    pdf_content = extract_pdf_text(link)
                    if pdf_content:
                        content_by_url[link] = pdf_content
                    visited_links.add(link)
                    continue
                print(f"Scraping link: {link}")
                page_content = fetch_page_content(link)
                if page_content:
                    content_by_url[link] = clean_body_content(page_content)
                    visited_links.add(link)
        return content_by_url
    except Exception as e:
        # Best-effort crawl: report and return whatever shape the caller
        # expects (an empty mapping) rather than propagating.
        print(f"Error during scraping: {e}")
        return {}
def fetch_page_content(url):
    """GET *url* and return the response body as text.

    Returns None on any request failure (connection error, timeout,
    or a non-2xx status raised by raise_for_status).
    """
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"Error fetching {url}: {err}")
        return None
    return resp.text
def extract_internal_links(base_url, soup):
    """Return the set of absolute same-domain URLs linked from *soup*.

    Relative hrefs are resolved against *base_url*; anything pointing at a
    different host is dropped (see is_internal_link).
    """
    return {
        absolute
        for anchor in soup.find_all("a", href=True)
        if is_internal_link(
            base_url, (absolute := urljoin(base_url, anchor["href"]))
        )
    }
def is_internal_link(base_url, link_url):
    """True when *link_url* points at the same network host as *base_url*."""
    return urlparse(base_url).netloc == urlparse(link_url).netloc
def extract_pdf_text(pdf_url):
    """Download the PDF at *pdf_url* and return its concatenated page text.

    Returns None when the download fails, the PDF cannot be parsed, or no
    text could be extracted (e.g. a scanned, image-only PDF).
    """
    try:
        # Fix: add a timeout, consistent with fetch_page_content, so a stalled
        # server cannot hang the crawl indefinitely.
        response = requests.get(pdf_url, timeout=10)
        response.raise_for_status()
        with BytesIO(response.content) as file:
            reader = PdfReader(file)
            # Fix: extract_text() may return None for image-only pages;
            # coalesce to "" so concatenation cannot raise TypeError.
            pdf_text = "".join(
                (page.extract_text() or "") for page in reader.pages
            )
        return pdf_text if pdf_text else None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching PDF {pdf_url}: {e}")
        return None
    except Exception as e:
        print(f"Error reading PDF {pdf_url}: {e}")
        return None
def clean_body_content(html_content):
    """Strip scripts/styles from raw HTML and return its visible text.

    Output is one non-empty line per row, each with surrounding whitespace
    removed.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    # Remove non-visible elements before extracting text.
    for tag in soup(["script", "style"]):
        tag.extract()
    raw_lines = soup.get_text(separator="\n").splitlines()
    stripped = (line.strip() for line in raw_lines)
    return "\n".join(line for line in stripped if line)
if __name__ == "__main__":
    # One-off ingestion: scrape the site(s), chunk the text, and load it into
    # the Chroma collection that the retriever further down reads from.
    website = [
        "https://haguruka.org.rw/country/social-cohesion-and-reconciliation/"
    ]
    all_content = scrape_websites(website)

    # Flatten {url: content} into "url: ..., content: ..." strings so each
    # chunk carries its source URL alongside the page text. (The original
    # built an intermediate temp_list and branched on isinstance, but
    # all_content.items() always yields (url, content) tuples.)
    processed_texts = [
        f"url: {url}, content: {content}" for url, content in all_content.items()
    ]

    def chunk_string(s, chunk_size=1000):
        """Split *s* into fixed-size character chunks (last may be shorter)."""
        return [s[i:i + chunk_size] for i in range(0, len(s), chunk_size)]

    chunked_texts = []
    for text in processed_texts:
        chunked_texts.extend(chunk_string(text))

    # NOTE(review): persist_directory="./" writes the DB into the app root,
    # and re-running this block re-adds the same chunks (duplicates) — a
    # dedicated directory and an idempotent ingest would be safer.
    vectorstore = Chroma(
        collection_name="GBVR_Datst",
        embedding_function=embed_model,
        persist_directory="./",
    )
    # Removed the no-op `vectorstore.get().keys()` expression statement.
    vectorstore.add_texts(chunked_texts)
# Updated template to include conversation history
# Prompt template consumed by rag_chain(); placeholders: {context},
# {conversation_history}, {question}. Repeated {context} occurrences are
# filled with the same value by PromptTemplate.format.
# NOTE(review): the stray "π" characters below look like mis-encoded emoji
# from a copy/paste — confirm the intended emoji before changing them, since
# this string is sent to the model verbatim.
template = ("""
You are a friendly, intelligent, and conversational AI assistant designed to provide accurate, engaging, and human-like responses based on the given context. Your goal is to extract relevant details from the provided context: {context} and assist the user effectively. Follow these guidelines:
1. **Warm & Natural Interaction**
- If the user greets you (e.g., "Hello," "Hi," "Good morning"), respond warmly and acknowledge them.
- Example responses:
- "π Good morning! How can I assist you today?"
- "Hello! What can I do for you? π"
2. **Precise Information Extraction**
- Provide only the relevant details from the given context: {context}.
- Do not generate extra content or assumptions beyond the provided information.
3. **Conversational & Engaging Tone**
- Keep responses friendly, natural, and engaging.
- Use occasional emojis (e.g., π, π) to make interactions more lively.
4. **Awareness of Real-Time Context**
- If necessary, acknowledge the current date and time to show awareness of real-world updates.
5. **Handling Missing Information**
- If no relevant information exists in the context, respond politely:
- "I don't have that information at the moment, but I'm happy to help with something else! π"
6. **Personalized Interaction**
- Use the conversation history to provide more personalized and contextually relevant responses.
- Previous conversation history: {conversation_history}
7. **Direct, Concise Responses**
- If the user requests specific data, provide only the requested details without unnecessary explanations unless asked.
8. **Extracting Relevant Links**
- If the user asks for a link related to their request `{question}`, extract the most relevant URL from `{context}` and provide it directly.
- Example response:
- "Here is the link you requested: [URL]"
**Context:** {context}
**User's Question:** {question}
**Your Response:**
""")
# Compile the raw template into a LangChain PromptTemplate.
rag_prompt = PromptTemplate.from_template(template)
# Retriever over the Chroma collection populated in the __main__ block above.
# NOTE(review): `vectorstore` is only defined when the file runs as a script,
# so importing this module would raise NameError here — confirm intended.
retriever = vectorstore.as_retriever()
# Groq-hosted Llama 3.3 70B; api_key may be None if the GBV env var is unset.
llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=groq_api_key)
# Dictionary to store user sessions with session IDs
# NOTE(review): user_sessions is never read or written anywhere in this file —
# SessionManager above is the actual session store; consider removing.
user_sessions = {}
# Define the RAG chain with session history
def rag_chain(question, session_id="default"):
    """Answer *question* using retrieved context plus this session's history.

    Side effect: the question/answer pair is recorded in session_manager so
    later turns in the same session see it as conversation history.
    """
    # Prior turns for this session ("" for a fresh session).
    conversation_history = session_manager.get_history(session_id)

    # Retrieve supporting documents and flatten them into one context string.
    retrieved = retriever.invoke(question)
    context = "\n".join(doc.page_content for doc in retrieved)

    # Fill the prompt template and ask the model.
    filled_prompt = rag_prompt.format(
        context=context,
        question=question,
        conversation_history=conversation_history,
    )
    answer = llm.invoke(filled_prompt).content

    # Remember this exchange for future turns.
    session_manager.add_interaction(session_id, question, answer)
    return answer
# Define the RAG memory stream function
def rag_memory_stream(message, history):
    """Gradio streaming handler: yields the growing response word by word.

    A crude session id is derived from the first non-empty user message in
    *history* (hash of its first 20 characters); an empty history falls back
    to the shared "default_session" id.
    """
    session_id = "default_session"
    for past in history:
        if past[0]:  # first entry with a user message
            # NOTE(review): hash() is salted per interpreter run, so this id
            # is only stable within a single process lifetime.
            session_id = hash(past[0][:20])
            break

    answer = rag_chain(message, str(session_id))

    # Emit the answer incrementally so the UI shows typing-style output.
    accumulated = ""
    for word in answer.split(' '):
        accumulated += word + " "
        yield accumulated.strip()
# Chat UI title shown at the top of the Gradio interface.
title = "GBVR Chatbot"
# Custom CSS for styling the interface
# Passed verbatim to gr.ChatInterface(css=...); do not edit lightly — this
# string is injected into the rendered page as-is.
custom_css = """
/* Custom CSS for styling the interface */
body {
font-family: "Arial", serif;
}
.gradio-container {
font-family: "Times New Roman", serif;
}
.gr-button {
background-color: #007bff; /* Blue button */
color: white;
border: none;
border-radius: 5px;
font-size: 16px;
padding: 10px 20px;
cursor: pointer;
}
.gr-textbox:focus, .gr-button:focus {
outline: none; /* Remove outline focus for a cleaner look */
}
/* Specific CSS for the welcome message */
.gradio-description {
font-size: 30px; /* Set font size for the welcome message */
font-family: "Arial", sans-serif;
text-align: center; /* Optional: Center-align the text */
padding: 20px; /* Optional: Add padding around the welcome message */
}
"""
# Generate a simple welcome message using the LLM
def generate_welcome_message():
    """Ask the LLM for a short welcome blurb for the chat UI.

    Output is model-generated, so it differs between runs.
    """
    welcome_prompt = """
Generate a short, simple welcome message for a chatbot about Gender-Based Violence Resources in Rwanda.
Keep it under 3 sentences, and use simple language.
Make it warm and supportive but direct and easy to read.
"""
    return llm.invoke(welcome_prompt).content

# Create simple welcome message (computed once at startup).
welcome_msg = generate_welcome_message()
# Create the Chat Interface with welcome message
# rag_memory_stream is a generator, so Gradio streams the reply word-by-word.
demo = gr.ChatInterface(
fn=rag_memory_stream,
title=title,
fill_height=True,
theme="soft",
css=custom_css, # Apply the custom CSS
description=welcome_msg
)
# Launch the app
if __name__ == "__main__":
    # share=True publishes a temporary public URL; debug=True surfaces
    # server-side tracebacks in the console.
    demo.launch(share=True, inbrowser=True, debug=True)