"""GBVR chatbot application.

Scrapes Gender-Based Violence resource pages (HTML plus linked PDFs), indexes
the cleaned text in a Chroma vector store, and serves a streaming RAG chat
interface through Gradio, with answers generated by a Groq-hosted Llama model.

NOTE(review): the vector store is built inside the first ``__main__`` guard,
so importing this module (rather than running it) would leave ``vectorstore``
undefined for the module-level ``retriever`` below — confirm intended usage.
"""

import os
from io import BytesIO
from urllib.parse import urljoin, urlparse

import gradio as gr
import requests
from bs4 import BeautifulSoup
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
# Re-import intentionally kept last so langchain_core's ChatPromptTemplate is
# the binding in effect, as in the original import order.
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from PyPDF2 import PdfReader


# Simple session management
class SessionManager:
    """In-memory, per-process store of chat history keyed by session id.

    No persistence and no eviction: histories live for the lifetime of the
    process.
    """

    def __init__(self):
        # Maps session_id -> list of {"user": ..., "ai": ...} dicts.
        self.sessions = {}

    def get_or_create_session(self, session_id):
        """Return the history list for ``session_id``, creating it if absent."""
        if session_id not in self.sessions:
            self.sessions[session_id] = []
        return self.sessions[session_id]

    def add_interaction(self, session_id, user_message, ai_response):
        """Append one user/assistant exchange to the session's history."""
        session = self.get_or_create_session(session_id)
        session.append({"user": user_message, "ai": ai_response})

    def get_history(self, session_id, max_turns=5):
        """Render the last ``max_turns`` exchanges as plain text for the prompt.

        Returns an empty string for a brand-new session.
        """
        session = self.get_or_create_session(session_id)
        # Slicing already copes with histories shorter than max_turns, so no
        # length check is needed.
        recent_history = session[-max_turns:]
        history_text = ""
        for interaction in recent_history:
            history_text += f"User: {interaction['user']}\n"
            history_text += f"Assistant: {interaction['ai']}\n\n"
        return history_text.strip()


# Initialize session manager
session_manager = SessionManager()

# Groq API key is read from the environment; None if the variable is unset.
groq_api_key = os.environ.get('GBV')

embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")


def scrape_websites(base_urls):
    """Crawl each base URL one level deep and return ``{url: text}``.

    For every base URL the page itself is scraped, then every *internal* link
    (same netloc) found on it is visited once. HTML pages are reduced to
    visible text; links ending in ``.pdf`` have their text extracted with
    PyPDF2 instead.

    Returns an empty dict if anything raises — errors are printed, not raised.
    """
    try:
        visited_links = set()  # To avoid revisiting the same link
        content_by_url = {}  # Store content from each URL

        for base_url in base_urls:
            if not base_url.strip():
                continue  # Skip empty or invalid URLs

            print(f"Scraping base URL: {base_url}")
            html_content = fetch_page_content(base_url)
            if not html_content:
                continue

            content_by_url[base_url] = clean_body_content(html_content)
            visited_links.add(base_url)

            # Extract and process all internal links
            soup = BeautifulSoup(html_content, "html.parser")
            for link in extract_internal_links(base_url, soup):
                if link in visited_links:
                    continue
                visited_links.add(link)

                if link.lower().endswith('.pdf'):
                    # FIX: branch on .pdf *before* fetching. The original
                    # fetched PDF URLs as text first, storing garbage from
                    # decoding binary data, before (maybe) overwriting it.
                    print(f"Extracting PDF content from: {link}")
                    pdf_content = extract_pdf_text(link)
                    if pdf_content:
                        content_by_url[link] = pdf_content
                else:
                    print(f"Scraping link: {link}")
                    page_content = fetch_page_content(link)
                    if page_content:
                        content_by_url[link] = clean_body_content(page_content)

        return content_by_url

    except Exception as e:
        # Best-effort crawler: report and return what we have (nothing).
        print(f"Error during scraping: {e}")
        return {}


def fetch_page_content(url):
    """GET ``url`` and return the decoded body, or None on any request error."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None


def extract_internal_links(base_url, soup):
    """Return the set of absolute same-site URLs linked from ``soup``."""
    links = set()
    for anchor in soup.find_all("a", href=True):
        # urljoin resolves relative hrefs against the page's base URL.
        full_url = urljoin(base_url, anchor["href"])
        if is_internal_link(base_url, full_url):
            links.add(full_url)
    return links


def is_internal_link(base_url, link_url):
    """True when both URLs share the same network location (host[:port])."""
    return urlparse(base_url).netloc == urlparse(link_url).netloc


def extract_pdf_text(pdf_url):
    """Download a PDF and return its concatenated page text, or None.

    Returns None when the download fails, the PDF cannot be parsed, or no
    text could be extracted (e.g. scanned/image-only pages).
    """
    try:
        # Timeout added for consistency with fetch_page_content.
        response = requests.get(pdf_url, timeout=10)
        response.raise_for_status()

        with BytesIO(response.content) as file:
            reader = PdfReader(file)
            # FIX: extract_text() may return None for image-only pages; the
            # original `pdf_text += page.extract_text()` raised TypeError.
            pdf_text = "".join(page.extract_text() or "" for page in reader.pages)

        return pdf_text if pdf_text else None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching PDF {pdf_url}: {e}")
        return None
    except Exception as e:
        print(f"Error reading PDF {pdf_url}: {e}")
        return None


def clean_body_content(html_content):
    """Strip scripts/styles from HTML and return the visible text.

    Blank lines and per-line surrounding whitespace are removed.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    for script_or_style in soup(["script", "style"]):
        script_or_style.extract()
    cleaned_content = soup.get_text(separator="\n")
    cleaned_content = "\n".join(
        line.strip() for line in cleaned_content.splitlines() if line.strip()
    )
    return cleaned_content


if __name__ == "__main__":
    website = ["https://haguruka.org.rw/country/social-cohesion-and-reconciliation/"]
    all_content = scrape_websites(website)

    # Flatten {url: content} into "url: ..., content: ..." strings for embedding.
    temp_list = list(all_content.items())
    processed_texts = []
    for element in temp_list:
        if isinstance(element, tuple):
            url, content = element
            processed_texts.append(f"url: {url}, content: {content}")
        elif isinstance(element, str):
            processed_texts.append(element)
        else:
            processed_texts.append(str(element))

    def chunk_string(s, chunk_size=1000):
        """Split ``s`` into fixed-size character chunks (last may be shorter)."""
        return [s[i:i + chunk_size] for i in range(0, len(s), chunk_size)]

    chunked_texts = []
    for text in processed_texts:
        chunked_texts.extend(chunk_string(text))

    # Collection name kept exactly as-is ("GBVR_Datst") so any previously
    # persisted collection is still found.
    vectorstore = Chroma(
        collection_name="GBVR_Datst",
        embedding_function=embed_model,
        persist_directory="./",
    )
    # Removed a no-op `vectorstore.get().keys()` call that loaded the whole
    # collection and discarded the result.
    vectorstore.add_texts(chunked_texts)

# Prompt template including retrieved context and conversation history.
template = ("""
You are a friendly, intelligent, and conversational AI assistant designed to provide accurate, engaging, and human-like responses based on the given context. Your goal is to extract relevant details from the provided context: {context} and assist the user effectively. Follow these guidelines:

1. **Warm & Natural Interaction**
   - If the user greets you (e.g., "Hello," "Hi," "Good morning"), respond warmly and acknowledge them.
   - Example responses:
     - "😊 Good morning! How can I assist you today?"
     - "Hello! What can I do for you? 🚀"

2. **Precise Information Extraction**
   - Provide only the relevant details from the given context: {context}.
   - Do not generate extra content or assumptions beyond the provided information.

3. **Conversational & Engaging Tone**
   - Keep responses friendly, natural, and engaging.
   - Use occasional emojis (e.g., 😊, 🚀) to make interactions more lively.

4. **Awareness of Real-Time Context**
   - If necessary, acknowledge the current date and time to show awareness of real-world updates.

5. **Handling Missing Information**
   - If no relevant information exists in the context, respond politely:
     - "I don't have that information at the moment, but I'm happy to help with something else! 😊"

6. **Personalized Interaction**
   - Use the conversation history to provide more personalized and contextually relevant responses.
   - Previous conversation history: {conversation_history}

7. **Direct, Concise Responses**
   - If the user requests specific data, provide only the requested details without unnecessary explanations unless asked.

8. **Extracting Relevant Links**
   - If the user asks for a link related to their request `{question}`, extract the most relevant URL from `{context}` and provide it directly.
   - Example response:
     - "Here is the link you requested: [URL]"

**Context:** {context}

**User's Question:** {question}

**Your Response:**
""")

rag_prompt = PromptTemplate.from_template(template)
retriever = vectorstore.as_retriever()
llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=groq_api_key)

# NOTE(review): this dict is never used — session state lives in
# session_manager. Kept to avoid breaking any external reference.
user_sessions = {}


def rag_chain(question, session_id="default"):
    """Answer ``question`` via retrieval-augmented generation.

    Retrieves relevant chunks, formats the prompt with the session's recent
    history, invokes the LLM, records the exchange, and returns the reply text.
    """
    conversation_history = session_manager.get_history(session_id)

    context_docs = retriever.invoke(question)
    context = "\n".join(doc.page_content for doc in context_docs)

    prompt = rag_prompt.format(
        context=context,
        question=question,
        conversation_history=conversation_history,
    )

    response = llm.invoke(prompt).content
    session_manager.add_interaction(session_id, question, response)
    return response


def rag_memory_stream(message, history):
    """Gradio streaming handler: yield the reply incrementally, word by word.

    The session id is derived from a hash of the first user message in
    ``history``. NOTE(review): Python's str hash is salted per process, so
    ids are stable only within one run; empty histories all share
    "default_session" — confirm this is acceptable. Assumes tuple-style
    history pairs (older Gradio format) — TODO confirm against the installed
    Gradio version, whose "messages" format uses dicts instead.
    """
    session_id = None
    for msg in history:
        if msg[0]:  # First non-empty user message found
            session_id = hash(msg[0][:20]) if session_id is None else session_id
            break
    if session_id is None:
        session_id = "default_session"

    response = rag_chain(message, str(session_id))

    # Stream the response word by word for a typing effect.
    partial_text = ""
    for word in response.split(' '):
        partial_text += word + " "
        yield partial_text.strip()


# Title with emojis
title = "GBVR Chatbot"

# Custom CSS for styling the interface
custom_css = """
/* Custom CSS for styling the interface */
body {
    font-family: "Arial", serif;
}
.gradio-container {
    font-family: "Times New Roman", serif;
}
.gr-button {
    background-color: #007bff; /* Blue button */
    color: white;
    border: none;
    border-radius: 5px;
    font-size: 16px;
    padding: 10px 20px;
    cursor: pointer;
}
.gr-textbox:focus, .gr-button:focus {
    outline: none; /* Remove outline focus for a cleaner look */
}
/* Specific CSS for the welcome message */
.gradio-description {
    font-size: 30px; /* Set font size for the welcome message */
    font-family: "Arial", sans-serif;
    text-align: center; /* Optional: Center-align the text */
    padding: 20px; /* Optional: Add padding around the welcome message */
}
"""


def generate_welcome_message():
    """Ask the LLM for a short, supportive greeting shown above the chat."""
    welcome_prompt = """
    Generate a short, simple welcome message for a chatbot about Gender-Based Violence Resources in Rwanda.
    Keep it under 3 sentences, and use simple language.
    Make it warm and supportive but direct and easy to read.
    """
    return llm.invoke(welcome_prompt).content


# Create simple welcome message
welcome_msg = generate_welcome_message()

# Create the Chat Interface with welcome message
demo = gr.ChatInterface(
    fn=rag_memory_stream,
    title=title,
    fill_height=True,
    theme="soft",
    css=custom_css,  # Apply the custom CSS
    description=welcome_msg,
)

# Launch the app
if __name__ == "__main__":
    demo.launch(share=True, inbrowser=True, debug=True)