File size: 11,917 Bytes
27a0883
007ac45
 
 
 
27a0883
 
007ac45
768a211
27a0883
007ac45
 
27a0883
007ac45
27a0883
 
65a884e
007ac45
27a0883
 
 
 
007ac45
27a0883
 
 
 
007ac45
27a0883
 
 
007ac45
27a0883
 
 
 
 
 
 
 
 
65a884e
007ac45
 
27a0883
768a211
007ac45
 
 
 
768a211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
007ac45
 
 
 
 
 
 
 
 
 
27a0883
 
007ac45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
768a211
2ed0c6d
007ac45
 
768a211
2ed0c6d
007ac45
 
 
 
 
 
 
768a211
 
 
 
 
27a0883
007ac45
 
 
768a211
2ed0c6d
768a211
 
007ac45
768a211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27a0883
768a211
27a0883
768a211
 
27a0883
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
768a211
007ac45
 
768a211
007ac45
768a211
007ac45
768a211
 
 
 
 
 
 
 
007ac45
2ed0c6d
007ac45
 
 
2ed0c6d
007ac45
 
 
 
 
 
 
 
 
 
 
 
 
 
27a0883
768a211
 
 
007ac45
 
 
768a211
007ac45
 
 
 
 
 
 
 
768a211
007ac45
 
 
 
 
 
 
 
768a211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
007ac45
768a211
 
 
 
 
 
 
 
2ed0c6d
27a0883
768a211
 
 
 
 
 
2ed0c6d
768a211
 
 
 
 
 
 
 
 
2ed0c6d
768a211
65a884e
007ac45
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
import os
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from urllib.parse import urljoin, urlparse
import requests
from io import BytesIO
from langchain_chroma import Chroma
import requests
from bs4 import BeautifulSoup
from langchain_core.prompts import ChatPromptTemplate
import gradio as gr
from PyPDF2 import PdfReader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Simple session management
# Lightweight in-memory store for per-session chat history.
class SessionManager:
    """Keeps a mapping of session_id -> list of {user, ai} interaction dicts."""

    def __init__(self):
        # session_id -> list of {"user": ..., "ai": ...} dicts
        self.sessions = {}

    def get_or_create_session(self, session_id):
        """Return the interaction list for session_id, creating it if absent."""
        return self.sessions.setdefault(session_id, [])

    def add_interaction(self, session_id, user_message, ai_response):
        """Append one user/assistant exchange to the session's history."""
        self.get_or_create_session(session_id).append(
            {"user": user_message, "ai": ai_response}
        )

    def get_history(self, session_id, max_turns=5):
        """Render the most recent max_turns exchanges as a plain-text transcript."""
        recent = self.get_or_create_session(session_id)[-max_turns:]
        parts = []
        for turn in recent:
            parts.append(f"User: {turn['user']}\n")
            parts.append(f"Assistant: {turn['ai']}\n\n")
        return "".join(parts).strip()

# Initialize session manager
# Module-level singleton holding per-session chat history (in memory only;
# lost on restart).
session_manager = SessionManager()

# Groq API key read from the GBV environment variable; None when unset,
# in which case the ChatGroq client below cannot authenticate.
groq_api_key= os.environ.get('GBV')

# Sentence-embedding model backing the Chroma vector store
# (downloads the HuggingFace model weights on first use).
embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")

def scrape_websites(base_urls):
    """Crawl each base URL plus its same-domain links and return their text.

    Args:
        base_urls: iterable of starting URLs; blank entries are skipped.

    Returns:
        dict mapping each visited URL to its extracted plain-text content
        (HTML pages are cleaned, PDFs have their text extracted).
        Returns {} when an unexpected error aborts the crawl.
    """
    try:
        visited_links = set()   # avoid fetching the same URL twice
        content_by_url = {}     # url -> extracted text

        for base_url in base_urls:
            if not base_url.strip():
                continue  # skip empty or whitespace-only entries

            print(f"Scraping base URL: {base_url}")
            html_content = fetch_page_content(base_url)
            if html_content:
                content_by_url[base_url] = clean_body_content(html_content)
                visited_links.add(base_url)

                # Follow every same-domain link found on the base page
                # (one level deep: links on sub-pages are not followed).
                soup = BeautifulSoup(html_content, "html.parser")
                for link in extract_internal_links(base_url, soup):
                    if link in visited_links:
                        continue

                    if link.lower().endswith('.pdf'):
                        # PDF: download once and extract its text directly.
                        # (Previously the link was first fetched and cleaned
                        # as HTML, then re-downloaded as a PDF — the first
                        # fetch was wasted and its result overwritten.)
                        print(f"Extracting PDF content from: {link}")
                        pdf_content = extract_pdf_text(link)
                        if pdf_content:
                            content_by_url[link] = pdf_content
                            visited_links.add(link)
                    else:
                        print(f"Scraping link: {link}")
                        page_content = fetch_page_content(link)
                        if page_content:
                            content_by_url[link] = clean_body_content(page_content)
                            visited_links.add(link)

        return content_by_url

    except Exception as e:
        # Best-effort crawl: report the failure and return nothing
        # rather than propagate.
        print(f"Error during scraping: {e}")
        return {}


def fetch_page_content(url):
    """GET a URL and return the response body as text, or None on failure."""
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Network errors and non-2xx statuses are logged, not raised.
        print(f"Error fetching {url}: {e}")
        return None
    return resp.text


def extract_internal_links(base_url, soup):
    """Return the set of same-domain absolute URLs linked from the page."""
    internal = set()
    for anchor in soup.find_all("a", href=True):
        # Resolve relative hrefs against the page URL before filtering.
        absolute = urljoin(base_url, anchor["href"])
        if not is_internal_link(base_url, absolute):
            continue
        internal.add(absolute)
    return internal


def is_internal_link(base_url, link_url):
    """True when link_url points at the same network host as base_url."""
    # Compare only the netloc (host[:port]); scheme and path are ignored,
    # so http vs https on the same host still counts as internal.
    return urlparse(link_url).netloc == urlparse(base_url).netloc


def extract_pdf_text(pdf_url):
    """Download a PDF and return its concatenated page text, or None.

    Returns None when the download fails, the PDF cannot be parsed, or no
    text could be extracted from any page.
    """
    try:
        # timeout added for consistency with fetch_page_content; the
        # original call could hang indefinitely on an unresponsive host.
        response = requests.get(pdf_url, timeout=10)
        response.raise_for_status()
        with BytesIO(response.content) as file:
            reader = PdfReader(file)
            # extract_text() can return None (e.g. image-only pages);
            # coerce to "" so one such page doesn't abort the whole
            # document with a TypeError.
            pdf_text = "".join(page.extract_text() or "" for page in reader.pages)

        return pdf_text if pdf_text else None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching PDF {pdf_url}: {e}")
        return None
    except Exception as e:
        print(f"Error reading PDF {pdf_url}: {e}")
        return None


def clean_body_content(html_content):
    """Strip scripts/styles from HTML and return its visible text.

    Output contains one non-empty, whitespace-trimmed line per line of text.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Drop non-visible content before extracting text.
    for tag in soup(["script", "style"]):
        tag.extract()

    raw_text = soup.get_text(separator="\n")
    stripped_lines = (line.strip() for line in raw_text.splitlines())
    return "\n".join(line for line in stripped_lines if line)


# (url, content) pairs scraped from the target site. Defined at module level
# because the processing loop below runs unconditionally on import; in the
# original code temp_list existed only inside the __main__ guard, so merely
# importing this file raised NameError. It is only populated when the file
# is run as a script.
temp_list = []

if __name__ == "__main__":
    website = ["https://haguruka.org.rw/country/social-cohesion-and-reconciliation/"
              
               ] 
    all_content = scrape_websites(website)

    for url, content in all_content.items():
        temp_list.append((url, content)) 

# Flatten the scraped data into "url: ..., content: ..." strings so each
# document carries its source URL into the vector store.
processed_texts = []

for element in temp_list:
    if isinstance(element, tuple):
        url, content = element  
        processed_texts.append(f"url: {url}, content: {content}")
    elif isinstance(element, str):
        processed_texts.append(element)
    else:
        # Defensive fallback for unexpected entry types.
        processed_texts.append(str(element))

def chunk_string(s, chunk_size=1000):
    """Split s into consecutive pieces of at most chunk_size characters."""
    chunks = []
    start = 0
    while start < len(s):
        chunks.append(s[start:start + chunk_size])
        start += chunk_size
    return chunks

# Every processed document, split into <=1000-char chunks for embedding.
chunked_texts = [chunk for text in processed_texts for chunk in chunk_string(text)]


# Persistent Chroma collection holding the embedded website chunks.
vectorstore = Chroma(
    collection_name="GBVR_Datst",
    embedding_function=embed_model,
    persist_directory="./",
)

# NOTE(review): the original code called `vectorstore.get().keys()` here and
# discarded the result; the no-op expression statement has been removed.

# Embed and index all scraped chunks (runs at import time).
vectorstore.add_texts(chunked_texts)

# Updated template to include conversation history.
# Placeholders filled by rag_chain(): {context} (retrieved documents,
# referenced several times), {question} (user query), and
# {conversation_history} (recent turns from SessionManager).
# NOTE(review): some emoji in the template render as mojibake (e.g. the
# rocket) — presumably an encoding slip in the original; left byte-identical.
template = ("""
    You are a friendly, intelligent, and conversational AI assistant designed to provide accurate, engaging, and human-like responses based on the given context. Your goal is to extract relevant details from the provided context: {context} and assist the user effectively. Follow these guidelines:

    1. **Warm & Natural Interaction**  
       - If the user greets you (e.g., "Hello," "Hi," "Good morning"), respond warmly and acknowledge them.  
       - Example responses:  
         - "😊 Good morning! How can I assist you today?"  
         - "Hello! What can I do for you? πŸš€"  

    2. **Precise Information Extraction**  
       - Provide only the relevant details from the given context: {context}.  
       - Do not generate extra content or assumptions beyond the provided information.  

    3. **Conversational & Engaging Tone**  
       - Keep responses friendly, natural, and engaging.  
       - Use occasional emojis (e.g., 😊, πŸš€) to make interactions more lively.  

    4. **Awareness of Real-Time Context**  
       - If necessary, acknowledge the current date and time to show awareness of real-world updates.  

    5. **Handling Missing Information**  
       - If no relevant information exists in the context, respond politely:  
         - "I don't have that information at the moment, but I'm happy to help with something else! 😊"  

    6. **Personalized Interaction**  
       - Use the conversation history to provide more personalized and contextually relevant responses.
       - Previous conversation history: {conversation_history}

    7. **Direct, Concise Responses**  
       - If the user requests specific data, provide only the requested details without unnecessary explanations unless asked.  

    8. **Extracting Relevant Links**  
       - If the user asks for a link related to their request `{question}`, extract the most relevant URL from `{context}` and provide it directly.  
       - Example response:  
         - "Here is the link you requested: [URL]"  

    **Context:** {context}  
    **User's Question:** {question}  
    **Your Response:**  
""")


# Wrap the raw template so it can be .format()-ed with named variables.
rag_prompt = PromptTemplate.from_template(template)

# Default similarity-search retriever over the scraped-content vector store.
retriever = vectorstore.as_retriever()

# Groq-hosted Llama 3.3 70B chat model; authenticates via the GBV env var.
llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=groq_api_key)

# Dictionary to store user sessions with session IDs
# NOTE(review): appears unused — rag_chain uses session_manager instead.
user_sessions = {}

# Define the RAG chain with session history
def rag_chain(question, session_id="default"):
    """Answer a question via retrieval-augmented generation.

    Retrieves relevant chunks, fills the prompt template with context and
    recent session history, queries the LLM, and records the exchange.
    """
    history = session_manager.get_history(session_id)

    # Join the retrieved documents into one context string for the prompt.
    docs = retriever.invoke(question)
    context = "\n".join(doc.page_content for doc in docs)

    filled_prompt = rag_prompt.format(
        context=context,
        question=question,
        conversation_history=history,
    )

    answer = llm.invoke(filled_prompt).content

    # Persist this turn so later calls see it in the conversation history.
    session_manager.add_interaction(session_id, question, answer)

    return answer

# Define the RAG memory stream function
def rag_memory_stream(message, history):
    """Gradio chat handler: answer `message`, streaming the reply word by word.

    A crude session ID is derived from the first user message found in
    `history`; a fresh conversation falls back to a fixed default ID.
    """
    # NOTE(review): hash() of a str is randomized per process, so the derived
    # session ID is only stable within a single server run — confirm this is
    # acceptable for session continuity.
    session_id = None
    for past in history:
        if past[0]:  # first entry that contains a user message
            if session_id is None:
                session_id = hash(past[0][:20])
            break

    if session_id is None:
        session_id = "default_session"

    response = rag_chain(message, str(session_id))

    # Yield progressively longer prefixes so the UI shows a typing effect.
    partial = ""
    for token in response.split(' '):
        partial += token + " "
        yield partial.strip()

# Title with emojis
# Shown as the heading of the Gradio ChatInterface below.
title = "GBVR Chatbot"

# Custom CSS for styling the interface
# NOTE(review): body declares "Arial", serif while .gradio-container uses
# "Times New Roman" — possibly intentional, but worth confirming.
custom_css = """
/* Custom CSS for styling the interface */
body {
    font-family: "Arial", serif;
}

.gradio-container {
    font-family: "Times New Roman", serif;
}

.gr-button {
    background-color: #007bff; /* Blue button */
    color: white;
    border: none;
    border-radius: 5px;
    font-size: 16px;
    padding: 10px 20px;
    cursor: pointer;
}

.gr-textbox:focus, .gr-button:focus {
    outline: none; /* Remove outline focus for a cleaner look */
}

/* Specific CSS for the welcome message */
.gradio-description {
    font-size: 30px; /* Set font size for the welcome message */
    font-family: "Arial", sans-serif;
    text-align: center; /* Optional: Center-align the text */
    padding: 20px; /* Optional: Add padding around the welcome message */
}

"""

# Generate a simple welcome message using the LLM
def generate_welcome_message():
    """Ask the LLM for a short, supportive greeting and return its text."""
    prompt = """
    Generate a short, simple welcome message for a chatbot about Gender-Based Violence Resources in Rwanda.
    Keep it under 3 sentences, and use simple language.
    Make it warm and supportive but direct and easy to read.
    """
    # Single one-shot completion; no retrieval or history involved.
    return llm.invoke(prompt).content

# Create simple welcome message
# NOTE(review): this calls the LLM at import time, so loading the module
# requires a valid Groq API key and network access — confirm intended.
welcome_msg = generate_welcome_message()

# Create the Chat Interface with welcome message
demo = gr.ChatInterface(
    fn=rag_memory_stream,  # streaming handler defined above
    title=title,
    fill_height=True,
    theme="soft",
    css=custom_css, # Apply the custom CSS
    description=welcome_msg  # LLM-generated greeting shown in the UI header
)

# Launch the app
# share=True exposes a public Gradio tunnel; debug=True blocks until exit.
if __name__ == "__main__":
    demo.launch(share=True, inbrowser=True, debug=True)