Spaces:

Rabbit-Innotech
/

GBVR

Runtime error

App Files Files Community

Rabbit-Innotech committed on Apr 30, 2025

Commit

2ed0c6d

verified ·

1 Parent(s): 5d9d36a

Update app.py

Browse files

Files changed (1) hide show

app.py +329 -270

app.py CHANGED Viewed

@@ -1,35 +1,54 @@
 import os
-from langchain_groq import ChatGroq
-from langchain.prompts import ChatPromptTemplate, PromptTemplate
-from langchain.output_parsers import ResponseSchema, StructuredOutputParser
-from urllib.parse import urljoin, urlparse
 import requests
 from io import BytesIO
-from langchain_chroma import Chroma
-import requests
 from bs4 import BeautifulSoup
-from langchain_core.prompts import ChatPromptTemplate
-import gradio as gr
 from PyPDF2 import PdfReader
-from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnablePassthrough
-# Simple session management
 class SessionManager:
     def __init__(self):
         self.sessions = {}
-    def get_or_create_session(self, session_id):
         if session_id not in self.sessions:
             self.sessions[session_id] = []
         return self.sessions[session_id]
-    def add_interaction(self, session_id, user_message, ai_response):
         session = self.get_or_create_session(session_id)
         session.append({"user": user_message, "ai": ai_response})
-    def get_history(self, session_id, max_turns=5):
         session = self.get_or_create_session(session_id)
         recent_history = session[-max_turns:] if len(session) > max_turns else session
@@ -40,160 +59,172 @@ class SessionManager:
         return history_text.strip()
-# Initialize session manager
-session_manager = SessionManager()
-groq_api_key= os.environ.get('GBV')
-embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
-def scrape_websites(base_urls):
-    try:
-        visited_links = set()  # To avoid revisiting the same link
-        content_by_url = {}  # Store content from each URL
-        for base_url in base_urls:
-            if not base_url.strip():
-                continue  # Skip empty or invalid URLs
-            print(f"Scraping base URL: {base_url}")
-            html_content = fetch_page_content(base_url)
-            if html_content:
-                cleaned_content = clean_body_content(html_content)
-                content_by_url[base_url] = cleaned_content
-                visited_links.add(base_url)
-                # Extract and process all internal links
-                soup = BeautifulSoup(html_content, "html.parser")
-                links = extract_internal_links(base_url, soup)
-                for link in links:
-                    if link not in visited_links:
-                        print(f"Scraping link: {link}")
-                        page_content = fetch_page_content(link)
-                        if page_content:
-                            cleaned_content = clean_body_content(page_content)
-                            content_by_url[link] = cleaned_content
-                            visited_links.add(link)
-                        # If the link is a PDF file, extract its content
-                        if link.lower().endswith('.pdf'):
-                            print(f"Extracting PDF content from: {link}")
-                            pdf_content = extract_pdf_text(link)
-                            if pdf_content:
-                                content_by_url[link] = pdf_content
-        return content_by_url
-    except Exception as e:
-        print(f"Error during scraping: {e}")
-        return {}
-def fetch_page_content(url):
-    try:
-        response = requests.get(url, timeout=10)
-        response.raise_for_status()
-        return response.text
-    except requests.exceptions.RequestException as e:
-        print(f"Error fetching {url}: {e}")
-        return None
-def extract_internal_links(base_url, soup):
-    links = set()
-    for anchor in soup.find_all("a", href=True):
-        href = anchor["href"]
-        full_url = urljoin(base_url, href)
-        if is_internal_link(base_url, full_url):
-            links.add(full_url)
-    return links
-def is_internal_link(base_url, link_url):
-    base_netloc = urlparse(base_url).netloc
-    link_netloc = urlparse(link_url).netloc
-    return base_netloc == link_netloc
-def extract_pdf_text(pdf_url):
-    try:
-        response = requests.get(pdf_url)
-        response.raise_for_status()
-        with BytesIO(response.content) as file:
-            reader = PdfReader(file)
-            pdf_text = ""
-            for page in reader.pages:
-                pdf_text += page.extract_text()
-        return pdf_text if pdf_text else None
-    except requests.exceptions.RequestException as e:
-        print(f"Error fetching PDF {pdf_url}: {e}")
-        return None
-    except Exception as e:
-        print(f"Error reading PDF {pdf_url}: {e}")
-        return None
-def clean_body_content(html_content):
-    soup = BeautifulSoup(html_content, "html.parser")
-    for script_or_style in soup(["script", "style"]):
-        script_or_style.extract()
-    cleaned_content = soup.get_text(separator="\n")
-    cleaned_content = "\n".join(
-        line.strip() for line in cleaned_content.splitlines() if line.strip()
-    )
-    return cleaned_content
-if __name__ == "__main__":
-    website = ["https://haguruka.org.rw/country/social-cohesion-and-reconciliation/"
-               ]
-    all_content = scrape_websites(website)
-    temp_list = []
-    for url, content in all_content.items():
-        temp_list.append((url, content))
-processed_texts = []
-for element in temp_list:
-    if isinstance(element, tuple):
-        url, content = element
-        processed_texts.append(f"url: {url}, content: {content}")
-    elif isinstance(element, str):
-        processed_texts.append(element)
-    else:
-        processed_texts.append(str(element))
-def chunk_string(s, chunk_size=1000):
-    return [s[i:i+chunk_size] for i in range(0, len(s), chunk_size)]
-chunked_texts = []
-for text in processed_texts:
-  chunked_texts.extend(chunk_string(text))
-vectorstore = Chroma(
-    collection_name="GBVR_Dataset",
-    embedding_function=embed_model,
-    persist_directory="./",
-)
-vectorstore.get().keys()
-vectorstore.add_texts(chunked_texts)
-# Updated template to include conversation history
-template = ("""
     You are a friendly, intelligent, and conversational AI assistant designed to provide accurate, engaging, and human-like responses based on the given context. Your goal is to extract relevant details from the provided context: {context} and assist the user effectively. Follow these guidelines:
     1. **Warm & Natural Interaction**
@@ -232,129 +263,157 @@ template = ("""
     **Context:** {context}
     **User's Question:** {question}
     **Your Response:**
-""")
-rag_prompt = PromptTemplate.from_template(template)
-retriever = vectorstore.as_retriever()
-llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=groq_api_key)
-# Dictionary to store user sessions with session IDs
-user_sessions = {}
-# Define the RAG chain with session history
-def rag_chain(question, session_id="default"):
-    # Get conversation history if available
-    conversation_history = session_manager.get_history(session_id)
-    # Get context from retriever
-    context_docs = retriever.invoke(question)
-    context = "\n".join(doc.page_content for doc in context_docs)
-    # Create prompt with history
-    prompt = rag_prompt.format(
-        context=context,
-        question=question,
-        conversation_history=conversation_history
-    )
-    # Generate response
-    response = llm.invoke(prompt).content
-    # Store the interaction
-    session_manager.add_interaction(session_id, question, response)
-    return response
-# Define the RAG memory stream function
-def rag_memory_stream(message, history):
-    # Generate a session ID based on the first message if not exists
-    session_id = None
-    for msg in history:
-        if msg[0]:  # If there's a user message
-            # Use first few characters of first message as simple session ID
-            session_id = hash(msg[0][:20]) if session_id is None else session_id
-            break
-    # Default session ID if history is empty
-    if session_id is None:
-        session_id = "default_session"
-    # Process the message and get response
-    response = rag_chain(message, str(session_id))
-    # Stream the response word by word
-    partial_text = ""
-    words = response.split(' ')
-    for word in words:
-        partial_text += word + " "
-        yield partial_text.strip()
-# Title with emojis
-title = "GBVR Chatbot"
-# Custom CSS for styling the interface
-custom_css = """
-/* Custom CSS for styling the interface */
-body {
-    font-family: "Arial", serif;
-}
-.gradio-container {
-    font-family: "Times New Roman", serif;
-}
-.gr-button {
-    background-color: #007bff; /* Blue button */
-    color: white;
-    border: none;
-    border-radius: 5px;
-    font-size: 16px;
-    padding: 10px 20px;
-    cursor: pointer;
-}
-.gr-textbox:focus, .gr-button:focus {
-    outline: none; /* Remove outline focus for a cleaner look */
-}
-/* Specific CSS for the welcome message */
-.gradio-description {
-    font-size: 20px; /* Set font size for the welcome message */
-    font-family: "Arial", sans-serif;
-    text-align: center; /* Optional: Center-align the text */
-    padding: 20px; /* Optional: Add padding around the welcome message */
-}
-"""
-# Generate a simple welcome message using the LLM
-def generate_welcome_message():
-    welcome_prompt = """
-    Create a short, simple welcome message for a chatbot about Gender-Based Violence Resources in Rwanda.
-    Keep it under 3 sentences, use simple language, and include one emoji.
-    Make it warm and supportive but direct and easy to read.
-    """
-    # Get the welcome message from the LLM
-    welcome_message = llm.invoke(welcome_prompt).content
-    return welcome_message
-# Create simple welcome message
-welcome_msg = generate_welcome_message()
-# Create the Chat Interface with welcome message
-demo = gr.ChatInterface(
-    fn=rag_memory_stream,
-    title=title,
-    fill_height=True,
-    theme="soft",
-    css=custom_css, # Apply the custom CSS
-    description=welcome_msg
-)
-# Launch the app
 if __name__ == "__main__":
-    demo.launch(share=True, inbrowser=True, debug=True)

 import os
 import requests
 from io import BytesIO
+from urllib.parse import urljoin, urlparse
+from typing import Dict, List, Set, Tuple, Optional, Union
+# Libraries for web scraping and text processing
 from bs4 import BeautifulSoup
 from PyPDF2 import PdfReader
+# LangChain imports
+from langchain_groq import ChatGroq
+from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnablePassthrough
+from langchain_chroma import Chroma
+from langchain_huggingface import HuggingFaceEmbeddings
+# Gradio import for the user interface
+import gradio as gr
+# Configuration settings
+GROQ_API_KEY = os.environ.get('GBV')
+EMBED_MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"
+LLM_MODEL_NAME = "llama-3.3-70b-versatile"
+CHUNK_SIZE = 1000
+VECTOR_DB_COLLECTION = "GBVR_Dataset"
+VECTOR_DB_PERSIST_DIR = "./"
+DEFAULT_SESSION_ID = "default_session"
+MAX_HISTORY_TURNS = 5
 class SessionManager:
+    """Manages chat sessions and conversation history."""
     def __init__(self):
         self.sessions = {}
+    def get_or_create_session(self, session_id: str) -> List[Dict[str, str]]:
+        """Get existing session or create a new one."""
         if session_id not in self.sessions:
             self.sessions[session_id] = []
         return self.sessions[session_id]
+    def add_interaction(self, session_id: str, user_message: str, ai_response: str) -> None:
+        """Add user-AI interaction to the session history."""
         session = self.get_or_create_session(session_id)
         session.append({"user": user_message, "ai": ai_response})
+    def get_history(self, session_id: str, max_turns: int = MAX_HISTORY_TURNS) -> str:
+        """Get formatted conversation history."""
         session = self.get_or_create_session(session_id)
         recent_history = session[-max_turns:] if len(session) > max_turns else session
         return history_text.strip()
class WebScraper:
    """Fetch web pages and PDFs and reduce them to plain text.

    The class is a stateless namespace: every member is a static or class
    method, so callers use it as ``WebScraper.scrape_websites(urls)``.
    """

    # Seconds to wait on any single HTTP request before giving up.
    REQUEST_TIMEOUT = 10

    @staticmethod
    def fetch_page_content(url: str) -> Optional[str]:
        """Fetch the body of ``url`` as text.

        Returns:
            The response text, or ``None`` when the request fails
            (any ``requests`` exception, including a non-2xx status).
        """
        try:
            response = requests.get(url, timeout=WebScraper.REQUEST_TIMEOUT)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None

    @staticmethod
    def extract_internal_links(base_url: str, soup: "BeautifulSoup") -> Set[str]:
        """Collect the absolute URLs of all same-host links on a page.

        Args:
            base_url: URL the page was fetched from; relative hrefs are
                resolved against it.
            soup: Parsed page.

        Returns:
            The set of internal link URLs with any ``#fragment`` removed.
        """
        links = set()
        for anchor in soup.find_all("a", href=True):
            # Drop the fragment: "page#a" and "page#b" are the same document
            # and would otherwise be downloaded once per anchor.
            full_url = urljoin(base_url, anchor["href"]).split("#", 1)[0]
            if full_url and WebScraper.is_internal_link(base_url, full_url):
                links.add(full_url)
        return links

    @staticmethod
    def is_internal_link(base_url: str, link_url: str) -> bool:
        """Return True when both URLs have the same network location."""
        return urlparse(base_url).netloc == urlparse(link_url).netloc

    @staticmethod
    def _is_pdf_link(url: str) -> bool:
        """Return True when the URL path ends in ``.pdf`` (query ignored)."""
        return urlparse(url).path.lower().endswith(".pdf")

    @staticmethod
    def extract_pdf_text(pdf_url: str) -> Optional[str]:
        """Download a PDF and return the concatenated text of its pages.

        Returns:
            The extracted text, or ``None`` when the download fails, the
            file cannot be read, or no text could be extracted.
        """
        try:
            # Same timeout as HTML fetches so a stalled server cannot hang
            # the whole scrape.
            response = requests.get(pdf_url, timeout=WebScraper.REQUEST_TIMEOUT)
            response.raise_for_status()
            with BytesIO(response.content) as file:
                reader = PdfReader(file)
                # "or ''" keeps a page that yields no text from aborting the
                # extraction of every other page.
                pdf_text = "".join(
                    page.extract_text() or "" for page in reader.pages
                )
            return pdf_text or None
        except requests.exceptions.RequestException as e:
            print(f"Error fetching PDF {pdf_url}: {e}")
            return None
        except Exception as e:
            print(f"Error reading PDF {pdf_url}: {e}")
            return None

    @staticmethod
    def clean_body_content(html_content: str) -> str:
        """Strip scripts, styles and blank lines from an HTML document.

        Returns:
            The visible text, one non-empty stripped line per line.
        """
        soup = BeautifulSoup(html_content, "html.parser")
        # Remove script and style elements so their source is not indexed.
        for script_or_style in soup(["script", "style"]):
            script_or_style.extract()
        raw_text = soup.get_text(separator="\n")
        return "\n".join(
            line.strip() for line in raw_text.splitlines() if line.strip()
        )

    @classmethod
    def _scrape_link(cls, link: str) -> Optional[str]:
        """Return the plain text behind one link, or ``None`` on failure."""
        if cls._is_pdf_link(link):
            # PDFs go straight to the PDF extractor; decoding their bytes as
            # HTML first only produces garbage text.
            print(f"Extracting PDF content from: {link}")
            return cls.extract_pdf_text(link)
        print(f"Scraping link: {link}")
        page_content = cls.fetch_page_content(link)
        if not page_content:
            return None
        return cls.clean_body_content(page_content)

    @classmethod
    def scrape_websites(cls, base_urls: List[str]) -> Dict[str, str]:
        """Scrape each base URL and the internal links found on it.

        Only one level of links is followed. The scrape is best-effort: a
        page that fails is logged and skipped, and everything collected so
        far is still returned.

        Args:
            base_urls: Starting URLs; blank entries are ignored.

        Returns:
            Mapping of URL to the extracted text of that page or PDF.
        """
        visited_links: Set[str] = set()
        content_by_url: Dict[str, str] = {}

        for raw_url in base_urls:
            base_url = raw_url.strip()
            if not base_url or base_url in visited_links:
                continue
            visited_links.add(base_url)

            print(f"Scraping base URL: {base_url}")
            html_content = cls.fetch_page_content(base_url)
            if not html_content:
                continue

            try:
                content_by_url[base_url] = cls.clean_body_content(html_content)
                soup = BeautifulSoup(html_content, "html.parser")
                links = cls.extract_internal_links(base_url, soup)
            except Exception as e:
                print(f"Error during scraping: {e}")
                continue

            # Sorted only to make the crawl order (and the log) reproducible.
            for link in sorted(links):
                if link in visited_links:
                    continue
                # Mark before fetching so a failing link is not retried when
                # another base page links to it as well.
                visited_links.add(link)
                try:
                    text = cls._scrape_link(link)
                except Exception as e:
                    print(f"Error during scraping: {e}")
                    continue
                if text is not None:
                    content_by_url[link] = text

        return content_by_url
class TextProcessor:
    """Turn scraped (url, content) pairs into fixed-size text chunks."""

    @staticmethod
    def process_content_tuples(content_tuples: List[Tuple[str, str]]) -> List[str]:
        """Format each ``(url, content)`` pair as one labelled string.

        Returns:
            One ``"url: <url>, content: <content>"`` string per input pair,
            in input order.
        """
        return [f"url: {url}, content: {content}" for url, content in content_tuples]

    @staticmethod
    def chunk_string(text: str, chunk_size: int = CHUNK_SIZE) -> List[str]:
        """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

        Args:
            text: String to split; an empty string yields an empty list.
            chunk_size: Maximum characters per chunk; must be positive.

        Raises:
            ValueError: If ``chunk_size`` is zero or negative. A negative
                step would otherwise make ``range`` empty and silently
                drop the whole text.
        """
        if chunk_size <= 0:
            raise ValueError(
                f"chunk_size must be a positive integer, got {chunk_size}"
            )
        return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    @classmethod
    def chunk_texts(cls, texts: List[str], chunk_size: int = CHUNK_SIZE) -> List[str]:
        """Chunk every text and return all chunks as one flat list."""
        return [
            chunk
            for text in texts
            for chunk in cls.chunk_string(text, chunk_size)
        ]
class VectorStore:
    """Wraps a Chroma collection together with its embedding model."""

    def __init__(self, collection_name: str, persist_directory: str):
        """Open (or create) the collection under ``persist_directory``.

        Args:
            collection_name: Name of the Chroma collection.
            persist_directory: Directory handed to Chroma as its
                ``persist_directory``.
        """
        self.embed_model = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)
        self.vectorstore = Chroma(
            collection_name=collection_name,
            embedding_function=self.embed_model,
            persist_directory=persist_directory,
        )

    def add_texts(self, texts: List[str]) -> None:
        """Add texts to the vector store, skipping blanks and duplicates.

        Each text is stored under an id derived from its content. The
        collection is persisted and the application re-adds its scraped
        chunks on every start, so random ids would pile up one more copy of
        every chunk per launch.
        NOTE(review): relies on ``Chroma.add_texts`` accepting ``ids`` and
        overwriting entries whose id already exists - confirm against the
        installed langchain_chroma version.

        Args:
            texts: Strings to embed and store. An empty list (or a list of
                empty strings) is a no-op.
        """
        import hashlib

        # dict keeps first-seen order and removes repeats within this batch,
        # so the same id is never submitted twice in one call.
        unique_texts = {}
        for text in texts:
            if text:
                digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
                unique_texts.setdefault(digest, text)

        if not unique_texts:
            return

        self.vectorstore.add_texts(
            list(unique_texts.values()),
            ids=list(unique_texts.keys()),
        )

    def get_retriever(self):
        """Return the retriever produced by the store's ``as_retriever()``."""
        return self.vectorstore.as_retriever()
class ChatbotRAG:
    """Manages the Retrieval-Augmented Generation (RAG) chatbot.

    Holds the LLM client, the prompt template, the per-session conversation
    history and (optionally) the retriever used to look up context.
    """

    # NOTE(review): only the head and tail of this template are visible in
    # the reviewed diff; the guideline list in between (and any
    # {conversation_history} placeholder it may contain) is elided there and
    # must be kept from the full file.
    PROMPT_TEMPLATE = """
    You are a friendly, intelligent, and conversational AI assistant designed to provide accurate, engaging, and human-like responses based on the given context. Your goal is to extract relevant details from the provided context: {context} and assist the user effectively. Follow these guidelines:
    1. **Warm & Natural Interaction**
    **Context:** {context}
    **User's Question:** {question}
    **Your Response:**
    """

    # Used when the LLM cannot be reached to write a greeting.
    FALLBACK_WELCOME_MESSAGE = (
        "Welcome! I can help you find information about Gender-Based "
        "Violence resources in Rwanda. How can I support you today?"
    )

    def __init__(self, api_key: str, model_name: str, retriever=None):
        """Create the LLM client, prompt and session store.

        Args:
            api_key: Groq API key passed to ``ChatGroq``.
            model_name: Groq model identifier.
            retriever: Optional retriever used by ``streaming_response``.
                It may also be assigned later through the ``retriever``
                attribute, which is now always defined.
        """
        self.llm = ChatGroq(model=model_name, api_key=api_key)
        self.rag_prompt = PromptTemplate.from_template(self.PROMPT_TEMPLATE)
        self.session_manager = SessionManager()
        self.retriever = retriever

    def generate_welcome_message(self) -> str:
        """Ask the LLM for a short welcome message.

        Returns:
            The generated greeting, or ``FALLBACK_WELCOME_MESSAGE`` when the
            LLM call fails. The greeting is cosmetic, so a failure here
            should not stop the application from starting.
        """
        welcome_prompt = """
        Generate a short, simple welcome message for a chatbot about Gender-Based Violence Resources in Rwanda.
        Keep it under 3 sentences, and use simple language.
        Make it warm and supportive but direct and easy to read.
        """
        try:
            return self.llm.invoke(welcome_prompt).content
        except Exception as e:
            print(f"Error generating welcome message: {e}")
            return self.FALLBACK_WELCOME_MESSAGE

    def process_query(self, question: str, retriever, session_id: str = DEFAULT_SESSION_ID) -> str:
        """Answer ``question`` with retrieved context and record the exchange.

        Args:
            question: The user's message.
            retriever: Object whose ``invoke(question)`` returns documents
                exposing ``page_content``.
            session_id: Key of the conversation in the session manager.

        Returns:
            The text content of the LLM response.
        """
        # History is read before the new turn is stored, so it holds only
        # the earlier exchanges.
        conversation_history = self.session_manager.get_history(session_id)

        context_docs = retriever.invoke(question)
        context = "\n".join(doc.page_content for doc in context_docs)

        prompt = self.rag_prompt.format(
            context=context,
            question=question,
            conversation_history=conversation_history
        )

        response = self.llm.invoke(prompt).content
        self.session_manager.add_interaction(session_id, question, response)
        return response

    @staticmethod
    def _first_user_message(message, history):
        """Return the message that opened the conversation.

        ``history`` entries are read either as ``[user, bot]`` pairs or as
        ``{"role": ..., "content": ...}`` dicts.
        NOTE(review): which shape Gradio passes depends on the
        ChatInterface configuration - confirm for this app.
        When the history holds no user message yet, the current ``message``
        is the opening one.
        """
        for turn in history or []:
            if isinstance(turn, dict):
                if turn.get("role") == "user" and turn.get("content"):
                    return turn["content"]
            elif turn and turn[0]:
                return turn[0]
        return message

    def streaming_response(self, message: str, history):
        """Answer ``message`` and stream the reply word by word.

        The session id comes from the first 20 characters of the
        conversation's opening user message. Including the current message
        when the history is empty keeps the first turn in the same session
        as the later ones, rather than in one default session shared by
        everyone.
        NOTE(review): conversations that open with the same text still share
        a session; a per-client id would be needed to separate them.

        Args:
            message: The new user message.
            history: Earlier turns as supplied by Gradio.

        Yields:
            The reply so far, growing by one space-separated word each step.

        Raises:
            RuntimeError: If no retriever has been configured.
        """
        if self.retriever is None:
            raise RuntimeError(
                "ChatbotRAG.retriever must be set before answering queries"
            )

        opening_message = self._first_user_message(message, history)
        if opening_message:
            session_id = str(hash(str(opening_message)[:20]))
        else:
            session_id = DEFAULT_SESSION_ID

        response = self.process_query(message, self.retriever, session_id)

        partial_text = ""
        for word in response.split(' '):
            partial_text += word + " "
            yield partial_text.strip()
class ChatbotUI:
    """Gradio front end for a chatbot object.

    The chatbot must provide ``generate_welcome_message()`` and
    ``streaming_response``; both are used as-is.
    """

    CUSTOM_CSS = """
    /* Custom CSS for styling the interface */
    body {
        font-family: "Arial", serif;
    }
    .gradio-container {
        font-family: "Times New Roman", serif;
    }
    .gr-button {
        background-color: #007bff; /* Blue button */
        color: white;
        border: none;
        border-radius: 5px;
        font-size: 16px;
        padding: 10px 20px;
        cursor: pointer;
    }
    .gr-textbox:focus, .gr-button:focus {
        outline: none; /* Remove outline focus for a cleaner look */
    }
    /* Specific CSS for the welcome message */
    .gradio-description {
        font-size: 30px; /* Set font size for the welcome message */
        font-family: "Arial", sans-serif;
        text-align: center; /* Optional: Center-align the text */
        padding: 20px; /* Optional: Add padding around the welcome message */
    }
    """

    def __init__(self, chatbot_rag):
        """Keep the chatbot and prepare the texts shown in the interface.

        The welcome message is requested from the chatbot once, here, and
        reused as the interface description.
        """
        self.chatbot_rag = chatbot_rag
        self.title = "GBVR Chatbot"
        self.welcome_msg = chatbot_rag.generate_welcome_message()

    def create_interface(self):
        """Build the ``gr.ChatInterface`` wired to the chatbot's streamer."""
        interface_options = {
            "fn": self.chatbot_rag.streaming_response,
            "title": self.title,
            "fill_height": True,
            "theme": "soft",
            "css": self.CUSTOM_CSS,
            "description": self.welcome_msg,
        }
        return gr.ChatInterface(**interface_options)
def main():
    """Scrape the source site, index it, and launch the chatbot UI.

    Steps: scrape the target websites, format and chunk the text, load the
    chunks into the vector store, build the RAG chatbot around the store's
    retriever, then start the Gradio interface.
    """
    # Target websites to scrape.
    websites = ["https://haguruka.org.rw/country/social-cohesion-and-reconciliation/"]

    content_by_url = WebScraper.scrape_websites(websites)

    # (url, content) pairs -> labelled strings -> fixed-size chunks.
    processed_texts = TextProcessor.process_content_tuples(
        list(content_by_url.items())
    )
    chunked_texts = TextProcessor.chunk_texts(processed_texts)

    vector_store = VectorStore(VECTOR_DB_COLLECTION, VECTOR_DB_PERSIST_DIR)
    if chunked_texts:
        vector_store.add_texts(chunked_texts)
    else:
        # The scraper returns {} when it fails; do not hand the store an
        # empty batch, and make the reason for poor answers visible.
        print("Warning: no content was scraped; the vector store was not updated.")
    retriever = vector_store.get_retriever()

    chatbot_rag = ChatbotRAG(GROQ_API_KEY, LLM_MODEL_NAME)
    chatbot_rag.retriever = retriever

    ui = ChatbotUI(chatbot_rag)
    demo = ui.create_interface()

    demo.launch(share=True, inbrowser=True, debug=True)
 if __name__ == "__main__":
+    main()