Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -1,54 +1,34 @@
 import os
+from langchain_groq import ChatGroq
+from langchain.prompts import ChatPromptTemplate, PromptTemplate
+from langchain.output_parsers import ResponseSchema, StructuredOutputParser
+from urllib.parse import urljoin, urlparse
 import requests
 from io import BytesIO
-from urllib.parse import urljoin, urlparse
-from typing import Dict, List, Set, Tuple, Optional, Union
-
-# Libraries for web scraping and text processing
+from langchain_chroma import Chroma
 from bs4 import BeautifulSoup
+from langchain_core.prompts import ChatPromptTemplate
+import gradio as gr
 from PyPDF2 import PdfReader
-
-# LangChain imports
-from langchain_groq import ChatGroq
-from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
+from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnablePassthrough
-from langchain_chroma import Chroma
-from langchain_huggingface import HuggingFaceEmbeddings
-
-# Gradio import for the user interface
-import gradio as gr
-
-# Configuration settings
-GROQ_API_KEY = os.environ.get('GBV')
-EMBED_MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"
-LLM_MODEL_NAME = "llama-3.3-70b-versatile"
-CHUNK_SIZE = 1000
-VECTOR_DB_COLLECTION = "GBVR_Dataset"
-VECTOR_DB_PERSIST_DIR = "./"
-DEFAULT_SESSION_ID = "default_session"
-MAX_HISTORY_TURNS = 5
-
 
+# Simple session management
 class SessionManager:
-    """Manages chat sessions and conversation history."""
-
     def __init__(self):
         self.sessions = {}
 
-    def get_or_create_session(self, session_id
-        """Get existing session or create a new one."""
+    def get_or_create_session(self, session_id):
         if session_id not in self.sessions:
             self.sessions[session_id] = []
         return self.sessions[session_id]
 
-    def add_interaction(self, session_id
-        """Add user-AI interaction to the session history."""
+    def add_interaction(self, session_id, user_message, ai_response):
         session = self.get_or_create_session(session_id)
         session.append({"user": user_message, "ai": ai_response})
 
-    def get_history(self, session_id
-        """Get formatted conversation history."""
+    def get_history(self, session_id, max_turns=5):
         session = self.get_or_create_session(session_id)
         recent_history = session[-max_turns:] if len(session) > max_turns else session
 
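Note: the rewrite keeps SessionManager's three-method surface but drops the old type hints and docstrings. A minimal sketch of how the new module-level manager is exercised; the session id and sample turns are invented for illustration, and get_history's exact formatting comes from lines 35-38, which this hunk does not show:

manager = SessionManager()
manager.add_interaction("demo", "Hello", "Hi! How can I help?")
manager.add_interaction("demo", "Where can I find support?", "Here are some local resources...")
print(manager.get_history("demo", max_turns=5))  # formatted text of up to 5 recent turns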
@@ -59,172 +39,174 @@ class SessionManager:
 
         return history_text.strip()
 
-...
-        """Extract internal links from a page."""
-        links = set()
-        for anchor in soup.find_all("a", href=True):
-            href = anchor["href"]
-            full_url = urljoin(base_url, href)
-            if WebScraper.is_internal_link(base_url, full_url):
-                links.add(full_url)
-        return links
-
-    @staticmethod
-    def is_internal_link(base_url: str, link_url: str) -> bool:
-        """Check if a link is internal to the base domain."""
-        base_netloc = urlparse(base_url).netloc
-        link_netloc = urlparse(link_url).netloc
-        return base_netloc == link_netloc
-
-    @staticmethod
-    def extract_pdf_text(pdf_url: str) -> Optional[str]:
-        """Extract text from a PDF URL."""
-        try:
-            response = requests.get(pdf_url)
-            response.raise_for_status()
-            with BytesIO(response.content) as file:
-                reader = PdfReader(file)
-                pdf_text = ""
-                for page in reader.pages:
-                    pdf_text += page.extract_text()
-            return pdf_text if pdf_text else None
-        except requests.exceptions.RequestException as e:
-            print(f"Error fetching PDF {pdf_url}: {e}")
-            return None
-        except Exception as e:
-            print(f"Error reading PDF {pdf_url}: {e}")
-            return None
-
-    @staticmethod
-    def clean_body_content(html_content: str) -> str:
-        """Clean HTML content by removing scripts and styles."""
-        soup = BeautifulSoup(html_content, "html.parser")
-
-        # Remove script and style elements
-        for script_or_style in soup(["script", "style"]):
-            script_or_style.extract()
-
-        # Extract and clean text
-        cleaned_content = soup.get_text(separator="\n")
-        cleaned_content = "\n".join(
-            line.strip() for line in cleaned_content.splitlines() if line.strip()
-        )
-        return cleaned_content
-
-    @classmethod
-    def scrape_websites(cls, base_urls: List[str]) -> Dict[str, str]:
-        """Scrape content from a list of base URLs and their internal links."""
-        try:
-            visited_links = set()
-            content_by_url = {}
-
-            for base_url in base_urls:
-                if not base_url.strip():
-                    continue
-
-                print(f"Scraping base URL: {base_url}")
-                html_content = cls.fetch_page_content(base_url)
-                if html_content:
-                    cleaned_content = cls.clean_body_content(html_content)
-                    content_by_url[base_url] = cleaned_content
-                    visited_links.add(base_url)
-
-                    # Process internal links
-                    soup = BeautifulSoup(html_content, "html.parser")
-                    links = cls.extract_internal_links(base_url, soup)
-
-                    for link in links:
-                        if link not in visited_links:
-                            print(f"Scraping link: {link}")
-                            page_content = cls.fetch_page_content(link)
-                            if page_content:
-                                cleaned_content = cls.clean_body_content(page_content)
-                                content_by_url[link] = cleaned_content
-                                visited_links.add(link)
-
-                            # Handle PDF links
-                            if link.lower().endswith('.pdf'):
-                                print(f"Extracting PDF content from: {link}")
-                                pdf_content = cls.extract_pdf_text(link)
-                                if pdf_content:
-                                    content_by_url[link] = pdf_content
-...
-        """
-...
-        return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
-...
-        """
-...
-            embedding_function=self.embed_model,
-            persist_directory=persist_directory,
-        )
-...
-    PROMPT_TEMPLATE = """
+# Initialize session manager
+session_manager = SessionManager()
+
+# Get API key from environment variable
+groq_api_key = os.environ.get('GBV')
+
+# Initialize embedding model
+embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
+
+def scrape_websites(base_urls):
+    """
+    Scrape content from given URLs and their internal links
+    """
+    visited_links = set()  # To avoid revisiting the same link
+    content_by_url = {}  # Store content from each URL
+
+    for base_url in base_urls:
+        if not base_url.strip():
+            continue  # Skip empty URLs
+
+        print(f"Scraping base URL: {base_url}")
+        html_content = fetch_page_content(base_url)
+        if html_content:
+            cleaned_content = clean_body_content(html_content)
+            content_by_url[base_url] = cleaned_content
+            visited_links.add(base_url)
+
+            # Extract and process internal links
+            soup = BeautifulSoup(html_content, "html.parser")
+            links = extract_internal_links(base_url, soup)
+
+            for link in links:
+                if link not in visited_links:
+                    print(f"Scraping link: {link}")
+                    page_content = fetch_page_content(link)
+                    if page_content:
+                        cleaned_content = clean_body_content(page_content)
+                        content_by_url[link] = cleaned_content
+                        visited_links.add(link)
+
+                    # Handle PDF files
+                    if link.lower().endswith('.pdf'):
+                        print(f"Extracting PDF content from: {link}")
+                        pdf_content = extract_pdf_text(link)
+                        if pdf_content:
+                            content_by_url[link] = pdf_content
+
+    return content_by_url
+
+
+def fetch_page_content(url):
+    """
+    Fetch HTML content from a URL
+    """
+    try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+        return response.text
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching {url}: {e}")
+        return None
+
+
+def extract_internal_links(base_url, soup):
+    """
+    Extract all internal links from a BeautifulSoup object
+    """
+    links = set()
+    for anchor in soup.find_all("a", href=True):
+        href = anchor["href"]
+        full_url = urljoin(base_url, href)
+        if is_internal_link(base_url, full_url):
+            links.add(full_url)
+    return links
+
+
+def is_internal_link(base_url, link_url):
+    """
+    Check if a URL belongs to the same domain as the base URL
+    """
+    base_netloc = urlparse(base_url).netloc
+    link_netloc = urlparse(link_url).netloc
+    return base_netloc == link_netloc
+
+
+def extract_pdf_text(pdf_url):
+    """
+    Extract text content from a PDF file
+    """
+    try:
+        response = requests.get(pdf_url)
+        response.raise_for_status()
+        with BytesIO(response.content) as file:
+            reader = PdfReader(file)
+            pdf_text = ""
+            for page in reader.pages:
+                pdf_text += page.extract_text()
+
+        return pdf_text if pdf_text else None
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching PDF {pdf_url}: {e}")
+        return None
+    except Exception as e:
+        print(f"Error reading PDF {pdf_url}: {e}")
+        return None
+
+
+def clean_body_content(html_content):
+    """
+    Extract and clean text content from HTML
+    """
+    soup = BeautifulSoup(html_content, "html.parser")
+
+    # Remove script and style elements
+    for script_or_style in soup(["script", "style"]):
+        script_or_style.extract()
+
+    # Extract text and clean
+    cleaned_content = soup.get_text(separator="\n")
+    cleaned_content = "\n".join(
+        line.strip() for line in cleaned_content.splitlines() if line.strip()
+    )
+    return cleaned_content
+
+
+def chunk_string(s, chunk_size=1000):
+    """
+    Split a string into chunks of specific size
+    """
+    return [s[i:i+chunk_size] for i in range(0, len(s), chunk_size)]
+
+
+def process_and_load_content(website_urls):
+    """
+    Process website content and load into vector database
+    """
+    # Scrape websites
+    all_content = scrape_websites(website_urls)
+
+    # Convert to list of tuples
+    temp_list = []
+    for url, content in all_content.items():
+        temp_list.append((url, content))
+
+    # Process texts with URL context
+    processed_texts = []
+    for url, content in temp_list:
+        processed_texts.append(f"url: {url}, content: {content}")
+
+    # Split into chunks
+    chunked_texts = []
+    for text in processed_texts:
+        chunked_texts.extend(chunk_string(text))
+
+    # Create and populate vector store
+    vectorstore = Chroma(
+        collection_name="GBVR_Dataset",
+        embedding_function=embed_model,
+        persist_directory="./",
+    )
+
+    vectorstore.add_texts(chunked_texts)
+
+    return vectorstore
+
+
+# RAG prompt template
+rag_prompt_template = """
 You are a friendly, intelligent, and conversational AI assistant designed to provide accurate, engaging, and human-like responses based on the given context. Your goal is to extract relevant details from the provided context: {context} and assist the user effectively. Follow these guidelines:
 
 1. **Warm & Natural Interaction**
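Note: chunk_string splits on fixed character counts with no overlap, so the final chunk simply carries the remainder. A quick illustrative check of that behavior (toy input, not from the app):

chunks = chunk_string("x" * 2500, chunk_size=1000)
print([len(c) for c in chunks])  # [1000, 1000, 500]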
@@ -263,76 +245,104 @@ class ChatbotRAG:
 **Context:** {context}
 **User's Question:** {question}
 **Your Response:**
 """
 
-...
-        self.rag_prompt = PromptTemplate.from_template(self.PROMPT_TEMPLATE)
-        self.session_manager = SessionManager()
-
-...
-        return welcome_message
-
-...
-        conversation_history = self.session_manager.get_history(session_id)
-
-        # Get context from retriever
-        context_docs = retriever.invoke(question)
-        context = "\n".join(doc.page_content for doc in context_docs)
-
-        # Create prompt with history
-        prompt = self.rag_prompt.format(
-            context=context,
-            question=question,
-            conversation_history=conversation_history
-        )
-
-        # Generate response
-        response = self.llm.invoke(prompt).content
-
-        # Store the interaction
-        self.session_manager.add_interaction(session_id, question, response)
-
-        return response
-
-...
-        response = self.process_query(message, self.retriever, str(session_id))
-
-        # Stream the response word by word
-        partial_text = ""
-        words = response.split(' ')
-        for word in words:
-            partial_text += word + " "
-            yield partial_text.strip()
-
-...
-    """
+# Create prompt template
+rag_prompt = PromptTemplate.from_template(rag_prompt_template)
+
+def init_rag_components(vectorstore):
+    """
+    Initialize RAG components: retriever and LLM
+    """
+    # Create retriever from vector store
+    retriever = vectorstore.as_retriever()
+
+    # Initialize LLM
+    llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=groq_api_key)
+
+    return retriever, llm
+
+
+def rag_chain(question, session_id="default", retriever=None, llm=None):
+    """
+    Process a query through the RAG pipeline
+    """
+    # Get conversation history
+    conversation_history = session_manager.get_history(session_id)
+
+    # Get context from retriever
+    context_docs = retriever.invoke(question)
+    context = "\n".join(doc.page_content for doc in context_docs)
+
+    # Create prompt with history
+    prompt = rag_prompt.format(
+        context=context,
+        question=question,
+        conversation_history=conversation_history
+    )
+
+    # Generate response
+    response = llm.invoke(prompt).content
+
+    # Store the interaction
+    session_manager.add_interaction(session_id, question, response)
+
+    return response
+
+
+def generate_welcome_message(llm):
+    """
+    Generate a welcoming message for the chatbot
+    """
+    welcome_prompt = """
+    Generate a short, simple welcome message for a chatbot about Gender-Based Violence Resources in Rwanda.
+    Keep it under 3 sentences, and use simple language.
+    Make it warm and supportive but direct and easy to read.
+    """
+
+    welcome_message = llm.invoke(welcome_prompt).content
+    return welcome_message
+
+
+def rag_memory_stream(message, history, retriever, llm):
+    """
+    Stream responses for the Gradio interface
+    """
+    # Generate a session ID based on the first message
+    session_id = None
+    for msg in history:
+        if msg[0]:  # If there's a user message
+            # Use hash of first message as session ID
+            session_id = hash(msg[0][:20]) if session_id is None else session_id
+            break
+
+    # Default session ID if history is empty
+    if session_id is None:
+        session_id = "default_session"
+
+    # Process the message and get response
+    response = rag_chain(message, str(session_id), retriever, llm)
+
+    # Stream the response word by word
+    partial_text = ""
+    words = response.split(' ')
+    for word in words:
+        partial_text += word + " "
+        yield partial_text.strip()
+
+
+def create_ui(retriever, llm):
+    """
+    Create the Gradio UI for the chatbot
+    """
+    # Title
+    title = "GBVR Chatbot"
+
+    # Generate welcome message
+    welcome_msg = generate_welcome_message(llm)
+
+    # Custom CSS for styling
+    custom_css = """
 /* Custom CSS for styling the interface */
 body {
     font-family: "Arial", serif;
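Note: together, these free functions replace the old ChatbotRAG class. A rough sketch of the intended wiring outside Gradio, assuming the GBV environment variable holds a valid Groq API key and the scrape target is reachable (the URL here is shortened for illustration):

vectorstore = process_and_load_content(["https://haguruka.org.rw/"])
retriever, llm = init_rag_components(vectorstore)
print(rag_chain("What support services exist?", session_id="demo", retriever=retriever, llm=llm))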
@@ -365,55 +375,33 @@ class ChatbotUI:
 }
 """
 
-...
-        self.welcome_msg = chatbot_rag.generate_welcome_message()
-
-    def create_interface(self):
-        """Create and configure the Gradio interface."""
-        demo = gr.ChatInterface(
-            fn=self.chatbot_rag.streaming_response,
-            title=self.title,
-            fill_height=True,
-            theme="soft",
-            css=self.CUSTOM_CSS,
-            description=self.welcome_msg
-        )
-        return demo
-
-
-def main():
-    """Main function to initialize and run the chatbot."""
-    # Define target websites to scrape
-    websites = ["https://haguruka.org.rw/country/social-cohesion-and-reconciliation/"]
-
-    # Scrape website content
-    content_by_url = WebScraper.scrape_websites(websites)
-
-    # Process content into tuples
-    content_tuples = [(url, content) for url, content in content_by_url.items()]
-
-    # Process and chunk texts
-    processed_texts = TextProcessor.process_content_tuples(content_tuples)
-    chunked_texts = TextProcessor.chunk_texts(processed_texts)
-
-    #
-...
-
-    chatbot_rag = ChatbotRAG(GROQ_API_KEY, LLM_MODEL_NAME)
-    chatbot_rag.retriever = retriever
-
-    # Initialize UI
-    ui = ChatbotUI(chatbot_rag)
-    demo = ui.create_interface()
-
-    # Launch the app
-    demo.launch(share=True, inbrowser=True, debug=True)
+    # Create a wrapper function for rag_memory_stream that includes retriever and llm
+    def wrapped_rag_memory_stream(message, history):
+        return rag_memory_stream(message, history, retriever, llm)
+
+    # Create the Chat Interface
+    demo = gr.ChatInterface(
+        fn=wrapped_rag_memory_stream,
+        title=title,
+        fill_height=True,
+        theme="soft",
+        css=custom_css,
+        description=welcome_msg
+    )
+
+    return demo
 
 
 if __name__ == "__main__":
-    ...
+    # Define target websites
+    websites = ["https://haguruka.org.rw/country/social-cohesion-and-reconciliation/"]
+
+    # Process content and create vector store
+    vectorstore = process_and_load_content(websites)
+
+    # Initialize RAG components
+    retriever, llm = init_rag_components(vectorstore)
+
+    # Create and launch UI
+    demo = create_ui(retriever, llm)
+    demo.launch(share=True, inbrowser=True, debug=True)
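Note: gr.ChatInterface treats a generator callback as a streaming response, re-rendering the chat bubble on each yield, which is why rag_memory_stream yields a progressively longer partial_text. A self-contained illustration of the same pattern (toy text, no LLM involved):

def stream_words(text):
    partial = ""
    for word in text.split(' '):
        partial += word + " "
        yield partial.strip()

for frame in stream_words("streamed word by word"):
    print(frame)  # "streamed", "streamed word", "streamed word by", "streamed word by word"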