Spaces:

Rabbit-Innotech
/

GBVR

Runtime error

App Files Files Community

Rabbit-Innotech committed on Apr 30, 2025

Commit

768a211

verified ·

1 Parent(s): 007ac45

Update app.py

Browse files

Files changed (1) hide show

app.py +155 -202

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ from urllib.parse import urljoin, urlparse
 import requests
 from io import BytesIO
 from langchain_chroma import Chroma
 from bs4 import BeautifulSoup
 from langchain_core.prompts import ChatPromptTemplate
 import gradio as gr
@@ -42,57 +43,54 @@ class SessionManager:
 # Initialize session manager
 session_manager = SessionManager()
-# Get API key from environment variable
-groq_api_key = os.environ.get('GBV')
-# Initialize embedding model
 embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
 def scrape_websites(base_urls):
-    """
-    Scrape content from given URLs and their internal links
-    """
-    visited_links = set()  # To avoid revisiting the same link
-    content_by_url = {}    # Store content from each URL
-    for base_url in base_urls:
-        if not base_url.strip():
-            continue  # Skip empty URLs
-        print(f"Scraping base URL: {base_url}")
-        html_content = fetch_page_content(base_url)
-        if html_content:
-            cleaned_content = clean_body_content(html_content)
-            content_by_url[base_url] = cleaned_content
-            visited_links.add(base_url)
-            # Extract and process internal links
-            soup = BeautifulSoup(html_content, "html.parser")
-            links = extract_internal_links(base_url, soup)
-            for link in links:
-                if link not in visited_links:
-                    print(f"Scraping link: {link}")
-                    page_content = fetch_page_content(link)
-                    if page_content:
-                        cleaned_content = clean_body_content(page_content)
-                        content_by_url[link] = cleaned_content
-                        visited_links.add(link)
-                    # Handle PDF files
-                    if link.lower().endswith('.pdf'):
-                        print(f"Extracting PDF content from: {link}")
-                        pdf_content = extract_pdf_text(link)
-                        if pdf_content:
-                            content_by_url[link] = pdf_content
-    return content_by_url
 def fetch_page_content(url):
-    """
-    Fetch HTML content from a URL
-    """
     try:
         response = requests.get(url, timeout=10)
         response.raise_for_status()
@@ -103,9 +101,6 @@ def fetch_page_content(url):
 def extract_internal_links(base_url, soup):
-    """
-    Extract all internal links from a BeautifulSoup object
-    """
     links = set()
     for anchor in soup.find_all("a", href=True):
         href = anchor["href"]
@@ -116,18 +111,12 @@ def extract_internal_links(base_url, soup):
 def is_internal_link(base_url, link_url):
-    """
-    Check if a URL belongs to the same domain as the base URL
-    """
     base_netloc = urlparse(base_url).netloc
     link_netloc = urlparse(link_url).netloc
     return base_netloc == link_netloc
 def extract_pdf_text(pdf_url):
-    """
-    Extract text content from a PDF file
-    """
     try:
         response = requests.get(pdf_url)
         response.raise_for_status()
@@ -147,16 +136,13 @@ def extract_pdf_text(pdf_url):
 def clean_body_content(html_content):
-    """
-    Extract and clean text content from HTML
-    """
     soup = BeautifulSoup(html_content, "html.parser")
-    # Remove script and style elements
     for script_or_style in soup(["script", "style"]):
         script_or_style.extract()
-    # Extract text and clean
     cleaned_content = soup.get_text(separator="\n")
     cleaned_content = "\n".join(
         line.strip() for line in cleaned_content.splitlines() if line.strip()
@@ -164,49 +150,50 @@ def clean_body_content(html_content):
     return cleaned_content
-def chunk_string(s, chunk_size=1000):
-    """
-    Split a string into chunks of specific size
-    """
-    return [s[i:i+chunk_size] for i in range(0, len(s), chunk_size)]
-def process_and_load_content(website_urls):
-    """
-    Process website content and load into vector database
-    """
-    # Scrape websites
-    all_content = scrape_websites(website_urls)
-    # Convert to list of tuples
     temp_list = []
     for url, content in all_content.items():
         temp_list.append((url, content))
-    # Process texts with URL context
-    processed_texts = []
-    for url, content in temp_list:
-        processed_texts.append(f"url: {url}, content: {content}")
-    # Split into chunks
-    chunked_texts = []
-    for text in processed_texts:
-        chunked_texts.extend(chunk_string(text))
-    # Create and populate vector store
-    vectorstore = Chroma(
-        collection_name="GBVR_Dataset",
-        embedding_function=embed_model,
-        persist_directory="./",
-    )
-    vectorstore.add_texts(chunked_texts)
-    return vectorstore
-# RAG prompt template
-rag_prompt_template = """
     You are a friendly, intelligent, and conversational AI assistant designed to provide accurate, engaging, and human-like responses based on the given context. Your goal is to extract relevant details from the provided context: {context} and assist the user effectively. Follow these guidelines:
     1. **Warm & Natural Interaction**
@@ -245,29 +232,21 @@ rag_prompt_template = """
     **Context:** {context}
     **User's Question:** {question}
     **Your Response:**
-"""
-# Create prompt template
-rag_prompt = PromptTemplate.from_template(rag_prompt_template)
-def init_rag_components(vectorstore):
-    """
-    Initialize RAG components: retriever and LLM
-    """
-    # Create retriever from vector store
-    retriever = vectorstore.as_retriever()
-    # Initialize LLM
-    llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=groq_api_key)
-    return retriever, llm
-def rag_chain(question, session_id="default", retriever=None, llm=None):
-    """
-    Process a query through the RAG pipeline
-    """
-    # Get conversation history
     conversation_history = session_manager.get_history(session_id)
     # Get context from retriever
@@ -289,30 +268,13 @@ def rag_chain(question, session_id="default", retriever=None, llm=None):
     return response
-def generate_welcome_message(llm):
-    """
-    Generate a welcoming message for the chatbot
-    """
-    welcome_prompt = """
-    Generate a short, simple welcome message for a chatbot about Gender-Based Violence Resources in Rwanda.
-    Keep it under 3 sentences, and use simple language.
-    Make it warm and supportive but direct and easy to read.
-    """
-    welcome_message = llm.invoke(welcome_prompt).content
-    return welcome_message
-def rag_memory_stream(message, history, retriever, llm):
-    """
-    Stream responses for the Gradio interface
-    """
-    # Generate a session ID based on the first message
     session_id = None
     for msg in history:
         if msg[0]:  # If there's a user message
-            # Use hash of first message as session ID
             session_id = hash(msg[0][:20]) if session_id is None else session_id
             break
@@ -321,7 +283,7 @@ def rag_memory_stream(message, history, retriever, llm):
         session_id = "default_session"
     # Process the message and get response
-    response = rag_chain(message, str(session_id), retriever, llm)
     # Stream the response word by word
     partial_text = ""
@@ -330,78 +292,69 @@ def rag_memory_stream(message, history, retriever, llm):
         partial_text += word + " "
         yield partial_text.strip()
-def create_ui(retriever, llm):
-    """
-    Create the Gradio UI for the chatbot
-    """
-    # Title
-    title = "GBVR Chatbot"
-    # Generate welcome message
-    welcome_msg = generate_welcome_message(llm)
-    # Custom CSS for styling
-    custom_css = """
-    /* Custom CSS for styling the interface */
-    body {
-        font-family: "Arial", serif;
-    }
-    .gradio-container {
-        font-family: "Times New Roman", serif;
-    }
-    .gr-button {
-        background-color: #007bff; /* Blue button */
-        color: white;
-        border: none;
-        border-radius: 5px;
-        font-size: 16px;
-        padding: 10px 20px;
-        cursor: pointer;
-    }
-    .gr-textbox:focus, .gr-button:focus {
-        outline: none; /* Remove outline focus for a cleaner look */
-    }
-    /* Specific CSS for the welcome message */
-    .gradio-description {
-        font-size: 30px; /* Set font size for the welcome message */
-        font-family: "Arial", sans-serif;
-        text-align: center; /* Optional: Center-align the text */
-        padding: 20px; /* Optional: Add padding around the welcome message */
-    }
     """
-    # Create a wrapper function for rag_memory_stream that includes retriever and llm
-    def wrapped_rag_memory_stream(message, history):
-        return rag_memory_stream(message, history, retriever, llm)
-    # Create the Chat Interface
-    demo = gr.ChatInterface(
-        fn=wrapped_rag_memory_stream,
-        title=title,
-        fill_height=True,
-        theme="soft",
-        css=custom_css,
-        description=welcome_msg
-    )
-    return demo
 if __name__ == "__main__":
-    # Define target websites
-    websites = ["https://haguruka.org.rw/country/social-cohesion-and-reconciliation/"]
-    # Process content and create vector store
-    vectorstore = process_and_load_content(websites)
-    # Initialize RAG components
-    retriever, llm = init_rag_components(vectorstore)
-    # Create and launch UI
-    demo = create_ui(retriever, llm)
     demo.launch(share=True, inbrowser=True, debug=True)

 import requests
 from io import BytesIO
 from langchain_chroma import Chroma
+import requests
 from bs4 import BeautifulSoup
 from langchain_core.prompts import ChatPromptTemplate
 import gradio as gr
 # Initialize session manager
 session_manager = SessionManager()
+groq_api_key= os.environ.get('GBV')
 embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
 def scrape_websites(base_urls):
+    try:
+        visited_links = set()  # To avoid revisiting the same link
+        content_by_url = {}  # Store content from each URL
+        for base_url in base_urls:
+            if not base_url.strip():
+                continue  # Skip empty or invalid URLs
+            print(f"Scraping base URL: {base_url}")
+            html_content = fetch_page_content(base_url)
+            if html_content:
+                cleaned_content = clean_body_content(html_content)
+                content_by_url[base_url] = cleaned_content
+                visited_links.add(base_url)
+                # Extract and process all internal links
+                soup = BeautifulSoup(html_content, "html.parser")
+                links = extract_internal_links(base_url, soup)
+                for link in links:
+                    if link not in visited_links:
+                        print(f"Scraping link: {link}")
+                        page_content = fetch_page_content(link)
+                        if page_content:
+                            cleaned_content = clean_body_content(page_content)
+                            content_by_url[link] = cleaned_content
+                            visited_links.add(link)
+                        # If the link is a PDF file, extract its content
+                        if link.lower().endswith('.pdf'):
+                            print(f"Extracting PDF content from: {link}")
+                            pdf_content = extract_pdf_text(link)
+                            if pdf_content:
+                                content_by_url[link] = pdf_content
+        return content_by_url
+    except Exception as e:
+        print(f"Error during scraping: {e}")
+        return {}
 def fetch_page_content(url):
     try:
         response = requests.get(url, timeout=10)
         response.raise_for_status()
 def extract_internal_links(base_url, soup):
     links = set()
     for anchor in soup.find_all("a", href=True):
         href = anchor["href"]
 def is_internal_link(base_url, link_url):
     """Return True when link_url has the same network location as base_url."""
     # Host names are case-insensitive, so normalise case before comparing;
     # otherwise "Example.com" and "example.com" would count as different sites.
     # Note: a relative link has an empty netloc, so it only matches a base
     # URL whose netloc is empty as well.
     base_netloc = urlparse(base_url).netloc.lower()
     link_netloc = urlparse(link_url).netloc.lower()
     return base_netloc == link_netloc
 def extract_pdf_text(pdf_url):
     try:
         response = requests.get(pdf_url)
         response.raise_for_status()
 def clean_body_content(html_content):
     soup = BeautifulSoup(html_content, "html.parser")
     for script_or_style in soup(["script", "style"]):
         script_or_style.extract()
     cleaned_content = soup.get_text(separator="\n")
     cleaned_content = "\n".join(
         line.strip() for line in cleaned_content.splitlines() if line.strip()
     return cleaned_content
+if __name__ == "__main__":
+    website = ["https://haguruka.org.rw/country/social-cohesion-and-reconciliation/"
+               ]
+    all_content = scrape_websites(website)
     temp_list = []
     for url, content in all_content.items():
         temp_list.append((url, content))
+processed_texts = []
+for element in temp_list:
+    if isinstance(element, tuple):
+        url, content = element
+        processed_texts.append(f"url: {url}, content: {content}")
+    elif isinstance(element, str):
+        processed_texts.append(element)
+    else:
+        processed_texts.append(str(element))
+def chunk_string(s, chunk_size=1000):
+    return [s[i:i+chunk_size] for i in range(0, len(s), chunk_size)]
+chunked_texts = []
+for text in processed_texts:
+  chunked_texts.extend(chunk_string(text))
+vectorstore = Chroma(
+    collection_name="GBVR_Datst",
+    embedding_function=embed_model,
+    persist_directory="./",
+)
+vectorstore.get().keys()
+vectorstore.add_texts(chunked_texts)
+# Updated template to include conversation history
+template = ("""
     You are a friendly, intelligent, and conversational AI assistant designed to provide accurate, engaging, and human-like responses based on the given context. Your goal is to extract relevant details from the provided context: {context} and assist the user effectively. Follow these guidelines:
     1. **Warm & Natural Interaction**
     **Context:** {context}
     **User's Question:** {question}
     **Your Response:**
+""")
+rag_prompt = PromptTemplate.from_template(template)
+retriever = vectorstore.as_retriever()
+llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=groq_api_key)
+# Dictionary to store user sessions with session IDs
+user_sessions = {}
+# Define the RAG chain with session history
+def rag_chain(question, session_id="default"):
+    # Get conversation history if available
     conversation_history = session_manager.get_history(session_id)
     # Get context from retriever
     return response
+# Define the RAG memory stream function
+def rag_memory_stream(message, history):
+    # Generate a session ID based on the first message if not exists
     session_id = None
     for msg in history:
         if msg[0]:  # If there's a user message
+            # Use first few characters of first message as simple session ID
             session_id = hash(msg[0][:20]) if session_id is None else session_id
             break
         session_id = "default_session"
     # Process the message and get response
+    response = rag_chain(message, str(session_id))
     # Stream the response word by word
     partial_text = ""
         partial_text += word + " "
         yield partial_text.strip()
+# Title shown at the top of the chat interface
+title = "GBVR Chatbot"
+# Custom CSS for styling the interface
+custom_css = """
+/* Custom CSS for styling the interface */
+body {
+    font-family: "Arial", serif;
+}
+.gradio-container {
+    font-family: "Times New Roman", serif;
+}
+.gr-button {
+    background-color: #007bff; /* Blue button */
+    color: white;
+    border: none;
+    border-radius: 5px;
+    font-size: 16px;
+    padding: 10px 20px;
+    cursor: pointer;
+}
+.gr-textbox:focus, .gr-button:focus {
+    outline: none; /* Remove outline focus for a cleaner look */
+}
+/* Specific CSS for the welcome message */
+.gradio-description {
+    font-size: 30px; /* Set font size for the welcome message */
+    font-family: "Arial", sans-serif;
+    text-align: center; /* Optional: Center-align the text */
+    padding: 20px; /* Optional: Add padding around the welcome message */
+}
+"""
+# Generate a simple welcome message using the LLM
+def generate_welcome_message():
+    welcome_prompt = """
+    Generate a short, simple welcome message for a chatbot about Gender-Based Violence Resources in Rwanda.
+    Keep it under 3 sentences, and use simple language.
+    Make it warm and supportive but direct and easy to read.
     """
+    # Get the welcome message from the LLM
+    welcome_message = llm.invoke(welcome_prompt).content
+    return welcome_message
+# Create simple welcome message
+welcome_msg = generate_welcome_message()
+# Create the Chat Interface with welcome message
+demo = gr.ChatInterface(
+    fn=rag_memory_stream,
+    title=title,
+    fill_height=True,
+    theme="soft",
+    css=custom_css, # Apply the custom CSS
+    description=welcome_msg
+)
+# Launch the app
 if __name__ == "__main__":
     demo.launch(share=True, inbrowser=True, debug=True)