Rabbit-Innotech committed on
Commit
27a0883
·
verified ·
1 Parent(s): 64498f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +340 -48
app.py CHANGED
@@ -1,64 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
 
 
 
3
 
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
 
 
9
 
10
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream a chat completion from the zephyr-7b-beta InferenceClient.

    NOTE(review): this is the pre-commit version that this diff removes.

    Args:
        message: the new user turn.
        history: prior (user, assistant) pairs; empty strings are skipped.
        system_message: text for the leading system role message.
        max_tokens / temperature / top_p: sampling controls forwarded to the API.

    Yields:
        The cumulative response text after each streamed token.
    """
    messages = [{"role": "system", "content": system_message}]

    # Replay prior turns; falsy (empty) sides of a pair are omitted.
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    response = ""

    # Streaming loop: note `message` is rebound here, shadowing the parameter.
    for message in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = message.choices[0].delta.content

        response += token
        yield response
41
 
 
 
 
 
 
 
 
 
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""


# Pre-commit UI (removed by this diff): a plain ChatInterface over `respond`
# exposing system-message and sampling controls as extra inputs.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)


if __name__ == "__main__":
    demo.launch()
 
1
+ import os
2
+ from langchain_groq import ChatGroq
3
+ from langchain.prompts import ChatPromptTemplate, PromptTemplate
4
+ from langchain.output_parsers import ResponseSchema, StructuredOutputParser
5
+ from urllib.parse import urljoin, urlparse
6
+ import requests
7
+ from io import BytesIO
8
+ from langchain_chroma import Chroma
9
+ import requests
10
+ from bs4 import BeautifulSoup
11
+ from langchain_core.prompts import ChatPromptTemplate
12
  import gradio as gr
13
+ from PyPDF2 import PdfReader
14
+ from langchain_huggingface import HuggingFaceEmbeddings
15
+ from langchain_core.output_parsers import StrOutputParser
16
+ from langchain_core.runnables import RunnablePassthrough
17
 
18
# Simple session management
class SessionManager:
    """In-memory, per-process store of chat history keyed by session id.

    Each session is a list of {"user": ..., "ai": ...} dicts. Nothing is
    persisted; state lives only for the lifetime of the process.
    """

    def __init__(self):
        # session_id -> list of {"user": str, "ai": str} interaction dicts
        self.sessions = {}

    def get_or_create_session(self, session_id):
        """Return the interaction list for *session_id*, creating it if absent."""
        # setdefault collapses the lookup-or-create into a single operation.
        return self.sessions.setdefault(session_id, [])

    def add_interaction(self, session_id, user_message, ai_response):
        """Append one user/AI exchange to the session's history."""
        self.get_or_create_session(session_id).append(
            {"user": user_message, "ai": ai_response}
        )

    def get_history(self, session_id, max_turns=5):
        """Render the last *max_turns* exchanges as a prompt-ready transcript.

        Returns "" for a new or empty session.
        """
        # Negative slicing already handles len(session) <= max_turns, and
        # str.join replaces the original quadratic `+=` string building.
        recent = self.get_or_create_session(session_id)[-max_turns:]
        return "\n\n".join(
            f"User: {turn['user']}\nAssistant: {turn['ai']}" for turn in recent
        )
 
43
# Initialize session manager
session_manager = SessionManager()

# Groq API key; read from the env var named 'GBV' (None if unset —
# ChatGroq construction below will then fail at request time).
groq_api_key= os.environ.get('GBV')

# Sentence-embedding model used by the Chroma vector store. Downloads the
# model on first run; presumably CPU inference — TODO confirm.
embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
 
 
 
49
 
50
def scrape_websites(base_urls):
    """Crawl each URL in *base_urls* one level deep.

    Fetches every base page, then every same-domain link found on it.
    PDF links go through extract_pdf_text; HTML pages through
    clean_body_content.

    Args:
        base_urls: iterable of URL strings; blank entries are skipped.

    Returns:
        dict mapping each successfully fetched URL to its extracted text,
        or {} if an unexpected error aborts the crawl.
    """
    try:
        visited_links = set()   # avoid revisiting the same link
        content_by_url = {}     # url -> cleaned HTML text or PDF text

        for base_url in base_urls:
            if not base_url.strip():
                continue  # Skip empty or invalid URLs

            print(f"Scraping base URL: {base_url}")
            html_content = fetch_page_content(base_url)
            if not html_content:
                # Fix: previously a failed base fetch fell through to
                # BeautifulSoup(None), which raised and discarded ALL
                # results via the broad except below.
                continue

            content_by_url[base_url] = clean_body_content(html_content)
            visited_links.add(base_url)

            # Extract and process all internal (same-domain) links.
            soup = BeautifulSoup(html_content, "html.parser")
            for link in extract_internal_links(base_url, soup):
                if link in visited_links:
                    continue
                visited_links.add(link)

                # Fix: detect PDFs *before* fetching. The original fetched
                # every link as HTML first (storing mis-decoded PDF bytes),
                # then re-downloaded .pdf links a second time for extraction.
                if link.lower().endswith('.pdf'):
                    print(f"Extracting PDF content from: {link}")
                    pdf_content = extract_pdf_text(link)
                    if pdf_content:
                        content_by_url[link] = pdf_content
                    continue

                print(f"Scraping link: {link}")
                page_content = fetch_page_content(link)
                if page_content:
                    content_by_url[link] = clean_body_content(page_content)

        return content_by_url

    except Exception as e:
        # Best-effort crawl: report and return an empty mapping.
        print(f"Error during scraping: {e}")
        return {}
91
+
92
+
93
def fetch_page_content(url):
    """Download *url* and return its body as text, or None on any request error."""
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
    except requests.exceptions.RequestException as exc:
        # Network and HTTP-status failures are reported, not raised.
        print(f"Error fetching {url}: {exc}")
        return None
    return resp.text
101
+
102
+
103
def extract_internal_links(base_url, soup):
    """Return the set of absolute, same-domain URLs linked from *soup*.

    Relative hrefs are resolved against *base_url* before the domain check.
    """
    absolute_urls = (
        urljoin(base_url, anchor["href"])
        for anchor in soup.find_all("a", href=True)
    )
    return {url for url in absolute_urls if is_internal_link(base_url, url)}
111
+
112
+
113
def is_internal_link(base_url, link_url):
    """True when *link_url* shares *base_url*'s network location (host:port)."""
    return urlparse(base_url).netloc == urlparse(link_url).netloc
117
+
118
+
119
def extract_pdf_text(pdf_url):
    """Download a PDF and return its concatenated page text.

    Returns None when the download fails, the PDF cannot be parsed, or it
    yields no text at all.
    """
    try:
        # Fix: added timeout for consistency with fetch_page_content; the
        # original could hang indefinitely on a stalled server.
        response = requests.get(pdf_url, timeout=10)
        response.raise_for_status()
        with BytesIO(response.content) as file:
            reader = PdfReader(file)
            # Fix: extract_text() may return None for image-only pages; the
            # original `+=` then raised TypeError, silently masked by the
            # broad except below.
            pdf_text = "".join(page.extract_text() or "" for page in reader.pages)

        return pdf_text if pdf_text else None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching PDF {pdf_url}: {e}")
        return None
    except Exception as e:
        print(f"Error reading PDF {pdf_url}: {e}")
        return None
136
+
137
+
138
def clean_body_content(html_content):
    """Strip scripts/styles from HTML and return its visible text.

    Each non-blank line of the rendered text appears once, trimmed, joined
    by single newlines.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Remove non-visible elements before extracting text.
    for hidden in soup(["script", "style"]):
        hidden.extract()

    raw_text = soup.get_text(separator="\n")
    kept_lines = (line.strip() for line in raw_text.splitlines())
    return "\n".join(line for line in kept_lines if line)
151
+
152
+
153
if __name__ == "__main__":
    # Ingestion script: scrape the source site and normalize to strings.
    # NOTE(review): the diff rendering strips indentation — exactly which of
    # the statements below sit inside this guard cannot be confirmed here.
    website = ["https://haguruka.org.rw/"

    ]
    # One-level crawl -> {url: extracted_text}
    all_content = scrape_websites(website)

    # Flatten the mapping into (url, content) tuples.
    temp_list = []
    for url, content in all_content.items():
        temp_list.append((url, content))

    processed_texts = []

    # Normalize every element to a plain string for chunking. Given how
    # temp_list is built above, only the tuple branch can actually occur;
    # the str/else branches are defensive.
    for element in temp_list:
        if isinstance(element, tuple):
            url, content = element
            processed_texts.append(f"url: {url}, content: {content}")
        elif isinstance(element, str):
            processed_texts.append(element)
        else:
            processed_texts.append(str(element))
175
+
176
def chunk_string(s, chunk_size=1000):
    """Split *s* into consecutive pieces of at most *chunk_size* characters."""
    pieces = []
    start = 0
    while start < len(s):
        pieces.append(s[start:start + chunk_size])
        start += chunk_size
    return pieces
178
+
179
# NOTE(review): may sit inside the __main__ guard above — the diff rendering
# strips indentation; it demonstrably follows the scraping step.
chunked_texts = []

# Split each document string into <=1000-char pieces for embedding.
for text in processed_texts:
    chunked_texts.extend(chunk_string(text))

# Persistent Chroma collection embedded with the HuggingFace model above.
vectorstore = Chroma(
    collection_name="GBVR_Dataset",
    embedding_function=embed_model,
    persist_directory="./",
)

# Removed dead statement `vectorstore.get().keys()`: its result was
# discarded, and it forced a full fetch of the collection for nothing.
vectorstore.add_texts(chunked_texts)
194
+
195
# Updated template to include conversation history
# Prompt for the RAG chain; placeholders {context}, {question} and
# {conversation_history} are filled in rag_chain below. {context} appears
# several times and is substituted at each occurrence.
# NOTE(review): the emoji below appear mojibake-mangled by the diff
# rendering; reproduce/verify against the original file bytes.
template = ("""
You are a friendly, intelligent, and conversational AI assistant designed to provide accurate, engaging, and human-like responses based on the given context. Your goal is to extract relevant details from the provided context: {context} and assist the user effectively. Follow these guidelines:

1. **Warm & Natural Interaction**
- If the user greets you (e.g., "Hello," "Hi," "Good morning"), respond warmly and acknowledge them.
- Example responses:
- "😊 Good morning! How can I assist you today?"
- "Hello! What can I do for you? πŸš€"

2. **Precise Information Extraction**
- Provide only the relevant details from the given context: {context}.
- Do not generate extra content or assumptions beyond the provided information.

3. **Conversational & Engaging Tone**
- Keep responses friendly, natural, and engaging.
- Use occasional emojis (e.g., 😊, πŸš€) to make interactions more lively.

4. **Awareness of Real-Time Context**
- If necessary, acknowledge the current date and time to show awareness of real-world updates.

5. **Handling Missing Information**
- If no relevant information exists in the context, respond politely:
- "I don't have that information at the moment, but I'm happy to help with something else! 😊"

6. **Personalized Interaction**
- Use the conversation history to provide more personalized and contextually relevant responses.
- Previous conversation history: {conversation_history}

7. **Direct, Concise Responses**
- If the user requests specific data, provide only the requested details without unnecessary explanations unless asked.

8. **Extracting Relevant Links**
- If the user asks for a link related to their request `{question}`, extract the most relevant URL from `{context}` and provide it directly.
- Example response:
- "Here is the link you requested: [URL]"

**Context:** {context}
**User's Question:** {question}
**Your Response:**
""")


# Compile the template once at import time.
rag_prompt = PromptTemplate.from_template(template)

# Default retriever over the Chroma collection built above.
retriever = vectorstore.as_retriever()

# Groq-hosted Llama 3.3 70B; API key read from the GBV env var above.
llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=groq_api_key)

# Dictionary to store user sessions with session IDs
# NOTE(review): appears unused — session state actually lives in
# session_manager above; confirm before removing.
user_sessions = {}
+
247
# Define the RAG chain with session history
def rag_chain(question, session_id="default"):
    """Answer *question* via retrieval-augmented generation.

    Threads the session's prior turns into the prompt and records the new
    exchange in session_manager before returning the model's reply text.
    """
    # Prior turns for this session ("" when the session is new).
    history_text = session_manager.get_history(session_id)

    # Retrieve supporting documents and collapse them into one context string.
    docs = retriever.invoke(question)
    retrieved_context = "\n".join(d.page_content for d in docs)

    filled_prompt = rag_prompt.format(
        context=retrieved_context,
        question=question,
        conversation_history=history_text,
    )

    answer = llm.invoke(filled_prompt).content

    # Persist this exchange so follow-up questions see it.
    session_manager.add_interaction(session_id, question, answer)

    return answer
+
271
# Define the RAG memory stream function
def rag_memory_stream(message, history):
    """Gradio streaming handler: answer *message*, yielding it word by word.

    A lightweight session id is derived from the first non-empty user turn
    in *history* (hash of its first 20 characters), so an ongoing chat keeps
    its memory within this process; an empty history falls back to a shared
    default session.
    """
    session_id = None
    for turn in history:
        if turn[0]:
            # First non-empty user message keys the session.
            session_id = hash(turn[0][:20])
            break

    # Default session ID if history is empty.
    if session_id is None:
        session_id = "default_session"

    # Full answer is computed up front; streaming below is cosmetic.
    response = rag_chain(message, str(session_id))

    # Re-emit the answer incrementally so the UI shows it word by word.
    streamed = ""
    for word in response.split(' '):
        streamed += word + " "
        yield streamed.strip()
+
295
# Title with emojis
# NOTE(review): despite the comment, the title text contains no emoji.
title = "GBVR Chatbot"

# Custom CSS for styling the interface; passed to gr.ChatInterface(css=...).
# NOTE(review): the diff rendering strips indentation inside this string;
# the whitespace below is reconstructed, not verified.
custom_css = """
body {
    font-family: "Arial", serif;
}
.gradio-container {
    font-family: "Times New Roman", serif;
}
.gr-button {
    background-color: #007bff; /* Blue button */
    color: white;
    border: none;
    border-radius: 5px;
    font-size: 16px;
    padding: 10px 20px;
    cursor: pointer;
}
.gr-textbox:focus, .gr-button:focus {
    outline: none; /* Remove outline focus for a cleaner look */
}
"""
+
320
# Generate a dynamic welcome message using the LLM
def generate_welcome_message():
    """Ask the LLM for a short, supportive greeting used as the chat description."""
    welcome_prompt = """
    Generate a warm, friendly welcome message for a chatbot that focuses on helping users
    find information about Gender-Based Violence Resources in Rwanda. The message should:

    1. Introduce the chatbot's purpose clearly
    2. Be empathetic and supportive given the sensitive nature of the topic
    3. Encourage the user to ask questions
    4. Include 1-2 examples of questions they could ask
    5. Use a warm, friendly tone with 1-2 appropriate emojis
    6. Be concise (3-5 sentences)

    Your welcome message:
    """

    # One LLM round-trip; the reply text itself is the welcome message.
    return llm.invoke(welcome_prompt).content
+
340
# Create dynamic welcome message
# NOTE(review): this costs one LLM API call at startup, before the UI exists.
welcome_msg = generate_welcome_message()

# Create the Chat Interface with welcome message
demo = gr.ChatInterface(
    fn=rag_memory_stream,  # generator fn -> Gradio streams the yields
    title=title,
    fill_height=True,
    theme="soft",
    css=custom_css,  # Apply the custom CSS
    examples=["What services does Haguruka offer?", "How can I report a case of GBV?"],
    description=welcome_msg
)

# Launch the app
if __name__ == "__main__":
    # share/inbrowser/debug are local-run conveniences; presumably ignored
    # or redundant when hosted on Spaces — TODO confirm.
    demo.launch(share=True, inbrowser=True, debug=True)