Spaces:

Phoenix21
/

ARM_BASED_QA_CHATBOT

Sleeping

App Files Community

Phoenix21 committed on Feb 4, 2025

Commit

701dcf6

1 Parent(s): 019f925

modularized the code

Browse files

Files changed (2) hide show

app.py +4 -143
pipeline.py +144 -0

app.py CHANGED Viewed

@@ -1,162 +1,23 @@
-import pandas as pd
-from sentence_transformers import SentenceTransformer
-from sklearn.metrics.pairwise import cosine_similarity
-from groq import Groq
-import gradio as gr
-import os
-# Load the wellness dataset (assuming wellness.csv is available)
-df = pd.read_csv('wellness.csv')
-# Initialize Groq API client
-client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
-# Handle missing values by using LLM-based generation for missing Method, Benefits, and Question
-def generate_missing_content(field, prompt):
-    """Use LLM to generate content for missing fields."""
-    result = llm(f"Fill the following field: {prompt}")
-    return result[0]['generated_text']
-# Step 1: Fill missing data using LLM for processing
-def fill_missing_data(row):
-    # If Method is missing, fill it using LLM
-    if row['Method'] == 'Not specified':
-        row['Method'] = generate_missing_content('Method', 'Generate a method for improving mental health')
-    # If Benefits is missing, fill it using LLM
-    if row['Benefits'] == 'No specific benefits mentioned':
-        row['Benefits'] = generate_missing_content('Benefits', 'Generate benefits for stress reduction methods')
-    # If Question is missing, fill it using LLM
-    if row['Question'] == 'No question specified':
-        row['Question'] = generate_missing_content('Question', 'Generate a relevant question about mental health')
-    return row
-# Apply LLM-based filling to the entire dataframe (process only for missing values)
-df = df.apply(fill_missing_data, axis=1)
-# Ensure that all columns are of string type before processing
-df['Method'] = df['Method'].astype(str)
-df['Benefits'] = df['Benefits'].astype(str)
-df['Question'] = df['Question'].astype(str)
-# Step 2: Information Alignment with Sentence-BERT for semantic matching
-model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
-# Combine `Method`, `Benefits`, and `Question` for alignment
-df['query_match'] = df['Method'] + " " + df['Benefits'] + " " + df['Question']
-# Ensure all the entries in `query_match` are strings and handle any NaNs
-df['query_match'] = df['query_match'].astype(str)
-# Compute embeddings for both the query and the dataset (Method + Benefits + Question)
-def chatbot_response(query):
-    # Convert query to embedding
-    query_embedding = model.encode([query])
-    document_embeddings = model.encode(df['query_match'].tolist())
-    # Calculate cosine similarity between the query and data rows
-    cosine_similarities = cosine_similarity(query_embedding, document_embeddings).flatten()
-    # Get the top N matches (let's assume top 3 for now)
-    top_n_indices = cosine_similarities.argsort()[-3:][::-1]
-    # Retrieve the top N most relevant results
-    retrieved_data = df.iloc[top_n_indices]
-    return retrieved_data
-# Step 3: Generate a Coherent Response Using Groq DeepSeek-R1 LLM
-def generate_coherent_response(query, retrieved_data, chat_history):
-    # Compile all relevant fields into a prompt for the LLM to create a coherent response
-    coherent_prompt = "Generate a coherent response based on the following information:\n"
-    for _, row in retrieved_data.iterrows():
-        coherent_prompt += f"Method: {row['Method']}\n"
-        coherent_prompt += f"Benefits: {row['Benefits']}\n"
-        coherent_prompt += f"Recommendation: {row['DailyWellness AI Recommendation']}\n"
-        coherent_prompt += f"Question: {row['Question']}\n\n"
-    # Add chat history to the prompt for context
-    coherent_prompt += f"Chat History:\n{chat_history}\n"
-    coherent_prompt += f"User Query: {query}\n\nGenerate a summary that integrates the methods, benefits, and recommendations."
-    # Using Groq's DeepSeek-R1 to generate the coherent response
-    completion = client.chat.completions.create(
-        model="mixtral-8x7b-32768",
-        messages=[
-            {"role": "user", "content": coherent_prompt}
-        ],
-        temperature=0.6,
-        max_completion_tokens=4096,
-        top_p=0.95,
-        stream=True,
-        stop=None,
-    )
-    # Collect and return the coherent response
-    response = ""
-    for chunk in completion:
-        response += chunk.choices[0].delta.content or ""
-    return response
-# Step 4: Manage Chat History
-class ChatHistory:
-    def __init__(self):
-        self.history = []
-    def add_message(self, role, content):
-        self.history.append({"role": role, "content": content})
-    def get_history(self):
-        return "\n".join([f"{msg['role']}: {msg['content']}" for msg in self.history])
 # Create a new instance of chat history
 chat_history = ChatHistory()
-# Step 5: Self-Verification and Content Moderation
-# def verify_health_wellness_query(query, retrieved_data):
-#     """
-#     Verifies if the query is related to health and wellness and checks if retrieved data is relevant.
-#     """
-#     query_lower = query.lower()
-#     # Use Groq's LLM to evaluate the safety of the query (new LLM-based moderation)
-#     chat_completion = client.chat.completions.create(
-#         messages=[
-#             {"role": "user", "content": query}
-#         ],
-#         model="llama-guard-3-8b",
-#     )
-#     moderation_result = chat_completion.choices[0].message.content.strip()
-#     # If the model's response indicates harmful content, block the query
-#     if 'unsafe' in moderation_result.lower() or 'harmful' in moderation_result.lower():
-#         return False, "The query is flagged as unsafe or harmful. Please rephrase it."
-#     # Proceed with verifying if the retrieved data aligns with the health/wellness context
-#     wellness_keywords = ['mental health', 'stress', 'wellness', 'anxiety', 'relaxation', 'meditation']
-#     # Check if the query contains any of the relevant wellness-related keywords
-#     if any(keyword in query_lower for keyword in wellness_keywords):
-#         return True, ""  # If any relevant wellness keyword is present, it's valid
-#     return False, "The query does not seem to match health and wellness topics."
 # Step 6: Define Gradio Interface for Chatbot
 def gradio_chatbot(user_query):
     # Step 7: Retrieve relevant data for the query
     retrieved_data = chatbot_response(user_query)
     # Step 8: Check and verify the query for health/wellness content
-    # is_valid, message = verify_health_wellness_query(user_query, retrieved_data)
     is_valid = True
     if is_valid:
         # Generate a coherent response using Groq's DeepSeek-R1 LLM
         coherent_response = generate_coherent_response(user_query, retrieved_data, chat_history.get_history())
     else:
-        coherent_response = message  # Return the moderation message if the query is harmful or irrelevant
     # Add the user message and assistant response to the chat history
     chat_history.add_message("user", user_query)

+# app.py
+import gradio as gr
+from pipeline import chatbot_response, generate_coherent_response, ChatHistory
 # Create a new instance of chat history
 chat_history = ChatHistory()
 # Step 6: Define Gradio Interface for Chatbot
 def gradio_chatbot(user_query):
     # Step 7: Retrieve relevant data for the query
     retrieved_data = chatbot_response(user_query)
     # Step 8: Check and verify the query for health/wellness content
     is_valid = True
     if is_valid:
         # Generate a coherent response using Groq's DeepSeek-R1 LLM
         coherent_response = generate_coherent_response(user_query, retrieved_data, chat_history.get_history())
     else:
+        coherent_response = "The query does not seem to match health and wellness topics."
     # Add the user message and assistant response to the chat history
     chat_history.add_message("user", user_query)

pipeline.py ADDED Viewed

	@@ -0,0 +1,144 @@

+# pipeline.py
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+from groq import Groq
+import os
+# Load the wellness dataset (assuming wellness.csv is available)
+df = pd.read_csv('wellness.csv')
+# Initialize Groq API client
+client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
+# Handle missing values by using LLM-based generation for missing Method, Benefits, and Question
+def generate_missing_content(field, prompt):
+    """Use LLM to generate content for missing fields."""
+    result = llm(f"Fill the following field: {prompt}")
+    return result[0]['generated_text']
+# Step 1: Fill missing data using LLM for processing
+def fill_missing_data(row):
+    # If Method is missing, fill it using LLM
+    if row['Method'] == 'Not specified':
+        row['Method'] = generate_missing_content('Method', 'Generate a method for improving mental health')
+    # If Benefits is missing, fill it using LLM
+    if row['Benefits'] == 'No specific benefits mentioned':
+        row['Benefits'] = generate_missing_content('Benefits', 'Generate benefits for stress reduction methods')
+    # If Question is missing, fill it using LLM
+    if row['Question'] == 'No question specified':
+        row['Question'] = generate_missing_content('Question', 'Generate a relevant question about mental health')
+    return row
+# Apply LLM-based filling to the entire dataframe (process only for missing values)
+df = df.apply(fill_missing_data, axis=1)
+# Ensure that all columns are of string type before processing
+df['Method'] = df['Method'].astype(str)
+df['Benefits'] = df['Benefits'].astype(str)
+df['Question'] = df['Question'].astype(str)
+# Step 2: Information Alignment with Sentence-BERT for semantic matching
+model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
+# Combine `Method`, `Benefits`, and `Question` for alignment
+df['query_match'] = df['Method'] + " " + df['Benefits'] + " " + df['Question']
+# Ensure all the entries in `query_match` are strings and handle any NaNs
+df['query_match'] = df['query_match'].astype(str)
+# Compute embeddings for both the query and the dataset (Method + Benefits + Question)
+def chatbot_response(query):
+    # Convert query to embedding
+    query_embedding = model.encode([query])
+    document_embeddings = model.encode(df['query_match'].tolist())
+    # Calculate cosine similarity between the query and data rows
+    cosine_similarities = cosine_similarity(query_embedding, document_embeddings).flatten()
+    # Get the top N matches (let's assume top 3 for now)
+    top_n_indices = cosine_similarities.argsort()[-3:][::-1]
+    # Retrieve the top N most relevant results
+    retrieved_data = df.iloc[top_n_indices]
+    return retrieved_data
+# Step 3: Generate a Coherent Response Using a Groq-hosted LLM (mixtral-8x7b-32768)
+def generate_coherent_response(query, retrieved_data, chat_history):
+    # Compile all relevant fields into a prompt for the LLM to create a coherent response
+    coherent_prompt = "Generate a coherent response based on the following information:\n"
+    for _, row in retrieved_data.iterrows():
+        coherent_prompt += f"Method: {row['Method']}\n"
+        coherent_prompt += f"Benefits: {row['Benefits']}\n"
+        coherent_prompt += f"Recommendation: {row['DailyWellness AI Recommendation']}\n"
+        coherent_prompt += f"Question: {row['Question']}\n\n"
+    # Add chat history to the prompt for context
+    coherent_prompt += f"Chat History:\n{chat_history}\n"
+    coherent_prompt += f"User Query: {query}\n\nGenerate a summary that integrates the methods, benefits, and recommendations."
+    # Using Groq's chat-completions API (model set below) to generate the coherent response
+    completion = client.chat.completions.create(
+        model="mixtral-8x7b-32768",
+        messages=[
+            {"role": "user", "content": coherent_prompt}
+        ],
+        temperature=0.6,
+        max_completion_tokens=4096,
+        top_p=0.95,
+        stream=True,
+        stop=None,
+    )
+    # Collect and return the coherent response
+    response = ""
+    for chunk in completion:
+        response += chunk.choices[0].delta.content or ""
+    return response
+# Step 4: Manage Chat History
+class ChatHistory:
+    def __init__(self):
+        self.history = []
+    def add_message(self, role, content):
+        self.history.append({"role": role, "content": content})
+    def get_history(self):
+        return "\n".join([f"{msg['role']}: {msg['content']}" for msg in self.history])
+# Step 5: Self-Verification and Content Moderation
+# def verify_health_wellness_query(query, retrieved_data):
+#     """
+#     Verifies if the query is related to health and wellness and checks if retrieved data is relevant.
+#     """
+#     query_lower = query.lower()
+#     # Use Groq's LLM to evaluate the safety of the query (new LLM-based moderation)
+#     chat_completion = client.chat.completions.create(
+#         messages=[
+#             {"role": "user", "content": query}
+#         ],
+#         model="llama-guard-3-8b",
+#     )
+#     moderation_result = chat_completion.choices[0].message.content.strip()
+#     # If the model's response indicates harmful content, block the query
+#     if 'unsafe' in moderation_result.lower() or 'harmful' in moderation_result.lower():
+#         return False, "The query is flagged as unsafe or harmful. Please rephrase it."
+#     # Proceed with verifying if the retrieved data aligns with the health/wellness context
+#     wellness_keywords = ['mental health', 'stress', 'wellness', 'anxiety', 'relaxation', 'meditation']
+#     # Check if the query contains any of the relevant wellness-related keywords
+#     if any(keyword in query_lower for keyword in wellness_keywords):
+#         return True, ""  # If any relevant wellness keyword is present, it's valid
+#     return False, "The query does not seem to match health and wellness topics."