Phoenix21 committed on
Commit
701dcf6
·
1 Parent(s): 019f925

modularized the code

Browse files
Files changed (2) hide show
  1. app.py +4 -143
  2. pipeline.py +144 -0
app.py CHANGED
@@ -1,162 +1,23 @@
1
- import pandas as pd
2
- from sentence_transformers import SentenceTransformer
3
- from sklearn.metrics.pairwise import cosine_similarity
4
- from groq import Groq
5
- import gradio as gr
6
- import os
7
-
8
- # Load the wellness dataset (assuming wellness.csv is available)
9
- df = pd.read_csv('wellness.csv')
10
-
11
- # Initialize Groq API client
12
- client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
13
-
14
- # Handle missing values by using LLM-based generation for missing Method, Benefits, and Question
15
- def generate_missing_content(field, prompt):
16
- """Use LLM to generate content for missing fields."""
17
- result = llm(f"Fill the following field: {prompt}")
18
- return result[0]['generated_text']
19
-
20
- # Step 1: Fill missing data using LLM for processing
21
- def fill_missing_data(row):
22
- # If Method is missing, fill it using LLM
23
- if row['Method'] == 'Not specified':
24
- row['Method'] = generate_missing_content('Method', 'Generate a method for improving mental health')
25
-
26
- # If Benefits is missing, fill it using LLM
27
- if row['Benefits'] == 'No specific benefits mentioned':
28
- row['Benefits'] = generate_missing_content('Benefits', 'Generate benefits for stress reduction methods')
29
-
30
- # If Question is missing, fill it using LLM
31
- if row['Question'] == 'No question specified':
32
- row['Question'] = generate_missing_content('Question', 'Generate a relevant question about mental health')
33
-
34
- return row
35
-
36
- # Apply LLM-based filling to the entire dataframe (process only for missing values)
37
- df = df.apply(fill_missing_data, axis=1)
38
-
39
- # Ensure that all columns are of string type before processing
40
- df['Method'] = df['Method'].astype(str)
41
- df['Benefits'] = df['Benefits'].astype(str)
42
- df['Question'] = df['Question'].astype(str)
43
-
44
- # Step 2: Information Alignment with Sentence-BERT for semantic matching
45
- model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
46
-
47
- # Combine `Method`, `Benefits`, and `Question` for alignment
48
- df['query_match'] = df['Method'] + " " + df['Benefits'] + " " + df['Question']
49
-
50
- # Ensure all the entries in `query_match` are strings and handle any NaNs
51
- df['query_match'] = df['query_match'].astype(str)
52
-
53
- # Compute embeddings for both the query and the dataset (Method + Benefits + Question)
54
- def chatbot_response(query):
55
- # Convert query to embedding
56
- query_embedding = model.encode([query])
57
- document_embeddings = model.encode(df['query_match'].tolist())
58
-
59
- # Calculate cosine similarity between the query and data rows
60
- cosine_similarities = cosine_similarity(query_embedding, document_embeddings).flatten()
61
-
62
- # Get the top N matches (let's assume top 3 for now)
63
- top_n_indices = cosine_similarities.argsort()[-3:][::-1]
64
 
65
- # Retrieve the top N most relevant results
66
- retrieved_data = df.iloc[top_n_indices]
67
- return retrieved_data
68
-
69
- # Step 3: Generate a Coherent Response Using Groq DeepSeek-R1 LLM
70
- def generate_coherent_response(query, retrieved_data, chat_history):
71
- # Compile all relevant fields into a prompt for the LLM to create a coherent response
72
- coherent_prompt = "Generate a coherent response based on the following information:\n"
73
-
74
- for _, row in retrieved_data.iterrows():
75
- coherent_prompt += f"Method: {row['Method']}\n"
76
- coherent_prompt += f"Benefits: {row['Benefits']}\n"
77
- coherent_prompt += f"Recommendation: {row['DailyWellness AI Recommendation']}\n"
78
- coherent_prompt += f"Question: {row['Question']}\n\n"
79
-
80
- # Add chat history to the prompt for context
81
- coherent_prompt += f"Chat History:\n{chat_history}\n"
82
- coherent_prompt += f"User Query: {query}\n\nGenerate a summary that integrates the methods, benefits, and recommendations."
83
-
84
- # Using Groq's DeepSeek-R1 to generate the coherent response
85
- completion = client.chat.completions.create(
86
- model="mixtral-8x7b-32768",
87
- messages=[
88
- {"role": "user", "content": coherent_prompt}
89
- ],
90
- temperature=0.6,
91
- max_completion_tokens=4096,
92
- top_p=0.95,
93
- stream=True,
94
- stop=None,
95
- )
96
-
97
- # Collect and return the coherent response
98
- response = ""
99
- for chunk in completion:
100
- response += chunk.choices[0].delta.content or ""
101
- return response
102
-
103
- # Step 4: Manage Chat History
104
- class ChatHistory:
105
- def __init__(self):
106
- self.history = []
107
-
108
- def add_message(self, role, content):
109
- self.history.append({"role": role, "content": content})
110
-
111
- def get_history(self):
112
- return "\n".join([f"{msg['role']}: {msg['content']}" for msg in self.history])
113
 
114
  # Create a new instance of chat history
115
  chat_history = ChatHistory()
116
 
117
- # Step 5: Self-Verification and Content Moderation
118
- # def verify_health_wellness_query(query, retrieved_data):
119
- # """
120
- # Verifies if the query is related to health and wellness and checks if retrieved data is relevant.
121
- # """
122
- # query_lower = query.lower()
123
-
124
- # # Use Groq's LLM to evaluate the safety of the query (new LLM-based moderation)
125
- # chat_completion = client.chat.completions.create(
126
- # messages=[
127
- # {"role": "user", "content": query}
128
- # ],
129
- # model="llama-guard-3-8b",
130
- # )
131
-
132
- # moderation_result = chat_completion.choices[0].message.content.strip()
133
-
134
- # # If the model's response indicates harmful content, block the query
135
- # if 'unsafe' in moderation_result.lower() or 'harmful' in moderation_result.lower():
136
- # return False, "The query is flagged as unsafe or harmful. Please rephrase it."
137
-
138
- # # Proceed with verifying if the retrieved data aligns with the health/wellness context
139
- # wellness_keywords = ['mental health', 'stress', 'wellness', 'anxiety', 'relaxation', 'meditation']
140
-
141
- # # Check if the query contains any of the relevant wellness-related keywords
142
- # if any(keyword in query_lower for keyword in wellness_keywords):
143
- # return True, "" # If any relevant wellness keyword is present, it's valid
144
-
145
- # return False, "The query does not seem to match health and wellness topics."
146
-
147
  # Step 6: Define Gradio Interface for Chatbot
148
  def gradio_chatbot(user_query):
149
  # Step 7: Retrieve relevant data for the query
150
  retrieved_data = chatbot_response(user_query)
151
 
152
  # Step 8: Check and verify the query for health/wellness content
153
- # is_valid, message = verify_health_wellness_query(user_query, retrieved_data)
154
  is_valid = True
155
  if is_valid:
156
  # Generate a coherent response using Groq's DeepSeek-R1 LLM
157
  coherent_response = generate_coherent_response(user_query, retrieved_data, chat_history.get_history())
158
  else:
159
- coherent_response = message # Return the moderation message if the query is harmful or irrelevant
160
 
161
  # Add the user message and assistant response to the chat history
162
  chat_history.add_message("user", user_query)
 
1
+ # app.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ import gradio as gr
4
+ from pipeline import chatbot_response, generate_coherent_response, ChatHistory
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  # Create a new instance of chat history
7
  chat_history = ChatHistory()
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  # Step 6: Define Gradio Interface for Chatbot
10
  def gradio_chatbot(user_query):
11
  # Step 7: Retrieve relevant data for the query
12
  retrieved_data = chatbot_response(user_query)
13
 
14
  # Step 8: Check and verify the query for health/wellness content
 
15
  is_valid = True
16
  if is_valid:
17
  # Generate a coherent response using Groq's DeepSeek-R1 LLM
18
  coherent_response = generate_coherent_response(user_query, retrieved_data, chat_history.get_history())
19
  else:
20
+ coherent_response = "The query does not seem to match health and wellness topics."
21
 
22
  # Add the user message and assistant response to the chat history
23
  chat_history.add_message("user", user_query)
pipeline.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pipeline.py
2
+
3
+ import pandas as pd
4
+ from sentence_transformers import SentenceTransformer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ from groq import Groq
7
+ import os
8
+
9
+ # Load the wellness dataset (assuming wellness.csv is available)
10
+ df = pd.read_csv('wellness.csv')
11
+
12
+ # Initialize Groq API client
13
+ client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
14
+
15
+ # Handle missing values by using LLM-based generation for missing Method, Benefits, and Question
16
+ def generate_missing_content(field, prompt):
17
+ """Use LLM to generate content for missing fields."""
18
+ result = llm(f"Fill the following field: {prompt}")
19
+ return result[0]['generated_text']
20
+
21
+ # Step 1: Fill missing data using LLM for processing
22
+ def fill_missing_data(row):
23
+ # If Method is missing, fill it using LLM
24
+ if row['Method'] == 'Not specified':
25
+ row['Method'] = generate_missing_content('Method', 'Generate a method for improving mental health')
26
+
27
+ # If Benefits is missing, fill it using LLM
28
+ if row['Benefits'] == 'No specific benefits mentioned':
29
+ row['Benefits'] = generate_missing_content('Benefits', 'Generate benefits for stress reduction methods')
30
+
31
+ # If Question is missing, fill it using LLM
32
+ if row['Question'] == 'No question specified':
33
+ row['Question'] = generate_missing_content('Question', 'Generate a relevant question about mental health')
34
+
35
+ return row
36
+
37
+ # Apply LLM-based filling to the entire dataframe (process only for missing values)
38
+ df = df.apply(fill_missing_data, axis=1)
39
+
40
+ # Ensure that all columns are of string type before processing
41
+ df['Method'] = df['Method'].astype(str)
42
+ df['Benefits'] = df['Benefits'].astype(str)
43
+ df['Question'] = df['Question'].astype(str)
44
+
45
+ # Step 2: Information Alignment with Sentence-BERT for semantic matching
46
+ model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
47
+
48
+ # Combine `Method`, `Benefits`, and `Question` for alignment
49
+ df['query_match'] = df['Method'] + " " + df['Benefits'] + " " + df['Question']
50
+
51
+ # Ensure all the entries in `query_match` are strings and handle any NaNs
52
+ df['query_match'] = df['query_match'].astype(str)
53
+
54
+ # Compute embeddings for both the query and the dataset (Method + Benefits + Question)
55
+ def chatbot_response(query):
56
+ # Convert query to embedding
57
+ query_embedding = model.encode([query])
58
+ document_embeddings = model.encode(df['query_match'].tolist())
59
+
60
+ # Calculate cosine similarity between the query and data rows
61
+ cosine_similarities = cosine_similarity(query_embedding, document_embeddings).flatten()
62
+
63
+ # Get the top N matches (let's assume top 3 for now)
64
+ top_n_indices = cosine_similarities.argsort()[-3:][::-1]
65
+
66
+ # Retrieve the top N most relevant results
67
+ retrieved_data = df.iloc[top_n_indices]
68
+ return retrieved_data
69
+
70
+ # Step 3: Generate a Coherent Response Using Groq DeepSeek-R1 LLM
71
+ def generate_coherent_response(query, retrieved_data, chat_history):
72
+ # Compile all relevant fields into a prompt for the LLM to create a coherent response
73
+ coherent_prompt = "Generate a coherent response based on the following information:\n"
74
+
75
+ for _, row in retrieved_data.iterrows():
76
+ coherent_prompt += f"Method: {row['Method']}\n"
77
+ coherent_prompt += f"Benefits: {row['Benefits']}\n"
78
+ coherent_prompt += f"Recommendation: {row['DailyWellness AI Recommendation']}\n"
79
+ coherent_prompt += f"Question: {row['Question']}\n\n"
80
+
81
+ # Add chat history to the prompt for context
82
+ coherent_prompt += f"Chat History:\n{chat_history}\n"
83
+ coherent_prompt += f"User Query: {query}\n\nGenerate a summary that integrates the methods, benefits, and recommendations."
84
+
85
+ # Using Groq's DeepSeek-R1 to generate the coherent response
86
+ completion = client.chat.completions.create(
87
+ model="mixtral-8x7b-32768",
88
+ messages=[
89
+ {"role": "user", "content": coherent_prompt}
90
+ ],
91
+ temperature=0.6,
92
+ max_completion_tokens=4096,
93
+ top_p=0.95,
94
+ stream=True,
95
+ stop=None,
96
+ )
97
+
98
+ # Collect and return the coherent response
99
+ response = ""
100
+ for chunk in completion:
101
+ response += chunk.choices[0].delta.content or ""
102
+ return response
103
+
104
+ # Step 4: Manage Chat History
105
+ class ChatHistory:
106
+ def __init__(self):
107
+ self.history = []
108
+
109
+ def add_message(self, role, content):
110
+ self.history.append({"role": role, "content": content})
111
+
112
+ def get_history(self):
113
+ return "\n".join([f"{msg['role']}: {msg['content']}" for msg in self.history])
114
+
115
+ # Step 5: Self-Verification and Content Moderation
116
+ # def verify_health_wellness_query(query, retrieved_data):
117
+ # """
118
+ # Verifies if the query is related to health and wellness and checks if retrieved data is relevant.
119
+ # """
120
+ # query_lower = query.lower()
121
+
122
+ # # Use Groq's LLM to evaluate the safety of the query (new LLM-based moderation)
123
+ # chat_completion = client.chat.completions.create(
124
+ # messages=[
125
+ # {"role": "user", "content": query}
126
+ # ],
127
+ # model="llama-guard-3-8b",
128
+ # )
129
+
130
+ # moderation_result = chat_completion.choices[0].message.content.strip()
131
+
132
+ # # If the model's response indicates harmful content, block the query
133
+ # if 'unsafe' in moderation_result.lower() or 'harmful' in moderation_result.lower():
134
+ # return False, "The query is flagged as unsafe or harmful. Please rephrase it."
135
+
136
+ # # Proceed with verifying if the retrieved data aligns with the health/wellness context
137
+ # wellness_keywords = ['mental health', 'stress', 'wellness', 'anxiety', 'relaxation', 'meditation']
138
+
139
+ # # Check if the query contains any of the relevant wellness-related keywords
140
+ # if any(keyword in query_lower for keyword in wellness_keywords):
141
+ # return True, "" # If any relevant wellness keyword is present, it's valid
142
+
143
+ # return False, "The query does not seem to match health and wellness topics."
144
+