Celia committed on
Commit
61e1bd6
·
1 Parent(s): c94f1b1
Files changed (2) hide show
  1. app.py +123 -16
  2. test.ipynb +0 -0
app.py CHANGED
@@ -164,6 +164,111 @@ class DataContextManager:
164
  except Exception as e:
165
  logger.error(f"Error in specific dataset search: {e}")
166
  return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
  def get_data_statistics(self):
169
  """Get statistics about loaded datasets"""
@@ -179,40 +284,43 @@ class DataContextManager:
179
 
180
  def enhanced_determine_query_type(query: str) -> Tuple[str, Dict[str, Any]]:
181
  """
182
- Enhanced query type determination - ADD THIS FUNCTION
183
  """
184
  query = query.lower()
185
  context = {}
186
 
187
- # Data-specific patterns
188
- if re.search(r'\b(ph|ph level|acidity|alkalinity)\b', query):
189
  return "data", {
190
  'dataset_preference': 'water_chemicals',
191
- 'focus_columns': ['pH', 'ph_level', 'acidity', 'alkalinity'],
192
  'data_type': 'chemical'
193
  }
194
 
195
- elif re.search(r'\b(oxygen|dissolved oxygen|do level|oxygen level)\b', query):
196
- return "data", {
197
- 'dataset_preference': 'water_chemicals',
198
- 'focus_columns': ['dissolved_oxygen', 'oxygen_level', 'DO'],
199
- 'data_type': 'chemical'
200
- }
201
-
202
- elif re.search(r'\b(quality score|water quality|contamination|pollution)\b', query):
203
  return "data", {
204
  'dataset_preference': 'water_quality_scores',
205
- 'focus_columns': ['quality_score', 'contamination_level', 'pollution_index'],
206
  'data_type': 'quality'
207
  }
208
 
209
- elif re.search(r'\b(inequality|access|drinking water access|water access)\b', query):
 
210
  return "data", {
211
  'dataset_preference': 'drinking_water_inequality',
212
- 'focus_columns': ['access_rate', 'inequality_index', 'coverage'],
213
  'data_type': 'social'
214
  }
215
 
 
 
 
 
 
 
 
 
216
  elif re.search(r'\b(statistics|stats|numbers|data|metrics|measurements)\b', query):
217
  return "hybrid", {'search_both': True}
218
 
@@ -232,7 +340,6 @@ def enhanced_determine_query_type(query: str) -> Tuple[str, Dict[str, Any]]:
232
  # Default to hybrid search for complex queries
233
  return "hybrid", {'search_both': True}
234
 
235
-
236
  # Configure logging
237
  logging.basicConfig(
238
  level=logging.INFO,
 
164
  except Exception as e:
165
  logger.error(f"Error in specific dataset search: {e}")
166
  return []
167
+
168
+
169
def _search_water_chemicals(self, query: str, df: pd.DataFrame, top_k: int = 5) -> List[Dict[str, Any]]:
    """Search the water_chemicals dataset for rows relevant to ``query``.

    The search runs in two stages:

    1. Direct filtering: rows are kept when their
       ``observedPropertyDeterminandLabel`` matches a chemical named in the
       query and, if a country from ``df['Country']`` is named in the query,
       when they belong to that country. Matches are ordered by
       ``phenomenonTimeReferenceYear`` (newest first) when that column exists.
    2. TF-IDF fallback over country, determinand label and mean value, used
       only when direct filtering leaves no rows.

    Args:
        query: Free-text user query.
        df: DataFrame with at least the columns ``Country`` and
            ``observedPropertyDeterminandLabel`` (``resultMeanValue`` is also
            read by the TF-IDF fallback).
        top_k: Maximum number of results to return. Values <= 0 yield ``[]``.

    Returns:
        A list of result dicts with keys ``dataset``, ``score``,
        ``row_index``, ``data`` and ``context_type``. Direct matches carry a
        fixed score of 1.0; fallback matches carry their cosine similarity.
    """
    # A non-positive top_k would turn the fallback slice [-top_k:] into
    # "everything", so bail out early.
    if top_k <= 0:
        return []

    query_lower = query.lower()

    def _mentions(phrase: str) -> bool:
        # Whole-word/phrase match so that 'ph' does not fire inside
        # 'phosphorus' and 'niger' does not fire inside 'nigeria'.
        # Lookarounds (rather than \b) also work for phrases that start or
        # end with punctuation.
        pattern = r'(?<!\w)' + re.escape(phrase) + r'(?!\w)'
        return re.search(pattern, query_lower) is not None

    # Query keyword -> determinand labels that should be selected for it.
    chemical_mappings = {
        'ph': ['pH', 'ph'],
        'nitrite': ['Nitrite', 'nitrite'],
        'oxygen': ['Dissolved oxygen', 'oxygen', 'O2'],
        'dissolved oxygen': ['Dissolved oxygen'],
        'nitrogen': ['Nitrogen', 'nitrogen', 'N'],
        'phosphorus': ['Phosphorus', 'phosphorus', 'P'],
        'temperature': ['Temperature', 'temperature', 'temp']
    }

    # Find relevant chemical types based on query.
    relevant_chemicals = set()
    for key, chemicals in chemical_mappings.items():
        if _mentions(key):
            relevant_chemicals.update(chemicals)

    # If no known keyword matched, look for query terms inside the labels
    # actually present in the data. Very short tokens ("is", "a", "of") are
    # ignored because they match inside unrelated labels.
    if not relevant_chemicals:
        terms = [term for term in query_lower.split() if len(term) >= 3]
        for chemical in df['observedPropertyDeterminandLabel'].unique():
            if pd.isna(chemical):
                continue
            label = str(chemical).lower()
            if any(term in label for term in terms):
                relevant_chemicals.add(chemical)

    # Filter data by relevant chemicals (no chemical named -> keep all rows).
    if relevant_chemicals:
        filtered_df = df[df['observedPropertyDeterminandLabel'].isin(relevant_chemicals)]
    else:
        filtered_df = df

    # Also filter by country if one is mentioned in the query. Blank and
    # non-string country values are skipped / stringified so they can neither
    # match every query nor raise.
    countries_in_query = []
    for country in df['Country'].unique():
        if pd.isna(country):
            continue
        name = str(country).strip().lower()
        if name and _mentions(name):
            countries_in_query.append(country)

    if countries_in_query:
        filtered_df = filtered_df[filtered_df['Country'].isin(countries_in_query)]

    # If we have filtered results, return the most recent ones.
    if len(filtered_df) > 0:
        if 'phenomenonTimeReferenceYear' in filtered_df.columns:
            # Stable sort keeps the original row order among equal years.
            filtered_df = filtered_df.sort_values(
                'phenomenonTimeReferenceYear', ascending=False, kind='mergesort'
            )

        return [
            {
                'dataset': 'water_chemicals',
                'score': 1.0,  # High score for direct matches
                'row_index': int(idx),
                'data': row.to_dict(),
                'context_type': 'data'
            }
            for idx, row in filtered_df.head(top_k).iterrows()
        ]

    # Fallback to TF-IDF search if no direct matches.
    try:
        text_columns = ['Country', 'observedPropertyDeterminandLabel', 'resultMeanValue']
        search_texts = [
            " ".join('' if pd.isna(value) else str(value) for value in values)
            for values in df[text_columns].itertuples(index=False, name=None)
        ]

        if search_texts:
            vectorizer = TfidfVectorizer(stop_words='english', min_df=1, max_features=3000)
            tfidf_matrix = vectorizer.fit_transform(search_texts)
            query_vector = vectorizer.transform([query])

            similarity_scores = cosine_similarity(query_vector, tfidf_matrix)[0]
            top_indices = similarity_scores.argsort()[-top_k:][::-1]

            # NOTE(review): row_index here is positional (used with iloc),
            # whereas the direct-match path reports the index label; the two
            # agree only for a default RangeIndex - confirm with callers.
            results = []
            for idx in top_indices:
                if similarity_scores[idx] > 0.01:
                    results.append({
                        'dataset': 'water_chemicals',
                        'score': float(similarity_scores[idx]),
                        'row_index': int(idx),
                        'data': df.iloc[idx].to_dict(),
                        'context_type': 'data'
                    })

            return results

    except Exception as e:
        # Best-effort search: log and fall through to an empty result.
        logger.error(f"Error in water chemicals TF-IDF search: {e}")

    return []
271
+
272
 
273
  def get_data_statistics(self):
274
  """Get statistics about loaded datasets"""
 
284
 
285
  def enhanced_determine_query_type(query: str) -> Tuple[str, Dict[str, Any]]:
286
  """
287
+ Enhanced query type determination - FIXED with real column names
288
  """
289
  query = query.lower()
290
  context = {}
291
 
292
+ # Chemical/pH patterns - search in water_chemicals data
293
+ if re.search(r'\b(ph|ph level|acidity|alkalinity|nitrite|dissolved oxygen|chemical|chemicals)\b', query):
294
  return "data", {
295
  'dataset_preference': 'water_chemicals',
296
+ 'focus_columns': ['observedPropertyDeterminandLabel', 'resultMeanValue'],
297
  'data_type': 'chemical'
298
  }
299
 
300
+ # Water quality patterns - search in water_quality_scores
301
+ elif re.search(r'\b(quality score|water quality|overall|rivers|groundwater|open water)\b', query):
 
 
 
 
 
 
302
  return "data", {
303
  'dataset_preference': 'water_quality_scores',
304
+ 'focus_columns': ['Overall', 'Rivers', 'Open water bodies', 'Groundwater'],
305
  'data_type': 'quality'
306
  }
307
 
308
+ # Inequality/access patterns - search in drinking_water_inequality
309
+ elif re.search(r'\b(inequality|access|drinking water|poorest|richest|ratio)\b', query):
310
  return "data", {
311
  'dataset_preference': 'drinking_water_inequality',
312
+ 'focus_columns': ['Poorest', 'Richest', 'Ratio'],
313
  'data_type': 'social'
314
  }
315
 
316
+ # Country-specific queries - search all datasets
317
+ elif re.search(r'\b(netherlands|usa|america|finland|bulgaria|albania|country|countries)\b', query):
318
+ return "hybrid", {
319
+ 'search_both': True,
320
+ 'country_query': True
321
+ }
322
+
323
+ # Statistics/data queries - hybrid search
324
  elif re.search(r'\b(statistics|stats|numbers|data|metrics|measurements)\b', query):
325
  return "hybrid", {'search_both': True}
326
 
 
340
  # Default to hybrid search for complex queries
341
  return "hybrid", {'search_both': True}
342
 
 
343
  # Configure logging
344
  logging.basicConfig(
345
  level=logging.INFO,
test.ipynb ADDED
File without changes