Spaces:
Sleeping
Sleeping
Celia committed on
Commit ·
61e1bd6
1
Parent(s): c94f1b1
ww
Browse files- app.py +123 -16
- test.ipynb +0 -0
app.py
CHANGED
|
@@ -164,6 +164,111 @@ class DataContextManager:
|
|
| 164 |
except Exception as e:
|
| 165 |
logger.error(f"Error in specific dataset search: {e}")
|
| 166 |
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
|
| 168 |
def get_data_statistics(self):
|
| 169 |
"""Get statistics about loaded datasets"""
|
|
@@ -179,40 +284,43 @@ class DataContextManager:
|
|
| 179 |
|
| 180 |
def enhanced_determine_query_type(query: str) -> Tuple[str, Dict[str, Any]]:
|
| 181 |
"""
|
| 182 |
-
Enhanced query type determination -
|
| 183 |
"""
|
| 184 |
query = query.lower()
|
| 185 |
context = {}
|
| 186 |
|
| 187 |
-
#
|
| 188 |
-
if re.search(r'\b(ph|ph level|acidity|alkalinity)\b', query):
|
| 189 |
return "data", {
|
| 190 |
'dataset_preference': 'water_chemicals',
|
| 191 |
-
'focus_columns': ['
|
| 192 |
'data_type': 'chemical'
|
| 193 |
}
|
| 194 |
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
'dataset_preference': 'water_chemicals',
|
| 198 |
-
'focus_columns': ['dissolved_oxygen', 'oxygen_level', 'DO'],
|
| 199 |
-
'data_type': 'chemical'
|
| 200 |
-
}
|
| 201 |
-
|
| 202 |
-
elif re.search(r'\b(quality score|water quality|contamination|pollution)\b', query):
|
| 203 |
return "data", {
|
| 204 |
'dataset_preference': 'water_quality_scores',
|
| 205 |
-
'focus_columns': ['
|
| 206 |
'data_type': 'quality'
|
| 207 |
}
|
| 208 |
|
| 209 |
-
|
|
|
|
| 210 |
return "data", {
|
| 211 |
'dataset_preference': 'drinking_water_inequality',
|
| 212 |
-
'focus_columns': ['
|
| 213 |
'data_type': 'social'
|
| 214 |
}
|
| 215 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
elif re.search(r'\b(statistics|stats|numbers|data|metrics|measurements)\b', query):
|
| 217 |
return "hybrid", {'search_both': True}
|
| 218 |
|
|
@@ -232,7 +340,6 @@ def enhanced_determine_query_type(query: str) -> Tuple[str, Dict[str, Any]]:
|
|
| 232 |
# Default to hybrid search for complex queries
|
| 233 |
return "hybrid", {'search_both': True}
|
| 234 |
|
| 235 |
-
|
| 236 |
# Configure logging
|
| 237 |
logging.basicConfig(
|
| 238 |
level=logging.INFO,
|
|
|
|
| 164 |
except Exception as e:
|
| 165 |
logger.error(f"Error in specific dataset search: {e}")
|
| 166 |
return []
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def _search_water_chemicals(self, query: str, df: pd.DataFrame, top_k: int = 5) -> List[Dict[str, Any]]:
    """Search the water_chemicals dataset for rows relevant to ``query``.

    The search runs in two stages:

    1. Direct filtering: rows whose ``observedPropertyDeterminandLabel`` matches
       a chemical mentioned in the query (and whose ``Country`` is mentioned,
       if any country is). Matches are returned newest-first when a
       ``phenomenonTimeReferenceYear`` column is present, each with score 1.0.
    2. TF-IDF fallback: used only when direct filtering leaves no rows. Rows
       are ranked by cosine similarity between the query and a text built from
       the country, chemical label and mean value.

    Args:
        query: Free-text user query.
        df: The water_chemicals DataFrame. The code reads the ``Country``,
            ``observedPropertyDeterminandLabel`` and ``resultMeanValue`` columns.
        top_k: Maximum number of results to return.

    Returns:
        A list of result dicts with the keys ``dataset``, ``score``,
        ``row_index``, ``data`` and ``context_type``. Empty when nothing
        matches, when ``df`` is empty, or when ``top_k`` is not positive.
    """
    # A non-positive top_k used to return *every* row in the fallback path
    # (``[-0:]`` slices the whole array); treat it as "no results" instead.
    if top_k <= 0 or df is None or df.empty:
        return []

    query_lower = query.lower()

    def mentions(phrase: str, allow_plural: bool = False) -> bool:
        """Return True if ``phrase`` appears in the query as a whole word/phrase."""
        # Lookarounds instead of \b so phrases ending in punctuation still match.
        plural = r'(?:e?s)?' if allow_plural else ''
        pattern = r'(?<!\w)' + re.escape(phrase.lower()) + plural + r'(?!\w)'
        return re.search(pattern, query_lower) is not None

    # Define chemical mappings for better matching
    chemical_mappings = {
        'ph': ['pH', 'ph'],
        'nitrite': ['Nitrite', 'nitrite'],
        'oxygen': ['Dissolved oxygen', 'oxygen', 'O2'],
        'dissolved oxygen': ['Dissolved oxygen'],
        'nitrogen': ['Nitrogen', 'nitrogen', 'N'],
        'phosphorus': ['Phosphorus', 'phosphorus', 'P'],
        'temperature': ['Temperature', 'temperature', 'temp'],
    }

    # Whole-word matching: a plain substring test made 'ph' fire on
    # "phosphorus", mixing pH rows into phosphorus results.
    relevant_chemicals = set()
    for key, chemicals in chemical_mappings.items():
        if mentions(key, allow_plural=True):
            relevant_chemicals.update(chemicals)

    labels = df['observedPropertyDeterminandLabel']

    # If no specific chemicals found, search more broadly: keep any label that
    # contains one of the query's whitespace-separated terms.
    if not relevant_chemicals:
        terms = query_lower.split()
        for chemical in labels.dropna().unique():
            label_lower = str(chemical).lower()
            if any(term in label_lower for term in terms):
                relevant_chemicals.add(chemical)

    # Filter data by relevant chemicals
    if relevant_chemicals:
        filtered_df = df[labels.isin(list(relevant_chemicals))]
    else:
        filtered_df = df

    # Also filter by country if mentioned in query. Whole-phrase matching keeps
    # e.g. "Oman" from matching inside "romania".
    countries_in_query = [
        country
        for country in df['Country'].dropna().unique()
        if mentions(str(country))
    ]
    if countries_in_query:
        filtered_df = filtered_df[filtered_df['Country'].isin(countries_in_query)]

    # If we have filtered results, return them (most recent year first)
    if not filtered_df.empty:
        if 'phenomenonTimeReferenceYear' in filtered_df.columns:
            filtered_df = filtered_df.sort_values('phenomenonTimeReferenceYear', ascending=False)

        return [
            {
                'dataset': 'water_chemicals',
                'score': 1.0,  # High score for direct matches
                'row_index': int(idx),
                'data': row.to_dict(),
                'context_type': 'data',
            }
            for idx, row in filtered_df.head(top_k).iterrows()
        ]

    # Fallback to TF-IDF search if no direct matches
    try:
        text_columns = ['Country', 'observedPropertyDeterminandLabel', 'resultMeanValue']
        search_texts = [
            " ".join(str(value) if pd.notna(value) else '' for value in values)
            for values in df[text_columns].itertuples(index=False, name=None)
        ]

        if search_texts:
            vectorizer = TfidfVectorizer(stop_words='english', min_df=1, max_features=3000)
            tfidf_matrix = vectorizer.fit_transform(search_texts)
            query_vector = vectorizer.transform([query])

            similarity_scores = cosine_similarity(query_vector, tfidf_matrix)[0]
            # Positions of the top_k highest scores, best first.
            top_indices = similarity_scores.argsort()[-top_k:][::-1]

            results = []
            for idx in top_indices:
                # Drop near-zero similarities so unrelated rows are not returned.
                if similarity_scores[idx] > 0.01:
                    results.append({
                        'dataset': 'water_chemicals',
                        'score': float(similarity_scores[idx]),
                        'row_index': int(idx),
                        'data': df.iloc[idx].to_dict(),
                        'context_type': 'data',
                    })

            return results

    except Exception as e:
        # Best-effort search: log the failure and fall through to "no results".
        logger.error(f"Error in water chemicals TF-IDF search: {e}")

    return []
|
| 271 |
+
|
| 272 |
|
| 273 |
def get_data_statistics(self):
|
| 274 |
"""Get statistics about loaded datasets"""
|
|
|
|
| 284 |
|
| 285 |
def enhanced_determine_query_type(query: str) -> Tuple[str, Dict[str, Any]]:
|
| 286 |
"""
|
| 287 |
+
Enhanced query type determination - FIXED with real column names
|
| 288 |
"""
|
| 289 |
query = query.lower()
|
| 290 |
context = {}
|
| 291 |
|
| 292 |
+
# Chemical/pH patterns - search in water_chemicals data
|
| 293 |
+
if re.search(r'\b(ph|ph level|acidity|alkalinity|nitrite|dissolved oxygen|chemical|chemicals)\b', query):
|
| 294 |
return "data", {
|
| 295 |
'dataset_preference': 'water_chemicals',
|
| 296 |
+
'focus_columns': ['observedPropertyDeterminandLabel', 'resultMeanValue'],
|
| 297 |
'data_type': 'chemical'
|
| 298 |
}
|
| 299 |
|
| 300 |
+
# Water quality patterns - search in water_quality_scores
|
| 301 |
+
elif re.search(r'\b(quality score|water quality|overall|rivers|groundwater|open water)\b', query):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
return "data", {
|
| 303 |
'dataset_preference': 'water_quality_scores',
|
| 304 |
+
'focus_columns': ['Overall', 'Rivers', 'Open water bodies', 'Groundwater'],
|
| 305 |
'data_type': 'quality'
|
| 306 |
}
|
| 307 |
|
| 308 |
+
# Inequality/access patterns - search in drinking_water_inequality
|
| 309 |
+
elif re.search(r'\b(inequality|access|drinking water|poorest|richest|ratio)\b', query):
|
| 310 |
return "data", {
|
| 311 |
'dataset_preference': 'drinking_water_inequality',
|
| 312 |
+
'focus_columns': ['Poorest', 'Richest', 'Ratio'],
|
| 313 |
'data_type': 'social'
|
| 314 |
}
|
| 315 |
|
| 316 |
+
# Country-specific queries - search all datasets
|
| 317 |
+
elif re.search(r'\b(netherlands|usa|america|finland|bulgaria|albania|country|countries)\b', query):
|
| 318 |
+
return "hybrid", {
|
| 319 |
+
'search_both': True,
|
| 320 |
+
'country_query': True
|
| 321 |
+
}
|
| 322 |
+
|
| 323 |
+
# Statistics/data queries - hybrid search
|
| 324 |
elif re.search(r'\b(statistics|stats|numbers|data|metrics|measurements)\b', query):
|
| 325 |
return "hybrid", {'search_both': True}
|
| 326 |
|
|
|
|
| 340 |
# Default to hybrid search for complex queries
|
| 341 |
return "hybrid", {'search_both': True}
|
| 342 |
|
|
|
|
| 343 |
# Configure logging
|
| 344 |
logging.basicConfig(
|
| 345 |
level=logging.INFO,
|
test.ipynb
ADDED
|
File without changes
|