Rajan Sharma committed on
Commit
c1ff5e2
·
verified ·
1 Parent(s): edf0adb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +195 -177
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # app.py - Enhanced Healthcare Scenario Analysis System
2
  import os, re, json, traceback, pathlib
3
  from functools import lru_cache
4
  from typing import List, Dict, Any, Tuple, Optional
@@ -10,9 +10,11 @@ import torch
10
  import regex as re2
11
 
12
  # Import necessary modules
13
- from settings import SNAPSHOT_PATH, PERSIST_CONTENT
14
  from audit_log import log_event, hash_summary
15
  from privacy import redact_text, safety_filter, refusal_reply
 
 
16
 
17
  # ---------- Writable caches (HF Spaces-safe) ----------
18
  HOME = pathlib.Path.home()
@@ -48,26 +50,12 @@ except Exception:
48
  from transformers import AutoTokenizer, AutoModelForCausalLM
49
  from huggingface_hub import login
50
 
51
- # ---------- Healthcare-specific constants ----------
52
- HEALTHCARE_KEYWORDS = [
53
- "hospital", "patient", "bed", "care", "health", "medical", "clinical",
54
- "facility", "nursing", "residential", "ambulatory", "healthcare", "occupancy",
55
- "capacity", "staff", "zone", "province", "alberta", "cihi", "odhf",
56
- "respiratory", "virus", "flu", "surge", "acute", "long-term", "ltc"
57
- ]
58
-
59
- HEALTHCARE_FACILITY_TYPES = {
60
- "Hospitals": ["hospital", "medical center", "health centre"],
61
- "Nursing and residential care facilities": ["nursing", "residential", "care facility", "long-term care"],
62
- "Ambulatory health care services": ["ambulatory", "clinic", "surgery center", "outpatient"]
63
- }
64
-
65
  # ---------- Config ----------
66
  MODEL_ID = os.getenv("MODEL_ID", "microsoft/Phi-3-mini-4k-instruct")
67
  HF_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN") or os.getenv("HF_TOKEN")
68
  COHERE_API_KEY = os.getenv("COHERE_API_KEY")
69
  USE_HOSTED_COHERE = bool(COHERE_API_KEY and _HAS_COHERE)
70
- MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "2048"))
71
 
72
  # ---------- Generic System Prompt ----------
73
  SYSTEM_MASTER = """
@@ -85,53 +73,6 @@ Formatting rules for structured analysis:
85
  - End with concrete recommendations and a brief "Provenance" mapping outputs to scenario text, uploaded files, and answers.
86
  """.strip()
87
 
88
- # ---------- Data Registry Class ----------
89
- class DataRegistry:
90
- def __init__(self):
91
- self.data = {}
92
- self.file_metadata = {}
93
-
94
- def add_path(self, path):
95
- try:
96
- file_name = os.path.basename(path)
97
- if file_name.endswith('.csv'):
98
- df = pd.read_csv(path)
99
- self.data[file_name] = df
100
- self.file_metadata[file_name] = {
101
- 'type': 'csv',
102
- 'columns': list(df.columns),
103
- 'shape': df.shape,
104
- 'sample': df.head(3).to_dict('records')
105
- }
106
- return True
107
- except Exception as e:
108
- print(f"Error adding {path}: {e}")
109
- return False
110
-
111
- def names(self):
112
- return list(self.data.keys())
113
-
114
- def get(self, name):
115
- return self.data.get(name)
116
-
117
- def summarize_for_prompt(self):
118
- if not self.data:
119
- return "No data files registered."
120
-
121
- summary = []
122
- for name, meta in self.file_metadata.items():
123
- summary.append(f"File: {name}")
124
- summary.append(f"Type: {meta['type']}")
125
- summary.append(f"Columns: {', '.join(meta['columns'])}")
126
- summary.append(f"Shape: {meta['shape']}")
127
- summary.append("")
128
-
129
- return "\n".join(summary)
130
-
131
- def clear(self):
132
- self.data.clear()
133
- self.file_metadata.clear()
134
-
135
  # ---------- Session RAG Class (Simplified) ----------
136
  class SessionRAG:
137
  def __init__(self):
@@ -162,12 +103,13 @@ def is_healthcare_scenario(text: str, uploaded_files_paths) -> bool:
162
  t = (text or "").lower()
163
 
164
  # Check for healthcare keywords
165
- has_healthcare_keywords = any(keyword in t for keyword in HEALTHCARE_KEYWORDS)
166
 
167
  # Check for healthcare facility types
168
  has_facility_types = any(
169
- any(ftype in t for ftype in types)
170
- for types in HEALTHCARE_FACILITY_TYPES.values()
 
171
  )
172
 
173
  # Check for healthcare-specific tasks
@@ -196,25 +138,10 @@ def process_healthcare_data(uploaded_files_paths, data_registry):
196
  """Process healthcare data files with robust error handling."""
197
  for file_path in uploaded_files_paths:
198
  try:
199
- file_name = os.path.basename(file_path).lower()
200
-
201
- if file_name.endswith('.csv'):
202
- df = pd.read_csv(file_path)
203
-
204
- # Standardize column names
205
- df.columns = [col.strip().lower().replace(' ', '_').replace('-', '_') for col in df.columns]
206
-
207
- # Handle healthcare-specific data structures
208
- if 'facility_name' in df.columns:
209
- if 'facility_type' not in df.columns and 'odhf_facility_type' in df.columns:
210
- df['facility_type'] = df['odhf_facility_type']
211
-
212
- if 'beds_current' in df.columns and 'beds_prev' in df.columns:
213
- df['bed_change'] = df['beds_current'] - df['beds_prev']
214
- df['percent_change'] = (df['bed_change'] / df['beds_prev']) * 100
215
-
216
- data_registry.add_path(file_path)
217
-
218
  except Exception as e:
219
  print(f"Error processing {file_path}: {e}")
220
  log_event("data_processing_error", None, {
@@ -223,27 +150,38 @@ def process_healthcare_data(uploaded_files_paths, data_registry):
223
  })
224
 
225
  def analyze_facility_distribution(facilities_df):
226
- """Analyze healthcare facility distribution by type and location."""
227
  try:
228
  # Filter to Alberta if province column exists
229
- if 'province' in facilities_df.columns:
230
- ab_facilities = facilities_df[facilities_df['province'] == 'ab']
 
 
 
231
  else:
232
- ab_facilities = facilities_df
 
 
 
 
 
 
233
 
234
  # Facility type frequency
235
- type_counts = ab_facilities['facility_type'].value_counts().to_dict()
236
 
237
  # Top cities by facility count
238
- if 'city' in ab_facilities.columns:
239
- city_counts = ab_facilities['city'].value_counts().head(5)
 
 
240
  top_cities = city_counts.index.tolist()
241
 
242
  # Breakdown by facility type for top cities
243
  city_breakdown = {}
244
  for city in top_cities:
245
- city_data = ab_facilities[ab_facilities['city'] == city]
246
- city_breakdown[city] = city_data['facility_type'].value_counts().to_dict()
247
  else:
248
  top_cities = []
249
  city_breakdown = {}
@@ -252,35 +190,74 @@ def analyze_facility_distribution(facilities_df):
252
  "total_facilities": len(ab_facilities),
253
  "type_distribution": type_counts,
254
  "top_cities": top_cities,
255
- "city_breakdown": city_breakdown
 
 
 
 
 
256
  }
257
  except Exception as e:
258
  log_event("facility_analysis_error", None, {"error": str(e)})
259
  return {"error": str(e)}
260
 
261
  def analyze_bed_capacity(beds_df):
262
- """Analyze bed capacity by zone and identify trends."""
263
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  # Filter to Alberta if province column exists
265
- if 'province' in beds_df.columns:
266
- ab_beds = beds_df[beds_df['province'] == 'alberta']
 
 
 
267
  else:
268
- ab_beds = beds_df
269
 
270
- # Calculate zone-level summaries
271
- if 'zone' in ab_beds.columns:
272
- zone_summary = ab_beds.groupby('zone').agg({
273
- 'beds_current': 'sum',
274
- 'beds_prev': 'sum',
 
 
275
  'bed_change': 'sum'
276
  }).reset_index()
277
 
278
- # Calculate percentage change
279
- zone_summary['percent_change'] = (zone_summary['bed_change'] / zone_summary['beds_prev']) * 100
 
 
280
 
281
  # Find zones with largest changes
282
- max_abs_decrease = zone_summary.loc[zone_summary['bed_change'].idxmin()]
283
- max_pct_decrease = zone_summary.loc[zone_summary['percent_change'].idxmin()]
 
 
 
 
 
 
 
284
 
285
  # Identify facilities with largest declines
286
  facilities_decline = ab_beds.sort_values('bed_change').head(5)
@@ -291,39 +268,62 @@ def analyze_bed_capacity(beds_df):
291
  facilities_decline = pd.DataFrame()
292
 
293
  return {
294
- "zone_summary": zone_summary.to_dict('records'),
295
- "max_absolute_decrease": max_abs_decrease.to_dict(),
296
- "max_percentage_decrease": max_pct_decrease.to_dict(),
297
- "facilities_with_largest_declines": facilities_decline.to_dict('records')
 
 
 
 
 
 
298
  }
299
  except Exception as e:
300
  log_event("bed_analysis_error", None, {"error": str(e)})
301
  return {"error": str(e)}
302
 
303
  def assess_long_term_capacity(facilities_df, beds_df, zone_name):
304
- """Assess long-term care capacity in a specific zone."""
305
  try:
 
 
 
 
 
 
 
 
 
 
306
  # Get facilities in the specified zone
307
- if 'zone' in facilities_df.columns:
308
- zone_facilities = facilities_df[facilities_df['zone'] == zone_name]
 
309
  else:
310
  # If zone column not available, use province
311
- zone_facilities = facilities_df[facilities_df['province'] == 'ab']
 
 
 
 
 
 
312
 
313
  # Find major city in zone
314
- if 'city' in zone_facilities.columns:
315
- city_counts = zone_facilities['city'].value_counts()
316
- major_city = city_counts.index[0] if len(city_counts) > 0 else None
317
-
318
- if major_city:
319
- city_facilities = zone_facilities[zone_facilities['city'] == major_city]
320
 
321
  # Count facility types
322
- facility_counts = city_facilities['facility_type'].value_counts().to_dict()
323
 
324
  # Calculate ratio of nursing/residential to hospitals
325
- hospitals = facility_counts.get('Hospitals', 0)
326
- nursing = facility_counts.get('Nursing and residential care facilities', 0)
327
  ratio = nursing / hospitals if hospitals > 0 else 0
328
 
329
  # Assess capacity
@@ -334,7 +334,12 @@ def assess_long_term_capacity(facilities_df, beds_df, zone_name):
334
  "major_city": major_city,
335
  "facility_counts": facility_counts,
336
  "nursing_to_hospital_ratio": ratio,
337
- "capacity_assessment": capacity_assessment
 
 
 
 
 
338
  }
339
 
340
  return {"error": "Could not determine major city or facility counts"}
@@ -352,22 +357,24 @@ def generate_operational_recommendations(analysis_results):
352
  if 'max_percentage_decrease' in bed_data and isinstance(bed_data['max_percentage_decrease'], dict):
353
  zone = bed_data['max_percentage_decrease'].get('zone', '')
354
  decrease = bed_data['max_percentage_decrease'].get('percent_change', 0)
355
- recommendations.append({
356
- "title": f"Restore staffed beds in {zone} Zone",
357
- "description": f"Priority should be given to reopening closed units and hiring staff to address the {decrease:.1f}% decrease in bed capacity.",
358
- "data_source": "Bed capacity analysis"
359
- })
 
360
 
361
  # Recommendation 2: Expand long-term care capacity
362
  if 'long_term_care' in analysis_results:
363
  ltc_data = analysis_results['long_term_care']
364
  if ltc_data.get('capacity_assessment') == 'insufficient':
365
  city = ltc_data.get('major_city', '')
366
- recommendations.append({
367
- "title": f"Expand long-term care capacity in {city}",
368
- "description": f"Invest in new long-term care beds or repurpose existing sites to expedite discharge of stabilized patients.",
369
- "data_source": "Long-term care capacity assessment"
370
- })
 
371
 
372
  # Recommendation 3: Implement surge plans
373
  if 'bed_capacity' in analysis_results:
@@ -400,7 +407,7 @@ def format_healthcare_analysis_response(scenario_text, results, recommendations,
400
  response += f"Error in facility distribution analysis: {fd['error']}\n\n"
401
  else:
402
  response += "## 1. Data Preparation\n\n"
403
- response += f"Total healthcare facilities in Alberta: {fd.get('total_facilities', 'N/A')}\n\n"
404
 
405
  if 'type_distribution' in fd:
406
  response += "### Facility Type Distribution\n\n"
@@ -432,13 +439,13 @@ def format_healthcare_analysis_response(scenario_text, results, recommendations,
432
 
433
  if 'zone_summary' in bc and bc['zone_summary']:
434
  response += "### Bed Capacity by Zone\n\n"
435
- response += "| Zone | Beds (2023-24) | Beds (2022-23) | Absolute Change | Percent Change |\n"
436
- response += "|------|---------------|---------------|-----------------|----------------|\n"
437
 
438
  for zone_data in bc['zone_summary']:
439
- zone = zone_data.get('zone', 'N/A')
440
- current = zone_data.get('beds_current', 'N/A')
441
- prev = zone_data.get('beds_prev', 'N/A')
442
  change = zone_data.get('bed_change', 'N/A')
443
  pct = zone_data.get('percent_change', 'N/A')
444
  response += f"| {zone} | {current} | {prev} | {change} | {pct:.1f}% |\n"
@@ -448,8 +455,8 @@ def format_healthcare_analysis_response(scenario_text, results, recommendations,
448
  'max_percentage_decrease' in bc and isinstance(bc['max_percentage_decrease'], dict):
449
  abs_dec = bc['max_absolute_decrease']
450
  pct_dec = bc['max_percentage_decrease']
451
- response += f"**Zone with largest absolute decrease**: {abs_dec.get('zone', 'N/A')} ({abs_dec.get('bed_change', 'N/A')} beds)\n\n"
452
- response += f"**Zone with largest percentage decrease**: {pct_dec.get('zone', 'N/A')} ({pct_dec.get('percent_change', 'N/A'):.1f}%)\n\n"
453
 
454
  if 'facilities_with_largest_declines' in bc and bc['facilities_with_largest_declines']:
455
  response += "### Facilities with Largest Bed Declines\n\n"
@@ -458,7 +465,7 @@ def format_healthcare_analysis_response(scenario_text, results, recommendations,
458
 
459
  for facility in bc['facilities_with_largest_declines']:
460
  name = facility.get('facility_name', 'N/A')
461
- zone = facility.get('zone', 'N/A')
462
  teaching = facility.get('teaching_status', 'N/A')
463
  change = facility.get('bed_change', 'N/A')
464
  response += f"| {name} | {zone} | {teaching} | {change} |\n"
@@ -511,61 +518,63 @@ def format_healthcare_analysis_response(scenario_text, results, recommendations,
511
  response += "## Provenance\n\n"
512
  response += "This analysis is based on:\n"
513
  response += "- Scenario description provided by the user\n"
514
- response += "- Uploaded data files: all_health_facilities.csv and clean_beds_data.csv\n"
515
  response += "- Calculations performed on the provided data\n"
516
 
517
  return response
518
 
519
  def handle_healthcare_scenario(scenario_text, data_registry, history):
520
- """Handle healthcare-specific scenario analysis."""
521
  try:
522
- # Initialize analysis results
523
  results = {}
524
 
525
- # Task 1: Data preparation
 
 
 
 
526
  facilities_df = None
527
- beds_df = None
 
528
 
529
- # Find the relevant data files
530
- for file_name in data_registry.names():
531
- df = data_registry.get(file_name)
532
- if df is not None:
533
- if 'facility' in file_name.lower() or 'health' in file_name.lower():
534
- facilities_df = df
535
- elif 'bed' in file_name.lower():
536
- beds_df = df
537
 
538
  # Log what we found
539
  log_event("data_files_found", None, {
540
  "facilities": facilities_df is not None,
541
  "beds": beds_df is not None,
542
- "files": data_registry.names()
 
543
  })
544
 
 
545
  if facilities_df is not None:
546
  results['facility_distribution'] = analyze_facility_distribution(facilities_df)
547
 
548
- # Task 2: Bed capacity analysis
549
  if beds_df is not None:
550
  results['bed_capacity'] = analyze_bed_capacity(beds_df)
551
 
552
- # Task 3: Long-term care capacity assessment
553
  if 'bed_capacity' in results and 'max_percentage_decrease' in results['bed_capacity']:
554
- worst_zone = results['bed_capacity']['max_percentage_decrease'].get('zone', '')
555
- if worst_zone and facilities_df is not None:
556
- results['long_term_care'] = assess_long_term_capacity(
557
- facilities_df,
558
- beds_df,
559
- worst_zone
560
- )
 
 
561
 
562
- # Generate operational recommendations
563
  recommendations = generate_operational_recommendations(results)
564
 
565
- # Generate future AI integration discussion
566
  ai_integration = generate_ai_integration_discussion(results)
567
 
568
- # Compile final response
569
  response = format_healthcare_analysis_response(scenario_text, results, recommendations, ai_integration)
570
 
571
  return response
@@ -650,7 +659,7 @@ def cohere_chat(message, history):
650
  resp = client.chat(
651
  model="command-r7b-12-2024",
652
  message=prompt,
653
- temperature=0.3,
654
  max_tokens=MAX_NEW_TOKENS,
655
  )
656
  if hasattr(resp, "text") and resp.text: return resp.text.strip()
@@ -675,8 +684,9 @@ def local_generate(model, tokenizer, input_ids, max_new_tokens=MAX_NEW_TOKENS):
675
  with torch.no_grad():
676
  out = model.generate(
677
  input_ids=input_ids, max_new_tokens=max_new_tokens,
678
- do_sample=True, temperature=0.3, top_p=0.9,
679
- repetition_penalty=1.15,
 
680
  pad_token_id=tokenizer.eos_token_id,
681
  eos_token_id=tokenizer.eos_token_id,
682
  )
@@ -705,6 +715,13 @@ def clarityops_reply(user_msg, history, tz, uploaded_files_paths, awaiting_answe
705
  if uploaded_files_paths:
706
  process_healthcare_data(uploaded_files_paths, data_registry)
707
 
 
 
 
 
 
 
 
708
  # Update session RAG with CSV columns
709
  for file_name in data_registry.names():
710
  if file_name.endswith('.csv'):
@@ -770,7 +787,7 @@ with gr.Blocks(theme=theme, css=custom_css, analytics_enabled=False) as demo:
770
  elem_classes="hero-box"
771
  )
772
  hero_send = gr.Button("➤", scale=0, elem_id="hero-send")
773
- gr.Markdown('<div class="hint">Upload healthcare data files (CSV, PDF, etc.) and describe your scenario for comprehensive analysis.</div>')
774
 
775
  # --- MAIN APP (hidden until first message) ---
776
  with gr.Column(elem_id="chat-container", visible=False) as app_wrap:
@@ -778,7 +795,8 @@ with gr.Blocks(theme=theme, css=custom_css, analytics_enabled=False) as demo:
778
  with gr.Row():
779
  uploads = gr.Files(
780
  label="Upload healthcare data files",
781
- file_types=["file"], file_count="multiple", height=68
 
782
  )
783
  with gr.Row(elem_id="chat-input-row"):
784
  msg = gr.Textbox(
 
1
+ # app.py - Complete Dynamic Healthcare Scenario Analysis System
2
  import os, re, json, traceback, pathlib
3
  from functools import lru_cache
4
  from typing import List, Dict, Any, Tuple, Optional
 
10
  import regex as re2
11
 
12
  # Import necessary modules
13
+ from settings import SNAPSHOT_PATH, PERSIST_CONTENT, HEALTHCARE_SETTINGS, MODEL_SETTINGS
14
  from audit_log import log_event, hash_summary
15
  from privacy import redact_text, safety_filter, refusal_reply
16
+ from data_registry import DataRegistry
17
+ from upload_ingest import extract_text_from_files
18
 
19
  # ---------- Writable caches (HF Spaces-safe) ----------
20
  HOME = pathlib.Path.home()
 
50
  from transformers import AutoTokenizer, AutoModelForCausalLM
51
  from huggingface_hub import login
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  # ---------- Config ----------
54
  MODEL_ID = os.getenv("MODEL_ID", "microsoft/Phi-3-mini-4k-instruct")
55
  HF_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN") or os.getenv("HF_TOKEN")
56
  COHERE_API_KEY = os.getenv("COHERE_API_KEY")
57
  USE_HOSTED_COHERE = bool(COHERE_API_KEY and _HAS_COHERE)
58
+ MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", MODEL_SETTINGS.get("max_new_tokens", 2048)))
59
 
60
  # ---------- Generic System Prompt ----------
61
  SYSTEM_MASTER = """
 
73
  - End with concrete recommendations and a brief "Provenance" mapping outputs to scenario text, uploaded files, and answers.
74
  """.strip()
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  # ---------- Session RAG Class (Simplified) ----------
77
  class SessionRAG:
78
  def __init__(self):
 
103
  t = (text or "").lower()
104
 
105
  # Check for healthcare keywords
106
+ has_healthcare_keywords = any(keyword in t for keyword in HEALTHCARE_SETTINGS["healthcare_keywords"])
107
 
108
  # Check for healthcare facility types
109
# BUG FIX: the previous code wrapped a single boolean expression in any(),
# i.e. any(any(...) or any(...) or any(...)). any() over a bool raises
# TypeError ("'bool' object is not iterable"). Combine the checks with `or`.
has_facility_types = (
    any(ftype in t for ftype in ["hospital", "medical center", "health centre"]) or
    any(ftype in t for ftype in ["nursing", "residential", "care facility", "long-term care"]) or
    any(ftype in t for ftype in ["ambulatory", "clinic", "surgery center", "outpatient"])
)
114
 
115
  # Check for healthcare-specific tasks
 
138
  """Process healthcare data files with robust error handling."""
139
  for file_path in uploaded_files_paths:
140
  try:
141
+ if data_registry.add_path(file_path):
142
+ print(f"Successfully processed: {file_path}")
143
+ else:
144
+ print(f"Failed to process: {file_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  except Exception as e:
146
  print(f"Error processing {file_path}: {e}")
147
  log_event("data_processing_error", None, {
 
150
  })
151
 
152
  def analyze_facility_distribution(facilities_df):
153
+ """Analyze healthcare facility distribution dynamically."""
154
  try:
155
  # Filter to Alberta if province column exists
156
+ province_col = facilities_df.columns[facilities_df.columns.str.contains('province', case=False)]
157
+ if len(province_col) > 0:
158
+ province_col = province_col[0]
159
+ alberta_mask = facilities_df[province_col].str.lower().isin(['alberta', 'ab'])
160
+ ab_facilities = facilities_df[alberta_mask].copy()
161
  else:
162
+ ab_facilities = facilities_df.copy()
163
+
164
+ # Find facility type column
165
+ type_col = facilities_df.columns[facilities_df.columns.str.contains('type', case=False)]
166
+ if len(type_col) == 0:
167
+ return {"error": "Facility type column not found"}
168
+ type_col = type_col[0]
169
 
170
  # Facility type frequency
171
+ type_counts = ab_facilities[type_col].value_counts().to_dict()
172
 
173
  # Top cities by facility count
174
+ city_col = facilities_df.columns[facilities_df.columns.str.contains('city', case=False)]
175
+ if len(city_col) > 0:
176
+ city_col = city_col[0]
177
+ city_counts = ab_facilities[city_col].value_counts().head(5)
178
  top_cities = city_counts.index.tolist()
179
 
180
  # Breakdown by facility type for top cities
181
  city_breakdown = {}
182
  for city in top_cities:
183
+ city_data = ab_facilities[ab_facilities[city_col] == city]
184
+ city_breakdown[city] = city_data[type_col].value_counts().to_dict()
185
  else:
186
  top_cities = []
187
  city_breakdown = {}
 
190
  "total_facilities": len(ab_facilities),
191
  "type_distribution": type_counts,
192
  "top_cities": top_cities,
193
+ "city_breakdown": city_breakdown,
194
+ "columns_used": {
195
+ "facility_type": type_col,
196
+ "city": city_col[0] if len(city_col) > 0 else None,
197
+ "province": province_col[0] if len(province_col) > 0 else None
198
+ }
199
  }
200
  except Exception as e:
201
  log_event("facility_analysis_error", None, {"error": str(e)})
202
  return {"error": str(e)}
203
 
204
def analyze_bed_capacity(beds_df):
    """Analyze staffed-bed capacity changes, optionally grouped by zone.

    Discovers the current/previous bed-count columns by name pattern,
    derives absolute and percentage change (guarding against division by
    zero), filters to Alberta when a province column exists, and produces
    zone-level summaries when a zone/region/area column exists.

    Args:
        beds_df: DataFrame of per-facility bed counts. NOTE: mutated in
            place to add 'bed_change' / 'percent_change' when missing
            (same behavior as before).

    Returns:
        dict with zone summary records, zones with the largest absolute and
        percentage declines, the five facilities with the largest declines,
        and the concrete column names used — or ``{"error": ...}`` on failure.
    """
    try:
        # Locate the "current period" and "previous period" bed columns.
        current_cols = beds_df.columns[beds_df.columns.str.contains('current|2023|2024', case=False)]
        prev_cols = beds_df.columns[beds_df.columns.str.contains('prev|2022|previous', case=False)]

        if len(current_cols) == 0 or len(prev_cols) == 0:
            return {"error": f"Missing required columns. Found current: {current_cols.tolist()}, prev: {prev_cols.tolist()}"}

        current_col = current_cols[0]
        prev_col = prev_cols[0]

        # Derive change columns once so later code can rely on them.
        if 'bed_change' not in beds_df.columns:
            beds_df['bed_change'] = beds_df[current_col] - beds_df[prev_col]

        if 'percent_change' not in beds_df.columns:
            beds_df['percent_change'] = beds_df.apply(
                lambda row: (row['bed_change'] / row[prev_col] * 100) if row[prev_col] != 0 else 0,
                axis=1
            )

        # Filter to Alberta if a province column exists.
        province_matches = beds_df.columns[beds_df.columns.str.contains('province', case=False)]
        province_col = province_matches[0] if len(province_matches) > 0 else None
        if province_col is not None:
            alberta_mask = beds_df[province_col].str.lower().isin(['alberta', 'ab'])
            ab_beds = beds_df[alberta_mask].copy()
        else:
            ab_beds = beds_df.copy()

        # BUG FIX: these were previously assigned only inside the zone
        # branch, causing a NameError at the return statement whenever no
        # zone-like column existed. Initialize safe defaults up front.
        zone_summary = pd.DataFrame()
        max_abs_decrease = {}
        max_pct_decrease = {}

        # Zone-level summaries when a zone/region/area column exists.
        zone_matches = beds_df.columns[beds_df.columns.str.contains('zone|region|area', case=False)]
        zone_col = zone_matches[0] if len(zone_matches) > 0 else None
        if zone_col is not None:
            zone_summary = ab_beds.groupby(zone_col).agg({
                current_col: 'sum',
                prev_col: 'sum',
                'bed_change': 'sum'
            }).reset_index()

            zone_summary['percent_change'] = zone_summary.apply(
                lambda row: (row['bed_change'] / row[prev_col] * 100) if row[prev_col] != 0 else 0,
                axis=1
            )

            # Zones with the largest absolute / percentage declines.
            if len(zone_summary) > 0:
                max_abs_decrease = zone_summary.loc[zone_summary['bed_change'].idxmin()]
                max_pct_decrease = zone_summary.loc[zone_summary['percent_change'].idxmin()]

        # Facilities with the largest declines ('bed_change' is guaranteed above).
        facilities_decline = ab_beds.sort_values('bed_change').head(5)

        return {
            "zone_summary": zone_summary.to_dict('records') if not zone_summary.empty else [],
            "max_absolute_decrease": max_abs_decrease.to_dict() if isinstance(max_abs_decrease, pd.Series) else max_abs_decrease,
            "max_percentage_decrease": max_pct_decrease.to_dict() if isinstance(max_pct_decrease, pd.Series) else max_pct_decrease,
            "facilities_with_largest_declines": facilities_decline.to_dict('records') if not facilities_decline.empty else [],
            # BUG FIX: the previous code rebound zone_col/province_col to the
            # selected column-name *string* and then indexed [0] here,
            # returning the first character of the name instead of the name.
            "columns_used": {
                "beds_current": current_col,
                "beds_prev": prev_col,
                "zone": zone_col,
                "province": province_col
            }
        }
    except Exception as e:
        log_event("bed_analysis_error", None, {"error": str(e)})
        return {"error": str(e)}
285
 
286
  def assess_long_term_capacity(facilities_df, beds_df, zone_name):
287
+ """Assess long-term care capacity dynamically."""
288
  try:
289
+ # Find relevant columns
290
+ zone_col = facilities_df.columns[facilities_df.columns.str.contains('zone|region|area', case=False)]
291
+ city_col = facilities_df.columns[facilities_df.columns.str.contains('city|municipality|town', case=False)]
292
+ type_col = facilities_df.columns[facilities_df.columns.str.contains('type|category|class', case=False)]
293
+
294
+ if len(type_col) == 0:
295
+ return {"error": "Facility type column not found"}
296
+
297
+ type_col = type_col[0]
298
+
299
  # Get facilities in the specified zone
300
+ if len(zone_col) > 0:
301
+ zone_col = zone_col[0]
302
+ zone_facilities = facilities_df[facilities_df[zone_col] == zone_name].copy()
303
  else:
304
  # If zone column not available, use province
305
+ province_col = facilities_df.columns[facilities_df.columns.str.contains('province', case=False)]
306
+ if len(province_col) > 0:
307
+ province_col = province_col[0]
308
+ alberta_mask = facilities_df[province_col].str.lower().isin(['alberta', 'ab'])
309
+ zone_facilities = facilities_df[alberta_mask].copy()
310
+ else:
311
+ zone_facilities = facilities_df.copy()
312
 
313
  # Find major city in zone
314
+ if len(city_col) > 0:
315
+ city_col = city_col[0]
316
+ city_counts = zone_facilities[city_col].value_counts()
317
+ if len(city_counts) > 0:
318
+ major_city = city_counts.index[0]
319
+ city_facilities = zone_facilities[zone_facilities[city_col] == major_city]
320
 
321
  # Count facility types
322
+ facility_counts = city_facilities[type_col].value_counts().to_dict()
323
 
324
  # Calculate ratio of nursing/residential to hospitals
325
+ hospitals = sum(count for key, count in facility_counts.items() if 'hospital' in key.lower())
326
+ nursing = sum(count for key, count in facility_counts.items() if any(word in key.lower() for word in ['nursing', 'residential', 'care']))
327
  ratio = nursing / hospitals if hospitals > 0 else 0
328
 
329
  # Assess capacity
 
334
  "major_city": major_city,
335
  "facility_counts": facility_counts,
336
  "nursing_to_hospital_ratio": ratio,
337
+ "capacity_assessment": capacity_assessment,
338
+ "columns_used": {
339
+ "zone": zone_col,
340
+ "city": city_col,
341
+ "facility_type": type_col
342
+ }
343
  }
344
 
345
  return {"error": "Could not determine major city or facility counts"}
 
357
  if 'max_percentage_decrease' in bed_data and isinstance(bed_data['max_percentage_decrease'], dict):
358
  zone = bed_data['max_percentage_decrease'].get('zone', '')
359
  decrease = bed_data['max_percentage_decrease'].get('percent_change', 0)
360
+ if zone and decrease:
361
+ recommendations.append({
362
+ "title": f"Restore staffed beds in {zone} Zone",
363
+ "description": f"Priority should be given to reopening closed units and hiring staff to address the {decrease:.1f}% decrease in bed capacity.",
364
+ "data_source": "Bed capacity analysis"
365
+ })
366
 
367
  # Recommendation 2: Expand long-term care capacity
368
  if 'long_term_care' in analysis_results:
369
  ltc_data = analysis_results['long_term_care']
370
  if ltc_data.get('capacity_assessment') == 'insufficient':
371
  city = ltc_data.get('major_city', '')
372
+ if city:
373
+ recommendations.append({
374
+ "title": f"Expand long-term care capacity in {city}",
375
+ "description": f"Invest in new long-term care beds or repurpose existing sites to expedite discharge of stabilized patients.",
376
+ "data_source": "Long-term care capacity assessment"
377
+ })
378
 
379
  # Recommendation 3: Implement surge plans
380
  if 'bed_capacity' in analysis_results:
 
407
  response += f"Error in facility distribution analysis: {fd['error']}\n\n"
408
  else:
409
  response += "## 1. Data Preparation\n\n"
410
+ response += f"Total healthcare facilities: {fd.get('total_facilities', 'N/A')}\n\n"
411
 
412
  if 'type_distribution' in fd:
413
  response += "### Facility Type Distribution\n\n"
 
439
 
440
  if 'zone_summary' in bc and bc['zone_summary']:
441
  response += "### Bed Capacity by Zone\n\n"
442
+ response += "| Zone | Beds (Current) | Beds (Previous) | Absolute Change | Percent Change |\n"
443
+ response += "|------|---------------|-----------------|-----------------|----------------|\n"
444
 
445
  for zone_data in bc['zone_summary']:
446
+ zone = zone_data.get(bc['columns_used']['zone'], 'N/A') if bc['columns_used'].get('zone') else 'N/A'
447
+ current = zone_data.get(bc['columns_used']['beds_current'], 'N/A')
448
+ prev = zone_data.get(bc['columns_used']['beds_prev'], 'N/A')
449
  change = zone_data.get('bed_change', 'N/A')
450
  pct = zone_data.get('percent_change', 'N/A')
451
  response += f"| {zone} | {current} | {prev} | {change} | {pct:.1f}% |\n"
 
455
  'max_percentage_decrease' in bc and isinstance(bc['max_percentage_decrease'], dict):
456
  abs_dec = bc['max_absolute_decrease']
457
  pct_dec = bc['max_percentage_decrease']
458
+ response += f"**Zone with largest absolute decrease**: {abs_dec.get(bc['columns_used']['zone'], 'N/A')} ({abs_dec.get('bed_change', 'N/A')} beds)\n\n"
459
+ response += f"**Zone with largest percentage decrease**: {pct_dec.get(bc['columns_used']['zone'], 'N/A')} ({pct_dec.get('percent_change', 'N/A'):.1f}%)\n\n"
460
 
461
  if 'facilities_with_largest_declines' in bc and bc['facilities_with_largest_declines']:
462
  response += "### Facilities with Largest Bed Declines\n\n"
 
465
 
466
  for facility in bc['facilities_with_largest_declines']:
467
  name = facility.get('facility_name', 'N/A')
468
+ zone = facility.get(bc['columns_used']['zone'], 'N/A') if bc['columns_used'].get('zone') else 'N/A'
469
  teaching = facility.get('teaching_status', 'N/A')
470
  change = facility.get('bed_change', 'N/A')
471
  response += f"| {name} | {zone} | {teaching} | {change} |\n"
 
518
  response += "## Provenance\n\n"
519
  response += "This analysis is based on:\n"
520
  response += "- Scenario description provided by the user\n"
521
+ response += "- Uploaded data files\n"
522
  response += "- Calculations performed on the provided data\n"
523
 
524
  return response
525
 
526
  def handle_healthcare_scenario(scenario_text, data_registry, history):
527
+ """Handle healthcare scenarios dynamically."""
528
  try:
 
529
  results = {}
530
 
531
+ # Dynamically identify relevant files
532
+ facility_files = data_registry.get_data_by_type('facility_data')
533
+ bed_files = data_registry.get_data_by_type('bed_data')
534
+
535
+ # Use the first file of each type (can be enhanced to use multiple)
536
  facilities_df = None
537
+ if facility_files:
538
+ facilities_df = data_registry.get(facility_files[0])
539
 
540
+ beds_df = None
541
+ if bed_files:
542
+ beds_df = data_registry.get(bed_files[0])
 
 
 
 
 
543
 
544
  # Log what we found
545
  log_event("data_files_found", None, {
546
  "facilities": facilities_df is not None,
547
  "beds": beds_df is not None,
548
+ "facility_files": facility_files,
549
+ "bed_files": bed_files
550
  })
551
 
552
+ # Perform analyses based on available data
553
  if facilities_df is not None:
554
  results['facility_distribution'] = analyze_facility_distribution(facilities_df)
555
 
 
556
  if beds_df is not None:
557
  results['bed_capacity'] = analyze_bed_capacity(beds_df)
558
 
559
+ # Long-term care assessment if we have both data types
560
  if 'bed_capacity' in results and 'max_percentage_decrease' in results['bed_capacity']:
561
+ zone_col = results['bed_capacity'].get('columns_used', {}).get('zone')
562
+ if zone_col:
563
+ worst_zone = results['bed_capacity']['max_percentage_decrease'].get(zone_col, '')
564
+ if worst_zone and facilities_df is not None:
565
+ results['long_term_care'] = assess_long_term_capacity(
566
+ facilities_df,
567
+ beds_df,
568
+ worst_zone
569
+ )
570
 
571
+ # Generate recommendations
572
  recommendations = generate_operational_recommendations(results)
573
 
574
+ # Generate AI integration discussion
575
  ai_integration = generate_ai_integration_discussion(results)
576
 
577
+ # Format response
578
  response = format_healthcare_analysis_response(scenario_text, results, recommendations, ai_integration)
579
 
580
  return response
 
659
  resp = client.chat(
660
  model="command-r7b-12-2024",
661
  message=prompt,
662
+ temperature=MODEL_SETTINGS.get("temperature", 0.3),
663
  max_tokens=MAX_NEW_TOKENS,
664
  )
665
  if hasattr(resp, "text") and resp.text: return resp.text.strip()
 
684
  with torch.no_grad():
685
  out = model.generate(
686
  input_ids=input_ids, max_new_tokens=max_new_tokens,
687
+ do_sample=True, temperature=MODEL_SETTINGS.get("temperature", 0.3),
688
+ top_p=MODEL_SETTINGS.get("top_p", 0.9),
689
+ repetition_penalty=MODEL_SETTINGS.get("repetition_penalty", 1.15),
690
  pad_token_id=tokenizer.eos_token_id,
691
  eos_token_id=tokenizer.eos_token_id,
692
  )
 
715
  if uploaded_files_paths:
716
  process_healthcare_data(uploaded_files_paths, data_registry)
717
 
718
+ # Also extract text for RAG
719
+ ing = extract_text_from_files(uploaded_files_paths)
720
+ if ing.get("chunks"):
721
+ session_rag.add_docs(ing["chunks"])
722
+ if ing.get("artifacts"):
723
+ session_rag.register_artifacts(ing["artifacts"])
724
+
725
  # Update session RAG with CSV columns
726
  for file_name in data_registry.names():
727
  if file_name.endswith('.csv'):
 
787
  elem_classes="hero-box"
788
  )
789
  hero_send = gr.Button("➤", scale=0, elem_id="hero-send")
790
+ gr.Markdown('<div class="hint">Upload healthcare data files (CSV, Excel, JSON, PDF, etc.) and describe your scenario for comprehensive analysis.</div>')
791
 
792
  # --- MAIN APP (hidden until first message) ---
793
  with gr.Column(elem_id="chat-container", visible=False) as app_wrap:
 
795
  with gr.Row():
796
  uploads = gr.Files(
797
  label="Upload healthcare data files",
798
+ file_types=HEALTHCARE_SETTINGS["supported_file_types"],
799
+ file_count="multiple", height=68
800
  )
801
  with gr.Row(elem_id="chat-input-row"):
802
  msg = gr.Textbox(