Rajan Sharma committed on
Commit
5cd1b74
·
verified ·
1 Parent(s): 4073913

Update upload_ingest.py

Browse files
Files changed (1) hide show
  1. upload_ingest.py +160 -30
upload_ingest.py CHANGED
@@ -1,10 +1,14 @@
1
  # upload_ingest.py
2
  import pandas as pd
3
  import os
 
4
  from typing import Dict, List, Any
 
 
 
5
 
6
  def extract_text_from_files(file_paths: List[str]) -> Dict[str, Any]:
7
- """Extract text and data from uploaded files with healthcare-specific handling."""
8
  result = {
9
  "chunks": [],
10
  "artifacts": [],
@@ -14,56 +18,182 @@ def extract_text_from_files(file_paths: List[str]) -> Dict[str, Any]:
14
  for file_path in file_paths:
15
  try:
16
  file_name = os.path.basename(file_path)
 
17
 
18
- if file_name.endswith('.csv'):
19
- # Handle CSV files with healthcare data
20
  df = pd.read_csv(file_path)
 
21
 
22
- # Extract basic info
23
- result["chunks"].append(f"File: {file_name}")
24
- result["chunks"].append(f"Shape: {df.shape}")
25
- result["chunks"].append(f"Columns: {', '.join(df.columns)}")
26
 
27
- # Healthcare-specific processing
28
- healthcare_info = {}
 
 
 
 
 
 
 
29
 
30
- # Check for facility data
31
- if any(col in df.columns for col in ['facility_name', 'facility_type']):
32
- healthcare_info['type'] = 'facility_data'
33
- if 'facility_type' in df.columns:
34
- healthcare_info['facility_types'] = df['facility_type'].value_counts().to_dict()
35
 
36
- # Check for bed data
37
- if any(col in df.columns for col in ['beds_current', 'beds_prev']):
38
- healthcare_info['type'] = 'bed_data'
39
- if 'zone' in df.columns:
40
- healthcare_info['zones'] = df['zone'].unique().tolist()
41
-
42
- # Calculate changes if both columns exist
43
- if 'beds_current' in df.columns and 'beds_prev' in df.columns:
44
- df['bed_change'] = df['beds_current'] - df['beds_prev']
45
- healthcare_info['total_change'] = df['bed_change'].sum()
 
46
 
 
47
  if healthcare_info:
48
  result["healthcare_data"][file_name] = healthcare_info
49
 
50
- # Add sample data
51
  result["artifacts"].append({
52
  "file": file_name,
53
- "type": "csv",
54
  "sample": df.head(3).to_dict('records')
55
  })
56
 
57
- elif file_name.endswith(('.pdf', '.docx', '.txt')):
58
- # For text files, just note the file
59
- result["chunks"].append(f"Document: {file_name}")
 
 
 
 
 
60
  result["artifacts"].append({
61
  "file": file_name,
62
- "type": "document"
 
63
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
  except Exception as e:
66
  result["chunks"].append(f"Error processing {file_path}: {str(e)}")
67
 
68
  return result
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # upload_ingest.py
2
  import pandas as pd
3
  import os
4
+ import json
5
  from typing import Dict, List, Any
6
+ import PyPDF2
7
+ import docx
8
+ import csv
9
 
10
  def extract_text_from_files(file_paths: List[str]) -> Dict[str, Any]:
11
+ """Extract text and data from uploaded files dynamically."""
12
  result = {
13
  "chunks": [],
14
  "artifacts": [],
 
18
  for file_path in file_paths:
19
  try:
20
  file_name = os.path.basename(file_path)
21
+ file_ext = os.path.splitext(file_name)[1].lower()
22
 
23
+ if file_ext == '.csv':
 
24
  df = pd.read_csv(file_path)
25
+ result["chunks"].append(f"CSV file: {file_name}")
26
 
27
+ # Dynamic healthcare data detection
28
+ healthcare_info = detect_healthcare_data_type(df)
29
+ if healthcare_info:
30
+ result["healthcare_data"][file_name] = healthcare_info
31
 
32
+ result["artifacts"].append({
33
+ "file": file_name,
34
+ "type": "csv",
35
+ "sample": df.head(3).to_dict('records')
36
+ })
37
+
38
+ elif file_ext in ['.xlsx', '.xls']:
39
+ df = pd.read_excel(file_path)
40
+ result["chunks"].append(f"Excel file: {file_name}")
41
 
42
+ healthcare_info = detect_healthcare_data_type(df)
43
+ if healthcare_info:
44
+ result["healthcare_data"][file_name] = healthcare_info
 
 
45
 
46
+ result["artifacts"].append({
47
+ "file": file_name,
48
+ "type": "excel",
49
+ "sample": df.head(3).to_dict('records')
50
+ })
51
+
52
+ elif file_ext == '.json':
53
+ with open(file_path, 'r') as f:
54
+ data = json.load(f)
55
+ df = pd.json_normalize(data)
56
+ result["chunks"].append(f"JSON file: {file_name}")
57
 
58
+ healthcare_info = detect_healthcare_data_type(df)
59
  if healthcare_info:
60
  result["healthcare_data"][file_name] = healthcare_info
61
 
 
62
  result["artifacts"].append({
63
  "file": file_name,
64
+ "type": "json",
65
  "sample": df.head(3).to_dict('records')
66
  })
67
 
68
+ elif file_ext == '.parquet':
69
+ df = pd.read_parquet(file_path)
70
+ result["chunks"].append(f"Parquet file: {file_name}")
71
+
72
+ healthcare_info = detect_healthcare_data_type(df)
73
+ if healthcare_info:
74
+ result["healthcare_data"][file_name] = healthcare_info
75
+
76
  result["artifacts"].append({
77
  "file": file_name,
78
+ "type": "parquet",
79
+ "sample": df.head(3).to_dict('records')
80
  })
81
+
82
+ elif file_ext == '.pdf':
83
+ text = extract_text_from_pdf(file_path)
84
+ result["chunks"].append(f"PDF file: {file_name}")
85
+ result["chunks"].append(f"Extracted text preview: {text[:500]}...")
86
+
87
+ result["artifacts"].append({
88
+ "file": file_name,
89
+ "type": "pdf",
90
+ "text": text
91
+ })
92
+
93
+ elif file_ext == '.docx':
94
+ text = extract_text_from_docx(file_path)
95
+ result["chunks"].append(f"DOCX file: {file_name}")
96
+ result["chunks"].append(f"Extracted text preview: {text[:500]}...")
97
+
98
+ result["artifacts"].append({
99
+ "file": file_name,
100
+ "type": "docx",
101
+ "text": text
102
+ })
103
+
104
+ elif file_ext == '.txt':
105
+ with open(file_path, 'r', encoding='utf-8') as f:
106
+ text = f.read()
107
+ result["chunks"].append(f"Text file: {file_name}")
108
+ result["chunks"].append(f"Content preview: {text[:500]}...")
109
+
110
+ result["artifacts"].append({
111
+ "file": file_name,
112
+ "type": "txt",
113
+ "text": text
114
+ })
115
+
116
+ else:
117
+ result["chunks"].append(f"Unsupported file type: {file_ext}")
118
 
119
  except Exception as e:
120
  result["chunks"].append(f"Error processing {file_path}: {str(e)}")
121
 
122
  return result
123
 
124
def extract_text_from_pdf(file_path: str) -> str:
    """Return the text of every page of the PDF at *file_path*, concatenated.

    The file is opened in binary mode and read through ``PyPDF2.PdfReader``.
    Page texts are joined in document order with no separator between pages.
    """
    with open(file_path, 'rb') as pdf_stream:
        pdf = PyPDF2.PdfReader(pdf_stream)
        # Build the result while the file handle is still open, exactly as
        # the page-by-page loop did.
        return "".join(page.extract_text() for page in pdf.pages)
132
+
133
def extract_text_from_docx(file_path: str) -> str:
    """Return the paragraph text of the DOCX at *file_path*.

    Each paragraph's text is followed by a newline, so a non-empty result
    always ends with ``"\\n"``; a document with no paragraphs yields ``""``.
    """
    document = docx.Document(file_path)
    return "".join(f"{para.text}\n" for para in document.paragraphs)
140
+
141
def detect_healthcare_data_type(df: pd.DataFrame) -> Dict[str, Any]:
    """Classify a table as facility or bed data from its column labels.

    Matching is a case-insensitive substring test of each column label
    against fixed keyword lists. Labels are converted with ``str()`` first so
    that non-string labels (for example integer headers) cannot break the
    detection.

    Side effect: when the table is classified as bed data and has at least
    one "current" and one "previous" column, the columns ``bed_change`` and
    ``percent_change`` are added to *df* in place (computed from the first
    match of each kind).

    Args:
        df: The table to inspect. May be modified as described above.

    Returns:
        A dict that is empty when nothing matched, otherwise containing some
        of:
          * ``type`` -- ``'facility_data'`` or ``'bed_data'`` (bed data wins
            when both kinds of column are present),
          * ``facility_types`` -- value counts of the first type-like column,
          * ``total_change`` -- sum of ``bed_change``,
          * ``has_derived_metrics`` -- ``True`` when the derived columns were
            added.
    """
    facility_indicators = ('facility', 'hospital', 'clinic', 'center', 'site', 'name')
    type_indicators = ('type', 'category', 'class')
    bed_indicators = ('bed', 'capacity', 'occupancy')
    current_indicators = ('current', '2023', '2024')
    prev_indicators = ('prev', '2022', 'previous')

    def columns_matching(indicators):
        # Return the original labels (in column order) whose lowercase text
        # contains any of the indicator substrings.
        return [
            col for col in df.columns
            if any(indicator in str(col).lower() for indicator in indicators)
        ]

    healthcare_info: Dict[str, Any] = {}

    if columns_matching(facility_indicators):
        healthcare_info['type'] = 'facility_data'
        type_cols = columns_matching(type_indicators)
        if type_cols:
            healthcare_info['facility_types'] = (
                df[type_cols[0]].value_counts().to_dict()
            )

    if columns_matching(bed_indicators):
        # Bed data deliberately overrides a facility classification.
        healthcare_info['type'] = 'bed_data'

        current_cols = columns_matching(current_indicators)
        prev_cols = columns_matching(prev_indicators)

        if current_cols and prev_cols:
            previous = df[prev_cols[0]]

            df['bed_change'] = df[current_cols[0]] - previous
            healthcare_info['total_change'] = df['bed_change'].sum()

            # Vectorised percent change. Zero baselines are masked out before
            # the division (so nothing is ever divided by zero) and reported
            # as 0 afterwards. Unlike a row-wise apply, this also works when
            # the frame has no rows.
            nonzero = previous != 0
            percent = df['bed_change'] / previous.where(nonzero) * 100
            df['percent_change'] = percent.where(nonzero, 0)
            healthcare_info['has_derived_metrics'] = True

    return healthcare_info
199
+