Medica_DecisionSupportAI / upload_ingest.py
Rajan Sharma
Update upload_ingest.py
44836be verified
raw
history blame
2.83 kB
# upload_ingest.py
import pandas as pd
import os
from typing import Dict, List, Any
def extract_text_from_files(file_paths: List[str]) -> Dict[str, Any]:
    """Summarise uploaded files, with extra handling for healthcare CSVs.

    Each path is handled independently and on a best-effort basis: any
    exception raised while processing one file is recorded as an
    ``"Error processing ..."`` chunk and the remaining files are still
    processed.

    Args:
        file_paths: Paths of the uploaded files.

    Returns:
        A dict with three keys:

        * ``"chunks"``: list of strings. For a CSV: its file name, shape and
          column names; for a ``.pdf``/``.docx``/``.txt`` file: a
          ``"Document: <name>"`` note; for a failed file: an error message.
        * ``"artifacts"``: list of dicts, one per successfully handled file.
          CSV entries carry the first three rows under ``"sample"``;
          document entries only carry the file name and type.
        * ``"healthcare_data"``: mapping of CSV file name to the info dict
          built from its recognised columns (absent when none were found).

    Files with any other extension are skipped without a note. Document
    files are only registered by name; their content is not read here.
    """
    result: Dict[str, Any] = {
        "chunks": [],
        "artifacts": [],
        "healthcare_data": {},
    }

    for file_path in file_paths:
        try:
            file_name = os.path.basename(file_path)
            # Match extensions case-insensitively so "BEDS.CSV" or
            # "Report.PDF" are not silently dropped.
            lowered_name = file_name.lower()

            if lowered_name.endswith(".csv"):
                df = pd.read_csv(file_path)

                # Basic description of the table.
                result["chunks"].append(f"File: {file_name}")
                result["chunks"].append(f"Shape: {df.shape}")
                result["chunks"].append(f"Columns: {', '.join(df.columns)}")

                # Healthcare-specific summary; may add "bed_change" to df.
                healthcare_info = _summarize_healthcare_columns(df)
                if healthcare_info:
                    result["healthcare_data"][file_name] = healthcare_info

                # Sample is taken after the summary so that a computed
                # "bed_change" column is part of the sampled rows.
                result["artifacts"].append({
                    "file": file_name,
                    "type": "csv",
                    "sample": df.head(3).to_dict("records"),
                })
            elif lowered_name.endswith((".pdf", ".docx", ".txt")):
                # Text-like documents are only noted, not parsed.
                result["chunks"].append(f"Document: {file_name}")
                result["artifacts"].append({
                    "file": file_name,
                    "type": "document",
                })
        except Exception as e:
            # Deliberate catch-all: one unreadable upload must not abort the
            # whole batch, so the failure is reported as a chunk instead.
            result["chunks"].append(f"Error processing {file_path}: {str(e)}")

    return result


def _summarize_healthcare_columns(df: "pd.DataFrame") -> Dict[str, Any]:
    """Build the healthcare info dict from the recognised columns of *df*.

    Side effect: when both ``beds_current`` and ``beds_prev`` exist, a
    ``bed_change`` column (current minus previous) is added to *df* in place.

    Returns an empty dict when no facility or bed column is present.
    """
    info: Dict[str, Any] = {}
    columns = df.columns

    # Facility data.
    if "facility_name" in columns or "facility_type" in columns:
        info["type"] = "facility_data"
        if "facility_type" in columns:
            info["facility_types"] = df["facility_type"].value_counts().to_dict()

    # Bed data. When a file has both facility and bed columns, the type ends
    # up as "bed_data" because this check runs last.
    if "beds_current" in columns or "beds_prev" in columns:
        info["type"] = "bed_data"
        if "zone" in columns:
            info["zones"] = df["zone"].unique().tolist()
        # The change can only be computed when both columns exist.
        if "beds_current" in columns and "beds_prev" in columns:
            df["bed_change"] = df["beds_current"] - df["beds_prev"]
            total = df["bed_change"].sum()
            # Series.sum() yields a NumPy scalar, which json.dumps rejects;
            # store the equivalent native Python number instead.
            info["total_change"] = total.item() if hasattr(total, "item") else total

    return info