# upload_ingest.py
import os
from typing import Any, Dict, List

import pandas as pd

# Lower-case extensions recorded as plain documents (content is not parsed).
_DOCUMENT_EXTENSIONS = ('.pdf', '.docx', '.txt')


def _to_builtin(value: Any) -> Any:
    """Return *value* as a built-in Python scalar when it is a NumPy scalar.

    pandas reductions such as ``Series.sum()`` return NumPy scalars
    (e.g. ``numpy.int64``), which ``json.dumps`` cannot serialise.
    Values without an ``item`` method are returned unchanged.
    """
    return value.item() if hasattr(value, 'item') else value


def _summarize_healthcare_columns(df: pd.DataFrame) -> Dict[str, Any]:
    """Build the healthcare summary for one CSV, keyed off its column names.

    Args:
        df: The parsed CSV. When both ``beds_current`` and ``beds_prev`` are
            present, a derived ``bed_change`` column is added to *df* in
            place, so it also shows up in any sample taken afterwards.

    Returns:
        A dict that may contain ``type``, ``facility_types``, ``zones`` and
        ``total_change``; empty when no recognised column is present.
    """
    info: Dict[str, Any] = {}
    columns = df.columns

    if 'facility_name' in columns or 'facility_type' in columns:
        info['type'] = 'facility_data'
        if 'facility_type' in columns:
            info['facility_types'] = df['facility_type'].value_counts().to_dict()

    if 'beds_current' in columns or 'beds_prev' in columns:
        # Evaluated after the facility check, so a file carrying both kinds
        # of columns ends up labelled 'bed_data'.
        info['type'] = 'bed_data'
        if 'zone' in columns:
            info['zones'] = df['zone'].unique().tolist()
        if 'beds_current' in columns and 'beds_prev' in columns:
            df['bed_change'] = df['beds_current'] - df['beds_prev']
            # Convert the NumPy scalar so the result stays JSON-serialisable.
            info['total_change'] = _to_builtin(df['bed_change'].sum())

    return info


def extract_text_from_files(file_paths: List[str]) -> Dict[str, Any]:
    """Extract text and data from uploaded files with healthcare-specific handling.

    CSV files are loaded with pandas and summarised; ``.pdf``, ``.docx`` and
    ``.txt`` files are only recorded by name (their content is not read).
    Extensions are matched case-insensitively. Files with any other extension
    are skipped without producing output.

    Args:
        file_paths: Paths of the files to process, handled in order.

    Returns:
        A dict with three keys:

        * ``"chunks"``: list of descriptive strings (file name, shape and
          columns for CSVs; a ``Document:`` line for documents; an
          ``Error processing ...`` line for any file that raised).
        * ``"artifacts"``: list of dicts with ``file`` and ``type``; CSV
          entries also carry ``sample``, the first three rows as records.
        * ``"healthcare_data"``: maps a CSV's base name to its non-empty
          healthcare summary.

    Processing is best-effort: an exception raised for one file is turned
    into an error chunk and the remaining files are still processed. Chunks
    appended before the failure are kept.
    """
    result: Dict[str, Any] = {
        "chunks": [],
        "artifacts": [],
        "healthcare_data": {},
    }

    for file_path in file_paths:
        try:
            file_name = os.path.basename(file_path)
            # Match on the lower-cased name so 'DATA.CSV' is handled like
            # 'data.csv'; the original name is what gets reported.
            lowered_name = file_name.lower()

            if lowered_name.endswith('.csv'):
                df = pd.read_csv(file_path)

                # Described before the summary step, so the shape and column
                # list reflect the file as read (without 'bed_change').
                result["chunks"].append(f"File: {file_name}")
                result["chunks"].append(f"Shape: {df.shape}")
                result["chunks"].append(f"Columns: {', '.join(df.columns)}")

                healthcare_info = _summarize_healthcare_columns(df)
                if healthcare_info:
                    result["healthcare_data"][file_name] = healthcare_info

                result["artifacts"].append({
                    "file": file_name,
                    "type": "csv",
                    "sample": df.head(3).to_dict('records'),
                })
            elif lowered_name.endswith(_DOCUMENT_EXTENSIONS):
                # Documents are only noted; no text is extracted from them.
                result["chunks"].append(f"Document: {file_name}")
                result["artifacts"].append({
                    "file": file_name,
                    "type": "document",
                })
        except Exception as e:
            # Deliberate per-file boundary: report the failure and carry on.
            result["chunks"].append(f"Error processing {file_path}: {e}")

    return result