File size: 2,831 Bytes
023cf3a
 
44836be
 
 
 
 
 
 
 
 
023cf3a
44836be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7ae997f
44836be
 
 
 
 
 
 
 
 
 
 
 
 
023cf3a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# upload_ingest.py
import pandas as pd
import os
from typing import Dict, List, Any

def _to_builtin(value: Any) -> Any:
    """Return *value* as a built-in Python object when it is a NumPy scalar."""
    # NumPy scalars (e.g. numpy.int64 from Series.sum()) expose .item();
    # plain Python numbers do not, so they pass through unchanged.
    return value.item() if hasattr(value, "item") else value


def _summarize_healthcare_columns(df: pd.DataFrame) -> Dict[str, Any]:
    """Build the healthcare summary for one CSV based on its column names.

    Args:
        df: The parsed CSV. When both ``beds_current`` and ``beds_prev`` are
            present, a ``bed_change`` column is added to it in place.

    Returns:
        A dict that may contain ``type``, ``facility_types``, ``zones`` and
        ``total_change``; empty when no recognised column is present.
    """
    info: Dict[str, Any] = {}
    columns = df.columns

    # Facility data: either column is enough to classify the file.
    if 'facility_name' in columns or 'facility_type' in columns:
        info['type'] = 'facility_data'
        if 'facility_type' in columns:
            info['facility_types'] = df['facility_type'].value_counts().to_dict()

    # Bed data: checked second, so it overrides 'facility_data' when a file
    # carries both kinds of columns.
    if 'beds_current' in columns or 'beds_prev' in columns:
        info['type'] = 'bed_data'
        if 'zone' in columns:
            info['zones'] = df['zone'].unique().tolist()

        # The change can only be computed when both bed columns exist.
        if 'beds_current' in columns and 'beds_prev' in columns:
            df['bed_change'] = df['beds_current'] - df['beds_prev']
            # Store a built-in int/float so the summary stays JSON-serialisable.
            info['total_change'] = _to_builtin(df['bed_change'].sum())

    return info


def extract_text_from_files(file_paths: List[str]) -> Dict[str, Any]:
    """Extract text and data from uploaded files with healthcare-specific handling.

    CSV files are loaded with pandas and described by name, shape and column
    list; recognised healthcare columns additionally produce a summary entry.
    ``.pdf``, ``.docx`` and ``.txt`` files are only recorded by name; their
    contents are not read. Extensions are matched case-insensitively. Files
    with any other extension are skipped without a note.

    Args:
        file_paths: Paths of the uploaded files to inspect.

    Returns:
        A dict with three keys:

        * ``"chunks"``: human-readable description strings, including one
          ``"Error processing ..."`` entry per file that raised.
        * ``"artifacts"``: one dict per processed file; CSV entries carry a
          ``"sample"`` of the first three rows (including the derived
          ``bed_change`` column when it was computed).
        * ``"healthcare_data"``: per-file summaries keyed by file name, only
          for CSVs with recognised healthcare columns.
    """
    result: Dict[str, Any] = {
        "chunks": [],
        "artifacts": [],
        "healthcare_data": {}
    }

    for file_path in file_paths:
        # Best-effort ingestion: a failure in one file is reported as a chunk
        # and must not prevent the remaining files from being processed.
        try:
            file_name = os.path.basename(file_path)
            # Compare on the lower-cased name so 'DATA.CSV' is handled like
            # 'data.csv'; the original name is still what gets reported.
            lowered_name = file_name.lower()

            if lowered_name.endswith('.csv'):
                df = pd.read_csv(file_path)

                # Basic description, taken before any derived column is added.
                result["chunks"].append(f"File: {file_name}")
                result["chunks"].append(f"Shape: {df.shape}")
                result["chunks"].append(f"Columns: {', '.join(df.columns)}")

                healthcare_info = _summarize_healthcare_columns(df)
                if healthcare_info:
                    result["healthcare_data"][file_name] = healthcare_info

                result["artifacts"].append({
                    "file": file_name,
                    "type": "csv",
                    "sample": df.head(3).to_dict('records')
                })

            elif lowered_name.endswith(('.pdf', '.docx', '.txt')):
                # Documents are only noted by name.
                result["chunks"].append(f"Document: {file_name}")
                result["artifacts"].append({
                    "file": file_name,
                    "type": "document"
                })

        except Exception as e:
            result["chunks"].append(f"Error processing {file_path}: {str(e)}")

    return result