Spaces:
Sleeping
Sleeping
File size: 2,831 Bytes
023cf3a 44836be 023cf3a 44836be 7ae997f 44836be 023cf3a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
# upload_ingest.py
import pandas as pd
import os
from typing import Dict, List, Any
def _to_builtin(value: Any) -> Any:
    """Return the plain Python equivalent of a NumPy scalar; pass other values through."""
    return value.item() if hasattr(value, "item") else value


def _summarize_healthcare_columns(df: pd.DataFrame) -> Dict[str, Any]:
    """Build the healthcare summary for one CSV frame from its column names.

    Returns an empty dict when none of the recognised columns
    (``facility_name``, ``facility_type``, ``beds_current``, ``beds_prev``)
    are present.
    """
    info: Dict[str, Any] = {}
    columns = df.columns

    # Facility data: either facility column marks the file as facility data.
    if 'facility_name' in columns or 'facility_type' in columns:
        info['type'] = 'facility_data'
        if 'facility_type' in columns:
            info['facility_types'] = df['facility_type'].value_counts().to_dict()

    # Bed data: checked second, so it overwrites 'type' when a file has both
    # facility and bed columns.
    has_current = 'beds_current' in columns
    has_prev = 'beds_prev' in columns
    if has_current or has_prev:
        info['type'] = 'bed_data'
        if 'zone' in columns:
            info['zones'] = df['zone'].unique().tolist()
        if has_current and has_prev:
            # Adds a derived 'bed_change' column to df in place, so it also
            # shows up in any rows sampled from df afterwards.
            df['bed_change'] = df['beds_current'] - df['beds_prev']
            # Series.sum() yields a NumPy scalar; convert it so the summary
            # holds plain Python numbers like the other entries.
            info['total_change'] = _to_builtin(df['bed_change'].sum())

    return info


def extract_text_from_files(file_paths: List[str]) -> Dict[str, Any]:
    """Extract text and data from uploaded files with healthcare-specific handling.

    Args:
        file_paths: Paths of the uploaded files. ``None`` or an empty list
            yields an empty result.

    Returns:
        A dict with three keys:

        * ``"chunks"``: human-readable lines describing each file (name,
          shape and columns for CSVs; a ``Document:`` line for
          ``.pdf``/``.docx``/``.txt`` files; an ``Error processing ...``
          line for a file that raised).
        * ``"artifacts"``: one entry per processed file; CSV entries carry
          the first three rows under ``"sample"``.
        * ``"healthcare_data"``: per-CSV summaries keyed by file name,
          present only when recognised healthcare columns were found.

    Files with any other extension are ignored. Extension matching is
    case-insensitive.
    """
    result = {
        "chunks": [],
        "artifacts": [],
        "healthcare_data": {}
    }

    for file_path in file_paths or []:
        # Best-effort per file: a failure is reported as a chunk and the
        # remaining files are still processed.
        try:
            file_name = os.path.basename(file_path)
            # Compare on a lowered copy so 'DATA.CSV' is handled like
            # 'data.csv'; the original name is what gets reported.
            lowered_name = file_name.lower()

            if lowered_name.endswith('.csv'):
                # Handle CSV files with healthcare data
                df = pd.read_csv(file_path)

                # Extract basic info
                result["chunks"].append(f"File: {file_name}")
                result["chunks"].append(f"Shape: {df.shape}")
                # map(str, ...) keeps the join safe for non-string labels.
                result["chunks"].append(f"Columns: {', '.join(map(str, df.columns))}")

                # Healthcare-specific processing
                healthcare_info = _summarize_healthcare_columns(df)
                if healthcare_info:
                    result["healthcare_data"][file_name] = healthcare_info

                # Add sample data
                result["artifacts"].append({
                    "file": file_name,
                    "type": "csv",
                    "sample": df.head(3).to_dict('records')
                })

            elif lowered_name.endswith(('.pdf', '.docx', '.txt')):
                # For text files, just note the file
                result["chunks"].append(f"Document: {file_name}")
                result["artifacts"].append({
                    "file": file_name,
                    "type": "document"
                })

        except Exception as e:
            result["chunks"].append(f"Error processing {file_path}: {str(e)}")

    return result
|