File size: 4,604 Bytes
023cf3a
44836be
5cd1b74
ef17f73
 
 
 
 
 
44836be
 
ef17f73
 
 
 
44836be
 
 
ef17f73
 
44836be
5cd1b74
ef17f73
44836be
 
ef17f73
 
 
 
 
 
 
 
44836be
ef17f73
 
 
 
 
44836be
ef17f73
 
 
 
44836be
ef17f73
 
 
 
 
 
5cd1b74
ef17f73
5cd1b74
ef17f73
 
5cd1b74
44836be
ef17f73
 
 
 
 
 
7ae997f
5cd1b74
ef17f73
 
 
 
5cd1b74
ef17f73
 
 
 
 
 
 
44836be
5cd1b74
ef17f73
 
 
5cd1b74
ef17f73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5cd1b74
ef17f73
 
 
 
 
 
5cd1b74
ef17f73
44836be
ef17f73
5cd1b74
ef17f73
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# upload_ingest.py
import os
import json
import pandas as pd
from typing import Dict, List, Any, Optional
import logging

# Configure logging at import time so extraction errors are visible by default.
# NOTE(review): basicConfig() at module import is a side effect that can clash
# with an application's own logging setup — confirm this module is only ever
# used as a script/entry point, otherwise move configuration to the caller.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def _profile_dataframe(df: pd.DataFrame) -> Dict[str, Any]:
    """Return a lightweight profile of *df*: shape, columns, dtypes, null counts, and a 3-row sample."""
    return {
        "shape": df.shape,
        "columns": list(df.columns),
        "data_types": df.dtypes.to_dict(),
        "null_counts": df.isnull().sum().to_dict(),
        "sample_data": df.head(3).to_dict(),
    }


def _dataframe_text_chunks(filename: str, df: pd.DataFrame) -> List[str]:
    """Build the human-readable text chunks for a tabular file (header, shape, columns, first 5 rows)."""
    chunks = [
        f"File: {filename}",
        f"Shape: {df.shape[0]} rows, {df.shape[1]} columns",
        # str() guards against non-string column labels, which would make join() raise.
        "Columns: " + ", ".join(str(col) for col in df.columns),
    ]
    # enumerate() keeps the row numbering positional even when the frame's
    # index is non-numeric; iterrows()' label would break "Row {i+1}" there.
    for pos, (_, row) in enumerate(df.head(5).iterrows(), start=1):
        row_text = " | ".join(f"{col}: {val}" for col, val in row.items())
        chunks.append(f"Row {pos}: {row_text}")
    return chunks


def extract_text_from_files(file_paths: List[str]) -> Dict[str, Any]:
    """Extract text chunks and artifact metadata from a list of files.

    Supported formats: CSV, Excel (.xlsx/.xls), JSON, and plain text
    (.txt/.md). Files with any other extension are silently skipped.
    Processing is best-effort: a failure on one file is logged (with
    traceback) and the remaining files are still processed.

    Args:
        file_paths: Paths of the files to ingest.

    Returns:
        Dict with three keys:
          - "chunks": list[str] of extracted text chunks from all files.
          - "artifacts": list of per-file metadata records
            (type, path, filename, and a "summary" for tabular files).
          - "data_summary": mapping of filename -> profile dict for
            CSV/Excel files only.
    """
    all_chunks: List[str] = []
    artifacts: List[Dict[str, Any]] = []
    data_summary: Dict[str, Any] = {}

    for file_path in file_paths:
        try:
            file_ext = os.path.splitext(file_path)[1].lower()
            filename = os.path.basename(file_path)

            if file_ext == '.csv':
                df = pd.read_csv(file_path)
                data_summary[filename] = _profile_dataframe(df)
                all_chunks.extend(_dataframe_text_chunks(filename, df))
                artifacts.append({
                    "type": "csv",
                    "path": file_path,
                    "summary": data_summary[filename],
                    "filename": filename,
                })

            elif file_ext in ('.xlsx', '.xls'):
                # NOTE(review): read_excel() without sheet_name reads only the
                # first sheet — confirm multi-sheet workbooks don't need
                # per-sheet ingestion.
                df = pd.read_excel(file_path)
                data_summary[filename] = _profile_dataframe(df)
                all_chunks.extend(_dataframe_text_chunks(filename, df))
                artifacts.append({
                    "type": "excel",
                    "path": file_path,
                    "summary": data_summary[filename],
                    "filename": filename,
                })

            elif file_ext == '.json':
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                # The whole document becomes a single pretty-printed chunk.
                all_chunks.append(json.dumps(data, indent=2))
                artifacts.append({
                    "type": "json",
                    "path": file_path,
                    "filename": filename,
                })

            elif file_ext in ('.txt', '.md'):
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                # Fixed 1000-character windows (no overlap).
                all_chunks.extend(
                    content[i:i + 1000] for i in range(0, len(content), 1000)
                )
                artifacts.append({
                    "type": "text",
                    "path": file_path,
                    "filename": filename,
                })

        except Exception:
            # Best-effort ingestion: record the full traceback and move on
            # to the next file instead of aborting the whole batch.
            logger.exception("Error processing %s", file_path)

    return {
        "chunks": all_chunks,
        "artifacts": artifacts,
        "data_summary": data_summary,
    }