# upload_ingest.py
import os
import json
import pandas as pd
from typing import Dict, List, Any, Optional
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def extract_text_from_files(file_paths: List[str]) -> Dict[str, Any]:
    """Enhanced file extraction with better healthcare data handling"""
    all_chunks = []
    artifacts = []
    data_summary = {}

    for file_path in file_paths:
        try:
            file_ext = os.path.splitext(file_path)[1].lower()
            filename = os.path.basename(file_path)

            if file_ext == '.csv':
                # Enhanced CSV processing
                df = pd.read_csv(file_path)

                # Basic data profiling
                data_summary[filename] = {
                    "shape": df.shape,
                    "columns": list(df.columns),
                    "data_types": df.dtypes.to_dict(),
                    "null_counts": df.isnull().sum().to_dict(),
                    "sample_data": df.head(3).to_dict()
                }

                # Extract text representation
                text_chunks = []
                text_chunks.append(f"File: {filename}")
                text_chunks.append(f"Shape: {df.shape[0]} rows, {df.shape[1]} columns")
                text_chunks.append("Columns: " + ", ".join(df.columns))

                # Add sample data
                for i, row in df.head(5).iterrows():
                    row_text = " | ".join(f"{col}: {val}" for col, val in row.items())
                    text_chunks.append(f"Row {i+1}: {row_text}")

                all_chunks.extend(text_chunks)
                artifacts.append({
                    "type": "csv",
                    "path": file_path,
                    "summary": data_summary[filename],
                    "filename": filename
                })

            elif file_ext == '.json':
                # JSON processing
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                text_chunks = [json.dumps(data, indent=2)]
                all_chunks.extend(text_chunks)
                artifacts.append({
                    "type": "json",
                    "path": file_path,
                    "filename": filename
                })

            elif file_ext in ['.txt', '.md']:
                # Text file processing
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                # Split into fixed-size chunks
                chunks = [content[i:i + 1000] for i in range(0, len(content), 1000)]
                all_chunks.extend(chunks)
                artifacts.append({
                    "type": "text",
                    "path": file_path,
                    "filename": filename
                })

            elif file_ext in ['.xlsx', '.xls']:
                # Excel processing
                df = pd.read_excel(file_path)

                # Basic data profiling
                data_summary[filename] = {
                    "shape": df.shape,
                    "columns": list(df.columns),
                    "data_types": df.dtypes.to_dict(),
                    "null_counts": df.isnull().sum().to_dict(),
                    "sample_data": df.head(3).to_dict()
                }

                # Extract text representation
                text_chunks = []
                text_chunks.append(f"File: {filename}")
                text_chunks.append(f"Shape: {df.shape[0]} rows, {df.shape[1]} columns")
                text_chunks.append("Columns: " + ", ".join(df.columns))

                # Add sample data
                for i, row in df.head(5).iterrows():
                    row_text = " | ".join(f"{col}: {val}" for col, val in row.items())
                    text_chunks.append(f"Row {i+1}: {row_text}")

                all_chunks.extend(text_chunks)
                artifacts.append({
                    "type": "excel",
                    "path": file_path,
                    "summary": data_summary[filename],
                    "filename": filename
                })

        except Exception as e:
            logger.error(f"Error processing {file_path}: {e}")

    return {
        "chunks": all_chunks,
        "artifacts": artifacts,
        "data_summary": data_summary
    }
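

# --- Usage sketch (added for illustration; not part of the original module) ---
# A minimal example of how extract_text_from_files might be called. The file
# names below ("sample_claims.csv", "notes.md") are hypothetical placeholders.
if __name__ == "__main__":
    result = extract_text_from_files(["sample_claims.csv", "notes.md"])
    logger.info("Extracted %d text chunks from %d artifacts",
                len(result["chunks"]), len(result["artifacts"]))
    for artifact in result["artifacts"]:
        logger.info("  %s -> %s", artifact["filename"], artifact["type"])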