Medica_DecisionSupportAI / upload_ingest.py
Rajan Sharma
Update upload_ingest.py
ef17f73 verified
raw
history blame
4.6 kB
# upload_ingest.py
import os
import json
import pandas as pd
from typing import Dict, List, Any, Optional
import logging
# Module-wide logging: INFO level so per-file ingestion errors are visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def extract_text_from_files(file_paths: List[str]) -> Dict[str, Any]:
    """Extract text chunks and artifact metadata from a batch of files.

    Supported extensions: .csv, .json, .txt, .md, .xlsx, .xls. Files that
    fail to parse (or have an unsupported extension) are logged and skipped,
    so one bad file never aborts the whole batch.

    Args:
        file_paths: Paths of the files to ingest.

    Returns:
        Dict with three keys:
            "chunks": list[str] of text fragments for downstream indexing.
            "artifacts": list[dict] describing each successfully parsed file
                (type, path, filename; tabular files also carry "summary").
            "data_summary": dict keyed by basename with profiling info for
                tabular files (shape, columns, dtypes, null counts, sample).
    """
    all_chunks: List[str] = []
    artifacts: List[Dict[str, Any]] = []
    data_summary: Dict[str, Any] = {}

    for file_path in file_paths:
        try:
            file_ext = os.path.splitext(file_path)[1].lower()
            filename = os.path.basename(file_path)

            if file_ext == '.csv':
                df = pd.read_csv(file_path)
                summary, text_chunks = _profile_dataframe(df, filename)
                data_summary[filename] = summary
                all_chunks.extend(text_chunks)
                artifacts.append({
                    "type": "csv",
                    "path": file_path,
                    "summary": summary,
                    "filename": filename
                })
            elif file_ext == '.json':
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                # One chunk holding the whole pretty-printed document.
                all_chunks.append(json.dumps(data, indent=2))
                artifacts.append({
                    "type": "json",
                    "path": file_path,
                    "filename": filename
                })
            elif file_ext in ['.txt', '.md']:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                # Fixed-width 1000-char chunking (no overlap).
                all_chunks.extend(
                    content[i:i + 1000] for i in range(0, len(content), 1000)
                )
                artifacts.append({
                    "type": "text",
                    "path": file_path,
                    "filename": filename
                })
            elif file_ext in ['.xlsx', '.xls']:
                # NOTE(review): read_excel reads only the first sheet by
                # default — confirm multi-sheet workbooks are out of scope.
                df = pd.read_excel(file_path)
                summary, text_chunks = _profile_dataframe(df, filename)
                data_summary[filename] = summary
                all_chunks.extend(text_chunks)
                artifacts.append({
                    "type": "excel",
                    "path": file_path,
                    "summary": summary,
                    "filename": filename
                })
        except Exception as e:
            # Best-effort ingestion: log the failure and move on.
            logger.error(f"Error processing {file_path}: {e}")

    return {
        "chunks": all_chunks,
        "artifacts": artifacts,
        "data_summary": data_summary
    }


def _profile_dataframe(df: "pd.DataFrame", filename: str) -> tuple:
    """Build the profiling summary and text chunks for a tabular file.

    Shared by the CSV and Excel branches, which previously duplicated this
    logic verbatim. Returns (summary_dict, text_chunks).
    """
    summary = {
        "shape": df.shape,
        "columns": list(df.columns),
        "data_types": df.dtypes.to_dict(),
        "null_counts": df.isnull().sum().to_dict(),
        "sample_data": df.head(3).to_dict()
    }
    text_chunks = [
        # BUG FIX: the chunk previously read "File: (unknown)" — the
        # filename was missing from the f-string.
        f"File: {filename}",
        f"Shape: {df.shape[0]} rows, {df.shape[1]} columns",
        "Columns: " + ", ".join(str(c) for c in df.columns),
    ]
    # enumerate() instead of the index value: identical for the default
    # RangeIndex, and robust when the index is string/datetime (where the
    # original `i + 1` would raise).
    for pos, (_, row) in enumerate(df.head(5).iterrows(), start=1):
        row_text = " | ".join(f"{col}: {val}" for col, val in row.items())
        text_chunks.append(f"Row {pos}: {row_text}")
    return summary, text_chunks