Spaces:
Sleeping
Sleeping
| # upload_ingest.py | |
| import os | |
| import json | |
| import pandas as pd | |
| from typing import Dict, List, Any, Optional | |
| import logging | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
def _summarize_dataframe(df: pd.DataFrame) -> Dict[str, Any]:
    """Build a lightweight profile of a DataFrame: shape, columns, dtypes, null counts, head."""
    return {
        "shape": df.shape,
        "columns": list(df.columns),
        "data_types": df.dtypes.to_dict(),
        "null_counts": df.isnull().sum().to_dict(),
        "sample_data": df.head(3).to_dict(),
    }


def _dataframe_text_chunks(df: pd.DataFrame, filename: str) -> List[str]:
    """Render a DataFrame as human-readable text chunks: header lines plus the first 5 rows."""
    chunks = [
        # BUG FIX: was f"File: (unknown)" — an f-string with no placeholder,
        # dropping the filename from the extracted text.
        f"File: {filename}",
        f"Shape: {df.shape[0]} rows, {df.shape[1]} columns",
        # map(str, ...): column labels are not guaranteed to be strings.
        "Columns: " + ", ".join(map(str, df.columns)),
    ]
    # enumerate() rather than the index label: row numbering stays 1..5 even
    # when the DataFrame has a non-default (e.g. string or datetime) index.
    for row_num, (_, row) in enumerate(df.head(5).iterrows(), start=1):
        row_text = " | ".join(f"{col}: {val}" for col, val in row.items())
        chunks.append(f"Row {row_num}: {row_text}")
    return chunks


def extract_text_from_files(file_paths: List[str]) -> Dict[str, Any]:
    """Enhanced file extraction with better healthcare data handling.

    Supports .csv, .xlsx/.xls (profiled + rendered as text), .json (pretty-printed),
    and .txt/.md (split into fixed 1000-char chunks). Files with any other
    extension are silently skipped; per-file errors are logged and processing
    continues with the remaining files (best-effort ingestion).

    Args:
        file_paths: Paths of the files to ingest.

    Returns:
        Dict with:
          - "chunks": flat list of text chunks extracted from every file,
          - "artifacts": one metadata dict per successfully processed file,
          - "data_summary": per-filename DataFrame profile for tabular files.
    """
    all_chunks: List[str] = []
    artifacts: List[Dict[str, Any]] = []
    data_summary: Dict[str, Any] = {}

    for file_path in file_paths:
        try:
            file_ext = os.path.splitext(file_path)[1].lower()
            filename = os.path.basename(file_path)

            if file_ext in ('.csv', '.xlsx', '.xls'):
                # CSV and Excel share the same profiling/text pipeline;
                # only the reader and the artifact "type" tag differ.
                if file_ext == '.csv':
                    df = pd.read_csv(file_path)
                    artifact_type = "csv"
                else:
                    df = pd.read_excel(file_path)  # first sheet only
                    artifact_type = "excel"
                data_summary[filename] = _summarize_dataframe(df)
                all_chunks.extend(_dataframe_text_chunks(df, filename))
                artifacts.append({
                    "type": artifact_type,
                    "path": file_path,
                    "summary": data_summary[filename],
                    "filename": filename,
                })
            elif file_ext == '.json':
                # JSON: one chunk containing the whole pretty-printed document.
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                all_chunks.append(json.dumps(data, indent=2))
                artifacts.append({
                    "type": "json",
                    "path": file_path,
                    "filename": filename,
                })
            elif file_ext in ('.txt', '.md'):
                # Plain text: fixed-size 1000-character chunks, no overlap.
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                all_chunks.extend(
                    content[i:i + 1000] for i in range(0, len(content), 1000)
                )
                artifacts.append({
                    "type": "text",
                    "path": file_path,
                    "filename": filename,
                })
        except Exception as e:
            # Best-effort: a bad file must not abort ingestion of the rest.
            logger.error(f"Error processing {file_path}: {e}")

    return {
        "chunks": all_chunks,
        "artifacts": artifacts,
        "data_summary": data_summary,
    }