# upload_ingest.py
"""Ingest uploaded files (CSV / JSON / text / Excel) into text chunks and artifacts."""
import json
import logging
import os
from typing import Any, Dict, List

import pandas as pd

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def _summarize_dataframe(df: pd.DataFrame) -> Dict[str, Any]:
    """Return a lightweight profile of *df*: shape, columns, dtypes, null counts, sample rows."""
    return {
        "shape": df.shape,
        "columns": list(df.columns),
        "data_types": df.dtypes.to_dict(),
        "null_counts": df.isnull().sum().to_dict(),
        "sample_data": df.head(3).to_dict(),
    }


def _dataframe_text_chunks(df: pd.DataFrame, filename: str) -> List[str]:
    """Render *df* as human-readable text chunks: header lines plus the first 5 rows."""
    chunks = [
        f"File: {filename}",
        f"Shape: {df.shape[0]} rows, {df.shape[1]} columns",
        # str() guards against non-string column labels (join would raise TypeError)
        "Columns: " + ", ".join(str(col) for col in df.columns),
    ]
    # Number rows with enumerate, not the index label: the label is only a
    # row number when the index is a default RangeIndex.
    for pos, (_, row) in enumerate(df.head(5).iterrows(), start=1):
        row_text = " | ".join(f"{col}: {val}" for col, val in row.items())
        chunks.append(f"Row {pos}: {row_text}")
    return chunks


def extract_text_from_files(file_paths: List[str]) -> Dict[str, Any]:
    """Extract text chunks and artifact records from uploaded files.

    Supported extensions: .csv, .json, .txt, .md, .xlsx, .xls. Tabular files
    are profiled (shape, columns, dtypes, nulls, sample) and rendered as text;
    JSON is pretty-printed; plain text is split into fixed 1000-char chunks.

    Args:
        file_paths: Paths of the files to ingest.

    Returns:
        Dict with keys:
            "chunks": flat list of text chunks from all files,
            "artifacts": per-file metadata records (type, path, filename, summary),
            "data_summary": per-filename profiling dict for tabular files.

    Files that fail to parse are logged and skipped; they never abort the batch.
    """
    all_chunks: List[str] = []
    artifacts: List[Dict[str, Any]] = []
    data_summary: Dict[str, Any] = {}

    for file_path in file_paths:
        try:
            file_ext = os.path.splitext(file_path)[1].lower()
            filename = os.path.basename(file_path)

            if file_ext == ".csv":
                df = pd.read_csv(file_path)
                data_summary[filename] = _summarize_dataframe(df)
                all_chunks.extend(_dataframe_text_chunks(df, filename))
                artifacts.append({
                    "type": "csv",
                    "path": file_path,
                    "summary": data_summary[filename],
                    "filename": filename,
                })

            elif file_ext == ".json":
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                all_chunks.append(json.dumps(data, indent=2))
                artifacts.append({
                    "type": "json",
                    "path": file_path,
                    "filename": filename,
                })

            elif file_ext in (".txt", ".md"):
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                # Fixed-size 1000-char chunks, no overlap.
                all_chunks.extend(
                    content[i:i + 1000] for i in range(0, len(content), 1000)
                )
                artifacts.append({
                    "type": "text",
                    "path": file_path,
                    "filename": filename,
                })

            elif file_ext in (".xlsx", ".xls"):
                df = pd.read_excel(file_path)
                data_summary[filename] = _summarize_dataframe(df)
                all_chunks.extend(_dataframe_text_chunks(df, filename))
                artifacts.append({
                    "type": "excel",
                    "path": file_path,
                    "summary": data_summary[filename],
                    "filename": filename,
                })

            else:
                # Previously unsupported types were silently ignored.
                logger.warning("Skipping unsupported file type: %s", file_path)

        except Exception:
            # Best-effort batch: log with traceback and continue with the next file.
            logger.exception("Error processing %s", file_path)

    return {
        "chunks": all_chunks,
        "artifacts": artifacts,
        "data_summary": data_summary,
    }