# upload_ingest.py
"""Ingest uploaded files (CSV / JSON / text / Excel) into text chunks and artifacts."""
import json
import logging
import os
from typing import Any, Dict, List

import pandas as pd

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def _summarize_dataframe(df: pd.DataFrame) -> Dict[str, Any]:
    """Return a lightweight profile of *df*: shape, columns, dtypes, null counts, sample rows."""
    return {
        "shape": df.shape,
        "columns": list(df.columns),
        "data_types": df.dtypes.to_dict(),
        "null_counts": df.isnull().sum().to_dict(),
        "sample_data": df.head(3).to_dict(),
    }


def _dataframe_text_chunks(df: pd.DataFrame, filename: str) -> List[str]:
    """Render *df* as human-readable text chunks: header lines plus the first 5 rows."""
    chunks = [
        f"File: {filename}",
        f"Shape: {df.shape[0]} rows, {df.shape[1]} columns",
        # str() guards against non-string column labels (join would raise TypeError)
        "Columns: " + ", ".join(str(col) for col in df.columns),
    ]
    # Number rows with enumerate, not the index label: the label is only a
    # row number when the index is a default RangeIndex.
    for pos, (_, row) in enumerate(df.head(5).iterrows(), start=1):
        row_text = " | ".join(f"{col}: {val}" for col, val in row.items())
        chunks.append(f"Row {pos}: {row_text}")
    return chunks


def extract_text_from_files(file_paths: List[str]) -> Dict[str, Any]:
    """Extract text chunks and artifact records from uploaded files.

    Supported extensions: .csv, .json, .txt, .md, .xlsx, .xls. Tabular files
    are profiled (shape, columns, dtypes, nulls, sample) and rendered as text;
    JSON is pretty-printed; plain text is split into fixed 1000-char chunks.

    Args:
        file_paths: Paths of the files to ingest.

    Returns:
        Dict with keys:
            "chunks": flat list of text chunks from all files,
            "artifacts": per-file metadata records (type, path, filename, summary),
            "data_summary": per-filename profiling dict for tabular files.

    Files that fail to parse are logged and skipped; they never abort the batch.
    """
    all_chunks: List[str] = []
    artifacts: List[Dict[str, Any]] = []
    data_summary: Dict[str, Any] = {}

    for file_path in file_paths:
        try:
            file_ext = os.path.splitext(file_path)[1].lower()
            filename = os.path.basename(file_path)

            if file_ext == ".csv":
                df = pd.read_csv(file_path)
                data_summary[filename] = _summarize_dataframe(df)
                all_chunks.extend(_dataframe_text_chunks(df, filename))
                artifacts.append({
                    "type": "csv",
                    "path": file_path,
                    "summary": data_summary[filename],
                    "filename": filename,
                })

            elif file_ext == ".json":
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                all_chunks.append(json.dumps(data, indent=2))
                artifacts.append({
                    "type": "json",
                    "path": file_path,
                    "filename": filename,
                })

            elif file_ext in (".txt", ".md"):
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                # Fixed-size 1000-char chunks, no overlap.
                all_chunks.extend(
                    content[i:i + 1000] for i in range(0, len(content), 1000)
                )
                artifacts.append({
                    "type": "text",
                    "path": file_path,
                    "filename": filename,
                })

            elif file_ext in (".xlsx", ".xls"):
                df = pd.read_excel(file_path)
                data_summary[filename] = _summarize_dataframe(df)
                all_chunks.extend(_dataframe_text_chunks(df, filename))
                artifacts.append({
                    "type": "excel",
                    "path": file_path,
                    "summary": data_summary[filename],
                    "filename": filename,
                })

            else:
                # Previously unsupported types were silently ignored.
                logger.warning("Skipping unsupported file type: %s", file_path)

        except Exception:
            # Best-effort batch: log with traceback and continue with the next file.
            logger.exception("Error processing %s", file_path)

    return {
        "chunks": all_chunks,
        "artifacts": artifacts,
        "data_summary": data_summary,
    }