Spaces:
Sleeping
Sleeping
| # upload_ingest.py | |
| import os | |
| import json | |
| import pandas as pd | |
| from typing import Dict, List, Any, Optional | |
| import logging | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
def _summarize_dataframe(df: pd.DataFrame) -> Dict[str, Any]:
    """Build a lightweight profile of a DataFrame: shape, columns, dtypes, null counts, head."""
    return {
        "shape": df.shape,
        "columns": list(df.columns),
        "data_types": df.dtypes.to_dict(),
        "null_counts": df.isnull().sum().to_dict(),
        "sample_data": df.head(3).to_dict(),
    }


def _dataframe_text_chunks(df: pd.DataFrame, filename: str) -> List[str]:
    """Render a DataFrame as human-readable text chunks: header lines plus the first 5 rows."""
    chunks = [
        # BUG FIX: was f"File: (unknown)" — an f-string with no placeholder,
        # dropping the filename from the extracted text.
        f"File: {filename}",
        f"Shape: {df.shape[0]} rows, {df.shape[1]} columns",
        # map(str, ...): column labels are not guaranteed to be strings.
        "Columns: " + ", ".join(map(str, df.columns)),
    ]
    # enumerate() rather than the index label: row numbering stays 1..5 even
    # when the DataFrame has a non-default (e.g. string or datetime) index.
    for row_num, (_, row) in enumerate(df.head(5).iterrows(), start=1):
        row_text = " | ".join(f"{col}: {val}" for col, val in row.items())
        chunks.append(f"Row {row_num}: {row_text}")
    return chunks


def extract_text_from_files(file_paths: List[str]) -> Dict[str, Any]:
    """Enhanced file extraction with better healthcare data handling.

    Supports .csv, .xlsx/.xls (profiled + rendered as text), .json (pretty-printed),
    and .txt/.md (split into fixed 1000-char chunks). Files with any other
    extension are silently skipped; per-file errors are logged and processing
    continues with the remaining files (best-effort ingestion).

    Args:
        file_paths: Paths of the files to ingest.

    Returns:
        Dict with:
          - "chunks": flat list of text chunks extracted from every file,
          - "artifacts": one metadata dict per successfully processed file,
          - "data_summary": per-filename DataFrame profile for tabular files.
    """
    all_chunks: List[str] = []
    artifacts: List[Dict[str, Any]] = []
    data_summary: Dict[str, Any] = {}

    for file_path in file_paths:
        try:
            file_ext = os.path.splitext(file_path)[1].lower()
            filename = os.path.basename(file_path)

            if file_ext in ('.csv', '.xlsx', '.xls'):
                # CSV and Excel share the same profiling/text pipeline;
                # only the reader and the artifact "type" tag differ.
                if file_ext == '.csv':
                    df = pd.read_csv(file_path)
                    artifact_type = "csv"
                else:
                    df = pd.read_excel(file_path)  # first sheet only
                    artifact_type = "excel"
                data_summary[filename] = _summarize_dataframe(df)
                all_chunks.extend(_dataframe_text_chunks(df, filename))
                artifacts.append({
                    "type": artifact_type,
                    "path": file_path,
                    "summary": data_summary[filename],
                    "filename": filename,
                })
            elif file_ext == '.json':
                # JSON: one chunk containing the whole pretty-printed document.
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                all_chunks.append(json.dumps(data, indent=2))
                artifacts.append({
                    "type": "json",
                    "path": file_path,
                    "filename": filename,
                })
            elif file_ext in ('.txt', '.md'):
                # Plain text: fixed-size 1000-character chunks, no overlap.
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                all_chunks.extend(
                    content[i:i + 1000] for i in range(0, len(content), 1000)
                )
                artifacts.append({
                    "type": "text",
                    "path": file_path,
                    "filename": filename,
                })
        except Exception as e:
            # Best-effort: a bad file must not abort ingestion of the rest.
            logger.error(f"Error processing {file_path}: {e}")

    return {
        "chunks": all_chunks,
        "artifacts": artifacts,
        "data_summary": data_summary,
    }