Spaces:

VEDAGI1
/

Medica_DecisionSupportAI

Sleeping

App Files Community

Rajan Sharma committed on Sep 24

Commit

4d43747

verified ·

1 Parent(s): 548a084

Update upload_ingest.py

Browse files

Files changed (1) hide show

upload_ingest.py +7 -118

upload_ingest.py CHANGED Viewed

@@ -1,120 +1,9 @@
 # upload_ingest.py
-import os
-import json
-import pandas as pd
-from typing import Dict, List, Any, Optional
-import logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-def extract_text_from_files(file_paths: List[str]) -> Dict[str, Any]:
-    """Enhanced file extraction with better healthcare data handling"""
-    all_chunks = []
-    artifacts = []
-    data_summary = {}
-    for file_path in file_paths:
-        try:
-            file_ext = os.path.splitext(file_path)[1].lower()
-            filename = os.path.basename(file_path)
-            if file_ext == '.csv':
-                # Enhanced CSV processing
-                df = pd.read_csv(file_path)
-                # Basic data profiling
-                data_summary[filename] = {
-                    "shape": df.shape,
-                    "columns": list(df.columns),
-                    "data_types": df.dtypes.to_dict(),
-                    "null_counts": df.isnull().sum().to_dict(),
-                    "sample_data": df.head(3).to_dict()
-                }
-                # Extract text representation
-                text_chunks = []
-                text_chunks.append(f"File: {filename}")
-                text_chunks.append(f"Shape: {df.shape[0]} rows, {df.shape[1]} columns")
-                text_chunks.append("Columns: " + ", ".join(df.columns))
-                # Add sample data
-                for i, row in df.head(5).iterrows():
-                    row_text = " | ".join(f"{col}: {val}" for col, val in row.items())
-                    text_chunks.append(f"Row {i+1}: {row_text}")
-                all_chunks.extend(text_chunks)
-                artifacts.append({
-                    "type": "csv",
-                    "path": file_path,
-                    "summary": data_summary[filename],
-                    "filename": filename
-                })
-            elif file_ext == '.json':
-                # JSON processing
-                with open(file_path, 'r', encoding='utf-8') as f:
-                    data = json.load(f)
-                text_chunks = [json.dumps(data, indent=2)]
-                all_chunks.extend(text_chunks)
-                artifacts.append({
-                    "type": "json",
-                    "path": file_path,
-                    "filename": filename
-                })
-            elif file_ext in ['.txt', '.md']:
-                # Text file processing
-                with open(file_path, 'r', encoding='utf-8') as f:
-                    content = f.read()
-                # Split into chunks
-                chunks = [content[i:i+1000] for i in range(0, len(content), 1000)]
-                all_chunks.extend(chunks)
-                artifacts.append({
-                    "type": "text",
-                    "path": file_path,
-                    "filename": filename
-                })
-            elif file_ext in ['.xlsx', '.xls']:
-                # Excel processing
-                df = pd.read_excel(file_path)
-                # Basic data profiling
-                data_summary[filename] = {
-                    "shape": df.shape,
-                    "columns": list(df.columns),
-                    "data_types": df.dtypes.to_dict(),
-                    "null_counts": df.isnull().sum().to_dict(),
-                    "sample_data": df.head(3).to_dict()
-                }
-                # Extract text representation
-                text_chunks = []
-                text_chunks.append(f"File: {filename}")
-                text_chunks.append(f"Shape: {df.shape[0]} rows, {df.shape[1]} columns")
-                text_chunks.append("Columns: " + ", ".join(df.columns))
-                # Add sample data
-                for i, row in df.head(5).iterrows():
-                    row_text = " | ".join(f"{col}: {val}" for col, val in row.items())
-                    text_chunks.append(f"Row {i+1}: {row_text}")
-                all_chunks.extend(text_chunks)
-                artifacts.append({
-                    "type": "excel",
-                    "path": file_path,
-                    "summary": data_summary[filename],
-                    "filename": filename
-                })
-        except Exception as e:
-            logger.error(f"Error processing {file_path}: {e}")
-    return {
-        "chunks": all_chunks,
-        "artifacts": artifacts,
-        "data_summary": data_summary
-    }

 # upload_ingest.py
+def extract_text_from_files(files):
+    chunks=[]
+    for f in files:
+        if f.endswith(".txt"):
+            with open(f,"r") as fh:
+                chunks.append(fh.read())
+    return {"chunks":chunks}