Rajan Sharma committed on
Commit
4d43747
·
verified ·
1 Parent(s): 548a084

Update upload_ingest.py

Browse files
Files changed (1) hide show
  1. upload_ingest.py +7 -118
upload_ingest.py CHANGED
@@ -1,120 +1,9 @@
1
  # upload_ingest.py
2
- import os
3
- import json
4
- import pandas as pd
5
- from typing import Dict, List, Any, Optional
6
- import logging
 
 
7
 
8
- logging.basicConfig(level=logging.INFO)
9
- logger = logging.getLogger(__name__)
10
-
11
def _summarize_dataframe(df, filename: str):
    """Profile a DataFrame and render it as text chunks.

    Shared by the CSV and Excel branches of extract_text_from_files,
    which previously duplicated this logic verbatim.

    Args:
        df: The loaded DataFrame.
        filename: Base name of the source file (used in the header chunk).

    Returns:
        (summary, text_chunks): summary is a profiling dict (shape,
        columns, dtypes, null counts, 3-row sample); text_chunks is a
        list of human-readable strings describing the table.
    """
    summary = {
        "shape": df.shape,
        "columns": list(df.columns),
        "data_types": df.dtypes.to_dict(),
        "null_counts": df.isnull().sum().to_dict(),
        "sample_data": df.head(3).to_dict(),
    }
    text_chunks = [
        # BUG FIX: was f"File: (unknown)" — an f-string with no
        # placeholder; the filename was clearly intended here.
        f"File: {filename}",
        f"Shape: {df.shape[0]} rows, {df.shape[1]} columns",
        "Columns: " + ", ".join(df.columns),
    ]
    # Render up to five sample rows as "col: val | col: val" lines.
    for i, row in df.head(5).iterrows():
        row_text = " | ".join(f"{col}: {val}" for col, val in row.items())
        text_chunks.append(f"Row {i+1}: {row_text}")
    return summary, text_chunks


def extract_text_from_files(file_paths: List[str]) -> Dict[str, Any]:
    """Enhanced file extraction with better healthcare data handling.

    Supported extensions: .csv, .json, .txt, .md, .xlsx, .xls; any other
    extension is silently ignored. Per-file failures are logged and
    skipped so one bad file does not abort the whole batch.

    Args:
        file_paths: Paths of the files to ingest.

    Returns:
        dict with keys:
            "chunks": list[str] — text chunks extracted from every file,
            "artifacts": list[dict] — per-file metadata records,
            "data_summary": dict — filename -> profiling summary
                (tabular files only).
    """
    all_chunks: List[str] = []
    artifacts: List[Dict[str, Any]] = []
    data_summary: Dict[str, Any] = {}

    for file_path in file_paths:
        try:
            file_ext = os.path.splitext(file_path)[1].lower()
            filename = os.path.basename(file_path)

            if file_ext == '.csv':
                df = pd.read_csv(file_path)
                summary, text_chunks = _summarize_dataframe(df, filename)
                data_summary[filename] = summary
                all_chunks.extend(text_chunks)
                artifacts.append({
                    "type": "csv",
                    "path": file_path,
                    "summary": summary,
                    "filename": filename,
                })

            elif file_ext == '.json':
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                # One chunk: the pretty-printed JSON document.
                all_chunks.append(json.dumps(data, indent=2))
                artifacts.append({
                    "type": "json",
                    "path": file_path,
                    "filename": filename,
                })

            elif file_ext in ('.txt', '.md'):
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                # Fixed-width 1000-character chunks, no overlap.
                all_chunks.extend(
                    content[i:i + 1000] for i in range(0, len(content), 1000)
                )
                artifacts.append({
                    "type": "text",
                    "path": file_path,
                    "filename": filename,
                })

            elif file_ext in ('.xlsx', '.xls'):
                df = pd.read_excel(file_path)
                summary, text_chunks = _summarize_dataframe(df, filename)
                data_summary[filename] = summary
                all_chunks.extend(text_chunks)
                artifacts.append({
                    "type": "excel",
                    "path": file_path,
                    "summary": summary,
                    "filename": filename,
                })

        except Exception as e:
            # Best-effort batch ingest: log and continue with the
            # remaining files (lazy %-style args per logging best practice).
            logger.error("Error processing %s: %s", file_path, e)

    return {
        "chunks": all_chunks,
        "artifacts": artifacts,
        "data_summary": data_summary,
    }
 
1
  # upload_ingest.py
2
def extract_text_from_files(files):
    """Read every ``.txt`` file in *files* and return their contents.

    Paths without a ``.txt`` suffix are skipped silently; a missing or
    unreadable ``.txt`` file raises (IOError/OSError propagates).

    Args:
        files: Iterable of filesystem path strings.

    Returns:
        dict with a single key ``"chunks"``: a list holding one string
        (the full file contents) per ``.txt`` file, in input order.
    """
    chunks = []
    for path in files:
        if path.endswith(".txt"):
            # Explicit encoding: the platform default is not guaranteed
            # to be UTF-8 (e.g. cp1252 on Windows).
            with open(path, "r", encoding="utf-8") as fh:
                chunks.append(fh.read())
    return {"chunks": chunks}
9