Rajan Sharma committed on
Commit
548a084
·
verified ·
1 Parent(s): 1134cbf

Update data_registry.py

Browse files
Files changed (1) hide show
  1. data_registry.py +12 -107
data_registry.py CHANGED
@@ -1,112 +1,17 @@
1
  # data_registry.py
2
  import pandas as pd
3
- import os
4
- from typing import Dict, List, Any, Optional, Union
5
- import logging
6
-
7
- logging.basicConfig(level=logging.INFO)
8
- logger = logging.getLogger(__name__)
9
 
10
  class DataRegistry:
11
  def __init__(self):
12
- self.data = {}
13
- self.metadata = {}
14
-
15
- def add_path(self, file_path: str) -> bool:
16
- """Add a file to the registry and return success status"""
17
- try:
18
- file_ext = os.path.splitext(file_path)[1].lower()
19
-
20
- if file_ext == '.csv':
21
- df = pd.read_csv(file_path)
22
- elif file_ext in ['.xlsx', '.xls']:
23
- df = pd.read_excel(file_path)
24
- elif file_ext == '.json':
25
- df = pd.read_json(file_path)
26
- else:
27
- logger.warning(f"Unsupported file type: {file_ext}")
28
- return False
29
-
30
- # Store with filename as key
31
- filename = os.path.basename(file_path)
32
- self.data[filename] = df
33
-
34
- # Store metadata
35
- self.metadata[filename] = {
36
- "path": file_path,
37
- "type": file_ext,
38
- "shape": df.shape,
39
- "columns": list(df.columns),
40
- "data_types": df.dtypes.to_dict(),
41
- "null_counts": df.isnull().sum().to_dict(),
42
- "sample_data": df.head(3).to_dict()
43
- }
44
-
45
- logger.info(f"Successfully loaded {filename} with shape {df.shape}")
46
- return True
47
-
48
- except Exception as e:
49
- logger.error(f"Error loading {file_path}: {str(e)}")
50
- return False
51
-
52
- def get(self, name: str) -> Optional[pd.DataFrame]:
53
- """Get a dataset by name"""
54
- return self.data.get(name)
55
-
56
- def names(self) -> List[str]:
57
- """Get all dataset names"""
58
- return list(self.data.keys())
59
-
60
- def get_data_by_type(self, data_type: str) -> List[str]:
61
- """Get datasets matching a type pattern"""
62
- matching = []
63
- for name, meta in self.metadata.items():
64
- if data_type.lower() in name.lower():
65
- matching.append(name)
66
- return matching
67
-
68
- def get_data_summary(self) -> Dict[str, Any]:
69
- """Generate a summary of all loaded datasets"""
70
- return self.metadata
71
-
72
- def find_related_datasets(self, keywords: List[str]) -> List[Dict[str, Any]]:
73
- """Find datasets containing specific keywords in columns or data"""
74
- related = []
75
- for name in self.names():
76
- df = self.get(name)
77
- if df is None:
78
- continue
79
-
80
- # Check column names
81
- col_matches = [col for col in df.columns if any(kw in col.lower() for kw in keywords)]
82
-
83
- # Check data content
84
- data_matches = False
85
- for col in df.select_dtypes(include=['object']).columns:
86
- try:
87
- # Create a boolean mask for rows containing any keyword
88
- # This is the generic approach that works for any keywords
89
- pattern = '|'.join(keywords)
90
- mask = df[col].str.contains(pattern, case=False, na=False)
91
-
92
- # Check if any match exists (this returns a single boolean)
93
- if mask.any():
94
- data_matches = True
95
- break
96
- except Exception as e:
97
- # If there's an error with this column, skip it
98
- logger.debug(f"Error checking column {col} for keywords: {str(e)}")
99
- continue
100
-
101
- if col_matches or data_matches:
102
- related.append({
103
- "name": name,
104
- "matching_columns": col_matches,
105
- "has_matching_data": data_matches
106
- })
107
- return related
108
-
109
- def clear(self):
110
- """Clear all data"""
111
- self.data.clear()
112
- self.metadata.clear()
 
1
  # data_registry.py
2
  import pandas as pd
 
 
 
 
 
 
3
 
4
  class DataRegistry:
5
  def __init__(self):
6
+ self._data={}
7
+
8
+ def add_path(self, path: str):
9
+ if path.endswith(".csv"):
10
+ self._data[path]=pd.read_csv(path)
11
+ # future: add PDF/TXT/MD parsing
12
+
13
+ def get(self, name: str):
14
+ return self._data.get(name)
15
+
16
+ def names(self):
17
+ return list(self._data.keys())