Rajan Sharma committed on
Commit
aff5a07
·
verified ·
1 Parent(s): 1ca7039

Create data_registry.py

Browse files
Files changed (1) hide show
  1. data_registry.py +89 -0
data_registry.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import os
3
+ from dataclasses import dataclass, field
4
+ from typing import Dict, Any, List, Optional
5
+ import pandas as pd
6
+
7
def _safe_read(path: str) -> Optional[pd.DataFrame]:
    """Read a CSV or Excel file into a DataFrame, returning ``None`` on failure.

    The reader is picked from the file extension, compared
    case-insensitively: ``.csv`` goes through ``pd.read_csv`` and
    ``.xlsx`` / ``.xls`` go through ``pd.read_excel``.

    Args:
        path: Location of the file to load. A falsy value means "no file".

    Returns:
        The loaded DataFrame, or ``None`` when ``path`` is falsy, the
        extension is not one of the supported ones, or the pandas reader
        raises.
    """
    if not path:
        return None

    # str() lets os.PathLike arguments through; calling .lower() on them
    # directly would raise before the try block could catch anything.
    name = str(path).lower()
    try:
        if name.endswith(".csv"):
            return pd.read_csv(path, low_memory=False)
        if name.endswith((".xlsx", ".xls")):
            return pd.read_excel(path)
    except Exception:
        # Deliberate best-effort: any read/parse failure is reported to the
        # caller as "no table" rather than propagated.
        return None
    return None
17
+
18
def _dtype_of_series(s: pd.Series) -> str:
    """Map a Series' pandas dtype to a coarse type label.

    The checks run in a fixed order and the first match wins.

    Args:
        s: The Series whose dtype is classified.

    Returns:
        One of ``"int"``, ``"float"``, ``"bool"`` or ``"datetime"``;
        every other dtype is reported as ``"string"``.
    """
    if pd.api.types.is_integer_dtype(s):
        return "int"
    if pd.api.types.is_float_dtype(s):
        return "float"
    if pd.api.types.is_bool_dtype(s):
        return "bool"
    if pd.api.types.is_datetime64_any_dtype(s):
        return "datetime"
    return "string"
24
+
25
def _profile_df(df: pd.DataFrame, max_examples: int = 3) -> Dict[str, Any]:
    """Build a small summary of a DataFrame's shape and columns.

    Args:
        df: The table to profile.
        max_examples: Upper bound on the number of example values kept
            per column.

    Returns:
        A dict with ``"n_rows"``, ``"n_cols"`` and ``"columns"``. Each entry
        of ``"columns"`` is a dict holding the column ``"name"`` (as a
        string), its coarse ``"dtype"`` label, ``"n_non_null"``,
        ``"n_unique"`` (nulls excluded) and ``"examples"``: the first
        non-null values rendered as strings.
    """
    cols = []
    for pos, label in enumerate(df.columns):
        # Positional access always yields a Series. df[label] would return a
        # DataFrame for a duplicated label and break the scalar counts below.
        s = df.iloc[:, pos]
        # Trim to the first few values before stringifying so the whole
        # column is not converted just to keep a handful of examples.
        examples = s.dropna().head(max_examples).astype(str).tolist()
        cols.append({
            "name": str(label),
            "dtype": _dtype_of_series(s),
            "n_non_null": int(s.notna().sum()),
            "n_unique": int(s.nunique(dropna=True)),
            "examples": examples,
        })
    return {"n_rows": int(len(df)), "n_cols": int(df.shape[1]), "columns": cols}
39
+
40
@dataclass
class TableEntry:
    """One registered table: its registry key, source path, data and profile."""

    # Key under which the table is stored in the registry.
    name: str
    # Path the table was loaded from.
    path: str
    # The loaded table itself.
    df: pd.DataFrame
    # Summary of the table; defaults to a fresh empty dict per instance.
    profile: Dict[str, Any] = field(default_factory=dict)
46
+
47
class DataRegistry:
    """In-memory collection of loaded tables, keyed by a unique name."""

    def __init__(self):
        self._tables: Dict[str, TableEntry] = {}

    def clear(self) -> None:
        """Remove every registered table."""
        self._tables.clear()

    def add_path(self, path: str) -> Optional[str]:
        """Load the file at ``path`` and register it.

        The key is the file's base name. If that name is already taken,
        ``" (2)"``, ``" (3)"``, ... is appended until the key is unique.

        Args:
            path: Location of the file to load.

        Returns:
            The key the table was stored under, or ``None`` when ``path`` is
            empty, does not exist, or could not be read.
        """
        if not path or not os.path.exists(path):
            return None
        df = _safe_read(path)
        if df is None:
            return None

        base = os.path.basename(path)
        key = base
        suffix = 2
        while key in self._tables:
            key = f"{base} ({suffix})"
            suffix += 1

        self._tables[key] = TableEntry(
            name=key, path=path, df=df, profile=_profile_df(df)
        )
        return key

    def names(self) -> List[str]:
        """Return the registered table names in insertion order."""
        return list(self._tables)

    def get(self, name: str) -> Optional[pd.DataFrame]:
        """Return the DataFrame stored under ``name``, or ``None`` if absent."""
        entry = self._tables.get(name)
        return entry.df if entry is not None else None

    def get_profile(self, name: str) -> Dict[str, Any]:
        """Return the profile stored under ``name``, or ``{}`` if absent."""
        entry = self._tables.get(name)
        return entry.profile if entry is not None else {}

    def iter_tables(self) -> List[TableEntry]:
        """Return a list of all registered entries."""
        return list(self._tables.values())

    def summarize_for_prompt(self, col_cap: int = 600) -> str:
        """Render one ``- name: rows=N cols=[...]`` line per table.

        Args:
            col_cap: Maximum length of the comma-joined column list; longer
                lists are cut at this many characters and suffixed with "…".

        Returns:
            The lines joined by newlines, or ``"- <none>"`` when the
            registry is empty.
        """
        lines = []
        for t in self.iter_tables():
            cols = ", ".join(c["name"] for c in t.profile.get("columns", []))
            if len(cols) > col_cap:
                cols = cols[:col_cap] + "…"
            lines.append(f"- {t.name}: rows={t.profile.get('n_rows', 0)} cols=[{cols}]")
        return "\n".join(lines) if lines else "- <none>"