RoyAalekh commited on
Commit
a8c9d4c
·
1 Parent(s): 51c0ba4

Convert to DuckDB for efficient data storage and access

Browse files

- Converted 389MB CSV files to 100MB DuckDB (74% compression)
- Added DuckDB dependency to pyproject.toml
- Modified EDA pipeline to load data directly from DuckDB
- Updated .gitignore to track DuckDB file
- Faster loading: DuckDB avoids the per-run CSV parsing overhead
- Git-friendly: Single binary file under 100MB limit
- Better performance for RL parameter extraction

Files changed (4) hide show
  1. .gitignore +1 -0
  2. pyproject.toml +2 -1
  3. src/eda_config.py +1 -2
  4. src/eda_load_clean.py +18 -15
.gitignore CHANGED
@@ -29,3 +29,4 @@ Data/test_verification/
29
  # Keep essential data
30
  !Data/README.md
31
  !pyproject.toml
 
 
29
  # Keep essential data
30
  !Data/README.md
31
  !pyproject.toml
32
+ !Data/court_data.duckdb
pyproject.toml CHANGED
@@ -20,7 +20,8 @@ dependencies = [
20
  "scipy>=1.14",
21
  "scikit-learn>=1.5",
22
  "streamlit>=1.28",
23
- "altair>=5.0"
 
24
  ]
25
 
26
  [project.optional-dependencies]
 
20
  "scipy>=1.14",
21
  "scikit-learn>=1.5",
22
  "streamlit>=1.28",
23
+ "altair>=5.0",
24
+ "duckdb>=1.4.2",
25
  ]
26
 
27
  [project.optional-dependencies]
src/eda_config.py CHANGED
@@ -9,8 +9,7 @@ from pathlib import Path
9
  # Paths and versioning
10
  # -------------------------------------------------------------------
11
  DATA_DIR = Path("Data")
12
- CASES_FILE = DATA_DIR / "ISDMHack_Cases_WPfinal.csv"
13
- HEAR_FILE = DATA_DIR / "ISDMHack_Hear.csv"
14
 
15
  REPORTS_DIR = Path("reports")
16
  FIGURES_DIR = REPORTS_DIR / "figures"
 
9
  # Paths and versioning
10
  # -------------------------------------------------------------------
11
  DATA_DIR = Path("Data")
12
+ DUCKDB_FILE = DATA_DIR / "court_data.duckdb"
 
13
 
14
  REPORTS_DIR = Path("reports")
15
  FIGURES_DIR = REPORTS_DIR / "figures"
src/eda_load_clean.py CHANGED
@@ -11,10 +11,10 @@ Responsibilities:
11
  from datetime import timedelta
12
 
13
  import polars as pl
 
14
  from src.eda_config import (
15
  CASES_CLEAN_PARQUET,
16
- CASES_FILE,
17
- HEAR_FILE,
18
  HEARINGS_CLEAN_PARQUET,
19
  NULL_TOKENS,
20
  RUN_TS,
@@ -56,19 +56,22 @@ def _null_summary(df: pl.DataFrame, name: str) -> None:
56
  # Main logic
57
  # -------------------------------------------------------------------
58
  def load_raw() -> tuple[pl.DataFrame, pl.DataFrame]:
59
- print("Loading raw data with Polars...")
60
- cases = pl.read_csv(
61
- CASES_FILE,
62
- try_parse_dates=True,
63
- null_values=NULL_TOKENS,
64
- infer_schema_length=100_000,
65
- )
66
- hearings = pl.read_csv(
67
- HEAR_FILE,
68
- try_parse_dates=True,
69
- null_values=NULL_TOKENS,
70
- infer_schema_length=100_000,
71
- )
 
 
 
72
  print(f"Cases shape: {cases.shape}")
73
  print(f"Hearings shape: {hearings.shape}")
74
  return cases, hearings
 
11
  from datetime import timedelta
12
 
13
  import polars as pl
14
+ import duckdb
15
  from src.eda_config import (
16
  CASES_CLEAN_PARQUET,
17
+ DUCKDB_FILE,
 
18
  HEARINGS_CLEAN_PARQUET,
19
  NULL_TOKENS,
20
  RUN_TS,
 
56
  # Main logic
57
  # -------------------------------------------------------------------
58
  def load_raw() -> tuple[pl.DataFrame, pl.DataFrame]:
59
+ print(f"Loading raw data from DuckDB: {DUCKDB_FILE}")
60
+
61
+ if not DUCKDB_FILE.exists():
62
+ raise FileNotFoundError(f"DuckDB file not found: {DUCKDB_FILE}")
63
+
64
+ # Connect to DuckDB and load data
65
+ conn = duckdb.connect(str(DUCKDB_FILE))
66
+
67
+ # Load cases as Polars DataFrame
68
+ cases = pl.from_pandas(conn.execute("SELECT * FROM cases").df())
69
+
70
+ # Load hearings as Polars DataFrame
71
+ hearings = pl.from_pandas(conn.execute("SELECT * FROM hearings").df())
72
+
73
+ conn.close()
74
+
75
  print(f"Cases shape: {cases.shape}")
76
  print(f"Hearings shape: {hearings.shape}")
77
  return cases, hearings