Spaces:
Running on Zero
Running on Zero
| """ | |
| Load various data formats and convert to SQLite databases. | |
| Supports CSV, Excel, JSON file formats and converts them | |
| into in-memory SQLite databases for SQL querying. | |
| """ | |
| import sqlite3 | |
| import logging | |
| from typing import Dict, Any, Optional | |
| from pathlib import Path | |
| import json | |
| logger = logging.getLogger(__name__) | |
| class DataLoader: | |
| """Load data from various formats into SQLite database.""" | |
| def __init__(self) -> None: | |
| """Initialize data loader.""" | |
| pass | |
| def load_csv( | |
| self, | |
| file_path: str | Path, | |
| table_name: Optional[str] = None, | |
| in_memory: bool = True, | |
| ) -> str: | |
| """ | |
| Load CSV file into SQLite database. | |
| Args: | |
| file_path: Path to CSV file | |
| table_name: Name for the table (default: filename without extension) | |
| in_memory: Whether to use in-memory database | |
| Returns: | |
| Path to database file (or ':memory:' for in-memory) | |
| """ | |
| try: | |
| import pandas as pd | |
| file_path = Path(file_path) | |
| if not file_path.exists(): | |
| raise FileNotFoundError(f"CSV file not found: {file_path}") | |
| logger.info(f"Loading CSV: {file_path}") | |
| df = pd.read_csv(file_path) | |
| table_name = table_name or file_path.stem | |
| db_path = self._create_database(df, table_name, in_memory) | |
| logger.info(f"CSV loaded to {db_path}, table: {table_name}") | |
| return db_path | |
| except ImportError: | |
| logger.error("pandas not installed. Install with: pip install pandas") | |
| raise | |
| except Exception as e: | |
| logger.error(f"Error loading CSV: {e}") | |
| raise | |
| def load_excel( | |
| self, | |
| file_path: str | Path, | |
| sheet_name: str = 0, | |
| table_name: Optional[str] = None, | |
| in_memory: bool = True, | |
| ) -> str: | |
| """ | |
| Load Excel file into SQLite database. | |
| Args: | |
| file_path: Path to Excel file | |
| sheet_name: Sheet to load | |
| table_name: Name for the table | |
| in_memory: Whether to use in-memory database | |
| Returns: | |
| Path to database file | |
| """ | |
| try: | |
| import pandas as pd | |
| file_path = Path(file_path) | |
| if not file_path.exists(): | |
| raise FileNotFoundError(f"Excel file not found: {file_path}") | |
| logger.info(f"Loading Excel: {file_path}, sheet: {sheet_name}") | |
| df = pd.read_excel(file_path, sheet_name=sheet_name) | |
| table_name = table_name or f"table_{sheet_name}" | |
| db_path = self._create_database(df, table_name, in_memory) | |
| logger.info(f"Excel loaded to {db_path}, table: {table_name}") | |
| return db_path | |
| except ImportError: | |
| logger.error("openpyxl/pandas not installed") | |
| raise | |
| except Exception as e: | |
| logger.error(f"Error loading Excel: {e}") | |
| raise | |
| def load_json( | |
| self, | |
| file_path: str | Path, | |
| table_name: Optional[str] = None, | |
| in_memory: bool = True, | |
| ) -> str: | |
| """ | |
| Load JSON file into SQLite database. | |
| Args: | |
| file_path: Path to JSON file | |
| table_name: Name for the table | |
| in_memory: Whether to use in-memory database | |
| Returns: | |
| Path to database file | |
| """ | |
| try: | |
| import pandas as pd | |
| file_path = Path(file_path) | |
| if not file_path.exists(): | |
| raise FileNotFoundError(f"JSON file not found: {file_path}") | |
| logger.info(f"Loading JSON: {file_path}") | |
| with open(file_path) as f: | |
| data = json.load(f) | |
| # Handle both list of records and single record | |
| if isinstance(data, dict): | |
| data = [data] | |
| df = pd.DataFrame(data) | |
| table_name = table_name or file_path.stem | |
| db_path = self._create_database(df, table_name, in_memory) | |
| logger.info(f"JSON loaded to {db_path}, table: {table_name}") | |
| return db_path | |
| except ImportError: | |
| logger.error("pandas not installed") | |
| raise | |
| except Exception as e: | |
| logger.error(f"Error loading JSON: {e}") | |
| raise | |
| def _create_database( | |
| self, | |
| df, | |
| table_name: str, | |
| in_memory: bool = True, | |
| ) -> str: | |
| """ | |
| Create SQLite database from DataFrame. | |
| Args: | |
| df: Pandas DataFrame | |
| table_name: Name for the table | |
| in_memory: Whether to use in-memory database | |
| Returns: | |
| Path to database | |
| """ | |
| db_path = ":memory:" if in_memory else f"{table_name}.db" | |
| try: | |
| conn = sqlite3.connect(db_path) | |
| df.to_sql(table_name, conn, if_exists="replace", index=False) | |
| conn.commit() | |
| conn.close() | |
| logger.info(f"Created database with table: {table_name}") | |
| return db_path | |
| except Exception as e: | |
| logger.error(f"Error creating database: {e}") | |
| raise | |
| def load_dict_list( | |
| self, | |
| data: list[Dict[str, Any]], | |
| table_name: str = "data", | |
| in_memory: bool = True, | |
| ) -> str: | |
| """ | |
| Load list of dictionaries into SQLite database. | |
| Args: | |
| data: List of dictionaries | |
| table_name: Name for the table | |
| in_memory: Whether to use in-memory database | |
| Returns: | |
| Path to database | |
| """ | |
| try: | |
| import pandas as pd | |
| if not data: | |
| raise ValueError("Data list is empty") | |
| logger.info(f"Loading {len(data)} records into table: {table_name}") | |
| df = pd.DataFrame(data) | |
| db_path = self._create_database(df, table_name, in_memory) | |
| return db_path | |
| except Exception as e: | |
| logger.error(f"Error loading data: {e}") | |
| raise | |