Spaces:
Running
Running
File size: 1,619 Bytes
"""Base class for data collectors."""
import logging
from abc import ABC, abstractmethod
from pathlib import Path
import pandas as pd
from .config import CollectionConfig
class BaseCollector(ABC):
"""Abstract base class for all data collectors."""
def __init__(self, config: CollectionConfig):
self.config = config
self.logger = logging.getLogger(self.__class__.__name__)
@property
@abstractmethod
def name(self) -> str:
"""Return the name of the data source."""
pass
@property
@abstractmethod
def output_dir(self) -> Path:
"""Return the output directory for this collector."""
pass
@abstractmethod
def collect(self) -> pd.DataFrame:
"""Run the collection pipeline and return metadata DataFrame."""
pass
def setup_logging(self, log_file: Path | None = None) -> None:
"""Configure logging for this collector."""
handlers = [logging.StreamHandler()]
if log_file:
log_file.parent.mkdir(parents=True, exist_ok=True)
handlers.append(logging.FileHandler(log_file))
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
handlers=handlers,
)
def log_summary(self, df: pd.DataFrame) -> None:
"""Log a summary of collected data."""
self.logger.info(f"Collection complete for {self.name}")
self.logger.info(f"Total records: {len(df)}")
if not df.empty:
self.logger.info(f"Columns: {', '.join(df.columns[:10])}")
|