import glob
import logging
import os
import sys

import pandas as pd

# Configure logging to show up in the notebook.
# force=True replaces any handlers already attached to the root logger, so
# re-running this cell/module does not produce duplicated output lines.
logging.basicConfig(
    level=logging.INFO,
    format='%(message)s',
    stream=sys.stdout,
    force=True,
)
logger = logging.getLogger("SchemaExplorer")


def analyze_schemas(news_dir: str) -> None:
    """Group the CSV files in *news_dir* by column set and log a report.

    Only the header row of each ``*.csv`` file directly inside ``news_dir``
    is read. Files are grouped by their sorted column names, so two files
    with the same columns in a different order land in the same group.
    After the schema report, up to five values of the first column whose
    name contains ``date``, ``time`` or ``published`` (case-insensitive) are
    logged from the first file whose header could be read.

    Everything is reported through the ``SchemaExplorer`` logger; per-file
    read errors are logged and do not stop the scan.

    Args:
        news_dir: Path of the directory to scan (not recursive).

    Returns:
        None. The function returns early, after logging, when the path does
        not exist or contains no ``*.csv`` files.
    """
    if not os.path.exists(news_dir):
        logger.error(" Directory not found: %s", news_dir)
        return

    # glob returns paths in arbitrary order; sort so that TYPE numbering and
    # the example file lists are stable from run to run.
    csv_files = sorted(glob.glob(os.path.join(news_dir, "*.csv")))
    logger.info("Scanning %d files in '%s'...\n", len(csv_files), news_dir)

    if not csv_files:
        logger.warning(" No CSV files found.")
        return

    # Unique schemas: {(col1, col2, ...): [file1, file2, ...]}
    schemas: dict[tuple[str, ...], list[str]] = {}
    # Paths whose header parsed successfully, in scan order.
    readable_files: list[str] = []

    for path in csv_files:
        try:
            # nrows=0 parses only the header row (fast).
            header = pd.read_csv(path, nrows=0)
            # Sort columns so that column order does not matter for grouping.
            cols = tuple(sorted(header.columns.tolist()))
        except Exception as e:
            # Best-effort scan: report the bad file and keep going.
            logger.error(" Error reading %s: %s", os.path.basename(path), e)
            continue
        schemas.setdefault(cols, []).append(os.path.basename(path))
        readable_files.append(path)

    # Report findings
    logger.info("--- Schema Report ---")
    for index, (cols, files) in enumerate(schemas.items(), start=1):
        logger.info("\nTYPE %d: Found in %d files", index, len(files))
        logger.info("Columns: %s", list(cols))
        if len(files) < 5:
            logger.info("Examples: %s", files)
        else:
            logger.info(
                "Examples: %s ... (+%d others)", files[:3], len(files) - 3
            )

    # Date format check (sample from the first valid file)
    logger.info("\n--- Date Format Sample ---")
    if not readable_files:
        logger.error(
            "Could not read sample for date check: no readable CSV files"
        )
        return

    # Use the first file whose header parsed rather than csv_files[0], which
    # may be the very file that failed to load above.
    sample_file = readable_files[0]
    try:
        sample = pd.read_csv(sample_file, nrows=5)
    except Exception as e:
        logger.error("Could not read sample for date check: %s", e)
        return

    # Look for a column whose name hints at a timestamp.
    keywords = ("date", "time", "published")
    date_col = next(
        (
            name
            for name in sample.columns
            if any(key in name.lower() for key in keywords)
        ),
        None,
    )
    if date_col is None:
        logger.warning("No obvious 'date' column found in sample.")
        return

    logger.info(
        "Sample from column '%s' in %s:",
        date_col,
        os.path.basename(sample_file),
    )
    logger.info("%s", sample[date_col].head().tolist())