mshauri-fedha / src /load /explore_news_schema.py
teofizzy's picture
Removed artifacts
c0adb25
import pandas as pd
import glob
import os
import logging
import sys
# Dedicated, named logger used by everything in this module.
logger = logging.getLogger("SchemaExplorer")

# Send bare messages (no level/timestamp prefix) to stdout so they appear in
# notebook cell output. force=True makes this call take effect even when the
# root logger has already been configured earlier in the session.
logging.basicConfig(
    stream=sys.stdout,
    format='%(message)s',
    level=logging.INFO,
    force=True,
)
def analyze_schemas(news_dir: str):
    """
    Scan the ``*.csv`` files directly inside ``news_dir`` and log a schema report.

    Files are grouped by their set of column names (column order is ignored).
    For each distinct column set the report lists the columns, how many files
    share them, and a few example file names. Afterwards, up to five values of
    the first column whose name contains 'date', 'time' or 'published' are
    logged from the first file whose header could be read.

    All output goes through the module-level ``logger``; nothing is returned.

    Args:
        news_dir: Path of the directory holding the CSV files.

    Returns:
        None. If ``news_dir`` is not a directory or contains no CSV files,
        a message is logged and the function returns early.
    """
    if not os.path.isdir(news_dir):
        logger.error(f" Directory not found: {news_dir}")
        return

    # Escape the directory part so glob metacharacters in the path ('[', '*',
    # '?') are matched literally. Sort because glob yields files in arbitrary
    # order, which would make the report differ between runs.
    pattern = os.path.join(glob.escape(news_dir), "*.csv")
    csv_files = sorted(glob.glob(pattern))
    logger.info(f"Scanning {len(csv_files)} files in '{news_dir}'...\n")
    if not csv_files:
        logger.warning(" No CSV files found.")
        return

    # Unique schemas: { (col1, col2, ...): [file1, file2, ...] }
    schemas = {}
    # Files whose header parsed successfully, in scan order.
    readable_files = []
    for path in csv_files:
        try:
            # nrows=0 parses only the header line, which keeps the scan fast.
            header = pd.read_csv(path, nrows=0)
            # Sort the names so column order does not split otherwise
            # identical schemas into separate groups.
            cols = tuple(sorted(header.columns.tolist()))
        except Exception as e:
            # Best effort: report the unreadable file and keep scanning.
            logger.error(f" Error reading {os.path.basename(path)}: {e}")
            continue
        schemas.setdefault(cols, []).append(os.path.basename(path))
        readable_files.append(path)

    # Report findings
    logger.info("--- Schema Report ---")
    for i, (cols, files) in enumerate(schemas.items(), start=1):
        logger.info(f"\nTYPE {i}: Found in {len(files)} files")
        logger.info(f"Columns: {list(cols)}")
        if len(files) < 5:
            logger.info(f"Examples: {files}")
        else:
            logger.info(f"Examples: {files[:3]} ... (+{len(files)-3} others)")

    # Date format check, sampled from the first file that was actually readable
    logger.info("\n--- Date Format Sample ---")
    if not readable_files:
        logger.warning("No readable CSV file available for date check.")
        return

    sample_file = readable_files[0]
    date_hints = ("date", "time", "published")
    try:
        sample = pd.read_csv(sample_file, nrows=5)
        # First column whose lower-cased name contains one of the hints.
        date_col = next(
            (c for c in sample.columns
             if any(hint in c.lower() for hint in date_hints)),
            None,
        )
        if date_col is not None:
            logger.info(f"Sample from column '{date_col}' in {os.path.basename(sample_file)}:")
            logger.info(sample[date_col].head().tolist())
        else:
            logger.warning("No obvious 'date' column found in sample.")
    except Exception as e:
        logger.error(f"Could not read sample for date check: {e}")