import pandas as pd
import sys
import os


def clean_url(url):
    """Normalize a URL string.

    Strips surrounding whitespace, removes a leading ``www.`` (whether bare or
    following an ``http://`` / ``https://`` scheme), and prepends ``http://``
    when no scheme is present.

    Args:
        url: The value to clean; coerced with ``str()`` so non-string cells
            (e.g. NaN from pandas) do not raise.

    Returns:
        The cleaned URL, always carrying an ``http://`` or ``https://`` scheme.

    Note (review): matching is case-sensitive — ``WWW.`` or ``HTTP://`` pass
    through untouched; presumably input data is lowercase. TODO confirm.
    """
    url = str(url).strip()

    # Drop "www." when it follows a scheme; the loop keeps the scheme intact.
    for prefix in ("https://www.", "http://www."):
        if url.startswith(prefix):
            scheme = prefix[: -len("www.")]
            url = scheme + url[len(prefix):]
            break
    else:
        # Bare "www.example.com" — remove the prefix; the scheme is added below.
        if url.startswith("www."):
            url = url[len("www."):]

    # Default to http:// when no scheme is present (tuple startswith: one call).
    if not url.startswith(("http://", "https://")):
        url = "http://" + url
    return url


def main():
    """CLI entry point: clean the ``url`` column of a CSV and write the result.

    Usage: ``script.py [input_csv] [output_csv]``

    Defaults: input ``data/raw/top-1m.csv``; output is the input path with
    ``_cleaned`` inserted before the extension. The input CSV must contain a
    ``url`` column.
    """
    input_path = sys.argv[1] if len(sys.argv) > 1 else "data/raw/top-1m.csv"
    base, ext = os.path.splitext(input_path)
    output_path = sys.argv[2] if len(sys.argv) > 2 else f"{base}_cleaned{ext}"

    print(f"Reading {input_path}...")
    df = pd.read_csv(input_path)
    print(f"Loaded {len(df):,} rows")

    print("Cleaning URLs...")
    df["url"] = df["url"].apply(clean_url)

    # Cleaning can collapse distinct rows into one (e.g. "www.x.com" and
    # "x.com" both become "http://x.com"); keep the first occurrence so the
    # original row order (and any rank it implies) is preserved.
    before = len(df)
    df.drop_duplicates(subset=["url"], keep="first", inplace=True)
    after = len(df)
    if before != after:
        print(f"Removed {before - after:,} duplicates after cleaning")

    df.to_csv(output_path, index=False)
    print(f"Saved {len(df):,} rows to {output_path}")


if __name__ == "__main__":
    main()