Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| import sys | |
| import os | |
def _split_scheme(url):
    """Split *url* into (scheme, rest); scheme is 'http://', 'https://' or ''.

    The scheme is matched case-insensitively and returned in lowercase.
    """
    for scheme in ("https://", "http://"):
        if url[:len(scheme)].lower() == scheme:
            return scheme, url[len(scheme):]
    return "", url


def clean_url(url):
    """Drop a leading ``www.`` label and make sure the URL has an http(s) scheme.

    The value is converted with ``str()`` and stripped of surrounding
    whitespace first, so non-string input (for example a float) is cleaned
    as its string form.  The scheme and the ``www.`` label are matched
    case-insensitively, and a recognised scheme is returned in lowercase.
    Only the prefix is touched; the rest of the URL keeps its original case.

    Args:
        url: The URL (or bare domain) to normalise.

    Returns:
        The cleaned URL as a string, always starting with ``http://`` or
        ``https://``.  ``http://`` is used when the input has no scheme.
    """
    url = str(url).strip()

    # Remove "www." whether it follows a scheme or starts the string.
    scheme, rest = _split_scheme(url)
    if rest[:len("www.")].lower() == "www.":
        rest = rest[len("www."):]
    url = scheme + rest

    # Re-check the scheme on the stripped value: removing a bare "www." may
    # have exposed one.  Default to http:// when none is present.
    scheme, rest = _split_scheme(url)
    return (scheme or "http://") + rest
def main():
    """Clean the ``url`` column of a CSV file and save the result as a new CSV.

    Optional command-line arguments:
        1. Input CSV path (default: ``data/raw/top-1m.csv``).
        2. Output CSV path (default: the input path with ``_cleaned``
           inserted before its extension).

    Each value in the ``url`` column is passed through ``clean_url``.  Rows
    whose cleaned URL repeats an earlier row are then dropped, keeping the
    first occurrence.  Progress messages are printed to stdout.
    """
    cli_args = sys.argv[1:]

    if cli_args:
        input_path = cli_args[0]
    else:
        input_path = "data/raw/top-1m.csv"

    if len(cli_args) > 1:
        output_path = cli_args[1]
    else:
        stem, suffix = os.path.splitext(input_path)
        output_path = f"{stem}_cleaned{suffix}"

    print(f"Reading {input_path}...")
    frame = pd.read_csv(input_path)
    print(f"Loaded {len(frame):,} rows")

    print("Cleaning URLs...")
    frame["url"] = frame["url"].apply(clean_url)

    # Distinct raw URLs can become identical once "www." is stripped.
    rows_before = len(frame)
    frame = frame.drop_duplicates(subset=["url"], keep="first")
    removed = rows_before - len(frame)
    if removed:
        print(f"Removed {removed:,} duplicates after cleaning")

    frame.to_csv(output_path, index=False)
    print(f"Saved {len(frame):,} rows to {output_path}")
# Run the cleaning pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()