File size: 1,479 Bytes
2cc7f91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import pandas as pd
import sys
import os

def clean_url(url):
    """Normalize a URL: strip a leading "www." and ensure an http(s):// scheme.

    The value is coerced to ``str`` first (CSV cells may arrive as non-string
    dtypes from pandas) and surrounding whitespace is stripped.

    Prefix comparisons are case-insensitive — URL schemes and hostnames are
    case-insensitive per RFC 3986 — so inputs like "HTTPS://example.com" no
    longer receive a spurious second "http://" prefix. The remainder of the
    URL keeps its original casing.
    """
    url = str(url).strip()
    lowered = url.lower()

    # Remove www. (handles http://www., https://www., and bare www.)
    if lowered.startswith("https://www."):
        url = "https://" + url[len("https://www."):]
    elif lowered.startswith("http://www."):
        url = "http://" + url[len("http://www."):]
    elif lowered.startswith("www."):
        url = url[len("www."):]

    # Add http:// if no scheme present (re-check after www-stripping;
    # startswith accepts a tuple of alternatives).
    if not url.lower().startswith(("http://", "https://")):
        url = "http://" + url

    return url


def main():
    """CLI entry point: read a CSV, clean its ``url`` column, write the result.

    Usage: script.py [input_csv] [output_csv]
    Defaults: input "data/raw/top-1m.csv"; output is the input path with a
    "_cleaned" suffix inserted before the extension.

    NOTE(review): assumes the input CSV has a header row with a "url" column —
    confirm against the actual data file.
    """
    args = sys.argv[1:]
    input_path = args[0] if args else "data/raw/top-1m.csv"

    root, extension = os.path.splitext(input_path)
    output_path = args[1] if len(args) > 1 else f"{root}_cleaned{extension}"

    print(f"Reading {input_path}...")
    frame = pd.read_csv(input_path)
    print(f"Loaded {len(frame):,} rows")

    print("Cleaning URLs...")
    frame["url"] = frame["url"].apply(clean_url)

    # www-stripping can collapse distinct inputs onto the same URL,
    # so deduplicate afterwards (first occurrence wins).
    row_count = len(frame)
    frame = frame.drop_duplicates(subset=["url"], keep="first")
    dropped = row_count - len(frame)
    if dropped:
        print(f"Removed {dropped:,} duplicates after cleaning")

    frame.to_csv(output_path, index=False)
    print(f"Saved {len(frame):,} rows to {output_path}")


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()