# Provenance (file-hosting page residue, preserved as comments so the module parses):
# rb1337's picture
# Upload 50 files
# 2cc7f91 verified
import pandas as pd
import sys
import os
def clean_url(url):
    """Strip a leading ``www.`` and guarantee an ``http(s)://`` scheme.

    The input is coerced to ``str`` and whitespace-trimmed first, so
    non-string cells (as read by pandas) are handled without raising.
    """
    cleaned = str(url).strip()
    # Drop the www. subdomain whether it follows a scheme or stands alone;
    # only the first matching prefix applies.
    for prefix, replacement in (
        ("https://www.", "https://"),
        ("http://www.", "http://"),
        ("www.", ""),
    ):
        if cleaned.startswith(prefix):
            cleaned = replacement + cleaned[len(prefix):]
            break
    # Default to plain http when no scheme is present.
    if not (cleaned.startswith("http://") or cleaned.startswith("https://")):
        cleaned = "http://" + cleaned
    return cleaned
def main():
    """CLI entry point: read a CSV, normalize its ``url`` column, write a copy.

    argv[1] — input CSV path (default: ``data/raw/top-1m.csv``)
    argv[2] — output CSV path (default: ``<input>_cleaned<ext>``)

    NOTE(review): assumes the input CSV has a header row with a ``url``
    column — confirm against the actual data files.
    """
    argv = sys.argv
    input_path = argv[1] if len(argv) > 1 else "data/raw/top-1m.csv"
    stem, suffix = os.path.splitext(input_path)
    default_output = f"{stem}_cleaned{suffix}"
    output_path = argv[2] if len(argv) > 2 else default_output

    print(f"Reading {input_path}...")
    frame = pd.read_csv(input_path)
    print(f"Loaded {len(frame):,} rows")

    print("Cleaning URLs...")
    frame["url"] = frame["url"].apply(clean_url)

    # Normalization can collapse distinct rows (e.g. a www./non-www pair
    # becomes one URL); keep only the first occurrence of each cleaned URL.
    rows_before = len(frame)
    frame.drop_duplicates(subset=["url"], keep="first", inplace=True)
    rows_after = len(frame)
    if rows_before != rows_after:
        print(f"Removed {rows_before - rows_after:,} duplicates after cleaning")

    frame.to_csv(output_path, index=False)
    print(f"Saved {len(frame):,} rows to {output_path}")
# Run the CLI only when executed as a script, so the module can be
# imported (e.g. to reuse clean_url) without side effects.
if __name__ == "__main__":
    main()