# Provenance (file-hosting page residue, preserved as comments so the module parses):
# rb1337's picture
# Upload 50 files
# 2cc7f91 verified
import pandas as pd
import sys
import os
def clean_url(url):
    """Strip a leading ``www.`` and guarantee an ``http(s)://`` scheme.

    The input is coerced to ``str`` and whitespace-trimmed first, so
    non-string cells (as read by pandas) are handled without raising.
    """
    cleaned = str(url).strip()
    # Drop the www. subdomain whether it follows a scheme or stands alone;
    # only the first matching prefix applies.
    for prefix, replacement in (
        ("https://www.", "https://"),
        ("http://www.", "http://"),
        ("www.", ""),
    ):
        if cleaned.startswith(prefix):
            cleaned = replacement + cleaned[len(prefix):]
            break
    # Default to plain http when no scheme is present.
    if not (cleaned.startswith("http://") or cleaned.startswith("https://")):
        cleaned = "http://" + cleaned
    return cleaned
def main():
    """CLI entry point: read a CSV, normalize its ``url`` column, write a copy.

    argv[1] — input CSV path (default: ``data/raw/top-1m.csv``)
    argv[2] — output CSV path (default: ``<input>_cleaned<ext>``)

    NOTE(review): assumes the input CSV has a header row with a ``url``
    column — confirm against the actual data files.
    """
    argv = sys.argv
    input_path = argv[1] if len(argv) > 1 else "data/raw/top-1m.csv"
    stem, suffix = os.path.splitext(input_path)
    default_output = f"{stem}_cleaned{suffix}"
    output_path = argv[2] if len(argv) > 2 else default_output

    print(f"Reading {input_path}...")
    frame = pd.read_csv(input_path)
    print(f"Loaded {len(frame):,} rows")

    print("Cleaning URLs...")
    frame["url"] = frame["url"].apply(clean_url)

    # Normalization can collapse distinct rows (e.g. a www./non-www pair
    # becomes one URL); keep only the first occurrence of each cleaned URL.
    rows_before = len(frame)
    frame.drop_duplicates(subset=["url"], keep="first", inplace=True)
    rows_after = len(frame)
    if rows_before != rows_after:
        print(f"Removed {rows_before - rows_after:,} duplicates after cleaning")

    frame.to_csv(output_path, index=False)
    print(f"Saved {len(frame):,} rows to {output_path}")
# Run the CLI only when executed as a script, so the module can be
# imported (e.g. to reuse clean_url) without side effects.
if __name__ == "__main__":
    main()