"""Print a summary of the label distribution in the cleaned URL dataset.

Run as a script. The report contains the total row count, the count and
percentage of every value in the ``label`` column, the ratio of label 0 to
label 1, and the number of missing labels (when there are any).
"""

import os
import sys

import pandas as pd

# Path to the dataset; a relative path is resolved against the current
# working directory, not against the location of this script.
dataset_path = 'data/processed/url_dataset_cleaned.csv'

_WIDE_RULE = "=" * 50
_THIN_RULE = "-" * 50


def _format_ratio(label_counts: pd.Series) -> str:
    """Return the label 0 / label 1 ratio as text.

    Args:
        label_counts: Row counts indexed by label value.

    Returns:
        The ratio formatted with two decimals, or an ``N/A`` note when there
        are no label-1 rows (a ratio would be undefined, and dividing by a
        stand-in value of 1 would report the label-0 count as if it were a
        ratio).
    """
    zeros = label_counts.get(0, 0)
    ones = label_counts.get(1, 0)
    if ones == 0:
        return "N/A (žiadne záznamy s labelom 1)"
    return f"{zeros / ones:.2f}"


def _build_report(df: pd.DataFrame) -> str:
    """Build the full text report for a frame that has a ``label`` column.

    Args:
        df: The loaded dataset.

    Returns:
        The report as a single newline-joined string, ready to print.
    """
    total = len(df)
    label_counts = df['label'].value_counts().sort_index()

    lines = [
        _WIDE_RULE,
        "ANALÝZA DATASETU",
        _WIDE_RULE,
        f"\nCelkový počet záznamov: {total}",
        "\nRozdělenie labelov:",
        _THIN_RULE,
    ]
    # value_counts() yields no rows for an empty frame, so this division
    # never runs with total == 0.
    lines.extend(
        f"Label {label}: {count} záznamov ({(count / total) * 100:.2f}%)"
        for label, count in label_counts.items()
    )
    lines.append(_THIN_RULE)
    lines.append(f"\nPomer label 0 / label 1: {_format_ratio(label_counts)}")

    # Check for missing values (value_counts() above ignores them).
    missing = df['label'].isna().sum()
    if missing > 0:
        lines.append(f"\nChýbajúce labely: {missing}")

    lines.append("\n" + _WIDE_RULE)
    return "\n".join(lines)


def main(path: str = dataset_path) -> None:
    """Load the dataset at *path* and print the label analysis to stdout.

    Problems are reported as printed messages rather than raised: a missing
    file, a missing ``label`` column, and any other error each get their own
    message.

    Args:
        path: CSV file to analyse; defaults to ``dataset_path``.
    """
    try:
        # Load the dataset
        df = pd.read_csv(path)

        # Validate the column up front so that an unrelated KeyError is not
        # misreported as a missing column, and no partial report is printed.
        if 'label' not in df.columns:
            print("Stĺpec 'label' neexistuje v datasete")
            print(f"Dostupné stĺpce: {list(df.columns)}")
            return

        print(_build_report(df))
    except FileNotFoundError:
        print(f"Súbor '{path}' nebol nájdený")
        # The working directory is what a relative path is resolved against;
        # sys.path[0] would be the script's directory instead.
        print(f"Aktuálny adresár: {os.getcwd()}")
    except Exception as e:  # top-level boundary of the script
        print(f"Chyba: {e}")


if __name__ == "__main__":
    main()