Spaces:
Running
Running
Commit ·
7501c0e
1
Parent(s): 2554712
Fix list/dict column crash and remove trust_remote_code
Browse files- src/__pycache__/cleaning.cpython-313.pyc +0 -0
- src/__pycache__/ingestion.cpython-313.pyc +0 -0
- src/cleaning.py +23 -2
- src/ingestion.py +2 -2
- verify_pipeline_mock.py +25 -3
src/__pycache__/cleaning.cpython-313.pyc
CHANGED
|
Binary files a/src/__pycache__/cleaning.cpython-313.pyc and b/src/__pycache__/cleaning.cpython-313.pyc differ
|
|
|
src/__pycache__/ingestion.cpython-313.pyc
CHANGED
|
Binary files a/src/__pycache__/ingestion.cpython-313.pyc and b/src/__pycache__/ingestion.cpython-313.pyc differ
|
|
|
src/cleaning.py
CHANGED
|
@@ -43,9 +43,30 @@ def clean_data(df):
|
|
| 43 |
log.append(f"Dropped {dropped_rows} empty rows.")
|
| 44 |
|
| 45 |
# 3. Handle Duplicate Rows
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
if duplicates > 0:
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
log.append(f"Removed {duplicates} duplicate rows.")
|
| 50 |
|
| 51 |
# 4. Fill Missing Values (Simple Strategy for Analysis)
|
|
|
|
| 43 |
log.append(f"Dropped {dropped_rows} empty rows.")
|
| 44 |
|
| 45 |
# 3. Handle Duplicate Rows
|
| 46 |
+
# Convert unhashable types (lists, dicts) to strings for duplicate checking
|
| 47 |
+
try:
|
| 48 |
+
duplicates = df.duplicated().sum()
|
| 49 |
+
except TypeError:
|
| 50 |
+
# Fallback: Convert object columns to string to handle unhashable types
|
| 51 |
+
temp_df = df.copy()
|
| 52 |
+
for col in temp_df.select_dtypes(include=['object']):
|
| 53 |
+
temp_df[col] = temp_df[col].astype(str)
|
| 54 |
+
duplicates = temp_df.duplicated().sum()
|
| 55 |
+
del temp_df
|
| 56 |
+
|
| 57 |
if duplicates > 0:
|
| 58 |
+
# To actually drop them, we need a similar approach or just rely on the try/except
|
| 59 |
+
try:
|
| 60 |
+
df = df.drop_duplicates()
|
| 61 |
+
except TypeError:
|
| 62 |
+
# If dropping fails naturally, we must use the string conversion method to identify the indices
|
| 63 |
+
# This is a bit expensive but robust
|
| 64 |
+
temp_df = df.copy()
|
| 65 |
+
for col in temp_df.select_dtypes(include=['object']):
|
| 66 |
+
temp_df[col] = temp_df[col].astype(str)
|
| 67 |
+
df = df.loc[~temp_df.duplicated()].copy()
|
| 68 |
+
del temp_df
|
| 69 |
+
|
| 70 |
log.append(f"Removed {duplicates} duplicate rows.")
|
| 71 |
|
| 72 |
# 4. Fill Missing Values (Simple Strategy for Analysis)
|
src/ingestion.py
CHANGED
|
@@ -133,9 +133,9 @@ def load_hf_dataset(dataset_name, subset=None, split='train', api_token=None):
|
|
| 133 |
# Load dataset
|
| 134 |
# If subset is provided, pass it as the second argument
|
| 135 |
if subset:
|
| 136 |
-
ds = load_dataset(dataset_name, subset, split=split, token=api_token
|
| 137 |
else:
|
| 138 |
-
ds = load_dataset(dataset_name, split=split, token=api_token
|
| 139 |
|
| 140 |
# Convert to pandas
|
| 141 |
df = ds.to_pandas()
|
|
|
|
| 133 |
# Load dataset
|
| 134 |
# If subset is provided, pass it as the second argument
|
| 135 |
if subset:
|
| 136 |
+
ds = load_dataset(dataset_name, subset, split=split, token=api_token)
|
| 137 |
else:
|
| 138 |
+
ds = load_dataset(dataset_name, split=split, token=api_token)
|
| 139 |
|
| 140 |
# Convert to pandas
|
| 141 |
df = ds.to_pandas()
|
verify_pipeline_mock.py
CHANGED
|
@@ -19,9 +19,31 @@ with patch('src.llm.get_insights', return_value="Mocked Insights") as mock_insig
|
|
| 19 |
def __init__(self, path):
|
| 20 |
self.name = path
|
| 21 |
|
| 22 |
-
print("Generating example dataset...")
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
mock_file = MockFile(example_path)
|
| 27 |
print(f"Running pipeline with MOCKED LLM...{os.path.basename(mock_file.name)}")
|
|
|
|
| 19 |
def __init__(self, path):
|
| 20 |
self.name = path
|
| 21 |
|
| 22 |
+
print("Generating example dataset with complex types...")
|
| 23 |
+
# Create a dummy CSV with list columns to test unhashable type handling
|
| 24 |
+
complex_data = {
|
| 25 |
+
"Name": ["Alice", "Bob", "Charlie", "David", "Eve"],
|
| 26 |
+
"Age": [25, 30, 35, 40, 22],
|
| 27 |
+
"Tags": [["HR", "admin"], ["eng", "dev"], ["eng", "lead"], ["mgmt"], ["HR"]], # List type (unhashable)
|
| 28 |
+
"Details": [{"role": "staff"}, {"role": "dev"}, {"role": "lead"}, {"role": "manager"}, {"role": "staff"}] # Dict type (unhashable)
|
| 29 |
+
}
|
| 30 |
+
df_complex = pd.DataFrame(complex_data)
|
| 31 |
+
example_path = "complex_test_data.csv"
|
| 32 |
+
# Writing list/dict values to CSV may be tricky, as pandas read_csv usually reads them back as strings.
|
| 33 |
+
# But ingestion.py `load_file` reads CSV.
|
| 34 |
+
# If I want to test "unhashable type", I need `read_csv` to produce lists/dicts? No, read_csv produces string representations of lists.
|
| 35 |
+
# The error "unhashable type: dict" usually comes from parquet or HF datasets where real lists/dicts are preserved.
|
| 36 |
+
# So I should mock a parquet file or just manually create the dataframe and bypass load_file?
|
| 37 |
+
# verify_pipeline_mock uses `MockFile` which is passed to `analyze_dataset`.
|
| 38 |
+
# `analyze_dataset` calls `load_file`.
|
| 39 |
+
# `load_file` reads the file.
|
| 40 |
+
|
| 41 |
+
# To reproduce "unhashable type: dict", I need to pass a file that loads as dicts/lists.
|
| 42 |
+
# Parquet or HF Dataset is the way.
|
| 43 |
+
# Let's create a parquet file with complex types.
|
| 44 |
+
df_complex.to_parquet(example_path)
|
| 45 |
+
|
| 46 |
+
print(f"Example complex dataset created at: {example_path}")
|
| 47 |
|
| 48 |
mock_file = MockFile(example_path)
|
| 49 |
print(f"Running pipeline with MOCKED LLM...{os.path.basename(mock_file.name)}")
|