salihfurkaan commited on
Commit
7501c0e
·
1 Parent(s): 2554712

Fix list/dict column crash and remove trust_remote_code

Browse files
src/__pycache__/cleaning.cpython-313.pyc CHANGED
Binary files a/src/__pycache__/cleaning.cpython-313.pyc and b/src/__pycache__/cleaning.cpython-313.pyc differ
 
src/__pycache__/ingestion.cpython-313.pyc CHANGED
Binary files a/src/__pycache__/ingestion.cpython-313.pyc and b/src/__pycache__/ingestion.cpython-313.pyc differ
 
src/cleaning.py CHANGED
@@ -43,9 +43,30 @@ def clean_data(df):
43
  log.append(f"Dropped {dropped_rows} empty rows.")
44
 
45
  # 3. Handle Duplicate Rows
46
- duplicates = df.duplicated().sum()
 
 
 
 
 
 
 
 
 
 
47
  if duplicates > 0:
48
- df = df.drop_duplicates()
 
 
 
 
 
 
 
 
 
 
 
49
  log.append(f"Removed {duplicates} duplicate rows.")
50
 
51
  # 4. Fill Missing Values (Simple Strategy for Analysis)
 
43
  log.append(f"Dropped {dropped_rows} empty rows.")
44
 
45
  # 3. Handle Duplicate Rows
46
+ # Convert unhashable types (lists, dicts) to strings for duplicate checking
47
+ try:
48
+ duplicates = df.duplicated().sum()
49
+ except TypeError:
50
+ # Fallback: Convert object columns to string to handle unhashable types
51
+ temp_df = df.copy()
52
+ for col in temp_df.select_dtypes(include=['object']):
53
+ temp_df[col] = temp_df[col].astype(str)
54
+ duplicates = temp_df.duplicated().sum()
55
+ del temp_df
56
+
57
  if duplicates > 0:
58
+ # To actually drop them, we need a similar approach or just rely on the try/except
59
+ try:
60
+ df = df.drop_duplicates()
61
+ except TypeError:
62
+ # If dropping fails naturally, we must use the string conversion method to identify the duplicate indices
63
+ # This is a bit expensive but robust
64
+ temp_df = df.copy()
65
+ for col in temp_df.select_dtypes(include=['object']):
66
+ temp_df[col] = temp_df[col].astype(str)
67
+ df = df.loc[~temp_df.duplicated()].copy()
68
+ del temp_df
69
+
70
  log.append(f"Removed {duplicates} duplicate rows.")
71
 
72
  # 4. Fill Missing Values (Simple Strategy for Analysis)
src/ingestion.py CHANGED
@@ -133,9 +133,9 @@ def load_hf_dataset(dataset_name, subset=None, split='train', api_token=None):
133
  # Load dataset
134
  # If subset is provided, pass it as the second argument
135
  if subset:
136
- ds = load_dataset(dataset_name, subset, split=split, token=api_token, trust_remote_code=True)
137
  else:
138
- ds = load_dataset(dataset_name, split=split, token=api_token, trust_remote_code=True)
139
 
140
  # Convert to pandas
141
  df = ds.to_pandas()
 
133
  # Load dataset
134
  # If subset is provided, pass it as the second argument
135
  if subset:
136
+ ds = load_dataset(dataset_name, subset, split=split, token=api_token)
137
  else:
138
+ ds = load_dataset(dataset_name, split=split, token=api_token)
139
 
140
  # Convert to pandas
141
  df = ds.to_pandas()
verify_pipeline_mock.py CHANGED
@@ -19,9 +19,31 @@ with patch('src.llm.get_insights', return_value="Mocked Insights") as mock_insig
19
  def __init__(self, path):
20
  self.name = path
21
 
22
- print("Generating example dataset...")
23
- example_path = load_example()
24
- print(f"Example dataset created at: {example_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  mock_file = MockFile(example_path)
27
  print(f"Running pipeline with MOCKED LLM...{os.path.basename(mock_file.name)}")
 
19
  def __init__(self, path):
20
  self.name = path
21
 
22
+ print("Generating example dataset with complex types...")
23
+ # Create a dummy dataset with list columns to test unhashable type handling
24
+ complex_data = {
25
+ "Name": ["Alice", "Bob", "Charlie", "David", "Eve"],
26
+ "Age": [25, 30, 35, 40, 22],
27
+ "Tags": [["HR", "admin"], ["eng", "dev"], ["eng", "lead"], ["mgmt"], ["HR"]], # List type (unhashable)
28
+ "Details": [{"role": "staff"}, {"role": "dev"}, {"role": "lead"}, {"role": "manager"}, {"role": "staff"}] # Dict type (unhashable)
29
+ }
30
+ df_complex = pd.DataFrame(complex_data)
31
+ example_path = "complex_test_data.csv"
32
+ # Writing lists/dicts to CSV is tricky, since pandas read_csv usually reads them back as plain strings.
33
+ # But ingestion.py `load_file` reads CSV.
34
+ # If I want to test "unhashable type", I need `read_csv` to produce lists/dicts? No, read_csv produces string representations of lists.
35
+ # The error "unhashable type: dict" usually comes from parquet or HF datasets where real lists/dicts are preserved.
36
+ # So I should mock a parquet file or just manually create the dataframe and bypass load_file?
37
+ # verify_pipeline_mock uses `MockFile` which is passed to `analyze_dataset`.
38
+ # `analyze_dataset` calls `load_file`.
39
+ # `load_file` reads the file.
40
+
41
+ # To reproduce "unhashable type: dict", I need to pass a file that loads as dicts/lists.
42
+ # Parquet or HF Dataset is the way.
43
+ # Let's create a parquet file with complex types.
44
+ df_complex.to_parquet(example_path)
45
+
46
+ print(f"Example complex dataset created at: {example_path}")
47
 
48
  mock_file = MockFile(example_path)
49
  print(f"Running pipeline with MOCKED LLM...{os.path.basename(mock_file.name)}")