translators-will committed on
Commit
edc5088
·
verified ·
1 Parent(s): d2e9523

Update data_clean_final.py

Browse files
Files changed (1) hide show
  1. data_clean_final.py +125 -130
data_clean_final.py CHANGED
@@ -1,130 +1,125 @@
1
- # data_clean_final.py
2
-
3
- import pandas as pd
4
- import os
5
- import streamlit as st
6
- from dotenv import load_dotenv
7
- import tempfile
8
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
9
- import torch
10
-
11
- # Load environment variables from .env file
12
- load_dotenv()
13
-
14
- # Load local TinyLlama model
15
- model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
16
- tokenizer = AutoTokenizer.from_pretrained(model_name)
17
- model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
18
- generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
19
-
20
-
21
- # Function to get data cleaning suggestions from LLM
22
- def suggest_llm_fixes_and_fill(column_name, examples):
23
-
24
- examples_text = "\n".join([f"- {ex}" for ex in examples if ex])
25
-
26
- prompt = (
27
- f"""You are a data cleaning assistant. Some entries in the '{column_name}' column are missing or inconsistent.\n
28
-
29
- Examine these sample values.
30
-
31
- {examples_text}
32
-
33
- Return ONLY a valid Python list of tuples, like:
34
- [("original_value1", "replacement1", "reason"), ("original_value2", "replacement2", "reason"), ...]
35
- No explanation or extra text — just the list.
36
- """
37
- )
38
-
39
- try:
40
- response = generator(prompt, max_new_tokens=200, do_sample=True, temperature=0.7)
41
- return response[0]['generated_text'].split(prompt)[-1].strip()
42
-
43
- except Exception as e:
44
- error_message = f"LLM for error column {column_name}: {str(e)}"
45
- st.error(error_message)
46
- return error_message
47
-
48
- def clean_data(file_path):
49
- # Support CSV and TSV files
50
- # Load data and drop duplicates
51
- if file_path.endswith('.tsv'):
52
- df = pd.read_csv(file_path, sep='\t').drop_duplicates().copy()
53
- else:
54
- df = pd.read_csv(file_path).drop_duplicates().copy()
55
-
56
- suggestions_log = []
57
-
58
- # Convert column types
59
- for col in df.columns:
60
- if df[col].dtype == 'object':
61
- df[col] = df[col].str.strip().str.lower() # Normalize text
62
-
63
- # Escape newline characters
64
- df[col] = df[col].str.replace('\n', ' ', regex=False).replace('\r', ' ', regex=False)
65
-
66
- if any(keyword in col.lower() for keyword in ['date', 'year', 'time', 'timestamp']):
67
- df[col] = df[col].str.replace(r'[^\d]', '', regex=True)
68
- # Normalize 4-digit year ranges (e.g., 2000-2001, 2000--2001, 20002001)
69
- df[col] = df[col].replace(
70
- r'(?<!\d)(\d{4})\s*[-–—./]?\s*(\d{4})(?!\d)', r'\1-\2', regex=True
71
- )
72
- # Remove currency symbols and commas
73
- if df[col].astype(str).str.contains(r'[$,]', na=False, regex=True).any():
74
- df[col] = df[col].str.replace(r'[$,]', '', regex=True)
75
-
76
- elif df[col].dtype in ['int64', 'float64'] or pd.api.types.is_numeric_dtype(df[col]): # Convert numeric columns to proper type
77
- df[col] = pd.to_numeric(df[col], errors='coerce')
78
-
79
- # LLM assistance or missing or weird values
80
- null_count = df[col].isnull().sum()
81
- empty_str_count = (df[col] == '').sum() if df[col].dtype == 'object' else 0
82
- pattern_matches = df[col].astype(str).str.contains(r'none|null|n/a|na|\?+missing|unknown',
83
- na=False, case=False, regex=True).sum()
84
-
85
- if null_count > 0 or empty_str_count > 0 or pattern_matches > 0:
86
- # Get examples for LLM analysis (both good and bad examples)
87
- # Get non-null, non-empty examples
88
- good_examples = df[col][df[col].notnull() & (df[col] != '')].drop_duplicates().sample(n=min(5, len(df)), random_state=1)
89
-
90
- # Get bad examples
91
- bad_examples = df[col][df[col].isna() | (df[col] == '') | df[col].astype(str).str.contains(r'none|null|n/a|na|\?+missing|unknown',
92
- na=False, case=False, regex=True)].sample(
93
- min(5, df[col].isna().sum()), random_state=1)
94
- # Combine good and bad examples
95
- examples = good_examples + bad_examples
96
-
97
- if examples is not None:
98
- llm_suggestion = suggest_llm_fixes_and_fill(col, examples)
99
- suggestions_log.append({
100
- 'col': col,
101
- 'suggestion': llm_suggestion
102
- })
103
-
104
- # Automatically apply replacements from LLM if in expected format
105
- # if suggestions_log:
106
- # try:
107
- # parsed = ast.literal_eval(llm_suggestion)
108
- # if isinstance(parsed, list) and all(isinstance(t, tuple) and len(t) == 3 for t in parsed):
109
- # for original, replacement, _ in parsed:
110
- # df[col] = df[col].replace(original, replacement)
111
- # else:
112
- # raise ValueError("Parsed suggestion is not a list of 3-item tuples.")
113
- # except Exception as e:
114
- # print(f"Failed to apply replacements for column {col}: {e}")
115
- # st.warning(f"❌ Could not parse suggestion for column `{col}`. Make sure the LLM returned a valid Python list of tuples.")
116
- # st.code(llm_suggestion, language="python")
117
-
118
- df = df.reset_index(drop=True)
119
-
120
- return df, suggestions_log
121
-
122
- def display_llm_report(suggestions_log):
123
- if suggestions_log:
124
- st.subheader("🤖 LLM Cleaning Suggestions")
125
- for col, suggestion in suggestions_log:
126
- st.markdown(f"**Column:** `{col}`")
127
- if suggestion:
128
- st.code(suggestion, language="python")
129
- else:
130
- st.write("No suggestions or LLM response error.")
 
1
+ # data_clean_final.py
2
+
3
+ import pandas as pd
4
+ import streamlit as st
5
+ import tempfile
6
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
7
+ import torch
8
+
# Load local TinyLlama model.
# NOTE(review): the tokenizer, model, and pipeline are constructed at import
# time, so merely importing this module downloads/loads a ~1.1B-parameter
# model before any function is called — confirm this is intended for the
# Streamlit app's startup path.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Half precision only when a GPU is available; CPU inference stays in float32.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
# device=0 targets the first CUDA device; -1 keeps the pipeline on CPU.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
14
+
15
+
16
# Function to get data cleaning suggestions from LLM
def suggest_llm_fixes_and_fill(column_name, examples):
    """Ask the local TinyLlama pipeline for cleanup suggestions for one column.

    Parameters
    ----------
    column_name : str
        Name of the column being cleaned (interpolated into the prompt).
    examples : iterable
        Sample values from the column; falsy entries are skipped.

    Returns
    -------
    str
        The model's raw completion (expected, but not guaranteed, to be a
        Python list of (original, replacement, reason) tuples), or an error
        message string if generation failed.
    """
    # One bullet per non-empty example; falsy values (None, '', NaN-ish) are dropped.
    examples_text = "\n".join(f"- {ex}" for ex in examples if ex)

    prompt = (
        f"""You are a data cleaning assistant. Some entries in the '{column_name}' column are missing or inconsistent.\n

Examine these sample values.

{examples_text}

Return ONLY a valid Python list of tuples, like:
[("original_value1", "replacement1", "reason"), ("original_value2", "replacement2", "reason"), ...]
No explanation or extra text — just the list.
"""
    )

    try:
        response = generator(prompt, max_new_tokens=200, do_sample=True, temperature=0.7)
        # The pipeline echoes the prompt; keep only the newly generated tail.
        return response[0]['generated_text'].split(prompt)[-1].strip()
    except Exception as e:
        # Bug fix: error message read "LLM for error column ..."; corrected wording.
        error_message = f"LLM error for column {column_name}: {str(e)}"
        st.error(error_message)
        return error_message
42
+
43
def clean_data(file_path):
    """Load a CSV/TSV file, normalize its columns, and collect LLM suggestions.

    Parameters
    ----------
    file_path : str
        Path to a ``.csv`` or ``.tsv`` file; the delimiter is chosen by extension.

    Returns
    -------
    tuple[pandas.DataFrame, list[dict]]
        The cleaned, de-duplicated frame (index reset) and a log of
        ``{'col': name, 'suggestion': text}`` entries for every column that
        contained missing or placeholder values.
    """
    # Support CSV and TSV files; drop exact duplicate rows up front.
    sep = '\t' if file_path.endswith('.tsv') else ','
    df = pd.read_csv(file_path, sep=sep).drop_duplicates().copy()

    suggestions_log = []

    # Placeholder tokens treated as "missing" when scanning string columns.
    missing_pattern = r'none|null|n/a|na|\?+missing|unknown'

    # Convert column types
    for col in df.columns:
        if df[col].dtype == 'object':
            # Normalize text: trim surrounding whitespace and lowercase.
            df[col] = df[col].str.strip().str.lower()

            # Flatten embedded newlines/carriage returns to spaces.
            # Bug fix: the original chained Series.replace for '\r', which only
            # matches values wholly equal to '\r'; both must be .str.replace.
            df[col] = df[col].str.replace('\n', ' ', regex=False).str.replace('\r', ' ', regex=False)

            if any(keyword in col.lower() for keyword in ['date', 'year', 'time', 'timestamp']):
                df[col] = df[col].str.replace(r'[^\d]', '', regex=True)
                # Normalize 4-digit year ranges (e.g., 2000-2001, 2000--2001, 20002001)
                df[col] = df[col].replace(
                    r'(?<!\d)(\d{4})\s*[-–—./]?\s*(\d{4})(?!\d)', r'\1-\2', regex=True
                )
            # Remove currency symbols and thousands separators.
            if df[col].astype(str).str.contains(r'[$,]', na=False, regex=True).any():
                df[col] = df[col].str.replace(r'[$,]', '', regex=True)

        elif df[col].dtype in ['int64', 'float64'] or pd.api.types.is_numeric_dtype(df[col]):
            # Coerce numeric columns; unparseable values become NaN.
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # LLM assistance for missing or placeholder values.
        null_count = df[col].isnull().sum()
        empty_str_count = (df[col] == '').sum() if df[col].dtype == 'object' else 0
        bad_mask = (
            df[col].isna()
            | (df[col] == '')
            | df[col].astype(str).str.contains(missing_pattern, na=False, case=False, regex=True)
        )
        pattern_matches = df[col].astype(str).str.contains(missing_pattern,
                                                           na=False, case=False, regex=True).sum()

        if null_count > 0 or empty_str_count > 0 or pattern_matches > 0:
            # Up to 5 distinct clean values for context.
            # Bug fix: sample size must be bounded by the filtered pool, not by
            # len(df) — sampling more than the pool size raised ValueError.
            good_pool = df[col][df[col].notnull() & (df[col] != '')].drop_duplicates()
            good_examples = good_pool.sample(n=min(5, len(good_pool)), random_state=1)

            # Up to 5 problematic values (NaN, empty, or placeholder tokens).
            # Bug fix: the sample size was min(5, isna().sum()), so placeholder
            # and empty-string matches were never sampled when there were no NaNs.
            bad_pool = df[col][bad_mask]
            bad_examples = bad_pool.sample(n=min(5, len(bad_pool)), random_state=1)

            # Bug fix: Series `+` aligns on index and adds element-wise (yielding
            # NaNs); pd.concat is the correct way to combine the two samples.
            examples = pd.concat([good_examples, bad_examples])

            if not examples.empty:
                llm_suggestion = suggest_llm_fixes_and_fill(col, examples)
                suggestions_log.append({
                    'col': col,
                    'suggestion': llm_suggestion
                })

    df = df.reset_index(drop=True)

    return df, suggestions_log
116
+
117
def display_llm_report(suggestions_log):
    """Render the LLM cleaning suggestions in the Streamlit UI.

    Parameters
    ----------
    suggestions_log : list[dict]
        Entries of the form ``{'col': column_name, 'suggestion': text}``
        as produced by ``clean_data``.
    """
    if suggestions_log:
        st.subheader("🤖 LLM Cleaning Suggestions")
        for entry in suggestions_log:
            # Bug fix: the original `for col, suggestion in suggestions_log`
            # unpacked each dict's KEYS, so it always displayed the literal
            # strings 'col' and 'suggestion' instead of the logged values.
            col = entry['col']
            suggestion = entry['suggestion']
            st.markdown(f"**Column:** `{col}`")
            if suggestion:
                st.code(suggestion, language="python")
            else:
                st.write("No suggestions or LLM response error.")