translators-will committed on
Commit
d2e9523
·
verified ·
1 Parent(s): 04e510d

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +109 -0
  2. data_clean_final.py +130 -0
app.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+
3
+ import streamlit as st
4
+ import pandas as pd
5
+ from io import StringIO, BytesIO
6
+ import os
7
+ from data_clean_final import clean_data, display_llm_report
8
+ import tempfile
9
+
10
+ # Set page config
11
+ st.set_page_config(page_title="LLM Data Cleaner", page_icon=":bar_chart:", layout="wide")
12
+
13
+ st.title("LLM-Powered Data Cleaner")
14
+ st.markdown(
15
+ """
16
+ Upload a CSV, TSV or Excel file, and we'll clean it up for you using AI-augmented techniques.
17
+ The system will automatically:
18
+ - Fix formatting issues
19
+ - Handle missing values
20
+ - Standardize data entries
21
+ - Provide AI-powered suggestions for data improvements
22
+
23
+ Then, you can download the cleaned data for your analysis.
24
+ """
25
+ )
26
+
27
+ # File uploader
28
+ uploaded_file = st.file_uploader("Choose a file", type=["csv", "tsv", "xlsx"])
29
+
30
+ if uploaded_file:
31
+ file_bytes = uploaded_file.read() # Convert to BytesIO for pandas
32
+ file_name = uploaded_file.name
33
+ file_type = os.path.splitext(file_name)[1].lower()
34
+
35
+ # Clean data and get LLM suggestions
36
+ with st.spinner("Cleaning your data..."):
37
+
38
+ with tempfile.NamedTemporaryFile(delete=False, suffix=file_type) as temp_file:
39
+ temp_file.write(file_bytes)
40
+ temp_file_path = temp_file.name
41
+
42
+ try:
43
+ # Temporary file to clean_data function
44
+ cleaned_df, suggestions = clean_data(temp_file_path)
45
+
46
+ st.success("Data cleaned successfully!")
47
+
48
+ finally:
49
+ # Clean up the temporary file
50
+ if os.path.exists(temp_file_path):
51
+ os.unlink(temp_file_path)
52
+
53
+ # Show original data
54
+ st.subheader("Original Data")
55
+ try:
56
+ if file_type == ".tsv":
57
+ original_df = pd.read_csv(BytesIO(file_bytes), sep='\t')
58
+ elif file_type == ".xlsx":
59
+ original_df = pd.read_excel(BytesIO(file_bytes))
60
+ else:
61
+ original_df = pd.read_csv(BytesIO(file_bytes))
62
+
63
+ st.dataframe(original_df.head(10), use_container_width=True)
64
+
65
+ except Exception as e:
66
+ st.error(f"Error loading original data: {str(e)}")
67
+
68
+ # Show cleaned data
69
+ st.subheader("Cleaned Data Preview")
70
+ st.dataframe(cleaned_df.head(10), use_container_width=True)
71
+
72
+ # Data statistics
73
+ col1, col2, col3 = st.columns(3)
74
+ with col1:
75
+ st.metric("Total Rows", len(cleaned_df))
76
+ with col2:
77
+ st.metric("Total Columns", len(cleaned_df.columns))
78
+ with col3:
79
+ null_percentage = round((cleaned_df.isnull().sum().sum() / (cleaned_df.shape[0] * cleaned_df.shape[1])) * 100, 2)
80
+ st.metric("Null Values (%)", f"{null_percentage}%")
81
+
82
+ # Show suggestions from LLM
83
+ display_llm_report(suggestions)
84
+
85
+
86
+ # Convert to downloadable format
87
+ if file_type == ".csv":
88
+ cleaned_file = cleaned_df.to_csv(index=False).encode("utf-8")
89
+ download_name = "cleaned_data.csv"
90
+ mime_type = "text/csv"
91
+ elif file_type == ".tsv":
92
+ cleaned_file = cleaned_df.to_csv(index=False, sep="\t").encode("utf-8")
93
+ download_name = "cleaned_data.tsv"
94
+ mime_type = "text/tsv"
95
+ elif file_type == ".xlsx":
96
+ output = BytesIO()
97
+ with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
98
+ cleaned_df.to_excel(writer, index=False)
99
+ cleaned_file = output.getvalue()
100
+ download_name = "cleaned_data.xlsx"
101
+ mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
102
+
103
+ # Download button
104
+ st.download_button(
105
+ label="📁 Download Cleaned Data",
106
+ data=cleaned_file,
107
+ file_name=download_name,
108
+ mime=mime_type
109
+ )
data_clean_final.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# data_clean_final.py

import pandas as pd
import os
import streamlit as st
from dotenv import load_dotenv
import tempfile
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Load environment variables from .env file
load_dotenv()

# Load local TinyLlama model.
# NOTE(review): this runs at import time and downloads/loads a ~1.1B-parameter
# model, blocking the first import — consider lazy loading or st.cache_resource.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Half precision only when CUDA is available; fp16 inference on CPU is slow or unsupported.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
# device=0 places the pipeline on the first CUDA device; -1 keeps it on CPU.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
# Function to get data cleaning suggestions from LLM
def suggest_llm_fixes_and_fill(column_name, examples):
    """Ask the local LLM for replacement suggestions for one column.

    Parameters
    ----------
    column_name : str
        Name of the column being cleaned (interpolated into the prompt).
    examples : iterable
        Sample values (good and bad) from the column; falsy entries are skipped.

    Returns
    -------
    str
        The model's completion (prompt echo stripped), or an error message if
        generation failed — callers receive a string either way.
    """
    examples_text = "\n".join(f"- {ex}" for ex in examples if ex)

    prompt = (
        f"""You are a data cleaning assistant. Some entries in the '{column_name}' column are missing or inconsistent.\n

Examine these sample values.

{examples_text}

Return ONLY a valid Python list of tuples, like:
[("original_value1", "replacement1", "reason"), ("original_value2", "replacement2", "reason"), ...]
No explanation or extra text — just the list.
"""
    )

    try:
        response = generator(prompt, max_new_tokens=200, do_sample=True, temperature=0.7)
        generated = response[0]['generated_text']
        # text-generation pipelines echo the prompt at the start of
        # generated_text; slicing it off is more robust than split(prompt),
        # which misbehaves if the prompt text recurs in the completion.
        if generated.startswith(prompt):
            return generated[len(prompt):].strip()
        return generated.split(prompt)[-1].strip()

    except Exception as e:
        # Fixed garbled wording (was "LLM for error column ...").
        error_message = f"LLM error for column {column_name}: {str(e)}"
        st.error(error_message)
        return error_message
def clean_data(file_path):
    """Load a CSV/TSV file, apply rule-based cleaning, and gather LLM suggestions.

    Parameters
    ----------
    file_path : str
        Path to the uploaded file; anything not ending in ``.tsv`` is read as CSV.

    Returns
    -------
    tuple[pandas.DataFrame, list[dict]]
        The cleaned, de-duplicated frame (index reset) and a log of
        ``{'col': <column>, 'suggestion': <LLM text>}`` entries for columns
        that contained missing or placeholder values.
    """
    # Support CSV and TSV files; load data and drop exact duplicate rows.
    sep = '\t' if file_path.endswith('.tsv') else ','
    df = pd.read_csv(file_path, sep=sep).drop_duplicates().copy()

    suggestions_log = []

    # Free-text stand-ins for "missing" (substring match, case-insensitive).
    missing_pattern = r'none|null|n/a|na|\?+missing|unknown'

    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].str.strip().str.lower()  # Normalize text

            # Collapse embedded newlines AND carriage returns to spaces in one
            # regex pass. (The previous chained .replace('\r', ...) was
            # Series.replace, which only matches cells equal to '\r' exactly.)
            df[col] = df[col].str.replace(r'[\r\n]', ' ', regex=True)

            if any(keyword in col.lower() for keyword in ['date', 'year', 'time', 'timestamp']):
                # Normalize 4-digit year ranges (e.g., 2000-2001, 2000--2001,
                # 2000/2001) to "2000-2001" BEFORE stripping punctuation —
                # stripping first would delete the separator this regex needs.
                df[col] = df[col].str.replace(
                    r'(?<!\d)(\d{4})\s*[-–—./]?\s*(\d{4})(?!\d)', r'\1-\2', regex=True
                )
                # Now drop everything that is not a digit or the hyphen kept
                # by the range normalization above.
                df[col] = df[col].str.replace(r'[^\d-]', '', regex=True)

            # Remove currency symbols and thousands-separator commas.
            if df[col].astype(str).str.contains(r'[$,]', na=False, regex=True).any():
                df[col] = df[col].str.replace(r'[$,]', '', regex=True)

        elif pd.api.types.is_numeric_dtype(df[col]):
            # Coerce malformed numeric values to NaN instead of raising.
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # LLM assistance for missing or placeholder values.
        null_count = df[col].isnull().sum()
        empty_str_count = (df[col] == '').sum() if df[col].dtype == 'object' else 0
        placeholder_mask = df[col].astype(str).str.contains(
            missing_pattern, na=False, case=False, regex=True)

        if null_count > 0 or empty_str_count > 0 or placeholder_mask.sum() > 0:
            # Good examples: distinct non-null, non-empty values. Cap the
            # sample size by what is actually available — sampling more than
            # the population raises ValueError.
            valid_values = df[col][df[col].notnull() & (df[col] != '')].drop_duplicates()
            good_examples = valid_values.sample(n=min(5, len(valid_values)), random_state=1)

            # Bad examples: null, empty, or placeholder values.
            bad_pool = df[col][df[col].isna() | (df[col] == '') | placeholder_mask]
            bad_examples = bad_pool.sample(n=min(5, len(bad_pool)), random_state=1)

            # Concatenate the two samples. (Series `+` aligns on index and
            # adds element-wise — it does NOT append, so it produced NaNs.)
            examples = pd.concat([good_examples, bad_examples])

            if len(examples):
                llm_suggestion = suggest_llm_fixes_and_fill(col, examples)
                suggestions_log.append({
                    'col': col,
                    'suggestion': llm_suggestion
                })

    df = df.reset_index(drop=True)

    return df, suggestions_log
def display_llm_report(suggestions_log):
    """Render LLM cleaning suggestions in the Streamlit UI.

    Parameters
    ----------
    suggestions_log : list[dict]
        Entries of the form ``{'col': <column name>, 'suggestion': <LLM text>}``
        as produced by clean_data(). Empty list renders nothing.
    """
    if suggestions_log:
        st.subheader("🤖 LLM Cleaning Suggestions")
        # clean_data() logs dicts, not tuples. Unpacking a dict in the loop
        # header ("for col, suggestion in ...") yields its KEYS, so the old
        # code always rendered the literal strings 'col' and 'suggestion'.
        for entry in suggestions_log:
            col = entry['col']
            suggestion = entry['suggestion']
            st.markdown(f"**Column:** `{col}`")
            if suggestion:
                st.code(suggestion, language="python")
            else:
                st.write("No suggestions or LLM response error.")