LLM-Data-Cleaner / data_clean_final.py
translators-will's picture
Update data_clean_final.py
94f2bf4 verified
raw
history blame
5.89 kB
# data_clean_final.py
import pandas as pd
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
# Load local TinyLlama model
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
model_name,
device_map='auto',
load_in_4bit=True,
torch_dtype='auto'
)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
# Function to get data cleaning suggestions from LLM
def suggest_llm_fixes_and_fill(column_name, examples):
examples_text = "\n".join([f"- {ex}" for ex in examples if ex])
prompt = (
f"""You are a data cleaning assistant. Some entries in the '{column_name}' column are missing or inconsistent.\n
Examine these sample values.
{examples_text}
Return ONLY a valid Python list of tuples, like:
[("original_value1", "replacement1", "reason"), ("original_value2", "replacement2", "reason"), ...]
No explanation or extra text β€” just the list.
"""
)
try:
response = generator(prompt, max_new_tokens=200, do_sample=True, temperature=0.7)
return response[0]['generated_text'].split(prompt)[-1].strip()
except Exception as e:
error_message = f"LLM for error column {column_name}: {str(e)}"
st.error(error_message)
return error_message
def clean_data(file_path):
# Support CSV and TSV files
# Load data and drop duplicates
if file_path.endswith('.tsv'):
df = pd.read_csv(file_path, sep='\t').drop_duplicates().copy()
else:
df = pd.read_csv(file_path).drop_duplicates().copy()
suggestions_log = []
# Convert column types
for col in df.columns:
if df[col].dtype == 'object':
df[col] = df[col].str.strip().str.lower() # Normalize text
# Escape newline characters
df[col] = df[col].str.replace('\n', ' ', regex=False).replace('\r', ' ', regex=False)
if any(keyword in col.lower() for keyword in ['date', 'year', 'time', 'timestamp']):
df[col] = df[col].str.replace(r'[^\d]', '', regex=True)
# Normalize 4-digit year ranges (e.g., 2000-2001, 2000--2001, 20002001)
df[col] = df[col].replace(
r'(?<!\d)(\d{4})\s*[-–—./]?\s*(\d{4})(?!\d)', r'\1-\2', regex=True
)
# Remove currency symbols and commas
if df[col].astype(str).str.contains(r'[$,]', na=False, regex=True).any():
df[col] = df[col].str.replace(r'[$,]', '', regex=True)
elif df[col].dtype in ['int64', 'float64'] or pd.api.types.is_numeric_dtype(df[col]): # Convert numeric columns to proper type
df[col] = pd.to_numeric(df[col], errors='coerce')
# LLM assistance or missing or weird values
null_count = df[col].isnull().sum()
empty_str_count = (df[col] == '').sum() if df[col].dtype == 'object' else 0
pattern_matches = df[col].astype(str).str.contains(r'none|null|n/a|na|\?+missing|unknown',
na=False, case=False, regex=True).sum()
if null_count > 0 or empty_str_count > 0 or pattern_matches > 0:
# Get examples for LLM analysis (both good and bad examples)
# Get non-null, non-empty examples
good_examples = df[col][df[col].notnull() & (df[col] != '')].drop_duplicates().sample(n=min(5, len(df)), random_state=1)
# Get bad examples
bad_examples = df[col][df[col].isna() | (df[col] == '') | df[col].astype(str).str.contains(r'none|null|n/a|na|\?+missing|unknown',
na=False, case=False, regex=True)].sample(
min(5, df[col].isna().sum()), random_state=1)
# Combine good and bad examples
examples = good_examples + bad_examples
if examples is not None:
llm_suggestion = suggest_llm_fixes_and_fill(col, examples)
suggestions_log.append({
'col': col,
'suggestion': llm_suggestion
})
# Automatically apply replacements from LLM if in expected format
# if suggestions_log:
# try:
# parsed = ast.literal_eval(llm_suggestion)
# if isinstance(parsed, list) and all(isinstance(t, tuple) and len(t) == 3 for t in parsed):
# for original, replacement, _ in parsed:
# df[col] = df[col].replace(original, replacement)
# else:
# raise ValueError("Parsed suggestion is not a list of 3-item tuples.")
# except Exception as e:
# print(f"Failed to apply replacements for column {col}: {e}")
# st.warning(f"❌ Could not parse suggestion for column `{col}`. Make sure the LLM returned a valid Python list of tuples.")
# st.code(llm_suggestion, language="python")
df = df.reset_index(drop=True)
return df, suggestions_log
def display_llm_report(suggestions_log):
if suggestions_log:
st.subheader("πŸ€– LLM Cleaning Suggestions")
for col, suggestion in suggestions_log:
st.markdown(f"**Column:** `{col}`")
if suggestion:
st.code(suggestion, language="python")
else:
st.write("No suggestions or LLM response error.")