LLM-Data-Cleaner / data_clean_final.py
translators-will's picture
Update data_clean_final.py
0e4b101 verified
# data_clean_final.py
import pandas as pd
import streamlit as st
from ctransformers import AutoModelForCausalLM
# Load local TinyLlama model
llm = AutoModelForCausalLM.from_pretrained(
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
model_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", # Update this to match your GGUF file name
model_type="llama",
gpu_layers=0 # Adjust for GPU support if available
)
# generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
# Function to get data cleaning suggestions from LLM
def suggest_fill_strategies(column_name, examples):
examples_text = "\n".join([f"- {ex}" for ex in examples if ex])
prompt = (
f"""You are a data cleaning assistant. Some entries in the '{column_name}' column are missing or inconsistent.\n
Examine these sample values.
{examples_text}
Generate a short, bulleted list of suggestions for how a data scientist might fill or impute missing values in this column.
Only return the bulleted list. Do not include any other text.
"""
)
try:
response = llm(prompt)
return response.strip()
except Exception as e:
error_message = f"LLM for error column {column_name}: {str(e)}"
st.error(error_message)
return error_message
def clean_data(file_path):
# Support CSV and TSV files
# Load data and drop duplicates
if file_path.endswith('.tsv'):
df = pd.read_csv(file_path, sep='\t').drop_duplicates().copy()
else:
df = pd.read_csv(file_path).drop_duplicates().copy()
suggestions_log = []
# Convert column types
for col in df.columns:
if df[col].dtype == 'object':
df[col] = df[col].str.strip().str.lower() # Normalize text
# Escape newline characters
df[col] = df[col].str.replace('\n', ' ', regex=False).replace('\r', ' ', regex=False)
if any(keyword in col.lower() for keyword in ['date', 'year', 'time', 'timestamp']):
df[col] = df[col].str.replace(r'[^\d]', '', regex=True)
# Normalize 4-digit year ranges (e.g., 2000-2001, 2000--2001, 20002001)
df[col] = df[col].replace(
r'(?<!\d)(\d{4})\s*[-–—./]?\s*(\d{4})(?!\d)', r'\1-\2', regex=True
)
# Remove currency symbols and commas
if df[col].astype(str).str.contains(r'[$,]', na=False, regex=True).any():
df[col] = df[col].str.replace(r'[$,]', '', regex=True)
elif df[col].dtype in ['int64', 'float64'] or pd.api.types.is_numeric_dtype(df[col]): # Convert numeric columns to proper type
df[col] = pd.to_numeric(df[col], errors='coerce')
# LLM assistance or missing or weird values
null_count = df[col].isnull().sum()
empty_str_count = (df[col] == '').sum() if df[col].dtype == 'object' else 0
pattern_matches = df[col].astype(str).str.contains(r'none|null|n/a|na|\?+missing|unknown',
na=False, case=False, regex=True).sum()
if null_count > 0 or empty_str_count > 0 or pattern_matches > 0:
# Get examples for LLM analysis (both good and bad examples)
# Get non-null, non-empty examples
good_examples = df[col][df[col].notnull() & (df[col] != '')].drop_duplicates().sample(n=min(5, len(df)), random_state=1)
# Get bad examples
bad_examples = df[col][df[col].isna() | (df[col] == '') | df[col].astype(str).str.contains(r'none|null|n/a|na|\?+missing|unknown',
na=False, case=False, regex=True)].sample(
min(5, df[col].isna().sum()), random_state=1)
# Combine good and bad examples
examples = good_examples + bad_examples
if examples is not None:
llm_suggestion = suggest_fill_strategies(col, examples)
suggestions_log.append((col, llm_suggestion))
# Automatically apply replacements from LLM if in expected format
# if suggestions_log:
# try:
# parsed = ast.literal_eval(llm_suggestion)
# if isinstance(parsed, list) and all(isinstance(t, tuple) and len(t) == 3 for t in parsed):
# for original, replacement, _ in parsed:
# df[col] = df[col].replace(original, replacement)
# else:
# raise ValueError("Parsed suggestion is not a list of 3-item tuples.")
# except Exception as e:
# print(f"Failed to apply replacements for column {col}: {e}")
# st.warning(f"❌ Could not parse suggestion for column `{col}`. Make sure the LLM returned a valid Python list of tuples.")
# st.code(llm_suggestion, language="python")
df = df.reset_index(drop=True)
return df, suggestions_log
def display_llm_report(suggestions_log):
if suggestions_log:
st.subheader("🤖 LLM Cleaning Suggestions")
for col, suggestion in suggestions_log:
st.markdown(f"**Column:** `{col}`")
if suggestion:
st.code(suggestion, language="python")
else:
st.write("No suggestions or LLM response error.")