Spaces:
Sleeping
Sleeping
File size: 5,683 Bytes
edc5088 1f4f519 edc5088 ac4f597 edc5088 0e4b101 edc5088 0e4b101 edc5088 ac4f597 edc5088 0e4b101 edc5088 0e4b101 edc5088 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
# data_clean_final.py
import pandas as pd
import streamlit as st
from ctransformers import AutoModelForCausalLM
# Load local TinyLlama model at import time via ctransformers (GGUF, CPU-only
# with gpu_layers=0).
# NOTE(review): this runs on module import — the model weights are fetched from
# the Hugging Face Hub on first use; confirm model_file matches the GGUF
# artifact actually published in the repo.
llm = AutoModelForCausalLM.from_pretrained(
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
model_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", # Update this to match your GGUF file name
model_type="llama",
gpu_layers=0 # Adjust for GPU support if available
)
# Earlier transformers-pipeline approach, kept for reference:
# generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
# Function to get data cleaning suggestions from LLM
def suggest_fill_strategies(column_name, examples):
    """Ask the local LLM for imputation ideas for one column.

    Args:
        column_name: Name of the DataFrame column being analyzed.
        examples: Iterable of sample values (a mix of good and
            missing/placeholder values) drawn from the column.

    Returns:
        str: The LLM's bulleted suggestion list, stripped of surrounding
        whitespace — or an error message if the model call failed (the
        error is also surfaced in the Streamlit UI).
    """
    # Falsy entries (None, '') are skipped; NaN is truthy and passes through.
    examples_text = "\n".join(f"- {ex}" for ex in examples if ex)
    prompt = (
        f"""You are a data cleaning assistant. Some entries in the '{column_name}' column are missing or inconsistent.\n
Examine these sample values.
{examples_text}
Generate a short, bulleted list of suggestions for how a data scientist might fill or impute missing values in this column.
Only return the bulleted list. Do not include any other text.
"""
    )
    try:
        response = llm(prompt)
        return response.strip()
    except Exception as e:
        # Bug fix: message wording was garbled ("LLM for error column ...").
        error_message = f"LLM error for column {column_name}: {str(e)}"
        st.error(error_message)
        return error_message
def clean_data(file_path):
    """Load a CSV/TSV file, normalize its columns, and collect LLM
    fill-strategy suggestions for columns containing missing or
    placeholder values.

    Args:
        file_path: Path to a ``.csv`` or ``.tsv`` file.

    Returns:
        tuple: ``(df, suggestions_log)`` — the cleaned DataFrame with a
        reset index, and a list of ``(column_name, llm_suggestion)``
        tuples (empty when no column needed LLM attention).
    """
    # Support CSV and TSV files; drop exact duplicate rows up front.
    sep = '\t' if file_path.endswith('.tsv') else ','
    df = pd.read_csv(file_path, sep=sep).drop_duplicates().copy()

    suggestions_log = []
    # Tokens that signal a missing/placeholder value.
    # NOTE(review): 'na' matches as a substring anywhere in the value
    # (e.g. inside 'banana'); kept as-is for parity with prior behavior.
    missing_pattern = r'none|null|n/a|na|\?+missing|unknown'

    for col in df.columns:
        if df[col].dtype == 'object':
            # Normalize text: trim surrounding whitespace, lowercase.
            df[col] = df[col].str.strip().str.lower()
            # Collapse embedded newlines AND carriage returns into spaces.
            # Bug fix: the old code chained plain Series.replace for '\r',
            # which only matches whole-cell values, not embedded characters.
            df[col] = (df[col]
                       .str.replace('\n', ' ', regex=False)
                       .str.replace('\r', ' ', regex=False))
            if any(keyword in col.lower() for keyword in ['date', 'year', 'time', 'timestamp']):
                # Keep digits only, then re-hyphenate 4-digit year ranges
                # (e.g. '2000-2001' / '20002001' -> '2000-2001').
                df[col] = df[col].str.replace(r'[^\d]', '', regex=True)
                df[col] = df[col].replace(
                    r'(?<!\d)(\d{4})\s*[-–—./]?\s*(\d{4})(?!\d)', r'\1-\2', regex=True
                )
            # Strip currency symbols and thousands separators.
            if df[col].astype(str).str.contains(r'[$,]', na=False, regex=True).any():
                df[col] = df[col].str.replace(r'[$,]', '', regex=True)
        elif df[col].dtype in ['int64', 'float64'] or pd.api.types.is_numeric_dtype(df[col]):
            # Coerce numeric columns; unparseable entries become NaN.
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # --- LLM assistance for missing or placeholder values ---
        null_count = df[col].isnull().sum()
        empty_str_count = (df[col] == '').sum() if df[col].dtype == 'object' else 0
        placeholder_mask = df[col].astype(str).str.contains(
            missing_pattern, na=False, case=False, regex=True)
        if null_count > 0 or empty_str_count > 0 or placeholder_mask.sum() > 0:
            # Sample up to 5 valid values for context.
            # Bug fix: sample size was min(5, len(df)), which raised
            # ValueError when the column had fewer than 5 unique values.
            good_pool = df[col][df[col].notnull() & (df[col] != '')].drop_duplicates()
            good_examples = good_pool.sample(n=min(5, len(good_pool)), random_state=1)
            # Sample up to 5 problematic values.
            # Bug fix: sample size was based on isna() alone, so columns
            # whose only problems were empty strings/placeholders got 0.
            bad_pool = df[col][df[col].isna() | (df[col] == '') | placeholder_mask]
            bad_examples = bad_pool.sample(n=min(5, len(bad_pool)), random_state=1)
            # Bug fix: 'good + bad' performed index-aligned element-wise
            # Series addition (yielding NaN), not concatenation.
            examples = pd.concat([good_examples, bad_examples])
            if len(examples) > 0:
                llm_suggestion = suggest_fill_strategies(col, examples)
                suggestions_log.append((col, llm_suggestion))

    df = df.reset_index(drop=True)
    return df, suggestions_log
def display_llm_report(suggestions_log):
    """Render the collected LLM cleaning suggestions in the Streamlit UI.

    Args:
        suggestions_log: List of (column_name, suggestion) tuples as
            produced by clean_data(); an empty/falsy log renders nothing.
    """
    if not suggestions_log:
        return
    st.subheader("🤖 LLM Cleaning Suggestions")
    for column_name, suggestion_text in suggestions_log:
        st.markdown(f"**Column:** `{column_name}`")
        if not suggestion_text:
            st.write("No suggestions or LLM response error.")
        else:
            st.code(suggestion_text, language="python")
|