Spaces:

translators-will
/

LLM-Data-Cleaner

Sleeping

App Files Files Community

LLM-Data-Cleaner / data_clean_final.py

translators-will

Update data_clean_final.py

0e4b101 verified 10 months ago

raw

history blame contribute delete

5.68 kB

	# data_clean_final.py

	import pandas as pd
	import streamlit as st
	from ctransformers import AutoModelForCausalLM

	# Load local TinyLlama model
	llm = AutoModelForCausalLM.from_pretrained(
	"TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
	model_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", # Update this to match your GGUF file name
	model_type="llama",
	gpu_layers=0 # Adjust for GPU support if available
	)
	# generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)


	# Function to get data cleaning suggestions from LLM
	def suggest_fill_strategies(column_name, examples):

	examples_text = "\n".join([f"- {ex}" for ex in examples if ex])

	prompt = (
	f"""You are a data cleaning assistant. Some entries in the '{column_name}' column are missing or inconsistent.\n

	Examine these sample values.

	{examples_text}

	Generate a short, bulleted list of suggestions for how a data scientist might fill or impute missing values in this column.
	Only return the bulleted list. Do not include any other text.
	"""
	)

	try:
	response = llm(prompt)
	return response.strip()

	except Exception as e:
	error_message = f"LLM for error column {column_name}: {str(e)}"
	st.error(error_message)
	return error_message

	def clean_data(file_path):
	# Support CSV and TSV files
	# Load data and drop duplicates
	if file_path.endswith('.tsv'):
	df = pd.read_csv(file_path, sep='\t').drop_duplicates().copy()
	else:
	df = pd.read_csv(file_path).drop_duplicates().copy()

	suggestions_log = []

	# Convert column types
	for col in df.columns:
	if df[col].dtype == 'object':
	df[col] = df[col].str.strip().str.lower() # Normalize text

	# Escape newline characters
	df[col] = df[col].str.replace('\n', ' ', regex=False).replace('\r', ' ', regex=False)

	if any(keyword in col.lower() for keyword in ['date', 'year', 'time', 'timestamp']):
	df[col] = df[col].str.replace(r'[^\d]', '', regex=True)
	# Normalize 4-digit year ranges (e.g., 2000-2001, 2000--2001, 20002001)
	df[col] = df[col].replace(
	r'(?<!\d)(\d{4})\s[-–—./]?\s(\d{4})(?!\d)', r'\1-\2', regex=True
	)
	# Remove currency symbols and commas
	if df[col].astype(str).str.contains(r'[$,]', na=False, regex=True).any():
	df[col] = df[col].str.replace(r'[$,]', '', regex=True)

	elif df[col].dtype in ['int64', 'float64'] or pd.api.types.is_numeric_dtype(df[col]): # Convert numeric columns to proper type
	df[col] = pd.to_numeric(df[col], errors='coerce')

	# LLM assistance or missing or weird values
	null_count = df[col].isnull().sum()
	empty_str_count = (df[col] == '').sum() if df[col].dtype == 'object' else 0
	pattern_matches = df[col].astype(str).str.contains(r'none\|null\|n/a\|na\|\?+missing\|unknown',
	na=False, case=False, regex=True).sum()

	if null_count > 0 or empty_str_count > 0 or pattern_matches > 0:
	# Get examples for LLM analysis (both good and bad examples)
	# Get non-null, non-empty examples
	good_examples = df[col][df[col].notnull() & (df[col] != '')].drop_duplicates().sample(n=min(5, len(df)), random_state=1)

	# Get bad examples
	bad_examples = df[col][df[col].isna() \| (df[col] == '') \| df[col].astype(str).str.contains(r'none\|null\|n/a\|na\|\?+missing\|unknown',
	na=False, case=False, regex=True)].sample(
	min(5, df[col].isna().sum()), random_state=1)
	# Combine good and bad examples
	examples = good_examples + bad_examples

	if examples is not None:
	llm_suggestion = suggest_fill_strategies(col, examples)
	suggestions_log.append((col, llm_suggestion))

	# Automatically apply replacements from LLM if in expected format
	# if suggestions_log:
	# try:
	# parsed = ast.literal_eval(llm_suggestion)
	# if isinstance(parsed, list) and all(isinstance(t, tuple) and len(t) == 3 for t in parsed):
	# for original, replacement, _ in parsed:
	# df[col] = df[col].replace(original, replacement)
	# else:
	# raise ValueError("Parsed suggestion is not a list of 3-item tuples.")
	# except Exception as e:
	# print(f"Failed to apply replacements for column {col}: {e}")
	# st.warning(f"❌ Could not parse suggestion for column `{col}`. Make sure the LLM returned a valid Python list of tuples.")
	# st.code(llm_suggestion, language="python")

	df = df.reset_index(drop=True)

	return df, suggestions_log

	def display_llm_report(suggestions_log):
	if suggestions_log:
	st.subheader("🤖 LLM Cleaning Suggestions")
	for col, suggestion in suggestions_log:
	st.markdown(f"Column: `{col}`")
	if suggestion:
	st.code(suggestion, language="python")
	else:
	st.write("No suggestions or LLM response error.")