Spaces:

translators-will
/

LLM-Data-Cleaner

Sleeping

App Files Files Community

LLM-Data-Cleaner / data_clean_final.py

translators-will

Update data_clean_final.py

94f2bf4 verified about 1 year ago

raw

history blame

5.89 kB

	# data_clean_final.py

	import pandas as pd
	import streamlit as st
	from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
	import torch

	# Load local TinyLlama model
	model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	model = AutoModelForCausalLM.from_pretrained(
	model_name,
	device_map='auto',
	load_in_4bit=True,
	torch_dtype='auto'
	)
	generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)


	# Function to get data cleaning suggestions from LLM
	def suggest_llm_fixes_and_fill(column_name, examples):

	examples_text = "\n".join([f"- {ex}" for ex in examples if ex])

	prompt = (
	f"""You are a data cleaning assistant. Some entries in the '{column_name}' column are missing or inconsistent.\n

	Examine these sample values.

	{examples_text}

	Return ONLY a valid Python list of tuples, like:
	[("original_value1", "replacement1", "reason"), ("original_value2", "replacement2", "reason"), ...]
	No explanation or extra text — just the list.
	"""
	)

	try:
	response = generator(prompt, max_new_tokens=200, do_sample=True, temperature=0.7)
	return response[0]['generated_text'].split(prompt)[-1].strip()

	except Exception as e:
	error_message = f"LLM for error column {column_name}: {str(e)}"
	st.error(error_message)
	return error_message

	def clean_data(file_path):
	# Support CSV and TSV files
	# Load data and drop duplicates
	if file_path.endswith('.tsv'):
	df = pd.read_csv(file_path, sep='\t').drop_duplicates().copy()
	else:
	df = pd.read_csv(file_path).drop_duplicates().copy()

	suggestions_log = []

	# Convert column types
	for col in df.columns:
	if df[col].dtype == 'object':
	df[col] = df[col].str.strip().str.lower() # Normalize text

	# Escape newline characters
	df[col] = df[col].str.replace('\n', ' ', regex=False).replace('\r', ' ', regex=False)

	if any(keyword in col.lower() for keyword in ['date', 'year', 'time', 'timestamp']):
	df[col] = df[col].str.replace(r'[^\d]', '', regex=True)
	# Normalize 4-digit year ranges (e.g., 2000-2001, 2000--2001, 20002001)
	df[col] = df[col].replace(
	r'(?<!\d)(\d{4})\s[-–—./]?\s(\d{4})(?!\d)', r'\1-\2', regex=True
	)
	# Remove currency symbols and commas
	if df[col].astype(str).str.contains(r'[$,]', na=False, regex=True).any():
	df[col] = df[col].str.replace(r'[$,]', '', regex=True)

	elif df[col].dtype in ['int64', 'float64'] or pd.api.types.is_numeric_dtype(df[col]): # Convert numeric columns to proper type
	df[col] = pd.to_numeric(df[col], errors='coerce')

	# LLM assistance or missing or weird values
	null_count = df[col].isnull().sum()
	empty_str_count = (df[col] == '').sum() if df[col].dtype == 'object' else 0
	pattern_matches = df[col].astype(str).str.contains(r'none\|null\|n/a\|na\|\?+missing\|unknown',
	na=False, case=False, regex=True).sum()

	if null_count > 0 or empty_str_count > 0 or pattern_matches > 0:
	# Get examples for LLM analysis (both good and bad examples)
	# Get non-null, non-empty examples
	good_examples = df[col][df[col].notnull() & (df[col] != '')].drop_duplicates().sample(n=min(5, len(df)), random_state=1)

	# Get bad examples
	bad_examples = df[col][df[col].isna() \| (df[col] == '') \| df[col].astype(str).str.contains(r'none\|null\|n/a\|na\|\?+missing\|unknown',
	na=False, case=False, regex=True)].sample(
	min(5, df[col].isna().sum()), random_state=1)
	# Combine good and bad examples
	examples = good_examples + bad_examples

	if examples is not None:
	llm_suggestion = suggest_llm_fixes_and_fill(col, examples)
	suggestions_log.append({
	'col': col,
	'suggestion': llm_suggestion
	})

	# Automatically apply replacements from LLM if in expected format
	# if suggestions_log:
	# try:
	# parsed = ast.literal_eval(llm_suggestion)
	# if isinstance(parsed, list) and all(isinstance(t, tuple) and len(t) == 3 for t in parsed):
	# for original, replacement, _ in parsed:
	# df[col] = df[col].replace(original, replacement)
	# else:
	# raise ValueError("Parsed suggestion is not a list of 3-item tuples.")
	# except Exception as e:
	# print(f"Failed to apply replacements for column {col}: {e}")
	# st.warning(f"❌ Could not parse suggestion for column `{col}`. Make sure the LLM returned a valid Python list of tuples.")
	# st.code(llm_suggestion, language="python")

	df = df.reset_index(drop=True)

	return df, suggestions_log

	def display_llm_report(suggestions_log):
	if suggestions_log:
	st.subheader("🤖 LLM Cleaning Suggestions")
	for col, suggestion in suggestions_log:
	st.markdown(f"Column: `{col}`")
	if suggestion:
	st.code(suggestion, language="python")
	else:
	st.write("No suggestions or LLM response error.")