# NOTE: Hugging Face Spaces page residue ("Spaces: Sleeping") removed from this file.
# data_clean_final.py
import pandas as pd
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Load the local TinyLlama chat model once at module import time.
# NOTE(review): load_in_4bit requires the `bitsandbytes` package and a CUDA
# device; on a CPU-only host this load will fail — confirm the deployment target.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',   # let accelerate place the layers automatically
    load_in_4bit=True,
    torch_dtype='auto'
)
# BUG FIX: do not also pass `device=` to pipeline() — the model was loaded
# with device_map='auto' (accelerate dispatch), and transformers raises a
# ValueError when an explicit pipeline device is combined with that.
# Device placement is already handled by the model load above.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
| # Function to get data cleaning suggestions from LLM | |
# Function to get data cleaning suggestions from the local LLM
def suggest_llm_fixes_and_fill(column_name, examples):
    """Ask the local LLM to propose fixes for a problematic column.

    Parameters
    ----------
    column_name : str
        Name of the DataFrame column being cleaned.
    examples : iterable
        Sample values (good and bad) from the column; falsy entries are
        skipped when building the prompt.

    Returns
    -------
    str
        The model's continuation (expected: a Python list of
        (original, replacement, reason) tuples), or an error message
        string if generation failed.
    """
    examples_text = "\n".join(f"- {ex}" for ex in examples if ex)
    prompt = (
        f"You are a data cleaning assistant. Some entries in the "
        f"'{column_name}' column are missing or inconsistent.\n"
        "Examine these sample values.\n"
        f"{examples_text}\n"
        "Return ONLY a valid Python list of tuples, like:\n"
        '[("original_value1", "replacement1", "reason"), '
        '("original_value2", "replacement2", "reason"), ...]\n'
        # BUG FIX: the dash here was mojibake ("β") in the original source.
        "No explanation or extra text — just the list.\n"
    )
    try:
        response = generator(prompt, max_new_tokens=200, do_sample=True, temperature=0.7)
        # The pipeline echoes the prompt; keep only the model's continuation.
        return response[0]['generated_text'].split(prompt)[-1].strip()
    except Exception as e:
        # BUG FIX: original message read "LLM for error column ..." (garbled word order).
        error_message = f"LLM error for column {column_name}: {str(e)}"
        st.error(error_message)
        return error_message
# Cells that match this whole-value pattern are treated as missing placeholders.
# BUG FIX: the original pattern r'none|null|n/a|na|\?+missing|unknown' had a
# missing '|' (fusing '?+' and 'missing') and, being unanchored, matched any
# value merely *containing* 'na' (e.g. 'banana'). Anchored to the full cell.
_BAD_VALUE_PATTERN = r'^(none|null|n/a|na|\?+|missing|unknown)$'


def _normalize_text_column(series):
    """Strip, lower-case, and collapse newline characters in an object column."""
    out = series.str.strip().str.lower()
    # BUG FIX: the original chained Series.replace('\r', ...) after
    # .str.replace, which only matches *whole cells*, so carriage returns
    # inside strings were never removed. Both must be .str.replace.
    out = out.str.replace('\n', ' ', regex=False).str.replace('\r', ' ', regex=False)
    return out


def _bad_value_mask(series):
    """Boolean mask of cells considered missing or placeholder values."""
    mask = series.isna()
    if series.dtype == 'object':
        mask = mask | (series == '')
    mask = mask | series.astype(str).str.contains(
        _BAD_VALUE_PATTERN, na=False, case=False, regex=True
    )
    return mask


def _sample_examples(series, bad_mask):
    """Return up to 5 good and up to 5 bad sample values as a plain list.

    BUG FIX: the original added the two sample Series with '+', which
    aligns on index labels and yields NaN for every non-overlapping label,
    and it sized each sample from the wrong population (len(df) and
    isna().sum()), which could raise ValueError or miss bad values that
    were empty strings/placeholders. Concatenate plain lists and bound
    each sample by its own population size instead.
    """
    good = series[~bad_mask & series.notna()].drop_duplicates()
    good = good.sample(n=min(5, len(good)), random_state=1)
    bad = series[bad_mask]
    bad = bad.sample(n=min(5, len(bad)), random_state=1)
    return list(good) + list(bad)


def clean_data(file_path):
    """Load a CSV/TSV file, normalise its columns, and collect LLM hints.

    Parameters
    ----------
    file_path : str
        Path to a .csv or .tsv file.

    Returns
    -------
    (pandas.DataFrame, list[dict])
        The cleaned, de-duplicated frame (index reset) and a log of
        per-column LLM suggestions ({'col': ..., 'suggestion': ...}).
    """
    # Support CSV and TSV files; drop exact duplicate rows on load.
    sep = '\t' if file_path.endswith('.tsv') else ','
    df = pd.read_csv(file_path, sep=sep).drop_duplicates().copy()
    suggestions_log = []
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = _normalize_text_column(df[col])
            if any(keyword in col.lower() for keyword in ('date', 'year', 'time', 'timestamp')):
                # Strip every non-digit, then re-insert a hyphen between two
                # adjacent 4-digit years (e.g. '2000-2001' -> '20002001' -> '2000-2001').
                # Separators/whitespace in the original pattern were moot: the
                # line above has already removed every non-digit character.
                df[col] = df[col].str.replace(r'[^\d]', '', regex=True)
                df[col] = df[col].replace(
                    r'(?<!\d)(\d{4})(\d{4})(?!\d)', r'\1-\2', regex=True
                )
            # Remove currency symbols and thousands separators.
            if df[col].astype(str).str.contains(r'[$,]', na=False, regex=True).any():
                df[col] = df[col].str.replace(r'[$,]', '', regex=True)
        elif pd.api.types.is_numeric_dtype(df[col]):
            # Coerce stray non-numeric entries to NaN.
            df[col] = pd.to_numeric(df[col], errors='coerce')
        # LLM assistance for missing or placeholder values.
        bad_mask = _bad_value_mask(df[col])
        if bad_mask.any():
            examples = _sample_examples(df[col], bad_mask)
            # BUG FIX: original checked `examples is not None`, which was
            # always True; check for a non-empty sample instead.
            if examples:
                llm_suggestion = suggest_llm_fixes_and_fill(col, examples)
                suggestions_log.append({
                    'col': col,
                    'suggestion': llm_suggestion,
                })
    df = df.reset_index(drop=True)
    return df, suggestions_log
def display_llm_report(suggestions_log):
    """Render the LLM cleaning suggestions in the Streamlit UI.

    Parameters
    ----------
    suggestions_log : list[dict]
        Entries of the form {'col': <column name>, 'suggestion': <LLM text>}
        as produced by clean_data().
    """
    if not suggestions_log:
        return
    # BUG FIX: the leading emoji was mojibake ("π€") in the original source.
    st.subheader("🤖 LLM Cleaning Suggestions")
    # BUG FIX: entries are dicts, so the original tuple unpacking
    # `for col, suggestion in suggestions_log` bound the dict *keys*
    # ('col', 'suggestion') instead of the stored values.
    for entry in suggestions_log:
        col = entry['col']
        suggestion = entry['suggestion']
        st.markdown(f"**Column:** `{col}`")
        if suggestion:
            st.code(suggestion, language="python")
        else:
            st.write("No suggestions or LLM response error.")