# data_clean_final.py
"""Streamlit data-cleaning helper backed by a local TinyLlama (GGUF) model.

Loads a CSV/TSV into pandas, normalises text columns, detects columns with
missing/placeholder values, and asks a locally-hosted LLM for imputation
suggestions which are rendered in the Streamlit UI.
"""

import pandas as pd
import streamlit as st
from ctransformers import AutoModelForCausalLM

# Load local TinyLlama model (CPU by default; bump gpu_layers for GPU offload).
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    model_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",  # Update to match your GGUF file name
    model_type="llama",
    gpu_layers=0,  # Adjust for GPU support if available
)

# Regex of placeholder strings that indicate a missing value.
# NOTE(review): the original pattern read r'\?+missing' (no pipe) — almost
# certainly a garbled r'\?+|missing'; fixed, but verify against intent.
MISSING_PATTERN = r'none|null|n/a|na|\?+|missing|unknown'


def suggest_fill_strategies(column_name, examples):
    """Ask the local LLM how to fill/impute missing values in one column.

    Parameters
    ----------
    column_name : str
        Name of the column being analysed (interpolated into the prompt).
    examples : iterable
        Sample cell values (good and bad); falsy entries are skipped.

    Returns
    -------
    str
        The LLM's bulleted suggestion list, or the error message if the
        model call fails (the error is also surfaced via st.error).
    """
    examples_text = "\n".join(f"- {ex}" for ex in examples if ex)
    prompt = (
        f"""You are a data cleaning assistant. Some entries in the '{column_name}' column are missing or inconsistent.\n
Examine these sample values.
{examples_text}
Generate a short, bulleted list of suggestions for how a data scientist might fill or impute missing values in this column.
Only return the bulleted list. Do not include any other text.
"""
    )
    try:
        response = llm(prompt)
        return response.strip()
    except Exception as e:
        # BUG FIX: message previously read "LLM for error column ..." (garbled).
        error_message = f"LLM error for column {column_name}: {str(e)}"
        st.error(error_message)
        return error_message


def clean_data(file_path):
    """Load, deduplicate, and normalise a CSV/TSV file; gather LLM suggestions.

    Parameters
    ----------
    file_path : str
        Path to a .csv or .tsv file (separator inferred from the extension).

    Returns
    -------
    (pandas.DataFrame, list[tuple[str, str]])
        The cleaned frame (index reset) and a log of
        (column_name, llm_suggestion) pairs for columns with missing data.
    """
    # Load data and drop duplicates; support CSV and TSV.
    sep = '\t' if file_path.endswith('.tsv') else ','
    df = pd.read_csv(file_path, sep=sep).drop_duplicates().copy()

    suggestions_log = []

    for col in df.columns:
        if df[col].dtype != 'object':
            continue

        # Normalize text: trim surrounding whitespace, lowercase.
        df[col] = df[col].str.strip().str.lower()
        # Collapse embedded newlines/carriage returns to spaces.
        # BUG FIX: the second replace was Series.replace (whole-cell match),
        # not a substring replace — chain .str.replace for both.
        df[col] = (
            df[col]
            .str.replace('\n', ' ', regex=False)
            .str.replace('\r', ' ', regex=False)
        )

        if any(k in col.lower() for k in ('date', 'year', 'time', 'timestamp')):
            # Keep digits only for date-like columns.
            df[col] = df[col].str.replace(r'[^\d]', '', regex=True)
            # Normalize 4-digit year ranges (e.g. 2000-2001, 2000--2001,
            # 20002001 -> 2000-2001).
            # NOTE(review): the original regex was truncated in the source
            # ("r'(?"); reconstructed from the comment above — verify.
            df[col] = df[col].replace(
                r'(?<!\d)(\d{4})-{0,2}(\d{4})(?!\d)', r'\1-\2', regex=True
            )

        # NOTE(review): these three counts were lost to source truncation;
        # reconstructed from the variable names used in the condition below.
        null_count = df[col].isna().sum()
        empty_str_count = (df[col] == '').sum()
        pattern_matches = (
            df[col].astype(str)
            .str.contains(MISSING_PATTERN, na=False, case=False, regex=True)
            .sum()
        )

        if null_count > 0 or empty_str_count > 0 or pattern_matches > 0:
            # Good examples: distinct, non-null, non-empty values.
            # BUG FIX: sample size was capped by len(df), not by the candidate
            # pool, raising ValueError for columns with < 5 distinct values.
            candidates = df[col][df[col].notnull() & (df[col] != '')].drop_duplicates()
            good_examples = candidates.sample(
                n=min(5, len(candidates)), random_state=1
            )

            # Bad examples: null, empty, or placeholder-looking values.
            # BUG FIX: sample size was capped by the NaN count only, though the
            # mask also matches empty strings and placeholder patterns.
            bad_mask = (
                df[col].isna()
                | (df[col] == '')
                | df[col].astype(str).str.contains(
                    MISSING_PATTERN, na=False, case=False, regex=True
                )
            )
            bad_rows = df[col][bad_mask]
            bad_examples = bad_rows.sample(n=min(5, len(bad_rows)), random_state=1)

            # BUG FIX: Series + Series is index-aligned element-wise addition
            # (mostly NaN), not concatenation — use pd.concat.
            examples = pd.concat([good_examples, bad_examples])

            # BUG FIX: `examples is not None` was always True for a Series;
            # guard on emptiness instead.
            if not examples.empty:
                llm_suggestion = suggest_fill_strategies(col, examples.tolist())
                suggestions_log.append((col, llm_suggestion))

    df = df.reset_index(drop=True)
    return df, suggestions_log


def display_llm_report(suggestions_log):
    """Render the (column, suggestion) log from clean_data in the Streamlit UI.

    Parameters
    ----------
    suggestions_log : list[tuple[str, str]]
        Pairs of column name and LLM suggestion text; empty/None suggestions
        get a fallback message. Nothing is rendered when the log is empty.
    """
    if suggestions_log:
        st.subheader("🤖 LLM Cleaning Suggestions")
        for col, suggestion in suggestions_log:
            st.markdown(f"**Column:** `{col}`")
            if suggestion:
                st.code(suggestion, language="python")
            else:
                st.write("No suggestions or LLM response error.")