Spaces:
Sleeping
Sleeping
| # data_clean_final.py | |
| import pandas as pd | |
| import streamlit as st | |
| from ctransformers import AutoModelForCausalLM | |
# Load the local TinyLlama chat model via ctransformers.
# This runs at import time and downloads/loads a GGUF quantized model from the
# Hugging Face hub cache; the returned `llm` object is callable (prompt -> str).
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    model_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",  # Update this to match your GGUF file name
    model_type="llama",
    gpu_layers=0  # CPU-only by default; raise for GPU offload if available
)
# Earlier transformers-pipeline approach, kept for reference:
# generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
| # Function to get data cleaning suggestions from LLM | |
def suggest_fill_strategies(column_name, examples):
    """Ask the local LLM how to fill/impute missing values in a column.

    Parameters
    ----------
    column_name : str
        Name of the column being analyzed (used in the prompt and errors).
    examples : iterable
        Sample values from the column; falsy entries are skipped.

    Returns
    -------
    str
        The LLM's bulleted suggestion list, or an error message if the
        model call failed (the error is also surfaced via st.error).
    """
    examples_text = "\n".join(f"- {ex}" for ex in examples if ex)
    prompt = (
        f"""You are a data cleaning assistant. Some entries in the '{column_name}' column are missing or inconsistent.\n
Examine these sample values.
{examples_text}
Generate a short, bulleted list of suggestions for how a data scientist might fill or impute missing values in this column.
Only return the bulleted list. Do not include any other text.
"""
    )
    try:
        response = llm(prompt)
        return response.strip()
    except Exception as e:
        # BUG FIX: original message read "LLM for error column ..." (garbled).
        error_message = f"LLM error for column {column_name}: {str(e)}"
        st.error(error_message)
        return error_message
def _collect_examples(series, placeholder_mask, k=5, seed=1):
    """Sample up to k 'good' and k 'bad' values from a column for the LLM prompt.

    BUG FIXES vs. the original:
    - sample size is bounded by each pool's own length (the original used
      len(df) / isna().sum(), which could raise ValueError or sample 0 even
      when placeholder values existed);
    - pools are combined with pd.concat — `good + bad` on Series is
      element-wise addition aligned on index, not concatenation.
    """
    good_pool = series[series.notnull() & (series != '')].drop_duplicates()
    good = good_pool.sample(n=min(k, len(good_pool)), random_state=seed)
    bad_pool = series[series.isna() | (series == '') | placeholder_mask]
    bad = bad_pool.sample(n=min(k, len(bad_pool)), random_state=seed)
    return pd.concat([good, bad]).tolist()


def clean_data(file_path):
    """Load a CSV/TSV file, normalize it, and gather LLM fill suggestions.

    Cleaning steps: drop duplicate rows; strip/lowercase text columns and
    collapse embedded newlines; keep only digits in date-like columns and
    re-hyphenate 4-digit year ranges; strip currency symbols/commas; coerce
    numeric columns; for each column with missing or placeholder values,
    ask the LLM (via suggest_fill_strategies) how to impute them.

    Parameters
    ----------
    file_path : str
        Path to a .csv or .tsv file (extension selects the separator).

    Returns
    -------
    (pandas.DataFrame, list[tuple[str, str]])
        The cleaned frame (index reset) and a log of (column, suggestion).
    """
    sep = '\t' if file_path.endswith('.tsv') else ','
    df = pd.read_csv(file_path, sep=sep).drop_duplicates().copy()
    suggestions_log = []
    # Anchored pattern: the original used an unanchored substring search,
    # which flagged ordinary words containing "na" (e.g. "banana"), and
    # `\?+missing` only ever matched the literal text "?missing".
    missing_pattern = r'^(?:none|null|n/a|na|unknown|missing|\?+)$'
    for col in df.columns:
        if df[col].dtype == 'object':
            # Normalize text: trim, lowercase, collapse embedded line breaks.
            df[col] = df[col].str.strip().str.lower()
            df[col] = df[col].str.replace('\n', ' ', regex=False)
            # BUG FIX: the original chained Series.replace here, which only
            # replaces values that are *exactly* '\r', never embedded ones.
            df[col] = df[col].str.replace('\r', ' ', regex=False)
            if any(keyword in col.lower() for keyword in ('date', 'year', 'time', 'timestamp')):
                # Keep digits only, then re-hyphenate 4-digit year ranges
                # (e.g. "2000-2001", "2000--2001", "20002001" -> "2000-2001").
                df[col] = df[col].str.replace(r'[^\d]', '', regex=True)
                df[col] = df[col].str.replace(
                    r'(?<!\d)(\d{4})\s*[-–—./]?\s*(\d{4})(?!\d)', r'\1-\2', regex=True
                )
            # Strip currency symbols and thousands separators where present.
            if df[col].astype(str).str.contains(r'[$,]', na=False, regex=True).any():
                df[col] = df[col].str.replace(r'[$,]', '', regex=True)
        elif pd.api.types.is_numeric_dtype(df[col]):
            # Coerce numeric columns; unparseable entries become NaN.
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # Detect missing / placeholder values and ask the LLM for strategies.
        null_count = df[col].isnull().sum()
        empty_str_count = (df[col] == '').sum() if df[col].dtype == 'object' else 0
        placeholder_mask = df[col].astype(str).str.contains(
            missing_pattern, na=False, case=False, regex=True
        )
        if null_count > 0 or empty_str_count > 0 or placeholder_mask.sum() > 0:
            examples = _collect_examples(df[col], placeholder_mask)
            if examples:
                llm_suggestion = suggest_fill_strategies(col, examples)
                suggestions_log.append((col, llm_suggestion))
    df = df.reset_index(drop=True)
    return df, suggestions_log
def display_llm_report(suggestions_log):
    """Render the collected LLM cleaning suggestions in the Streamlit UI.

    Parameters
    ----------
    suggestions_log : list[tuple[str, str]]
        Pairs of (column name, LLM suggestion text). Nothing is rendered
        when the list is empty.
    """
    if not suggestions_log:
        return
    st.subheader("🤖 LLM Cleaning Suggestions")
    for column_name, suggestion in suggestions_log:
        st.markdown(f"**Column:** `{column_name}`")
        if not suggestion:
            st.write("No suggestions or LLM response error.")
        else:
            st.code(suggestion, language="python")