translators-will committed on
Commit
edc5088
·
verified ·
1 Parent(s): d2e9523

Update data_clean_final.py

Browse files
Files changed (1) hide show
  1. data_clean_final.py +125 -130
data_clean_final.py CHANGED
@@ -1,130 +1,125 @@
1
- # data_clean_final.py
2
-
3
- import pandas as pd
4
- import os
5
- import streamlit as st
6
- from dotenv import load_dotenv
7
- import tempfile
8
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
9
- import torch
10
-
11
- # Load environment variables from .env file
12
- load_dotenv()
13
-
14
- # Load local TinyLlama model
15
- model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
16
- tokenizer = AutoTokenizer.from_pretrained(model_name)
17
- model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
18
- generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
19
-
20
-
21
- # Function to get data cleaning suggestions from LLM
22
- def suggest_llm_fixes_and_fill(column_name, examples):
23
-
24
- examples_text = "\n".join([f"- {ex}" for ex in examples if ex])
25
-
26
- prompt = (
27
- f"""You are a data cleaning assistant. Some entries in the '{column_name}' column are missing or inconsistent.\n
28
-
29
- Examine these sample values.
30
-
31
- {examples_text}
32
-
33
- Return ONLY a valid Python list of tuples, like:
34
- [("original_value1", "replacement1", "reason"), ("original_value2", "replacement2", "reason"), ...]
35
- No explanation or extra text — just the list.
36
- """
37
- )
38
-
39
- try:
40
- response = generator(prompt, max_new_tokens=200, do_sample=True, temperature=0.7)
41
- return response[0]['generated_text'].split(prompt)[-1].strip()
42
-
43
- except Exception as e:
44
- error_message = f"LLM for error column {column_name}: {str(e)}"
45
- st.error(error_message)
46
- return error_message
47
-
48
- def clean_data(file_path):
49
- # Support CSV and TSV files
50
- # Load data and drop duplicates
51
- if file_path.endswith('.tsv'):
52
- df = pd.read_csv(file_path, sep='\t').drop_duplicates().copy()
53
- else:
54
- df = pd.read_csv(file_path).drop_duplicates().copy()
55
-
56
- suggestions_log = []
57
-
58
- # Convert column types
59
- for col in df.columns:
60
- if df[col].dtype == 'object':
61
- df[col] = df[col].str.strip().str.lower() # Normalize text
62
-
63
- # Escape newline characters
64
- df[col] = df[col].str.replace('\n', ' ', regex=False).replace('\r', ' ', regex=False)
65
-
66
- if any(keyword in col.lower() for keyword in ['date', 'year', 'time', 'timestamp']):
67
- df[col] = df[col].str.replace(r'[^\d]', '', regex=True)
68
- # Normalize 4-digit year ranges (e.g., 2000-2001, 2000--2001, 20002001)
69
- df[col] = df[col].replace(
70
- r'(?<!\d)(\d{4})\s*[-–—./]?\s*(\d{4})(?!\d)', r'\1-\2', regex=True
71
- )
72
- # Remove currency symbols and commas
73
- if df[col].astype(str).str.contains(r'[$,]', na=False, regex=True).any():
74
- df[col] = df[col].str.replace(r'[$,]', '', regex=True)
75
-
76
- elif df[col].dtype in ['int64', 'float64'] or pd.api.types.is_numeric_dtype(df[col]): # Convert numeric columns to proper type
77
- df[col] = pd.to_numeric(df[col], errors='coerce')
78
-
79
- # LLM assistance or missing or weird values
80
- null_count = df[col].isnull().sum()
81
- empty_str_count = (df[col] == '').sum() if df[col].dtype == 'object' else 0
82
- pattern_matches = df[col].astype(str).str.contains(r'none|null|n/a|na|\?+missing|unknown',
83
- na=False, case=False, regex=True).sum()
84
-
85
- if null_count > 0 or empty_str_count > 0 or pattern_matches > 0:
86
- # Get examples for LLM analysis (both good and bad examples)
87
- # Get non-null, non-empty examples
88
- good_examples = df[col][df[col].notnull() & (df[col] != '')].drop_duplicates().sample(n=min(5, len(df)), random_state=1)
89
-
90
- # Get bad examples
91
- bad_examples = df[col][df[col].isna() | (df[col] == '') | df[col].astype(str).str.contains(r'none|null|n/a|na|\?+missing|unknown',
92
- na=False, case=False, regex=True)].sample(
93
- min(5, df[col].isna().sum()), random_state=1)
94
- # Combine good and bad examples
95
- examples = good_examples + bad_examples
96
-
97
- if examples is not None:
98
- llm_suggestion = suggest_llm_fixes_and_fill(col, examples)
99
- suggestions_log.append({
100
- 'col': col,
101
- 'suggestion': llm_suggestion
102
- })
103
-
104
- # Automatically apply replacements from LLM if in expected format
105
- # if suggestions_log:
106
- # try:
107
- # parsed = ast.literal_eval(llm_suggestion)
108
- # if isinstance(parsed, list) and all(isinstance(t, tuple) and len(t) == 3 for t in parsed):
109
- # for original, replacement, _ in parsed:
110
- # df[col] = df[col].replace(original, replacement)
111
- # else:
112
- # raise ValueError("Parsed suggestion is not a list of 3-item tuples.")
113
- # except Exception as e:
114
- # print(f"Failed to apply replacements for column {col}: {e}")
115
- # st.warning(f"❌ Could not parse suggestion for column `{col}`. Make sure the LLM returned a valid Python list of tuples.")
116
- # st.code(llm_suggestion, language="python")
117
-
118
- df = df.reset_index(drop=True)
119
-
120
- return df, suggestions_log
121
-
122
- def display_llm_report(suggestions_log):
123
- if suggestions_log:
124
- st.subheader("🤖 LLM Cleaning Suggestions")
125
- for col, suggestion in suggestions_log:
126
- st.markdown(f"**Column:** `{col}`")
127
- if suggestion:
128
- st.code(suggestion, language="python")
129
- else:
130
- st.write("No suggestions or LLM response error.")
 
1
+ # data_clean_final.py
2
+
3
+ import pandas as pd
4
+ import streamlit as st
5
+ import tempfile
6
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
7
+ import torch
8
+
# Load local TinyLlama model.
# NOTE(review): the tokenizer, model, and pipeline are constructed at import
# time, so merely importing this module downloads/loads a ~1.1B-parameter
# model before any function is called — confirm this is intended for the
# Streamlit app's startup path.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Half precision only when a GPU is available; CPU inference stays in float32.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
# device=0 targets the first CUDA device; -1 keeps the pipeline on CPU.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
14
+
15
+
16
# Function to get data cleaning suggestions from LLM
def suggest_llm_fixes_and_fill(column_name, examples):
    """Ask the local TinyLlama pipeline for cleanup suggestions for one column.

    Parameters
    ----------
    column_name : str
        Name of the column being cleaned (interpolated into the prompt).
    examples : iterable
        Sample values from the column; falsy entries are skipped.

    Returns
    -------
    str
        The model's raw completion (expected, but not guaranteed, to be a
        Python list of (original, replacement, reason) tuples), or an error
        message string if generation failed.
    """
    # One bullet per non-empty example; falsy values (None, '', NaN-ish) are dropped.
    examples_text = "\n".join(f"- {ex}" for ex in examples if ex)

    prompt = (
        f"""You are a data cleaning assistant. Some entries in the '{column_name}' column are missing or inconsistent.\n

Examine these sample values.

{examples_text}

Return ONLY a valid Python list of tuples, like:
[("original_value1", "replacement1", "reason"), ("original_value2", "replacement2", "reason"), ...]
No explanation or extra text — just the list.
"""
    )

    try:
        response = generator(prompt, max_new_tokens=200, do_sample=True, temperature=0.7)
        # The pipeline echoes the prompt; keep only the newly generated tail.
        return response[0]['generated_text'].split(prompt)[-1].strip()
    except Exception as e:
        # Bug fix: error message read "LLM for error column ..."; corrected wording.
        error_message = f"LLM error for column {column_name}: {str(e)}"
        st.error(error_message)
        return error_message
42
+
43
def clean_data(file_path):
    """Load a CSV/TSV file, normalize its columns, and collect LLM suggestions.

    Parameters
    ----------
    file_path : str
        Path to a ``.csv`` or ``.tsv`` file; the delimiter is chosen by extension.

    Returns
    -------
    tuple[pandas.DataFrame, list[dict]]
        The cleaned, de-duplicated frame (index reset) and a log of
        ``{'col': name, 'suggestion': text}`` entries for every column that
        contained missing or placeholder values.
    """
    # Support CSV and TSV files; drop exact duplicate rows up front.
    sep = '\t' if file_path.endswith('.tsv') else ','
    df = pd.read_csv(file_path, sep=sep).drop_duplicates().copy()

    suggestions_log = []

    # Placeholder tokens treated as "missing" when scanning string columns.
    missing_pattern = r'none|null|n/a|na|\?+missing|unknown'

    # Convert column types
    for col in df.columns:
        if df[col].dtype == 'object':
            # Normalize text: trim surrounding whitespace and lowercase.
            df[col] = df[col].str.strip().str.lower()

            # Flatten embedded newlines/carriage returns to spaces.
            # Bug fix: the original chained Series.replace for '\r', which only
            # matches values wholly equal to '\r'; both must be .str.replace.
            df[col] = df[col].str.replace('\n', ' ', regex=False).str.replace('\r', ' ', regex=False)

            if any(keyword in col.lower() for keyword in ['date', 'year', 'time', 'timestamp']):
                df[col] = df[col].str.replace(r'[^\d]', '', regex=True)
                # Normalize 4-digit year ranges (e.g., 2000-2001, 2000--2001, 20002001)
                df[col] = df[col].replace(
                    r'(?<!\d)(\d{4})\s*[-–—./]?\s*(\d{4})(?!\d)', r'\1-\2', regex=True
                )
            # Remove currency symbols and thousands separators.
            if df[col].astype(str).str.contains(r'[$,]', na=False, regex=True).any():
                df[col] = df[col].str.replace(r'[$,]', '', regex=True)

        elif df[col].dtype in ['int64', 'float64'] or pd.api.types.is_numeric_dtype(df[col]):
            # Coerce numeric columns; unparseable values become NaN.
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # LLM assistance for missing or placeholder values.
        null_count = df[col].isnull().sum()
        empty_str_count = (df[col] == '').sum() if df[col].dtype == 'object' else 0
        bad_mask = (
            df[col].isna()
            | (df[col] == '')
            | df[col].astype(str).str.contains(missing_pattern, na=False, case=False, regex=True)
        )
        pattern_matches = df[col].astype(str).str.contains(missing_pattern,
                                                           na=False, case=False, regex=True).sum()

        if null_count > 0 or empty_str_count > 0 or pattern_matches > 0:
            # Up to 5 distinct clean values for context.
            # Bug fix: sample size must be bounded by the filtered pool, not by
            # len(df) — sampling more than the pool size raised ValueError.
            good_pool = df[col][df[col].notnull() & (df[col] != '')].drop_duplicates()
            good_examples = good_pool.sample(n=min(5, len(good_pool)), random_state=1)

            # Up to 5 problematic values (NaN, empty, or placeholder tokens).
            # Bug fix: the sample size was min(5, isna().sum()), so placeholder
            # and empty-string matches were never sampled when there were no NaNs.
            bad_pool = df[col][bad_mask]
            bad_examples = bad_pool.sample(n=min(5, len(bad_pool)), random_state=1)

            # Bug fix: Series `+` aligns on index and adds element-wise (yielding
            # NaNs); pd.concat is the correct way to combine the two samples.
            examples = pd.concat([good_examples, bad_examples])

            if not examples.empty:
                llm_suggestion = suggest_llm_fixes_and_fill(col, examples)
                suggestions_log.append({
                    'col': col,
                    'suggestion': llm_suggestion
                })

    df = df.reset_index(drop=True)

    return df, suggestions_log
116
+
117
def display_llm_report(suggestions_log):
    """Render the LLM cleaning suggestions in the Streamlit UI.

    Parameters
    ----------
    suggestions_log : list[dict]
        Entries of the form ``{'col': column_name, 'suggestion': text}``
        as produced by ``clean_data``.
    """
    if suggestions_log:
        st.subheader("🤖 LLM Cleaning Suggestions")
        for entry in suggestions_log:
            # Bug fix: the original `for col, suggestion in suggestions_log`
            # unpacked each dict's KEYS, so it always displayed the literal
            # strings 'col' and 'suggestion' instead of the logged values.
            col = entry['col']
            suggestion = entry['suggestion']
            st.markdown(f"**Column:** `{col}`")
            if suggestion:
                st.code(suggestion, language="python")
            else:
                st.write("No suggestions or LLM response error.")