File size: 5,683 Bytes
edc5088
 
 
 
1f4f519
edc5088
 
ac4f597
 
 
 
 
 
 
edc5088
 
 
0e4b101
edc5088
 
 
 
 
 
 
 
 
 
0e4b101
 
edc5088
 
 
 
ac4f597
 
edc5088
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e4b101
 
edc5088
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e4b101
edc5088
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# data_clean_final.py

import pandas as pd
import streamlit as st
from ctransformers import AutoModelForCausalLM

# Load local TinyLlama model
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    model_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",  # Update this to match your GGUF file name
    model_type="llama",
    gpu_layers=0  # Adjust for GPU support if available
)
# generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)


# Function to get data cleaning suggestions from LLM
def suggest_fill_strategies(column_name, examples):

    examples_text = "\n".join([f"- {ex}" for ex in examples if ex])
    
    prompt = (
        f"""You are a data cleaning assistant. Some entries in the '{column_name}' column are missing or inconsistent.\n

        Examine these sample values.

        {examples_text}

        Generate a short, bulleted list of suggestions for how a data scientist might fill or impute missing values in this column.
        Only return the bulleted list. Do not include any other text.
        """
    )

    try:
        response = llm(prompt)
        return response.strip()
    
    except Exception as e:
        error_message = f"LLM for error column {column_name}: {str(e)}"
        st.error(error_message)
        return error_message

def clean_data(file_path):
    # Support CSV and TSV files
    # Load data and drop duplicates
    if file_path.endswith('.tsv'):
        df = pd.read_csv(file_path, sep='\t').drop_duplicates().copy()
    else:
        df = pd.read_csv(file_path).drop_duplicates().copy()

    suggestions_log = []

    # Convert column types
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].str.strip().str.lower()  # Normalize text

            # Escape newline characters
            df[col] = df[col].str.replace('\n', ' ', regex=False).replace('\r', ' ', regex=False)
            
            if any(keyword in col.lower() for keyword in ['date', 'year', 'time', 'timestamp']):
                df[col] = df[col].str.replace(r'[^\d]', '', regex=True)
                # Normalize 4-digit year ranges (e.g., 2000-2001, 2000--2001, 20002001)
                df[col] = df[col].replace(
                    r'(?<!\d)(\d{4})\s*[-–—./]?\s*(\d{4})(?!\d)', r'\1-\2', regex=True
                )
            # Remove currency symbols and commas
            if df[col].astype(str).str.contains(r'[$,]', na=False, regex=True).any():
                df[col] = df[col].str.replace(r'[$,]', '', regex=True)

        elif df[col].dtype in ['int64', 'float64'] or pd.api.types.is_numeric_dtype(df[col]):  # Convert numeric columns to proper type
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # LLM assistance or missing or weird values
        null_count = df[col].isnull().sum()
        empty_str_count = (df[col] == '').sum() if df[col].dtype == 'object' else 0
        pattern_matches = df[col].astype(str).str.contains(r'none|null|n/a|na|\?+missing|unknown',
                                                           na=False, case=False, regex=True).sum()
        
        if null_count > 0 or empty_str_count > 0 or pattern_matches > 0:
            # Get examples for LLM analysis (both good and bad examples)
            # Get non-null, non-empty examples
            good_examples = df[col][df[col].notnull() & (df[col] != '')].drop_duplicates().sample(n=min(5, len(df)), random_state=1)

            # Get bad examples
            bad_examples = df[col][df[col].isna() | (df[col] == '') | df[col].astype(str).str.contains(r'none|null|n/a|na|\?+missing|unknown', 
                                                                                                       na=False, case=False, regex=True)].sample(
                                                                                                           min(5, df[col].isna().sum()), random_state=1)
            # Combine good and bad examples
            examples = good_examples + bad_examples

            if examples is not None:
                llm_suggestion = suggest_fill_strategies(col, examples)
                suggestions_log.append((col, llm_suggestion))

                 # Automatically apply replacements from LLM if in expected format
                # if suggestions_log:
                #     try:
                #         parsed = ast.literal_eval(llm_suggestion)
                #         if isinstance(parsed, list) and all(isinstance(t, tuple) and len(t) == 3 for t in parsed):
                #             for original, replacement, _ in parsed:
                #                 df[col] = df[col].replace(original, replacement)
                #         else:
                #             raise ValueError("Parsed suggestion is not a list of 3-item tuples.")
                #     except Exception as e:
                #         print(f"Failed to apply replacements for column {col}: {e}")
                #         st.warning(f"❌ Could not parse suggestion for column `{col}`. Make sure the LLM returned a valid Python list of tuples.")
                #         st.code(llm_suggestion, language="python")

    df = df.reset_index(drop=True)

    return df, suggestions_log

def display_llm_report(suggestions_log):
    if suggestions_log:
        st.subheader("🤖 LLM Cleaning Suggestions")
        for col, suggestion in suggestions_log:
            st.markdown(f"**Column:** `{col}`")
            if suggestion:
                st.code(suggestion, language="python")
            else:
                st.write("No suggestions or LLM response error.")