shvy committed on
Commit
5b85e09
·
verified ·
1 Parent(s): d410251

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -21
app.py CHANGED
@@ -1,13 +1,11 @@
1
  import gradio as gr
2
  import pandas as pd
3
- from transformers import pipeline # Using Hugging Face LLM
4
  import matplotlib.pyplot as plt
5
  import seaborn as sns
6
  import io
 
7
 
8
- # Load Hugging Face's text generation model
9
- generator = pipeline("text-generation", model="mistralai/Mistral-7B-v0.1")
10
-
11
  def analyze_dataset(file):
12
  # Load dataset
13
  df = pd.read_csv(file)
@@ -17,25 +15,52 @@ def analyze_dataset(file):
17
  missing_values = df.isnull().sum().to_string()
18
  duplicates = df.duplicated().sum()
19
 
20
- # Prompt LLM for insights
21
- prompt = f"""
22
- Given the following dataset summary:
23
- {summary}
24
- Missing Values:
25
- {missing_values}
26
- Duplicate Entries: {duplicates}
27
 
28
- Provide a structured analysis, visualization suggestions, and cleaning strategies.
29
- """
30
- response = generator(prompt, max_length=500)
31
- insights = response[0]['generated_text']
 
 
 
 
 
 
 
 
 
 
32
 
33
  # Generate visualizations
34
- fig, ax = plt.subplots(figsize=(6, 4))
35
- sns.heatmap(df.corr(), annot=True, cmap='coolwarm', ax=ax)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  buf = io.BytesIO()
37
  plt.savefig(buf, format='png')
38
  buf.seek(0)
 
39
 
40
  return insights, buf
41
 
@@ -43,9 +68,13 @@ def analyze_dataset(file):
43
  demo = gr.Interface(
44
  fn=analyze_dataset,
45
  inputs=gr.File(type="filepath"),
46
- outputs=[gr.Textbox(label="Analysis"), gr.Image(label="Correlation Heatmap")],
47
- title="LLM-Powered Data Analyzer",
48
- description="Upload a dataset and get automatic insights, visualizations, and cleaning suggestions."
 
 
 
49
  )
50
 
51
- demo.launch()
 
 
1
  import gradio as gr
2
  import pandas as pd
 
3
  import matplotlib.pyplot as plt
4
  import seaborn as sns
5
  import io
6
+ from transformers import pipeline
7
 
8
+ # Use an open-source model that doesn't require authentication
 
 
9
  def analyze_dataset(file):
10
  # Load dataset
11
  df = pd.read_csv(file)
 
15
  missing_values = df.isnull().sum().to_string()
16
  duplicates = df.duplicated().sum()
17
 
18
+ # Prepare analysis text
19
+ insights = f"""Dataset Analysis:
 
 
 
 
 
20
 
21
+ Summary Statistics:
22
+ {summary}
23
+
24
+ Missing Values:
25
+ {missing_values}
26
+
27
+ Duplicate Entries: {duplicates}
28
+
29
+ Recommended Cleaning Strategies:
30
+ 1. Handle missing values through imputation or removal
31
+ 2. Remove or investigate duplicate entries
32
+ 3. Consider normalizing numerical features
33
+ 4. Check for outliers in the dataset
34
+ """
35
 
36
  # Generate visualizations
37
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
38
+
39
+ # Correlation heatmap
40
+ try:
41
+ sns.heatmap(df.select_dtypes(include=['float64', 'int64']).corr(),
42
+ annot=True, cmap='coolwarm', ax=ax1)
43
+ ax1.set_title('Correlation Heatmap')
44
+ except Exception as e:
45
+ ax1.text(0.5, 0.5, f"Correlation plot error: {str(e)}",
46
+ horizontalalignment='center', verticalalignment='center')
47
+
48
+ # Distribution plot for numerical columns
49
+ numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
50
+ if len(numerical_cols) > 0:
51
+ df[numerical_cols].hist(ax=ax2, bins=15)
52
+ ax2.set_title('Numerical Features Distribution')
53
+ else:
54
+ ax2.text(0.5, 0.5, "No numerical columns for distribution",
55
+ horizontalalignment='center', verticalalignment='center')
56
+
57
+ plt.tight_layout()
58
+
59
+ # Save plot to buffer
60
  buf = io.BytesIO()
61
  plt.savefig(buf, format='png')
62
  buf.seek(0)
63
+ plt.close()
64
 
65
  return insights, buf
66
 
 
68
  demo = gr.Interface(
69
  fn=analyze_dataset,
70
  inputs=gr.File(type="filepath"),
71
+ outputs=[
72
+ gr.Textbox(label="Analysis"),
73
+ gr.Image(label="Data Visualizations")
74
+ ],
75
+ title="Data Analyzer",
76
+ description="Upload a CSV file for automatic data analysis and visualization."
77
  )
78
 
79
+ # Launch the interface
80
+ demo.launch(share=True)