rairo committed on
Commit
3a17237
·
verified ·
1 Parent(s): 60414d5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -29
app.py CHANGED
@@ -3,15 +3,13 @@ import pandas as pd
3
  import os
4
  from io import BytesIO
5
  import google.generativeai as genai
6
- from google.generativeai import types
7
- import pathlib
8
  from typing import List, Dict
9
  import json
10
  import tempfile
11
 
12
  # Initialize Google Gemini AI client
13
  genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
14
- client = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
15
 
16
  def convert_excel_to_csv(excel_file):
17
  """Convert Excel file to CSV and return the DataFrame"""
@@ -27,32 +25,28 @@ def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
27
  # Convert sample of DataFrame to CSV string
28
  sample_csv = df.head(5).to_csv(index=False)
29
 
30
- analysis_prompt = """
31
- Analyze this CSV data and provide the following in JSON format:
32
- 1. Identify the main subject/entity of this dataset
33
- 2. List all columns and their likely content type (text, number, date, etc.)
34
- 3. Identify potential key columns that could be used for merging with other datasets
35
- 4. Flag any inconsistencies or data quality issues
36
- 5. Suggest any column renamings for clarity
37
 
38
- Response format:
39
- {
40
- "subject": "string",
41
- "columns": [{"name": "string", "type": "string", "description": "string"}],
42
- "key_columns": ["string"],
43
- "issues": ["string"],
44
- "suggested_renames": {"old_name": "new_name"}
45
- }
 
 
 
 
 
 
 
46
  """
47
 
48
  try:
49
- response = client.generate_content(
50
- contents=[
51
- types.Part.from_text(f"Filename: {filename}\n\nCSV Data:\n{sample_csv}"),
52
- analysis_prompt
53
- ]
54
- )
55
-
56
  # Parse JSON response
57
  analysis = json.loads(response.text)
58
  return analysis
@@ -126,8 +120,13 @@ def main():
126
  df = pd.read_csv(uploaded_file)
127
 
128
  if df is not None:
 
 
 
 
129
  # Analyze columns using Gemini
130
- analysis = analyze_columns(df, uploaded_file.name)
 
131
 
132
  if analysis:
133
  st.write("Column Analysis:")
@@ -140,10 +139,12 @@ def main():
140
  'analysis': analysis
141
  })
142
 
143
- # Apply suggested column renames
144
- if 'suggested_renames' in analysis:
145
  df.rename(columns=analysis['suggested_renames'], inplace=True)
146
  st.write("Applied suggested column renames.")
 
 
147
 
148
  if len(processed_files) > 1:
149
  st.write("### Merging DataFrames")
@@ -163,7 +164,8 @@ def main():
163
 
164
  if selected_columns:
165
  # Merge DataFrames
166
- merged_df = merge_dataframes(processed_files, selected_columns)
 
167
 
168
  if merged_df is not None:
169
  st.write("### Preview of Merged Data")
@@ -188,6 +190,10 @@ def main():
188
  missing_values = merged_df.isnull().sum()
189
  st.write("Missing values per column:")
190
  st.dataframe(missing_values)
 
 
 
 
191
  else:
192
  st.warning("No common columns found across datasets.")
193
  else:
 
3
  import os
4
  from io import BytesIO
5
  import google.generativeai as genai
 
 
6
  from typing import List, Dict
7
  import json
8
  import tempfile
9
 
10
  # Initialize Google Gemini AI client
11
  genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
12
+ model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
13
 
14
  def convert_excel_to_csv(excel_file):
15
  """Convert Excel file to CSV and return the DataFrame"""
 
25
  # Convert sample of DataFrame to CSV string
26
  sample_csv = df.head(5).to_csv(index=False)
27
 
28
+ analysis_prompt = f"""
29
+ Analyze this CSV data from file '{filename}' and provide the following in JSON format:
 
 
 
 
 
30
 
31
+ CSV Data:
32
+ {sample_csv}
33
+
34
+ Provide analysis in this exact JSON format:
35
+ {{
36
+ "subject": "string describing main subject of dataset",
37
+ "columns": [
38
+ {{"name": "column_name", "type": "data_type", "description": "column description"}}
39
+ ],
40
+ "key_columns": ["potential columns for merging"],
41
+ "issues": ["list of data quality issues found"],
42
+ "suggested_renames": {{"old_name": "new_name"}}
43
+ }}
44
+
45
+ Only respond with the JSON object, no additional text.
46
  """
47
 
48
  try:
49
+ response = model.generate_content(analysis_prompt)
 
 
 
 
 
 
50
  # Parse JSON response
51
  analysis = json.loads(response.text)
52
  return analysis
 
120
  df = pd.read_csv(uploaded_file)
121
 
122
  if df is not None:
123
+ # Show initial data preview
124
+ st.write("Initial Preview:")
125
+ st.dataframe(df.head())
126
+
127
  # Analyze columns using Gemini
128
+ with st.spinner("Analyzing columns with AI..."):
129
+ analysis = analyze_columns(df, uploaded_file.name)
130
 
131
  if analysis:
132
  st.write("Column Analysis:")
 
139
  'analysis': analysis
140
  })
141
 
142
+ # Apply suggested column renames if any
143
+ if 'suggested_renames' in analysis and analysis['suggested_renames']:
144
  df.rename(columns=analysis['suggested_renames'], inplace=True)
145
  st.write("Applied suggested column renames.")
146
+ st.write("Updated Preview:")
147
+ st.dataframe(df.head())
148
 
149
  if len(processed_files) > 1:
150
  st.write("### Merging DataFrames")
 
164
 
165
  if selected_columns:
166
  # Merge DataFrames
167
+ with st.spinner("Merging datasets..."):
168
+ merged_df = merge_dataframes(processed_files, selected_columns)
169
 
170
  if merged_df is not None:
171
  st.write("### Preview of Merged Data")
 
190
  missing_values = merged_df.isnull().sum()
191
  st.write("Missing values per column:")
192
  st.dataframe(missing_values)
193
+
194
+ # Show duplicate check
195
+ duplicates = merged_df.duplicated().sum()
196
+ st.write(f"Number of duplicate rows: {duplicates}")
197
  else:
198
  st.warning("No common columns found across datasets.")
199
  else: