rairo committed on
Commit
943ef38
·
verified ·
1 Parent(s): d78ab3e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +198 -0
app.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import os
4
+ from io import BytesIO
5
+ from google import genai
6
+ from google.genai import types
7
+ import pathlib
8
+ from typing import List, Dict
9
+ import json
10
+ import tempfile
11
+
12
# Initialize the Google Gemini client.
# The google-genai SDK (``from google import genai``) has no module-level
# ``configure()`` function -- that API belongs to the older
# ``google.generativeai`` package -- so the API key is handed directly to
# the client constructor instead.
client = genai.Client(api_key=st.secrets["GOOGLE_API_KEY"])
15
+
16
def convert_excel_to_csv(excel_file):
    """Load an Excel workbook into a DataFrame.

    Despite the name, no CSV is produced: the file is parsed with
    ``pd.read_excel`` using pandas' defaults and the resulting DataFrame is
    handed back to the caller.

    Args:
        excel_file: Path or file-like object accepted by ``pd.read_excel``.

    Returns:
        The parsed DataFrame, or ``None`` if reading failed (the error is
        shown in the Streamlit UI).
    """
    try:
        frame = pd.read_excel(excel_file)
    except Exception as e:
        # Surface the problem to the user instead of crashing the app.
        st.error(f"Error converting Excel file: {str(e)}")
        frame = None
    return frame
24
+
25
def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
    """Ask Gemini to describe the columns of a DataFrame.

    Only the first five rows are sent to the model, serialized as CSV,
    together with the filename and a prompt requesting a JSON report.

    Args:
        df: The data to analyze.
        filename: Name of the file the data came from; included in the
            request as extra context for the model.

    Returns:
        The parsed JSON object (keys requested by the prompt: ``subject``,
        ``columns``, ``key_columns``, ``issues``, ``suggested_renames``), or
        ``None`` if the request or the parsing failed. Failures are reported
        through ``st.error``.
    """
    # Convert sample of DataFrame to CSV string
    sample_csv = df.head(5).to_csv(index=False)

    analysis_prompt = """
    Analyze this CSV data and provide the following in JSON format:
    1. Identify the main subject/entity of this dataset
    2. List all columns and their likely content type (text, number, date, etc.)
    3. Identify potential key columns that could be used for merging with other datasets
    4. Flag any inconsistencies or data quality issues
    5. Suggest any column renamings for clarity

    Response format:
    {
        "subject": "string",
        "columns": [{"name": "string", "type": "string", "description": "string"}],
        "key_columns": ["string"],
        "issues": ["string"],
        "suggested_renames": {"old_name": "new_name"}
    }
    """

    try:
        response = client.models.generate_content(
            model="gemini-2.0-flash-thinking-exp",
            contents=[
                # ``text`` is passed by keyword: newer google-genai releases
                # make it keyword-only, and older ones accept it either way.
                types.Part.from_text(
                    text=f"Filename: {filename}\n\nCSV Data:\n{sample_csv}"
                ),
                analysis_prompt,
            ],
        )
        return _parse_json_object(response.text)
    except Exception as e:
        # Any SDK, network or parsing failure is shown to the user; the
        # caller treats ``None`` as "skip this file".
        st.error(f"Error analyzing columns: {str(e)}")
        return None


def _parse_json_object(raw_text):
    """Extract a JSON object from a model reply that may wrap it in extra text."""
    text = (raw_text or "").strip()
    if not text:
        raise ValueError("model returned an empty response")

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Models frequently wrap JSON in Markdown fences or add commentary;
        # fall back to the outermost ``{...}`` span before giving up.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(text[start:end + 1])

    # Callers index the result by key, so anything but an object is unusable.
    if not isinstance(parsed, dict):
        raise ValueError("expected a JSON object in the model response")
    return parsed
63
+
64
def find_common_columns(dataframes: List[Dict]) -> List[str]:
    """Return key columns that Gemini reported for more than one dataset.

    Args:
        dataframes: One dict per processed file. Each dict's ``'analysis'``
            entry, when it is a dict, may hold a ``'key_columns'`` list of
            column names. Entries without a usable analysis are ignored.

    Returns:
        Column names listed as key columns by at least two different
        datasets, in order of first appearance.
    """
    from collections import Counter

    column_freq = Counter()
    for df_info in dataframes:
        analysis = df_info.get('analysis')
        if not isinstance(analysis, dict):
            continue

        key_columns = analysis.get('key_columns')
        if not isinstance(key_columns, (list, tuple)):
            # Missing or malformed model output: nothing to count.
            continue

        # Count each column at most once per dataset, so a name repeated
        # within a single file is not mistaken for a cross-file match.
        # ``dict.fromkeys`` de-duplicates while keeping first-seen order.
        unique_columns = list(
            dict.fromkeys(col for col in key_columns if isinstance(col, str))
        )
        column_freq.update(unique_columns)

    # Keep only columns that appear in multiple datasets
    return [col for col, freq in column_freq.items() if freq > 1]
78
+
79
def merge_dataframes(dataframes: List[Dict], common_columns: List[str]) -> pd.DataFrame:
    """Outer-join every processed DataFrame on the given columns.

    The first entry's DataFrame is copied and each following one is merged
    into it in order. Overlapping non-key columns coming from the right-hand
    frame are suffixed with ``_<filename>``. A frame that fails to merge is
    reported via ``st.error`` and skipped.

    Args:
        dataframes: Dicts carrying at least ``'df'`` and ``'filename'``.
        common_columns: Column names to join on.

    Returns:
        The merged DataFrame, or ``None`` when ``dataframes`` is empty.
    """
    if not dataframes:
        return None

    # Seed the result with a copy so the caller's first frame is untouched.
    first, *others = dataframes
    result = first['df'].copy()

    for entry in others:
        try:
            result = result.merge(
                entry['df'],
                on=common_columns,
                how='outer',
                suffixes=(None, f'_{entry["filename"]}'),
            )
        except Exception as e:
            # Keep whatever has been merged so far and move on.
            st.error(f"Error merging {entry['filename']}: {str(e)}")

    return result
102
+
103
def _read_uploaded_file(uploaded_file):
    """Load one uploaded CSV/Excel file into a DataFrame, or return None on failure."""
    # Compare the extension case-insensitively so "REPORT.XLSX" is not
    # mistakenly parsed as CSV.
    if uploaded_file.name.lower().endswith(('.xlsx', '.xls')):
        return convert_excel_to_csv(uploaded_file)
    try:
        return pd.read_csv(uploaded_file)
    except Exception as e:
        # Mirror the Excel path: report the problem and skip the file
        # rather than crashing the whole app.
        st.error(f"Error reading CSV file: {str(e)}")
        return None


def _apply_suggested_renames(df, analysis):
    """Rename df columns per the analysis, keep its key columns in sync; return True if renamed."""
    if not isinstance(analysis, dict):
        return False
    renames = analysis.get('suggested_renames')
    if not isinstance(renames, dict):
        return False

    # Ignore suggestions for columns that do not exist or with unusable targets.
    applicable = {
        old: new
        for old, new in renames.items()
        if old in df.columns and isinstance(new, str) and new
    }
    if not applicable:
        return False

    df.rename(columns=applicable, inplace=True)

    # The key columns were reported under the old names; translate them so a
    # later merge looks for columns that actually exist in the renamed frame.
    key_columns = analysis.get('key_columns')
    if isinstance(key_columns, list):
        analysis['key_columns'] = [
            applicable.get(col, col) if isinstance(col, str) else col
            for col in key_columns
        ]
    return True


def main():
    """Run the Streamlit UI: upload files, analyze them with Gemini, merge, and export.

    Each uploaded CSV/Excel file is read into a DataFrame and analyzed; the
    analysis is displayed and its suggested column renames are applied. When
    at least two files were processed, the key columns shared between them
    are offered for selection and the frames are outer-merged on the chosen
    columns. The merged data can be previewed and downloaded as CSV, and
    row/column counts plus per-column missing-value counts are shown.
    """
    st.title("Smart CSV Processor")
    st.write("Upload CSV or Excel files for intelligent analysis and merging.")

    # File uploader
    uploaded_files = st.file_uploader(
        "Choose files",
        accept_multiple_files=True,
        type=['csv', 'xlsx', 'xls'],
    )
    if not uploaded_files:
        return

    st.write("### Processing Files")

    # Process each file and store DataFrames with their analysis
    processed_files = []
    for uploaded_file in uploaded_files:
        st.write(f"#### Analyzing: {uploaded_file.name}")

        df = _read_uploaded_file(uploaded_file)
        if df is None:
            continue

        # Analyze columns using Gemini
        analysis = analyze_columns(df, uploaded_file.name)
        if not analysis:
            continue

        st.write("Column Analysis:")
        st.json(analysis)

        if _apply_suggested_renames(df, analysis):
            st.write("Applied suggested column renames.")

        processed_files.append({
            'filename': uploaded_file.name,
            'df': df,
            'analysis': analysis,
        })

    if len(processed_files) <= 1:
        st.warning("Please upload at least 2 files to merge.")
        return

    st.write("### Merging DataFrames")

    common_columns = find_common_columns(processed_files)
    if not common_columns:
        st.warning("No common columns found across datasets.")
        return

    st.write("Detected common columns:", common_columns)

    # Let user select columns to use for merging
    selected_columns = st.multiselect(
        "Select columns to use for merging",
        options=common_columns,
        default=common_columns,
    )
    if not selected_columns:
        return

    merged_df = merge_dataframes(processed_files, selected_columns)
    if merged_df is None:
        return

    st.write("### Preview of Merged Data")
    st.dataframe(merged_df.head())

    # Download button for merged CSV
    csv = merged_df.to_csv(index=False)
    st.download_button(
        label="Download Merged CSV",
        data=csv,
        file_name="merged_data.csv",
        mime="text/csv",
    )

    # Show statistics
    st.write("### Dataset Statistics")
    st.write(f"Total rows: {len(merged_df)}")
    st.write(f"Total columns: {len(merged_df.columns)}")

    # Show data quality metrics
    st.write("### Data Quality Metrics")
    missing_values = merged_df.isnull().sum()
    st.write("Missing values per column:")
    st.dataframe(missing_values)
196
+
197
# Script entry point: build the Streamlit UI only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()