Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,15 +3,13 @@ import pandas as pd
|
|
| 3 |
import os
|
| 4 |
from io import BytesIO
|
| 5 |
import google.generativeai as genai
|
| 6 |
-
from google.generativeai import types
|
| 7 |
-
import pathlib
|
| 8 |
from typing import List, Dict
|
| 9 |
import json
|
| 10 |
import tempfile
|
| 11 |
|
| 12 |
# Initialize Google Gemini AI client
|
| 13 |
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
|
| 14 |
-
|
| 15 |
|
| 16 |
def convert_excel_to_csv(excel_file):
|
| 17 |
"""Convert Excel file to CSV and return the DataFrame"""
|
|
@@ -27,32 +25,28 @@ def analyze_columns(df: pd.DataFrame, filename: str) -> dict:
|
|
| 27 |
# Convert sample of DataFrame to CSV string
|
| 28 |
sample_csv = df.head(5).to_csv(index=False)
|
| 29 |
|
| 30 |
-
analysis_prompt = """
|
| 31 |
-
Analyze this CSV data and provide the following in JSON format:
|
| 32 |
-
1. Identify the main subject/entity of this dataset
|
| 33 |
-
2. List all columns and their likely content type (text, number, date, etc.)
|
| 34 |
-
3. Identify potential key columns that could be used for merging with other datasets
|
| 35 |
-
4. Flag any inconsistencies or data quality issues
|
| 36 |
-
5. Suggest any column renamings for clarity
|
| 37 |
|
| 38 |
-
|
| 39 |
-
{
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
"
|
| 44 |
-
"
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
"""
|
| 47 |
|
| 48 |
try:
|
| 49 |
-
response =
|
| 50 |
-
contents=[
|
| 51 |
-
types.Part.from_text(f"Filename: {filename}\n\nCSV Data:\n{sample_csv}"),
|
| 52 |
-
analysis_prompt
|
| 53 |
-
]
|
| 54 |
-
)
|
| 55 |
-
|
| 56 |
# Parse JSON response
|
| 57 |
analysis = json.loads(response.text)
|
| 58 |
return analysis
|
|
@@ -126,8 +120,13 @@ def main():
|
|
| 126 |
df = pd.read_csv(uploaded_file)
|
| 127 |
|
| 128 |
if df is not None:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
# Analyze columns using Gemini
|
| 130 |
-
|
|
|
|
| 131 |
|
| 132 |
if analysis:
|
| 133 |
st.write("Column Analysis:")
|
|
@@ -140,10 +139,12 @@ def main():
|
|
| 140 |
'analysis': analysis
|
| 141 |
})
|
| 142 |
|
| 143 |
-
# Apply suggested column renames
|
| 144 |
-
if 'suggested_renames' in analysis:
|
| 145 |
df.rename(columns=analysis['suggested_renames'], inplace=True)
|
| 146 |
st.write("Applied suggested column renames.")
|
|
|
|
|
|
|
| 147 |
|
| 148 |
if len(processed_files) > 1:
|
| 149 |
st.write("### Merging DataFrames")
|
|
@@ -163,7 +164,8 @@ def main():
|
|
| 163 |
|
| 164 |
if selected_columns:
|
| 165 |
# Merge DataFrames
|
| 166 |
-
|
|
|
|
| 167 |
|
| 168 |
if merged_df is not None:
|
| 169 |
st.write("### Preview of Merged Data")
|
|
@@ -188,6 +190,10 @@ def main():
|
|
| 188 |
missing_values = merged_df.isnull().sum()
|
| 189 |
st.write("Missing values per column:")
|
| 190 |
st.dataframe(missing_values)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
else:
|
| 192 |
st.warning("No common columns found across datasets.")
|
| 193 |
else:
|
|
|
|
| 3 |
import os
|
| 4 |
from io import BytesIO
|
| 5 |
import google.generativeai as genai
|
|
|
|
|
|
|
| 6 |
from typing import List, Dict
|
| 7 |
import json
|
| 8 |
import tempfile
|
| 9 |
|
| 10 |
# Initialize Google Gemini AI client
|
| 11 |
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
|
| 12 |
+
model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
|
| 13 |
|
| 14 |
def convert_excel_to_csv(excel_file):
|
| 15 |
"""Convert Excel file to CSV and return the DataFrame"""
|
|
|
|
| 25 |
# Convert sample of DataFrame to CSV string
|
| 26 |
sample_csv = df.head(5).to_csv(index=False)
|
| 27 |
|
| 28 |
+
analysis_prompt = f"""
|
| 29 |
+
Analyze this CSV data from file '{filename}' and provide the following in JSON format:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
+
CSV Data:
|
| 32 |
+
{sample_csv}
|
| 33 |
+
|
| 34 |
+
Provide analysis in this exact JSON format:
|
| 35 |
+
{{
|
| 36 |
+
"subject": "string describing main subject of dataset",
|
| 37 |
+
"columns": [
|
| 38 |
+
{{"name": "column_name", "type": "data_type", "description": "column description"}}
|
| 39 |
+
],
|
| 40 |
+
"key_columns": ["potential columns for merging"],
|
| 41 |
+
"issues": ["list of data quality issues found"],
|
| 42 |
+
"suggested_renames": {{"old_name": "new_name"}}
|
| 43 |
+
}}
|
| 44 |
+
|
| 45 |
+
Only respond with the JSON object, no additional text.
|
| 46 |
"""
|
| 47 |
|
| 48 |
try:
|
| 49 |
+
response = model.generate_content(analysis_prompt)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
# Parse JSON response
|
| 51 |
analysis = json.loads(response.text)
|
| 52 |
return analysis
|
|
|
|
| 120 |
df = pd.read_csv(uploaded_file)
|
| 121 |
|
| 122 |
if df is not None:
|
| 123 |
+
# Show initial data preview
|
| 124 |
+
st.write("Initial Preview:")
|
| 125 |
+
st.dataframe(df.head())
|
| 126 |
+
|
| 127 |
# Analyze columns using Gemini
|
| 128 |
+
with st.spinner("Analyzing columns with AI..."):
|
| 129 |
+
analysis = analyze_columns(df, uploaded_file.name)
|
| 130 |
|
| 131 |
if analysis:
|
| 132 |
st.write("Column Analysis:")
|
|
|
|
| 139 |
'analysis': analysis
|
| 140 |
})
|
| 141 |
|
| 142 |
+
# Apply suggested column renames if any
|
| 143 |
+
if 'suggested_renames' in analysis and analysis['suggested_renames']:
|
| 144 |
df.rename(columns=analysis['suggested_renames'], inplace=True)
|
| 145 |
st.write("Applied suggested column renames.")
|
| 146 |
+
st.write("Updated Preview:")
|
| 147 |
+
st.dataframe(df.head())
|
| 148 |
|
| 149 |
if len(processed_files) > 1:
|
| 150 |
st.write("### Merging DataFrames")
|
|
|
|
| 164 |
|
| 165 |
if selected_columns:
|
| 166 |
# Merge DataFrames
|
| 167 |
+
with st.spinner("Merging datasets..."):
|
| 168 |
+
merged_df = merge_dataframes(processed_files, selected_columns)
|
| 169 |
|
| 170 |
if merged_df is not None:
|
| 171 |
st.write("### Preview of Merged Data")
|
|
|
|
| 190 |
missing_values = merged_df.isnull().sum()
|
| 191 |
st.write("Missing values per column:")
|
| 192 |
st.dataframe(missing_values)
|
| 193 |
+
|
| 194 |
+
# Show duplicate check
|
| 195 |
+
duplicates = merged_df.duplicated().sum()
|
| 196 |
+
st.write(f"Number of duplicate rows: {duplicates}")
|
| 197 |
else:
|
| 198 |
st.warning("No common columns found across datasets.")
|
| 199 |
else:
|