Spaces:

insly
/

policyimport

Sleeping

File size: 4,346 Bytes

import streamlit as st
import zipfile
import json
from io import BytesIO
import chardet
import pandas as pd
import openpyxl

# Set page configuration to wide mode by default
st.set_page_config(layout="wide")

# Function to extract and combine JSON files from a ZIP file
def extract_and_combine_zip(zip_file):
    combined_data = {}
    with zipfile.ZipFile(zip_file) as z:
        # Extract all JSON files, ignoring macOS-specific hidden files
        json_files = [name for name in z.namelist() if name.endswith('.json') and not name.startswith('__MACOSX')]
        for json_file in json_files:
            with z.open(json_file) as f:
                content = f.read()
                encoding = chardet.detect(content)['encoding']
                try:
                    decoded_content = content.decode(encoding)
                    data = json.loads(decoded_content)
                    combined_data = flatten_json(data, combined_data)
                except (UnicodeDecodeError, json.JSONDecodeError) as e:
                    st.warning(f"Warning: Could not decode {json_file} in {zip_file.name}. Error: {str(e)}")
    return combined_data

# Improved function to flatten and merge JSON data
def flatten_json(data, flattened=None, prefix=''):
    if flattened is None:
        flattened = {}
    
    if isinstance(data, dict):
        for key, value in data.items():
            new_key = f"{prefix}.{key}" if prefix else key
            if isinstance(value, (dict, list)):
                flatten_json(value, flattened, new_key)
            elif value is not None and value != "":
                flattened[new_key] = value
    elif isinstance(data, list):
        for i, item in enumerate(data):
            new_key = f"{prefix}[{i}]" if prefix else str(i)
            flatten_json(item, flattened, new_key)
    elif data is not None and data != "":
        flattened[prefix] = value
    
    return flattened

# Function to convert DataFrame to Excel
def to_excel(df):
    output = BytesIO()
    with pd.ExcelWriter(output, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='Sheet1')
    processed_data = output.getvalue()
    return processed_data

# Streamlit app setup
st.title("Multi-ZIP JSON Extractor & Flattener")

# File uploader widget for multiple files
uploaded_zips = st.file_uploader("Upload ZIP files containing JSON files:", type="zip", accept_multiple_files=True)

if uploaded_zips:
    # Process all uploaded ZIP files
    all_flattened_json = []
    
    for zip_file in uploaded_zips:
        flattened_json = extract_and_combine_zip(zip_file)
        flattened_json['zip_file_name'] = zip_file.name  # Add ZIP file name to the flattened JSON
        all_flattened_json.append(flattened_json)

    # Create a DataFrame from the flattened JSON data
    df = pd.DataFrame(all_flattened_json)

    # Convert all object columns to string to avoid Arrow conversion issues
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].astype(str)

    # Create a downloadable JSON
    flattened_json_str = json.dumps(all_flattened_json, indent=4)
    json_bytes = flattened_json_str.encode()

    # Display processing summary
    st.write(f"Processed {len(uploaded_zips)} ZIP file(s)")
    st.write(f"Total rows in the output: {len(all_flattened_json)}")

    # Create columns for download buttons
    col1, col2 = st.columns(2)

    # Button to download the flattened JSON data
    with col1:
        st.download_button(
            label="Download Flattened JSON",
            data=BytesIO(json_bytes),
            file_name='flattened_json.json',
            mime='application/json'
        )

    # Button to download the Excel file
    with col2:
        excel_data = to_excel(df)
        st.download_button(
            label="Download Excel File",
            data=excel_data,
            file_name='flattened_data.xlsx',
            mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        )

    # Add a success message
    st.success(f"All ZIP files have been successfully processed. Each ZIP file is represented as a separate row in the output. You can now download the flattened JSON file or the Excel file.")

    # Display a preview of the DataFrame
    st.write("Preview of the processed data:")
    st.dataframe(df.head())