Spaces:

Khamad
/

Paramify-test

Sleeping

App Files Files Community

bluestpanda committed on Oct 29, 2025

Commit

a9f051b

1 Parent(s): 9714df8

3rd

Browse files

Files changed (6) hide show

Dockerfile +2 -1
README.md +49 -9
requirements.txt +0 -5
src/streamlit_app.py +351 -35
src/structure_analysis.py +99 -0
structure_analysis.py +98 -0

Dockerfile CHANGED Viewed

@@ -10,6 +10,7 @@ RUN apt-get update && apt-get install -y \
 COPY requirements.txt ./
 COPY src/ ./src/
 RUN pip3 install -r requirements.txt
@@ -17,4 +18,4 @@ EXPOSE 8501
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
-ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]

 COPY requirements.txt ./
 COPY src/ ./src/
+COPY structure_analysis.py ./src/
 RUN pip3 install -r requirements.txt
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
+# Launch the Streamlit app module that this image copies in via `COPY src/ ./src/`.
+# NOTE(review): the previous target, src/app.py, is not added by this commit; confirm no such file exists upstream.
+ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]

README.md CHANGED Viewed

@@ -1,19 +1,59 @@
 ---
-title: Paramify Test
-emoji: 🚀
-colorFrom: red
-colorTo: red
 sdk: docker
 app_port: 8501
 tags:
 - streamlit
 pinned: false
-short_description: Streamlit template space
 ---
-# Welcome to Streamlit!
-Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).

 ---
+title: Field Correlation Analyzer
+emoji: 🤖
+colorFrom: blue
+colorTo: purple
 sdk: docker
 app_port: 8501
 tags:
 - streamlit
+- json
+- analysis
+- field-correlation
 pinned: false
+short_description: Analyze JSON files and detect important fields with regex pattern generation
 ---
+# Field Correlation Analyzer
+Upload a JSON file to analyze important fields and generate regex patterns for field extraction.
+## Features
+- 🔍 **Automatic Field Detection**: Detects summary/aggregate fields automatically
+- 📊 **Hierarchy Analysis**: Classifies data structure by hierarchy levels
+- 🎯 **Smart Recommendations**: Recommends important fields for validation
+- 📝 **Regex Generation**: Generates regex patterns for field extraction
+- 📥 **Export Results**: Download analysis results as JSON
+## How to Use
+1. Upload a JSON file with structured data
+2. Set the target field you want to analyze (e.g., `rotation_enabled`)
+3. Click "Analyze" to process the data
+4. Review the structure analysis and field recommendations
+5. Select fields and generate regex patterns
+6. Download the results as JSON
+## Example JSON Structure
+```json
+{
+  "results": {
+    "summary": {
+      "total_keys": 13,
+      "rotated_keys": 6,
+      "rotation_percentage": 46
+    },
+    "kms_keys": {
+      "object": [
+        {
+          "key_id": "12345",
+          "rotation_enabled": true,
+          "key_state": "Enabled"
+        }
+      ]
+    }
+  }
+}
+```

requirements.txt CHANGED Viewed

@@ -1,6 +1 @@
-requests>=2.31.0
 streamlit>=1.28.0
-pandas>=2.0.0
-openai>=1.0.0
-anthropic>=0.7.0



1	streamlit>=1.28.0

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,356 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

+#!/usr/bin/env python3
+"""
+Hugging Face Streamlit App for LLM Field Analyzer
+Upload a JSON file and analyze important fields with pattern generation.
+"""
 import streamlit as st
+import json
+from pathlib import Path
+from typing import Dict, Any
+import io
+# Page configuration
+# This is the first Streamlit command issued by this file as shown; it sets
+# the page title and icon and selects the wide layout.
+st.set_page_config(
+    page_title="Field Correlation Analyzer",
+    page_icon="🤖",
+    layout="wide"
+)
+# Import our modules
+# The analysis helpers come from a sibling structure_analysis module. When the
+# import fails, the error is rendered in the page and st.stop() ends this
+# script run, so nothing below executes without the helpers.
+try:
+    from structure_analysis import (
+        detect_summary_fields,
+        classify_data_structure,
+        get_hierarchy_summary
+    )
+except ImportError:
+    st.error("⚠️ structure_analysis.py not found. Make sure all files are uploaded.")
+    st.stop()
+# Session state
+# Seed the key once so later reads of st.session_state.analysis_result work
+# before any analysis has been stored.
+if 'analysis_result' not in st.session_state:
+    st.session_state.analysis_result = None
+def analyze_with_llm(data: Dict[str, Any], target_field: str = "rotation_enabled") -> Dict[str, Any]:
+    """
+    Build a structural analysis of parsed JSON data around one target field.
+    Despite the name, no LLM or network service is called here: the result is
+    assembled from the structure_analysis helpers plus a recursive key count.
+    Args:
+        data: Parsed JSON document. A top-level object is expected; a
+            top-level array is treated as having no 'results'/'summary' section.
+        target_field: Key name to count, and to match (case-insensitive
+            substring) against the keys of the sample object.
+    Returns:
+        A dict with the keys 'summary_fields_detected' (at most 10 entries),
+        'classification', 'hierarchy_summary', 'total_objects',
+        'sample_object', 'summary_sample' and 'recommended_fields'
+        (de-duplicated, first-seen order).
+    """
+    summary_fields = detect_summary_fields(data)
+    classification = classify_data_structure(data)
+    hierarchy_summary = get_hierarchy_summary(data)
+    # 'results' and 'summary' only make sense on a JSON object; any other
+    # shape is treated as absent instead of raising AttributeError/TypeError.
+    root = data if isinstance(data, dict) else {}
+    results = root.get('results')
+    if not isinstance(results, dict):
+        results = {}
+    def first_record(section):
+        """Return the dict at index 0 of a list section, or of a list held directly in a dict section."""
+        if isinstance(section, list):
+            candidates = [section]
+        elif isinstance(section, dict):
+            candidates = section.values()
+        else:
+            candidates = []
+        for candidate in candidates:
+            # Only dict elements qualify: the sample's keys are read later.
+            if isinstance(candidate, list) and candidate and isinstance(candidate[0], dict):
+                return candidate[0]
+        return None
+    # Stop at the first section that yields a sample. A bare `break` inside a
+    # nested loop would only leave the inner loop and let later sections
+    # overwrite the sample.
+    sample_object = {}
+    for section in results.values():
+        record = first_record(section)
+        if record is not None:
+            sample_object = record
+            break
+    # Prefer results.summary, then a top-level summary; only non-empty dicts
+    # qualify so downstream key lookups are always valid.
+    summary_sample = {}
+    for candidate in (results.get('summary'), root.get('summary')):
+        if isinstance(candidate, dict) and candidate:
+            summary_sample = candidate
+            break
+    def count_objects_with_field(obj, field_name):
+        """Count every dict, at any depth, that has field_name as a key."""
+        if isinstance(obj, dict):
+            own = 1 if field_name in obj else 0
+            return own + sum(count_objects_with_field(v, field_name) for v in obj.values())
+        if isinstance(obj, list):
+            return sum(count_objects_with_field(item, field_name) for item in obj)
+        return 0
+    total_objects = count_objects_with_field(data, target_field)
+    # Recommendation priority: summary fields, then config fields, then
+    # sample-object keys that mention the target field.
+    recommended = list(summary_fields[:3])
+    recommended.extend((classification.get('config_fields') or [])[:2])
+    # Both sides are lower-cased so a mixed-case target still matches.
+    needle = target_field.lower()
+    recommended.extend(key for key in sample_object if needle in key.lower())
+    return {
+        "summary_fields_detected": summary_fields[:10],
+        "classification": classification,
+        "hierarchy_summary": hierarchy_summary,
+        "total_objects": total_objects,
+        "sample_object": sample_object,
+        "summary_sample": summary_sample,
+        # The same name can arrive from more than one source; dict.fromkeys
+        # drops repeats while keeping first-seen order.
+        "recommended_fields": list(dict.fromkeys(recommended)),
+    }
+def generate_regex_patterns(field_names: list, data_sample: dict, summary_sample: dict) -> list:
+    """
+    Generate exactly one regex pattern per requested field, in input order.
+    The capture group is chosen from the field's sample value when one is
+    available (summary sample first for names containing 'summary', then the
+    object sample) and from keywords in the field name otherwise. Every field
+    always yields a pattern, so callers can safely zip fields with patterns.
+    Args:
+        field_names: Field names (strings) to build patterns for.
+        data_sample: Sample object used to infer value types by key.
+        summary_sample: Sample summary section used for summary fields.
+    Returns:
+        A list of pattern strings with the same length and order as field_names.
+    NOTE(review): summary patterns quote the key as "summary.<name>", exactly
+    as before; confirm the consumer expects that form rather than the bare key.
+    """
+    bool_capture = '(true|false)'
+    int_capture = '(\\d+)'
+    # A float sample needs an optional sign and fraction; '(\d+)' alone would
+    # stop capturing at the decimal point.
+    float_capture = '(-?\\d+(?:\\.\\d+)?)'
+    string_capture = '"([^"]*)"'
+    def capture_for(value):
+        """Return the capture group for a sample value, or None for unsupported types."""
+        # bool is tested first because bool is a subclass of int.
+        if isinstance(value, bool):
+            return bool_capture
+        if isinstance(value, int):
+            return int_capture
+        if isinstance(value, float):
+            return float_capture
+        if isinstance(value, str):
+            return string_capture
+        return None
+    def capture_from_name(lowered_name):
+        """Guess the capture group from keywords in the lower-cased field name."""
+        if any(word in lowered_name for word in ('percentage', 'count', 'total')):
+            return int_capture
+        if 'enabled' in lowered_name or 'enforced' in lowered_name:
+            return bool_capture
+        return string_capture
+    samples = data_sample if isinstance(data_sample, dict) else {}
+    summary = summary_sample if isinstance(summary_sample, dict) else {}
+    patterns = []
+    for field in field_names:
+        key = field
+        capture = None
+        if 'summary' in str(field):
+            leaf = field.split('.')[-1]
+            capture = capture_for(summary.get(leaf))
+            if capture is not None:
+                key = f"summary.{leaf}"
+        if capture is None and field in samples:
+            capture = capture_for(samples[field])
+        # Fall back to the name heuristic instead of skipping the field, so the
+        # output never gets out of step with field_names.
+        if capture is None:
+            capture = capture_from_name(field.lower())
+        patterns.append(f'"{key}"\\s*:\\s*{capture}')
+    return patterns
+def main():
+    """
+    Render the whole page: uploader, settings sidebar and the result tabs.
+    Flow: the uploaded file is parsed as JSON on every script run; clicking
+    "Analyze" stores the analysis in st.session_state; the tabs then show the
+    structure, let the user pick fields, display generated regex patterns and
+    offer JSON downloads. Cached results are discarded when a different file
+    is uploaded or a new analysis is run, so the tabs never mix results from
+    one input with data from another. Without an upload, usage help and an
+    example document are shown instead.
+    """
+    st.title("🤖 Field Correlation Analyzer")
+    st.markdown("Upload a JSON file to analyze important fields and generate regex patterns")
+    # File upload
+    uploaded_file = st.file_uploader(
+        "Choose a JSON file",
+        type=['json'],
+        help="Upload a JSON file with structured data"
+    )
+    if uploaded_file is not None:
+        # Read and parse JSON
+        try:
+            # getvalue() returns the full buffer regardless of the current read
+            # position, so re-parsing on every rerun never sees an empty file.
+            content = uploaded_file.getvalue()
+            data = json.loads(content)
+            # Results kept in session state belong to the upload that produced
+            # them; reset them when the name or content changes. hash() is
+            # sufficient here because session state never leaves this process.
+            upload_id = (uploaded_file.name, hash(content))
+            if 'upload_id' not in st.session_state or st.session_state.upload_id != upload_id:
+                st.session_state.upload_id = upload_id
+                st.session_state.analysis_result = None
+                if 'generated_patterns' in st.session_state:
+                    del st.session_state['generated_patterns']
+            st.success("✅ File loaded successfully!")
+            # Sidebar for settings
+            with st.sidebar:
+                st.header("⚙️ Settings")
+                # Target field input
+                target_field = st.text_input(
+                    "Target Field",
+                    value="rotation_enabled",
+                    help="The field you want to analyze"
+                )
+                # Analyze button
+                if st.button("🔍 Analyze", type="primary"):
+                    with st.spinner("Analyzing data structure..."):
+                        analysis_result = analyze_with_llm(data, target_field)
+                        st.session_state.analysis_result = analysis_result
+                        st.session_state.data = data
+                        # Patterns generated for an earlier analysis may name
+                        # fields that are no longer recommended.
+                        if 'generated_patterns' in st.session_state:
+                            del st.session_state['generated_patterns']
+            # Display results if available
+            if st.session_state.analysis_result:
+                analysis = st.session_state.analysis_result
+                # Summary metrics
+                col1, col2, col3, col4 = st.columns(4)
+                with col1:
+                    st.metric("Summary Fields", len(analysis['summary_fields_detected']))
+                with col2:
+                    st.metric("Total Objects", analysis['total_objects'])
+                with col3:
+                    st.metric("Has Summary", "Yes" if analysis['hierarchy_summary']['has_summary'] else "No")
+                with col4:
+                    st.metric("Config Fields", len(analysis['classification'].get('config_fields', [])))
+                st.markdown("---")
+                # Create tabs
+                tab1, tab2, tab3, tab4 = st.tabs([
+                    "📊 Structure Analysis",
+                    "🎯 Field Recommendations",
+                    "📝 Generated Patterns",
+                    "📄 Raw Data"
+                ])
+                with tab1:
+                    st.subheader("Data Hierarchy")
+                    # Summary fields
+                    if analysis['summary_fields_detected']:
+                        st.markdown("#### Level 1: Summary/Aggregate Fields (Highest Priority)")
+                        for field in analysis['summary_fields_detected'][:10]:
+                            st.write(f"✓ `{field}`")
+                    # Config fields
+                    config_fields = analysis['classification'].get('config_fields', [])
+                    if config_fields:
+                        st.markdown("#### Level 2: Configuration/Compliance Fields")
+                        for field in config_fields[:10]:
+                            st.write(f"✓ `{field}`")
+                    # Object arrays
+                    object_arrays = analysis['classification'].get('object_arrays', [])
+                    if object_arrays:
+                        st.markdown("#### Level 3: Object Arrays")
+                        for field in object_arrays[:5]:
+                            st.write(f"✓ `{field}`")
+                    # Show sample data
+                    with st.expander("📋 View Summary Data Sample"):
+                        st.json(analysis['summary_sample'])
+                    with st.expander("📋 View Object Data Sample"):
+                        st.json(analysis['sample_object'])
+                with tab2:
+                    st.subheader("Recommended Fields for Analysis")
+                    if analysis['recommended_fields']:
+                        st.info("These fields are recommended based on the data hierarchy and target field.")
+                        # Let user select fields
+                        selected_fields = st.multiselect(
+                            "Select fields to generate patterns for:",
+                            analysis['recommended_fields'],
+                            default=analysis['recommended_fields'][:3]
+                        )
+                        if selected_fields and st.button("Generate Patterns"):
+                            patterns = generate_regex_patterns(
+                                selected_fields,
+                                analysis['sample_object'],
+                                analysis['summary_sample']
+                            )
+                            # Stored in session state so the patterns survive
+                            # the rerun triggered by the next widget interaction.
+                            st.session_state.generated_patterns = {
+                                'fields': selected_fields,
+                                'patterns': patterns
+                            }
+                    else:
+                        st.warning("No recommended fields found.")
+                with tab3:
+                    if 'generated_patterns' in st.session_state:
+                        patterns_data = st.session_state.generated_patterns
+                        st.subheader("Generated Regex Patterns")
+                        # Show patterns
+                        for i, (field, pattern) in enumerate(zip(patterns_data['fields'], patterns_data['patterns']), 1):
+                            st.markdown(f"**Pattern {i}: {field}**")
+                            st.code(pattern, language="regex", line_numbers=False)
+                            st.markdown("---")
+                        # Copy to clipboard
+                        all_patterns = "\n".join(patterns_data['patterns'])
+                        st.text_area(
+                            "All Patterns (copy this):",
+                            all_patterns,
+                            height=100
+                        )
+                        # JSON export
+                        export_data = {
+                            "test_name": "Field Analysis",
+                            "important_fields": patterns_data['fields'],
+                            "reasoning": "Fields identified using hierarchical analysis prioritizing summary/aggregate fields",
+                            "generated_regex": patterns_data['patterns']
+                        }
+                        st.download_button(
+                            label="📥 Download as JSON",
+                            data=json.dumps(export_data, indent=2),
+                            file_name="analysis_result.json",
+                            mime="application/json"
+                        )
+                    else:
+                        st.info("👆 Go to 'Field Recommendations' tab to select fields and generate patterns.")
+                with tab4:
+                    st.subheader("Raw Data Structure")
+                    # Full data viewer
+                    st.json(data)
+                    # Download raw data
+                    st.download_button(
+                        label="📥 Download Raw Data",
+                        data=json.dumps(data, indent=2),
+                        file_name="raw_data.json",
+                        mime="application/json"
+                    )
+        except json.JSONDecodeError as e:
+            st.error(f"❌ Invalid JSON file: {e}")
+        # Top-level boundary of the page: any other failure is reported in the
+        # UI rather than surfacing as a raw traceback.
+        except Exception as e:
+            st.error(f"❌ Error processing file: {e}")
+    else:
+        # Show example when no file uploaded
+        st.info("👆 Please upload a JSON file to begin analysis")
+        with st.expander("📖 How to use"):
+            st.markdown("""
+            **Steps:**
+            1. Upload a JSON file with structured data
+            2. Set the target field you want to analyze (e.g., `rotation_enabled`)
+            3. Click "Analyze" to process the data
+            4. Review the structure analysis and field recommendations
+            5. Select fields and generate regex patterns
+            6. Download the results as JSON
+            **What this tool does:**
+            - Detects summary/aggregate fields automatically
+            - Classifies data structure by hierarchy levels
+            - Recommends important fields for validation
+            - Generates regex patterns for field extraction
+            """)
+        with st.expander("📋 Example JSON Structure"):
+            example = {
+                "results": {
+                    "summary": {
+                        "total_keys": 13,
+                        "rotated_keys": 6,
+                        "rotation_percentage": 46
+                    },
+                    "kms_keys": {
+                        "object": [
+                            {
+                                "key_id": "12345",
+                                "rotation_enabled": True,
+                                "key_state": "Enabled"
+                            }
+                        ]
+                    }
+                }
+            }
+            st.json(example)
+if __name__ == "__main__":
+    main()

src/structure_analysis.py ADDED Viewed

	@@ -0,0 +1,99 @@

+"""
+Structure analysis utilities for detecting fields in JSON data.
+"""
+from typing import Dict, Any, List
+def detect_summary_fields(data: Dict[str, Any]) -> List[str]:
+    """
+    Detect summary/aggregate fields in the data structure.
+    Two sources are combined, in this order:
+    1. Every key of `results.summary` and of a top-level `summary` dict,
+       reported as "summary.<key>".
+    2. Every numeric (non-boolean) value, at any depth, whose key contains an
+       aggregate keyword (total, count, sum, average, avg, percent,
+       percentage), reported by its dotted path. For lists only the first
+       element is inspected.
+    Args:
+        data: Parsed JSON document; non-dict input simply has no summary section.
+    Returns:
+        Field names without duplicates, in first-seen order, so that slices
+        such as `[:3]` pick the same fields on every run.
+    """
+    aggregate_markers = ('total', 'count', 'sum', 'average', 'avg', 'percent', 'percentage')
+    found: List[str] = []
+    root = data if isinstance(data, dict) else {}
+    results = root.get('results')
+    nested_summary = results.get('summary') if isinstance(results, dict) else None
+    for summary in (nested_summary, root.get('summary')):
+        if isinstance(summary, dict):
+            found.extend(f"summary.{key}" for key in summary)
+    def walk(node, path=""):
+        """Collect dotted paths of aggregate-looking numeric values below node."""
+        if isinstance(node, dict):
+            for key, value in node.items():
+                child_path = f"{path}.{key}" if path else key
+                # bool is a subclass of int, but true/false is not an aggregate.
+                is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
+                if is_number and any(marker in key.lower() for marker in aggregate_markers):
+                    found.append(child_path)
+                walk(value, child_path)
+        elif isinstance(node, list) and node:
+            walk(node[0], path)
+    walk(data)
+    # dict.fromkeys removes duplicates while keeping insertion order; a set
+    # would reorder the names differently from one process to the next.
+    return list(dict.fromkeys(found))
+def classify_data_structure(data: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Sort the keys of a nested structure into config fields and object arrays.
+    Keys are visited depth-first in document order; for a list only its first
+    element is descended into. A key is a config field when its lower-cased
+    name contains 'config', 'compliance', 'enabled', 'enforced' or 'policy',
+    and an object array when its value is a non-empty list whose first element
+    is a dict. Both are reported as dotted paths.
+    Returns:
+        {'config_fields': [...], 'object_arrays': [...]}
+    """
+    markers = ('config', 'compliance', 'enabled', 'enforced', 'policy')
+    def iter_entries(node, prefix=""):
+        """Yield (dotted_path, key, value) for each dict entry, parents before children."""
+        if isinstance(node, list):
+            if len(node) > 0:
+                yield from iter_entries(node[0], prefix)
+            return
+        if not isinstance(node, dict):
+            return
+        for name, child in node.items():
+            dotted = f"{prefix}.{name}" if prefix else name
+            yield dotted, name, child
+            yield from iter_entries(child, dotted)
+    config_paths = []
+    array_paths = []
+    for dotted, name, child in iter_entries(data):
+        lowered = name.lower()
+        if any(marker in lowered for marker in markers):
+            config_paths.append(dotted)
+        holds_objects = isinstance(child, list) and len(child) > 0 and isinstance(child[0], dict)
+        if holds_objects:
+            array_paths.append(dotted)
+    return {'config_fields': config_paths, 'object_arrays': array_paths}
+def get_hierarchy_summary(data: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Report whether the data carries a summary section.
+    A summary counts as present when 'summary' is a key of a dict-valued
+    'results' entry, or a key of the top level itself.
+    Returns:
+        {'has_summary': bool, 'levels': 2 when a summary exists, otherwise 1}
+    """
+    results = data['results'] if 'results' in data else None
+    in_results = isinstance(results, dict) and 'summary' in results
+    found = in_results or 'summary' in data
+    depth = 2 if found else 1
+    return {'has_summary': found, 'levels': depth}

structure_analysis.py ADDED Viewed

	@@ -0,0 +1,98 @@

+"""
+Structure analysis utilities for detecting fields in JSON data.
+"""
+from typing import Dict, Any, List
+def detect_summary_fields(data: Dict[str, Any]) -> List[str]:
+    """
+    Detect summary/aggregate fields in the data structure.
+    Two sources are combined, in this order:
+    1. Every key of `results.summary` and of a top-level `summary` dict,
+       reported as "summary.<key>".
+    2. Every numeric (non-boolean) value, at any depth, whose key contains an
+       aggregate keyword (total, count, sum, average, avg, percent,
+       percentage), reported by its dotted path. For lists only the first
+       element is inspected.
+    Args:
+        data: Parsed JSON document; non-dict input simply has no summary section.
+    Returns:
+        Field names without duplicates, in first-seen order, so that slices
+        such as `[:3]` pick the same fields on every run.
+    """
+    aggregate_markers = ('total', 'count', 'sum', 'average', 'avg', 'percent', 'percentage')
+    found: List[str] = []
+    root = data if isinstance(data, dict) else {}
+    results = root.get('results')
+    nested_summary = results.get('summary') if isinstance(results, dict) else None
+    for summary in (nested_summary, root.get('summary')):
+        if isinstance(summary, dict):
+            found.extend(f"summary.{key}" for key in summary)
+    def walk(node, path=""):
+        """Collect dotted paths of aggregate-looking numeric values below node."""
+        if isinstance(node, dict):
+            for key, value in node.items():
+                child_path = f"{path}.{key}" if path else key
+                # bool is a subclass of int, but true/false is not an aggregate.
+                is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
+                if is_number and any(marker in key.lower() for marker in aggregate_markers):
+                    found.append(child_path)
+                walk(value, child_path)
+        elif isinstance(node, list) and node:
+            walk(node[0], path)
+    walk(data)
+    # dict.fromkeys removes duplicates while keeping insertion order; a set
+    # would reorder the names differently from one process to the next.
+    return list(dict.fromkeys(found))
+def classify_data_structure(data: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Sort the keys of a nested structure into config fields and object arrays.
+    Keys are visited depth-first in document order; for a list only its first
+    element is descended into. A key is a config field when its lower-cased
+    name contains 'config', 'compliance', 'enabled', 'enforced' or 'policy',
+    and an object array when its value is a non-empty list whose first element
+    is a dict. Both are reported as dotted paths.
+    Returns:
+        {'config_fields': [...], 'object_arrays': [...]}
+    """
+    markers = ('config', 'compliance', 'enabled', 'enforced', 'policy')
+    def iter_entries(node, prefix=""):
+        """Yield (dotted_path, key, value) for each dict entry, parents before children."""
+        if isinstance(node, list):
+            if len(node) > 0:
+                yield from iter_entries(node[0], prefix)
+            return
+        if not isinstance(node, dict):
+            return
+        for name, child in node.items():
+            dotted = f"{prefix}.{name}" if prefix else name
+            yield dotted, name, child
+            yield from iter_entries(child, dotted)
+    config_paths = []
+    array_paths = []
+    for dotted, name, child in iter_entries(data):
+        lowered = name.lower()
+        if any(marker in lowered for marker in markers):
+            config_paths.append(dotted)
+        holds_objects = isinstance(child, list) and len(child) > 0 and isinstance(child[0], dict)
+        if holds_objects:
+            array_paths.append(dotted)
+    return {'config_fields': config_paths, 'object_arrays': array_paths}
+def get_hierarchy_summary(data: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Report whether the data carries a summary section.
+    A summary counts as present when 'summary' is a key of a dict-valued
+    'results' entry, or a key of the top level itself.
+    Returns:
+        {'has_summary': bool, 'levels': 2 when a summary exists, otherwise 1}
+    """
+    results = data['results'] if 'results' in data else None
+    in_results = isinstance(results, dict) and 'summary' in results
+    found = in_results or 'summary' in data
+    depth = 2 if found else 1
+    return {'has_summary': found, 'levels': depth}