Spaces:

Khamad
/

json-field-analyzer

Building

App Files Files Community

bluestpanda committed on Oct 28, 2025

Commit

80e30b4

1 Parent(s): 98662cd

Add application file

Browse files

Files changed (3) hide show

Dockerfile +21 -0
app.py +577 -0
requirements.txt +6 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,21 @@

+FROM huggingface/space-ollama:streamlit
+# Copy the app files. COPY is a Dockerfile instruction, not a shell command,
+# so shell syntax such as "2>/dev/null || true" is parsed as more source paths
+# and breaks the build. The trailing wildcard keeps structure_analysis.py
+# optional: it is copied when present and ignored when absent, because the
+# other listed sources always exist.
+COPY app.py requirements.txt structure_analysis.py* /app/
+# Install Python dependencies (absolute path, so WORKDIR does not matter)
+RUN pip install -r /app/requirements.txt
+# Download Ollama model (this takes a few minutes).
+# `ollama pull` talks to a running Ollama server, so start one in the
+# background for the duration of this build step.
+RUN ollama serve & sleep 5 && ollama pull llama3.2:3b
+# Expose Streamlit port
+EXPOSE 7860
+# Run Streamlit
+# NOTE(review): only Streamlit is launched here; confirm the base image starts
+# `ollama serve` at runtime, otherwise app.py cannot reach localhost:11434.
+CMD ["streamlit", "run", "/app/app.py", "--server.address", "0.0.0.0", "--server.port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,577 @@

+#!/usr/bin/env python3
+"""
+File Upload Analyzer - Streamlit Frontend
+This is a copy of file_upload_app.py for Hugging Face Spaces deployment.
+"""
+import streamlit as st
+import json
+import sys
+import os
+from pathlib import Path
+from typing import Dict, Any
+import io
+try:
+    import requests
+except ImportError:
+    st.error("Error: requests module not found. Please install it with: pip install requests")
+    st.stop()
+# Try to import structure_analysis, fallback to inline if not available
+try:
+    from structure_analysis import (
+        detect_summary_fields,
+        classify_data_structure,
+        get_hierarchy_summary
+    )
+except ImportError:
+    # Inline fallback implementations
+    def detect_summary_fields(data: Any, path: str = "") -> list:
+        """Detect summary fields."""
+        fields = []
+        summary_indicators = ['total', 'count', 'percentage', 'summary', 'aggregate', 'statistics', 'percent']
+        def traverse(obj, current_path=""):
+            if isinstance(obj, dict):
+                for key, value in obj.items():
+                    field_path = f"{current_path}.{key}" if current_path else key
+                    if any(ind in key.lower() for ind in summary_indicators):
+                        fields.append(field_path)
+                    if isinstance(value, (dict, list)):
+                        traverse(value, field_path)
+            elif isinstance(obj, list) and len(obj) > 0:
+                traverse(obj[0], current_path)
+        traverse(data, path)
+        return fields
+    def classify_data_structure(data: Any) -> dict:
+        """Classify data structure."""
+        return {
+            'summary_fields': [],
+            'config_fields': [],
+            'object_arrays': [],
+            'object_fields': []
+        }
+    def get_hierarchy_summary(data: Any) -> dict:
+        """Get hierarchy summary."""
+        return {
+            'has_summary': False,
+            'has_config': False,
+            'summary_fields': [],
+            'config_fields': [],
+            'levels_present': []
+        }
+# Detect if running on Streamlit Cloud or Hugging Face
+IS_STREAMLIT_CLOUD = os.getenv("STREAMLIT_SHARING_BASE_URL") is not None
+IS_HUGGINGFACE = os.getenv("SPACE_ID") is not None
+IS_ONLINE = IS_STREAMLIT_CLOUD or IS_HUGGINGFACE
+# Page config
+st.set_page_config(
+    page_title="JSON Field Analyzer",
+    page_icon="📊",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+# Custom CSS
+st.markdown("""
+<style>
+    .main > div {
+        padding-top: 1rem;
+    }
+    .stButton>button {
+        width: 100%;
+    }
+    h1 {
+        font-size: 2rem;
+    }
+    h2 {
+        font-size: 1.3rem;
+        border-bottom: 2px solid #0e1117;
+        padding-bottom: 0.3rem;
+    }
+    .highlight {
+        background-color: #f0f2f6;
+        color: #262730;
+        padding: 1rem;
+        border-radius: 5px;
+        border-left: 4px solid #1f77b4;
+        margin: 1rem 0;
+    }
+    .highlight p {
+        color: #262730;
+        margin: 0;
+    }
+    .result-box {
+        background-color: #f0f2f6;
+        padding: 1.5rem;
+        border-radius: 10px;
+        margin: 1rem 0;
+    }
+</style>
+""", unsafe_allow_html=True)
+class FileAnalyzer:
+    """Analyzer for uploaded JSON files."""
+    OLLAMA_API_URL = "http://localhost:11434/api/generate"
+    MODEL_NAME = "llama3.2:3b"
+    def __init__(self, data: Dict[str, Any], llm_provider="ollama", api_key=None):
+        self.data = data
+        self.metadata = None
+        self.llm_provider = llm_provider
+        self.api_key = api_key
+    def extract_metadata(self, target_field: str) -> Dict[str, Any]:
+        """Extract key metadata from the JSON data for LLM analysis."""
+        # Enhanced: Detect summary fields and classify structure
+        summary_fields = detect_summary_fields(self.data)
+        classification = classify_data_structure(self.data)
+        hierarchy_summary = get_hierarchy_summary(self.data)
+        # Try to find objects in the data structure
+        objects_with_target = self._find_objects_with_target(target_field)
+        total = len(objects_with_target)
+        target_true = sum(1 for obj in objects_with_target if obj.get(target_field) is True)
+        percentage = (target_true / total * 100) if total > 0 else 0
+        metadata = {
+            "total_objects": total,
+            "target_count": target_true,
+            "percentage": round(percentage, 2),
+            "summary_fields_detected": summary_fields[:10],
+            "classification": classification,
+            "hierarchy_summary": hierarchy_summary,
+            "has_summary_level": hierarchy_summary['has_summary'],
+            "has_config_level": hierarchy_summary['has_config']
+        }
+        self.metadata = metadata
+        return metadata
+    def _find_objects_with_target(self, target_field: str) -> list:
+        """Find all objects in the data structure that contain the target field."""
+        found = []
+        def find_fields(obj):
+            if isinstance(obj, dict):
+                if target_field in obj:
+                    found.append(obj)
+                for value in obj.values():
+                    find_fields(value)
+            elif isinstance(obj, list):
+                for item in obj:
+                    find_fields(item)
+        find_fields(self.data)
+        return found
+    def generate_prompt(self, target_field: str) -> str:
+        """Generate a hierarchy-aware prompt for the LLM."""
+        if not self.metadata:
+            self.extract_metadata(target_field)
+        hierarchy = self.metadata.get('hierarchy_summary', {})
+        summary_fields = self.metadata.get('summary_fields_detected', [])
+        classification = self.metadata.get('classification', {})
+        # Get sample object
+        sample = {}
+        def find_sample(obj):
+            if isinstance(obj, dict):
+                if target_field in obj:
+                    return obj
+                for v in obj.values():
+                    result = find_sample(v)
+                    if result:
+                        return result
+            elif isinstance(obj, list) and len(obj) > 0:
+                return find_sample(obj[0])
+            return {}
+        sample = find_sample(self.data)
+        # Get summary sample
+        summary_sample = self.data.get('results', {}).get('summary', {}) or self.data.get('summary', {})
+        # Create samples
+        sample_object = json.dumps({k: sample[k] for k in list(sample.keys())[:5]}, indent=2) if sample else "{}"
+        sample_summary = json.dumps(summary_sample, indent=2) if summary_sample else "{}"
+        # Build hierarchy instruction
+        hierarchy_text = f"""
+DATA HIERARCHY (analyze in this priority order):
+LEVEL 1 - Summary/Aggregate Fields (HIGHEST PRIORITY):
+"""
+        if summary_fields:
+            for field in summary_fields[:5]:
+                hierarchy_text += f"  ✓ {field}\n"
+            if len(summary_fields) > 5:
+                hierarchy_text += f"  ... and {len(summary_fields) - 5} more\n"
+        else:
+            hierarchy_text += "  No summary fields detected\n"
+        hierarchy_text += f"""
+LEVEL 2 - Configuration/Compliance Fields:
+"""
+        config_fields = classification.get('config_fields', [])
+        if config_fields:
+            for field in config_fields[:3]:
+                hierarchy_text += f"  ✓ {field}\n"
+        else:
+            hierarchy_text += "  No config fields detected\n"
+        hierarchy_text += f"""
+LEVEL 3 - Individual Objects:
+  ✓ Sample object fields shown below
+CRITICAL INSTRUCTION: Check summary fields FIRST! They are the most important for validation.
+"""
+        prompt = f"""You are analyzing JSON data to identify important fields related to "{target_field}".
+{hierarchy_text}
+CONTEXT:
+- Total objects: {self.metadata.get('total_objects', 0)}
+- Objects with "{target_field}" = true: {self.metadata.get('target_count', 0)}
+- Percentage: {self.metadata.get('percentage', 0)}%
+- Has summary level data: {self.metadata.get('has_summary_level', False)}
+SAMPLE SUMMARY DATA (check this first):
+{sample_summary}
+SAMPLE OBJECT DATA:
+{sample_object}
+TASK:
+Identify 3-4 important fields related to "{target_field}" in this priority order:
+1. FIRST: Summary/aggregate fields (totals, percentages, counts)
+2. SECOND: Configuration/compliance fields
+3. THIRD: Individual object fields (if needed)
+Generate regex patterns that match JSON format (with quotes).
+VALIDATION PATTERN EXAMPLES:
+- Compare two aggregate values: "field1"\\s*:\\s*(\\d+)[\\s\\S]*?"field2"\\s*:\\s*(\\d+)
+- Extract percentage: "field_percentage"\\s*:\\s*(\\d+)
+- Extract boolean: "field_name"\\s*:\\s*(true|false)
+- Extract status: "compliance"\\s*:\\s*"([^"]*)"
+Output ONLY valid JSON:
+{{
+  "test_name": "Field Analysis: {target_field}",
+  "important_fields": ["field1", "field2", "field3"],
+  "reasoning": "Explain prioritization and why these fields matter",
+  "generated_regex": ["regex1", "regex2", "regex3"]
+}}
+"""
+        return prompt
+    def call_llm(self, prompt: str) -> str:
+        """Call the appropriate LLM based on provider."""
+        if self.llm_provider == "ollama":
+            return self._call_ollama(prompt)
+        elif self.llm_provider == "openai":
+            return self._call_openai(prompt)
+        elif self.llm_provider == "anthropic":
+            return self._call_anthropic(prompt)
+        else:
+            raise ValueError(f"Unknown LLM provider: {self.llm_provider}")
+    def _call_ollama(self, prompt: str) -> str:
+        """Call the Ollama API to generate a response."""
+        try:
+            payload = {
+                "model": self.MODEL_NAME,
+                "prompt": prompt,
+                "stream": False,
+                "format": "json"
+            }
+            response = requests.post(self.OLLAMA_API_URL, json=payload, timeout=120)
+            response.raise_for_status()
+            result = response.json()
+            return result.get('response', '')
+        except requests.exceptions.ConnectionError:
+            raise ConnectionError("Cannot connect to Ollama. Make sure Ollama is running.")
+        except requests.exceptions.Timeout:
+            raise TimeoutError("Ollama request timed out.")
+        except requests.exceptions.RequestException as e:
+            raise Exception(f"Failed to call Ollama API - {e}")
+    def parse_llm_output(self, output: str) -> Dict[str, Any]:
+        """Parse and validate the LLM JSON output."""
+        try:
+            output = output.strip()
+            if output.startswith("```json"):
+                output = output[7:]
+            if output.startswith("```"):
+                output = output[3:]
+            if output.endswith("```"):
+                output = output[:-3]
+            output = output.strip()
+            result = json.loads(output)
+            return result
+        except json.JSONDecodeError as e:
+            raise ValueError(f"LLM output is not valid JSON - {e}")
+    def analyze(self, target_field: str = "rotation_enabled") -> Dict[str, Any]:
+        """Main analysis function."""
+        self.extract_metadata(target_field)
+        prompt = self.generate_prompt(target_field)
+        llm_output = self.call_llm(prompt)
+        result = self.parse_llm_output(llm_output)
+        return result
+def main():
+    """Main Streamlit application."""
+    st.title("📊 JSON Field Analyzer")
+    if IS_HUGGINGFACE:
+        st.info("🆓 Running on Hugging Face - Ollama available!")
+    st.markdown("**Upload a JSON file and analyze important fields using LLM**")
+    # Sidebar for configuration
+    with st.sidebar:
+        st.header("⚙️ Configuration")
+        # Show environment info
+        if IS_ONLINE and not IS_HUGGINGFACE:
+            st.info("🌐 Running online - Cloud LLM required")
+        # LLM Provider Selection
+        # Default to Anthropic if on Streamlit Cloud, Ollama on HF/local
+        if IS_STREAMLIT_CLOUD:
+            default_index = 2  # Anthropic Claude
+        else:
+            default_index = 0  # Ollama
+        llm_provider = st.selectbox(
+            "🤖 LLM Provider",
+            ["Ollama (Local)", "OpenAI (Cloud)", "Anthropic Claude (Cloud)"],
+            index=default_index,
+            help="Choose your LLM provider"
+        )
+        # Extract provider name and model
+        if llm_provider == "Ollama (Local)":
+            provider_name = "ollama"
+            api_key = None
+            if IS_STREAMLIT_CLOUD:
+                st.error("❌ Ollama not available on Streamlit Cloud")
+                st.markdown("**Please select a cloud LLM provider:**")
+                st.markdown("- OpenAI (Cloud) - GPT-4o Mini")
+                st.markdown("- Anthropic Claude (Cloud) - Recommended")
+            else:
+                st.info("📝 Using local Ollama")
+        elif llm_provider == "OpenAI (Cloud)":
+            provider_name = "openai"
+            api_key = os.getenv("OPENAI_API_KEY") or st.text_input(
+                "OpenAI API Key",
+                type="password",
+                help="Enter your OpenAI API key (or set OPENAI_API_KEY env var)"
+            )
+            if not api_key:
+                st.warning("⚠️ Please enter your OpenAI API key")
+                st.info("💡 Get key: https://platform.openai.com/api-keys")
+        else:  # Anthropic
+            provider_name = "anthropic"
+            api_key = os.getenv("ANTHROPIC_API_KEY") or st.text_input(
+                "Anthropic API Key",
+                type="password",
+                help="Enter your Anthropic API key (or set ANTHROPIC_API_KEY env var)"
+            )
+            if not api_key:
+                st.warning("⚠️ Please enter your Anthropic API key")
+                st.info("💡 Get key: https://console.anthropic.com")
+        st.markdown("---")
+        target_field = st.text_input(
+            "Target Field",
+            value="rotation_enabled",
+            help="The field you want to analyze (e.g., rotation_enabled, ssl_enforced)"
+        )
+        st.markdown("---")
+        st.markdown("### 📋 Setup Guides")
+        with st.expander("🔧 Local Ollama Setup"):
+            st.code("""
+brew install ollama
+ollama serve
+ollama pull llama3.2:3b
+            """, language="bash")
+        with st.expander("☁️ Cloud API Setup"):
+            st.markdown("""
+            **OpenAI:**
+            - Get key: https://platform.openai.com/api-keys
+            - Model: GPT-4o Mini
+            **Anthropic:**
+            - Get key: https://console.anthropic.com
+            - Model: Claude 3.5 Sonnet
+            """)
+    # File upload section
+    st.markdown("---")
+    st.header("📤 Upload JSON File")
+    uploaded_file = st.file_uploader(
+        "Choose a JSON file",
+        type=['json'],
+        help="Upload a JSON file to analyze"
+    )
+    # Display file info if uploaded
+    if uploaded_file is not None:
+        try:
+            # Read file contents
+            content = uploaded_file.read()
+            data = json.loads(content)
+            st.success("✅ File uploaded successfully!")
+            # Show file info
+            col1, col2 = st.columns(2)
+            with col1:
+                st.metric("File Size", f"{len(content) / 1024:.2f} KB")
+            with col2:
+                st.metric("JSON Structure", "Valid" if isinstance(data, (dict, list)) else "Invalid")
+            # Analyze button
+            st.markdown("---")
+            col1, col2, col3 = st.columns([1, 2, 1])
+            with col2:
+                analyze_button = st.button("🔍 Analyze with LLM", type="primary", use_container_width=True)
+            # Run analysis
+            if analyze_button:
+                # Prevent Ollama usage on Streamlit Cloud
+                if provider_name == "ollama" and IS_STREAMLIT_CLOUD:
+                    st.error("❌ Ollama is not available on Streamlit Cloud")
+                    st.info("💡 Please select 'Anthropic Claude (Cloud)' or 'OpenAI (Cloud)' from the sidebar")
+                # Validate API key for cloud providers
+                elif provider_name in ["openai", "anthropic"] and not api_key:
+                    st.error("❌ Please enter an API key for the selected cloud provider")
+                else:
+                    try:
+                        with st.spinner(f"Analyzing with {llm_provider}... This may take a moment."):
+                            analyzer = FileAnalyzer(data, llm_provider=provider_name, api_key=api_key)
+                            result = analyzer.analyze(target_field=target_field)
+                        # Display results
+                        st.markdown("---")
+                        st.header("📊 Analysis Results")
+                        # Main results in columns
+                        col1, col2 = st.columns(2)
+                        with col1:
+                            st.subheader("🤖 Important Fields")
+                            for i, field in enumerate(result.get('important_fields', []), 1):
+                                st.markdown(f"**{i}. {field}**")
+                        with col2:
+                            st.subheader("💡 Reasoning")
+                            st.markdown(f'<div class="highlight">{result.get("reasoning", "N/A")}</div>',
+                                      unsafe_allow_html=True)
+                        # Regex patterns
+                        st.markdown("---")
+                        st.subheader("🔧 Generated Regex Patterns")
+                        regex_patterns = result.get('generated_regex', [])
+                        for i, pattern in enumerate(regex_patterns, 1):
+                            st.markdown(f"**Pattern {i}:**")
+                            st.code(pattern, language="regex")
+                        # Raw JSON output
+                        with st.expander("📄 View Raw JSON Output"):
+                            st.json(result)
+                        # Download results
+                        st.markdown("---")
+                        result_json = json.dumps(result, indent=2)
+                        st.download_button(
+                            label="⬇️ Download Results",
+                            data=result_json,
+                            file_name=f"analysis_{target_field}.json",
+                            mime="application/json"
+                        )
+                    except ConnectionError as e:
+                        st.error(f"❌ {e}")
+                        if provider_name == "ollama":
+                            st.info("💡 Start Ollama with: `ollama serve`")
+                        else:
+                            st.info("💡 Check your internet connection and API key")
+                    except TimeoutError as e:
+                        st.error(f"❌ {e}")
+                        st.info("💡 The analysis took too long. Try again or use a larger timeout.")
+                    except Exception as e:
+                        st.error(f"❌ Error during analysis: {e}")
+                        st.exception(e)
+        except json.JSONDecodeError:
+            st.error("❌ Invalid JSON file. Please upload a valid JSON file.")
+        except Exception as e:
+            st.error(f"❌ Error reading file: {e}")
+            st.exception(e)
+    else:
+        # Show example when no file is uploaded
+        st.info("👆 Please upload a JSON file to get started")
+        with st.expander("📖 How it works"):
+            st.markdown("""
+            ### Workflow:
+            1. **Upload**: Upload your JSON file using the file uploader above
+            2. **Configure**: Set the target field name in the sidebar (default: `rotation_enabled`)
+            3. **Analyze**: Click the "Analyze with LLM" button
+            4. **Review**: View the important fields, reasoning, and regex patterns
+            5. **Download**: Save the results as JSON
+            ### What it does:
+            - Analyzes your JSON structure to detect summary fields, configurations, and objects
+            - Uses LLM to identify important fields related to your target
+            - Generates regex patterns for data extraction and validation
+            - Provides reasoning for why each field is important
+            ### Use cases:
+            - AWS compliance validation (KMS rotation, SSL enforcement, etc.)
+            - Data quality checks
+            - Automated validation pattern generation
+            - Field correlation analysis
+            """)
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+requests>=2.31.0
+streamlit>=1.28.0
+pandas>=2.0.0
+openai>=1.0.0
+anthropic>=0.7.0