bluestpanda commited on
Commit
4b7b107
·
1 Parent(s): 5b2c0c6
Files changed (2) hide show
  1. app.py +680 -0
  2. requirements.txt +6 -3
app.py ADDED
@@ -0,0 +1,680 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ File Upload Analyzer - Streamlit Frontend
4
+ This is a copy of file_upload_app.py for Hugging Face Spaces deployment.
5
+ """
6
+
7
+ import streamlit as st
8
+ import json
9
+ import sys
10
+ import os
11
+ from pathlib import Path
12
+ from typing import Dict, Any
13
+ import io
14
+
15
+ import requests
16
+
17
# Try to import structure_analysis, fallback to inline if not available
try:
    from structure_analysis import (
        detect_summary_fields,
        classify_data_structure,
        get_hierarchy_summary
    )
except ImportError:
    # Inline fallback implementations (same contract as structure_analysis).

    def detect_summary_fields(data: Any, path: str = "") -> list:
        """Collect dotted paths of keys that look like summary/aggregate fields."""
        indicators = ('total', 'count', 'percentage', 'summary', 'aggregate', 'statistics', 'percent')
        matches = []

        def walk(node, prefix=""):
            # Dicts: test every key name against the indicator list, then
            # recurse into any nested container values.
            if isinstance(node, dict):
                for name, child in node.items():
                    dotted = f"{prefix}.{name}" if prefix else name
                    lowered = name.lower()
                    if any(token in lowered for token in indicators):
                        matches.append(dotted)
                    if isinstance(child, (dict, list)):
                        walk(child, dotted)
            # Lists: sample only the first element (cheap heuristic — assumes
            # homogeneous arrays; keeps traversal linear in unique structure).
            elif isinstance(node, list) and node:
                walk(node[0], prefix)

        walk(data, path)
        return matches

    def classify_data_structure(data: Any) -> dict:
        """Fallback stub: report an empty classification for every category."""
        return {
            'summary_fields': [],
            'config_fields': [],
            'object_arrays': [],
            'object_fields': [],
        }

    def get_hierarchy_summary(data: Any) -> dict:
        """Fallback stub: report a hierarchy with nothing detected."""
        return {
            'has_summary': False,
            'has_config': False,
            'summary_fields': [],
            'config_fields': [],
            'levels_present': [],
        }
63
+
64
# Detect if running on Streamlit Cloud or Hugging Face.
# NOTE(review): relies on platform-injected env vars — STREAMLIT_SHARING_BASE_URL
# (presumably set by Streamlit's hosted platform) and SPACE_ID (set inside a
# Hugging Face Space container); confirm against current platform docs.
IS_STREAMLIT_CLOUD = os.getenv("STREAMLIT_SHARING_BASE_URL") is not None
IS_HUGGINGFACE = os.getenv("SPACE_ID") is not None
# True on any hosted platform, where a local Ollama server is unavailable.
IS_ONLINE = IS_STREAMLIT_CLOUD or IS_HUGGINGFACE


# Page config - must be first: Streamlit requires set_page_config to be the
# first Streamlit command executed in the script.
st.set_page_config(
    page_title="JSON Field Analyzer",
    page_icon="📊",
    layout="wide",
    initial_sidebar_state="expanded"
)
77
+
78
# Custom CSS injected once at startup. unsafe_allow_html is required for raw
# <style> tags; the .highlight and .result-box classes are referenced by the
# HTML emitted in main() for the reasoning panel.
st.markdown("""
<style>
    .main > div {
        padding-top: 1rem;
    }
    .stButton>button {
        width: 100%;
    }
    h1 {
        font-size: 2rem;
    }
    h2 {
        font-size: 1.3rem;
        border-bottom: 2px solid #0e1117;
        padding-bottom: 0.3rem;
    }
    .highlight {
        background-color: #f0f2f6;
        color: #262730;
        padding: 1rem;
        border-radius: 5px;
        border-left: 4px solid #1f77b4;
        margin: 1rem 0;
    }
    .highlight p {
        color: #262730;
        margin: 0;
    }
    .result-box {
        background-color: #f0f2f6;
        padding: 1.5rem;
        border-radius: 10px;
        margin: 1rem 0;
    }
</style>
""", unsafe_allow_html=True)
115
+
116
+
117
class FileAnalyzer:
    """Analyzer for uploaded JSON files.

    Extracts metadata about a target boolean field, builds a hierarchy-aware
    prompt, sends it to the configured LLM provider (Ollama, OpenAI,
    Anthropic, or the Hugging Face Inference API), and parses the provider's
    JSON response.
    """

    # Local Ollama endpoint and model (used only when llm_provider == "ollama").
    OLLAMA_API_URL = "http://localhost:11434/api/generate"
    MODEL_NAME = "llama3.2:3b"
    # Default free model for the Hugging Face Inference API.
    HF_DEFAULT_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"

    def __init__(self, data: Dict[str, Any], llm_provider="ollama", api_key=None):
        """Store the parsed JSON payload and provider configuration.

        Args:
            data: Parsed JSON document (dict or list) to analyze.
            llm_provider: One of "ollama", "openai", "anthropic", "huggingface".
            api_key: Provider API key/token; optional for Hugging Face.
        """
        self.data = data
        self.metadata = None  # populated lazily by extract_metadata()
        self.llm_provider = llm_provider
        self.api_key = api_key

    def extract_metadata(self, target_field: str) -> Dict[str, Any]:
        """Extract key metadata from the JSON data for LLM analysis.

        Counts objects containing ``target_field``, how many have it set to
        True, and records structure-analysis results. Caches the result on
        ``self.metadata`` and returns it.
        """
        # Enhanced: detect summary fields and classify structure.
        summary_fields = detect_summary_fields(self.data)
        classification = classify_data_structure(self.data)
        hierarchy_summary = get_hierarchy_summary(self.data)

        # Try to find objects in the data structure.
        objects_with_target = self._find_objects_with_target(target_field)
        total = len(objects_with_target)
        # Strict identity check: only literal JSON `true` counts, not truthy values.
        target_true = sum(1 for obj in objects_with_target if obj.get(target_field) is True)
        percentage = (target_true / total * 100) if total > 0 else 0

        metadata = {
            "total_objects": total,
            "target_count": target_true,
            "percentage": round(percentage, 2),
            "summary_fields_detected": summary_fields[:10],  # cap prompt size
            "classification": classification,
            "hierarchy_summary": hierarchy_summary,
            "has_summary_level": hierarchy_summary['has_summary'],
            "has_config_level": hierarchy_summary['has_config']
        }

        self.metadata = metadata
        return metadata

    def _find_objects_with_target(self, target_field: str) -> list:
        """Return every dict (at any nesting depth) that contains target_field."""
        found = []

        def find_fields(obj):
            if isinstance(obj, dict):
                if target_field in obj:
                    found.append(obj)
                # Keep descending: nested dicts may also carry the field.
                for value in obj.values():
                    find_fields(value)
            elif isinstance(obj, list):
                for item in obj:
                    find_fields(item)

        find_fields(self.data)
        return found

    def generate_prompt(self, target_field: str) -> str:
        """Generate a hierarchy-aware prompt for the LLM."""
        if not self.metadata:
            self.extract_metadata(target_field)

        hierarchy = self.metadata.get('hierarchy_summary', {})
        summary_fields = self.metadata.get('summary_fields_detected', [])
        classification = self.metadata.get('classification', {})

        # Find the first dict containing the target field, to show as a sample.
        def find_sample(obj):
            if isinstance(obj, dict):
                if target_field in obj:
                    return obj
                for v in obj.values():
                    result = find_sample(v)
                    if result:
                        return result
            elif isinstance(obj, list) and len(obj) > 0:
                return find_sample(obj[0])
            return {}

        sample = find_sample(self.data)

        # Get summary sample. Guarded with isinstance checks: the uploaded JSON
        # may be a list, or data['results'] may not be a dict — the previous
        # unconditional .get() chain raised AttributeError in those cases.
        summary_sample = {}
        if isinstance(self.data, dict):
            results = self.data.get('results', {})
            if isinstance(results, dict):
                summary_sample = results.get('summary', {})
            if not summary_sample:
                summary_sample = self.data.get('summary', {})

        # Create samples (at most 5 keys of the sample object to bound size).
        sample_object = json.dumps({k: sample[k] for k in list(sample.keys())[:5]}, indent=2) if sample else "{}"
        sample_summary = json.dumps(summary_sample, indent=2) if summary_sample else "{}"

        # Build hierarchy instruction.
        hierarchy_text = """
DATA HIERARCHY (analyze in this priority order):

LEVEL 1 - Summary/Aggregate Fields (HIGHEST PRIORITY):
"""
        if summary_fields:
            for field in summary_fields[:5]:
                hierarchy_text += f"  ✓ {field}\n"
            if len(summary_fields) > 5:
                hierarchy_text += f"  ... and {len(summary_fields) - 5} more\n"
        else:
            hierarchy_text += "  No summary fields detected\n"

        hierarchy_text += """
LEVEL 2 - Configuration/Compliance Fields:
"""
        config_fields = classification.get('config_fields', [])
        if config_fields:
            for field in config_fields[:3]:
                hierarchy_text += f"  ✓ {field}\n"
        else:
            hierarchy_text += "  No config fields detected\n"

        hierarchy_text += """
LEVEL 3 - Individual Objects:
  ✓ Sample object fields shown below

CRITICAL INSTRUCTION: Check summary fields FIRST! They are the most important for validation.
"""

        prompt = f"""You are analyzing JSON data to identify important fields related to "{target_field}".

{hierarchy_text}

CONTEXT:
- Total objects: {self.metadata.get('total_objects', 0)}
- Objects with "{target_field}" = true: {self.metadata.get('target_count', 0)}
- Percentage: {self.metadata.get('percentage', 0)}%
- Has summary level data: {self.metadata.get('has_summary_level', False)}

SAMPLE SUMMARY DATA (check this first):
{sample_summary}

SAMPLE OBJECT DATA:
{sample_object}

TASK:
Identify 3-4 important fields related to "{target_field}" in this priority order:
1. FIRST: Summary/aggregate fields (totals, percentages, counts)
2. SECOND: Configuration/compliance fields
3. THIRD: Individual object fields (if needed)

Generate regex patterns that match JSON format (with quotes).

VALIDATION PATTERN EXAMPLES:
- Compare two aggregate values: "field1"\\s*:\\s*(\\d+)[\\s\\S]*?"field2"\\s*:\\s*(\\d+)
- Extract percentage: "field_percentage"\\s*:\\s*(\\d+)
- Extract boolean: "field_name"\\s*:\\s*(true|false)
- Extract status: "compliance"\\s*:\\s*"([^"]*)"

Output ONLY valid JSON:
{{
  "test_name": "Field Analysis: {target_field}",
  "important_fields": ["field1", "field2", "field3"],
  "reasoning": "Explain prioritization and why these fields matter",
  "generated_regex": ["regex1", "regex2", "regex3"]
}}
"""

        return prompt

    def call_llm(self, prompt: str) -> str:
        """Dispatch the prompt to the configured provider and return raw text.

        Raises:
            ValueError: If ``self.llm_provider`` is not a known provider.
        """
        if self.llm_provider == "ollama":
            return self._call_ollama(prompt)
        elif self.llm_provider == "openai":
            return self._call_openai(prompt)
        elif self.llm_provider == "anthropic":
            return self._call_anthropic(prompt)
        elif self.llm_provider == "huggingface":
            return self._call_huggingface(prompt)
        else:
            raise ValueError(f"Unknown LLM provider: {self.llm_provider}")

    def _call_ollama(self, prompt: str) -> str:
        """Call the local Ollama API to generate a response.

        Raises:
            ConnectionError: If the Ollama server is unreachable.
            TimeoutError: If the request exceeds 120 seconds.
        """
        try:
            payload = {
                "model": self.MODEL_NAME,
                "prompt": prompt,
                "stream": False,
                "format": "json"  # ask Ollama to constrain output to JSON
            }

            response = requests.post(self.OLLAMA_API_URL, json=payload, timeout=120)
            response.raise_for_status()

            result = response.json()
            return result.get('response', '')

        except requests.exceptions.ConnectionError:
            raise ConnectionError("Cannot connect to Ollama. Make sure Ollama is running.")
        except requests.exceptions.Timeout:
            raise TimeoutError("Ollama request timed out.")
        except requests.exceptions.RequestException as e:
            raise Exception(f"Failed to call Ollama API - {e}")

    def _call_openai(self, prompt: str) -> str:
        """Call the OpenAI API (gpt-4o-mini) to generate a response."""
        try:
            from openai import OpenAI

            client = OpenAI(api_key=self.api_key)

            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are a JSON data analysis assistant. Always respond with valid JSON."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3,
                max_tokens=2000
            )

            return response.choices[0].message.content

        except ImportError:
            raise ImportError("OpenAI library not installed. Install with: pip install openai")
        except Exception as e:
            raise Exception(f"Failed to call OpenAI API - {e}")

    def _call_anthropic(self, prompt: str) -> str:
        """Call the Anthropic API (Claude 3.5 Sonnet) to generate a response."""
        try:
            from anthropic import Anthropic

            client = Anthropic(api_key=self.api_key)

            response = client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=2000,
                temperature=0.3,
                system="You are a JSON data analysis assistant. Always respond with valid JSON.",
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )

            return response.content[0].text

        except ImportError:
            raise ImportError("Anthropic library not installed. Install with: pip install anthropic")
        except Exception as e:
            raise Exception(f"Failed to call Anthropic API - {e}")

    def _call_huggingface(self, prompt: str) -> str:
        """Call the Hugging Face Inference API (FREE) to generate a response.

        The optional ``self.api_key`` is sent only as a Bearer token; the
        model id is always ``HF_DEFAULT_MODEL``.
        """
        try:
            # BUG FIX: previously `model_name = self.api_key or <default>`,
            # which used the user's HF token as the model id whenever a token
            # was supplied — producing a bogus URL. The token belongs only in
            # the Authorization header.
            model_name = self.HF_DEFAULT_MODEL

            headers = {"Content-Type": "application/json"}
            if self.api_key:
                headers["Authorization"] = f"Bearer {self.api_key}"

            # Mistral-instruct chat template: wrap the task in [INST] ... [/INST].
            full_prompt = f"""<s>[INST]You are a JSON data analysis assistant. Always respond with valid JSON only, no explanations.

{prompt}[/INST]"""

            payload = {
                "inputs": full_prompt,
                "parameters": {
                    "max_new_tokens": 1000,
                    "temperature": 0.3,
                    "return_full_text": False  # only the completion, not the prompt
                }
            }

            api_url = f"https://api-inference.huggingface.co/models/{model_name}"
            response = requests.post(api_url, json=payload, headers=headers, timeout=60)

            # 503 means the model is cold-starting on HF's side.
            if response.status_code == 503:
                raise Exception("Model is loading. Please wait a moment and try again.")

            response.raise_for_status()
            result = response.json()

            # Handle different response formats (list of generations vs dict).
            if isinstance(result, list) and len(result) > 0:
                return result[0].get('generated_text', '')
            elif isinstance(result, dict):
                return result.get('generated_text', '')
            else:
                return str(result)

        except Exception as e:
            raise Exception(f"Failed to call Hugging Face API - {e}")

    def parse_llm_output(self, output: str) -> Dict[str, Any]:
        """Parse and validate the LLM JSON output.

        Strips optional Markdown code fences (```json ... ```), then parses.

        Raises:
            ValueError: If the stripped output is not valid JSON.
        """
        try:
            output = output.strip()
            if output.startswith("```json"):
                output = output[7:]
            if output.startswith("```"):
                output = output[3:]
            if output.endswith("```"):
                output = output[:-3]
            output = output.strip()

            result = json.loads(output)
            return result

        except json.JSONDecodeError as e:
            raise ValueError(f"LLM output is not valid JSON - {e}")

    def analyze(self, target_field: str = "rotation_enabled") -> Dict[str, Any]:
        """Run the full pipeline: metadata -> prompt -> LLM -> parsed JSON."""
        self.extract_metadata(target_field)
        prompt = self.generate_prompt(target_field)
        llm_output = self.call_llm(prompt)
        result = self.parse_llm_output(llm_output)
        return result
433
+
434
+
435
def main():
    """Main Streamlit application.

    Renders the sidebar (provider selection, API key entry, target field),
    the JSON file uploader, and — on demand — runs FileAnalyzer and shows
    the results. Streamlit re-executes this top to bottom on every widget
    interaction, so all state lives in widget return values.
    """
    st.title("📊 JSON Field Analyzer")

    if IS_HUGGINGFACE:
        st.info("🆓 Running on Hugging Face - FREE Hugging Face AI model available! No API key needed.")

    st.markdown("**Upload a JSON file and analyze important fields using LLM**")

    # Sidebar for configuration. NOTE: provider_name, api_key, llm_provider
    # and target_field are bound here and read by the analysis section below.
    with st.sidebar:
        st.header("⚙️ Configuration")

        # Show environment info
        if IS_ONLINE and not IS_HUGGINGFACE:
            st.info("🌐 Running online - Cloud LLM required")

        # LLM Provider Selection.
        # Default to Hugging Face (free) if online, Ollama on local —
        # indices refer to positions in the selectbox options list below.
        if IS_ONLINE:
            default_index = 3  # Hugging Face (Free)
        else:
            default_index = 0  # Ollama

        llm_provider = st.selectbox(
            "🤖 LLM Provider",
            ["Ollama (Local)", "OpenAI (Cloud)", "Anthropic Claude (Cloud)", "Hugging Face (Free 🌟)"],
            index=default_index,
            help="Choose your LLM provider - Hugging Face is FREE and no API key needed!"
        )

        # Map the display label to the internal provider name and collect an
        # API key. Env vars take precedence over the text input widgets.
        if llm_provider == "Ollama (Local)":
            provider_name = "ollama"
            api_key = None
            if IS_ONLINE:
                # Ollama needs a local server, which hosted platforms lack.
                st.error("❌ Ollama not available on this platform")
                st.markdown("**Please select a cloud LLM provider:**")
                st.markdown("- OpenAI (Cloud) - GPT-4o Mini")
                st.markdown("- Anthropic Claude (Cloud) - Recommended")
            else:
                st.info("📍 Using local Ollama")
        elif llm_provider == "OpenAI (Cloud)":
            provider_name = "openai"
            api_key = os.getenv("OPENAI_API_KEY") or st.text_input(
                "OpenAI API Key",
                type="password",
                help="Enter your OpenAI API key (or set OPENAI_API_KEY env var)"
            )
            if not api_key:
                st.warning("⚠️ Please enter your OpenAI API key")
                st.info("💡 Get key: https://platform.openai.com/api-keys")
        elif llm_provider == "Anthropic Claude (Cloud)":
            provider_name = "anthropic"
            api_key = os.getenv("ANTHROPIC_API_KEY") or st.text_input(
                "Anthropic API Key",
                type="password",
                help="Enter your Anthropic API key (or set ANTHROPIC_API_KEY env var)"
            )
            if not api_key:
                st.warning("⚠️ Please enter your Anthropic API key")
                st.info("💡 Get key: https://console.anthropic.com")
        else:  # Hugging Face (Free) — the key is optional for this provider.
            provider_name = "huggingface"
            api_key = os.getenv("HUGGINGFACE_API_KEY") or st.text_input(
                "Hugging Face API Key (Optional)",
                type="password",
                help="Optional: Enter your HF token for faster inference (or set HUGGINGFACE_API_KEY env var)"
            )
            if not api_key:
                st.info("✨ Using free Hugging Face Inference API - no key needed!")
                st.info("💡 Optional: Add your token in Settings > Secrets for better performance")

        st.markdown("---")

        # Name of the boolean field the analysis will count and prompt about.
        target_field = st.text_input(
            "Target Field",
            value="rotation_enabled",
            help="The field you want to analyze (e.g., rotation_enabled, ssl_enforced)"
        )

        st.markdown("---")
        st.markdown("### 📋 Setup Guides")

        with st.expander("🔧 Local Ollama Setup"):
            st.code("""
brew install ollama
ollama serve
ollama pull llama3.2:3b
""", language="bash")

        with st.expander("☁️ Cloud API Setup"):
            st.markdown("""
**OpenAI:**
- Get key: https://platform.openai.com/api-keys
- Model: GPT-4o Mini

**Anthropic:**
- Get key: https://console.anthropic.com
- Model: Claude 3.5 Sonnet
""")

    # File upload section
    st.markdown("---")
    st.header("📤 Upload JSON File")

    uploaded_file = st.file_uploader(
        "Choose a JSON file",
        type=['json'],
        help="Upload a JSON file to analyze"
    )

    # Display file info if uploaded
    if uploaded_file is not None:
        try:
            # Read file contents and parse eagerly so invalid JSON fails
            # before any analysis UI is shown.
            content = uploaded_file.read()
            data = json.loads(content)

            st.success("✅ File uploaded successfully!")

            # Show file info
            col1, col2 = st.columns(2)
            with col1:
                st.metric("File Size", f"{len(content) / 1024:.2f} KB")
            with col2:
                st.metric("JSON Structure", "Valid" if isinstance(data, (dict, list)) else "Invalid")

            # Analyze button (centered via a 1:2:1 column layout)
            st.markdown("---")

            col1, col2, col3 = st.columns([1, 2, 1])
            with col2:
                analyze_button = st.button("🔍 Analyze with LLM", type="primary", use_container_width=True)

            # Run analysis
            if analyze_button:
                # Prevent Ollama usage on online platforms
                if provider_name == "ollama" and IS_ONLINE:
                    st.error("❌ Ollama is not available on this platform")
                    st.info("💡 Please select 'Anthropic Claude (Cloud)' or 'OpenAI (Cloud)' from the sidebar")

                # Validate API key for cloud providers (except Hugging Face which is optional)
                elif provider_name in ["openai", "anthropic"] and not api_key:
                    st.error("❌ Please enter an API key for the selected cloud provider")
                else:
                    try:
                        with st.spinner(f"Analyzing with {llm_provider}... This may take a moment."):
                            analyzer = FileAnalyzer(data, llm_provider=provider_name, api_key=api_key)
                            result = analyzer.analyze(target_field=target_field)

                        # Display results
                        st.markdown("---")
                        st.header("📊 Analysis Results")

                        # Main results in columns
                        col1, col2 = st.columns(2)

                        with col1:
                            st.subheader("🤖 Important Fields")
                            for i, field in enumerate(result.get('important_fields', []), 1):
                                st.markdown(f"**{i}. {field}**")

                        with col2:
                            st.subheader("💡 Reasoning")
                            # Uses the .highlight CSS class injected at startup.
                            st.markdown(f'<div class="highlight">{result.get("reasoning", "N/A")}</div>',
                                        unsafe_allow_html=True)

                        # Regex patterns
                        st.markdown("---")
                        st.subheader("🔧 Generated Regex Patterns")

                        regex_patterns = result.get('generated_regex', [])
                        for i, pattern in enumerate(regex_patterns, 1):
                            st.markdown(f"**Pattern {i}:**")
                            st.code(pattern, language="regex")

                        # Raw JSON output
                        with st.expander("📄 View Raw JSON Output"):
                            st.json(result)

                        # Download results
                        st.markdown("---")
                        result_json = json.dumps(result, indent=2)
                        st.download_button(
                            label="⬇️ Download Results",
                            data=result_json,
                            file_name=f"analysis_{target_field}.json",
                            mime="application/json"
                        )

                    # FileAnalyzer raises ConnectionError/TimeoutError for
                    # transport failures; anything else is surfaced generically.
                    except ConnectionError as e:
                        st.error(f"❌ {e}")
                        if provider_name == "ollama":
                            st.info("💡 Start Ollama with: `ollama serve`")
                        else:
                            st.info("💡 Check your internet connection and API key")

                    except TimeoutError as e:
                        st.error(f"❌ {e}")
                        st.info("💡 The analysis took too long. Try again or use a larger timeout.")

                    except Exception as e:
                        st.error(f"❌ Error during analysis: {e}")
                        st.exception(e)

        except json.JSONDecodeError:
            st.error("❌ Invalid JSON file. Please upload a valid JSON file.")

        except Exception as e:
            st.error(f"❌ Error reading file: {e}")
            st.exception(e)

    else:
        # Show example when no file is uploaded
        st.info("👆 Please upload a JSON file to get started")

        with st.expander("📖 How it works"):
            st.markdown("""
            ### Workflow:

            1. **Upload**: Upload your JSON file using the file uploader above
            2. **Configure**: Set the target field name in the sidebar (default: `rotation_enabled`)
            3. **Analyze**: Click the "Analyze with LLM" button
            4. **Review**: View the important fields, reasoning, and regex patterns
            5. **Download**: Save the results as JSON

            ### What it does:

            - Analyzes your JSON structure to detect summary fields, configurations, and objects
            - Uses LLM to identify important fields related to your target
            - Generates regex patterns for data extraction and validation
            - Provides reasoning for why each field is important

            ### Use cases:

            - AWS compliance validation (KMS rotation, SSL enforcement, etc.)
            - Data quality checks
            - Automated validation pattern generation
            - Field correlation analysis
            """)
676
+
677
+
678
# Run the app. Under `streamlit run app.py` this module executes with
# __name__ == "__main__", so the guard does not change deployed behavior;
# it only keeps `import app` (tests, tooling) free of UI side effects.
if __name__ == "__main__":
    main()
680
+
requirements.txt CHANGED
@@ -1,3 +1,6 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
1
+ requests>=2.31.0
2
+ streamlit>=1.28.0
3
+ pandas>=2.0.0
4
+ openai>=1.0.0
5
+ anthropic>=0.7.0
6
+