bluestpanda committed on
Commit
9714df8
·
1 Parent(s): 4b7b107
Files changed (1) hide show
  1. app.py +289 -613
app.py CHANGED
@@ -1,20 +1,23 @@
1
  #!/usr/bin/env python3
2
  """
3
- File Upload Analyzer - Streamlit Frontend
4
- This is a copy of file_upload_app.py for Hugging Face Spaces deployment.
5
  """
6
 
7
  import streamlit as st
8
  import json
9
- import sys
10
- import os
11
  from pathlib import Path
12
  from typing import Dict, Any
13
  import io
14
 
15
- import requests
 
 
 
 
 
16
 
17
- # Try to import structure_analysis, fallback to inline if not available
18
  try:
19
  from structure_analysis import (
20
  detect_summary_fields,
@@ -22,659 +25,332 @@ try:
22
  get_hierarchy_summary
23
  )
24
  except ImportError:
25
- # Inline fallback implementations
26
- def detect_summary_fields(data: Any, path: str = "") -> list:
27
- """Detect summary fields."""
28
- fields = []
29
- summary_indicators = ['total', 'count', 'percentage', 'summary', 'aggregate', 'statistics', 'percent']
30
-
31
- def traverse(obj, current_path=""):
32
- if isinstance(obj, dict):
33
- for key, value in obj.items():
34
- field_path = f"{current_path}.{key}" if current_path else key
35
- if any(ind in key.lower() for ind in summary_indicators):
36
- fields.append(field_path)
37
- if isinstance(value, (dict, list)):
38
- traverse(value, field_path)
39
- elif isinstance(obj, list) and len(obj) > 0:
40
- traverse(obj[0], current_path)
41
-
42
- traverse(data, path)
43
- return fields
44
-
45
- def classify_data_structure(data: Any) -> dict:
46
- """Classify data structure."""
47
- return {
48
- 'summary_fields': [],
49
- 'config_fields': [],
50
- 'object_arrays': [],
51
- 'object_fields': []
52
- }
53
-
54
- def get_hierarchy_summary(data: Any) -> dict:
55
- """Get hierarchy summary."""
56
- return {
57
- 'has_summary': False,
58
- 'has_config': False,
59
- 'summary_fields': [],
60
- 'config_fields': [],
61
- 'levels_present': []
62
- }
63
 
64
- # Detect if running on Streamlit Cloud or Hugging Face
65
- IS_STREAMLIT_CLOUD = os.getenv("STREAMLIT_SHARING_BASE_URL") is not None
66
- IS_HUGGINGFACE = os.getenv("SPACE_ID") is not None
67
- IS_ONLINE = IS_STREAMLIT_CLOUD or IS_HUGGINGFACE
68
 
69
 
70
- # Page config - must be first
71
- st.set_page_config(
72
- page_title="JSON Field Analyzer",
73
- page_icon="πŸ“Š",
74
- layout="wide",
75
- initial_sidebar_state="expanded"
76
- )
77
-
78
- # Custom CSS
79
- st.markdown("""
80
- <style>
81
- .main > div {
82
- padding-top: 1rem;
83
- }
84
- .stButton>button {
85
- width: 100%;
86
- }
87
- h1 {
88
- font-size: 2rem;
89
- }
90
- h2 {
91
- font-size: 1.3rem;
92
- border-bottom: 2px solid #0e1117;
93
- padding-bottom: 0.3rem;
94
- }
95
- .highlight {
96
- background-color: #f0f2f6;
97
- color: #262730;
98
- padding: 1rem;
99
- border-radius: 5px;
100
- border-left: 4px solid #1f77b4;
101
- margin: 1rem 0;
102
- }
103
- .highlight p {
104
- color: #262730;
105
- margin: 0;
106
- }
107
- .result-box {
108
- background-color: #f0f2f6;
109
- padding: 1.5rem;
110
- border-radius: 10px;
111
- margin: 1rem 0;
112
- }
113
- </style>
114
- """, unsafe_allow_html=True)
115
-
116
-
117
- class FileAnalyzer:
118
- """Analyzer for uploaded JSON files."""
119
 
120
- OLLAMA_API_URL = "http://localhost:11434/api/generate"
121
- MODEL_NAME = "llama3.2:3b"
 
 
 
 
 
 
 
 
 
 
122
 
123
- def __init__(self, data: Dict[str, Any], llm_provider="ollama", api_key=None):
124
- self.data = data
125
- self.metadata = None
126
- self.llm_provider = llm_provider
127
- self.api_key = api_key
128
-
129
- def extract_metadata(self, target_field: str) -> Dict[str, Any]:
130
- """Extract key metadata from the JSON data for LLM analysis."""
131
- # Enhanced: Detect summary fields and classify structure
132
- summary_fields = detect_summary_fields(self.data)
133
- classification = classify_data_structure(self.data)
134
- hierarchy_summary = get_hierarchy_summary(self.data)
135
-
136
- # Try to find objects in the data structure
137
- objects_with_target = self._find_objects_with_target(target_field)
138
- total = len(objects_with_target)
139
- target_true = sum(1 for obj in objects_with_target if obj.get(target_field) is True)
140
- percentage = (target_true / total * 100) if total > 0 else 0
141
-
142
- metadata = {
143
- "total_objects": total,
144
- "target_count": target_true,
145
- "percentage": round(percentage, 2),
146
- "summary_fields_detected": summary_fields[:10],
147
- "classification": classification,
148
- "hierarchy_summary": hierarchy_summary,
149
- "has_summary_level": hierarchy_summary['has_summary'],
150
- "has_config_level": hierarchy_summary['has_config']
151
- }
152
-
153
- self.metadata = metadata
154
- return metadata
155
 
156
- def _find_objects_with_target(self, target_field: str) -> list:
157
- """Find all objects in the data structure that contain the target field."""
158
- found = []
159
-
160
- def find_fields(obj):
161
- if isinstance(obj, dict):
162
- if target_field in obj:
163
- found.append(obj)
164
- for value in obj.values():
165
- find_fields(value)
166
- elif isinstance(obj, list):
167
- for item in obj:
168
- find_fields(item)
169
-
170
- find_fields(self.data)
171
- return found
172
 
173
- def generate_prompt(self, target_field: str) -> str:
174
- """Generate a hierarchy-aware prompt for the LLM."""
175
- if not self.metadata:
176
- self.extract_metadata(target_field)
177
-
178
- hierarchy = self.metadata.get('hierarchy_summary', {})
179
- summary_fields = self.metadata.get('summary_fields_detected', [])
180
- classification = self.metadata.get('classification', {})
181
-
182
- # Get sample object
183
- sample = {}
184
- def find_sample(obj):
185
- if isinstance(obj, dict):
186
- if target_field in obj:
187
- return obj
188
- for v in obj.values():
189
- result = find_sample(v)
190
- if result:
191
- return result
192
- elif isinstance(obj, list) and len(obj) > 0:
193
- return find_sample(obj[0])
194
- return {}
195
-
196
- sample = find_sample(self.data)
197
-
198
- # Get summary sample
199
- summary_sample = self.data.get('results', {}).get('summary', {}) or self.data.get('summary', {})
200
-
201
- # Create samples
202
- sample_object = json.dumps({k: sample[k] for k in list(sample.keys())[:5]}, indent=2) if sample else "{}"
203
- sample_summary = json.dumps(summary_sample, indent=2) if summary_sample else "{}"
204
-
205
- # Build hierarchy instruction
206
- hierarchy_text = f"""
207
- DATA HIERARCHY (analyze in this priority order):
208
-
209
- LEVEL 1 - Summary/Aggregate Fields (HIGHEST PRIORITY):
210
- """
211
- if summary_fields:
212
- for field in summary_fields[:5]:
213
- hierarchy_text += f" βœ“ {field}\n"
214
- if len(summary_fields) > 5:
215
- hierarchy_text += f" ... and {len(summary_fields) - 5} more\n"
216
- else:
217
- hierarchy_text += " No summary fields detected\n"
218
-
219
- hierarchy_text += f"""
220
- LEVEL 2 - Configuration/Compliance Fields:
221
- """
222
- config_fields = classification.get('config_fields', [])
223
- if config_fields:
224
- for field in config_fields[:3]:
225
- hierarchy_text += f" βœ“ {field}\n"
226
- else:
227
- hierarchy_text += " No config fields detected\n"
228
-
229
- hierarchy_text += f"""
230
- LEVEL 3 - Individual Objects:
231
- βœ“ Sample object fields shown below
232
-
233
- CRITICAL INSTRUCTION: Check summary fields FIRST! They are the most important for validation.
234
- """
235
-
236
- prompt = f"""You are analyzing JSON data to identify important fields related to "{target_field}".
237
-
238
- {hierarchy_text}
239
-
240
- CONTEXT:
241
- - Total objects: {self.metadata.get('total_objects', 0)}
242
- - Objects with "{target_field}" = true: {self.metadata.get('target_count', 0)}
243
- - Percentage: {self.metadata.get('percentage', 0)}%
244
- - Has summary level data: {self.metadata.get('has_summary_level', False)}
245
-
246
- SAMPLE SUMMARY DATA (check this first):
247
- {sample_summary}
248
-
249
- SAMPLE OBJECT DATA:
250
- {sample_object}
251
-
252
- TASK:
253
- Identify 3-4 important fields related to "{target_field}" in this priority order:
254
- 1. FIRST: Summary/aggregate fields (totals, percentages, counts)
255
- 2. SECOND: Configuration/compliance fields
256
- 3. THIRD: Individual object fields (if needed)
257
-
258
- Generate regex patterns that match JSON format (with quotes).
259
-
260
- VALIDATION PATTERN EXAMPLES:
261
- - Compare two aggregate values: "field1"\\s*:\\s*(\\d+)[\\s\\S]*?"field2"\\s*:\\s*(\\d+)
262
- - Extract percentage: "field_percentage"\\s*:\\s*(\\d+)
263
- - Extract boolean: "field_name"\\s*:\\s*(true|false)
264
- - Extract status: "compliance"\\s*:\\s*"([^"]*)"
265
-
266
- Output ONLY valid JSON:
267
- {{
268
- "test_name": "Field Analysis: {target_field}",
269
- "important_fields": ["field1", "field2", "field3"],
270
- "reasoning": "Explain prioritization and why these fields matter",
271
- "generated_regex": ["regex1", "regex2", "regex3"]
272
- }}
273
- """
274
-
275
- return prompt
276
-
277
- def call_llm(self, prompt: str) -> str:
278
- """Call the appropriate LLM based on provider."""
279
- if self.llm_provider == "ollama":
280
- return self._call_ollama(prompt)
281
- elif self.llm_provider == "openai":
282
- return self._call_openai(prompt)
283
- elif self.llm_provider == "anthropic":
284
- return self._call_anthropic(prompt)
285
- elif self.llm_provider == "huggingface":
286
- return self._call_huggingface(prompt)
287
- else:
288
- raise ValueError(f"Unknown LLM provider: {self.llm_provider}")
289
-
290
- def _call_ollama(self, prompt: str) -> str:
291
- """Call the Ollama API to generate a response."""
292
- try:
293
- payload = {
294
- "model": self.MODEL_NAME,
295
- "prompt": prompt,
296
- "stream": False,
297
- "format": "json"
298
- }
299
-
300
- response = requests.post(self.OLLAMA_API_URL, json=payload, timeout=120)
301
- response.raise_for_status()
302
-
303
- result = response.json()
304
- return result.get('response', '')
305
-
306
- except requests.exceptions.ConnectionError:
307
- raise ConnectionError("Cannot connect to Ollama. Make sure Ollama is running.")
308
- except requests.exceptions.Timeout:
309
- raise TimeoutError("Ollama request timed out.")
310
- except requests.exceptions.RequestException as e:
311
- raise Exception(f"Failed to call Ollama API - {e}")
312
-
313
- def _call_openai(self, prompt: str) -> str:
314
- """Call the OpenAI API to generate a response."""
315
- try:
316
- from openai import OpenAI
317
-
318
- client = OpenAI(api_key=self.api_key)
319
-
320
- response = client.chat.completions.create(
321
- model="gpt-4o-mini",
322
- messages=[
323
- {"role": "system", "content": "You are a JSON data analysis assistant. Always respond with valid JSON."},
324
- {"role": "user", "content": prompt}
325
- ],
326
- temperature=0.3,
327
- max_tokens=2000
328
- )
329
-
330
- return response.choices[0].message.content
331
-
332
- except ImportError:
333
- raise ImportError("OpenAI library not installed. Install with: pip install openai")
334
- except Exception as e:
335
- raise Exception(f"Failed to call OpenAI API - {e}")
336
 
337
- def _call_anthropic(self, prompt: str) -> str:
338
- """Call the Anthropic API to generate a response."""
339
- try:
340
- from anthropic import Anthropic
341
-
342
- client = Anthropic(api_key=self.api_key)
343
-
344
- response = client.messages.create(
345
- model="claude-3-5-sonnet-20241022",
346
- max_tokens=2000,
347
- temperature=0.3,
348
- system="You are a JSON data analysis assistant. Always respond with valid JSON.",
349
- messages=[
350
- {"role": "user", "content": prompt}
351
- ]
352
- )
353
-
354
- return response.content[0].text
355
-
356
- except ImportError:
357
- raise ImportError("Anthropic library not installed. Install with: pip install anthropic")
358
- except Exception as e:
359
- raise Exception(f"Failed to call Anthropic API - {e}")
360
-
361
- def _call_huggingface(self, prompt: str) -> str:
362
- """Call the Hugging Face Inference API (FREE) to generate a response."""
363
- try:
364
- # Use a good free model for text generation
365
- model_name = self.api_key or "mistralai/Mistral-7B-Instruct-v0.3" # Default free model
366
-
367
- headers = {
368
- "Authorization": f"Bearer {self.api_key}" if self.api_key else None,
369
- "Content-Type": "application/json"
370
- }
371
- # Remove None values
372
- headers = {k: v for k, v in headers.items() if v is not None}
373
-
374
- # Create a properly formatted prompt
375
- full_prompt = f"""<s>[INST]You are a JSON data analysis assistant. Always respond with valid JSON only, no explanations.
376
-
377
- {prompt}[/INST]"""
378
-
379
- payload = {
380
- "inputs": full_prompt,
381
- "parameters": {
382
- "max_new_tokens": 1000,
383
- "temperature": 0.3,
384
- "return_full_text": False
385
- }
386
- }
387
-
388
- api_url = f"https://api-inference.huggingface.co/models/{model_name}"
389
- response = requests.post(api_url, json=payload, headers=headers, timeout=60)
390
-
391
- if response.status_code == 503:
392
- raise Exception("Model is loading. Please wait a moment and try again.")
393
-
394
- response.raise_for_status()
395
- result = response.json()
396
-
397
- # Handle different response formats
398
- if isinstance(result, list) and len(result) > 0:
399
- return result[0].get('generated_text', '')
400
- elif isinstance(result, dict):
401
- return result.get('generated_text', '')
402
- else:
403
- return str(result)
404
-
405
- except Exception as e:
406
- raise Exception(f"Failed to call Hugging Face API - {e}")
407
 
408
- def parse_llm_output(self, output: str) -> Dict[str, Any]:
409
- """Parse and validate the LLM JSON output."""
410
- try:
411
- output = output.strip()
412
- if output.startswith("```json"):
413
- output = output[7:]
414
- if output.startswith("```"):
415
- output = output[3:]
416
- if output.endswith("```"):
417
- output = output[:-3]
418
- output = output.strip()
419
-
420
- result = json.loads(output)
421
- return result
422
-
423
- except json.JSONDecodeError as e:
424
- raise ValueError(f"LLM output is not valid JSON - {e}")
425
 
426
- def analyze(self, target_field: str = "rotation_enabled") -> Dict[str, Any]:
427
- """Main analysis function."""
428
- self.extract_metadata(target_field)
429
- prompt = self.generate_prompt(target_field)
430
- llm_output = self.call_llm(prompt)
431
- result = self.parse_llm_output(llm_output)
432
- return result
433
 
434
 
435
- def main():
436
- """Main Streamlit application."""
437
- st.title("πŸ“Š JSON Field Analyzer")
438
-
439
- if IS_HUGGINGFACE:
440
- st.info("πŸ†“ Running on Hugging Face - FREE Hugging Face AI model available! No API key needed.")
441
 
442
- st.markdown("**Upload a JSON file and analyze important fields using LLM**")
443
-
444
- # Sidebar for configuration
445
- with st.sidebar:
446
- st.header("βš™οΈ Configuration")
447
-
448
- # Show environment info
449
- if IS_ONLINE and not IS_HUGGINGFACE:
450
- st.info("🌐 Running online - Cloud LLM required")
451
 
452
- # LLM Provider Selection
453
- # Default to Hugging Face (free) if online, Ollama on local
454
- if IS_ONLINE:
455
- default_index = 3 # Hugging Face (Free)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
456
  else:
457
- default_index = 0 # Ollama
458
-
459
- llm_provider = st.selectbox(
460
- "πŸ€– LLM Provider",
461
- ["Ollama (Local)", "OpenAI (Cloud)", "Anthropic Claude (Cloud)", "Hugging Face (Free 🌟)"],
462
- index=default_index,
463
- help="Choose your LLM provider - Hugging Face is FREE and no API key needed!"
464
- )
465
-
466
- # Extract provider name and model
467
- if llm_provider == "Ollama (Local)":
468
- provider_name = "ollama"
469
- api_key = None
470
- if IS_ONLINE:
471
- st.error("❌ Ollama not available on this platform")
472
- st.markdown("**Please select a cloud LLM provider:**")
473
- st.markdown("- OpenAI (Cloud) - GPT-4o Mini")
474
- st.markdown("- Anthropic Claude (Cloud) - Recommended")
475
  else:
476
- st.info("πŸ“ Using local Ollama")
477
- elif llm_provider == "OpenAI (Cloud)":
478
- provider_name = "openai"
479
- api_key = os.getenv("OPENAI_API_KEY") or st.text_input(
480
- "OpenAI API Key",
481
- type="password",
482
- help="Enter your OpenAI API key (or set OPENAI_API_KEY env var)"
483
- )
484
- if not api_key:
485
- st.warning("⚠️ Please enter your OpenAI API key")
486
- st.info("πŸ’‘ Get key: https://platform.openai.com/api-keys")
487
- elif llm_provider == "Anthropic Claude (Cloud)":
488
- provider_name = "anthropic"
489
- api_key = os.getenv("ANTHROPIC_API_KEY") or st.text_input(
490
- "Anthropic API Key",
491
- type="password",
492
- help="Enter your Anthropic API key (or set ANTHROPIC_API_KEY env var)"
493
- )
494
- if not api_key:
495
- st.warning("⚠️ Please enter your Anthropic API key")
496
- st.info("πŸ’‘ Get key: https://console.anthropic.com")
497
- else: # Hugging Face (Free)
498
- provider_name = "huggingface"
499
- api_key = os.getenv("HUGGINGFACE_API_KEY") or st.text_input(
500
- "Hugging Face API Key (Optional)",
501
- type="password",
502
- help="Optional: Enter your HF token for faster inference (or set HUGGINGFACE_API_KEY env var)"
503
- )
504
- if not api_key:
505
- st.info("✨ Using free Hugging Face Inference API - no key needed!")
506
- st.info("πŸ’‘ Optional: Add your token in Settings > Secrets for better performance")
507
-
508
- st.markdown("---")
509
-
510
- target_field = st.text_input(
511
- "Target Field",
512
- value="rotation_enabled",
513
- help="The field you want to analyze (e.g., rotation_enabled, ssl_enforced)"
514
- )
515
-
516
- st.markdown("---")
517
- st.markdown("### πŸ“‹ Setup Guides")
518
-
519
- with st.expander("πŸ”§ Local Ollama Setup"):
520
- st.code("""
521
- brew install ollama
522
- ollama serve
523
- ollama pull llama3.2:3b
524
- """, language="bash")
525
-
526
- with st.expander("☁️ Cloud API Setup"):
527
- st.markdown("""
528
- **OpenAI:**
529
- - Get key: https://platform.openai.com/api-keys
530
- - Model: GPT-4o Mini
531
-
532
- **Anthropic:**
533
- - Get key: https://console.anthropic.com
534
- - Model: Claude 3.5 Sonnet
535
- """)
536
 
537
- # File upload section
538
- st.markdown("---")
539
- st.header("πŸ“€ Upload JSON File")
 
 
 
 
540
 
 
541
  uploaded_file = st.file_uploader(
542
  "Choose a JSON file",
543
  type=['json'],
544
- help="Upload a JSON file to analyze"
545
  )
546
 
547
- # Display file info if uploaded
548
  if uploaded_file is not None:
 
549
  try:
550
- # Read file contents
551
  content = uploaded_file.read()
552
  data = json.loads(content)
553
 
554
- st.success("βœ… File uploaded successfully!")
555
 
556
- # Show file info
557
- col1, col2 = st.columns(2)
558
- with col1:
559
- st.metric("File Size", f"{len(content) / 1024:.2f} KB")
560
- with col2:
561
- st.metric("JSON Structure", "Valid" if isinstance(data, (dict, list)) else "Invalid")
562
-
563
- # Analyze button
564
- st.markdown("---")
565
-
566
- col1, col2, col3 = st.columns([1, 2, 1])
567
- with col2:
568
- analyze_button = st.button("πŸ” Analyze with LLM", type="primary", use_container_width=True)
569
-
570
- # Run analysis
571
- if analyze_button:
572
- # Prevent Ollama usage on online platforms
573
- if provider_name == "ollama" and IS_ONLINE:
574
- st.error("❌ Ollama is not available on this platform")
575
- st.info("πŸ’‘ Please select 'Anthropic Claude (Cloud)' or 'OpenAI (Cloud)' from the sidebar")
576
 
577
- # Validate API key for cloud providers (except Hugging Face which is optional)
578
- elif provider_name in ["openai", "anthropic"] and not api_key:
579
- st.error("❌ Please enter an API key for the selected cloud provider")
580
- else:
581
- try:
582
- with st.spinner(f"Analyzing with {llm_provider}... This may take a moment."):
583
- analyzer = FileAnalyzer(data, llm_provider=provider_name, api_key=api_key)
584
- result = analyzer.analyze(target_field=target_field)
585
-
586
- # Display results
587
- st.markdown("---")
588
- st.header("πŸ“Š Analysis Results")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
589
 
590
- # Main results in columns
591
- col1, col2 = st.columns(2)
 
 
 
 
592
 
593
- with col1:
594
- st.subheader("πŸ€– Important Fields")
595
- for i, field in enumerate(result.get('important_fields', []), 1):
596
- st.markdown(f"**{i}. {field}**")
 
 
 
 
 
 
 
 
 
 
 
 
 
597
 
598
- with col2:
599
- st.subheader("πŸ’‘ Reasoning")
600
- st.markdown(f'<div class="highlight">{result.get("reasoning", "N/A")}</div>',
601
- unsafe_allow_html=True)
602
 
603
- # Regex patterns
604
- st.markdown("---")
605
- st.subheader("πŸ”§ Generated Regex Patterns")
 
 
606
 
607
- regex_patterns = result.get('generated_regex', [])
608
- for i, pattern in enumerate(regex_patterns, 1):
609
- st.markdown(f"**Pattern {i}:**")
610
- st.code(pattern, language="regex")
 
 
 
611
 
612
- # Raw JSON output
613
- with st.expander("πŸ“„ View Raw JSON Output"):
614
- st.json(result)
 
 
 
 
615
 
616
- # Download results
617
- st.markdown("---")
618
- result_json = json.dumps(result, indent=2)
619
  st.download_button(
620
- label="⬇️ Download Results",
621
- data=result_json,
622
- file_name=f"analysis_{target_field}.json",
623
  mime="application/json"
624
  )
625
-
626
- except ConnectionError as e:
627
- st.error(f"❌ {e}")
628
- if provider_name == "ollama":
629
- st.info("πŸ’‘ Start Ollama with: `ollama serve`")
630
- else:
631
- st.info("πŸ’‘ Check your internet connection and API key")
632
-
633
- except TimeoutError as e:
634
- st.error(f"❌ {e}")
635
- st.info("πŸ’‘ The analysis took too long. Try again or use a larger timeout.")
636
-
637
- except Exception as e:
638
- st.error(f"❌ Error during analysis: {e}")
639
- st.exception(e)
640
 
641
- except json.JSONDecodeError:
642
- st.error("❌ Invalid JSON file. Please upload a valid JSON file.")
643
-
 
 
 
 
 
 
 
 
 
 
644
  except Exception as e:
645
- st.error(f"❌ Error reading file: {e}")
646
- st.exception(e)
647
 
648
  else:
649
- # Show example when no file is uploaded
650
- st.info("πŸ‘† Please upload a JSON file to get started")
651
 
652
- with st.expander("πŸ“– How it works"):
653
  st.markdown("""
654
- ### Workflow:
655
-
656
- 1. **Upload**: Upload your JSON file using the file uploader above
657
- 2. **Configure**: Set the target field name in the sidebar (default: `rotation_enabled`)
658
- 3. **Analyze**: Click the "Analyze with LLM" button
659
- 4. **Review**: View the important fields, reasoning, and regex patterns
660
- 5. **Download**: Save the results as JSON
661
-
662
- ### What it does:
663
-
664
- - Analyzes your JSON structure to detect summary fields, configurations, and objects
665
- - Uses LLM to identify important fields related to your target
666
- - Generates regex patterns for data extraction and validation
667
- - Provides reasoning for why each field is important
668
-
669
- ### Use cases:
670
-
671
- - AWS compliance validation (KMS rotation, SSL enforcement, etc.)
672
- - Data quality checks
673
- - Automated validation pattern generation
674
- - Field correlation analysis
675
  """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
676
 
677
 
678
- # Call main function - Streamlit will handle errors
679
- main()
680
-
 
1
  #!/usr/bin/env python3
2
  """
3
+ Hugging Face Streamlit App for LLM Field Analyzer
4
+ Upload a JSON file and analyze important fields with pattern generation.
5
  """
6
 
7
  import streamlit as st
8
  import json
 
 
9
  from pathlib import Path
10
  from typing import Dict, Any
11
  import io
12
 
13
# Page configuration (kept ahead of every other Streamlit call in this script).
# The icon is the robot emoji; the mis-encoded byte sequence has been restored.
st.set_page_config(
    page_title="Field Correlation Analyzer",
    page_icon="🤖",
    layout="wide",
)
19
 
20
+ # Import our modules
21
  try:
22
  from structure_analysis import (
23
  detect_summary_fields,
 
25
  get_hierarchy_summary
26
  )
27
  except ImportError:
28
+ st.error("⚠️ structure_analysis.py not found. Make sure all files are uploaded.")
29
+ st.stop()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
# Session state: create the slot that holds the latest analysis on the first
# script run so later reads of it never hit a missing key.
if 'analysis_result' not in st.session_state:
    st.session_state['analysis_result'] = None
 
34
 
35
 
36
def _count_dicts_with_key(node: Any, key: str) -> int:
    """Count the dicts, at any depth of ``node``, that have ``key`` as a key."""
    if isinstance(node, dict):
        return (key in node) + sum(_count_dicts_with_key(v, key) for v in node.values())
    if isinstance(node, list):
        return sum(_count_dicts_with_key(item, key) for item in node)
    return 0


def _first_sample_object(results: Any) -> Dict[str, Any]:
    """Return the first dict found at the head of a list under ``results``.

    A section of ``results`` is either a list itself or a dict whose values
    may be lists. The search stops at the first list whose first element is
    a dict, so later sections can no longer overwrite an earlier sample.
    """
    if isinstance(results, list):
        sections = [results]
    elif isinstance(results, dict):
        sections = list(results.values())
    else:
        return {}

    for section in sections:
        if isinstance(section, list):
            candidate_lists = [section]
        elif isinstance(section, dict):
            candidate_lists = [v for v in section.values() if isinstance(v, list)]
        else:
            continue
        for items in candidate_lists:
            if items and isinstance(items[0], dict):
                return items[0]
    return {}


def analyze_with_llm(data: Dict[str, Any], target_field: str = "rotation_enabled") -> Dict[str, Any]:
    """
    Analyze the structure of parsed JSON data around ``target_field``.

    Despite the name, no LLM is called: the function combines the
    ``structure_analysis`` helpers with a few structural heuristics.

    Args:
        data: Parsed JSON document. A dict is expected; other top-level
            values (e.g. a list) are tolerated and simply yield empty samples.
        target_field: Name of the field the analysis is centred on.

    Returns:
        A dict with the keys ``summary_fields_detected`` (at most 10 entries),
        ``classification``, ``hierarchy_summary``, ``total_objects`` (number
        of dicts at any depth that contain ``target_field``),
        ``sample_object``, ``summary_sample`` and ``recommended_fields``
        (ordered by priority, without duplicates).
    """
    summary_fields = detect_summary_fields(data)
    classification = classify_data_structure(data)
    hierarchy_summary = get_hierarchy_summary(data)

    # Only a dict root can carry the 'results' / 'summary' sections.
    root = data if isinstance(data, dict) else {}
    results = root.get('results')

    sample_object = _first_sample_object(results)

    # Prefer results.summary and fall back to a top-level summary.
    nested_summary = results.get('summary') if isinstance(results, dict) else None
    summary_sample = nested_summary or root.get('summary', {})
    if not isinstance(summary_sample, dict):
        summary_sample = {}

    total_objects = _count_dicts_with_key(data, target_field)

    # Recommend fields by priority: summary fields, then config fields, then
    # keys of the sample object whose name contains the target field.
    recommended = list(summary_fields[:3])
    recommended.extend((classification.get('config_fields') or [])[:2])
    needle = target_field.lower()
    if needle:  # an empty needle would match every key
        recommended.extend(k for k in sample_object if needle in k.lower())

    return {
        "summary_fields_detected": summary_fields[:10],
        "classification": classification,
        "hierarchy_summary": hierarchy_summary,
        "total_objects": total_objects,
        "sample_object": sample_object,
        "summary_sample": summary_sample,
        # dict.fromkeys removes duplicates while keeping priority order.
        "recommended_fields": list(dict.fromkeys(recommended)),
    }
 
 
 
 
 
 
96
 
97
 
98
# Sentinel for "field not present in the sample" (None is a valid JSON value).
_MISSING = object()

# Capture groups appended after the '"key"\s*:\s*' prefix.
_BOOL_CAPTURE = '(true|false)'
_INT_CAPTURE = r'(\d+)'
_NUMBER_CAPTURE = r'(-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)'
_STRING_CAPTURE = '"([^"]*)"'


def _capture_for_value(value):
    """Return the capture group matching ``value``'s JSON type, or None."""
    if isinstance(value, bool):  # checked before int: bool subclasses int
        return _BOOL_CAPTURE
    if isinstance(value, int):
        # Non-negative ints keep the plain digit group; a negative sample
        # needs the signed form to match its own value.
        return _INT_CAPTURE if value >= 0 else _NUMBER_CAPTURE
    if isinstance(value, float):
        return _NUMBER_CAPTURE
    if isinstance(value, str):
        return _STRING_CAPTURE
    return None


def _capture_for_name(field_lower):
    """Guess a capture group from keywords in the lower-cased field name."""
    if any(word in field_lower for word in ('percentage', 'count', 'total')):
        return _INT_CAPTURE
    if 'enabled' in field_lower or 'enforced' in field_lower:
        return _BOOL_CAPTURE
    return _STRING_CAPTURE


def generate_regex_patterns(field_names: list, data_sample: dict, summary_sample: dict) -> list:
    """Generate one regex pattern per field, in the order of ``field_names``.

    Each pattern has the form ``"<key>"\\s*:\\s*<capture>``. The capture group
    follows the type of the field's sample value; when there is no usable
    sample value (field absent, or value is null / list / dict) it is guessed
    from the field name. Returning exactly one pattern per field keeps the
    result aligned with ``field_names`` for callers that pair the two lists.

    Args:
        field_names: Field names or dotted paths.
        data_sample: Sample object used to look up non-summary fields
            (``None`` is treated as empty).
        summary_sample: Summary dict used for fields whose name contains
            ``summary``; looked up by the last dotted component
            (``None`` is treated as empty).

    Returns:
        A list of pattern strings with ``len(result) == len(field_names)``.
    """
    data_sample = data_sample or {}
    summary_sample = summary_sample or {}

    patterns = []
    for field in field_names:
        field = str(field)
        if 'summary' in field:
            leaf = field.split('.')[-1]
            # NOTE(review): the key keeps the original "summary.<name>" form,
            # where '.' is an unescaped regex wildcard - confirm this is the
            # shape of the text the patterns are run against.
            key = f'summary.{leaf}'
            value = summary_sample.get(leaf, _MISSING)
        else:
            key = field
            value = data_sample.get(field, _MISSING)

        capture = _capture_for_value(value) or _capture_for_name(field.lower())
        patterns.append(f'"{key}"' + r'\s*:\s*' + capture)

    return patterns
134
+
135
+
136
def _discard_generated_patterns() -> None:
    """Forget previously generated patterns, if any."""
    if 'generated_patterns' in st.session_state:
        del st.session_state['generated_patterns']


def _render_structure_tab(analysis: Dict[str, Any]) -> None:
    """Render the detected hierarchy levels and the two data samples."""
    st.subheader("Data Hierarchy")

    if analysis['summary_fields_detected']:
        st.markdown("#### Level 1: Summary/Aggregate Fields (Highest Priority)")
        for field in analysis['summary_fields_detected'][:10]:
            st.write(f"✓ `{field}`")

    config_fields = analysis['classification'].get('config_fields', [])
    if config_fields:
        st.markdown("#### Level 2: Configuration/Compliance Fields")
        for field in config_fields[:10]:
            st.write(f"✓ `{field}`")

    object_arrays = analysis['classification'].get('object_arrays', [])
    if object_arrays:
        st.markdown("#### Level 3: Object Arrays")
        for field in object_arrays[:5]:
            st.write(f"✓ `{field}`")

    with st.expander("📋 View Summary Data Sample"):
        st.json(analysis['summary_sample'])

    with st.expander("📋 View Object Data Sample"):
        st.json(analysis['sample_object'])


def _render_recommendations_tab(analysis: Dict[str, Any]) -> None:
    """Let the user pick recommended fields and generate patterns for them."""
    st.subheader("Recommended Fields for Analysis")

    # De-duplicate while keeping order so the multiselect options are unique.
    recommended = list(dict.fromkeys(analysis['recommended_fields']))
    if not recommended:
        st.warning("No recommended fields found.")
        return

    st.info("These fields are recommended based on the data hierarchy and target field.")

    selected_fields = st.multiselect(
        "Select fields to generate patterns for:",
        recommended,
        default=recommended[:3]
    )

    if selected_fields and st.button("Generate Patterns"):
        st.session_state['generated_patterns'] = {
            'fields': selected_fields,
            'patterns': generate_regex_patterns(
                selected_fields,
                analysis['sample_object'],
                analysis['summary_sample']
            )
        }


def _render_patterns_tab() -> None:
    """Show the generated patterns with copy and download helpers."""
    patterns_data = st.session_state.get('generated_patterns')
    if patterns_data is None:
        st.info("👆 Go to 'Field Recommendations' tab to select fields and generate patterns.")
        return

    st.subheader("Generated Regex Patterns")

    fields = patterns_data['fields']
    patterns = patterns_data['patterns']
    # Only label patterns with field names when the two lists line up;
    # otherwise a positional pairing would attach patterns to the wrong field.
    aligned = len(fields) == len(patterns)
    for i, pattern in enumerate(patterns, 1):
        title = f"**Pattern {i}: {fields[i - 1]}**" if aligned else f"**Pattern {i}**"
        st.markdown(title)
        st.code(pattern, language="regex", line_numbers=False)
        st.markdown("---")

    # Plain text area so all patterns can be copied at once.
    st.text_area(
        "All Patterns (copy this):",
        "\n".join(patterns),
        height=100
    )

    export_data = {
        "test_name": "Field Analysis",
        "important_fields": fields,
        "reasoning": "Fields identified using hierarchical analysis prioritizing summary/aggregate fields",
        "generated_regex": patterns
    }
    st.download_button(
        label="📥 Download as JSON",
        data=json.dumps(export_data, indent=2),
        file_name="analysis_result.json",
        mime="application/json"
    )


def _render_raw_tab(data: Any) -> None:
    """Show the uploaded document and offer it for download."""
    st.subheader("Raw Data Structure")
    st.json(data)
    st.download_button(
        label="📥 Download Raw Data",
        data=json.dumps(data, indent=2),
        file_name="raw_data.json",
        mime="application/json"
    )


def _render_getting_started() -> None:
    """Show usage help and an example document when nothing is uploaded."""
    st.info("👆 Please upload a JSON file to begin analysis")

    with st.expander("📖 How to use"):
        st.markdown("""
        **Steps:**
        1. Upload a JSON file with structured data
        2. Set the target field you want to analyze (e.g., `rotation_enabled`)
        3. Click "Analyze" to process the data
        4. Review the structure analysis and field recommendations
        5. Select fields and generate regex patterns
        6. Download the results as JSON

        **What this tool does:**
        - Detects summary/aggregate fields automatically
        - Classifies data structure by hierarchy levels
        - Recommends important fields for validation
        - Generates regex patterns for field extraction
        """)

    with st.expander("📋 Example JSON Structure"):
        example = {
            "results": {
                "summary": {
                    "total_keys": 13,
                    "rotated_keys": 6,
                    "rotation_percentage": 46
                },
                "kms_keys": {
                    "object": [
                        {
                            "key_id": "12345",
                            "rotation_enabled": True,
                            "key_state": "Enabled"
                        }
                    ]
                }
            }
        }
        st.json(example)


def main():
    """Main application: upload a JSON file, analyze it and show the results.

    The analysis and the generated patterns live in ``st.session_state`` so
    they survive Streamlit reruns. They are tied to a fingerprint of the
    uploaded bytes and dropped when a different file is uploaded; generated
    patterns are also dropped whenever a new analysis is run.
    """
    import hashlib  # local import: only needed to fingerprint the upload

    st.title("🤖 Field Correlation Analyzer")
    st.markdown("Upload a JSON file to analyze important fields and generate regex patterns")

    uploaded_file = st.file_uploader(
        "Choose a JSON file",
        type=['json'],
        help="Upload a JSON file with structured data"
    )

    if uploaded_file is None:
        _render_getting_started()
        return

    try:
        # getvalue() returns the full content regardless of stream position.
        content = uploaded_file.getvalue()
        data = json.loads(content)

        st.success("✅ File loaded successfully!")

        # Results computed for another upload must not be shown for this one.
        upload_id = hashlib.sha256(content).hexdigest()
        if st.session_state.get('analyzed_upload') != upload_id:
            st.session_state['analysis_result'] = None
            _discard_generated_patterns()

        # NOTE(review): the Analyze button is placed inside the sidebar with
        # the target-field input - confirm against the deployed layout.
        with st.sidebar:
            st.header("⚙️ Settings")

            target_field = st.text_input(
                "Target Field",
                value="rotation_enabled",
                help="The field you want to analyze"
            )

            if st.button("🔍 Analyze", type="primary"):
                with st.spinner("Analyzing data structure..."):
                    st.session_state['analysis_result'] = analyze_with_llm(data, target_field)
                    st.session_state['data'] = data
                    st.session_state['analyzed_upload'] = upload_id
                    # Patterns from an earlier analysis no longer apply.
                    _discard_generated_patterns()

        analysis = st.session_state.get('analysis_result')
        if analysis:
            # Summary metrics
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                st.metric("Summary Fields", len(analysis['summary_fields_detected']))
            with col2:
                st.metric("Total Objects", analysis['total_objects'])
            with col3:
                st.metric("Has Summary", "Yes" if analysis['hierarchy_summary']['has_summary'] else "No")
            with col4:
                st.metric("Config Fields", len(analysis['classification'].get('config_fields', [])))

            st.markdown("---")

            tab1, tab2, tab3, tab4 = st.tabs([
                "📊 Structure Analysis",
                "🎯 Field Recommendations",
                "📝 Generated Patterns",
                "📄 Raw Data"
            ])

            with tab1:
                _render_structure_tab(analysis)
            with tab2:
                _render_recommendations_tab(analysis)
            with tab3:
                _render_patterns_tab()
            with tab4:
                _render_raw_tab(data)

    except json.JSONDecodeError as e:
        st.error(f"❌ Invalid JSON file: {e}")
    except Exception as e:
        # Top-level UI boundary: report the problem instead of crashing.
        st.error(f"❌ Error processing file: {e}")


if __name__ == "__main__":
    main()