# app.py — uploaded via the Hugging Face web UI (upload metadata removed; it broke Python syntax)
#!/usr/bin/env python3
"""
NZ Legislation Loophole Analysis - Hugging Face Spaces App
Root-level app.py for Hugging Face Spaces deployment.
Adapted for Spaces memory constraints and session-based caching.
"""
import streamlit as st
import sys
import os
import warnings
from pathlib import Path
# Add current directory to Python path for imports
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')
# Import core modules with error handling for Spaces: a missing package
# surfaces as a readable Streamlit error instead of a raw traceback.
try:
    from streamlit_app.core.cache_manager import CacheManager, get_cache_manager
    from streamlit_app.core.text_processor import TextProcessor
    from streamlit_app.core.llm_analyzer import LLMAnalyzer
    from streamlit_app.core.dataset_builder import DatasetBuilder
    from streamlit_app.utils.config import ConfigManager
    from streamlit_app.utils.performance import PerformanceMonitor
    from streamlit_app.utils.ui_helpers import UIHelpers
except ImportError as e:
    # Fail fast with a user-visible message; st.stop() halts script execution.
    st.error(f"❌ Import Error: {e}")
    st.error("Please ensure all required packages are installed.")
    st.stop()
# Configure page settings for Spaces — st.set_page_config must be the first
# Streamlit call in the script.
_MENU_ITEMS = {
    'Get Help': 'https://huggingface.co/spaces/your-space',
    'Report a bug': 'https://github.com/your-repo/issues',
    'About': '''
## NZ Legislation Loophole Analyzer
AI-powered analysis of New Zealand legislation to identify
potential loopholes, ambiguities, and unintended consequences.
**Version:** 1.0.0 (Spaces Edition)
**Platform:** Hugging Face Spaces
**Built with:** Streamlit & Llama.cpp
''',
}

st.set_page_config(
    page_title="NZ Legislation Loophole Analyzer",
    page_icon="βš–οΈ",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items=_MENU_ITEMS,
)
# Spaces-specific configuration: conservative defaults sized for the
# shared-memory environment of a Hugging Face Space.
SPACES_CONFIG = {
    'max_memory_mb': 512,        # Conservative memory limit for Spaces
    'cache_enabled': True,
    'persistent_cache': False,   # Session-based only; no disk persistence on Spaces
    'model_path': 'qwen3.gguf',  # Default model
    'context_length': 4096,      # Smaller context for memory
    'max_tokens': 2048,          # Smaller responses
}
def initialize_spaces_session():
    """Populate st.session_state with Spaces-optimized defaults (idempotent).

    Safe to call on every rerun: each key is created only when missing, so
    existing session objects (cache, config, results) are never clobbered.
    """
    ss = st.session_state

    if 'cache_manager' not in ss:
        # Session-scoped cache only: Spaces offers no persistent storage.
        ss.cache_manager = CacheManager(
            max_memory_mb=SPACES_CONFIG['max_memory_mb'],
            persistent=False,
            ttl_hours=1,  # short TTL keeps the memory footprint small
        )

    if 'config_manager' not in ss:
        ss.config_manager = ConfigManager()
        # Overlay Spaces-friendly defaults on top of the stock configuration.
        ss.config_manager.update_config({
            'model': {
                'path': SPACES_CONFIG['model_path'],
                'context_length': SPACES_CONFIG['context_length'],
                'max_tokens': SPACES_CONFIG['max_tokens'],
                'temperature': 0.3,
                'top_p': 0.85,
            },
            'cache': {
                'enabled': SPACES_CONFIG['cache_enabled'],
                'max_size_mb': SPACES_CONFIG['max_memory_mb'],
                'persistent': False,
                'ttl_hours': 1,
            },
            'processing': {
                'chunk_size': 2048,   # smaller chunks for memory
                'chunk_overlap': 128,
                'batch_size': 4,      # smaller batch size
                'clean_text': True,
            },
        })

    if 'performance_monitor' not in ss:
        ss.performance_monitor = PerformanceMonitor(max_history=100)
    if 'text_processor' not in ss:
        ss.text_processor = TextProcessor()

    # Plain-value defaults that need no constructor arguments.
    plain_defaults = {
        'current_analysis': None,
        'analysis_results': [],
        'processing_status': {
            'is_running': False,
            'progress': 0,
            'current_task': '',
            'total_chunks': 0,
            'processed_chunks': 0,
        },
        'model_loaded': False,
        'llm_analyzer': None,
    }
    for key, default in plain_defaults.items():
        if key not in ss:
            ss[key] = default
def show_spaces_optimized_home_page():
    """Render the landing page: feature overview plus the live configuration."""
    st.title("🏠 NZ Legislation Loophole Analyzer")
    st.markdown("### AI-Powered Legal Analysis (Spaces Edition)")

    # Environment caveats specific to Hugging Face Spaces.
    with st.expander("⚠️ Spaces Environment Notes", expanded=False):
        st.info("""
**Running on Hugging Face Spaces**
- Memory optimized for cloud deployment
- Session-based caching (resets between visits)
- Use smaller models for best performance
- Analysis results persist during your session
""")

    left, right = st.columns([2, 1])

    with left:
        st.markdown("""
This AI-powered tool analyzes New Zealand legislation to identify:
πŸ” **Potential Loopholes** - Legal ambiguities that could be exploited
πŸ“‹ **Unintended Consequences** - Hidden implications in legislative language
βš–οΈ **Ambiguities** - Vague or unclear legal provisions
🎯 **Circumvention Strategies** - Ways legislation might be bypassed
**Key Features:**
- **Smart Caching**: Avoid re-processing identical content during your session
- **Memory Optimized**: Designed for Spaces memory constraints
- **Real-time Progress**: Live processing status and performance metrics
- **Export Options**: Download results in multiple formats
""")
        st.markdown("### Quick Start")
        st.markdown("""
1. **Upload** your NZ legislation files (JSON lines or raw text)
2. **Configure** analysis parameters (use smaller models for Spaces)
3. **Process** the legislation with AI-powered analysis
4. **Review** results with interactive visualizations
5. **Export** findings before your session ends
""")

    with right:
        st.markdown("### Current Configuration")
        cfg = st.session_state.config_manager.get_config()

        st.subheader("πŸ€– Model Settings")
        for label, value in (
            ("Model", cfg['model']['path']),
            ("Context Length", cfg['model']['context_length']),
            ("Max Tokens", cfg['model']['max_tokens']),
        ):
            st.info(f"**{label}:** {value}")

        st.subheader("βš™οΈ Processing")
        for label, value in (
            ("Chunk Size", cfg['processing']['chunk_size']),
            ("Overlap", cfg['processing']['chunk_overlap']),
            ("Batch Size", cfg['processing']['batch_size']),
        ):
            st.info(f"**{label}:** {value}")

        st.subheader("🧠 Cache")
        cache_info = st.session_state.cache_manager.get_stats()
        st.info(f"**Status:** {'Active' if cache_info['enabled'] else 'Disabled'}")
        st.info(f"**Max Memory:** {SPACES_CONFIG['max_memory_mb']}MB")
        st.info(f"**Hit Rate:** {cache_info['hit_rate']:.1f}%")

        # Flag memory pressure once usage crosses 80% of the soft limit.
        mem_mb = st.session_state.performance_monitor.get_stats()['memory_usage_mb']
        if mem_mb > SPACES_CONFIG['max_memory_mb'] * 0.8:
            st.warning(f"⚠️ High Memory Usage: {mem_mb:.1f}MB")
        else:
            st.success(f"βœ… Memory Usage: {mem_mb:.1f}MB")

    if st.button("πŸš€ Start Analysis", type="primary", use_container_width=True):
        st.switch_page("pages/1_upload.py")
def show_spaces_optimized_upload_page():
    """Render the upload/processing page.

    Left column: file selection with size warnings and a content preview.
    Right column: model, text-processing, and analysis configuration widgets.
    Bottom row: start / stop / view-results controls.
    """
    st.title("πŸ“€ Upload & Process Legislation")

    with st.expander("πŸ’‘ Spaces Optimization Tips", expanded=False):
        st.info("""
**For Best Performance on Spaces:**
- Use smaller models (0.8B-1.5B parameters)
- Process files individually for large documents
- Keep chunk sizes under 2048 characters
- Monitor memory usage in the sidebar
""")

    st.subheader("πŸ“ Upload Legislation Files")
    col1, col2 = st.columns([1, 1])

    with col1:
        uploaded_files = st.file_uploader(
            "Select NZ legislation files",
            accept_multiple_files=True,
            type=['json', 'txt', 'jsonl'],
            help="Upload JSON lines format (.jsonl), JSON arrays (.json), or raw text (.txt) files",
            key="spaces_file_uploader"
        )
        if uploaded_files:
            st.success(f"πŸ“„ {len(uploaded_files)} file(s) selected")
            for file in uploaded_files:
                with st.expander(f"πŸ“‹ {file.name}"):
                    size_mb = file.size / (1024 * 1024)
                    # FIX: the original passed the bare format spec ".1f" to
                    # st.warning/st.info instead of a formatted message.
                    if size_mb > 10:  # warn on large files
                        st.warning(f"**Size:** {size_mb:.1f}MB (large file - processing may be slow)")
                    else:
                        st.info(f"**Size:** {size_mb:.1f}MB")
                    st.write(f"**Type:** {file.type}")
                    # Preview the first few hundred characters of text-like files.
                    if file.type in ['text/plain', 'application/json']:
                        # errors='replace' keeps the preview from crashing on
                        # non-UTF-8 bytes in an uploaded file.
                        content = file.read().decode('utf-8', errors='replace')
                        preview_length = min(300, len(content))
                        st.text_area(
                            "Preview",
                            content[:preview_length] + "..." if len(content) > preview_length else content,
                            height=100,
                            disabled=True,
                        )
                        file.seek(0)  # reset so later processing reads from the start

    with col2:
        st.subheader("βš™οΈ Processing Configuration")
        config = st.session_state.config_manager.get_config()

        with st.expander("πŸ€– Model Configuration", expanded=True):
            st.info("πŸ’‘ Use smaller models (0.8B-1.5B) for best Spaces performance")
            model_path = st.text_input(
                "Model Path",
                value=config['model']['path'],
                help="Path to your GGUF model file (use small models for Spaces)"
            )
            context_length = st.slider(
                "Context Length",
                min_value=1024,
                max_value=8192,  # reduced max for Spaces
                value=min(config['model']['context_length'], 4096),
                step=512,
                help="Maximum context length for the model"
            )
            max_tokens = st.slider(
                "Max Response Tokens",
                min_value=256,
                max_value=4096,
                value=min(config['model']['max_tokens'], 2048),
                step=128,
                help="Maximum tokens in model response"
            )

        with st.expander("πŸ“ Text Processing", expanded=True):
            chunk_size = st.slider(
                "Chunk Size",
                min_value=512,
                max_value=4096,  # reduced for Spaces memory
                value=min(config['processing']['chunk_size'], 2048),
                step=256,
                help="Size of text chunks for processing"
            )
            chunk_overlap = st.slider(
                "Chunk Overlap",
                min_value=32,
                max_value=512,
                value=config['processing']['chunk_overlap'],
                step=32,
                help="Overlap between chunks for context preservation"
            )

        with st.expander("πŸ” Analysis Settings", expanded=True):
            analysis_depth = st.select_slider(
                "Analysis Depth",
                options=["Basic", "Standard", "Detailed"],  # no "Comprehensive": memory
                value=config['analysis']['depth'],
                help="Level of detail in legal analysis (use Standard for Spaces)"
            )
            include_recommendations = st.checkbox(
                "Include Recommendations",
                value=config['analysis']['include_recommendations'],
                help="Generate specific recommendations for addressing identified issues"
            )

    # Start / stop / results controls.
    col1, col2, col3 = st.columns([1, 1, 1])
    with col1:
        if st.button("πŸ”„ Start Processing", type="primary", use_container_width=True):
            if not uploaded_files:
                st.error("Please upload at least one legislation file")
            else:
                start_spaces_processing(uploaded_files, {
                    'model': {
                        'path': model_path,
                        'context_length': context_length,
                        'max_tokens': max_tokens
                    },
                    'processing': {
                        'chunk_size': chunk_size,
                        'chunk_overlap': chunk_overlap
                    },
                    'analysis': {
                        'depth': analysis_depth,
                        'include_recommendations': include_recommendations
                    }
                })
    with col2:
        if st.button("⏹️ Stop Processing", use_container_width=True):
            stop_processing()
    with col3:
        if st.button("πŸ“Š View Results", use_container_width=True):
            st.switch_page("pages/2_analysis.py")
def start_spaces_processing(files, config):
    """Start processing optimized for Spaces.

    files: uploaded file objects from st.file_uploader.
    config: partial config dict (model/processing/analysis keys) merged into
        the session ConfigManager before processing begins.
    """
    # Refuse to start when memory is already near the Spaces soft limit.
    perf_stats = st.session_state.performance_monitor.get_stats()
    if perf_stats['memory_usage_mb'] > SPACES_CONFIG['max_memory_mb'] * 0.9:
        st.warning("⚠️ High memory usage detected. Consider clearing cache first.")
        # NOTE(review): a button created here only exists for this script run;
        # after the click-triggered rerun this branch may not re-render, so
        # verify the clear-cache flow actually fires as intended.
        if st.button("Clear Cache and Continue"):
            st.session_state.cache_manager.clear_cache()
            st.rerun()
        return
    # Mark the run as started; the processing loop reads this status dict.
    st.session_state.processing_status = {
        'is_running': True,
        'progress': 0,
        'current_task': 'Initializing...',
        'total_chunks': 0,
        'processed_chunks': 0
    }
    # Persist the page's widget values into the session configuration.
    st.session_state.config_manager.update_config(config)
    # Inform the user, then rerun so the UI picks up the new status.
    st.info("πŸ’‘ Processing on Spaces - this may take longer than local execution")
    st.rerun()
def stop_processing():
    """Flag the active processing run as cancelled by the user."""
    status = st.session_state.processing_status
    status['is_running'] = False
    status['current_task'] = 'Stopped by user'
def show_spaces_optimized_results_page():
    """Render session-scoped analysis results: summary metrics, per-analysis
    detail expanders, and export buttons."""
    st.title("πŸ“Š Analysis Results")

    with st.expander("πŸ’Ύ Session-Based Storage", expanded=False):
        st.warning("""
**Important:** Results are stored in your session only.
- Download results before closing your browser
- Cache resets between visits
- Consider using smaller models for faster processing
""")

    if not st.session_state.analysis_results:
        st.info("No analysis results available. Please upload and process legislation files first.")
        return

    st.subheader("πŸ“ˆ Results Overview")
    col1, col2, col3, col4 = st.columns(4)
    results = st.session_state.analysis_results
    total_results = len(results)
    total_loopholes = sum(len(r.get('loopholes', [])) for r in results)
    # max(..., 1) guards division by zero (unreachable here after the early return).
    avg_confidence = sum(r.get('confidence', 0) for r in results) / max(total_results, 1)

    with col1:
        st.metric("Total Analyses", total_results)
    with col2:
        st.metric("Loopholes Found", total_loopholes)
    with col3:
        # FIX: original passed the bare format spec ".2f" instead of the value.
        st.metric("Avg Confidence", f"{avg_confidence:.2f}")
    with col4:
        cache_stats = st.session_state.cache_manager.get_stats()
        st.metric("Cache Hit Rate", f"{cache_stats['hit_rate']:.1f}%")

    st.subheader("πŸ” Detailed Results")
    for i, result in enumerate(results):
        with st.expander(f"πŸ“‹ Analysis {i+1}: {result.get('title', 'Unknown Title')}", expanded=i == 0):
            left, right = st.columns([2, 1])
            with left:
                st.markdown("**Summary:**")
                st.write(result.get('summary', 'No summary available'))
                st.markdown("**Key Findings:**")
                for finding in result.get('loopholes', []):
                    st.markdown(f"- {finding}")
            with right:
                # FIX: same bare-format-spec bug as above.
                st.metric("Confidence", f"{result.get('confidence', 0):.2f}")
                # NOTE(review): assumes results carry a 'processing_time' key
                # in seconds - confirm against the analyzer output schema.
                st.metric("Processing Time", f"{result.get('processing_time', 0):.2f}s")
                st.metric("Chunks Processed", result.get('chunks_processed', 0))

    st.subheader("πŸ’Ύ Export Results")
    col1, col2, col3 = st.columns(3)
    with col1:
        if st.button("πŸ“„ Export as JSON", use_container_width=True):
            export_results('json')
    with col2:
        if st.button("πŸ“Š Export as CSV", use_container_width=True):
            export_results('csv')
    with col3:
        if st.button("πŸ“‹ Export as Excel", use_container_width=True):
            export_results('excel')
def export_results(format_type):
    """Serialize the session's analysis results and offer them as a download.

    format_type: 'json' or 'csv' produce a st.download_button with the
    serialized results; any other value (e.g. 'excel') falls back to an
    informational message, since Excel export needs an extra dependency.

    Replaces the previous stub that only printed a fake success message.
    """
    import csv
    import io
    import json

    results = st.session_state.get('analysis_results', [])
    if not results:
        st.warning("No analysis results to export.")
        return

    if format_type == 'json':
        # default=str keeps non-JSON-native values (e.g. datetimes) exportable.
        payload = json.dumps(results, indent=2, ensure_ascii=False, default=str)
        st.download_button(
            "⬇️ Download JSON",
            data=payload,
            file_name="analysis_results.json",
            mime="application/json",
        )
    elif format_type == 'csv':
        buffer = io.StringIO()
        writer = csv.writer(buffer)
        writer.writerow(["title", "summary", "confidence", "loopholes", "chunks_processed"])
        for result in results:
            writer.writerow([
                result.get('title', ''),
                result.get('summary', ''),
                result.get('confidence', 0),
                "; ".join(str(item) for item in result.get('loopholes', [])),
                result.get('chunks_processed', 0),
            ])
        st.download_button(
            "⬇️ Download CSV",
            data=buffer.getvalue(),
            file_name="analysis_results.csv",
            mime="text/csv",
        )
    else:
        st.info(f"{format_type.upper()} export is not yet supported - use JSON or CSV.")
def show_spaces_optimized_settings_page():
    """Render the settings page: model, processing, cache, and performance
    tabs, plus save/reset controls that persist widget values."""
    st.title("βš™οΈ Settings & Configuration")

    with st.expander("🌐 Spaces Environment", expanded=False):
        st.info("""
**Spaces-Specific Settings:**
- Memory limit: 512MB cache (conservative)
- Session-based storage only
- No persistent data between visits
- Optimized for cloud performance
""")

    tabs = st.tabs(["πŸ€– Model Settings", "πŸ“ Processing", "🧠 Cache", "πŸ“Š Performance"])
    # Fetch once; every tab reads from the same snapshot.
    config = st.session_state.config_manager.get_config()

    with tabs[0]:
        st.subheader("πŸ€– Model Configuration")
        st.info("πŸ’‘ For Spaces: Use smaller models (0.8B-1.5B parameters) for best performance")
        model_path = st.text_input(
            "Model Path",
            value=config['model']['path'],
            help="Path to your GGUF model file (smaller models recommended)"
        )
        context_length = st.slider(
            "Context Length",
            min_value=1024,
            max_value=8192,
            value=config['model']['context_length'],
            step=512,
            help="Maximum context length (smaller = faster processing)"
        )
        max_tokens = st.slider(
            "Max Response Tokens",
            min_value=256,
            max_value=4096,
            value=config['model']['max_tokens'],
            step=128,
            help="Maximum tokens in response (smaller = faster)"
        )
        temperature = st.slider(
            "Temperature",
            min_value=0.0,
            max_value=1.0,
            value=config['model']['temperature'],
            step=0.1,
            help="Controls randomness (lower = more consistent)"
        )

    with tabs[1]:
        st.subheader("πŸ“ Text Processing")
        chunk_size = st.slider(
            "Chunk Size",
            min_value=512,
            max_value=4096,
            value=config['processing']['chunk_size'],
            step=256,
            help="Text chunk size (smaller = more memory efficient)"
        )
        chunk_overlap = st.slider(
            "Chunk Overlap",
            min_value=32,
            max_value=512,
            value=config['processing']['chunk_overlap'],
            step=32,
            help="Overlap between chunks for context"
        )
        batch_size = st.slider(
            "Batch Size",
            min_value=1,
            max_value=8,  # reduced for Spaces
            value=config['processing']['batch_size'],
            step=1,
            help="Number of chunks to process at once (lower = less memory)"
        )

    with tabs[2]:
        st.subheader("🧠 Cache Configuration")
        enable_cache = st.checkbox(
            "Enable Caching",
            value=config['cache']['enabled'],
            help="Use cache to avoid re-processing (recommended)"
        )
        st.info(f"πŸ’‘ Max cache size: {SPACES_CONFIG['max_memory_mb']}MB (fixed for Spaces)")
        cache_ttl = st.slider(
            "Cache TTL (hours)",
            min_value=0.5,
            max_value=2.0,
            value=config['cache']['ttl_hours'],
            step=0.5,
            help="How long to keep cached results (shorter = less memory)"
        )

    with tabs[3]:
        st.subheader("πŸ“Š Performance Monitoring")
        perf_stats = st.session_state.performance_monitor.get_stats()
        cache_stats = st.session_state.cache_manager.get_stats()
        col1, col2, col3 = st.columns(3)
        with col1:
            # FIX: original passed the bare format spec ".1f" (with "MB" as a
            # bogus delta argument) instead of a formatted value.
            st.metric("Memory Usage", f"{perf_stats['memory_usage_mb']:.1f} MB")
        with col2:
            st.metric("Cache Hit Rate", f"{cache_stats['hit_rate']:.1f}%")
        with col3:
            st.metric("Active Threads", perf_stats.get('active_threads', 0))

        recommendations = st.session_state.performance_monitor.get_recommendations()
        if recommendations:
            st.subheader("πŸ’‘ Recommendations")
            for rec in recommendations:
                # Severity is inferred from keywords in the recommendation text.
                if "High" in rec or "Low" in rec:
                    st.warning(rec)
                else:
                    st.info(rec)

    # Save / reset controls (outside the tabs so they always render).
    col1, col2 = st.columns([1, 1])
    with col1:
        if st.button("πŸ’Ύ Save Settings", type="primary", use_container_width=True):
            st.session_state.config_manager.update_config({
                'model': {
                    'path': model_path,
                    'context_length': context_length,
                    'max_tokens': max_tokens,
                    'temperature': temperature
                },
                'processing': {
                    'chunk_size': chunk_size,
                    'chunk_overlap': chunk_overlap,
                    'batch_size': batch_size
                },
                'cache': {
                    'enabled': enable_cache,
                    'ttl_hours': cache_ttl
                }
            })
            st.success("Settings saved successfully!")
    with col2:
        if st.button("πŸ”„ Reset to Defaults", use_container_width=True):
            st.session_state.config_manager.reset_to_defaults()
            st.success("Settings reset to defaults!")
            st.rerun()
def show_spaces_optimized_performance_page():
    """Render the performance dashboard: real-time metrics, memory pressure
    banner, cache statistics, recommendations, and cache management."""
    st.title("πŸ“ˆ Performance Dashboard")

    with st.expander("🌐 Spaces Performance Notes", expanded=False):
        st.info("""
**Spaces Environment:**
- Memory limit: ~2-8GB shared
- Cache: Session-based only
- Performance: Optimized for cloud
- Monitoring: Real-time metrics
""")

    st.subheader("πŸ“Š Real-time Metrics")
    col1, col2, col3, col4 = st.columns(4)
    perf_stats = st.session_state.performance_monitor.get_stats()
    cache_stats = st.session_state.cache_manager.get_stats()
    with col1:
        # FIX: the original passed the bare format spec ".1f" to st.metric
        # instead of a formatted value (same for the metrics below).
        st.metric("Memory Usage", f"{perf_stats['memory_usage_mb']:.1f} MB")
    with col2:
        st.metric("Memory %", f"{perf_stats.get('memory_percent', 0):.1f}%")
    with col3:
        # NOTE(review): assumes the monitor exposes 'cpu_percent' - confirm
        # the key name against PerformanceMonitor.get_stats().
        st.metric("CPU Usage", f"{perf_stats.get('cpu_percent', 0):.1f}%")
    with col4:
        st.metric("Cache Hit Rate", f"{cache_stats['hit_rate']:.1f}%")

    # Memory pressure banner for the constrained Spaces environment.
    memory_percent = perf_stats.get('memory_percent', 0)
    if memory_percent > 80:
        st.error("⚠️ High memory usage - consider clearing cache")
    elif memory_percent > 60:
        st.warning("⚠️ Moderate memory usage")
    else:
        st.success("βœ… Memory usage within limits")

    st.subheader("🧠 Cache Performance")
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total Requests", cache_stats['hits'] + cache_stats['misses'])
    with col2:
        st.metric("Cache Hits", cache_stats['hits'])
    with col3:
        st.metric("Cache Misses", cache_stats['misses'])
    with col4:
        st.metric("Hit Rate", f"{cache_stats['hit_rate']:.1f}%")

    st.subheader("πŸ’‘ Performance Recommendations")
    recommendations = st.session_state.performance_monitor.get_recommendations()
    if recommendations:
        for rec in recommendations:
            # Severity is inferred from keywords in the recommendation text.
            if "High" in rec or "Low" in rec:
                st.error(rec)
            elif "Moderate" in rec or "Consider" in rec:
                st.warning(rec)
            else:
                st.info(rec)
    else:
        st.success("βœ… Performance is optimal!")

    st.subheader("🧠 Cache Management")
    col1, col2 = st.columns(2)
    with col1:
        if st.button("πŸ”„ Clear Cache", type="secondary", use_container_width=True):
            st.session_state.cache_manager.clear_cache()
            st.success("Cache cleared successfully!")
            st.rerun()
    with col2:
        if st.button("πŸ“Š Reset Statistics", use_container_width=True):
            st.session_state.performance_monitor.reset_stats()
            st.success("Statistics reset!")
            st.rerun()
def main():
    """Application entry point: initialize session state, render the sidebar
    (navigation, cache/performance status), dispatch to the selected page,
    and emit the footer."""
    initialize_spaces_session()

    with st.sidebar:
        st.title("βš–οΈ NZ Legislation Analyzer")
        st.markdown("---")
        st.markdown("**Spaces Edition**")
        st.markdown("---")

        # Display label -> internal page key.
        pages = {
            "🏠 Home": "home",
            "πŸ“€ Upload & Process": "upload",
            "πŸ“Š Analysis Results": "results",
            "βš™οΈ Settings": "settings",
            "πŸ“ˆ Performance": "performance"
        }
        selected_page = st.selectbox(
            "Navigate to:",
            list(pages.keys()),
            key="nav_select"
        )
        st.markdown("---")

        with st.expander("🧠 Cache Status", expanded=True):
            cache_stats = st.session_state.cache_manager.get_stats()
            st.metric("Cache Hits", cache_stats['hits'])
            st.metric("Cache Misses", cache_stats['misses'])
            # FIX: original passed the bare format spec ".1f" instead of the value.
            st.metric("Hit Rate", f"{cache_stats['hit_rate']:.1f}%")
            st.metric("Cached Chunks", cache_stats['entries'])
            if st.button("Clear Cache", type="secondary"):
                st.session_state.cache_manager.clear_cache()
                st.rerun()

        with st.expander("πŸ“Š Performance", expanded=True):
            perf_stats = st.session_state.performance_monitor.get_stats()
            # FIX: same bare-format-spec bug as above.
            st.metric("Memory Usage", f"{perf_stats['memory_usage_mb']:.1f} MB")
            # NOTE(review): assumes the monitor exposes 'cpu_percent' - confirm key name.
            st.metric("CPU Usage", f"{perf_stats.get('cpu_percent', 0):.1f}%")

        if st.session_state.processing_status['is_running']:
            with st.expander("πŸ”„ Processing Status", expanded=True):
                status = st.session_state.processing_status
                st.progress(status['progress'])
                st.text(status['current_task'])
                st.text(f"Chunk {status['processed_chunks']}/{status['total_chunks']}")

    # Dispatch the main content area to the selected page renderer.
    page_renderers = {
        "home": show_spaces_optimized_home_page,
        "upload": show_spaces_optimized_upload_page,
        "results": show_spaces_optimized_results_page,
        "settings": show_spaces_optimized_settings_page,
        "performance": show_spaces_optimized_performance_page,
    }
    page_renderers[pages[selected_page]]()

    # Footer with Spaces branding.
    st.markdown("---")
    st.markdown(
        """
<div style='text-align: center; color: #666; font-size: 12px;'>
NZ Legislation Loophole Analyzer v1.0.0 (Spaces Edition) |
Built with Streamlit & Llama.cpp | Hosted on πŸ€— Hugging Face Spaces
</div>
""",
        unsafe_allow_html=True
    )
# Standard script entry point (also works under `streamlit run app.py`).
if __name__ == "__main__":
    main()