Spaces:
Building
Building
added memory file handler
Browse files- MEMORY_HANDLER_MIGRATION.md +146 -0
- apply_memory_handler_fix.py +87 -0
- diagnose_upload_error.py +162 -0
- test_fix_403.py +90 -0
- test_memory_upload.py +90 -0
- update_to_memory_handler.py +113 -0
- web_app/components/comparison_functions.py +5 -22
- web_app/components/ui_components.py +5 -22
- web_app/handlers/analysis_handlers.py +544 -367
- web_app/handlers/analysis_handlers.py.backup_20250726_162020 +429 -0
- web_app/handlers/analysis_handlers_updated.py +606 -0
- web_app/handlers/frequency_handlers.py +429 -468
- web_app/handlers/frequency_handlers.py.backup_20250726_162020 +733 -0
- web_app/handlers/frequency_handlers_memory.py +317 -0
- web_app/handlers/frequency_handlers_updated.py +694 -0
- web_app/utils/__init__.py +2 -1
- web_app/utils/memory_file_handler.py +170 -0
MEMORY_HANDLER_MIGRATION.md
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Memory Handler Migration Guide
|
| 2 |
+
|
| 3 |
+
## Why Memory-Based File Handling?
|
| 4 |
+
|
| 5 |
+
The original `FileUploadHandler` saves files to the `/tmp` directory, which can cause 403 Forbidden errors in restricted environments such as:
|
| 6 |
+
- Hugging Face Spaces
|
| 7 |
+
- Some cloud platforms with read-only filesystems
|
| 8 |
+
- Containers with security restrictions
|
| 9 |
+
|
| 10 |
+
The `MemoryFileHandler` processes files entirely in memory, avoiding filesystem access.
|
| 11 |
+
|
| 12 |
+
## Caveats and Limitations
|
| 13 |
+
|
| 14 |
+
### 1. **Memory Usage**
|
| 15 |
+
- **Issue**: All file content is loaded into RAM
|
| 16 |
+
- **Impact**: Large files (near the 300MB limit) could cause memory issues
|
| 17 |
+
- **Mitigation**: The 300MB file size limit helps prevent OOM errors
|
| 18 |
+
|
| 19 |
+
### 2. **ZIP File Handling**
|
| 20 |
+
- **Issue**: ZIP files need special handling as they require file-like objects
|
| 21 |
+
- **Current approach**: Load entire ZIP into memory using BytesIO
|
| 22 |
+
- **Limitation**: Extracting large ZIP files could spike memory usage
|
| 23 |
+
|
| 24 |
+
### 3. **Session State Persistence**
|
| 25 |
+
- **Issue**: Streamlit reruns the script on every interaction, so file content held only in local variables is lost
|
| 26 |
+
- **Solution**: Store processed content in `st.session_state`
|
| 27 |
+
- **Limitation**: Session state also uses memory
|
| 28 |
+
|
| 29 |
+
### 4. **Multiple File Processing**
|
| 30 |
+
- **Issue**: Batch processing multiple files multiplies memory usage
|
| 31 |
+
- **Example**: 10 files × 30MB each = 300MB in memory
|
| 32 |
+
- **Mitigation**: Process files sequentially, not in parallel
|
| 33 |
+
|
| 34 |
+
### 5. **Binary vs Text Files**
|
| 35 |
+
- **Issue**: Binary files (images, etc.) need different handling
|
| 36 |
+
- **Solution**: `as_text` parameter in `process_uploaded_file()`
|
| 37 |
+
|
| 38 |
+
## Implementation Status
|
| 39 |
+
|
| 40 |
+
### ✅ Completed:
|
| 41 |
+
- `ui_components.py` - Text input file uploads
|
| 42 |
+
- `comparison_functions.py` - Comparison file uploads
|
| 43 |
+
- `frequency_handlers.py` - Created `frequency_handlers_updated.py`
|
| 44 |
+
- `utils/__init__.py` - Exports both handlers
|
| 45 |
+
|
| 46 |
+
### ⚠️ Need Updates:
|
| 47 |
+
- `analysis_handlers.py` - Complex due to ZIP file handling
|
| 48 |
+
- `pos_handlers.py` - Batch file processing
|
| 49 |
+
- `reference_manager.py` - Custom reference uploads
|
| 50 |
+
- `config_manager.py` - YAML config uploads
|
| 51 |
+
|
| 52 |
+
## Migration Examples
|
| 53 |
+
|
| 54 |
+
### Simple File Upload
|
| 55 |
+
```python
|
| 56 |
+
# OLD - FileUploadHandler
|
| 57 |
+
temp_path = FileUploadHandler.save_to_temp(uploaded_file, prefix="text")
|
| 58 |
+
if temp_path:
|
| 59 |
+
content = FileUploadHandler.read_from_temp(temp_path)
|
| 60 |
+
# ... process content
|
| 61 |
+
FileUploadHandler.cleanup_temp_file(temp_path)
|
| 62 |
+
|
| 63 |
+
# NEW - MemoryFileHandler
|
| 64 |
+
content = MemoryFileHandler.process_uploaded_file(uploaded_file, as_text=True)
|
| 65 |
+
if content:
|
| 66 |
+
# ... process content
|
| 67 |
+
# No cleanup needed!
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
### ZIP File Handling
|
| 71 |
+
```python
|
| 72 |
+
# OLD - FileUploadHandler
|
| 73 |
+
zip_file = FileUploadHandler.handle_zip_file(uploaded_file)
|
| 74 |
+
with zip_file as zip_ref:
|
| 75 |
+
for file_info in zip_ref.infolist():
|
| 76 |
+
content = zip_ref.read(file_info.filename)
|
| 77 |
+
|
| 78 |
+
# NEW - MemoryFileHandler
|
| 79 |
+
file_contents = MemoryFileHandler.handle_zip_file(uploaded_file)
|
| 80 |
+
if file_contents:
|
| 81 |
+
for filename, content in file_contents.items():
|
| 82 |
+
# Process each file
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
### DataFrame Processing
|
| 86 |
+
```python
|
| 87 |
+
# OLD - Manual CSV parsing
|
| 88 |
+
content = FileUploadHandler.read_from_temp(temp_path)
|
| 89 |
+
df = pd.read_csv(StringIO(content.decode('utf-8')))
|
| 90 |
+
|
| 91 |
+
# NEW - Direct DataFrame creation
|
| 92 |
+
df = MemoryFileHandler.process_csv_tsv_file(uploaded_file)
|
| 93 |
+
```
|
| 94 |
+
|
| 95 |
+
## When to Use Which Handler
|
| 96 |
+
|
| 97 |
+
### Use MemoryFileHandler when:
|
| 98 |
+
- Deploying to restricted environments (Hugging Face Spaces)
|
| 99 |
+
- Files are reasonably sized (<100MB preferred)
|
| 100 |
+
- You need maximum compatibility
|
| 101 |
+
|
| 102 |
+
### Consider FileUploadHandler when:
|
| 103 |
+
- Processing very large files (>200MB)
|
| 104 |
+
- Running locally with full filesystem access
|
| 105 |
+
- You need to preserve files across sessions
|
| 106 |
+
|
| 107 |
+
## Complete Migration Steps
|
| 108 |
+
|
| 109 |
+
1. **Update imports**:
|
| 110 |
+
```python
|
| 111 |
+
from web_app.utils import MemoryFileHandler
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
2. **Replace file operations**:
|
| 115 |
+
- Remove `save_to_temp()` calls
|
| 116 |
+
- Remove `cleanup_temp_file()` calls
|
| 117 |
+
- Use `process_uploaded_file()` directly
|
| 118 |
+
|
| 119 |
+
3. **Update error handling**:
|
| 120 |
+
- Remove 403-specific error messages
|
| 121 |
+
- Add memory-related error handling
|
| 122 |
+
|
| 123 |
+
4. **Test thoroughly**:
|
| 124 |
+
- Test with small files first
|
| 125 |
+
- Test with maximum size files
|
| 126 |
+
- Test with multiple files
|
| 127 |
+
|
| 128 |
+
## Performance Considerations
|
| 129 |
+
|
| 130 |
+
### Memory Usage Formula:
|
| 131 |
+
```
|
| 132 |
+
Total Memory = File Size + Processing Overhead + Session State Storage
|
| 133 |
+
```
|
| 134 |
+
|
| 135 |
+
### Example for a 50MB file:
|
| 136 |
+
- File content: 50MB
|
| 137 |
+
- String conversion: ~50MB (if text)
|
| 138 |
+
- DataFrame creation: ~100-200MB (depends on data)
|
| 139 |
+
- Total: ~200-300MB peak usage
|
| 140 |
+
|
| 141 |
+
## Recommendations
|
| 142 |
+
|
| 143 |
+
1. **For Hugging Face Spaces**: Use MemoryFileHandler exclusively
|
| 144 |
+
2. **For local deployment**: Either handler works, choose based on file sizes
|
| 145 |
+
3. **For production**: Consider implementing both with automatic fallback
|
| 146 |
+
4. **Monitor memory**: Add memory usage tracking for large deployments
|
apply_memory_handler_fix.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Apply the memory handler fix to all components
|
| 4 |
+
This script backs up original files and replaces them with memory-based versions
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import shutil
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
|
| 11 |
+
def backup_and_replace(original_file, new_file):
    """Back up ``original_file`` and overwrite it with ``new_file``.

    A timestamped copy named ``<original_file>.backup_YYYYmmdd_HHMMSS`` is
    written next to the original before the original is replaced.

    Args:
        original_file: Path of the file to be replaced.
        new_file: Path of the file whose contents replace the original.

    Returns:
        True when both the backup and the replacement succeeded. False when
        either path does not exist or when a copy fails; a message describing
        the problem is printed in each case.
    """
    if not os.path.exists(original_file):
        print(f"❌ Original file not found: {original_file}")
        return False

    if not os.path.exists(new_file):
        print(f"❌ New file not found: {new_file}")
        return False

    # Create backup. A failed copy is reported through the return value so the
    # caller's loop over several files keeps going instead of crashing.
    backup_file = f"{original_file}.backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    try:
        shutil.copy2(original_file, backup_file)
    except OSError as exc:
        print(f"❌ Could not back up {original_file}: {exc}")
        return False
    print(f"📋 Backed up: {original_file} → {backup_file}")

    # Replace with new version (only reached once the backup exists).
    try:
        shutil.copy2(new_file, original_file)
    except OSError as exc:
        print(f"❌ Could not replace {original_file}: {exc}")
        return False
    print(f"✅ Replaced: {original_file}")

    return True
|
| 31 |
+
|
| 32 |
+
def main():
    """Interactively apply the memory-handler replacements.

    Prints a banner, asks for confirmation, then calls
    ``backup_and_replace`` for each (original, updated) pair and reports how
    many succeeded. Finishes by listing the files that still need manual
    migration together with the edits to make in them.
    """
    print("🔄 Applying Memory Handler Fix")
    print("=" * 60)
    print("This will backup original files and replace with memory-based versions")
    print("=" * 60)

    # Files to update: (file to replace, memory-based replacement)
    updates = [
        ("web_app/handlers/frequency_handlers.py", "web_app/handlers/frequency_handlers_updated.py"),
        ("web_app/handlers/analysis_handlers.py", "web_app/handlers/analysis_handlers_updated.py"),
    ]

    # Confirm with user. Surrounding whitespace is ignored, and a closed or
    # non-interactive stdin (EOFError) is treated as "no" rather than crashing.
    try:
        response = input("\nProceed with updates? (y/n): ").strip().lower()
    except EOFError:
        response = ""
    if response != 'y':
        print("❌ Cancelled")
        return

    print("\n🚀 Starting updates...")

    # backup_and_replace returns a bool, so summing counts the successes.
    success_count = sum(
        bool(backup_and_replace(original, updated)) for original, updated in updates
    )

    print("\n" + "=" * 60)
    print(f"✅ Successfully updated {success_count}/{len(updates)} files")

    # Additional components that need manual updates
    print("\n⚠️ The following files still need manual updates:")
    manual_updates = [
        "web_app/handlers/pos_handlers.py",
        "web_app/reference_manager.py",
        "web_app/config_manager.py",
        "web_app/debug_utils.py",
    ]

    for file in manual_updates:
        print(f" - {file}")

    print("\n💡 To complete the migration:")
    print("1. Update the remaining files manually")
    print("2. Test the application thoroughly")
    print("3. Remove the *_updated.py files after verification")

    print("\n📝 Key changes to make in remaining files:")
    print("- Replace: from web_app.utils import FileUploadHandler")
    print(" With: from web_app.utils import MemoryFileHandler")
    print("- Replace: FileUploadHandler.save_to_temp() + read_from_temp()")
    print(" With: MemoryFileHandler.process_uploaded_file()")
    print("- Remove: All cleanup_temp_file() calls")


if __name__ == "__main__":
    main()
|
diagnose_upload_error.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Diagnostic script for file upload 403 errors
|
| 3 |
+
"""
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import os
|
| 6 |
+
import tempfile
|
| 7 |
+
import traceback
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
import sys  # interpreter version is read from sys directly, not via os.sys

# Streamlit diagnostic page: reports the environment, then exercises several
# ways of reading an uploaded file and of writing it to candidate temp
# directories, to locate where an upload fails.
st.set_page_config(page_title="Upload Diagnostic", layout="wide")

st.title("File Upload Diagnostic Tool")
st.write("This tool helps diagnose file upload issues and 403 errors")

# Check environment
st.subheader("1. Environment Check")
col1, col2 = st.columns(2)

with col1:
    st.write("**System Info:**")
    st.write(f"- Platform: {os.name}")
    st.write(f"- Python: {sys.version.split()[0]}")
    st.write(f"- Streamlit: {st.__version__}")
    st.write(f"- Working Dir: {os.getcwd()}")

with col2:
    st.write("**Temp Directory:**")
    st.write(f"- Temp Dir: {tempfile.gettempdir()}")
    st.write(f"- Writable: {os.access(tempfile.gettempdir(), os.W_OK)}")

    # Check disk space. os.statvfs does not exist on every platform
    # (AttributeError) and can fail on the path itself (OSError).
    try:
        stat = os.statvfs(tempfile.gettempdir())
        free_mb = (stat.f_frsize * stat.f_bavail) / (1024 * 1024)
        st.write(f"- Free Space: {free_mb:.1f} MB")
    except (AttributeError, OSError):
        st.write("- Free Space: Unknown")

# File upload test
st.subheader("2. File Upload Test")

uploaded_file = st.file_uploader(
    "Upload a test file",
    type=['txt', 'csv', 'tsv'],
    help="Upload any file to test"
)

if uploaded_file:
    st.write("**File received by Streamlit:**")
    st.write(f"- Name: {uploaded_file.name}")
    st.write(f"- Type: {uploaded_file.type}")
    st.write(f"- Size: {uploaded_file.size} bytes")

    # Test different read methods
    st.subheader("3. Testing Read Methods")

    # Method 1: Direct read
    with st.expander("Method 1: Direct read()"):
        try:
            uploaded_file.seek(0)
            content = uploaded_file.read()
            st.success(f"✅ Success! Read {len(content)} bytes")
            st.code(content[:200].decode('utf-8', errors='ignore') + "...")
        except Exception as e:
            st.error(f"❌ Failed: {type(e).__name__}: {str(e)}")
            st.code(traceback.format_exc())

    # Method 2: getvalue()
    with st.expander("Method 2: getvalue()"):
        try:
            uploaded_file.seek(0)
            content = uploaded_file.getvalue()
            st.success(f"✅ Success! Read {len(content)} bytes")
            st.code(content[:200].decode('utf-8', errors='ignore') + "...")
        except Exception as e:
            st.error(f"❌ Failed: {type(e).__name__}: {str(e)}")
            st.code(traceback.format_exc())

    # Method 3: getbuffer()
    with st.expander("Method 3: getbuffer()"):
        try:
            uploaded_file.seek(0)
            content = uploaded_file.getbuffer()
            st.success(f"✅ Success! Buffer size: {len(content)} bytes")
            st.code(str(content[:200]))
        except Exception as e:
            st.error(f"❌ Failed: {type(e).__name__}: {str(e)}")
            st.code(traceback.format_exc())

    # Method 4: Save to temp
    with st.expander("Method 4: Save to temp file"):
        try:
            # Try different temp locations
            temp_locations = [
                ("/tmp", "System /tmp"),
                (tempfile.gettempdir(), "Python tempdir"),
                (".", "Current directory"),
                (str(Path.home() / ".streamlit" / "temp"), "Streamlit temp")
            ]

            # The upload's name is client-supplied; keep only its final path
            # component so the test file cannot land outside temp_dir.
            safe_name = os.path.basename(uploaded_file.name)

            for temp_dir, desc in temp_locations:
                st.write(f"\n**Trying {desc}: {temp_dir}**")

                if not os.path.exists(temp_dir):
                    try:
                        os.makedirs(temp_dir, exist_ok=True)
                        st.write("✅ Created directory")
                    except OSError:
                        st.write("❌ Cannot create directory")
                        continue

                if not os.access(temp_dir, os.W_OK):
                    st.write("❌ Not writable")
                    continue

                try:
                    temp_path = os.path.join(temp_dir, f"test_{safe_name}")
                    with open(temp_path, 'wb') as f:
                        uploaded_file.seek(0)
                        f.write(uploaded_file.getbuffer())

                    if os.path.exists(temp_path):
                        try:
                            size = os.path.getsize(temp_path)
                            st.success(f"✅ Saved successfully! Size: {size} bytes")
                            st.code(f"Path: {temp_path}")

                            # Try to read back
                            with open(temp_path, 'rb') as f:
                                content = f.read()
                            st.write(f"✅ Read back: {len(content)} bytes")
                        finally:
                            # Cleanup even when the read-back fails, so no
                            # test file is left behind.
                            os.remove(temp_path)
                        st.write("✅ Cleaned up")
                        # First location that works is enough.
                        break
                    else:
                        st.error("❌ File not found after saving")

                except Exception as e:
                    st.error(f"❌ Failed: {type(e).__name__}: {str(e)}")
                    if "403" in str(e):
                        st.error("**403 ERROR DETECTED!**")
                    st.code(traceback.format_exc())

        except Exception as e:
            st.error(f"❌ General failure: {str(e)}")
            st.code(traceback.format_exc())

# Network test
st.subheader("4. Network Configuration")
st.write("**Streamlit Server Config:**")
st.write(f"- Server Port: {os.environ.get('STREAMLIT_SERVER_PORT', '8501')}")
st.write(f"- Server Address: {os.environ.get('STREAMLIT_SERVER_ADDRESS', 'Not set')}")
st.write(f"- Browser Port: {os.environ.get('STREAMLIT_BROWSER_SERVER_PORT', 'Not set')}")

# Check for proxy
# NOTE(review): proxy URLs can embed credentials (user:pass@host); this page
# prints them verbatim, so avoid exposing it publicly.
proxy_vars = ['HTTP_PROXY', 'HTTPS_PROXY', 'http_proxy', 'https_proxy']
for var in proxy_vars:
    if var in os.environ:
        st.write(f"- {var}: {os.environ[var]}")

st.info("If you see a 403 error above, please share the full error message and traceback.")
|
test_fix_403.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test if the memory-based file handler fixes the 403 error
|
| 3 |
+
"""
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import sys
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
sys.path.append(os.path.dirname(__file__))
|
| 9 |
+
|
| 10 |
+
# Test both handlers
|
| 11 |
+
from web_app.utils import FileUploadHandler, MemoryFileHandler
|
| 12 |
+
|
| 13 |
+
# Streamlit page that runs the same upload through the temp-file based
# FileUploadHandler and the in-memory MemoryFileHandler, side by side.
st.set_page_config(page_title="403 Error Fix Test", layout="wide")

st.title("Test 403 Error Fix")
st.write("This test compares the old FileUploadHandler with the new MemoryFileHandler")

# File upload
uploaded_file = st.file_uploader(
    "Upload a test file to check for 403 errors",
    type=['txt', 'csv', 'tsv'],
    help="We'll test both handlers with this file"
)

if uploaded_file:
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("❌ Old Method (FileUploadHandler)")
        st.write("This may cause 403 errors on restricted environments")

        try:
            # Try the old method
            temp_path = FileUploadHandler.save_to_temp(uploaded_file, prefix="test")
            if temp_path:
                st.success(f"✅ Saved to: {temp_path}")
                try:
                    content = FileUploadHandler.read_from_temp(temp_path)
                    if content:
                        st.success(f"✅ Read {len(content)} bytes")
                finally:
                    # Always remove the temp file, even when reading it back
                    # raised, so the test does not leak files.
                    FileUploadHandler.cleanup_temp_file(temp_path)
            else:
                st.error("❌ Failed to save to temp")
        except Exception as e:
            st.error(f"❌ Error: {str(e)}")
            if "403" in str(e):
                st.error("**403 ERROR DETECTED!**")

    with col2:
        st.subheader("✅ New Method (MemoryFileHandler)")
        st.write("This keeps files in memory, avoiding filesystem")

        try:
            # Reset file pointer: the old-method branch may have consumed it.
            uploaded_file.seek(0)

            # Try the new method
            content = MemoryFileHandler.process_uploaded_file(uploaded_file, as_text=False)
            if content:
                st.success(f"✅ Successfully read {len(content)} bytes")
                st.write("No filesystem access needed!")

                # Also test text mode
                uploaded_file.seek(0)
                text_content = MemoryFileHandler.process_uploaded_file(uploaded_file, as_text=True)
                if text_content:
                    st.success(f"✅ Text mode: {len(text_content)} characters")
            else:
                st.error("❌ Failed to read file")
        except Exception as e:
            st.error(f"❌ Error: {str(e)}")

st.info("""
**Summary:**
- The old FileUploadHandler saves files to /tmp which can trigger 403 errors
- The new MemoryFileHandler processes files entirely in memory
- To fix your app, replace all FileUploadHandler usage with MemoryFileHandler
""")

# Quick implementation guide
with st.expander("📝 How to implement the fix in your app"):
    st.code("""
# Replace this:
from web_app.utils import FileUploadHandler
temp_path = FileUploadHandler.save_to_temp(uploaded_file)
content = FileUploadHandler.read_from_temp(temp_path)

# With this:
from web_app.utils import MemoryFileHandler
content = MemoryFileHandler.process_uploaded_file(uploaded_file)
""", language="python")
|
test_memory_upload.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test memory-based file upload approach
|
| 3 |
+
"""
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import sys
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
sys.path.append(os.path.dirname(__file__))
|
| 9 |
+
|
| 10 |
+
from web_app.utils.memory_file_handler import MemoryFileHandler
|
| 11 |
+
|
| 12 |
+
# Streamlit page exercising MemoryFileHandler: text, binary, DataFrame and
# session-state handling of a single uploaded file.
st.set_page_config(page_title="Memory Upload Test", layout="wide")

st.title("Memory-Based File Upload Test")
st.write("This approach keeps files in memory to avoid filesystem 403 errors")

# File upload
uploaded_file = st.file_uploader(
    "Upload a test file",
    type=['txt', 'csv', 'tsv'],
    help="Files are processed entirely in memory"
)

if uploaded_file:
    st.write("### File Information")
    col1, col2 = st.columns(2)

    with col1:
        st.write("**File Details:**")
        st.write(f"- Name: {uploaded_file.name}")
        st.write(f"- Size: {uploaded_file.size:,} bytes")
        st.write(f"- Type: {uploaded_file.type}")

    with col2:
        st.write("**Processing Status:**")

        # Each test rewinds the upload first: the same stream object is shared
        # by all tests, and an earlier read may have left it at end-of-file.

        # Test text processing
        with st.expander("Test 1: Text Processing"):
            try:
                uploaded_file.seek(0)
                content = MemoryFileHandler.process_uploaded_file(uploaded_file, as_text=True)
                if content:
                    st.success(f"✅ Successfully read {len(content):,} characters")
                    st.text_area("Content Preview", content[:500] + "...", height=200)
                else:
                    st.error("Failed to read file")
            except Exception as e:
                st.error(f"Error: {str(e)}")

        # Test binary processing
        with st.expander("Test 2: Binary Processing"):
            try:
                uploaded_file.seek(0)
                content = MemoryFileHandler.process_uploaded_file(uploaded_file, as_text=False)
                if content:
                    st.success(f"✅ Successfully read {len(content):,} bytes")
                    st.write(f"First 100 bytes: {content[:100]}")
                else:
                    st.error("Failed to read file")
            except Exception as e:
                st.error(f"Error: {str(e)}")

        # Test DataFrame processing
        if uploaded_file.name.endswith(('.csv', '.tsv', '.txt')):
            with st.expander("Test 3: DataFrame Processing"):
                try:
                    uploaded_file.seek(0)
                    df = MemoryFileHandler.process_csv_tsv_file(uploaded_file)
                    if df is not None:
                        st.success(f"✅ Successfully parsed {len(df):,} rows")
                        st.dataframe(df.head())
                    else:
                        st.error("Failed to parse as DataFrame")
                except Exception as e:
                    st.error(f"Error: {str(e)}")

        # Test session storage
        with st.expander("Test 4: Session Storage"):
            try:
                # Store in session. getvalue() returns the whole buffer
                # regardless of the current read position, whereas read()
                # would return b"" once the earlier tests consumed the stream.
                MemoryFileHandler.store_in_session(f"test_file_{uploaded_file.name}", uploaded_file.getvalue())
                st.success("✅ Stored in session")

                # Retrieve from session
                retrieved = MemoryFileHandler.retrieve_from_session(f"test_file_{uploaded_file.name}")
                if retrieved:
                    st.success(f"✅ Retrieved {len(retrieved):,} bytes from session")
                else:
                    st.error("Failed to retrieve from session")
            except Exception as e:
                st.error(f"Error: {str(e)}")

st.info("💡 This approach processes files entirely in memory without touching the filesystem, avoiding 403 errors.")
|
update_to_memory_handler.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Script to update all file upload handlers to use MemoryFileHandler
|
| 4 |
+
This will prevent 403 errors on restricted environments like Hugging Face Spaces
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import re
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
# save_to_temp(...) immediately followed by `if temp_path:` and a
# read_from_temp(temp_path) assignment on the next line.
_SAVE_AND_READ_RE = re.compile(
    r'^(?P<indent>[ \t]*)temp_path = MemoryFileHandler\.save_to_temp\('
    r'(?P<arg>[^,)\n]+), prefix="[^"\n]+"\)[ \t]*\n'
    r'(?P<guard_indent>[ \t]*)if temp_path:[ \t]*\n'
    r'(?P<body_indent>[ \t]*)(?P<target>\w+) = MemoryFileHandler\.read_from_temp\(temp_path\)',
    re.MULTILINE,
)

# `if not MemoryFileHandler.validate_file_size(<file>, ...):` followed by `return`.
_SIZE_CHECK_RE = re.compile(
    r'^(?P<indent>[ \t]*)if not MemoryFileHandler\.validate_file_size\('
    r'\s*(?P<arg>[^,)\s]+)[^)]*\):(?P<gap>\s*)return',
    re.MULTILINE,
)

# A cleanup call that forms a whole statement line (optionally with a comment).
_CLEANUP_LINE_RE = re.compile(
    r'^(?P<indent>[ \t]*)MemoryFileHandler\.cleanup_(?:old_temp_files|temp_file)'
    r'\(.*\)[ \t]*(?:#.*)?$',
    re.MULTILINE,
)


def _rewrite_save_and_read(match):
    """Turn a save/guard/read triple into one in-memory read, keeping the guard."""
    return (
        f"{match['indent']}{match['target']} = "
        f"MemoryFileHandler.process_uploaded_file({match['arg']})\n"
        f"{match['guard_indent']}if {match['target']}:\n"
        # `pass` keeps the guarded block non-empty whatever follows it.
        f"{match['body_indent']}pass  # file content is already in memory"
    )


def _rewrite_size_check(match):
    """Inline the 300MB size check using the matched file name and indentation."""
    indent, file_arg, gap = match['indent'], match['arg'], match['gap']
    # Reuse the indentation of the original `return` line; for a one-line
    # `if ...: return` fall back to one extra indent level.
    body_indent = gap.rpartition("\n")[2] if "\n" in gap else indent + "    "
    return (
        f"{indent}if {file_arg}.size > 300 * 1024 * 1024:\n"
        f'{body_indent}st.error(f"File too large ({{{file_arg}.size / 1024 / 1024:.1f}} MB). '
        f'Maximum allowed: 300MB")\n'
        f"{body_indent}return"
    )


def update_file(filepath):
    """Rewrite one source file in place to use MemoryFileHandler.

    The rewrite is purely textual (regular expressions) and performs, in
    order: the import swap, renaming every ``FileUploadHandler.`` reference,
    collapsing ``save_to_temp`` + ``read_from_temp`` into
    ``process_uploaded_file``, inlining ``validate_file_size`` checks, and
    neutralising ``cleanup_temp_file`` / ``cleanup_old_temp_files`` calls.

    Every replacement preserves the indentation of the code it replaces, and
    removed statements become ``pass`` so that enclosing blocks never end up
    empty or mis-indented.

    Args:
        filepath: Path of the UTF-8 encoded file to update.

    Returns:
        True if the file was modified and written back, False if nothing
        needed to change.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    original_content = content

    # Update import statements
    content = re.sub(
        r'from web_app\.utils import FileUploadHandler',
        'from web_app.utils import MemoryFileHandler',
        content
    )

    # Update FileUploadHandler calls to MemoryFileHandler. The patterns below
    # therefore all look for the already-renamed `MemoryFileHandler.` prefix.
    content = re.sub(r'FileUploadHandler\.', 'MemoryFileHandler.', content)

    # Old pattern: temp_path = FileUploadHandler.save_to_temp(uploaded_file, prefix="...")
    #              if temp_path:
    #                  content = FileUploadHandler.read_from_temp(temp_path)
    # New pattern: content = MemoryFileHandler.process_uploaded_file(uploaded_file)
    #              if content:
    content = _SAVE_AND_READ_RE.sub(_rewrite_save_and_read, content)

    # Replace validate_file_size with an inline size check.
    content = _SIZE_CHECK_RE.sub(_rewrite_size_check, content)

    # Neutralise cleanup_temp_file / cleanup_old_temp_files calls. The line is
    # replaced rather than deleted so a block containing only the cleanup call
    # stays syntactically valid.
    content = _CLEANUP_LINE_RE.sub(
        r'\g<indent>pass  # temp-file cleanup is unnecessary with MemoryFileHandler',
        content
    )

    if content != original_content:
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"✅ Updated: {filepath}")
        return True
    else:
        print(f"⏭️ No changes needed: {filepath}")
        return False
|
| 74 |
+
|
| 75 |
+
def main():
    """Run ``update_file`` over every known handler module and report the result.

    Missing files are reported and skipped. Afterwards a one-time backup copy
    of the legacy ``file_upload_handler.py`` is created if that file exists
    and no backup is present yet.
    """
    # Modules that still reference the temp-file based handler.
    targets = (
        'web_app/handlers/analysis_handlers.py',
        'web_app/handlers/pos_handlers.py',
        'web_app/handlers/frequency_handlers.py',
        'web_app/reference_manager.py',
        'web_app/config_manager.py',
        'web_app/debug_utils.py',
    )
    separator = "=" * 60

    print("🔄 Updating file handlers to use MemoryFileHandler...")
    print(separator)

    changed = 0
    for target in targets:
        if not os.path.exists(target):
            print(f"❌ File not found: {target}")
            continue
        if update_file(target):
            changed += 1

    print(separator)
    print(f"✅ Updated {changed} files")

    # Keep a copy of the old FileUploadHandler, but never overwrite an
    # existing backup.
    legacy_handler = 'web_app/utils/file_upload_handler.py'
    legacy_backup = 'web_app/utils/file_upload_handler.py.backup'
    if not os.path.exists(legacy_handler) or os.path.exists(legacy_backup):
        return

    import shutil
    shutil.copy2(legacy_handler, legacy_backup)
    print(f"📋 Created backup: {legacy_backup}")


if __name__ == "__main__":
    main()
|
web_app/components/comparison_functions.py
CHANGED
|
@@ -8,7 +8,7 @@ import pandas as pd
|
|
| 8 |
import numpy as np
|
| 9 |
import plotly.graph_objects as go
|
| 10 |
from scipy import stats
|
| 11 |
-
from web_app.utils import
|
| 12 |
|
| 13 |
|
| 14 |
def get_text_input(label, key_suffix):
|
|
@@ -30,29 +30,12 @@ def get_text_input(label, key_suffix):
|
|
| 30 |
)
|
| 31 |
if uploaded_file:
|
| 32 |
try:
|
| 33 |
-
# Use
|
| 34 |
-
|
| 35 |
-
if not
|
| 36 |
-
st.error("Failed to
|
| 37 |
return ""
|
| 38 |
|
| 39 |
-
# Read content from temp file
|
| 40 |
-
content = FileUploadHandler.read_from_temp(temp_path)
|
| 41 |
-
if isinstance(content, bytes):
|
| 42 |
-
try:
|
| 43 |
-
text_content = content.decode('utf-8')
|
| 44 |
-
except UnicodeDecodeError:
|
| 45 |
-
try:
|
| 46 |
-
text_content = content.decode('utf-16')
|
| 47 |
-
except UnicodeDecodeError:
|
| 48 |
-
st.error("Unable to decode file. Please ensure it's a valid UTF-8 or UTF-16 text file.")
|
| 49 |
-
return ""
|
| 50 |
-
else:
|
| 51 |
-
text_content = content
|
| 52 |
-
|
| 53 |
-
# Cleanup temp file
|
| 54 |
-
FileUploadHandler.cleanup_temp_file(temp_path)
|
| 55 |
-
|
| 56 |
except Exception as e:
|
| 57 |
st.error(f"Error reading uploaded file: {str(e)}")
|
| 58 |
return ""
|
|
|
|
| 8 |
import numpy as np
|
| 9 |
import plotly.graph_objects as go
|
| 10 |
from scipy import stats
|
| 11 |
+
from web_app.utils import MemoryFileHandler
|
| 12 |
|
| 13 |
|
| 14 |
def get_text_input(label, key_suffix):
|
|
|
|
| 30 |
)
|
| 31 |
if uploaded_file:
|
| 32 |
try:
|
| 33 |
+
# Use memory-based approach to avoid filesystem restrictions
|
| 34 |
+
text_content = MemoryFileHandler.process_uploaded_file(uploaded_file, as_text=True)
|
| 35 |
+
if not text_content:
|
| 36 |
+
st.error("Failed to read uploaded file. Please try again.")
|
| 37 |
return ""
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
except Exception as e:
|
| 40 |
st.error(f"Error reading uploaded file: {str(e)}")
|
| 41 |
return ""
|
web_app/components/ui_components.py
CHANGED
|
@@ -7,7 +7,7 @@ import streamlit as st
|
|
| 7 |
import pandas as pd
|
| 8 |
from typing import Dict, List, Any, Optional, Tuple
|
| 9 |
from pathlib import Path
|
| 10 |
-
from web_app.utils import
|
| 11 |
|
| 12 |
from web_app.config_manager import ConfigManager
|
| 13 |
from web_app.session_manager import SessionManager
|
|
@@ -152,29 +152,12 @@ class UIComponents:
|
|
| 152 |
)
|
| 153 |
if uploaded_file:
|
| 154 |
try:
|
| 155 |
-
# Use
|
| 156 |
-
|
| 157 |
-
if not
|
| 158 |
-
st.error("Failed to
|
| 159 |
return ""
|
| 160 |
|
| 161 |
-
# Read content from temp file
|
| 162 |
-
content = FileUploadHandler.read_from_temp(temp_path)
|
| 163 |
-
if isinstance(content, bytes):
|
| 164 |
-
try:
|
| 165 |
-
text_content = content.decode('utf-8')
|
| 166 |
-
except UnicodeDecodeError:
|
| 167 |
-
try:
|
| 168 |
-
text_content = content.decode('utf-16')
|
| 169 |
-
except UnicodeDecodeError:
|
| 170 |
-
st.error("Unable to decode file. Please ensure it's a valid UTF-8 or UTF-16 text file.")
|
| 171 |
-
return ""
|
| 172 |
-
else:
|
| 173 |
-
text_content = content
|
| 174 |
-
|
| 175 |
-
# Cleanup temp file
|
| 176 |
-
FileUploadHandler.cleanup_temp_file(temp_path)
|
| 177 |
-
|
| 178 |
except Exception as e:
|
| 179 |
st.error(f"Error reading uploaded file: {str(e)}")
|
| 180 |
return ""
|
|
|
|
| 7 |
import pandas as pd
|
| 8 |
from typing import Dict, List, Any, Optional, Tuple
|
| 9 |
from pathlib import Path
|
| 10 |
+
from web_app.utils import MemoryFileHandler
|
| 11 |
|
| 12 |
from web_app.config_manager import ConfigManager
|
| 13 |
from web_app.session_manager import SessionManager
|
|
|
|
| 152 |
)
|
| 153 |
if uploaded_file:
|
| 154 |
try:
|
| 155 |
+
# Use memory-based approach to avoid filesystem restrictions
|
| 156 |
+
text_content = MemoryFileHandler.process_uploaded_file(uploaded_file, as_text=True)
|
| 157 |
+
if not text_content:
|
| 158 |
+
st.error("Failed to read uploaded file. Please try again.")
|
| 159 |
return ""
|
| 160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
except Exception as e:
|
| 162 |
st.error(f"Error reading uploaded file: {str(e)}")
|
| 163 |
return ""
|
web_app/handlers/analysis_handlers.py
CHANGED
|
@@ -1,112 +1,73 @@
|
|
| 1 |
"""
|
| 2 |
-
Analysis
|
| 3 |
-
Handles single text, batch, and comparison analysis workflows.
|
| 4 |
"""
|
| 5 |
|
| 6 |
import streamlit as st
|
| 7 |
import pandas as pd
|
| 8 |
-
import
|
| 9 |
-
import plotly.graph_objects as go
|
| 10 |
-
from scipy import stats
|
| 11 |
-
import tempfile
|
| 12 |
-
import os
|
| 13 |
-
from typing import Dict, List, Any, Optional, Tuple
|
| 14 |
-
from pathlib import Path
|
| 15 |
-
import zipfile
|
| 16 |
import time
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
from web_app.session_manager import SessionManager
|
| 19 |
from web_app.components.ui_components import UIComponents
|
| 20 |
-
from web_app.
|
| 21 |
-
|
| 22 |
-
from web_app.utils import FileUploadHandler
|
| 23 |
|
| 24 |
class AnalysisHandlers:
|
| 25 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
@staticmethod
|
| 28 |
-
def
|
| 29 |
-
"""
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
)
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
if (st.session_state.pos_parser is None or
|
| 48 |
-
st.session_state.pos_parser.language != st.session_state.language or
|
| 49 |
-
st.session_state.pos_parser.model_size != st.session_state.model_size):
|
| 50 |
-
try:
|
| 51 |
-
from text_analyzer.pos_parser import POSParser
|
| 52 |
-
st.session_state.pos_parser = POSParser(
|
| 53 |
-
language=st.session_state.language,
|
| 54 |
-
model_size=st.session_state.model_size
|
| 55 |
)
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
return st.session_state.pos_parser
|
| 60 |
|
| 61 |
@staticmethod
|
| 62 |
-
def
|
| 63 |
-
"""Handle
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
# Text input
|
| 67 |
-
text_content = UIComponents.render_text_input("text to analyze", "single")
|
| 68 |
-
|
| 69 |
-
if not text_content:
|
| 70 |
-
st.info("Please provide text to analyze.")
|
| 71 |
-
return
|
| 72 |
-
|
| 73 |
-
# Reference list configuration
|
| 74 |
-
st.subheader("Reference Lists")
|
| 75 |
-
ReferenceManager.configure_reference_lists(analyzer)
|
| 76 |
-
ReferenceManager.render_custom_upload_section()
|
| 77 |
-
|
| 78 |
-
# Analysis options
|
| 79 |
-
apply_log, word_type_filter = UIComponents.render_analysis_options()
|
| 80 |
-
|
| 81 |
-
# Analysis button
|
| 82 |
-
if st.button("Analyze Text", type="primary"):
|
| 83 |
-
reference_lists = SessionManager.get_reference_lists()
|
| 84 |
-
if not reference_lists:
|
| 85 |
-
st.warning("Please select or upload reference lists first.")
|
| 86 |
-
return
|
| 87 |
-
|
| 88 |
-
with st.spinner("Analyzing text..."):
|
| 89 |
-
try:
|
| 90 |
-
# Load reference lists
|
| 91 |
-
analyzer.load_reference_lists(reference_lists)
|
| 92 |
-
|
| 93 |
-
# Perform analysis
|
| 94 |
-
results = analyzer.analyze_text(
|
| 95 |
-
text_content,
|
| 96 |
-
list(reference_lists.keys()),
|
| 97 |
-
apply_log,
|
| 98 |
-
word_type_filter
|
| 99 |
-
)
|
| 100 |
-
|
| 101 |
-
# Display results
|
| 102 |
-
AnalysisHandlers.display_single_text_results(results)
|
| 103 |
-
|
| 104 |
-
except Exception as e:
|
| 105 |
-
st.error(f"Error during analysis: {e}")
|
| 106 |
|
| 107 |
@staticmethod
|
| 108 |
-
def handle_batch_analysis(analyzer):
|
| 109 |
-
"""Handle batch analysis workflow."""
|
| 110 |
st.subheader("Batch Analysis")
|
| 111 |
|
| 112 |
# File upload
|
|
@@ -118,312 +79,528 @@ class AnalysisHandlers:
|
|
| 118 |
)
|
| 119 |
|
| 120 |
if not uploaded_files:
|
| 121 |
-
st.info("Please upload text files
|
| 122 |
return
|
| 123 |
|
| 124 |
-
#
|
| 125 |
-
st.
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
reference_lists = SessionManager.get_reference_lists()
|
| 135 |
-
if not reference_lists:
|
| 136 |
-
st.warning("Please select or upload reference lists first.")
|
| 137 |
return
|
| 138 |
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
# Extract files
|
| 142 |
-
file_contents = AnalysisHandlers.extract_uploaded_files(uploaded_files)
|
| 143 |
-
|
| 144 |
-
if not file_contents:
|
| 145 |
-
st.error("No valid .txt files found in uploaded files.")
|
| 146 |
-
return
|
| 147 |
-
|
| 148 |
-
st.info(f"Found {len(file_contents)} files to process.")
|
| 149 |
-
|
| 150 |
-
# Load reference lists
|
| 151 |
-
analyzer.load_reference_lists(reference_lists)
|
| 152 |
-
|
| 153 |
-
# Create progress tracking
|
| 154 |
-
progress_bar = st.progress(0)
|
| 155 |
-
status_text = st.empty()
|
| 156 |
-
|
| 157 |
-
# Process files in memory
|
| 158 |
-
batch_results = []
|
| 159 |
-
selected_indices = list(reference_lists.keys())
|
| 160 |
-
|
| 161 |
-
for i, (filename, text_content) in enumerate(file_contents):
|
| 162 |
-
# Update progress
|
| 163 |
-
progress = (i + 1) / len(file_contents)
|
| 164 |
-
progress_bar.progress(progress)
|
| 165 |
-
status_text.text(f"Processing file {i + 1}/{len(file_contents)}: {filename}")
|
| 166 |
-
|
| 167 |
-
try:
|
| 168 |
-
# Analyze for both content and function words
|
| 169 |
-
result_row = {'filename': filename}
|
| 170 |
-
|
| 171 |
-
for word_type in ['CW', 'FW']:
|
| 172 |
-
analysis = analyzer.analyze_text(text_content, selected_indices, apply_log, word_type)
|
| 173 |
-
|
| 174 |
-
# Extract summary scores
|
| 175 |
-
if analysis and 'summary' in analysis:
|
| 176 |
-
for index, stats in analysis['summary'].items():
|
| 177 |
-
col_name = f"{index}_{word_type}"
|
| 178 |
-
result_row[col_name] = stats['mean']
|
| 179 |
-
|
| 180 |
-
batch_results.append(result_row)
|
| 181 |
-
except Exception as e:
|
| 182 |
-
st.warning(f"Error analyzing {filename}: {e}")
|
| 183 |
-
continue
|
| 184 |
-
|
| 185 |
-
# Convert to DataFrame
|
| 186 |
-
results_df = pd.DataFrame(batch_results)
|
| 187 |
-
|
| 188 |
-
# Display results
|
| 189 |
-
st.success(f"Analysis complete! Processed {len(results_df)} files.")
|
| 190 |
-
st.subheader("Results")
|
| 191 |
-
st.dataframe(results_df, use_container_width=True)
|
| 192 |
-
|
| 193 |
-
# Download link
|
| 194 |
-
csv_data = results_df.to_csv(index=False)
|
| 195 |
-
st.download_button(
|
| 196 |
-
label="Download Results (CSV)",
|
| 197 |
-
data=csv_data,
|
| 198 |
-
file_name="lexical_sophistication_results.csv",
|
| 199 |
-
mime="text/csv"
|
| 200 |
-
)
|
| 201 |
-
|
| 202 |
-
except Exception as e:
|
| 203 |
-
st.error(f"Error during batch analysis: {e}")
|
| 204 |
|
| 205 |
@staticmethod
|
| 206 |
-
def
|
| 207 |
-
"""
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
# Create two columns for text input
|
| 211 |
-
col_a, col_b = st.columns(2)
|
| 212 |
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
|
|
|
| 216 |
|
| 217 |
-
with
|
| 218 |
-
st.
|
| 219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
ReferenceManager.render_custom_upload_section()
|
| 229 |
|
| 230 |
-
# Analysis options
|
| 231 |
-
col1, col2 = st.columns(2)
|
| 232 |
with col1:
|
| 233 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
with col2:
|
| 235 |
-
|
| 236 |
-
"
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
key="comparison_word_type"
|
| 240 |
)
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
|
| 266 |
@staticmethod
|
| 267 |
-
def
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
for file_info in zip_ref.infolist():
|
| 282 |
-
if file_info.filename.endswith('.txt'):
|
| 283 |
-
try:
|
| 284 |
-
content = zip_ref.read(file_info.filename)
|
| 285 |
-
# Decode content
|
| 286 |
-
try:
|
| 287 |
-
text_content = content.decode('utf-8')
|
| 288 |
-
except UnicodeDecodeError:
|
| 289 |
-
try:
|
| 290 |
-
text_content = content.decode('utf-16')
|
| 291 |
-
except UnicodeDecodeError:
|
| 292 |
-
st.error(f"Unable to decode file {file_info.filename}. Skipping.")
|
| 293 |
-
continue
|
| 294 |
-
file_contents.append((file_info.filename, text_content))
|
| 295 |
-
except Exception as e:
|
| 296 |
-
st.error(f"Cannot read {file_info.filename}: {e}")
|
| 297 |
-
continue
|
| 298 |
-
elif uploaded_file.name.endswith('.txt'):
|
| 299 |
-
# Handle individual text files using temp file approach
|
| 300 |
-
try:
|
| 301 |
-
# Save to temp and read content
|
| 302 |
-
temp_path = FileUploadHandler.save_to_temp(uploaded_file, prefix="analysis")
|
| 303 |
-
if not temp_path:
|
| 304 |
-
st.error(f"Failed to save file {uploaded_file.name}")
|
| 305 |
-
continue
|
| 306 |
-
|
| 307 |
-
temp_paths.append(temp_path)
|
| 308 |
-
|
| 309 |
-
# Read content with encoding handling
|
| 310 |
-
content = FileUploadHandler.read_from_temp(temp_path)
|
| 311 |
-
if isinstance(content, bytes):
|
| 312 |
-
try:
|
| 313 |
-
text_content = content.decode('utf-8')
|
| 314 |
-
except UnicodeDecodeError:
|
| 315 |
-
try:
|
| 316 |
-
text_content = content.decode('utf-16')
|
| 317 |
-
except UnicodeDecodeError:
|
| 318 |
-
st.error(f"Unable to decode file {uploaded_file.name}. Skipping.")
|
| 319 |
-
continue
|
| 320 |
-
else:
|
| 321 |
-
text_content = content
|
| 322 |
-
|
| 323 |
-
file_contents.append((uploaded_file.name, text_content))
|
| 324 |
-
except Exception as e:
|
| 325 |
-
st.error(f"Cannot read file {uploaded_file.name}: {e}")
|
| 326 |
-
continue
|
| 327 |
-
else:
|
| 328 |
-
st.warning(f"Skipping {uploaded_file.name}: Not a .txt or .zip file")
|
| 329 |
|
| 330 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 331 |
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
|
| 337 |
@staticmethod
|
| 338 |
-
def
|
| 339 |
-
"""Display
|
| 340 |
-
st.
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
|
| 365 |
-
#
|
| 366 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
st.download_button(
|
| 368 |
-
label="Download
|
| 369 |
-
data=
|
| 370 |
-
file_name="
|
| 371 |
mime="text/csv"
|
| 372 |
)
|
| 373 |
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
if
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 385 |
|
| 386 |
@staticmethod
|
| 387 |
-
def
|
| 388 |
-
"""
|
| 389 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 390 |
return
|
| 391 |
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
))
|
| 418 |
|
| 419 |
-
#
|
| 420 |
-
fig.
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
showlegend=True,
|
| 426 |
-
bargap=0.05
|
| 427 |
)
|
| 428 |
|
| 429 |
-
st.plotly_chart(fig, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
Analysis Handlers for Streamlit Interface - Updated with MemoryFileHandler
|
|
|
|
| 3 |
"""
|
| 4 |
|
| 5 |
import streamlit as st
|
| 6 |
import pandas as pd
|
| 7 |
+
from typing import List, Tuple, Dict, Optional
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
import time
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
import zipfile
|
| 11 |
+
from io import BytesIO, StringIO
|
| 12 |
+
import sys
|
| 13 |
+
import os
|
| 14 |
|
| 15 |
+
# Add parent directory to path for imports
|
| 16 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
|
| 17 |
+
|
| 18 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 19 |
from web_app.session_manager import SessionManager
|
| 20 |
from web_app.components.ui_components import UIComponents
|
| 21 |
+
from web_app.utils import MemoryFileHandler
|
| 22 |
+
|
|
|
|
| 23 |
|
| 24 |
class AnalysisHandlers:
|
| 25 |
+
"""
|
| 26 |
+
Handles analysis-related UI components and workflows.
|
| 27 |
+
Updated to use MemoryFileHandler for better compatibility.
|
| 28 |
+
"""
|
| 29 |
|
| 30 |
@staticmethod
|
| 31 |
+
def handle_single_text_analysis(analyzer: LexicalSophisticationAnalyzer):
    """Render the single-text input and, on button press, analyze, store and show the text.

    Args:
        analyzer: Object whose ``analyze_text`` is called on the entered text.
    """
    entered_text = UIComponents.render_text_input_section("Text for Analysis", "single_text")

    # Short-circuit order is significant: the button is only rendered once text exists.
    if not (entered_text and st.button("Analyze Text", type="primary", key="analyze_single")):
        return

    with st.spinner("Analyzing text..."):
        started_at = time.time()

        options = SessionManager.get_analysis_params()
        metrics = analyzer.analyze_text(
            entered_text,
            include_pos_info=options['include_pos'],
            min_word_length=options['min_word_length'],
        )

        # Elapsed time covers parameter lookup and the analysis call only.
        elapsed = time.time() - started_at

        SessionManager.store_analysis_results(
            'single_text',
            metrics,
            {'analysis_time': elapsed},
        )
        AnalysisHandlers._display_analysis_results(metrics, elapsed)
|
|
|
|
| 61 |
|
| 62 |
@staticmethod
|
| 63 |
+
def handle_comparison_analysis(analyzer: LexicalSophisticationAnalyzer):
    """Delegate the text comparison workflow to ``render_comparison_interface``.

    Args:
        analyzer: Passed through unchanged to the comparison interface.
    """
    # Imported at call time rather than at module load.
    # NOTE(review): presumably this avoids a circular import - confirm.
    from web_app.components.comparison_functions import (
        render_comparison_interface as render_interface,
    )

    render_interface(analyzer)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
@staticmethod
|
| 69 |
+
def handle_batch_analysis(analyzer: LexicalSophisticationAnalyzer):
|
| 70 |
+
"""Handle batch analysis workflow with memory-based file handling."""
|
| 71 |
st.subheader("Batch Analysis")
|
| 72 |
|
| 73 |
# File upload
|
|
|
|
| 79 |
)
|
| 80 |
|
| 81 |
if not uploaded_files:
|
| 82 |
+
st.info("Please upload text files or a ZIP archive to begin batch analysis.")
|
| 83 |
return
|
| 84 |
|
| 85 |
+
# Analysis parameters
|
| 86 |
+
with st.expander("Analysis Parameters", expanded=True):
|
| 87 |
+
params = AnalysisHandlers._render_batch_analysis_params()
|
| 88 |
+
|
| 89 |
+
if st.button("Start Batch Analysis", type="primary"):
|
| 90 |
+
# Process files
|
| 91 |
+
file_contents = AnalysisHandlers._process_batch_files_memory(uploaded_files)
|
| 92 |
+
|
| 93 |
+
if not file_contents:
|
| 94 |
+
st.error("No valid text files found.")
|
|
|
|
|
|
|
|
|
|
| 95 |
return
|
| 96 |
|
| 97 |
+
# Run batch analysis
|
| 98 |
+
AnalysisHandlers._run_batch_analysis(analyzer, file_contents, params)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
@staticmethod
|
| 101 |
+
def _process_batch_files_memory(uploaded_files) -> List[Tuple[str, str]]:
    """
    Read uploaded ``.txt`` files and ``.zip`` archives into memory as text.

    Extension checks are case-insensitive. ZIP members that arrive as bytes
    are decoded as UTF-8 first and UTF-16 second; members that fit neither
    encoding are skipped with a warning. Uploads that are neither ``.txt``
    nor ``.zip`` are skipped with a warning as well.

    Args:
        uploaded_files: Sized iterable of uploaded file objects exposing
            ``.name``; each one is handed to ``MemoryFileHandler``.

    Returns:
        List of tuples (filename, content) for every text file that was read.
    """

    def _decode(raw: bytes) -> Optional[str]:
        """Return ``raw`` as text (UTF-8, then UTF-16), or None if neither fits."""
        for encoding in ('utf-8', 'utf-16'):
            try:
                return raw.decode(encoding)
            except UnicodeDecodeError:
                continue
        return None

    file_contents = []

    with st.spinner("Processing uploaded files..."):
        progress_bar = st.progress(0)
        total_files = len(uploaded_files)

        for idx, uploaded_file in enumerate(uploaded_files):
            try:
                lowered_name = uploaded_file.name.lower()

                if lowered_name.endswith('.zip'):
                    # Handle ZIP files: only .txt members are kept.
                    zip_contents = MemoryFileHandler.handle_zip_file(uploaded_file)
                    if zip_contents:
                        for filename, content in zip_contents.items():
                            if not filename.lower().endswith('.txt'):
                                continue

                            if isinstance(content, bytes):
                                text_content = _decode(content)
                                if text_content is None:
                                    st.warning(
                                        f"Skipping {filename}: Unable to decode as UTF-8 or UTF-16"
                                    )
                                    continue
                            else:
                                # Already text (or whatever the handler returned); keep as-is.
                                text_content = content

                            file_contents.append((filename, text_content))

                elif lowered_name.endswith('.txt'):
                    # Handle individual text files
                    text_content = MemoryFileHandler.process_uploaded_file(uploaded_file, as_text=True)
                    if text_content:
                        file_contents.append((uploaded_file.name, text_content))
                    else:
                        st.warning(f"Could not read {uploaded_file.name}")

                else:
                    st.warning(f"Skipping {uploaded_file.name}: Not a .txt or .zip file")

            except Exception as e:
                # Best effort: one bad upload must not abort the rest of the batch.
                st.error(f"Error processing {uploaded_file.name}: {str(e)}")

            # Update progress (runs for failed files too)
            progress_bar.progress((idx + 1) / total_files)

        progress_bar.empty()

    st.success(f"Processed {len(file_contents)} text files")
    return file_contents
|
| 150 |
+
|
| 151 |
+
@staticmethod
|
| 152 |
+
def _render_batch_analysis_params() -> dict:
    """Draw the batch-analysis option widgets in three columns and collect their values.

    Returns:
        dict with the keys ``include_pos``, ``min_word_length``,
        ``analyze_readability``, ``analyze_diversity``, ``export_format`` and
        ``include_raw_data``, each holding the current value of its widget.
    """
    left, middle, right = st.columns(3)
    chosen = {}

    # Column 1: tokenisation-related switches.
    with left:
        chosen['include_pos'] = st.checkbox(
            "Include POS Analysis",
            value=True,
            help="Include part-of-speech tagging (slower but more detailed)",
        )
        chosen['min_word_length'] = st.number_input(
            "Minimum Word Length",
            min_value=1,
            max_value=10,
            value=1,
            help="Exclude words shorter than this length",
        )

    # Column 2: which metric families to compute.
    with middle:
        chosen['analyze_readability'] = st.checkbox(
            "Analyze Readability",
            value=True,
            help="Include readability metrics",
        )
        chosen['analyze_diversity'] = st.checkbox(
            "Analyze Diversity",
            value=True,
            help="Include lexical diversity metrics",
        )

    # Column 3: export preferences.
    with right:
        chosen['export_format'] = st.selectbox(
            "Export Format",
            ["CSV", "Excel", "JSON"],
            help="Format for results export",
        )
        chosen['include_raw_data'] = st.checkbox(
            "Include Raw Data",
            value=False,
            help="Include word lists in export",
        )

    return chosen
|
| 205 |
|
| 206 |
@staticmethod
|
| 207 |
+
def _run_batch_analysis(analyzer: LexicalSophisticationAnalyzer,
                        file_contents: List[Tuple[str, str]],
                        params: dict):
    """Analyze every (filename, text) pair and pass the collected metrics to the results view.

    A file whose analysis raises is reported with ``st.error`` and left out of
    the results; the remaining files are still processed, and the progress bar
    advances for failed files as well.

    Args:
        analyzer: Object whose ``analyze_text`` is called once per file.
        file_contents: (filename, text) pairs to analyze.
        params: Batch options; ``include_pos`` and ``min_word_length`` are
            read here and the whole dict is forwarded to the results view.
    """
    results = []
    total_files = len(file_contents)

    # Progress tracking
    progress_bar = st.progress(0)
    status_text = st.empty()

    start_time = time.time()

    for idx, (filename, content) in enumerate(file_contents):
        status_text.text(f"Analyzing {filename}...")

        try:
            metrics = analyzer.analyze_text(
                content,
                include_pos_info=params['include_pos'],
                min_word_length=params['min_word_length']
            )

            # Tag the metrics with their source file so each row is identifiable.
            metrics['filename'] = filename
            results.append(metrics)

        except Exception as e:
            # Best effort: report the failure and keep going with the next file.
            st.error(f"Error analyzing {filename}: {str(e)}")

        # Advance after every file, successful or not, so the bar never stalls.
        progress_bar.progress((idx + 1) / total_files)

    # Clear progress indicators
    progress_bar.empty()
    status_text.empty()

    total_time = time.time() - start_time

    # Display results
    AnalysisHandlers._display_batch_results(results, params, total_time)
|
| 249 |
|
| 250 |
@staticmethod
|
| 251 |
+
def _display_batch_results(results: List[dict], params: dict, total_time: float):
    """Show the batch outcome as a table or charts and offer the results for download.

    Args:
        results: One metrics dict per analyzed file; each becomes one row.
        params: Batch options; only ``export_format`` is read here.
        total_time: Elapsed seconds reported in the banner.
    """
    st.success(f"✅ Analyzed {len(results)} files in {total_time:.1f} seconds")

    if not results:
        return

    table = pd.DataFrame(results)

    # Lead with the headline metrics that are present, then all other columns
    # in their existing order.
    headline = ['filename', 'total_words', 'unique_words', 'avg_word_length',
                'lexical_diversity', 'avg_word_frequency']
    trailing = [name for name in table.columns if name not in headline]
    leading = [name for name in headline if name in table.columns]
    table = table[leading + trailing]

    # Header row: title on the left, display-mode switch on the right.
    title_col, mode_col = st.columns([3, 1])
    with title_col:
        st.subheader("Analysis Results")
    with mode_col:
        display_mode = st.radio("Display", ["Table", "Charts"], horizontal=True)

    if display_mode != "Table":
        AnalysisHandlers._render_batch_charts(table)
    else:
        st.dataframe(
            table,
            use_container_width=True,
            hide_index=True,
            height=400,
        )
        with st.expander("Summary Statistics"):
            st.write(table.describe())

    st.subheader("Export Results")
    csv_col, excel_col, json_col = st.columns(3)

    # CSV is always offered, regardless of the selected export format.
    with csv_col:
        csv_text = table.to_csv(index=False)
        st.download_button(
            label="📥 Download as CSV",
            data=csv_text,
            file_name=f"batch_analysis_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
            mime="text/csv",
        )

    with excel_col:
        if params['export_format'] == "Excel":
            # Build the workbook in memory; the buffer is complete once the writer closes.
            workbook = BytesIO()
            with pd.ExcelWriter(workbook, engine='openpyxl') as writer:
                table.to_excel(writer, sheet_name='Results', index=False)
                table.describe().to_excel(writer, sheet_name='Summary')

            st.download_button(
                label="📥 Download as Excel",
                data=workbook.getvalue(),
                file_name=f"batch_analysis_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            )

    with json_col:
        if params['export_format'] == "JSON":
            st.download_button(
                label="📥 Download as JSON",
                data=table.to_json(orient='records', indent=2),
                file_name=f"batch_analysis_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                mime="application/json",
            )
|
| 336 |
|
| 337 |
@staticmethod
|
| 338 |
+
def _render_batch_charts(df_results: pd.DataFrame):
    """Render interactive charts for a table of batch analysis results.

    Draws a scatter plot of two user-selected numeric columns, followed by
    a histogram and a box plot of a third user-selected numeric column.

    Args:
        df_results: Batch results table. Every numeric column (except one
            named ``index``) is offered as a plottable metric. A
            ``filename`` column, when present, is shown on hover in the
            scatter plot.
    """
    import plotly.express as px

    # "number" matches every numeric dtype, not only float64/int64, so
    # columns stored as e.g. float32 or int32 stay selectable.
    numeric_cols = [
        col
        for col in df_results.select_dtypes(include="number").columns
        if col != 'index'
    ]

    if not numeric_cols:
        st.warning("No numeric data available for visualization")
        return

    # Metric selection
    col1, col2 = st.columns(2)
    with col1:
        x_metric = st.selectbox("X-axis metric", numeric_cols, index=0)
    with col2:
        # Default the Y axis to the second metric when there is one.
        y_metric = st.selectbox(
            "Y-axis metric",
            numeric_cols,
            index=min(1, len(numeric_cols) - 1),
        )

    # Plotly raises if asked to hover on a column the frame does not have,
    # so only request 'filename' when it actually exists.
    hover_columns = ['filename'] if 'filename' in df_results.columns else None

    # Create scatter plot
    fig = px.scatter(
        df_results,
        x=x_metric,
        y=y_metric,
        hover_data=hover_columns,
        title=f"{y_metric} vs {x_metric}",
        labels={
            x_metric: x_metric.replace('_', ' ').title(),
            y_metric: y_metric.replace('_', ' ').title(),
        },
    )
    fig.update_traces(marker=dict(size=10))
    fig.update_layout(height=500)
    st.plotly_chart(fig, use_container_width=True)

    # Distribution plots
    st.subheader("Metric Distributions")

    selected_metric = st.selectbox(
        "Select metric for distribution",
        numeric_cols,
        key="dist_metric",
    )
    pretty_metric = selected_metric.replace('_', ' ').title()

    col1, col2 = st.columns(2)

    with col1:
        # Histogram
        fig_hist = px.histogram(
            df_results,
            x=selected_metric,
            nbins=20,
            title=f"Distribution of {pretty_metric}",
        )
        fig_hist.update_layout(height=400)
        st.plotly_chart(fig_hist, use_container_width=True)

    with col2:
        # Box plot with every observation overlaid as a point
        fig_box = px.box(
            df_results,
            y=selected_metric,
            title=f"Box Plot of {pretty_metric}",
            points="all",
        )
        fig_box.update_layout(height=400)
        st.plotly_chart(fig_box, use_container_width=True)
|
| 406 |
+
|
| 407 |
+
@staticmethod
|
| 408 |
+
def _display_analysis_results(metrics: dict, analysis_time: float):
    """Show the outcome of a single-text analysis as a set of tabs.

    Args:
        metrics: Analysis results handed unchanged to each tab renderer.
        analysis_time: Elapsed analysis time in seconds, shown in the
            success banner with two decimals.
    """
    st.success(f"✅ Analysis completed in {analysis_time:.2f} seconds")

    # (tab title, name of the AnalysisHandlers renderer that fills the tab)
    sections = (
        ("📊 Overview", "_render_overview_metrics"),
        ("📈 Frequency Analysis", "_render_frequency_analysis"),
        ("🎯 Advanced Metrics", "_render_advanced_metrics"),
        ("📋 Raw Data", "_render_raw_data"),
    )

    tabs = st.tabs([title for title, _ in sections])
    for tab, (_, renderer_name) in zip(tabs, sections):
        with tab:
            getattr(AnalysisHandlers, renderer_name)(metrics)
|
| 431 |
+
|
| 432 |
+
@staticmethod
|
| 433 |
+
def _render_overview_metrics(metrics: dict):
    """Show the headline metrics as a four-column grid of metric tiles.

    Args:
        metrics: Analysis results; any key that is missing is displayed
            as 0.
    """
    # One tuple per column; each entry is (label, metrics key, format spec).
    layout = (
        (
            ("Total Words", 'total_words', ","),
            ("Sentences", 'sentence_count', ","),
        ),
        (
            ("Unique Words", 'unique_words', ","),
            ("Avg Sentence Length", 'avg_sentence_length', ".1f"),
        ),
        (
            ("Lexical Diversity", 'lexical_diversity', ".3f"),
            ("Avg Word Length", 'avg_word_length', ".2f"),
        ),
        (
            ("Readability (Flesch)", 'flesch_reading_ease', ".1f"),
            ("Grade Level", 'flesch_kincaid_grade', ".1f"),
        ),
    )

    for column, tiles in zip(st.columns(4), layout):
        with column:
            for label, key, spec in tiles:
                st.metric(label, format(metrics.get(key, 0), spec))
|
| 452 |
+
|
| 453 |
+
@staticmethod
|
| 454 |
+
def _render_frequency_analysis(metrics: dict):
    """Render the word-frequency bar chart and summary statistics.

    Args:
        metrics: Analysis results. The ``frequency_distribution`` entry is
            read as a mapping of word -> occurrence count. When it is
            missing or empty an informational message is shown instead.
    """
    import plotly.graph_objects as go

    freq_dist = metrics.get('frequency_distribution')
    # An empty distribution has no "most common word" and a zero total,
    # so it is treated the same as a missing one.
    if not freq_dist:
        st.info("Frequency distribution data not available")
        return

    # Rank explicitly instead of trusting the mapping's insertion order.
    # The sort is stable, so input that is already ordered by descending
    # count keeps its exact order.
    ranked = sorted(freq_dist.items(), key=lambda pair: pair[1], reverse=True)
    top_entries = ranked[:30]  # Top 30 words
    words = [word for word, _ in top_entries]
    frequencies = [count for _, count in top_entries]

    # Create bar chart
    fig = go.Figure(data=[
        go.Bar(x=words, y=frequencies)
    ])
    fig.update_layout(
        title="Top 30 Most Frequent Words",
        xaxis_title="Words",
        yaxis_title="Frequency",
        height=500,
    )
    st.plotly_chart(fig, use_container_width=True)

    # Word frequency statistics
    col1, col2 = st.columns(2)

    with col1:
        st.write("**Frequency Statistics:**")
        st.write(f"• Most common word: {words[0]} ({frequencies[0]} times)")
        st.write(f"• Hapax legomena: {sum(1 for f in freq_dist.values() if f == 1)} words")
        st.write(f"• Words appearing 2+ times: {sum(1 for f in freq_dist.values() if f >= 2)}")

    with col2:
        st.write("**Coverage Analysis:**")
        total_words = sum(freq_dist.values())
        # Guard the division: a distribution whose counts are all zero
        # would otherwise raise ZeroDivisionError.
        if total_words:
            top10_coverage = sum(frequencies[:10]) / total_words * 100
            top30_coverage = sum(frequencies[:30]) / total_words * 100
        else:
            top10_coverage = top30_coverage = 0.0
        st.write(f"• Top 10 words: {top10_coverage:.1f}% of text")
        st.write(f"• Top 30 words: {top30_coverage:.1f}% of text")
|
| 498 |
+
|
| 499 |
+
@staticmethod
|
| 500 |
+
def _render_advanced_metrics(metrics: dict):
    """Show the part-of-speech pie chart and the sophistication metrics.

    Args:
        metrics: Analysis results. ``pos_distribution`` (a mapping of POS
            tag -> count) drives the pie chart; each sophistication metric
            is displayed only when its key is present.
    """
    # POS distribution if available
    if 'pos_distribution' in metrics:
        st.subheader("Part-of-Speech Distribution")

        tag_counts = metrics['pos_distribution']
        if tag_counts:
            import plotly.express as px

            # One row per tag, largest slice first
            pos_table = pd.DataFrame(
                list(tag_counts.items()),
                columns=['POS', 'Count'],
            ).sort_values('Count', ascending=False)

            pie = px.pie(
                pos_table,
                values='Count',
                names='POS',
                title="Part-of-Speech Distribution",
            )
            st.plotly_chart(pie, use_container_width=True)

    # Sophistication metrics
    st.subheader("Sophistication Metrics")

    # One tuple per column; each entry is
    # (metrics key, label, format spec, help text).
    panels = (
        (
            ('avg_word_frequency', "Average Word Frequency", ".2f",
             "Average frequency of words in reference corpus"),
            ('academic_words_ratio', "Academic Words Ratio", ".2%",
             "Percentage of academic vocabulary"),
        ),
        (
            ('rare_words_ratio', "Rare Words Ratio", ".2%",
             "Percentage of infrequent words"),
            ('lexical_sophistication_score', "Sophistication Score", ".3f",
             "Overall lexical sophistication"),
        ),
    )

    for column, entries in zip(st.columns(2), panels):
        with column:
            for key, label, spec, tip in entries:
                if key in metrics:
                    st.metric(label, format(metrics[key], spec), help=tip)
|
| 561 |
+
|
| 562 |
+
@staticmethod
|
| 563 |
+
def _render_raw_data(metrics: dict):
    """List every metric and offer JSON / CSV downloads of the flat ones.

    Args:
        metrics: Analysis results. Dict or list values whose string form
            exceeds 100 characters are tucked into expanders; everything
            else is printed inline.
    """
    st.write("**Available Metrics:**")

    # Display all metrics, collapsing bulky containers into expanders
    for name, payload in metrics.items():
        bulky = isinstance(payload, (dict, list)) and len(str(payload)) > 100
        if not bulky:
            st.write(f"• **{name}:** {payload}")
            continue
        with st.expander(f"{name} (complex data)"):
            show = st.json if isinstance(payload, dict) else st.write
            show(payload)

    # Export options
    st.subheader("Export Data")

    # Only scalar metrics are exported, plus the POS distribution.
    exportable = {}
    for name, payload in metrics.items():
        if name in ['pos_distribution'] or not isinstance(payload, (dict, list)):
            exportable[name] = payload

    json_column, csv_column = st.columns(2)

    with json_column:
        # JSON export
        json_payload = pd.Series(exportable).to_json(indent=2)
        st.download_button(
            label="📥 Download as JSON",
            data=json_payload,
            file_name=f"analysis_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
            mime="application/json",
        )

    with csv_column:
        # CSV export: a single-row table with one column per metric
        csv_payload = pd.DataFrame([exportable]).to_csv(index=False)
        st.download_button(
            label="📥 Download as CSV",
            data=csv_payload,
            file_name=f"analysis_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
            mime="text/csv",
        )
|
web_app/handlers/analysis_handlers.py.backup_20250726_162020
ADDED
|
@@ -0,0 +1,429 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Analysis handlers module for different types of text analysis.
|
| 3 |
+
Handles single text, batch, and comparison analysis workflows.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import streamlit as st
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import numpy as np
|
| 9 |
+
import plotly.graph_objects as go
|
| 10 |
+
from scipy import stats
|
| 11 |
+
import tempfile
|
| 12 |
+
import os
|
| 13 |
+
from typing import Dict, List, Any, Optional, Tuple
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
import zipfile
|
| 16 |
+
import time
|
| 17 |
+
|
| 18 |
+
from web_app.session_manager import SessionManager
|
| 19 |
+
from web_app.components.ui_components import UIComponents
|
| 20 |
+
from web_app.components.comparison_functions import get_text_input, display_comparison_results
|
| 21 |
+
from web_app.reference_manager import ReferenceManager
|
| 22 |
+
from web_app.utils import FileUploadHandler
|
| 23 |
+
|
| 24 |
+
class AnalysisHandlers:
    """Handles different types of text analysis workflows.

    Every method is a static method that drives one Streamlit workflow
    (single text, batch, two-text comparison) or one of its helpers.
    """

    @staticmethod
    def get_analyzer():
        """Get or create the lexical sophistication analyzer.

        The analyzer cached in ``st.session_state.analyzer`` is rebuilt when
        it is missing or when its ``language`` / ``model_size`` differ from
        the values currently held in session state.

        Returns:
            The analyzer, or ``None`` when it could not be constructed (the
            error is reported in the UI).
        """
        if (st.session_state.analyzer is None or
                st.session_state.analyzer.language != st.session_state.language or
                st.session_state.analyzer.model_size != st.session_state.model_size):
            try:
                from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
                st.session_state.analyzer = LexicalSophisticationAnalyzer(
                    language=st.session_state.language,
                    model_size=st.session_state.model_size
                )
            except Exception as e:
                st.error(f"Error loading analyzer: {e}")
                return None
        return st.session_state.analyzer

    @staticmethod
    def get_pos_parser():
        """Get or create the POS parser.

        Mirrors :meth:`get_analyzer`: the parser cached in
        ``st.session_state.pos_parser`` is rebuilt when it is missing or its
        ``language`` / ``model_size`` no longer match session state.

        Returns:
            The parser, or ``None`` when it could not be constructed (the
            error is reported in the UI).
        """
        if (st.session_state.pos_parser is None or
                st.session_state.pos_parser.language != st.session_state.language or
                st.session_state.pos_parser.model_size != st.session_state.model_size):
            try:
                from text_analyzer.pos_parser import POSParser
                st.session_state.pos_parser = POSParser(
                    language=st.session_state.language,
                    model_size=st.session_state.model_size
                )
            except Exception as e:
                st.error(f"Error loading POS parser: {e}")
                return None
        return st.session_state.pos_parser

    @staticmethod
    def handle_single_text_analysis(analyzer):
        """Handle single text analysis workflow.

        Args:
            analyzer: Object providing ``load_reference_lists`` and
                ``analyze_text``, as called below.
        """
        st.subheader("Single Text Analysis")

        # Text input
        text_content = UIComponents.render_text_input("text to analyze", "single")

        if not text_content:
            st.info("Please provide text to analyze.")
            return

        # Reference list configuration
        st.subheader("Reference Lists")
        ReferenceManager.configure_reference_lists(analyzer)
        ReferenceManager.render_custom_upload_section()

        # Analysis options
        apply_log, word_type_filter = UIComponents.render_analysis_options()

        # Analysis button
        if st.button("Analyze Text", type="primary"):
            reference_lists = SessionManager.get_reference_lists()
            if not reference_lists:
                st.warning("Please select or upload reference lists first.")
                return

            with st.spinner("Analyzing text..."):
                try:
                    # Load reference lists
                    analyzer.load_reference_lists(reference_lists)

                    # Perform analysis
                    results = analyzer.analyze_text(
                        text_content,
                        list(reference_lists.keys()),
                        apply_log,
                        word_type_filter
                    )

                    # Display results
                    AnalysisHandlers.display_single_text_results(results)

                except Exception as e:
                    # UI boundary: report the failure instead of crashing the app.
                    st.error(f"Error during analysis: {e}")

    @staticmethod
    def handle_batch_analysis(analyzer):
        """Handle batch analysis workflow.

        Each uploaded text is analysed once for content words (``CW``) and
        once for function words (``FW``); the mean of every summary index is
        stored in a ``<index>_<word type>`` column of the result table.

        Args:
            analyzer: Object providing ``load_reference_lists`` and
                ``analyze_text``, as called below.
        """
        st.subheader("Batch Analysis")

        # File upload
        uploaded_files = st.file_uploader(
            "Upload Text Files",
            type=['txt', 'zip'],
            accept_multiple_files=True,
            help="Upload individual .txt files or a .zip archive containing .txt files"
        )

        if not uploaded_files:
            st.info("Please upload text files for batch analysis.")
            return

        # Reference list configuration
        st.subheader("Reference Lists")
        ReferenceManager.configure_reference_lists(analyzer)
        ReferenceManager.render_custom_upload_section()

        # Analysis options
        apply_log = st.checkbox("Apply log₁₀ transformation", key="batch_log")

        # Analysis button
        if st.button("Analyze Batch", type="primary"):
            reference_lists = SessionManager.get_reference_lists()
            if not reference_lists:
                st.warning("Please select or upload reference lists first.")
                return

            with st.spinner("Processing files..."):
                try:
                    # Extract files
                    file_contents = AnalysisHandlers.extract_uploaded_files(uploaded_files)

                    if not file_contents:
                        st.error("No valid .txt files found in uploaded files.")
                        return

                    total_files = len(file_contents)
                    st.info(f"Found {total_files} files to process.")

                    # Load reference lists
                    analyzer.load_reference_lists(reference_lists)

                    # Create progress tracking
                    progress_bar = st.progress(0)
                    status_text = st.empty()

                    # Process files in memory
                    batch_results = []
                    selected_indices = list(reference_lists.keys())

                    for position, (filename, text_content) in enumerate(file_contents, start=1):
                        # Update progress
                        progress_bar.progress(position / total_files)
                        status_text.text(f"Processing file {position}/{total_files}: {filename}")

                        try:
                            # Analyze for both content and function words
                            result_row = {'filename': filename}

                            for word_type in ['CW', 'FW']:
                                analysis = analyzer.analyze_text(text_content, selected_indices, apply_log, word_type)

                                # Extract summary scores
                                if analysis and 'summary' in analysis:
                                    # Named index_stats so the scipy.stats import
                                    # is not shadowed inside this method.
                                    for index, index_stats in analysis['summary'].items():
                                        result_row[f"{index}_{word_type}"] = index_stats['mean']

                            batch_results.append(result_row)
                        except Exception as e:
                            # One bad file must not abort the whole batch.
                            st.warning(f"Error analyzing {filename}: {e}")
                            continue

                    # Do not announce success when every file failed.
                    if not batch_results:
                        st.error("None of the uploaded files could be analyzed.")
                        return

                    # Convert to DataFrame
                    results_df = pd.DataFrame(batch_results)

                    # Display results
                    st.success(f"Analysis complete! Processed {len(results_df)} files.")
                    st.subheader("Results")
                    st.dataframe(results_df, use_container_width=True)

                    # Download link
                    csv_data = results_df.to_csv(index=False)
                    st.download_button(
                        label="Download Results (CSV)",
                        data=csv_data,
                        file_name="lexical_sophistication_results.csv",
                        mime="text/csv"
                    )

                except Exception as e:
                    st.error(f"Error during batch analysis: {e}")

    @staticmethod
    def handle_two_text_comparison(analyzer):
        """Handle two-text comparison analysis.

        Args:
            analyzer: Object providing ``load_reference_lists`` and
                ``analyze_text``, as called below.
        """
        st.subheader("Two-Text Comparison")

        # Create two columns for text input
        col_a, col_b = st.columns(2)

        with col_a:
            st.subheader("📄 Text A")
            text_a = get_text_input("Text A", "a")

        with col_b:
            st.subheader("📄 Text B")
            text_b = get_text_input("Text B", "b")

        # Check if both texts are provided
        if not text_a or not text_b:
            st.info("Please provide both texts to compare.")
            return

        # Reference list configuration
        st.subheader("Reference Lists")
        ReferenceManager.configure_reference_lists(analyzer)
        ReferenceManager.render_custom_upload_section()

        # Analysis options
        col1, col2 = st.columns(2)
        with col1:
            apply_log = st.checkbox("Apply log₁₀ transformation", key="comparison_log")
        with col2:
            word_type_filter = st.selectbox(
                "Word Type Filter",
                options=[None, 'CW', 'FW'],
                format_func=lambda x: 'All Words' if x is None else ('Content Words' if x == 'CW' else 'Function Words'),
                key="comparison_word_type"
            )

        # Analysis button
        if st.button("🔍 Compare Texts", type="primary"):
            reference_lists = SessionManager.get_reference_lists()
            if not reference_lists:
                st.warning("Please select or upload reference lists first.")
                return

            with st.spinner("Analyzing texts..."):
                try:
                    # Load reference lists
                    analyzer.load_reference_lists(reference_lists)

                    # Perform analysis on both texts
                    selected_indices = list(reference_lists.keys())

                    results_a = analyzer.analyze_text(text_a, selected_indices, apply_log, word_type_filter)
                    results_b = analyzer.analyze_text(text_b, selected_indices, apply_log, word_type_filter)

                    # Display comparison results
                    display_comparison_results(results_a, results_b)

                except Exception as e:
                    st.error(f"Error during comparison: {e}")

    @staticmethod
    def _decode_text(raw, encodings=('utf-8', 'utf-16')):
        """Decode *raw* bytes by trying each encoding in order.

        Args:
            raw: Bytes to decode. Any non-bytes value is returned unchanged.
            encodings: Encodings to attempt, in order.

        Returns:
            The decoded text, the unchanged input when it is not bytes, or
            ``None`` when no encoding succeeds.
        """
        if not isinstance(raw, bytes):
            return raw
        for encoding in encodings:
            try:
                return raw.decode(encoding)
            except UnicodeDecodeError:
                continue
        return None

    @staticmethod
    def extract_uploaded_files(uploaded_files) -> List[Tuple[str, str]]:
        """Extract uploaded files and return list of (filename, content) tuples.

        ``.zip`` uploads contribute every ``.txt`` member; ``.txt`` uploads
        contribute themselves. Extensions are matched case-insensitively.
        Files that cannot be read or decoded are reported in the UI and
        skipped.

        Args:
            uploaded_files: Iterable of uploaded-file objects exposing a
                ``name`` attribute.

        Returns:
            ``(filename, text)`` pairs in upload order.
        """
        file_contents = []
        temp_paths = []  # Track temp files for cleanup

        try:
            for uploaded_file in uploaded_files:
                # Compare in lower case so "NOTES.TXT" / "corpus.ZIP" are accepted.
                lowered_name = uploaded_file.name.lower()

                if lowered_name.endswith('.zip'):
                    # Handle ZIP files using temp file approach
                    zip_file = FileUploadHandler.handle_zip_file(uploaded_file)
                    if not zip_file:
                        continue

                    with zip_file as zip_ref:
                        for file_info in zip_ref.infolist():
                            if not file_info.filename.lower().endswith('.txt'):
                                continue
                            try:
                                raw = zip_ref.read(file_info.filename)
                            except Exception as e:
                                st.error(f"Cannot read {file_info.filename}: {e}")
                                continue

                            text_content = AnalysisHandlers._decode_text(raw)
                            if text_content is None:
                                st.error(f"Unable to decode file {file_info.filename}. Skipping.")
                                continue
                            file_contents.append((file_info.filename, text_content))

                elif lowered_name.endswith('.txt'):
                    # Handle individual text files using temp file approach
                    try:
                        # Save to temp and read content
                        temp_path = FileUploadHandler.save_to_temp(uploaded_file, prefix="analysis")
                        if not temp_path:
                            st.error(f"Failed to save file {uploaded_file.name}")
                            continue

                        temp_paths.append(temp_path)
                        content = FileUploadHandler.read_from_temp(temp_path)
                    except Exception as e:
                        st.error(f"Cannot read file {uploaded_file.name}: {e}")
                        continue

                    # Bytes are decoded; text passes through unchanged. A None
                    # result (undecodable bytes, or no content returned) means
                    # there is nothing usable to analyse.
                    text_content = AnalysisHandlers._decode_text(content)
                    if text_content is None:
                        st.error(f"Unable to decode file {uploaded_file.name}. Skipping.")
                        continue

                    file_contents.append((uploaded_file.name, text_content))
                else:
                    st.warning(f"Skipping {uploaded_file.name}: Not a .txt or .zip file")

            return file_contents

        finally:
            # Cleanup temp files, also when an exception is propagating
            for temp_path in temp_paths:
                FileUploadHandler.cleanup_temp_file(temp_path)

    @staticmethod
    def display_single_text_results(results: Dict[str, Any]):
        """Display results for single text analysis.

        Args:
            results: Analysis output. The optional keys ``summary``,
                ``token_details``, ``bigram_details``, ``trigram_details``
                and ``raw_scores`` are each rendered when present and
                non-empty.
        """
        st.subheader("Analysis Results")

        # .get() throughout: a missing section is skipped, matching how the
        # n-gram tables below are handled.
        summary = results.get('summary')

        # Summary results
        if summary:
            st.write("**Summary Statistics**")
            summary_data = [
                {
                    'Index': key,
                    'Mean': round(index_stats['mean'], 3),
                    'Std Dev': round(index_stats['std'], 3),
                    'Count': index_stats['count'],
                    'Min': round(index_stats['min'], 3),
                    'Max': round(index_stats['max'], 3)
                }
                for key, index_stats in summary.items()
            ]

            summary_df = pd.DataFrame(summary_data)
            st.dataframe(summary_df, use_container_width=True)

        # Token details
        token_details = results.get('token_details')
        if token_details:
            st.write("**Token Analysis**")
            token_df = pd.DataFrame(token_details)
            st.dataframe(token_df, use_container_width=True)

            # Download token details
            csv_data = token_df.to_csv(index=False)
            st.download_button(
                label="Download Token Details (CSV)",
                data=csv_data,
                file_name="token_analysis.csv",
                mime="text/csv"
            )

        # Bigram and trigram details
        for detail_type in ['bigram_details', 'trigram_details']:
            if results.get(detail_type):
                st.write(f"**{detail_type.replace('_', ' ').title()}**")
                detail_df = pd.DataFrame(results[detail_type])
                st.dataframe(detail_df, use_container_width=True)

        # Density plots
        if summary:
            st.write("**Score Distribution Plots**")
            AnalysisHandlers.create_density_plots(results)

    @staticmethod
    def create_density_plots(results: Dict[str, Any]):
        """Create density plots for score distributions.

        One figure is drawn per entry of ``results['raw_scores']`` that has
        at least two scores: a density-normalised histogram, overlaid with
        a Gaussian KDE curve when the scores are not all identical.

        Args:
            results: Analysis output; nothing is drawn when it has no
                ``raw_scores`` entry.
        """
        if 'raw_scores' not in results:
            return

        for key, scores in results['raw_scores'].items():
            if len(scores) <= 1:  # Need at least 2 points for density
                continue

            # Create histogram with density curve
            fig = go.Figure()

            # Add histogram
            fig.add_trace(go.Histogram(
                x=scores,
                nbinsx=min(30, len(scores)),
                name='Histogram',
                opacity=0.7,
                histnorm='probability density'
            ))

            # gaussian_kde needs non-zero variance; for constant scores it
            # fails on a singular covariance matrix, so only the histogram
            # is shown in that case.
            lowest, highest = min(scores), max(scores)
            if lowest != highest:
                # Calculate and add KDE curve
                kde = stats.gaussian_kde(scores)
                x_range = np.linspace(lowest, highest, 100)
                kde_values = kde(x_range)

                fig.add_trace(go.Scatter(
                    x=x_range,
                    y=kde_values,
                    mode='lines',
                    name='Density',
                    line=dict(color='red', width=2)
                ))

            # Update layout
            fig.update_layout(
                title=f"Distribution of {key}",
                xaxis_title="Score",
                yaxis_title="Density",
                template='plotly_white',
                showlegend=True,
                bargap=0.05
            )

            st.plotly_chart(fig, use_container_width=True)
|
web_app/handlers/analysis_handlers_updated.py
ADDED
|
@@ -0,0 +1,606 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Analysis Handlers for Streamlit Interface - Updated with MemoryFileHandler
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import streamlit as st
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from typing import List, Tuple, Dict, Optional
|
| 8 |
+
import time
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
import zipfile
|
| 11 |
+
from io import BytesIO, StringIO
|
| 12 |
+
import sys
|
| 13 |
+
import os
|
| 14 |
+
|
| 15 |
+
# Add parent directory to path for imports
|
| 16 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
|
| 17 |
+
|
| 18 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 19 |
+
from web_app.session_manager import SessionManager
|
| 20 |
+
from web_app.components.ui_components import UIComponents
|
| 21 |
+
from web_app.utils import MemoryFileHandler
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class AnalysisHandlers:
    """
    Handles analysis-related UI components and workflows.

    Updated to use MemoryFileHandler for better compatibility: uploaded files
    are read through MemoryFileHandler and exports are built in in-memory
    buffers. Every method is a static method that renders Streamlit widgets.
    """

    @staticmethod
    def handle_single_text_analysis(analyzer: LexicalSophisticationAnalyzer):
        """
        Handle single text analysis workflow.

        Renders the text input section and, once text is present and the
        "Analyze Text" button is pressed, runs the analysis, stores the result
        in the session and displays it.

        Args:
            analyzer: Analyzer whose ``analyze_text`` method produces the metrics.
        """
        # Get text input
        text_content = UIComponents.render_text_input_section("Text for Analysis", "single_text")

        if text_content and st.button("Analyze Text", type="primary", key="analyze_single"):
            with st.spinner("Analyzing text..."):
                start_time = time.time()

                # Get analysis parameters
                params = SessionManager.get_analysis_params()

                # Run analysis
                metrics = analyzer.analyze_text(
                    text_content,
                    include_pos_info=params['include_pos'],
                    min_word_length=params['min_word_length']
                )

                analysis_time = time.time() - start_time

            # Store results
            SessionManager.store_analysis_results(
                'single_text',
                metrics,
                {'analysis_time': analysis_time}
            )

            # Display results
            AnalysisHandlers._display_analysis_results(metrics, analysis_time)

    @staticmethod
    def handle_comparison_analysis(analyzer: LexicalSophisticationAnalyzer):
        """
        Handle text comparison workflow.

        Delegates to ``render_comparison_interface``; the import is done inside
        the function rather than at module level.

        Args:
            analyzer: Analyzer passed through to the comparison interface.
        """
        from web_app.components.comparison_functions import render_comparison_interface
        render_comparison_interface(analyzer)

    @staticmethod
    def handle_batch_analysis(analyzer: LexicalSophisticationAnalyzer):
        """
        Handle batch analysis workflow with memory-based file handling.

        Shows the uploader and the parameter widgets; when the start button is
        pressed the uploads are read into memory and analysed one by one.

        Args:
            analyzer: Analyzer used for every uploaded text.
        """
        st.subheader("Batch Analysis")

        # File upload
        uploaded_files = st.file_uploader(
            "Upload Text Files",
            type=['txt', 'zip'],
            accept_multiple_files=True,
            help="Upload individual .txt files or a .zip archive containing .txt files"
        )

        if not uploaded_files:
            st.info("Please upload text files or a ZIP archive to begin batch analysis.")
            return

        # Analysis parameters
        with st.expander("Analysis Parameters", expanded=True):
            params = AnalysisHandlers._render_batch_analysis_params()

        if st.button("Start Batch Analysis", type="primary"):
            # Process files
            file_contents = AnalysisHandlers._process_batch_files_memory(uploaded_files)

            if not file_contents:
                st.error("No valid text files found.")
                return

            # Run batch analysis
            AnalysisHandlers._run_batch_analysis(analyzer, file_contents, params)

    @staticmethod
    def _process_batch_files_memory(uploaded_files) -> List[Tuple[str, str]]:
        """
        Process uploaded files for batch analysis using memory-based approach.

        ``.zip`` uploads are expanded through ``MemoryFileHandler.handle_zip_file``
        (its result is iterated with ``.items()`` as name -> content) and only
        ``.txt`` members are kept; ``.txt`` uploads are read through
        ``MemoryFileHandler.process_uploaded_file``. Extension checks are
        case-insensitive so that ``NOTES.TXT`` or ``corpus.ZIP`` are not
        silently dropped.

        Args:
            uploaded_files: Non-empty sequence of uploaded file objects exposing
                a ``name`` attribute.

        Returns:
            List of tuples (filename, content)
        """
        file_contents = []

        with st.spinner("Processing uploaded files..."):
            progress_bar = st.progress(0)
            total_files = len(uploaded_files)

            for idx, uploaded_file in enumerate(uploaded_files):
                try:
                    upload_name = uploaded_file.name.lower()

                    if upload_name.endswith('.zip'):
                        # Handle ZIP files
                        zip_contents = MemoryFileHandler.handle_zip_file(uploaded_file)
                        if zip_contents:
                            for filename, content in zip_contents.items():
                                if not filename.lower().endswith('.txt'):
                                    continue
                                try:
                                    # Decode bytes to text
                                    if isinstance(content, bytes):
                                        text_content = content.decode('utf-8')
                                    else:
                                        text_content = content
                                    file_contents.append((filename, text_content))
                                except UnicodeDecodeError:
                                    st.warning(f"Skipping {filename}: Unable to decode as UTF-8")

                    elif upload_name.endswith('.txt'):
                        # Handle individual text files
                        text_content = MemoryFileHandler.process_uploaded_file(uploaded_file, as_text=True)
                        if text_content:
                            file_contents.append((uploaded_file.name, text_content))
                        else:
                            st.warning(f"Could not read {uploaded_file.name}")

                except Exception as e:
                    # Best effort: one unreadable upload must not abort the batch.
                    st.error(f"Error processing {uploaded_file.name}: {str(e)}")

                # Update progress
                progress_bar.progress((idx + 1) / total_files)

            progress_bar.empty()

        st.success(f"Processed {len(file_contents)} text files")
        return file_contents

    @staticmethod
    def _render_batch_analysis_params() -> dict:
        """
        Render batch analysis parameters.

        Returns:
            Dict with the keys ``include_pos``, ``min_word_length``,
            ``analyze_readability``, ``analyze_diversity``, ``export_format``
            and ``include_raw_data`` holding the current widget values.
        """
        col1, col2, col3 = st.columns(3)

        with col1:
            include_pos = st.checkbox(
                "Include POS Analysis",
                value=True,
                help="Include part-of-speech tagging (slower but more detailed)"
            )

            min_word_length = st.number_input(
                "Minimum Word Length",
                min_value=1,
                max_value=10,
                value=1,
                help="Exclude words shorter than this length"
            )

        with col2:
            analyze_readability = st.checkbox(
                "Analyze Readability",
                value=True,
                help="Include readability metrics"
            )

            analyze_diversity = st.checkbox(
                "Analyze Diversity",
                value=True,
                help="Include lexical diversity metrics"
            )

        with col3:
            export_format = st.selectbox(
                "Export Format",
                ["CSV", "Excel", "JSON"],
                help="Format for results export"
            )

            include_raw_data = st.checkbox(
                "Include Raw Data",
                value=False,
                help="Include word lists in export"
            )

        return {
            'include_pos': include_pos,
            'min_word_length': min_word_length,
            'analyze_readability': analyze_readability,
            'analyze_diversity': analyze_diversity,
            'export_format': export_format,
            'include_raw_data': include_raw_data
        }

    @staticmethod
    def _run_batch_analysis(analyzer: LexicalSophisticationAnalyzer,
                            file_contents: List[Tuple[str, str]],
                            params: dict):
        """
        Run batch analysis on multiple files.

        Files that fail are reported with ``st.error`` and left out of the
        results; the progress bar advances for them as well.

        Args:
            analyzer: Analyzer whose ``analyze_text`` is called per file.
            file_contents: (filename, text) pairs to analyse.
            params: Parameter dict; ``include_pos`` and ``min_word_length`` are
                read here, the whole dict is forwarded to the results display.
        """
        results = []

        # Progress tracking
        progress_bar = st.progress(0)
        status_text = st.empty()

        start_time = time.time()
        total_files = len(file_contents)

        for idx, (filename, content) in enumerate(file_contents):
            status_text.text(f"Analyzing {filename}...")

            try:
                # Analyze text
                metrics = analyzer.analyze_text(
                    content,
                    include_pos_info=params['include_pos'],
                    min_word_length=params['min_word_length']
                )

                # Add filename to results
                metrics['filename'] = filename
                results.append(metrics)

            except Exception as e:
                st.error(f"Error analyzing {filename}: {str(e)}")

            # Update progress (also after a failure, so the bar never stalls)
            progress_bar.progress((idx + 1) / total_files)

        # Clear progress indicators
        progress_bar.empty()
        status_text.empty()

        total_time = time.time() - start_time

        # Display results
        AnalysisHandlers._display_batch_results(results, params, total_time)

    @staticmethod
    def _display_batch_results(results: List[dict], params: dict, total_time: float):
        """
        Display batch analysis results.

        Shows the results as a table or as charts and offers downloads: CSV
        always, Excel or JSON when selected as ``params['export_format']``.

        Args:
            results: One metrics dict per successfully analysed file.
            params: Parameter dict; only ``export_format`` is read here.
            total_time: Elapsed analysis time in seconds.
        """
        if not results:
            # Do not report success when every file failed.
            st.warning("No files could be analyzed.")
            return

        st.success(f"✅ Analyzed {len(results)} files in {total_time:.1f} seconds")

        # Create results DataFrame
        df_results = pd.DataFrame(results)

        # Reorder columns for better display
        priority_cols = ['filename', 'total_words', 'unique_words', 'avg_word_length',
                         'lexical_diversity', 'avg_word_frequency']
        other_cols = [col for col in df_results.columns if col not in priority_cols]
        ordered_cols = [col for col in priority_cols if col in df_results.columns] + other_cols
        df_results = df_results[ordered_cols]

        # Display options
        col1, col2 = st.columns([3, 1])
        with col1:
            st.subheader("Analysis Results")
        with col2:
            display_mode = st.radio("Display", ["Table", "Charts"], horizontal=True)

        if display_mode == "Table":
            # Display as table
            st.dataframe(
                df_results,
                use_container_width=True,
                hide_index=True,
                height=400
            )

            # Summary statistics
            with st.expander("Summary Statistics"):
                st.write(df_results.describe())

        else:
            # Display as charts
            AnalysisHandlers._render_batch_charts(df_results)

        # Export results
        st.subheader("Export Results")
        col1, col2, col3 = st.columns(3)

        # One timestamp so all offered downloads share the same file stem.
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        with col1:
            # CSV export
            csv = df_results.to_csv(index=False)
            st.download_button(
                label="📥 Download as CSV",
                data=csv,
                file_name=f"batch_analysis_results_{timestamp}.csv",
                mime="text/csv"
            )

        with col2:
            # Excel export (using in-memory buffer)
            if params['export_format'] == "Excel":
                excel_buffer = BytesIO()
                with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
                    df_results.to_excel(writer, sheet_name='Results', index=False)

                    # Add summary sheet
                    df_summary = df_results.describe()
                    df_summary.to_excel(writer, sheet_name='Summary')

                excel_data = excel_buffer.getvalue()

                st.download_button(
                    label="📥 Download as Excel",
                    data=excel_data,
                    file_name=f"batch_analysis_results_{timestamp}.xlsx",
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                )

        with col3:
            # JSON export
            if params['export_format'] == "JSON":
                json_str = df_results.to_json(orient='records', indent=2)
                st.download_button(
                    label="📥 Download as JSON",
                    data=json_str,
                    file_name=f"batch_analysis_results_{timestamp}.json",
                    mime="application/json"
                )

    @staticmethod
    def _render_batch_charts(df_results: pd.DataFrame):
        """
        Render charts for batch analysis results.

        Draws a scatter plot of two selectable numeric metrics, then a
        histogram and a box plot of a third selectable metric.

        Args:
            df_results: One row per analysed file.
        """
        import plotly.express as px

        # Select metrics for visualization; 'number' covers every numeric
        # width, not only float64/int64.
        numeric_cols = [
            col for col in df_results.select_dtypes(include='number').columns
            if col not in ['index']
        ]

        if not numeric_cols:
            st.warning("No numeric data available for visualization")
            return

        # Metric selection
        col1, col2 = st.columns(2)
        with col1:
            x_metric = st.selectbox("X-axis metric", numeric_cols, index=0)
        with col2:
            y_metric = st.selectbox("Y-axis metric", numeric_cols,
                                    index=min(1, len(numeric_cols) - 1))

        # Only ask plotly for the filename hover when that column exists.
        hover_cols = ['filename'] if 'filename' in df_results.columns else None

        # Create scatter plot
        fig = px.scatter(
            df_results,
            x=x_metric,
            y=y_metric,
            hover_data=hover_cols,
            title=f"{y_metric} vs {x_metric}",
            labels={x_metric: x_metric.replace('_', ' ').title(),
                    y_metric: y_metric.replace('_', ' ').title()}
        )

        fig.update_traces(marker=dict(size=10))
        fig.update_layout(height=500)

        st.plotly_chart(fig, use_container_width=True)

        # Distribution plots
        st.subheader("Metric Distributions")

        selected_metric = st.selectbox(
            "Select metric for distribution",
            numeric_cols,
            key="dist_metric"
        )

        col1, col2 = st.columns(2)

        with col1:
            # Histogram
            fig_hist = px.histogram(
                df_results,
                x=selected_metric,
                nbins=20,
                title=f"Distribution of {selected_metric.replace('_', ' ').title()}"
            )
            fig_hist.update_layout(height=400)
            st.plotly_chart(fig_hist, use_container_width=True)

        with col2:
            # Box plot
            fig_box = px.box(
                df_results,
                y=selected_metric,
                title=f"Box Plot of {selected_metric.replace('_', ' ').title()}",
                points="all"
            )
            fig_box.update_layout(height=400)
            st.plotly_chart(fig_box, use_container_width=True)

    @staticmethod
    def _display_analysis_results(metrics: dict, analysis_time: float):
        """
        Display single text analysis results.

        Args:
            metrics: Metrics dict returned by the analyzer.
            analysis_time: Elapsed analysis time in seconds.
        """
        st.success(f"✅ Analysis completed in {analysis_time:.2f} seconds")

        # Render results in tabs
        tab1, tab2, tab3, tab4 = st.tabs([
            "📊 Overview",
            "📈 Frequency Analysis",
            "🎯 Advanced Metrics",
            "📋 Raw Data"
        ])

        with tab1:
            AnalysisHandlers._render_overview_metrics(metrics)

        with tab2:
            AnalysisHandlers._render_frequency_analysis(metrics)

        with tab3:
            AnalysisHandlers._render_advanced_metrics(metrics)

        with tab4:
            AnalysisHandlers._render_raw_data(metrics)

    @staticmethod
    def _render_overview_metrics(metrics: dict):
        """
        Render overview metrics.

        Missing keys are displayed as 0.

        Args:
            metrics: Metrics dict returned by the analyzer.
        """
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.metric("Total Words", f"{metrics.get('total_words', 0):,}")
            st.metric("Sentences", f"{metrics.get('sentence_count', 0):,}")

        with col2:
            st.metric("Unique Words", f"{metrics.get('unique_words', 0):,}")
            st.metric("Avg Sentence Length", f"{metrics.get('avg_sentence_length', 0):.1f}")

        with col3:
            st.metric("Lexical Diversity", f"{metrics.get('lexical_diversity', 0):.3f}")
            st.metric("Avg Word Length", f"{metrics.get('avg_word_length', 0):.2f}")

        with col4:
            st.metric("Readability (Flesch)", f"{metrics.get('flesch_reading_ease', 0):.1f}")
            st.metric("Grade Level", f"{metrics.get('flesch_kincaid_grade', 0):.1f}")

    @staticmethod
    def _render_frequency_analysis(metrics: dict):
        """
        Render frequency analysis section.

        Charts the 30 most frequent words and prints frequency and coverage
        statistics. A missing or empty distribution yields an info message
        instead of an IndexError / ZeroDivisionError.

        Args:
            metrics: Metrics dict; ``frequency_distribution`` is read as a
                mapping of word -> count.
        """
        freq_dist = metrics.get('frequency_distribution')
        if not freq_dist:
            st.info("Frequency distribution data not available")
            return

        import plotly.graph_objects as go

        # Prepare data for visualization. Rank by count explicitly so the
        # "top 30" does not depend on the mapping's insertion order; the sort
        # is stable, so an already-ranked mapping keeps its order.
        ranked = sorted(freq_dist.items(), key=lambda item: item[1], reverse=True)[:30]
        words = [word for word, _ in ranked]
        frequencies = [count for _, count in ranked]

        # Create bar chart
        fig = go.Figure(data=[
            go.Bar(x=words, y=frequencies)
        ])

        fig.update_layout(
            title="Top 30 Most Frequent Words",
            xaxis_title="Words",
            yaxis_title="Frequency",
            height=500
        )

        st.plotly_chart(fig, use_container_width=True)

        # Word frequency statistics
        col1, col2 = st.columns(2)

        with col1:
            st.write("**Frequency Statistics:**")
            st.write(f"• Most common word: {words[0]} ({frequencies[0]} times)")
            st.write(f"• Hapax legomena: {sum(1 for f in freq_dist.values() if f == 1)} words")
            st.write(f"• Words appearing 2+ times: {sum(1 for f in freq_dist.values() if f >= 2)}")

        with col2:
            st.write("**Coverage Analysis:**")
            total_words = sum(freq_dist.values())
            # Guard against a distribution whose counts sum to zero.
            top10_coverage = sum(frequencies[:10]) / total_words * 100 if total_words else 0.0
            top30_coverage = sum(frequencies[:30]) / total_words * 100 if total_words else 0.0
            st.write(f"• Top 10 words: {top10_coverage:.1f}% of text")
            st.write(f"• Top 30 words: {top30_coverage:.1f}% of text")

    @staticmethod
    def _render_advanced_metrics(metrics: dict):
        """
        Render advanced metrics section.

        Shows a part-of-speech pie chart when ``pos_distribution`` is present
        and non-empty, followed by whichever sophistication metrics exist.

        Args:
            metrics: Metrics dict returned by the analyzer.
        """
        # POS distribution if available
        if 'pos_distribution' in metrics:
            st.subheader("Part-of-Speech Distribution")

            pos_dist = metrics['pos_distribution']
            if pos_dist:
                import plotly.express as px

                # Prepare data
                pos_df = pd.DataFrame([
                    {'POS': pos, 'Count': count}
                    for pos, count in pos_dist.items()
                ])
                pos_df = pos_df.sort_values('Count', ascending=False)

                # Create pie chart
                fig = px.pie(
                    pos_df,
                    values='Count',
                    names='POS',
                    title="Part-of-Speech Distribution"
                )

                st.plotly_chart(fig, use_container_width=True)

        # Sophistication metrics
        st.subheader("Sophistication Metrics")

        col1, col2 = st.columns(2)

        with col1:
            if 'avg_word_frequency' in metrics:
                st.metric(
                    "Average Word Frequency",
                    f"{metrics['avg_word_frequency']:.2f}",
                    help="Average frequency of words in reference corpus"
                )

            if 'academic_words_ratio' in metrics:
                st.metric(
                    "Academic Words Ratio",
                    f"{metrics['academic_words_ratio']:.2%}",
                    help="Percentage of academic vocabulary"
                )

        with col2:
            if 'rare_words_ratio' in metrics:
                st.metric(
                    "Rare Words Ratio",
                    f"{metrics['rare_words_ratio']:.2%}",
                    help="Percentage of infrequent words"
                )

            if 'lexical_sophistication_score' in metrics:
                st.metric(
                    "Sophistication Score",
                    f"{metrics['lexical_sophistication_score']:.3f}",
                    help="Overall lexical sophistication"
                )

    @staticmethod
    def _render_raw_data(metrics: dict):
        """
        Render raw data section.

        Lists every metric (large dicts/lists go into expanders) and offers
        JSON and CSV downloads of the scalar metrics plus ``pos_distribution``.

        Args:
            metrics: Metrics dict returned by the analyzer.
        """
        st.write("**Available Metrics:**")

        # Display all metrics in an expandable format
        for key, value in metrics.items():
            if isinstance(value, (dict, list)) and len(str(value)) > 100:
                with st.expander(f"{key} (complex data)"):
                    if isinstance(value, dict):
                        st.json(value)
                    else:
                        st.write(value)
            else:
                st.write(f"• **{key}:** {value}")

        # Export options
        st.subheader("Export Data")

        # Prepare export data: scalars only, with pos_distribution as the one
        # nested value that is kept.
        export_data = {k: v for k, v in metrics.items()
                       if not isinstance(v, (dict, list)) or k in ['pos_distribution']}

        col1, col2 = st.columns(2)

        with col1:
            # JSON export
            json_str = pd.Series(export_data).to_json(indent=2)
            st.download_button(
                label="📥 Download as JSON",
                data=json_str,
                file_name=f"analysis_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                mime="application/json"
            )

        with col2:
            # CSV export
            df_export = pd.DataFrame([export_data])
            csv = df_export.to_csv(index=False)
            st.download_button(
                label="📥 Download as CSV",
                data=csv,
                file_name=f"analysis_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
                mime="text/csv"
            )
|
web_app/handlers/frequency_handlers.py
CHANGED
|
@@ -4,6 +4,8 @@ Frequency Analysis Handlers for Streamlit Interface
|
|
| 4 |
This module provides Streamlit interface handlers for word frequency visualization,
|
| 5 |
including file upload, visualization controls, and results display.
|
| 6 |
Supports flexible column mapping for diverse frequency data formats.
|
|
|
|
|
|
|
| 7 |
"""
|
| 8 |
|
| 9 |
import streamlit as st
|
|
@@ -15,13 +17,13 @@ from typing import Dict, List, Optional
|
|
| 15 |
import sys
|
| 16 |
import os
|
| 17 |
from pathlib import Path
|
| 18 |
-
from io import StringIO
|
| 19 |
|
| 20 |
# Add parent directory to path for imports
|
| 21 |
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
|
| 22 |
|
| 23 |
from text_analyzer.frequency_analyzer import FrequencyAnalyzer
|
| 24 |
-
from web_app.utils import
|
| 25 |
|
| 26 |
|
| 27 |
class FrequencyHandlers:
|
|
@@ -30,56 +32,48 @@ class FrequencyHandlers:
|
|
| 30 |
"""
|
| 31 |
|
| 32 |
@staticmethod
|
| 33 |
-
def
|
| 34 |
"""
|
| 35 |
-
|
|
|
|
| 36 |
"""
|
| 37 |
-
st.
|
| 38 |
-
"The system will automatically detect columns and let you choose which ones to use for analysis.")
|
| 39 |
|
| 40 |
-
# Initialize session state
|
| 41 |
-
if 'uploaded_file_name' not in st.session_state:
|
| 42 |
-
st.session_state.uploaded_file_name = None
|
| 43 |
-
if 'column_config' not in st.session_state:
|
| 44 |
-
st.session_state.column_config = None
|
| 45 |
if 'analyzer' not in st.session_state:
|
| 46 |
st.session_state.analyzer = None
|
| 47 |
if 'format_info' not in st.session_state:
|
| 48 |
st.session_state.format_info = None
|
| 49 |
-
if 'detected_cols' not in st.session_state:
|
| 50 |
-
st.session_state.detected_cols = None
|
| 51 |
if 'uploaded_file_content' not in st.session_state:
|
| 52 |
st.session_state.uploaded_file_content = None
|
|
|
|
|
|
|
| 53 |
|
| 54 |
-
# File
|
| 55 |
-
uploaded_file = FrequencyHandlers.
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
current_file_name = uploaded_file.name
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
st.session_state.uploaded_file_name = current_file_name
|
| 64 |
-
st.session_state.column_config = None
|
| 65 |
st.session_state.analyzer = None
|
| 66 |
st.session_state.format_info = None
|
| 67 |
-
st.session_state.detected_cols = None
|
| 68 |
|
| 69 |
-
# Handle file content loading with /tmp approach for HF Spaces compatibility
|
| 70 |
try:
|
| 71 |
-
#
|
| 72 |
-
if
|
|
|
|
| 73 |
return
|
| 74 |
|
| 75 |
-
#
|
| 76 |
-
|
| 77 |
-
if
|
| 78 |
-
st.session_state.uploaded_file_content =
|
| 79 |
-
st.
|
| 80 |
-
st.success(f"✅ File '{current_file_name}' ({len(st.session_state.uploaded_file_content):,} bytes) uploaded successfully")
|
| 81 |
else:
|
| 82 |
-
st.error("Failed to
|
| 83 |
return
|
| 84 |
except Exception as e:
|
| 85 |
st.error(f"❌ Failed to read uploaded file: {str(e)}")
|
|
@@ -92,50 +86,48 @@ class FrequencyHandlers:
|
|
| 92 |
# Initialize analyzer and process file (only if needed)
|
| 93 |
if st.session_state.analyzer is None or st.session_state.format_info is None:
|
| 94 |
st.session_state.analyzer = FrequencyAnalyzer(file_size_limit_mb=300)
|
| 95 |
-
# Use the content we already read
|
| 96 |
st.session_state.format_info = st.session_state.analyzer.detect_file_format(st.session_state.uploaded_file_content)
|
| 97 |
|
| 98 |
# Show format detection results
|
| 99 |
st.success(f"✅ File format detected: {st.session_state.format_info['separator']}-separated, "
|
| 100 |
-
f"{
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
-
#
|
| 118 |
-
FrequencyHandlers.render_data_preview(df_preview, st.session_state.detected_cols)
|
| 119 |
-
|
| 120 |
-
# ALWAYS show column selection if we have detected columns (persistent interface)
|
| 121 |
-
if st.session_state.detected_cols is not None:
|
| 122 |
-
with st.expander("🎯 Column Selection", expanded=True):
|
| 123 |
-
column_config = FrequencyHandlers.render_persistent_column_selection(
|
| 124 |
-
st.session_state.detected_cols,
|
| 125 |
-
st.session_state.format_info,
|
| 126 |
-
st.session_state.column_config
|
| 127 |
-
)
|
| 128 |
-
|
| 129 |
-
# Check if column configuration changed
|
| 130 |
-
if column_config != st.session_state.column_config:
|
| 131 |
-
st.session_state.column_config = column_config
|
| 132 |
-
# Reload data with new configuration
|
| 133 |
-
df = st.session_state.analyzer.load_frequency_data(st.session_state.uploaded_file_content, column_config)
|
| 134 |
-
st.session_state.loaded_data = df
|
| 135 |
-
st.rerun()
|
| 136 |
-
|
| 137 |
-
# ALWAYS show visualization controls if we have a column config
|
| 138 |
-
if st.session_state.column_config is not None:
|
| 139 |
viz_config = FrequencyHandlers.render_enhanced_visualization_controls(st.session_state.analyzer, st.session_state.column_config)
|
| 140 |
|
| 141 |
if viz_config:
|
|
@@ -158,45 +150,18 @@ class FrequencyHandlers:
|
|
| 158 |
else:
|
| 159 |
with st.expander("Error Details"):
|
| 160 |
st.code(str(e))
|
| 161 |
-
st.write("**Debug Information:**")
|
| 162 |
-
st.write(f"- File size: {len(st.session_state.uploaded_file_content) if st.session_state.uploaded_file_content else 'Unknown'} bytes")
|
| 163 |
-
st.write(f"- Session state keys: {list(st.session_state.keys())}")
|
| 164 |
-
|
| 165 |
-
st.info("Please ensure your file is a valid TSV/CSV with appropriate columns.")
|
| 166 |
-
|
| 167 |
-
elif st.session_state.column_config is not None and st.session_state.uploaded_file_content is not None:
|
| 168 |
-
# Show persistent interface even when no file is currently selected (using cached data)
|
| 169 |
-
with st.expander("🎯 Column Selection", expanded=False):
|
| 170 |
-
column_config = FrequencyHandlers.render_persistent_column_selection(
|
| 171 |
-
st.session_state.detected_cols,
|
| 172 |
-
st.session_state.format_info,
|
| 173 |
-
st.session_state.column_config
|
| 174 |
-
)
|
| 175 |
-
|
| 176 |
-
# Check if column configuration changed
|
| 177 |
-
if column_config != st.session_state.column_config:
|
| 178 |
-
st.session_state.column_config = column_config
|
| 179 |
-
# Reload data with new configuration
|
| 180 |
-
df = st.session_state.analyzer.load_frequency_data(st.session_state.uploaded_file_content, column_config)
|
| 181 |
-
st.session_state.loaded_data = df
|
| 182 |
-
st.rerun()
|
| 183 |
-
|
| 184 |
-
viz_config = FrequencyHandlers.render_enhanced_visualization_controls(st.session_state.analyzer, st.session_state.column_config)
|
| 185 |
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
FileUploadHandler.cleanup_old_temp_files(max_age_hours=1)
|
| 193 |
-
except:
|
| 194 |
-
pass
|
| 195 |
|
| 196 |
@staticmethod
|
| 197 |
-
def
|
| 198 |
"""
|
| 199 |
-
Render
|
| 200 |
|
| 201 |
Returns:
|
| 202 |
File-like object or None
|
|
@@ -297,437 +262,433 @@ and\t28891\t3"""
|
|
| 297 |
df: Preview DataFrame
|
| 298 |
detected_cols: Detected column categorization
|
| 299 |
"""
|
| 300 |
-
st.
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
with col3:
|
| 309 |
-
word_cols = len(detected_cols.get('word_columns', []))
|
| 310 |
-
freq_cols = len(detected_cols.get('frequency_columns', []))
|
| 311 |
-
st.metric("Detected", f"{word_cols} word, {freq_cols} freq")
|
| 312 |
-
|
| 313 |
-
# Show sample data
|
| 314 |
-
st.write("**First 5 rows:**")
|
| 315 |
-
st.dataframe(df.head(), use_container_width=True)
|
| 316 |
|
| 317 |
-
# Show detected
|
| 318 |
-
with st.expander("🔍
|
| 319 |
-
col1, col2 = st.columns(
|
| 320 |
|
| 321 |
with col1:
|
| 322 |
-
st.write("**Word Columns
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
for col in word_cols:
|
| 326 |
-
st.write(f"- `{col}` ({df[col].dtype})")
|
| 327 |
-
else:
|
| 328 |
-
st.write("None detected")
|
| 329 |
-
|
| 330 |
-
st.write("**POS Columns:**")
|
| 331 |
-
pos_cols = detected_cols.get('pos_columns', [])
|
| 332 |
-
if pos_cols:
|
| 333 |
-
for col in pos_cols:
|
| 334 |
-
st.write(f"- `{col}` ({df[col].dtype})")
|
| 335 |
-
else:
|
| 336 |
-
st.write("None detected")
|
| 337 |
|
| 338 |
with col2:
|
| 339 |
-
st.write("**Frequency Columns
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
st.write(f"- `{col}` ({df[col].dtype}) - e.g., {sample_vals}")
|
| 345 |
-
else:
|
| 346 |
-
st.write("None detected")
|
| 347 |
-
|
| 348 |
st.write("**Other Columns:**")
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
for col in other_cols[:5]: # Show max 5
|
| 352 |
-
st.write(f"- `{col}` ({df[col].dtype})")
|
| 353 |
-
if len(other_cols) > 5:
|
| 354 |
-
st.write(f"... and {len(other_cols) - 5} more")
|
| 355 |
-
else:
|
| 356 |
-
st.write("None")
|
| 357 |
|
| 358 |
@staticmethod
|
| 359 |
-
def
|
| 360 |
"""
|
| 361 |
-
Render
|
| 362 |
|
| 363 |
Args:
|
| 364 |
detected_cols: Detected column categorization
|
| 365 |
-
|
| 366 |
|
| 367 |
Returns:
|
| 368 |
-
|
| 369 |
"""
|
| 370 |
-
st.subheader("
|
| 371 |
-
st.write("Select which columns to use for your frequency analysis:")
|
| 372 |
-
|
| 373 |
-
word_cols = detected_cols.get('word_columns', [])
|
| 374 |
-
freq_cols = detected_cols.get('frequency_columns', [])
|
| 375 |
-
pos_cols = detected_cols.get('pos_columns', [])
|
| 376 |
-
|
| 377 |
-
if not word_cols or not freq_cols:
|
| 378 |
-
st.error("❌ Required columns not detected. Please ensure your file has:")
|
| 379 |
-
st.write("- At least one text column (for words)")
|
| 380 |
-
st.write("- At least one numeric column (for frequencies)")
|
| 381 |
-
return None
|
| 382 |
|
| 383 |
col1, col2 = st.columns(2)
|
| 384 |
|
| 385 |
with col1:
|
| 386 |
-
# Word column selection
|
| 387 |
-
|
| 388 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 389 |
options=word_cols,
|
| 390 |
-
index=
|
| 391 |
-
help="
|
| 392 |
)
|
| 393 |
-
|
| 394 |
-
# POS column selection (optional)
|
| 395 |
-
pos_column = None
|
| 396 |
-
if pos_cols:
|
| 397 |
-
use_pos = st.checkbox("Include POS column", value=False)
|
| 398 |
-
if use_pos:
|
| 399 |
-
pos_column = st.selectbox(
|
| 400 |
-
"POS Column",
|
| 401 |
-
options=pos_cols,
|
| 402 |
-
index=0,
|
| 403 |
-
help="Column containing part-of-speech tags (optional)"
|
| 404 |
-
)
|
| 405 |
|
| 406 |
with col2:
|
| 407 |
-
# Frequency column selection
|
| 408 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 409 |
"Frequency Column",
|
| 410 |
options=freq_cols,
|
| 411 |
-
index=
|
| 412 |
-
help="
|
| 413 |
)
|
| 414 |
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
'separator': format_info['separator'],
|
| 421 |
-
'has_header': format_info['has_header']
|
| 422 |
-
}
|
| 423 |
|
| 424 |
-
|
| 425 |
-
|
|
|
|
|
|
|
| 426 |
|
| 427 |
-
return
|
|
|
|
|
|
|
|
|
|
| 428 |
|
| 429 |
return None
|
| 430 |
|
| 431 |
@staticmethod
|
| 432 |
-
def
|
| 433 |
"""
|
| 434 |
-
|
| 435 |
-
"""
|
| 436 |
-
return FrequencyHandlers.render_enhanced_visualization_controls(analyzer, column_config)
|
| 437 |
-
|
| 438 |
-
@staticmethod
|
| 439 |
-
def render_rank_based_analysis_simplified(analyzer: FrequencyAnalyzer, viz_config: Dict):
|
| 440 |
-
"""
|
| 441 |
-
Legacy method - redirects to enhanced analysis for backward compatibility.
|
| 442 |
-
"""
|
| 443 |
-
return FrequencyHandlers.render_enhanced_rank_based_analysis(analyzer, viz_config)
|
| 444 |
-
|
| 445 |
-
@staticmethod
|
| 446 |
-
def render_persistent_column_selection(detected_cols: Dict[str, List[str]],
|
| 447 |
-
format_info: Dict,
|
| 448 |
-
current_config: Optional[Dict] = None) -> Dict[str, str]:
|
| 449 |
-
"""
|
| 450 |
-
Render persistent column selection interface that doesn't disappear.
|
| 451 |
|
| 452 |
Args:
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
current_config: Current column configuration (for preserving selections)
|
| 456 |
|
| 457 |
Returns:
|
| 458 |
-
|
| 459 |
"""
|
| 460 |
-
st.
|
| 461 |
-
|
| 462 |
-
word_cols = detected_cols.get('word_columns', [])
|
| 463 |
-
freq_cols = detected_cols.get('frequency_columns', [])
|
| 464 |
-
pos_cols = detected_cols.get('pos_columns', [])
|
| 465 |
-
|
| 466 |
-
# Determine default selections
|
| 467 |
-
default_word_idx = 0
|
| 468 |
-
default_freq_idx = 0
|
| 469 |
-
default_use_pos = False
|
| 470 |
-
default_pos_idx = 0
|
| 471 |
-
|
| 472 |
-
if current_config:
|
| 473 |
-
# Preserve current selections
|
| 474 |
-
if current_config['word_column'] in word_cols:
|
| 475 |
-
default_word_idx = word_cols.index(current_config['word_column'])
|
| 476 |
-
if current_config['frequency_column'] in freq_cols:
|
| 477 |
-
default_freq_idx = freq_cols.index(current_config['frequency_column'])
|
| 478 |
-
if 'pos_column' in current_config and current_config['pos_column'] in pos_cols:
|
| 479 |
-
default_use_pos = True
|
| 480 |
-
default_pos_idx = pos_cols.index(current_config['pos_column'])
|
| 481 |
|
| 482 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 483 |
|
| 484 |
with col1:
|
| 485 |
-
|
| 486 |
-
"
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
help="Column containing word forms or lemmas",
|
| 490 |
-
key="persistent_word_col"
|
| 491 |
)
|
| 492 |
-
|
| 493 |
-
# POS column selection (optional)
|
| 494 |
-
pos_column = None
|
| 495 |
-
if pos_cols:
|
| 496 |
-
use_pos = st.checkbox("Include POS column", value=default_use_pos, key="persistent_use_pos")
|
| 497 |
-
if use_pos:
|
| 498 |
-
pos_column = st.selectbox(
|
| 499 |
-
"POS Column",
|
| 500 |
-
options=pos_cols,
|
| 501 |
-
index=default_pos_idx,
|
| 502 |
-
help="Column containing part-of-speech tags (optional)",
|
| 503 |
-
key="persistent_pos_col"
|
| 504 |
-
)
|
| 505 |
|
| 506 |
with col2:
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
index=default_freq_idx,
|
| 511 |
-
help="Column containing frequency values for analysis",
|
| 512 |
-
key="persistent_freq_col"
|
| 513 |
-
)
|
| 514 |
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
config = {
|
| 524 |
-
'word_column': word_column,
|
| 525 |
-
'frequency_column': frequency_column,
|
| 526 |
-
'separator': format_info['separator'],
|
| 527 |
-
'has_header': format_info['has_header']
|
| 528 |
-
}
|
| 529 |
|
| 530 |
-
|
| 531 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 532 |
|
| 533 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 534 |
|
| 535 |
@staticmethod
|
| 536 |
-
def
|
| 537 |
"""
|
| 538 |
-
Render enhanced
|
| 539 |
|
| 540 |
Args:
|
| 541 |
analyzer: FrequencyAnalyzer instance with loaded data
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
Returns:
|
| 545 |
-
Dict with visualization configuration or None
|
| 546 |
"""
|
| 547 |
-
st.subheader("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 548 |
|
| 549 |
-
#
|
| 550 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 551 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 552 |
col1, col2, col3 = st.columns(3)
|
| 553 |
|
| 554 |
with col1:
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
max_value=2000,
|
| 560 |
-
value=500,
|
| 561 |
-
step=100,
|
| 562 |
-
help="Number of words to group together for rank-based analysis"
|
| 563 |
-
)
|
| 564 |
|
| 565 |
with col2:
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
)
|
|
|
|
|
|
|
| 572 |
|
| 573 |
with col3:
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 583 |
)
|
| 584 |
-
|
| 585 |
-
# Quick preset buttons
|
| 586 |
-
st.write("**Quick Presets:**")
|
| 587 |
-
preset_cols = st.columns(4)
|
| 588 |
-
if preset_cols[0].button("10K", key="preset_10k"):
|
| 589 |
-
st.session_state.max_words_preset = 10000
|
| 590 |
-
if preset_cols[1].button("25K", key="preset_25k"):
|
| 591 |
-
st.session_state.max_words_preset = 25000
|
| 592 |
-
if preset_cols[2].button("50K", key="preset_50k"):
|
| 593 |
-
st.session_state.max_words_preset = 50000
|
| 594 |
-
if preset_cols[3].button("All", key="preset_all"):
|
| 595 |
-
st.session_state.max_words_preset = None
|
| 596 |
-
|
| 597 |
-
# Use preset value if set
|
| 598 |
-
if 'max_words_preset' in st.session_state:
|
| 599 |
-
max_words = st.session_state.max_words_preset
|
| 600 |
-
del st.session_state.max_words_preset
|
| 601 |
|
| 602 |
-
#
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
}
|
| 610 |
|
| 611 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 612 |
|
| 613 |
@staticmethod
|
| 614 |
-
def
|
| 615 |
-
"""
|
| 616 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 617 |
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
viz_config: Visualization configuration
|
| 621 |
-
"""
|
| 622 |
-
st.subheader("📊 Enhanced Rank-Based Frequency Analysis")
|
| 623 |
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
|
|
|
|
|
|
| 628 |
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
col1, col2, col3, col4 = st.columns(4)
|
| 635 |
-
with col1:
|
| 636 |
-
words_analyzed = max_words_to_retain if max_words_to_retain and max_words_to_retain < stats['count'] else stats['count']
|
| 637 |
-
st.metric("Words Analyzed", f"{words_analyzed:,}")
|
| 638 |
-
with col2:
|
| 639 |
-
st.metric("Mean Frequency", f"{stats['mean']:.2f}")
|
| 640 |
-
with col3:
|
| 641 |
-
st.metric("Median Frequency", f"{stats['median']:.2f}")
|
| 642 |
-
with col4:
|
| 643 |
-
st.metric("Std Deviation", f"{stats['std']:.2f}")
|
| 644 |
-
|
| 645 |
-
# Show word limit info if applied
|
| 646 |
-
if max_words_to_retain and max_words_to_retain < stats['count']:
|
| 647 |
-
st.info(f"📊 Analysis limited to top {max_words_to_retain:,} most frequent words (out of {stats['count']:,} total)")
|
| 648 |
-
|
| 649 |
-
# Create rank-based visualization with enhanced parameters
|
| 650 |
-
result = analyzer.create_rank_based_visualization_flexible(
|
| 651 |
-
column=frequency_column,
|
| 652 |
-
bin_size=bin_size,
|
| 653 |
-
log_transform=log_transform,
|
| 654 |
-
max_words_to_retain=max_words_to_retain
|
| 655 |
-
)
|
| 656 |
-
|
| 657 |
-
# Create the main visualization
|
| 658 |
-
fig = go.Figure()
|
| 659 |
-
|
| 660 |
-
fig.add_trace(go.Bar(
|
| 661 |
-
x=result['group_centers'],
|
| 662 |
-
y=result['avg_frequencies'],
|
| 663 |
-
name=f"Avg {frequency_column}",
|
| 664 |
-
marker_color='steelblue',
|
| 665 |
-
hovertemplate=(
|
| 666 |
-
f"<b>Group %{{x}}</b><br>"
|
| 667 |
-
f"Avg {'Log₁₀ ' if log_transform else ''}{frequency_column}: %{{y:.3f}}<br>"
|
| 668 |
-
"<extra></extra>"
|
| 669 |
-
)
|
| 670 |
-
))
|
| 671 |
-
|
| 672 |
-
fig.update_layout(
|
| 673 |
-
title=result.get('title_suffix', f"Enhanced Rank-Based Analysis - {frequency_column}"),
|
| 674 |
-
xaxis_title=result.get('x_label', f"Rank Groups (bin size: {bin_size})"),
|
| 675 |
-
yaxis_title=result.get('y_label', f"{'Log₁₀ ' if log_transform else ''}Average {frequency_column}"),
|
| 676 |
-
showlegend=False,
|
| 677 |
-
height=500
|
| 678 |
-
)
|
| 679 |
-
|
| 680 |
-
st.plotly_chart(fig, use_container_width=True)
|
| 681 |
-
|
| 682 |
-
# Enhanced sample words display (up to 20 bins with 5 random samples each)
|
| 683 |
-
st.write("### 🎯 Sample Words by Rank Group (5 Random Samples)")
|
| 684 |
-
|
| 685 |
-
sample_words = result.get('sample_words', {})
|
| 686 |
-
if sample_words:
|
| 687 |
-
# Display up to 20 groups in a more organized layout
|
| 688 |
-
num_groups = min(20, len(sample_words))
|
| 689 |
-
|
| 690 |
-
if num_groups > 0:
|
| 691 |
-
st.write(f"Showing sample words from top {num_groups} rank groups:")
|
| 692 |
-
|
| 693 |
-
# Display in rows of 4 groups each
|
| 694 |
-
for row_start in range(0, num_groups, 4):
|
| 695 |
-
cols = st.columns(4)
|
| 696 |
-
for col_idx in range(4):
|
| 697 |
-
group_idx = row_start + col_idx
|
| 698 |
-
if group_idx < num_groups and group_idx in sample_words:
|
| 699 |
-
with cols[col_idx]:
|
| 700 |
-
group_label = result['group_labels'][group_idx]
|
| 701 |
-
words = sample_words[group_idx]
|
| 702 |
-
|
| 703 |
-
st.write(f"**Group {group_label}:**")
|
| 704 |
-
word_list = [w['word'] for w in words]
|
| 705 |
-
# Display as bullet points for better readability
|
| 706 |
-
for word in word_list:
|
| 707 |
-
st.write(f"• {word}")
|
| 708 |
-
|
| 709 |
-
# Add spacing between groups
|
| 710 |
-
st.write("")
|
| 711 |
-
else:
|
| 712 |
-
st.write("No sample words available")
|
| 713 |
-
|
| 714 |
-
# Show enhanced group statistics
|
| 715 |
-
with st.expander("📈 Detailed Group Statistics"):
|
| 716 |
-
group_stats = result.get('group_stats')
|
| 717 |
-
if group_stats is not None and not group_stats.empty:
|
| 718 |
-
display_stats = group_stats.copy()
|
| 719 |
-
|
| 720 |
-
# Format numeric columns
|
| 721 |
-
numeric_cols = display_stats.select_dtypes(include=[np.number]).columns
|
| 722 |
-
for col in numeric_cols:
|
| 723 |
-
if 'count' not in col.lower():
|
| 724 |
-
display_stats[col] = display_stats[col].round(2)
|
| 725 |
-
|
| 726 |
-
st.dataframe(display_stats, use_container_width=True)
|
| 727 |
-
else:
|
| 728 |
-
st.write("No detailed statistics available")
|
| 729 |
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
This module provides Streamlit interface handlers for word frequency visualization,
|
| 5 |
including file upload, visualization controls, and results display.
|
| 6 |
Supports flexible column mapping for diverse frequency data formats.
|
| 7 |
+
|
| 8 |
+
Updated to use MemoryFileHandler to avoid 403 errors on restricted environments.
|
| 9 |
"""
|
| 10 |
|
| 11 |
import streamlit as st
|
|
|
|
| 17 |
import sys
|
| 18 |
import os
|
| 19 |
from pathlib import Path
|
| 20 |
+
from io import StringIO, BytesIO
|
| 21 |
|
| 22 |
# Add parent directory to path for imports
|
| 23 |
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
|
| 24 |
|
| 25 |
from text_analyzer.frequency_analyzer import FrequencyAnalyzer
|
| 26 |
+
from web_app.utils import MemoryFileHandler
|
| 27 |
|
| 28 |
|
| 29 |
class FrequencyHandlers:
|
|
|
|
| 32 |
"""
|
| 33 |
|
| 34 |
@staticmethod
|
| 35 |
+
def render_frequency_visualization_interface():
|
| 36 |
"""
|
| 37 |
+
Main interface for frequency visualization analysis.
|
| 38 |
+
Manages state across multiple interactions.
|
| 39 |
"""
|
| 40 |
+
st.subheader("📊 Word Frequency Visualization")
|
|
|
|
| 41 |
|
| 42 |
+
# Initialize session state
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
if 'analyzer' not in st.session_state:
|
| 44 |
st.session_state.analyzer = None
|
| 45 |
if 'format_info' not in st.session_state:
|
| 46 |
st.session_state.format_info = None
|
|
|
|
|
|
|
| 47 |
if 'uploaded_file_content' not in st.session_state:
|
| 48 |
st.session_state.uploaded_file_content = None
|
| 49 |
+
if 'column_config' not in st.session_state:
|
| 50 |
+
st.session_state.column_config = None
|
| 51 |
|
| 52 |
+
# File selection
|
| 53 |
+
uploaded_file = FrequencyHandlers.render_file_selection_section()
|
| 54 |
|
| 55 |
+
if uploaded_file:
|
| 56 |
+
# Track file changes
|
| 57 |
+
current_file_name = uploaded_file.name if hasattr(uploaded_file, 'name') else 'sample_file'
|
| 58 |
|
| 59 |
+
if st.session_state.get('last_file_name') != current_file_name:
|
| 60 |
+
st.session_state.last_file_name = current_file_name
|
|
|
|
|
|
|
| 61 |
st.session_state.analyzer = None
|
| 62 |
st.session_state.format_info = None
|
|
|
|
| 63 |
|
|
|
|
| 64 |
try:
|
| 65 |
+
# Check file size
|
| 66 |
+
if hasattr(uploaded_file, 'size') and uploaded_file.size > 300 * 1024 * 1024:
|
| 67 |
+
st.error(f"File too large ({uploaded_file.size / 1024 / 1024:.1f} MB). Maximum allowed: 300MB")
|
| 68 |
return
|
| 69 |
|
| 70 |
+
# Process file using memory-based approach
|
| 71 |
+
content = MemoryFileHandler.process_uploaded_file(uploaded_file, as_text=False)
|
| 72 |
+
if content:
|
| 73 |
+
st.session_state.uploaded_file_content = content
|
| 74 |
+
st.success(f"✅ File '{current_file_name}' ({len(content):,} bytes) uploaded successfully")
|
|
|
|
| 75 |
else:
|
| 76 |
+
st.error("Failed to read uploaded file. Please try again.")
|
| 77 |
return
|
| 78 |
except Exception as e:
|
| 79 |
st.error(f"❌ Failed to read uploaded file: {str(e)}")
|
|
|
|
| 86 |
# Initialize analyzer and process file (only if needed)
|
| 87 |
if st.session_state.analyzer is None or st.session_state.format_info is None:
|
| 88 |
st.session_state.analyzer = FrequencyAnalyzer(file_size_limit_mb=300)
|
| 89 |
+
# Use the content we already read
|
| 90 |
st.session_state.format_info = st.session_state.analyzer.detect_file_format(st.session_state.uploaded_file_content)
|
| 91 |
|
| 92 |
# Show format detection results
|
| 93 |
st.success(f"✅ File format detected: {st.session_state.format_info['separator']}-separated, "
|
| 94 |
+
f"{st.session_state.format_info['line_count']} lines")
|
| 95 |
+
|
| 96 |
+
# Parse the data if not already done
|
| 97 |
+
if st.session_state.analyzer.df is None:
|
| 98 |
+
with st.spinner("Parsing frequency data..."):
|
| 99 |
+
try:
|
| 100 |
+
# Create file-like object from content
|
| 101 |
+
file_obj = BytesIO(st.session_state.uploaded_file_content)
|
| 102 |
+
st.session_state.analyzer.read_frequency_data_from_content(file_obj)
|
| 103 |
+
|
| 104 |
+
if st.session_state.analyzer.df is None or st.session_state.analyzer.df.empty:
|
| 105 |
+
st.error("No data could be parsed from the file. Please check the file format.")
|
| 106 |
+
return
|
| 107 |
+
|
| 108 |
+
except Exception as e:
|
| 109 |
+
st.error(f"Error parsing file: {str(e)}")
|
| 110 |
+
return
|
| 111 |
+
|
| 112 |
+
# Display results
|
| 113 |
+
with st.expander("📋 Data Preview", expanded=True):
|
| 114 |
+
FrequencyHandlers.render_data_preview(
|
| 115 |
+
st.session_state.analyzer.df.head(20),
|
| 116 |
+
st.session_state.analyzer.detected_columns
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
# Column configuration - always allow user to change
|
| 120 |
+
st.session_state.column_config = FrequencyHandlers.render_enhanced_column_configuration(
|
| 121 |
+
st.session_state.analyzer.detected_columns,
|
| 122 |
+
st.session_state.analyzer.df
|
| 123 |
+
)
|
| 124 |
+
|
| 125 |
+
if st.session_state.column_config:
|
| 126 |
+
# Set the analyzer's columns based on user selection
|
| 127 |
+
st.session_state.analyzer.word_column = st.session_state.column_config['word_column']
|
| 128 |
+
st.session_state.analyzer.frequency_column = st.session_state.column_config['frequency_column']
|
| 129 |
|
| 130 |
+
# Visualization controls
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
viz_config = FrequencyHandlers.render_enhanced_visualization_controls(st.session_state.analyzer, st.session_state.column_config)
|
| 132 |
|
| 133 |
if viz_config:
|
|
|
|
| 150 |
else:
|
| 151 |
with st.expander("Error Details"):
|
| 152 |
st.code(str(e))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
+
# Cleanup session for debugging
|
| 155 |
+
if st.sidebar.button("🔄 Reset Analysis", help="Clear all cached data and start fresh"):
|
| 156 |
+
for key in ['analyzer', 'format_info', 'uploaded_file_content', 'column_config', 'last_file_name']:
|
| 157 |
+
if key in st.session_state:
|
| 158 |
+
del st.session_state[key]
|
| 159 |
+
st.experimental_rerun()
|
|
|
|
|
|
|
|
|
|
| 160 |
|
| 161 |
@staticmethod
|
| 162 |
+
def render_file_selection_section():
|
| 163 |
"""
|
| 164 |
+
Render file selection section.
|
| 165 |
|
| 166 |
Returns:
|
| 167 |
File-like object or None
|
|
|
|
| 262 |
df: Preview DataFrame
|
| 263 |
detected_cols: Detected column categorization
|
| 264 |
"""
|
| 265 |
+
st.write("**File Preview:**")
|
| 266 |
+
st.dataframe(
|
| 267 |
+
df,
|
| 268 |
+
use_container_width=True,
|
| 269 |
+
hide_index=True,
|
| 270 |
+
height=400
|
| 271 |
+
)
|
| 272 |
+
st.caption(f"Showing first {len(df)} of total entries")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
|
| 274 |
+
# Show detected columns
|
| 275 |
+
with st.expander("🔍 Detected Columns", expanded=False):
|
| 276 |
+
col1, col2, col3 = st.columns(3)
|
| 277 |
|
| 278 |
with col1:
|
| 279 |
+
st.write("**Word Columns:**")
|
| 280 |
+
for col in detected_cols.get('word_columns', []):
|
| 281 |
+
st.write(f"• {col}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
|
| 283 |
with col2:
|
| 284 |
+
st.write("**Frequency Columns:**")
|
| 285 |
+
for col in detected_cols.get('frequency_columns', []):
|
| 286 |
+
st.write(f"• {col}")
|
| 287 |
+
|
| 288 |
+
with col3:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
st.write("**Other Columns:**")
|
| 290 |
+
for col in detected_cols.get('other_columns', []):
|
| 291 |
+
st.write(f"• {col}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
|
| 293 |
@staticmethod
def render_enhanced_column_configuration(detected_cols: Dict[str, List[str]], df: pd.DataFrame):
    """
    Render enhanced column configuration with smart defaults.

    Shows two select boxes side by side (word column, frequency column),
    validates the pair, and renders a five-row preview of the selection.

    Args:
        detected_cols: Detected column categorization. The 'word_columns'
            and 'frequency_columns' entries are read; either may be
            missing or empty, in which case columns are taken from ``df``.
        df: The full DataFrame

    Returns:
        Dictionary with 'word_column' and 'frequency_column' keys, or None
        when nothing is selectable or the same column was chosen twice.
    """
    def _preferred_index(columns, keywords):
        # Position of the first column whose label mentions a keyword, else 0.
        # Labels go through str() because DataFrame column labels are not
        # guaranteed to be strings (pandas assigns integer labels when data
        # is read without a header row).
        for position, label in enumerate(columns):
            lowered = str(label).lower()
            if any(keyword in lowered for keyword in keywords):
                return position
        return 0

    st.subheader("⚙️ Column Configuration")

    col1, col2 = st.columns(2)

    with col1:
        # Word column selection with smart default
        word_cols = detected_cols.get('word_columns', [])
        if not word_cols:
            word_cols = list(df.columns)

        word_col = st.selectbox(
            "Word/Token Column",
            options=word_cols,
            index=_preferred_index(word_cols, ('word', 'token', 'lemma', 'type')),
            help="Select the column containing words or tokens"
        )

    with col2:
        # Frequency column selection with smart default
        freq_cols = detected_cols.get('frequency_columns', [])
        if not freq_cols:
            # Fall back to numeric columns, then to every column.
            freq_cols = [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]
            if not freq_cols:
                freq_cols = list(df.columns)

        freq_col = st.selectbox(
            "Frequency Column",
            options=freq_cols,
            index=_preferred_index(freq_cols, ('freq', 'count', 'occurrences')),
            help="Select the column containing frequency counts"
        )

    # Compare against None instead of relying on truthiness: a valid column
    # label can be falsy (0, "") and must not be treated as "no selection".
    if word_col is None or freq_col is None:
        return None

    # Validate configuration
    if word_col == freq_col:
        st.error("Word and frequency columns cannot be the same!")
        return None

    # Show sample data with selected columns
    st.write("**Preview with selected columns:**")
    preview_df = df[[word_col, freq_col]].head(5)
    st.dataframe(preview_df, use_container_width=True, hide_index=True)

    return {
        'word_column': word_col,
        'frequency_column': freq_col
    }
|
| 369 |
|
| 370 |
@staticmethod
def render_enhanced_visualization_controls(analyzer: FrequencyAnalyzer, column_config: Dict[str, str]):
    """
    Render enhanced visualization controls.

    Renders chart type, word count, axis scale and advanced styling
    widgets, followed by summary metrics for the selected frequency column.

    Args:
        analyzer: FrequencyAnalyzer instance; its ``df`` attribute is read.
        column_config: Column configuration with 'word_column' and
            'frequency_column' keys.

    Returns:
        Dictionary with visualization configuration (chart_type, top_n,
        scale, color_scheme, show_values, orientation, show_grid,
        word_column, frequency_column), or None when there are no rows.
    """
    st.subheader("📊 Visualization Settings")

    # Get data statistics
    freq_values = analyzer.df[column_config['frequency_column']]
    total_words = len(analyzer.df)
    if total_words == 0:
        # Nothing to configure; min/max/mean would all be NaN.
        st.warning("No rows available to visualize.")
        return None

    max_freq = freq_values.max()
    min_freq = freq_values.min()

    col1, col2, col3 = st.columns(3)

    with col1:
        chart_type = st.selectbox(
            "Chart Type",
            ["Bar Chart", "Line Chart", "Area Chart", "Scatter Plot"],
            help="Select visualization type"
        )

    with col2:
        # Dynamic range based on data
        max_words = min(total_words, 1000)

        if max_words > 10:
            top_n = st.slider(
                "Number of Words",
                min_value=10,
                max_value=max_words,
                value=min(50, max_words),
                step=10,
                help=f"Display top N words (total: {total_words:,})"
            )
        else:
            # A slider needs min_value < max_value; with ten rows or fewer
            # there is nothing to choose, so show everything.
            top_n = max_words
            st.caption(f"Showing all {total_words:,} words")

    with col3:
        scale = st.selectbox(
            "Y-Axis Scale",
            ["Linear", "Logarithmic"],
            help="Logarithmic scale is useful for data with large frequency variations"
        )

    # Advanced options
    with st.expander("🎨 Advanced Options", expanded=False):
        col1, col2 = st.columns(2)

        with col1:
            color_scheme = st.selectbox(
                "Color Scheme",
                ["Viridis", "Blues", "Reds", "Turbo", "Rainbow"],
                help="Select color scheme for visualization"
            )

            show_values = st.checkbox(
                "Show Values on Chart",
                value=False,
                help="Display frequency values on the chart"
            )

        with col2:
            orientation = st.radio(
                "Orientation",
                ["Vertical", "Horizontal"],
                help="Chart orientation"
            )

            show_grid = st.checkbox(
                "Show Grid",
                value=True,
                help="Display grid lines"
            )

    # Summary statistics
    st.write("**Data Statistics:**")
    stat_col1, stat_col2, stat_col3, stat_col4 = st.columns(4)

    with stat_col1:
        st.metric("Total Words", f"{total_words:,}")
    with stat_col2:
        st.metric("Max Frequency", f"{max_freq:,}")
    with stat_col3:
        st.metric("Min Frequency", f"{min_freq:,}")
    with stat_col4:
        mean_freq = freq_values.mean()
        st.metric("Mean Frequency", f"{mean_freq:,.1f}")

    return {
        'chart_type': chart_type,
        'top_n': top_n,
        'scale': scale,
        'color_scheme': color_scheme.lower(),
        'show_values': show_values,
        'orientation': orientation.lower(),
        'show_grid': show_grid,
        'word_column': column_config['word_column'],
        'frequency_column': column_config['frequency_column']
    }
|
| 474 |
|
| 475 |
@staticmethod
def render_enhanced_rank_based_analysis(analyzer: FrequencyAnalyzer, viz_config: dict):
    """
    Render enhanced rank-based frequency analysis.

    Plots the ``top_n`` most frequent rows of ``analyzer.df`` and then shows
    statistics, a data table and a distribution analysis in three tabs.

    Args:
        analyzer: FrequencyAnalyzer instance with loaded data
        viz_config: Visualization configuration. Keys read: 'top_n',
            'word_column', 'frequency_column', 'chart_type', 'orientation',
            'color_scheme', 'scale', 'show_values', 'show_grid'.
    """
    st.subheader("📈 Frequency Analysis Results")

    # Get top N words
    top_n = viz_config['top_n']
    word_col = viz_config['word_column']
    freq_col = viz_config['frequency_column']
    chart_type = viz_config['chart_type']

    # Only bar charts are drawn sideways. Line, area and scatter always put
    # frequency on the y axis, so treating them as horizontal would apply
    # the log scale to the word/rank axis instead of the frequency axis.
    horizontal = viz_config['orientation'] == 'horizontal' and chart_type == "Bar Chart"

    # Sort and get top N, most frequent first
    df_ranked = analyzer.df.sort_values(by=freq_col, ascending=False).head(top_n).copy()

    # Add rank column
    df_ranked['rank'] = range(1, len(df_ranked) + 1)

    # Create figure based on chart type
    if chart_type == "Bar Chart":
        if horizontal:
            x_col, y_col = freq_col, word_col
            # Reverse order so the most frequent word is drawn at the top.
            df_plot = df_ranked.iloc[::-1]
        else:
            x_col, y_col = word_col, freq_col
            df_plot = df_ranked

        fig = px.bar(
            df_plot,
            x=x_col,
            y=y_col,
            color=freq_col,
            color_continuous_scale=viz_config['color_scheme'],
            title=f"Top {top_n} Most Frequent Words",
            labels={freq_col: "Frequency", word_col: "Words"},
            orientation='h' if horizontal else 'v'
        )

    elif chart_type == "Line Chart":
        fig = px.line(
            df_ranked,
            x=word_col,
            y=freq_col,
            markers=True,
            title=f"Top {top_n} Most Frequent Words",
            labels={freq_col: "Frequency", word_col: "Words"}
        )
        fig.update_traces(line_color=px.colors.qualitative.Plotly[0], line_width=3)

    elif chart_type == "Area Chart":
        fig = px.area(
            df_ranked,
            x=word_col,
            y=freq_col,
            title=f"Top {top_n} Most Frequent Words",
            labels={freq_col: "Frequency", word_col: "Words"}
        )

    else:  # Scatter Plot
        fig = px.scatter(
            df_ranked,
            x='rank',
            y=freq_col,
            text=word_col,
            size=freq_col,
            color=freq_col,
            color_continuous_scale=viz_config['color_scheme'],
            title=f"Rank-Frequency Distribution (Top {top_n})",
            labels={freq_col: "Frequency", 'rank': "Rank"}
        )
        fig.update_traces(textposition='top center')

    # Apply logarithmic scale to whichever axis carries the frequency
    if viz_config['scale'] == "Logarithmic":
        if horizontal:
            fig.update_xaxes(type="log")
        else:
            fig.update_yaxes(type="log")

    # Show values on chart if selected
    if viz_config['show_values'] and chart_type == "Bar Chart":
        fig.update_traces(texttemplate='%{value:,.0f}', textposition='outside')

    # Update layout
    fig.update_layout(
        showlegend=False,
        height=600,
        xaxis_tickangle=0 if horizontal else -45,
        plot_bgcolor='white' if viz_config['show_grid'] else 'rgba(0,0,0,0)',
        xaxis_showgrid=viz_config['show_grid'],
        yaxis_showgrid=viz_config['show_grid']
    )

    # Display chart
    st.plotly_chart(fig, use_container_width=True)

    # Additional analyses. These receive the descending-order frame, never
    # the reversed plotting frame, so "top 10" and cumulative figures refer
    # to the most frequent words.
    tab1, tab2, tab3 = st.tabs(["📊 Statistics", "📋 Data Table", "📈 Distribution Analysis"])

    with tab1:
        FrequencyHandlers.render_statistics_summary(df_ranked, freq_col, word_col)

    with tab2:
        FrequencyHandlers.render_data_table(df_ranked, word_col, freq_col)

    with tab3:
        FrequencyHandlers.render_distribution_analysis(analyzer, freq_col, viz_config)
|
| 587 |
+
|
| 588 |
+
@staticmethod
def render_statistics_summary(df: pd.DataFrame, freq_col: str, word_col: str):
    """
    Render a three-column statistical summary of the frequency data.

    Args:
        df: Frequency table; coverage figures are computed in the row
            order given (cumulative sum from the first row downwards).
        freq_col: Name of the column holding frequency values.
        word_col: Name of the word column (accepted for interface
            compatibility; not read here).
    """
    freqs = df[freq_col]
    total_freq = freqs.sum()
    cumulative = freqs.cumsum()
    has_mass = total_freq > 0

    def words_for_coverage(share: float) -> int:
        # The number of words needed is the position of the first row whose
        # cumulative frequency reaches the threshold, i.e. the count of rows
        # still strictly below it, plus one. Counting rows "<= threshold"
        # under-reports (e.g. [60, 40] would need "0" words for 50%).
        if not has_mass:
            return 0
        below = int((cumulative < total_freq * share).sum())
        return min(below + 1, len(df))

    col1, col2, col3 = st.columns(3)

    with col1:
        st.write("**Frequency Statistics:**")
        st.write(f"• Total frequency: {total_freq:,}")
        st.write(f"• Mean frequency: {freqs.mean():,.1f}")
        st.write(f"• Median frequency: {freqs.median():,.1f}")

    with col2:
        st.write("**Coverage Analysis:**")
        # Guard the ratio so an empty or all-zero table shows 0.0% instead of nan.
        top10_share = (freqs.head(10).sum() / total_freq * 100) if has_mass else 0.0
        st.write(f"• Words for 50% coverage: {words_for_coverage(0.5)}")
        st.write(f"• Words for 80% coverage: {words_for_coverage(0.8)}")
        st.write(f"• Top 10 words: {top10_share:.1f}%")

    with col3:
        st.write("**Diversity Metrics:**")
        type_token_ratio = (len(df) / total_freq) if has_mass else 0.0
        st.write(f"• Unique words shown: {len(df)}")
        st.write(f"• Hapax legomena: {len(df[freqs == 1])}")
        st.write(f"• Type-token ratio: {type_token_ratio:.4f}")
|
| 614 |
+
|
| 615 |
+
@staticmethod
def render_data_table(df: pd.DataFrame, word_col: str, freq_col: str):
    """
    Render an interactive data table with a CSV download button.

    A copy of ``df`` is extended with ``percentage`` and ``cumulative_%``
    columns (both rounded to two decimals), the user picks which columns
    to display, and the same selection is offered as a CSV download.

    Args:
        df: Frequency table to display.
        word_col: Name of the word column.
        freq_col: Name of the frequency column.
    """
    df_display = df.copy()
    total = df_display[freq_col].sum()
    df_display['percentage'] = (df_display[freq_col] / total * 100).round(2)
    df_display['cumulative_%'] = (df_display[freq_col].cumsum() / total * 100).round(2)

    # Display options
    available = df_display.columns.tolist()
    preferred = ['rank', word_col, freq_col, 'percentage', 'cumulative_%']
    # Streamlit rejects default values that are not among the options, so keep
    # only the preferred columns that exist, and drop duplicates (possible when
    # word_col and freq_col name the same column) while preserving order.
    defaults = [name for name in dict.fromkeys(preferred) if name in available]

    col1, col2 = st.columns([1, 3])
    with col1:
        show_cols = st.multiselect(
            "Columns to show:",
            options=available,
            default=defaults
        )

    # Display table
    selected = df_display[show_cols]
    st.dataframe(
        selected,
        use_container_width=True,
        hide_index=True,
        height=400
    )

    # Download button
    csv = selected.to_csv(index=False)
    st.download_button(
        label="📥 Download as CSV",
        data=csv,
        file_name=f"frequency_analysis_top_{len(df)}.csv",
        mime="text/csv"
    )
|
| 648 |
|
| 649 |
@staticmethod
def render_distribution_analysis(analyzer: FrequencyAnalyzer, freq_col: str, viz_config: dict):
    """
    Render the frequency distribution analysis: a Zipf log-log plot and
    a frequency-band breakdown.

    Args:
        analyzer: Analyzer whose ``df`` attribute holds the full table.
        freq_col: Name of the frequency column in ``analyzer.df``.
        viz_config: Visualization settings (accepted for interface
            compatibility; not read here).
    """
    # Zipf's law analysis
    st.write("**Zipf's Law Analysis:**")

    df_full = analyzer.df.sort_values(by=freq_col, ascending=False).copy()
    df_full['rank'] = range(1, len(df_full) + 1)

    # log10 of zero or negative values yields -inf / NaN, which would poison
    # the OLS trendline fit, so only strictly positive frequencies are plotted.
    # Ranks are assigned before filtering so plotted points keep their rank
    # in the full table.
    zipf_points = df_full[df_full[freq_col] > 0].head(1000).copy()

    if zipf_points.empty:
        st.info("No positive frequency values available for the Zipf plot.")
    else:
        zipf_points['log_rank'] = np.log10(zipf_points['rank'])
        zipf_points['log_freq'] = np.log10(zipf_points[freq_col])

        # Create Zipf plot
        fig_zipf = px.scatter(
            zipf_points,
            x='log_rank',
            y='log_freq',
            title="Zipf's Law Distribution (Log-Log Plot)",
            labels={'log_rank': 'log₁₀(Rank)', 'log_freq': 'log₁₀(Frequency)'},
            trendline="ols"
        )

        fig_zipf.update_layout(height=400)
        st.plotly_chart(fig_zipf, use_container_width=True)

    # Frequency bands analysis
    st.write("**Frequency Bands:**")
    # Bins are right-inclusive: (0, 1], (1, 10], (10, 100], ...
    bands = pd.cut(df_full[freq_col],
                   bins=[0, 1, 10, 100, 1000, 10000, float('inf')],
                   labels=['1', '2-10', '11-100', '101-1000', '1001-10000', '10000+'])
    band_counts = bands.value_counts().sort_index()

    col1, col2 = st.columns(2)
    with col1:
        st.write("Words per frequency band:")
        for band, count in band_counts.items():
            st.write(f"• {band}: {count:,} words")

    with col2:
        # Pie chart of frequency bands
        fig_pie = px.pie(
            values=band_counts.values,
            names=band_counts.index,
            title="Distribution of Words by Frequency Band"
        )
        fig_pie.update_layout(height=300)
        st.plotly_chart(fig_pie, use_container_width=True)
|
web_app/handlers/frequency_handlers.py.backup_20250726_162020
ADDED
|
@@ -0,0 +1,733 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Frequency Analysis Handlers for Streamlit Interface
|
| 3 |
+
|
| 4 |
+
This module provides Streamlit interface handlers for word frequency visualization,
|
| 5 |
+
including file upload, visualization controls, and results display.
|
| 6 |
+
Supports flexible column mapping for diverse frequency data formats.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import streamlit as st
|
| 10 |
+
import pandas as pd
|
| 11 |
+
import plotly.graph_objects as go
|
| 12 |
+
import plotly.express as px
|
| 13 |
+
import numpy as np
|
| 14 |
+
from typing import Dict, List, Optional
|
| 15 |
+
import sys
|
| 16 |
+
import os
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
from io import StringIO
|
| 19 |
+
|
| 20 |
+
# Add parent directory to path for imports
|
| 21 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
|
| 22 |
+
|
| 23 |
+
from text_analyzer.frequency_analyzer import FrequencyAnalyzer
|
| 24 |
+
from web_app.utils import FileUploadHandler
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class FrequencyHandlers:
|
| 28 |
+
"""
|
| 29 |
+
Streamlit interface handlers for frequency analysis functionality.
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
@staticmethod
def handle_frequency_analysis():
    """
    Enhanced frequency analysis interface handler with persistent column selection.

    Flow: render the upload widget; when a file with a new name arrives,
    reset the cached state and load its content through FileUploadHandler;
    detect format and columns once per file; then render the column
    selection, visualization controls and analysis. When no file is
    currently selected but a configuration and content are cached in the
    session, the same interface is rendered from the cached data.
    """
    st.markdown("Upload a frequency data file (TSV/CSV) with flexible column mapping support. "
                "The system will automatically detect columns and let you choose which ones to use for analysis.")

    # Initialize session state variables
    for state_key in ('uploaded_file_name', 'column_config', 'analyzer',
                      'format_info', 'detected_cols', 'uploaded_file_content'):
        if state_key not in st.session_state:
            st.session_state[state_key] = None

    # File upload section
    uploaded_file = FrequencyHandlers.render_file_upload()

    # Check if a new file was uploaded
    if uploaded_file is not None:
        current_file_name = uploaded_file.name

        # Reset state if new file is uploaded
        if st.session_state.uploaded_file_name != current_file_name:
            st.session_state.uploaded_file_name = current_file_name
            st.session_state.column_config = None
            st.session_state.analyzer = None
            st.session_state.format_info = None
            st.session_state.detected_cols = None

            # Handle file content loading with /tmp approach for HF Spaces compatibility
            load_succeeded = False
            try:
                # Validate file size first
                if FileUploadHandler.validate_file_size(uploaded_file, max_size_mb=300):
                    # Save to temp and read content
                    temp_path = FileUploadHandler.save_to_temp(uploaded_file, prefix="freq")
                    if temp_path:
                        st.session_state.uploaded_file_content = FileUploadHandler.read_from_temp(temp_path)
                        st.session_state.temp_file_path = temp_path
                        st.success(f"✅ File '{current_file_name}' ({len(st.session_state.uploaded_file_content):,} bytes) uploaded successfully")
                        load_succeeded = True
                    else:
                        st.error("Failed to save uploaded file. Please try again.")
            except Exception as e:
                st.error(f"❌ Failed to read uploaded file: {str(e)}")
                if "403" in str(e) or "Forbidden" in str(e):
                    st.error("**Upload Error**: File upload was blocked. This is a known issue on Hugging Face Spaces. "
                             "Please try using the sample files option or deploy locally.")

            if not load_succeeded:
                # Forget the file name so the next rerun retries the load
                # instead of treating this file as loaded and analysing
                # stale (or missing) content.
                st.session_state.uploaded_file_name = None
                st.session_state.uploaded_file_content = None
                return

        try:
            # Initialize analyzer and process file (only if needed)
            if st.session_state.analyzer is None or st.session_state.format_info is None:
                st.session_state.analyzer = FrequencyAnalyzer(file_size_limit_mb=300)
                # Use the content we already read from temp file
                st.session_state.format_info = st.session_state.analyzer.detect_file_format(st.session_state.uploaded_file_content)

                # Show format detection results
                st.success(f"✅ File format detected: {st.session_state.format_info['separator']}-separated, "
                           f"{'with' if st.session_state.format_info['has_header'] else 'without'} header, "
                           f"~{st.session_state.format_info['estimated_columns']} columns")

                # Prepare data for column detection (use already loaded content)
                content = st.session_state.uploaded_file_content
                if isinstance(content, bytes):
                    content = content.decode('utf-8')

                # Read data for preview and column detection
                df_preview = pd.read_csv(StringIO(content),
                                         sep=st.session_state.format_info['separator'],
                                         header=0 if st.session_state.format_info['has_header'] else None,
                                         nrows=100)

                # Detect available columns
                st.session_state.detected_cols = st.session_state.analyzer.detect_columns(df_preview)

                # Show data preview
                FrequencyHandlers.render_data_preview(df_preview, st.session_state.detected_cols)

            # ALWAYS show column selection if we have detected columns (persistent interface)
            if st.session_state.detected_cols is not None:
                with st.expander("🎯 Column Selection", expanded=True):
                    column_config = FrequencyHandlers.render_persistent_column_selection(
                        st.session_state.detected_cols,
                        st.session_state.format_info,
                        st.session_state.column_config
                    )

                    # Check if column configuration changed
                    if column_config != st.session_state.column_config:
                        st.session_state.column_config = column_config
                        # Reload data with new configuration
                        df = st.session_state.analyzer.load_frequency_data(st.session_state.uploaded_file_content, column_config)
                        st.session_state.loaded_data = df
                        st.rerun()

            # ALWAYS show visualization controls if we have a column config
            if st.session_state.column_config is not None:
                viz_config = FrequencyHandlers.render_enhanced_visualization_controls(st.session_state.analyzer, st.session_state.column_config)

                if viz_config:
                    # Generate analysis
                    FrequencyHandlers.render_enhanced_rank_based_analysis(st.session_state.analyzer, viz_config)

        except Exception as e:
            FrequencyHandlers._report_processing_error(e)

    elif st.session_state.column_config is not None and st.session_state.uploaded_file_content is not None:
        # Show persistent interface even when no file is currently selected (using cached data)
        with st.expander("🎯 Column Selection", expanded=False):
            column_config = FrequencyHandlers.render_persistent_column_selection(
                st.session_state.detected_cols,
                st.session_state.format_info,
                st.session_state.column_config
            )

            # Check if column configuration changed
            if column_config != st.session_state.column_config:
                st.session_state.column_config = column_config
                # Reload data with new configuration
                df = st.session_state.analyzer.load_frequency_data(st.session_state.uploaded_file_content, column_config)
                st.session_state.loaded_data = df
                st.rerun()

        viz_config = FrequencyHandlers.render_enhanced_visualization_controls(st.session_state.analyzer, st.session_state.column_config)

        if viz_config:
            # Generate analysis
            FrequencyHandlers.render_enhanced_rank_based_analysis(st.session_state.analyzer, viz_config)

    # Cleanup old temp files periodically. This is best-effort, so ordinary
    # failures are ignored; a bare ``except:`` is avoided because it would
    # also swallow KeyboardInterrupt and SystemExit.
    try:
        FileUploadHandler.cleanup_old_temp_files(max_age_hours=1)
    except Exception:
        pass

@staticmethod
def _report_processing_error(error: Exception):
    """Show a processing error together with guidance matched to its message."""
    import re

    message = str(error)
    lowered = message.lower()

    st.error(f"Error processing file: {message}")

    # "RAM" must match as a whole word: a plain substring test on the
    # upper-cased message also fires on words such as "DataFrame" or
    # "parameter" and would mislabel unrelated errors as memory errors.
    out_of_memory = (
        isinstance(error, MemoryError)
        or "memory" in lowered
        or re.search(r"\bram\b", lowered) is not None
    )

    # Provide specific error guidance
    if "403" in message or "Forbidden" in message:
        st.error("**HTTP 403 Error**: File upload was blocked by the server.")
        st.info("This is a known limitation on Hugging Face Spaces. Please use the sample files option or deploy the app locally for full functionality.")
    elif "timeout" in lowered:
        st.error("**Timeout Error**: File processing took too long")
        st.info("Try uploading a smaller file or check your internet connection")
    elif out_of_memory:
        st.error("**Memory Error**: Not enough memory to process this file")
        st.info("Try uploading a smaller file")
    else:
        with st.expander("Error Details"):
            st.code(message)
            st.write("**Debug Information:**")
            st.write(f"- File size: {len(st.session_state.uploaded_file_content) if st.session_state.uploaded_file_content else 'Unknown'} bytes")
            st.write(f"- Session state keys: {list(st.session_state.keys())}")

        st.info("Please ensure your file is a valid TSV/CSV with appropriate columns.")
|
| 195 |
+
|
| 196 |
+
@staticmethod
def render_file_upload():
    """
    Render the data-source picker: a file uploader or a bundled sample file.

    A loaded sample file is wrapped in a BytesIO carrying ``name``,
    ``type`` and ``size`` attributes and is kept in the session state so
    it is still returned on later reruns.

    Returns:
        File-like object or None
    """
    st.subheader("📄 Select Frequency Data")

    # Data source selection
    data_source = st.radio(
        "Choose data source:",
        ["Upload file", "Use sample files"],
        help="Note: File uploads may experience issues on Hugging Face Spaces. Use sample files as a reliable alternative."
    )
    wants_upload = data_source == "Upload file"

    if wants_upload:
        uploaded_file = st.file_uploader(
            "Choose a frequency data file",
            type=['tsv', 'csv', 'txt'],
            help="Upload a TSV or CSV file with frequency data. Supports flexible column mapping.\n⚠️ If upload fails, try using sample files instead.",
            accept_multiple_files=False
        )
    else:
        # Bundled sample files, keyed by display name
        sample_paths = {
            "word_freq.txt": "data/word_freq.txt",
            "COCA_5000.txt": "data/COCA_5000.txt",
            "jpn_word_freq.txt": "data/jpn_word_freq.txt"
        }

        chosen_name = st.selectbox(
            "Choose a sample file:",
            options=list(sample_paths),
            help="Pre-loaded frequency data files for testing"
        )

        if not st.button("Load Sample File", type="primary"):
            # No click on this run: fall back to a previously loaded sample
            uploaded_file = st.session_state.get('sample_file')
            if uploaded_file and 'sample_file_name' in st.session_state:
                st.info(f"Using loaded sample file: {st.session_state.sample_file_name}")
        else:
            chosen_path = sample_paths[chosen_name]
            if not os.path.exists(chosen_path):
                st.error(f"Sample file not found: {chosen_path}")
                uploaded_file = None
            else:
                try:
                    from io import BytesIO
                    with open(chosen_path, 'rb') as handle:
                        raw_bytes = handle.read()

                    # Dress the buffer up with the attributes of an uploaded file
                    uploaded_file = BytesIO(raw_bytes)
                    uploaded_file.name = chosen_name
                    if chosen_name.endswith('.txt'):
                        uploaded_file.type = 'text/tab-separated-values'
                    else:
                        uploaded_file.type = 'text/csv'
                    uploaded_file.size = len(raw_bytes)

                    # Persist across reruns
                    st.session_state.sample_file = uploaded_file
                    st.session_state.sample_file_name = chosen_name
                    st.success(f"Loaded sample file: {chosen_name}")

                except Exception as e:
                    st.error(f"Error loading sample file: {str(e)}")
                    uploaded_file = None

    if wants_upload and uploaded_file is None:
        # Nothing uploaded yet: show the accepted layouts
        st.info("**Supported formats:**")
        left, right = st.columns(2)

        with left:
            st.write("**Traditional format:**")
            example_traditional = """Type\tFreq\tRank
the\t69868\t1
of\t36426\t2
and\t28891\t3"""
            st.code(example_traditional, language="text")

        with right:
            st.write("**Rich corpus format:**")
            example_rich = """rank\tlForm\tlemma\tpos\tfrequency\tpmw
1\tノ\tの\t助詞\t5061558\t48383.9
2\tニ\tに\t助詞\t3576558\t34188.7
3\tテ\tて\t助詞\t3493117\t33391.0"""
            st.code(example_rich, language="text")

        st.write("**File size limit:** 300MB")

    return uploaded_file
|
| 290 |
+
|
| 291 |
+
@staticmethod
def render_data_preview(df: pd.DataFrame, detected_cols: Dict[str, List[str]]):
    """
    Show headline metrics, the first rows, and the column detection results.

    Args:
        df: Preview DataFrame
        detected_cols: Mapping with the optional keys 'word_columns',
            'frequency_columns', 'pos_columns' and 'other_columns'
    """
    word_names = detected_cols.get('word_columns', [])
    freq_names = detected_cols.get('frequency_columns', [])
    pos_names = detected_cols.get('pos_columns', [])
    other_names = detected_cols.get('other_columns', [])

    def _list_with_dtype(names):
        # One bullet per column with its dtype, or a placeholder when empty
        if not names:
            st.write("None detected")
            return
        for name in names:
            st.write(f"- `{name}` ({df[name].dtype})")

    st.subheader("📊 Data Preview")

    # Basic metrics
    rows_slot, cols_slot, detected_slot = st.columns(3)
    with rows_slot:
        st.metric("Total Rows", len(df))
    with cols_slot:
        st.metric("Total Columns", len(df.columns))
    with detected_slot:
        st.metric("Detected", f"{len(word_names)} word, {len(freq_names)} freq")

    # Show sample data
    st.write("**First 5 rows:**")
    st.dataframe(df.head(), use_container_width=True)

    # Show detected column categories
    with st.expander("🔍 Column Detection Results", expanded=True):
        left, right = st.columns(2)

        with left:
            st.write("**Word Columns (text data):**")
            _list_with_dtype(word_names)

            st.write("**POS Columns:**")
            _list_with_dtype(pos_names)

        with right:
            st.write("**Frequency Columns (numeric data):**")
            if not freq_names:
                st.write("None detected")
            for name in freq_names:
                sample_vals = df[name].dropna().head(3).tolist()
                st.write(f"- `{name}` ({df[name].dtype}) - e.g., {sample_vals}")

            st.write("**Other Columns:**")
            if not other_names:
                st.write("None")
            else:
                for name in other_names[:5]:  # Show max 5
                    st.write(f"- `{name}` ({df[name].dtype})")
                hidden = len(other_names) - 5
                if hidden > 0:
                    st.write(f"... and {hidden} more")
|
| 357 |
+
|
| 358 |
+
@staticmethod
def render_column_selection_simplified(detected_cols: Dict[str, List[str]], format_info: Dict) -> Optional[Dict[str, str]]:
    """
    Render the button-confirmed column mapping form.

    Args:
        detected_cols: Detected column categorization
        format_info: File format information; 'separator' and
            'has_header' are copied into the result

    Returns:
        Column configuration dict once "Start Analysis" is clicked,
        otherwise None (also None when no word or frequency column
        was detected)
    """
    st.subheader("🎯 Column Mapping")
    st.write("Select which columns to use for your frequency analysis:")

    word_options = detected_cols.get('word_columns', [])
    freq_options = detected_cols.get('frequency_columns', [])
    pos_options = detected_cols.get('pos_columns', [])

    # Both a word and a frequency column are mandatory
    if not (word_options and freq_options):
        st.error("❌ Required columns not detected. Please ensure your file has:")
        st.write("- At least one text column (for words)")
        st.write("- At least one numeric column (for frequencies)")
        return None

    left, right = st.columns(2)

    with left:
        chosen_word = st.selectbox(
            "Word Column",
            options=word_options,
            index=0,
            help="Column containing word forms or lemmas"
        )

        # The POS column is optional and only offered when one was detected
        chosen_pos = None
        if pos_options and st.checkbox("Include POS column", value=False):
            chosen_pos = st.selectbox(
                "POS Column",
                options=pos_options,
                index=0,
                help="Column containing part-of-speech tags (optional)"
            )

    with right:
        chosen_freq = st.selectbox(
            "Frequency Column",
            options=freq_options,
            index=0,
            help="Column containing frequency values for analysis"
        )

    # Nothing is returned until the user confirms
    if not st.button("🚀 Start Analysis", type="primary"):
        return None

    config = {
        'word_column': chosen_word,
        'frequency_column': chosen_freq,
        'separator': format_info['separator'],
        'has_header': format_info['has_header']
    }
    if chosen_pos:
        config['pos_column'] = chosen_pos
    return config
|
| 430 |
+
|
| 431 |
+
@staticmethod
def render_visualization_controls_simplified(analyzer: FrequencyAnalyzer, column_config: Dict) -> Optional[Dict]:
    """
    Backward-compatible alias for render_enhanced_visualization_controls.

    Both arguments are forwarded unchanged and the result is returned as is.
    """
    viz_settings = FrequencyHandlers.render_enhanced_visualization_controls(analyzer, column_config)
    return viz_settings
|
| 437 |
+
|
| 438 |
+
@staticmethod
def render_rank_based_analysis_simplified(analyzer: FrequencyAnalyzer, viz_config: Dict):
    """
    Backward-compatible alias for render_enhanced_rank_based_analysis.

    Both arguments are forwarded unchanged and the result is returned as is.
    """
    outcome = FrequencyHandlers.render_enhanced_rank_based_analysis(analyzer, viz_config)
    return outcome
|
| 444 |
+
|
| 445 |
+
@staticmethod
def render_persistent_column_selection(detected_cols: Dict[str, List[str]],
                                       format_info: Dict,
                                       current_config: Optional[Dict] = None) -> Dict[str, str]:
    """
    Render the always-visible column selection form (no confirm button).

    Args:
        detected_cols: Detected column categorization
        format_info: File format information; 'separator' and
            'has_header' are copied into the result
        current_config: Current column configuration, used to pre-select
            the matching options

    Returns:
        Column configuration dict built from the current widget values
    """
    st.write("Select which columns to use for your frequency analysis:")

    word_options = detected_cols.get('word_columns', [])
    freq_options = detected_cols.get('frequency_columns', [])
    pos_options = detected_cols.get('pos_columns', [])

    # Start from the first option everywhere, then restore earlier choices
    # that are still among the options.
    word_idx, freq_idx, pos_idx = 0, 0, 0
    pos_enabled = False

    if current_config:
        previous_word = current_config['word_column']
        if previous_word in word_options:
            word_idx = word_options.index(previous_word)

        previous_freq = current_config['frequency_column']
        if previous_freq in freq_options:
            freq_idx = freq_options.index(previous_freq)

        if 'pos_column' in current_config:
            previous_pos = current_config['pos_column']
            if previous_pos in pos_options:
                pos_enabled = True
                pos_idx = pos_options.index(previous_pos)

    left, right = st.columns(2)

    with left:
        word_column = st.selectbox(
            "Word Column",
            options=word_options,
            index=word_idx,
            help="Column containing word forms or lemmas",
            key="persistent_word_col"
        )

        # The POS column is optional and only offered when one was detected
        pos_column = None
        if pos_options:
            include_pos = st.checkbox("Include POS column", value=pos_enabled, key="persistent_use_pos")
            if include_pos:
                pos_column = st.selectbox(
                    "POS Column",
                    options=pos_options,
                    index=pos_idx,
                    help="Column containing part-of-speech tags (optional)",
                    key="persistent_pos_col"
                )

    with right:
        frequency_column = st.selectbox(
            "Frequency Column",
            options=freq_options,
            index=freq_idx,
            help="Column containing frequency values for analysis",
            key="persistent_freq_col"
        )

    # Echo the selection back to the user
    st.write("**Selected Configuration:**")
    st.write(f"• Words: `{word_column}`")
    st.write(f"• Frequencies: `{frequency_column}`")
    if pos_column:
        st.write(f"• POS: `{pos_column}`")

    # The configuration is returned on every run
    config = {
        'word_column': word_column,
        'frequency_column': frequency_column,
        'separator': format_info['separator'],
        'has_header': format_info['has_header']
    }
    if pos_column:
        config['pos_column'] = pos_column

    return config
|
| 534 |
+
|
| 535 |
+
@staticmethod
|
| 536 |
+
def render_enhanced_visualization_controls(analyzer: FrequencyAnalyzer, column_config: Dict) -> Optional[Dict]:
    """
    Render enhanced visualization controls with max words limit.

    Args:
        analyzer: FrequencyAnalyzer instance with loaded data. Not read by
            this function; kept so existing callers keep working.
        column_config: Column configuration from user selection. Must
            contain the key 'frequency_column'.

    Returns:
        Dict with the keys 'frequency_column', 'bin_size', 'log_transform'
        and 'max_words_to_retain' on the run in which the generate button
        was pressed, otherwise None.
    """
    def _apply_max_words_preset(preset_value):
        # Button callbacks run before the script body is executed again, so
        # the number input below is created with this value already in its
        # session-state slot. Storing the preset in a separate key and
        # deleting it on the same run (the previous approach) lost the
        # value before the generate button could ever be pressed.
        st.session_state["max_words_input"] = preset_value

    st.subheader("🎛️ Enhanced Visualization Controls")

    # Get the frequency column
    frequency_column = column_config['frequency_column']

    col1, col2, col3 = st.columns(3)

    with col1:
        # Bin size controls
        bin_size = st.slider(
            "Bin Size (words per group)",
            min_value=100,
            max_value=2000,
            value=500,
            step=100,
            help="Number of words to group together for rank-based analysis"
        )

    with col2:
        # Log transformation option
        log_transform = st.checkbox(
            "Apply log₁₀ transformation",
            value=False,
            help="Transform frequency values using log₁₀ for better visualization"
        )

    with col3:
        # Max words control; None means "no limit"
        max_words = st.number_input(
            "Max words to analyze",
            min_value=1000,
            max_value=200000,
            value=None,
            step=1000,
            help="Limit analysis to top N most frequent words (leave empty for no limit)",
            key="max_words_input"
        )

    # Quick preset buttons: each one writes its value into the number input
    st.write("**Quick Presets:**")
    presets = (
        ("10K", "preset_10k", 10000),
        ("25K", "preset_25k", 25000),
        ("50K", "preset_50k", 50000),
        ("All", "preset_all", None),
    )
    preset_cols = st.columns(len(presets))
    for preset_col, (label, widget_key, preset_value) in zip(preset_cols, presets):
        preset_col.button(
            label,
            key=widget_key,
            on_click=_apply_max_words_preset,
            args=(preset_value,),
        )

    # Generate visualization button
    if st.button("📊 Generate Enhanced Visualization", type="primary", key="generate_viz"):
        return {
            'frequency_column': frequency_column,
            'bin_size': bin_size,
            'log_transform': log_transform,
            'max_words_to_retain': max_words
        }

    return None
|
| 612 |
+
|
| 613 |
+
@staticmethod
|
| 614 |
+
def render_enhanced_rank_based_analysis(analyzer: FrequencyAnalyzer, viz_config: Dict):
    """
    Render enhanced rank-based analysis with improved sample words display.

    Args:
        analyzer: FrequencyAnalyzer instance with loaded data
        viz_config: Visualization configuration with the keys
            'frequency_column', 'bin_size', 'log_transform' and, optionally,
            'max_words_to_retain'

    Any exception raised while computing or drawing the analysis is caught
    and shown in the page (message plus traceback) instead of propagating.
    """
    import traceback

    st.subheader("📊 Enhanced Rank-Based Frequency Analysis")

    frequency_column = viz_config['frequency_column']
    bin_size = viz_config['bin_size']
    log_transform = viz_config['log_transform']
    max_words_to_retain = viz_config.get('max_words_to_retain')

    try:
        # Calculate statistics
        stats = analyzer.calculate_statistics(frequency_column)

        # The limit only matters when it is set and smaller than the data
        limit_applies = bool(max_words_to_retain and max_words_to_retain < stats['count'])
        words_analyzed = max_words_to_retain if limit_applies else stats['count']

        # Display basic statistics with word limit info
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("Words Analyzed", f"{words_analyzed:,}")
        with col2:
            st.metric("Mean Frequency", f"{stats['mean']:.2f}")
        with col3:
            st.metric("Median Frequency", f"{stats['median']:.2f}")
        with col4:
            st.metric("Std Deviation", f"{stats['std']:.2f}")

        # Show word limit info if applied
        if limit_applies:
            st.info(f"📊 Analysis limited to top {max_words_to_retain:,} most frequent words (out of {stats['count']:,} total)")

        # Create rank-based visualization with enhanced parameters
        result = analyzer.create_rank_based_visualization_flexible(
            column=frequency_column,
            bin_size=bin_size,
            log_transform=log_transform,
            max_words_to_retain=max_words_to_retain
        )

        # Create the main visualization
        log_prefix = 'Log₁₀ ' if log_transform else ''
        fig = go.Figure()

        fig.add_trace(go.Bar(
            x=result['group_centers'],
            y=result['avg_frequencies'],
            name=f"Avg {frequency_column}",
            marker_color='steelblue',
            hovertemplate=(
                f"<b>Group %{{x}}</b><br>"
                f"Avg {log_prefix}{frequency_column}: %{{y:.3f}}<br>"
                "<extra></extra>"
            )
        ))

        fig.update_layout(
            title=result.get('title_suffix', f"Enhanced Rank-Based Analysis - {frequency_column}"),
            xaxis_title=result.get('x_label', f"Rank Groups (bin size: {bin_size})"),
            yaxis_title=result.get('y_label', f"{log_prefix}Average {frequency_column}"),
            showlegend=False,
            height=500
        )

        st.plotly_chart(fig, use_container_width=True)

        # Enhanced sample words display (up to 20 bins with 5 random samples each)
        st.write("### 🎯 Sample Words by Rank Group (5 Random Samples)")

        sample_words = result.get('sample_words', {})
        if sample_words:
            # Display up to 20 groups in a more organized layout
            groups_per_row = 4
            num_groups = min(20, len(sample_words))
            st.write(f"Showing sample words from top {num_groups} rank groups:")

            # Display in rows of 4 groups each
            for row_start in range(0, num_groups, groups_per_row):
                cols = st.columns(groups_per_row)
                for col_idx in range(groups_per_row):
                    group_idx = row_start + col_idx
                    # Skip positions past the last group or without samples
                    if group_idx >= num_groups or group_idx not in sample_words:
                        continue
                    with cols[col_idx]:
                        group_label = result['group_labels'][group_idx]
                        st.write(f"**Group {group_label}:**")
                        # Display as bullet points for better readability
                        for entry in sample_words[group_idx]:
                            st.write(f"• {entry['word']}")

                # Add spacing between groups
                st.write("")
        else:
            st.write("No sample words available")

        # Show enhanced group statistics
        with st.expander("📈 Detailed Group Statistics"):
            group_stats = result.get('group_stats')
            if group_stats is not None and not group_stats.empty:
                display_stats = group_stats.copy()

                # Format numeric columns; count-like columns are left as-is.
                # str() guards against non-string column labels.
                numeric_cols = display_stats.select_dtypes(include=[np.number]).columns
                for col in numeric_cols:
                    if 'count' not in str(col).lower():
                        display_stats[col] = display_stats[col].round(2)

                st.dataframe(display_stats, use_container_width=True)
            else:
                st.write("No detailed statistics available")

    except Exception as e:
        st.error(f"Error in enhanced rank-based analysis: {str(e)}")
        with st.expander("Error Details"):
            # The headline already shows str(e); give the traceback here so
            # the expander adds information instead of repeating it.
            st.code(traceback.format_exc())
|
web_app/handlers/frequency_handlers_memory.py
ADDED
|
@@ -0,0 +1,317 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Memory-based Frequency Handlers for file upload without filesystem access
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import streamlit as st
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from typing import Optional, Dict, List, Any
|
| 8 |
+
from io import BytesIO, StringIO
|
| 9 |
+
|
| 10 |
+
# Import from parent directory
|
| 11 |
+
import sys
|
| 12 |
+
import os
|
| 13 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
|
| 14 |
+
|
| 15 |
+
from text_analyzer.frequency_analyzer import FrequencyAnalyzer
|
| 16 |
+
from web_app.utils.memory_file_handler import MemoryFileHandler
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class FrequencyHandlersMemory:
    """Handlers for frequency analysis interface using memory-based file handling."""

    @staticmethod
    def render_frequency_visualization_interface():
        """Main interface for frequency visualization with memory-based file handling."""
        st.subheader("📊 Word Frequency Visualization")

        # File selection
        uploaded_file = FrequencyHandlersMemory.render_file_selection_section()

        if uploaded_file is not None:
            # Process file using memory-based approach
            FrequencyHandlersMemory.process_uploaded_file_memory(uploaded_file)

    @staticmethod
    def render_file_selection_section():
        """
        Render file selection section with memory-based handling.

        Returns:
            File-like object (with ``name`` and ``size`` attributes) or None
            when nothing usable is selected.
        """
        st.subheader("📄 Select Frequency Data")

        # Data source selection
        data_source = st.radio(
            "Choose data source:",
            ["Upload file", "Use sample files"],
            help="Upload your own frequency data or use pre-loaded samples"
        )

        if data_source != "Upload file":
            # Sample files selection
            return FrequencyHandlersMemory.handle_sample_files()

        uploaded_file = st.file_uploader(
            "Choose a frequency data file",
            type=['tsv', 'csv', 'txt'],
            help="Upload a TSV or CSV file with frequency data. Maximum size: 300MB",
            accept_multiple_files=False
        )

        if uploaded_file and uploaded_file.size > 300 * 1024 * 1024:
            st.error(f"File too large ({uploaded_file.size / 1024 / 1024:.1f} MB). Maximum allowed: 300MB")
            return None

        return uploaded_file

    @staticmethod
    def process_uploaded_file_memory(uploaded_file):
        """
        Process uploaded file using memory-based approach.

        Reads the file into ``st.session_state.file_content`` when its name
        differs from the last processed one, detects the format, parses the
        content into a DataFrame and renders preview, column configuration,
        controls and chart.

        Args:
            uploaded_file: File-like object exposing ``name``, ``seek`` and
                ``read``.
        """
        # Initialize session state
        for state_key in ('analyzer', 'format_info', 'file_content', 'data'):
            if state_key not in st.session_state:
                st.session_state[state_key] = None

        # Check if this is a new file
        current_file_name = uploaded_file.name
        if st.session_state.get('last_file_name') != current_file_name:
            # Drop everything derived from the previous file. The parsed
            # DataFrame must be cleared too, otherwise the old file's data
            # would be shown under the new file's name.
            st.session_state.analyzer = None
            st.session_state.format_info = None
            st.session_state.data = None
            st.session_state.file_content = None

            try:
                # Read file content directly into memory
                st.info("📖 Reading file content...")
                uploaded_file.seek(0)
                content = uploaded_file.read()
            except Exception as e:
                st.error(f"❌ Failed to read file: {str(e)}")
                return

            # Store in session state; the name is recorded only after a
            # successful read so that a failed read is retried next run.
            st.session_state.file_content = content
            st.session_state.last_file_name = current_file_name
            st.success(f"✅ File '{current_file_name}' ({len(content):,} bytes) loaded successfully")

        # Process the file content
        if not st.session_state.file_content:
            return

        try:
            # Initialize analyzer if needed
            if st.session_state.analyzer is None:
                st.session_state.analyzer = FrequencyAnalyzer(file_size_limit_mb=300)
                st.session_state.format_info = st.session_state.analyzer.detect_file_format(
                    st.session_state.file_content
                )

                # Show format detection results
                st.success(f"✅ File format detected: {st.session_state.format_info['separator']}-separated, "
                           f"{st.session_state.format_info['line_count']} lines")

            # Parse the data
            if st.session_state.data is None:
                with st.spinner("Parsing frequency data..."):
                    # Create a file-like object from the content
                    file_obj = BytesIO(st.session_state.file_content)
                    df = pd.read_csv(
                        file_obj,
                        delimiter=st.session_state.format_info['separator'],
                        encoding='utf-8'
                    )
                    st.session_state.data = df
                    st.session_state.analyzer.df = df

            # Continue with visualization
            if st.session_state.data is not None:
                # Data preview
                with st.expander("📋 Data Preview", expanded=True):
                    st.dataframe(st.session_state.data.head(20))
                    st.caption(f"Showing first 20 of {len(st.session_state.data):,} entries")

                # Column configuration
                st.session_state.column_config = FrequencyHandlersMemory.render_column_configuration(
                    st.session_state.analyzer
                )

                if st.session_state.column_config:
                    # Visualization controls
                    viz_config = FrequencyHandlersMemory.render_visualization_controls(
                        st.session_state.analyzer,
                        st.session_state.column_config
                    )

                    if viz_config:
                        # Generate visualization
                        FrequencyHandlersMemory.render_visualization(
                            st.session_state.analyzer,
                            viz_config
                        )

        except Exception as e:
            st.error(f"Error processing file: {str(e)}")
            with st.expander("Error Details"):
                st.code(str(e))

    @staticmethod
    def handle_sample_files():
        """
        Handle sample file selection.

        The bytes of the most recently loaded sample are kept in
        ``st.session_state.loaded_sample_file`` so that the sample stays
        available on later reruns; a button only reports a click for the
        single run that follows it.

        Returns:
            A BytesIO carrying ``name`` and ``size`` attributes for the last
            successfully loaded sample, or None if none has been loaded.
        """
        sample_files = {
            "word_freq.txt": "data/word_freq.txt",
            "COCA_5000.txt": "data/COCA_5000.txt",
            "jpn_word_freq.txt": "data/jpn_word_freq.txt"
        }

        selected_sample = st.selectbox(
            "Choose a sample file:",
            options=list(sample_files.keys()),
            help="Pre-loaded frequency data files for testing"
        )

        if st.button("Load Sample File", type="primary"):
            sample_path = sample_files[selected_sample]
            try:
                with open(sample_path, 'rb') as f:
                    content = f.read()
            except FileNotFoundError:
                # Previously a missing file was skipped without any feedback
                st.error(f"Sample file not found: {sample_path}")
                st.session_state.loaded_sample_file = None
            except Exception as e:
                st.error(f"Error loading sample file: {str(e)}")
                st.session_state.loaded_sample_file = None
            else:
                st.session_state.loaded_sample_file = (selected_sample, content)

        loaded_sample = st.session_state.get('loaded_sample_file')
        if loaded_sample is None:
            return None

        # Create a mock uploaded file object
        sample_name, sample_bytes = loaded_sample
        mock_file = BytesIO(sample_bytes)
        mock_file.name = sample_name
        mock_file.size = len(sample_bytes)
        return mock_file

    @staticmethod
    def render_column_configuration(analyzer):
        """
        Render column configuration section.

        Args:
            analyzer: Object whose ``detected_columns`` mapping supplies the
                'word_columns' and 'frequency_columns' option lists.

        Returns:
            Dict with 'word_column' and 'frequency_column', or None when
            either selection is empty.
        """
        st.subheader("⚙️ Column Configuration")

        detected_cols = analyzer.detected_columns

        col1, col2 = st.columns(2)

        with col1:
            word_col = st.selectbox(
                "Word/Token Column",
                options=detected_cols.get('word_columns', []),
                help="Column containing words or tokens"
            )

        with col2:
            freq_col = st.selectbox(
                "Frequency Column",
                options=detected_cols.get('frequency_columns', []),
                help="Column containing frequency counts"
            )

        if word_col and freq_col:
            return {'word_column': word_col, 'frequency_column': freq_col}
        return None

    @staticmethod
    def render_visualization_controls(analyzer, column_config):
        """
        Render visualization controls.

        Args:
            analyzer: Not read here; kept for a uniform handler signature.
            column_config: Dict with 'word_column' and 'frequency_column'.

        Returns:
            Dict with 'chart_type', 'top_n', 'scale', 'word_column' and
            'frequency_column'.
        """
        st.subheader("📊 Visualization Settings")

        col1, col2, col3 = st.columns(3)

        with col1:
            chart_type = st.selectbox(
                "Chart Type",
                ["Bar Chart", "Line Chart", "Area Chart"],
                help="Select visualization type"
            )

        with col2:
            top_n = st.slider(
                "Number of Words",
                min_value=10,
                max_value=100,
                value=30,
                step=5,
                help="Number of top words to display"
            )

        with col3:
            scale = st.selectbox(
                "Y-Axis Scale",
                ["Linear", "Logarithmic"],
                help="Scale for frequency axis"
            )

        return {
            'chart_type': chart_type,
            'top_n': top_n,
            'scale': scale,
            'word_column': column_config['word_column'],
            'frequency_column': column_config['frequency_column']
        }

    @staticmethod
    def render_visualization(analyzer, viz_config):
        """
        Render the actual visualization.

        Args:
            analyzer: Object whose ``df`` attribute holds the parsed data.
            viz_config: Dict produced by ``render_visualization_controls``.
        """
        import plotly.express as px

        word_column = viz_config['word_column']
        frequency_column = viz_config['frequency_column']
        top_n = viz_config['top_n']

        # Get top N words
        df = analyzer.df.nlargest(top_n, frequency_column)

        # Create figure based on chart type; anything unlisted is an area chart
        chart_builders = {
            "Bar Chart": (px.bar, {}),
            "Line Chart": (px.line, {'markers': True}),
        }
        build_chart, extra_kwargs = chart_builders.get(viz_config['chart_type'], (px.area, {}))
        fig = build_chart(
            df,
            x=word_column,
            y=frequency_column,
            title=f"Top {top_n} Most Frequent Words",
            **extra_kwargs
        )

        # Apply scale. The Figure method is ``update_yaxes``; the previous
        # ``update_yaxis`` does not exist and raised AttributeError.
        if viz_config['scale'] == "Logarithmic":
            fig.update_yaxes(type="log")

        # Update layout
        fig.update_layout(
            xaxis_title="Words",
            yaxis_title="Frequency",
            height=600,
            showlegend=False
        )

        # Display
        st.plotly_chart(fig, use_container_width=True)

        # Summary statistics (computed over the full data, not just the top N)
        with st.expander("📊 Summary Statistics"):
            col1, col2, col3 = st.columns(3)

            with col1:
                st.metric("Total Words", f"{len(analyzer.df):,}")
            with col2:
                st.metric("Total Frequency", f"{analyzer.df[frequency_column].sum():,}")
            with col3:
                st.metric("Average Frequency", f"{analyzer.df[frequency_column].mean():.2f}")
|
web_app/handlers/frequency_handlers_updated.py
ADDED
|
@@ -0,0 +1,694 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Frequency Analysis Handlers for Streamlit Interface
|
| 3 |
+
|
| 4 |
+
This module provides Streamlit interface handlers for word frequency visualization,
|
| 5 |
+
including file upload, visualization controls, and results display.
|
| 6 |
+
Supports flexible column mapping for diverse frequency data formats.
|
| 7 |
+
|
| 8 |
+
Updated to use MemoryFileHandler to avoid 403 errors on restricted environments.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import streamlit as st
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import plotly.graph_objects as go
|
| 14 |
+
import plotly.express as px
|
| 15 |
+
import numpy as np
|
| 16 |
+
from typing import Dict, List, Optional
|
| 17 |
+
import sys
|
| 18 |
+
import os
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
from io import StringIO, BytesIO
|
| 21 |
+
|
| 22 |
+
# Add parent directory to path for imports
|
| 23 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
|
| 24 |
+
|
| 25 |
+
from text_analyzer.frequency_analyzer import FrequencyAnalyzer
|
| 26 |
+
from web_app.utils import MemoryFileHandler
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class FrequencyHandlers:
|
| 30 |
+
"""
|
| 31 |
+
Streamlit interface handlers for frequency analysis functionality.
|
| 32 |
+
"""
|
| 33 |
+
|
| 34 |
+
@staticmethod
|
| 35 |
+
def render_frequency_visualization_interface():
    """
    Main interface for frequency visualization analysis.

    Manages state across multiple interactions through ``st.session_state``
    (keys 'analyzer', 'format_info', 'uploaded_file_content',
    'column_config' and 'last_file_name'): reads the selected file into
    memory, detects its format, parses it, then renders preview, column
    configuration, visualization controls and the rank-based analysis.
    A sidebar button clears that state and reruns the script.
    """
    st.subheader("📊 Word Frequency Visualization")

    # Initialize session state
    for state_key in ('analyzer', 'format_info', 'uploaded_file_content', 'column_config'):
        if state_key not in st.session_state:
            st.session_state[state_key] = None

    # File selection
    uploaded_file = FrequencyHandlers.render_file_selection_section()

    if uploaded_file:
        # Track file changes
        current_file_name = getattr(uploaded_file, 'name', 'sample_file')

        if st.session_state.get('last_file_name') != current_file_name:
            st.session_state.last_file_name = current_file_name
            st.session_state.analyzer = None
            st.session_state.format_info = None
            # Forget the previous file's bytes so they can never be analysed
            # under the new file's name if the read below fails.
            st.session_state.uploaded_file_content = None

            try:
                # Check file size
                if hasattr(uploaded_file, 'size') and uploaded_file.size > 300 * 1024 * 1024:
                    st.error(f"File too large ({uploaded_file.size / 1024 / 1024:.1f} MB). Maximum allowed: 300MB")
                    st.session_state.last_file_name = None
                    return

                # Process file using memory-based approach
                content = MemoryFileHandler.process_uploaded_file(uploaded_file, as_text=False)
                if content:
                    st.session_state.uploaded_file_content = content
                    st.success(f"✅ File '{current_file_name}' ({len(content):,} bytes) uploaded successfully")
                else:
                    st.error("Failed to read uploaded file. Please try again.")
                    # Unset the name so the next run retries the read
                    st.session_state.last_file_name = None
                    return
            except Exception as e:
                st.error(f"❌ Failed to read uploaded file: {str(e)}")
                if "403" in str(e) or "Forbidden" in str(e):
                    st.error("**Upload Error**: File upload was blocked. This is a known issue on Hugging Face Spaces. "
                             "Please try using the sample files option or deploy locally.")
                st.session_state.last_file_name = None
                return

        try:
            # Initialize analyzer and process file (only if needed)
            if st.session_state.analyzer is None or st.session_state.format_info is None:
                st.session_state.analyzer = FrequencyAnalyzer(file_size_limit_mb=300)
                # Use the content we already read
                st.session_state.format_info = st.session_state.analyzer.detect_file_format(st.session_state.uploaded_file_content)

                # Show format detection results
                st.success(f"✅ File format detected: {st.session_state.format_info['separator']}-separated, "
                           f"{st.session_state.format_info['line_count']} lines")

            # Parse the data if not already done
            if st.session_state.analyzer.df is None:
                with st.spinner("Parsing frequency data..."):
                    try:
                        # Create file-like object from content
                        file_obj = BytesIO(st.session_state.uploaded_file_content)
                        st.session_state.analyzer.read_frequency_data_from_content(file_obj)

                        if st.session_state.analyzer.df is None or st.session_state.analyzer.df.empty:
                            st.error("No data could be parsed from the file. Please check the file format.")
                            return

                    except Exception as e:
                        st.error(f"Error parsing file: {str(e)}")
                        return

            # Display results
            with st.expander("📋 Data Preview", expanded=True):
                FrequencyHandlers.render_data_preview(
                    st.session_state.analyzer.df.head(20),
                    st.session_state.analyzer.detected_columns
                )

            # Column configuration - always allow user to change
            st.session_state.column_config = FrequencyHandlers.render_enhanced_column_configuration(
                st.session_state.analyzer.detected_columns,
                st.session_state.analyzer.df
            )

            if st.session_state.column_config:
                # Set the analyzer's columns based on user selection
                st.session_state.analyzer.word_column = st.session_state.column_config['word_column']
                st.session_state.analyzer.frequency_column = st.session_state.column_config['frequency_column']

                # Visualization controls
                viz_config = FrequencyHandlers.render_enhanced_visualization_controls(st.session_state.analyzer, st.session_state.column_config)

                if viz_config:
                    # Generate analysis
                    FrequencyHandlers.render_enhanced_rank_based_analysis(st.session_state.analyzer, viz_config)

        except Exception as e:
            message = str(e)
            lowered = message.lower()
            st.error(f"Error processing file: {message}")

            # Provide specific error guidance
            if "403" in message or "Forbidden" in message:
                st.error("**HTTP 403 Error**: File upload was blocked by the server.")
                st.info("This is a known limitation on Hugging Face Spaces. Please use the sample files option or deploy the app locally for full functionality.")
            elif "timeout" in lowered:
                st.error("**Timeout Error**: File processing took too long")
                st.info("Try uploading a smaller file or check your internet connection")
            elif isinstance(e, MemoryError) or "memory" in lowered or "RAM" in message:
                # "RAM" is matched case-sensitively: upper-casing the message
                # first made words such as "DataFrame" or "parameter" count
                # as memory errors.
                st.error("**Memory Error**: Not enough memory to process this file")
                st.info("Try uploading a smaller file")
            else:
                with st.expander("Error Details"):
                    st.code(message)

    # Cleanup session for debugging
    if st.sidebar.button("🔄 Reset Analysis", help="Clear all cached data and start fresh"):
        for key in ['analyzer', 'format_info', 'uploaded_file_content', 'column_config', 'last_file_name']:
            if key in st.session_state:
                del st.session_state[key]
        # st.rerun supersedes st.experimental_rerun; fall back for older
        # Streamlit versions that only provide the experimental name.
        rerun = getattr(st, 'rerun', None) or st.experimental_rerun
        rerun()
|
| 160 |
+
|
| 161 |
+
@staticmethod
def render_file_selection_section():
    """
    Let the user pick the frequency data to analyse.

    The user either uploads a file or loads one of the bundled sample
    files. A loaded sample is wrapped in a BytesIO carrying ``name``,
    ``type`` and ``size`` attributes and is cached in the session state so
    that it is still available on the next rerun.

    Returns:
        File-like object or None
    """
    st.subheader("📄 Select Frequency Data")

    # Data source selection
    data_source = st.radio(
        "Choose data source:",
        ["Upload file", "Use sample files"],
        help="Note: File uploads may experience issues on Hugging Face Spaces. Use sample files as a reliable alternative."
    )
    wants_upload = data_source == "Upload file"

    if wants_upload:
        uploaded_file = st.file_uploader(
            "Choose a frequency data file",
            type=['tsv', 'csv', 'txt'],
            help="Upload a TSV or CSV file with frequency data. Supports flexible column mapping.\n⚠️ If upload fails, try using sample files instead.",
            accept_multiple_files=False
        )
    else:
        # Bundled sample files, keyed by the name shown in the selector
        sample_files = {
            "word_freq.txt": "data/word_freq.txt",
            "COCA_5000.txt": "data/COCA_5000.txt",
            "jpn_word_freq.txt": "data/jpn_word_freq.txt"
        }

        selected_sample = st.selectbox(
            "Choose a sample file:",
            options=list(sample_files.keys()),
            help="Pre-loaded frequency data files for testing"
        )

        if not st.button("Load Sample File", type="primary"):
            # No click on this run: fall back to a previously loaded sample
            uploaded_file = st.session_state.get('sample_file', None)
            if uploaded_file and 'sample_file_name' in st.session_state:
                st.info(f"Using loaded sample file: {st.session_state.sample_file_name}")
        else:
            uploaded_file = None
            sample_path = sample_files[selected_sample]
            if not os.path.exists(sample_path):
                st.error(f"Sample file not found: {sample_path}")
            else:
                try:
                    from io import BytesIO

                    with open(sample_path, 'rb') as handle:
                        raw = handle.read()

                    # In-memory object carrying the attributes of an uploaded file
                    buffer = BytesIO(raw)
                    buffer.name = selected_sample
                    buffer.type = 'text/tab-separated-values' if selected_sample.endswith('.txt') else 'text/csv'
                    buffer.size = len(raw)

                    # Keep it in the session state so it survives reruns
                    st.session_state.sample_file = buffer
                    st.session_state.sample_file_name = selected_sample
                    st.success(f"Loaded sample file: {selected_sample}")
                    uploaded_file = buffer
                except Exception as e:
                    st.error(f"Error loading sample file: {str(e)}")
                    uploaded_file = None

    if wants_upload and uploaded_file is None:
        # Nothing uploaded yet: show what the accepted formats look like
        st.info("**Supported formats:**")
        left, right = st.columns(2)

        with left:
            st.write("**Traditional format:**")
            example_traditional = "\n".join((
                "Type\tFreq\tRank",
                "the\t69868\t1",
                "of\t36426\t2",
                "and\t28891\t3",
            ))
            st.code(example_traditional, language="text")

        with right:
            st.write("**Rich corpus format:**")
            example_rich = "\n".join((
                "rank\tlForm\tlemma\tpos\tfrequency\tpmw",
                "1\tノ\tの\t助詞\t5061558\t48383.9",
                "2\tニ\tに\t助詞\t3576558\t34188.7",
                "3\tテ\tて\t助詞\t3493117\t33391.0",
            ))
            st.code(example_rich, language="text")

        st.write("**File size limit:** 300MB")

    return uploaded_file
|
| 255 |
+
|
| 256 |
+
@staticmethod
def render_data_preview(df: pd.DataFrame, detected_cols: Dict[str, List[str]]):
    """
    Show a preview table followed by the column detection results.

    Args:
        df: Preview DataFrame
        detected_cols: Detected column categorization; the keys read here
            are 'word_columns', 'frequency_columns' and 'other_columns'
    """
    st.write("**File Preview:**")
    st.dataframe(df, use_container_width=True, hide_index=True, height=400)
    st.caption(f"Showing first {len(df)} of total entries")

    # One panel per detected category, rendered left to right
    sections = (
        ("**Word Columns:**", 'word_columns'),
        ("**Frequency Columns:**", 'frequency_columns'),
        ("**Other Columns:**", 'other_columns'),
    )

    with st.expander("🔍 Detected Columns", expanded=False):
        panels = st.columns(3)
        for panel, (heading, category) in zip(panels, sections):
            with panel:
                st.write(heading)
                for col in detected_cols.get(category, []):
                    st.write(f"• {col}")
|
| 292 |
+
|
| 293 |
+
@staticmethod
def render_enhanced_column_configuration(detected_cols: Dict[str, List[str]], df: pd.DataFrame):
    """
    Render enhanced column configuration with smart defaults.

    The word selector offers the detected word columns, falling back to
    every column. The frequency selector offers the detected frequency
    columns, falling back to the numeric columns and then to every column.
    In each selector the first column whose name contains a typical keyword
    is pre-selected.

    Args:
        detected_cols: Detected column categorization
        df: The full DataFrame

    Returns:
        Dictionary with 'word_column' and 'frequency_column', or None when
        no valid selection could be made
    """

    def preferred_index(columns, keywords):
        # Position of the first column whose name contains a keyword.
        # str() keeps this working for non-string labels (e.g. the integer
        # labels pandas assigns to header-less data).
        for position, label in enumerate(columns):
            lowered = str(label).lower()
            if any(keyword in lowered for keyword in keywords):
                return position
        return 0

    st.subheader("⚙️ Column Configuration")

    col1, col2 = st.columns(2)

    with col1:
        # Word column selection with smart default
        word_cols = detected_cols.get('word_columns', [])
        if not word_cols:
            word_cols = list(df.columns)

        word_col = st.selectbox(
            "Word/Token Column",
            options=word_cols,
            index=preferred_index(word_cols, ['word', 'token', 'lemma', 'type']),
            help="Select the column containing words or tokens"
        )

    with col2:
        # Frequency column selection with smart default
        freq_cols = detected_cols.get('frequency_columns', [])
        if not freq_cols:
            # Try to identify numeric columns
            freq_cols = [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]
        if not freq_cols:
            freq_cols = list(df.columns)

        freq_col = st.selectbox(
            "Frequency Column",
            options=freq_cols,
            index=preferred_index(freq_cols, ['freq', 'count', 'occurrences']),
            help="Select the column containing frequency counts"
        )

    # Compare against None rather than relying on truthiness, so a valid
    # but falsy label (such as the integer 0) is not rejected.
    if word_col is None or freq_col is None:
        return None

    # Validate configuration
    if word_col == freq_col:
        st.error("Word and frequency columns cannot be the same!")
        return None

    # Show sample data with selected columns
    st.write("**Preview with selected columns:**")
    preview_df = df[[word_col, freq_col]].head(5)
    st.dataframe(preview_df, use_container_width=True, hide_index=True)

    return {
        'word_column': word_col,
        'frequency_column': freq_col
    }
|
| 369 |
+
|
| 370 |
+
@staticmethod
def render_enhanced_visualization_controls(analyzer: FrequencyAnalyzer, column_config: Dict[str, str]):
    """
    Render enhanced visualization controls.

    Args:
        analyzer: FrequencyAnalyzer instance; its ``df`` attribute is read
        column_config: Column configuration with the keys 'word_column'
            and 'frequency_column'

    Returns:
        Dictionary with visualization configuration (chart type, number of
        words, scale, colour scheme, value labels, orientation, grid and
        the two selected columns)
    """
    st.subheader("📊 Visualization Settings")

    # Get data statistics
    frequencies = analyzer.df[column_config['frequency_column']]
    total_words = len(analyzer.df)
    max_freq = frequencies.max()
    min_freq = frequencies.min()

    col1, col2, col3 = st.columns(3)

    with col1:
        chart_type = st.selectbox(
            "Chart Type",
            ["Bar Chart", "Line Chart", "Area Chart", "Scatter Plot"],
            help="Select visualization type"
        )

    with col2:
        # Dynamic range based on data
        max_words = min(total_words, 1000)
        default_n = min(50, max_words)

        if max_words > 10:
            top_n = st.slider(
                "Number of Words",
                min_value=10,
                max_value=max_words,
                value=default_n,
                step=10,
                help=f"Display top N words (total: {total_words:,})"
            )
        else:
            # With ten rows or fewer the slider bounds would collapse or
            # invert (minimum 10, maximum <= 10) and the default would fall
            # below the minimum, so show every row and skip the widget.
            top_n = max_words
            st.caption(f"Showing all {total_words:,} words")

    with col3:
        scale = st.selectbox(
            "Y-Axis Scale",
            ["Linear", "Logarithmic"],
            help="Logarithmic scale is useful for data with large frequency variations"
        )

    # Advanced options
    with st.expander("🎨 Advanced Options", expanded=False):
        col1, col2 = st.columns(2)

        with col1:
            color_scheme = st.selectbox(
                "Color Scheme",
                ["Viridis", "Blues", "Reds", "Turbo", "Rainbow"],
                help="Select color scheme for visualization"
            )

            show_values = st.checkbox(
                "Show Values on Chart",
                value=False,
                help="Display frequency values on the chart"
            )

        with col2:
            orientation = st.radio(
                "Orientation",
                ["Vertical", "Horizontal"],
                help="Chart orientation"
            )

            show_grid = st.checkbox(
                "Show Grid",
                value=True,
                help="Display grid lines"
            )

    # Summary statistics
    st.write("**Data Statistics:**")
    stat_col1, stat_col2, stat_col3, stat_col4 = st.columns(4)

    with stat_col1:
        st.metric("Total Words", f"{total_words:,}")
    with stat_col2:
        st.metric("Max Frequency", f"{max_freq:,}")
    with stat_col3:
        st.metric("Min Frequency", f"{min_freq:,}")
    with stat_col4:
        mean_freq = frequencies.mean()
        st.metric("Mean Frequency", f"{mean_freq:,.1f}")

    return {
        'chart_type': chart_type,
        'top_n': top_n,
        'scale': scale,
        'color_scheme': color_scheme.lower(),
        'show_values': show_values,
        'orientation': orientation.lower(),
        'show_grid': show_grid,
        'word_column': column_config['word_column'],
        'frequency_column': column_config['frequency_column']
    }
|
| 474 |
+
|
| 475 |
+
@staticmethod
def render_enhanced_rank_based_analysis(analyzer: FrequencyAnalyzer, viz_config: dict):
    """
    Render enhanced rank-based frequency analysis.

    Plots the ``top_n`` most frequent rows of ``analyzer.df`` with the
    selected chart type, then shows statistics, a data table and a
    distribution analysis in three tabs.

    Args:
        analyzer: FrequencyAnalyzer instance with loaded data
        viz_config: Visualization configuration; the keys read here are
            'top_n', 'word_column', 'frequency_column', 'chart_type',
            'orientation', 'color_scheme', 'scale', 'show_values' and
            'show_grid'
    """
    st.subheader("📈 Frequency Analysis Results")

    top_n = viz_config['top_n']
    word_col = viz_config['word_column']
    freq_col = viz_config['frequency_column']
    chart_type = viz_config['chart_type']
    horizontal = viz_config['orientation'] == 'horizontal'

    # Top N rows in rank order (most frequent first). This frame is never
    # reordered afterwards, because the statistics and table tabs rely on
    # descending order for their cumulative figures.
    df_sorted = analyzer.df.sort_values(by=freq_col, ascending=False).head(top_n).copy()
    df_sorted['rank'] = range(1, len(df_sorted) + 1)

    # Only the horizontal bar chart puts the frequency on the x-axis; all
    # other charts keep it on the y-axis whatever the orientation setting.
    freq_on_x = False

    # Create figure based on chart type
    if chart_type == "Bar Chart":
        if horizontal:
            # Reverse a copy of the order so the top-ranked word is drawn
            # at the top of the horizontal bar chart.
            plot_df = df_sorted.iloc[::-1]
            x_col, y_col = freq_col, word_col
            freq_on_x = True
        else:
            plot_df = df_sorted
            x_col, y_col = word_col, freq_col

        fig = px.bar(
            plot_df,
            x=x_col,
            y=y_col,
            color=freq_col,
            color_continuous_scale=viz_config['color_scheme'],
            title=f"Top {top_n} Most Frequent Words",
            labels={freq_col: "Frequency", word_col: "Words"},
            orientation='h' if horizontal else 'v'
        )

    elif chart_type == "Line Chart":
        fig = px.line(
            df_sorted,
            x=word_col,
            y=freq_col,
            markers=True,
            title=f"Top {top_n} Most Frequent Words",
            labels={freq_col: "Frequency", word_col: "Words"}
        )
        fig.update_traces(line_color=px.colors.qualitative.Plotly[0], line_width=3)

    elif chart_type == "Area Chart":
        fig = px.area(
            df_sorted,
            x=word_col,
            y=freq_col,
            title=f"Top {top_n} Most Frequent Words",
            labels={freq_col: "Frequency", word_col: "Words"}
        )

    else:  # Scatter Plot
        fig = px.scatter(
            df_sorted,
            x='rank',
            y=freq_col,
            text=word_col,
            size=freq_col,
            color=freq_col,
            color_continuous_scale=viz_config['color_scheme'],
            title=f"Rank-Frequency Distribution (Top {top_n})",
            labels={freq_col: "Frequency", 'rank': "Rank"}
        )
        fig.update_traces(textposition='top center')

    # Apply the logarithmic scale to whichever axis carries the frequency
    if viz_config['scale'] == "Logarithmic":
        if freq_on_x:
            fig.update_xaxes(type="log")
        else:
            fig.update_yaxes(type="log")

    # Show values on chart if selected
    if viz_config['show_values'] and chart_type == "Bar Chart":
        fig.update_traces(texttemplate='%{value:,.0f}', textposition='outside')

    # Update layout
    fig.update_layout(
        showlegend=False,
        height=600,
        xaxis_tickangle=-45 if viz_config['orientation'] == 'vertical' else 0,
        plot_bgcolor='white' if viz_config['show_grid'] else 'rgba(0,0,0,0)',
        xaxis_showgrid=viz_config['show_grid'],
        yaxis_showgrid=viz_config['show_grid']
    )

    # Display chart
    st.plotly_chart(fig, use_container_width=True)

    # Additional analyses
    tab1, tab2, tab3 = st.tabs(["📊 Statistics", "📋 Data Table", "📈 Distribution Analysis"])

    with tab1:
        FrequencyHandlers.render_statistics_summary(df_sorted, freq_col, word_col)

    with tab2:
        FrequencyHandlers.render_data_table(df_sorted, word_col, freq_col)

    with tab3:
        FrequencyHandlers.render_distribution_analysis(analyzer, freq_col, viz_config)
|
| 587 |
+
|
| 588 |
+
@staticmethod
def render_statistics_summary(df: pd.DataFrame, freq_col: str, word_col: str):
    """
    Render statistical summary of the frequency data.

    Args:
        df: Rows to summarise
        freq_col: Name of the column holding the frequencies
        word_col: Name of the word column (not read here; kept so the
            signature matches the other render helpers)
    """
    # Coverage figures are cumulative, so they are computed on the
    # frequencies in descending order regardless of the incoming row order.
    ranked = df[freq_col].sort_values(ascending=False)
    total_freq = ranked.sum()
    cumsum = ranked.cumsum()

    def words_for_coverage(fraction):
        # Smallest number of top-ranked words whose summed frequency reaches
        # `fraction` of the total: every word still below the threshold,
        # plus the one that crosses it.
        if len(ranked) == 0 or not total_freq > 0:
            return 0
        still_below = int((cumsum < total_freq * fraction).sum())
        return min(still_below + 1, len(ranked))

    col1, col2, col3 = st.columns(3)

    with col1:
        st.write("**Frequency Statistics:**")
        st.write(f"• Total frequency: {total_freq:,}")
        st.write(f"• Mean frequency: {df[freq_col].mean():,.1f}")
        st.write(f"• Median frequency: {df[freq_col].median():,.1f}")

    with col2:
        st.write("**Coverage Analysis:**")
        coverage_50 = words_for_coverage(0.5)
        coverage_80 = words_for_coverage(0.8)
        st.write(f"• Words for 50% coverage: {coverage_50}")
        st.write(f"• Words for 80% coverage: {coverage_80}")
        st.write(f"• Top 10 words: {(ranked.head(10).sum() / total_freq * 100):.1f}%")

    with col3:
        st.write("**Diversity Metrics:**")
        st.write(f"• Unique words shown: {len(df)}")
        st.write(f"• Hapax legomena: {len(df[df[freq_col] == 1])}")
        st.write(f"• Type-token ratio: {len(df) / total_freq:.4f}")
|
| 614 |
+
|
| 615 |
+
@staticmethod
def render_data_table(df: pd.DataFrame, word_col: str, freq_col: str):
    """
    Render interactive data table.

    Adds a percentage and a cumulative percentage column, lets the user
    choose which columns to display, and offers the displayed columns as a
    CSV download.

    Args:
        df: Rows to display
        word_col: Name of the word column
        freq_col: Name of the frequency column
    """
    # The cumulative percentage only makes sense from most to least
    # frequent. The stable sort leaves an already ordered frame untouched.
    df_display = df.sort_values(by=freq_col, ascending=False, kind='stable')

    # Add percentage columns
    total_freq = df_display[freq_col].sum()
    df_display['percentage'] = (df_display[freq_col] / total_freq * 100).round(2)
    df_display['cumulative_%'] = (df_display[freq_col].cumsum() / total_freq * 100).round(2)

    # Pre-select only columns that exist, each at most once, so the widget
    # never receives a default that is missing from its options.
    available = df_display.columns.tolist()
    wanted = ['rank', word_col, freq_col, 'percentage', 'cumulative_%']
    default_cols = [col for col in dict.fromkeys(wanted) if col in available]

    # Display options
    col1, _ = st.columns([1, 3])
    with col1:
        show_cols = st.multiselect(
            "Columns to show:",
            options=available,
            default=default_cols
        )

    # Display table
    st.dataframe(
        df_display[show_cols],
        use_container_width=True,
        hide_index=True,
        height=400
    )

    # Download button
    csv = df_display[show_cols].to_csv(index=False)
    st.download_button(
        label="📥 Download as CSV",
        data=csv,
        file_name=f"frequency_analysis_top_{len(df)}.csv",
        mime="text/csv"
    )
|
| 648 |
+
|
| 649 |
+
@staticmethod
def render_distribution_analysis(analyzer: FrequencyAnalyzer, freq_col: str, viz_config: dict):
    """
    Render frequency distribution analysis.

    Shows a log-log rank/frequency scatter plot with an OLS trendline,
    followed by the number of words per frequency band as a list and as a
    pie chart.

    Args:
        analyzer: FrequencyAnalyzer instance; its ``df`` attribute is read
        freq_col: Name of the column holding the frequencies
        viz_config: Visualization configuration (not read here; kept so the
            signature matches the other render helpers)
    """
    # Zipf's law analysis
    st.write("**Zipf's Law Analysis:**")

    df_full = analyzer.df.sort_values(by=freq_col, ascending=False).copy()
    df_full['rank'] = range(1, len(df_full) + 1)

    # log10 is only defined for positive values. Zero, negative or missing
    # frequencies would put -inf/NaN into the trendline fit, so they are
    # left out of the log-log plot (ranks were assigned beforehand).
    zipf_df = df_full[df_full[freq_col] > 0].copy()
    zipf_df['log_rank'] = np.log10(zipf_df['rank'])
    zipf_df['log_freq'] = np.log10(zipf_df[freq_col])

    if len(zipf_df) >= 2:
        # Create Zipf plot
        fig_zipf = px.scatter(
            zipf_df.head(1000),
            x='log_rank',
            y='log_freq',
            title="Zipf's Law Distribution (Log-Log Plot)",
            labels={'log_rank': 'log₁₀(Rank)', 'log_freq': 'log₁₀(Frequency)'},
            trendline="ols"
        )

        fig_zipf.update_layout(height=400)
        st.plotly_chart(fig_zipf, use_container_width=True)
    else:
        # A trendline needs at least two points
        st.info("Not enough positive frequency values to draw a Zipf plot.")

    # Frequency bands analysis
    st.write("**Frequency Bands:**")
    bands = pd.cut(df_full[freq_col],
                   bins=[0, 1, 10, 100, 1000, 10000, float('inf')],
                   labels=['1', '2-10', '11-100', '101-1000', '1001-10000', '10000+'])
    band_counts = bands.value_counts().sort_index()

    col1, col2 = st.columns(2)
    with col1:
        st.write("Words per frequency band:")
        for band, count in band_counts.items():
            st.write(f"• {band}: {count:,} words")

    with col2:
        # Pie chart of frequency bands
        fig_pie = px.pie(
            values=band_counts.values,
            names=band_counts.index,
            title="Distribution of Words by Frequency Band"
        )
        fig_pie.update_layout(height=300)
        st.plotly_chart(fig_pie, use_container_width=True)
|
web_app/utils/__init__.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
"""Web app utilities package."""
|
| 2 |
|
| 3 |
from .file_upload_handler import FileUploadHandler
|
|
|
|
| 4 |
|
| 5 |
-
__all__ = ['FileUploadHandler']
|
|
|
|
| 1 |
"""Web app utilities package."""
|
| 2 |
|
| 3 |
from .file_upload_handler import FileUploadHandler
|
| 4 |
+
from .memory_file_handler import MemoryFileHandler
|
| 5 |
|
| 6 |
+
__all__ = ['FileUploadHandler', 'MemoryFileHandler']
|
web_app/utils/memory_file_handler.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Memory-based File Handler for Hugging Face Spaces Compatibility
|
| 3 |
+
|
| 4 |
+
This module provides an alternative to disk-based file handling by keeping
|
| 5 |
+
files in memory, avoiding 403 errors from filesystem restrictions.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import streamlit as st
|
| 9 |
+
from io import BytesIO, StringIO
|
| 10 |
+
from typing import Optional, Union, Dict, Any
|
| 11 |
+
import pandas as pd
|
| 12 |
+
import zipfile
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class MemoryFileHandler:
    """Handle files entirely in memory to avoid filesystem restrictions."""

    @staticmethod
    def process_uploaded_file(uploaded_file, as_text: bool = False, encoding: str = 'utf-8') -> Optional[Union[bytes, str]]:
        """
        Read the full content of an uploaded file object.

        Args:
            uploaded_file: Streamlit UploadedFile object (any seekable
                file-like object works)
            as_text: Whether to return content as decoded text
            encoding: Text encoding to use if as_text is True

        Returns:
            File content as bytes or string, or None if error (the error
            is reported through st.error)
        """
        try:
            # Reset file pointer to beginning; the object may have been
            # read on an earlier rerun.
            uploaded_file.seek(0)
            content = uploaded_file.read()

            # Only bytes need decoding; a text-mode object already yields str
            if as_text and isinstance(content, bytes):
                return content.decode(encoding)
            return content

        except Exception as e:
            st.error(f"Failed to read file: {str(e)}")
            return None

    @staticmethod
    def process_csv_tsv_file(uploaded_file, delimiter: Optional[str] = None) -> Optional[pd.DataFrame]:
        """
        Process CSV/TSV file directly into pandas DataFrame.

        Args:
            uploaded_file: Streamlit UploadedFile object (any seekable
                file-like object works, binary or text)
            delimiter: Column delimiter. If None, a tab is used when the
                first 1024 bytes/characters contain one, otherwise a comma.

        Returns:
            DataFrame or None if error (the error is reported through
            st.error)
        """
        try:
            # Reset file pointer
            uploaded_file.seek(0)

            # Auto-detect delimiter if not provided
            if delimiter is None:
                sample = uploaded_file.read(1024)
                # Binary objects yield bytes, text-mode objects yield str.
                # Accept both, as process_uploaded_file does.
                if isinstance(sample, bytes):
                    sample = sample.decode('utf-8', errors='ignore')
                uploaded_file.seek(0)

                delimiter = '\t' if '\t' in sample else ','

            # Read directly into DataFrame
            return pd.read_csv(uploaded_file, delimiter=delimiter, encoding='utf-8')

        except Exception as e:
            st.error(f"Failed to process CSV/TSV file: {str(e)}")
            return None

    @staticmethod
    def handle_zip_file(uploaded_file) -> Optional[Dict[str, bytes]]:
        """
        Handle ZIP file uploads by extracting contents to memory.

        NOTE(review): every member is decompressed into memory with no size
        limit, so a highly compressed archive can exhaust memory. Consider
        checking ZipInfo.file_size before reading if uploads are untrusted.

        Args:
            uploaded_file: Streamlit UploadedFile object (should be a ZIP file)

        Returns:
            Dictionary mapping filenames to file contents, or None if error
            (the error is reported through st.error)
        """
        try:
            # Reset file pointer
            uploaded_file.seek(0)

            # Read ZIP file into memory
            zip_bytes = BytesIO(uploaded_file.read())

            # Extract files to memory; names ending in '/' are directories
            with zipfile.ZipFile(zip_bytes, 'r') as zip_file:
                return {
                    filename: zip_file.read(filename)
                    for filename in zip_file.namelist()
                    if not filename.endswith('/')
                }

        except Exception as e:
            st.error(f"Failed to process ZIP file: {str(e)}")
            return None

    @staticmethod
    def create_download_content(content: Union[str, bytes], filename: str) -> bytes:
        """
        Prepare content for download.

        Args:
            content: Content to download (string or bytes)
            filename: Suggested filename for download (not used here)

        Returns:
            Bytes ready for download; strings are encoded as UTF-8
        """
        if isinstance(content, str):
            return content.encode('utf-8')
        return content

    @staticmethod
    def store_in_session(key: str, content: Any):
        """
        Store content in session state for persistence across reruns.

        Args:
            key: Session state key
            content: Content to store
        """
        st.session_state[key] = content

    @staticmethod
    def retrieve_from_session(key: str) -> Optional[Any]:
        """
        Retrieve content from session state.

        Args:
            key: Session state key

        Returns:
            Stored content or None
        """
        return st.session_state.get(key, None)

    @staticmethod
    def clear_session_storage(prefix: str = ""):
        """
        Clear session storage.

        Args:
            prefix: Only clear keys starting with this prefix. An empty
                prefix clears the whole session state.
        """
        if prefix:
            # Collect the keys first so the state is not modified while
            # it is being iterated.
            keys_to_remove = [k for k in st.session_state.keys() if k.startswith(prefix)]
            for key in keys_to_remove:
                del st.session_state[key]
        else:
            st.session_state.clear()
|