Spaces:

egumasa
/

simple-text-analyzer

Building

App Files Files Community

egumasa committed on Jul 26, 2025

Commit

ca02ec3

1 Parent(s): 2dff38a

removed temporary file writing

Browse files

Files changed (10) hide show

test_frequency_flexible.py → test/test_frequency_flexible.py +0 -0
test_fugashi_diagnostic.py → test/test_fugashi_diagnostic.py +0 -0
test_japanese_integration.py → test/test_japanese_integration.py +0 -0
test/test_memory_handling.py +85 -0
test_unidic_diagnostic.py → test/test_unidic_diagnostic.py +0 -0
text_analyzer/lexical_sophistication.py +14 -4
web_app/config_manager.py +35 -18
web_app/handlers/analysis_handlers.py +70 -33
web_app/reference_manager.py +2 -1
web_app/session_manager.py +12 -8

test_frequency_flexible.py → test/test_frequency_flexible.py RENAMED Viewed

File without changes

test_fugashi_diagnostic.py → test/test_fugashi_diagnostic.py RENAMED Viewed

File without changes

test_japanese_integration.py → test/test_japanese_integration.py RENAMED Viewed

File without changes

test/test_memory_handling.py ADDED Viewed

	@@ -0,0 +1,85 @@

+#!/usr/bin/env python3
+"""Test script to verify in-memory file handling works correctly."""
+import sys
+sys.path.append('.')
+from io import StringIO
+import pandas as pd
+# Test 1: Verify StringIO works with pandas
+def test_stringio_pandas():
+    print("Test 1: StringIO with pandas")
+    csv_content = """word,frequency
+apple,100
+banana,80
+cherry,60"""
+    content_io = StringIO(csv_content)
+    df = pd.read_csv(content_io)
+    print(f"✓ Successfully read CSV from StringIO: {len(df)} rows")
+    print(df)
+    print()
+# Test 2: Test custom config parsing
+def test_custom_config():
+    print("Test 2: Custom config parsing")
+    from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
+    analyzer = LexicalSophisticationAnalyzer()
+    config = {
+        'content': """word\tfreq
+hello\t500
+world\t400
+test\t300""",
+        'word_column': 'word',
+        'freq_column': 'freq',
+        'delimiter': '\t',
+        'is_custom_config': True
+    }
+    result = analyzer._parse_custom_config(config)
+    print(f"✓ Successfully parsed custom config: {len(result)} entries")
+    print(f"Sample entries: {list(result.items())[:3]}")
+    print()
+# Test 3: Test file content extraction
+def test_file_extraction():
+    print("Test 3: File content extraction")
+    from web_app.handlers.analysis_handlers import AnalysisHandlers
+    # Simulate uploaded file
+    class MockUploadedFile:
+        def __init__(self, name, content):
+            self.name = name
+            self._content = content.encode('utf-8')
+            self._position = 0
+        def read(self):
+            self._position = len(self._content)
+            return self._content
+        def seek(self, position):
+            self._position = position
+    mock_file = MockUploadedFile("test.txt", "This is a test file content.")
+    file_contents = AnalysisHandlers.extract_uploaded_files([mock_file])
+    print(f"✓ Successfully extracted {len(file_contents)} files")
+    for filename, content in file_contents:
+        print(f"  - {filename}: {content[:50]}...")
+    print()
+if __name__ == "__main__":
+    print("Testing in-memory file handling...\n")
+    try:
+        test_stringio_pandas()
+        test_custom_config()
+        test_file_extraction()
+        print("✅ All tests passed!")
+    except Exception as e:
+        print(f"❌ Test failed: {e}")
+        import traceback
+        traceback.print_exc()

test_unidic_diagnostic.py → test/test_unidic_diagnostic.py RENAMED Viewed

File without changes

text_analyzer/lexical_sophistication.py CHANGED Viewed

@@ -170,16 +170,26 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
         Parse custom frequency list using user-selected columns.
         Args:
-            config: Dictionary with file_path, word_column, freq_column, delimiter
         """
-        file_path = config['file_path']
         word_column = config['word_column']
         freq_column = config['freq_column']
         delimiter = config['delimiter']
         try:
-            # Load the file
-            df = pd.read_csv(file_path, delimiter=delimiter, header=0)
             # Validate columns exist
             if word_column not in df.columns:

         Parse custom frequency list using user-selected columns.
         Args:
+            config: Dictionary with file_name/content, word_column, freq_column, delimiter
         """
         word_column = config['word_column']
         freq_column = config['freq_column']
         delimiter = config['delimiter']
         try:
+            from io import StringIO
+            # Check if we have content directly or need to read from file
+            if 'content' in config:
+                # Use content directly
+                content_io = StringIO(config['content'])
+                df = pd.read_csv(content_io, delimiter=delimiter, header=0)
+            elif 'file_path' in config:
+                # Fallback to file path for backward compatibility
+                df = pd.read_csv(config['file_path'], delimiter=delimiter, header=0)
+            else:
+                logger.error("No content or file_path found in config")
+                return {}
             # Validate columns exist
             if word_column not in df.columns:

web_app/config_manager.py CHANGED Viewed

@@ -49,28 +49,43 @@ class ConfigManager:
     def process_uploaded_file(uploaded_file) -> Optional[Dict[str, Any]]:
         """Process a single uploaded file and return its configuration."""
         try:
-            # Save file temporarily
-            temp_dir = SessionManager.get_temp_dir()
-            file_path = os.path.join(temp_dir, uploaded_file.name)
-            with open(file_path, 'wb') as f:
-                f.write(uploaded_file.getvalue())
-            # Determine delimiter
-            with open(file_path, 'r', encoding='utf-8') as f:
-                sample = f.read(1024)
-                delimiter = ',' if sample.count(',') > sample.count('\t') else '\t'
             # Load preview
-            df_preview = pd.read_csv(file_path, delimiter=delimiter, header=0, nrows=5)
             return {
-                'file_path': file_path,
                 'columns': list(df_preview.columns),
                 'delimiter': delimiter,
                 'preview': df_preview,
                 'base_name': Path(uploaded_file.name).stem,
-                'configurations': []
             }
         except Exception as e:
@@ -78,10 +93,11 @@ class ConfigManager:
             return None
     @staticmethod
-    def create_custom_config(file_path: str, delimiter: str, word_col: str, score_col: str) -> Dict[str, Any]:
         """Create custom configuration object for backend."""
         return {
-            'file_path': file_path,
             'word_column': word_col,
             'freq_column': score_col,
             'delimiter': delimiter,
@@ -109,9 +125,9 @@ class ConfigManager:
         errors = []
         for file_key, file_config in all_configs.items():
-            # Validate file exists
-            if not os.path.exists(file_config['file_path']):
-                errors.append(f"File not found: {file_config['file_path']}")
                 continue
             for index_config in file_config['indices']:
@@ -131,7 +147,8 @@ class ConfigManager:
                 # Create and store configuration
                 custom_data = ConfigManager.create_custom_config(
-                    file_config['file_path'],
                     file_config['delimiter'],
                     word_col,
                     score_col

     def process_uploaded_file(uploaded_file) -> Optional[Dict[str, Any]]:
         """Process a single uploaded file and return its configuration."""
         try:
+            from io import StringIO
+            # Read file content directly from uploaded file
+            uploaded_file.seek(0)
+            content = uploaded_file.read()
+            # Decode content if it's bytes
+            if isinstance(content, bytes):
+                text_content = content.decode('utf-8')
+            else:
+                text_content = content
+            # Determine delimiter from first 1024 chars
+            sample = text_content[:1024]
+            delimiter = ',' if sample.count(',') > sample.count('\t') else '\t'
+            # Create StringIO for pandas to read
+            content_io = StringIO(text_content)
             # Load preview
+            df_preview = pd.read_csv(content_io, delimiter=delimiter, header=0, nrows=5)
+            # Store content in session state instead of file path
+            if 'uploaded_files_content' not in st.session_state:
+                st.session_state.uploaded_files_content = {}
+            # Use filename as key
+            st.session_state.uploaded_files_content[uploaded_file.name] = text_content
             return {
+                'file_name': uploaded_file.name,
                 'columns': list(df_preview.columns),
                 'delimiter': delimiter,
                 'preview': df_preview,
                 'base_name': Path(uploaded_file.name).stem,
+                'configurations': [],
+                'content': text_content  # Include content for immediate use
             }
         except Exception as e:
             return None
     @staticmethod
+    def create_custom_config(file_name: str, content: str, delimiter: str, word_col: str, score_col: str) -> Dict[str, Any]:
         """Create custom configuration object for backend."""
         return {
+            'file_name': file_name,
+            'content': content,
             'word_column': word_col,
             'freq_column': score_col,
             'delimiter': delimiter,
         errors = []
         for file_key, file_config in all_configs.items():
+            # Validate that we have content
+            if 'content' not in file_config or not file_config['content']:
+                errors.append(f"No content found for file: {file_key}")
                 continue
             for index_config in file_config['indices']:
                 # Create and store configuration
                 custom_data = ConfigManager.create_custom_config(
+                    file_config['file_name'],
+                    file_config['content'],
                     file_config['delimiter'],
                     word_col,
                     score_col

web_app/handlers/analysis_handlers.py CHANGED Viewed

@@ -10,7 +10,7 @@ import plotly.graph_objects as go
 from scipy import stats
 import tempfile
 import os
-from typing import Dict, List, Any, Optional
 from pathlib import Path
 import zipfile
 import time
@@ -138,13 +138,13 @@ class AnalysisHandlers:
             with st.spinner("Processing files..."):
                 try:
                     # Extract files
-                    file_paths = AnalysisHandlers.extract_uploaded_files(uploaded_files)
-                    if not file_paths:
                         st.error("No valid .txt files found in uploaded files.")
                         return
-                    st.info(f"Found {len(file_paths)} files to process.")
                     # Load reference lists
                     analyzer.load_reference_lists(reference_lists)
@@ -153,18 +153,36 @@ class AnalysisHandlers:
                     progress_bar = st.progress(0)
                     status_text = st.empty()
-                    def progress_callback(current, total):
-                        progress = current / total
                         progress_bar.progress(progress)
-                        status_text.text(f"Processing file {current}/{total}")
-                    # Perform batch analysis
-                    results_df = analyzer.analyze_batch(
-                        file_paths,
-                        list(reference_lists.keys()),
-                        apply_log,
-                        progress_callback
-                    )
                     # Display results
                     st.success(f"Analysis complete! Processed {len(results_df)} files.")
@@ -245,10 +263,9 @@ class AnalysisHandlers:
                     st.error(f"Error during comparison: {e}")
     @staticmethod
-    def extract_uploaded_files(uploaded_files) -> List[str]:
-        """Extract uploaded files and return list of file paths."""
-        temp_dir = tempfile.mkdtemp()
-        file_paths = []
         for uploaded_file in uploaded_files:
             if uploaded_file.name.endswith('.zip'):
@@ -256,25 +273,45 @@ class AnalysisHandlers:
                 with zipfile.ZipFile(uploaded_file, 'r') as zip_ref:
                     for file_info in zip_ref.infolist():
                         if file_info.filename.endswith('.txt'):
-                            zip_ref.extract(file_info, temp_dir)
-                            file_paths.append(os.path.join(temp_dir, file_info.filename))
             elif uploaded_file.name.endswith('.txt'):
                 # Handle individual text files
-                file_path = os.path.join(temp_dir, uploaded_file.name)
                 try:
-                    content = uploaded_file.read().decode('utf-8')
-                except UnicodeDecodeError:
-                    try:
-                        content = uploaded_file.read().decode('utf-16')
-                    except UnicodeDecodeError:
-                        st.error(f"Unable to decode file {uploaded_file.name}. Skipping.")
-                        continue
-                with open(file_path, 'w', encoding='utf-8') as f:
-                    f.write(content)
-                file_paths.append(file_path)
-        return file_paths
     @staticmethod
     def display_single_text_results(results: Dict[str, Any]):

 from scipy import stats
 import tempfile
 import os
+from typing import Dict, List, Any, Optional, Tuple
 from pathlib import Path
 import zipfile
 import time
             with st.spinner("Processing files..."):
                 try:
                     # Extract files
+                    file_contents = AnalysisHandlers.extract_uploaded_files(uploaded_files)
+                    if not file_contents:
                         st.error("No valid .txt files found in uploaded files.")
                         return
+                    st.info(f"Found {len(file_contents)} files to process.")
                     # Load reference lists
                     analyzer.load_reference_lists(reference_lists)
                     progress_bar = st.progress(0)
                     status_text = st.empty()
+                    # Process files in memory
+                    batch_results = []
+                    selected_indices = list(reference_lists.keys())
+                    for i, (filename, text_content) in enumerate(file_contents):
+                        # Update progress
+                        progress = (i + 1) / len(file_contents)
                         progress_bar.progress(progress)
+                        status_text.text(f"Processing file {i + 1}/{len(file_contents)}: {filename}")
+                        try:
+                            # Analyze for both content and function words
+                            result_row = {'filename': filename}
+                            for word_type in ['CW', 'FW']:
+                                analysis = analyzer.analyze_text(text_content, selected_indices, apply_log, word_type)
+                                # Extract summary scores
+                                if analysis and 'summary' in analysis:
+                                    for index, stats in analysis['summary'].items():
+                                        col_name = f"{index}_{word_type}"
+                                        result_row[col_name] = stats['mean']
+                            batch_results.append(result_row)
+                        except Exception as e:
+                            st.warning(f"Error analyzing {filename}: {e}")
+                            continue
+                    # Convert to DataFrame
+                    results_df = pd.DataFrame(batch_results)
                     # Display results
                     st.success(f"Analysis complete! Processed {len(results_df)} files.")
                     st.error(f"Error during comparison: {e}")
     @staticmethod
+    def extract_uploaded_files(uploaded_files) -> List[Tuple[str, str]]:
+        """Extract uploaded files and return list of (filename, content) tuples."""
+        file_contents = []
         for uploaded_file in uploaded_files:
             if uploaded_file.name.endswith('.zip'):
                 with zipfile.ZipFile(uploaded_file, 'r') as zip_ref:
                     for file_info in zip_ref.infolist():
                         if file_info.filename.endswith('.txt'):
+                            try:
+                                content = zip_ref.read(file_info.filename)
+                                # Decode content
+                                try:
+                                    text_content = content.decode('utf-8')
+                                except UnicodeDecodeError:
+                                    try:
+                                        text_content = content.decode('utf-16')
+                                    except UnicodeDecodeError:
+                                        st.error(f"Unable to decode file {file_info.filename}. Skipping.")
+                                        continue
+                                file_contents.append((file_info.filename, text_content))
+                            except Exception as e:
+                                st.error(f"Cannot read {file_info.filename}: {e}")
+                                continue
             elif uploaded_file.name.endswith('.txt'):
                 # Handle individual text files
                 try:
+                    # Reset file pointer to beginning
+                    uploaded_file.seek(0)
+                    content = uploaded_file.read()
+                    if isinstance(content, bytes):
+                        try:
+                            text_content = content.decode('utf-8')
+                        except UnicodeDecodeError:
+                            try:
+                                text_content = content.decode('utf-16')
+                            except UnicodeDecodeError:
+                                st.error(f"Unable to decode file {uploaded_file.name}. Skipping.")
+                                continue
+                    else:
+                        text_content = content
+                    file_contents.append((uploaded_file.name, text_content))
+                except Exception as e:
+                    st.error(f"Cannot read file {uploaded_file.name}: {e}")
+                    continue
+        return file_contents
     @staticmethod
     def display_single_text_results(results: Dict[str, Any]):

web_app/reference_manager.py CHANGED Viewed

@@ -147,7 +147,8 @@ class ReferenceManager:
                     file_configs.append(index_config)
                 all_configs[file_key] = {
-                    'file_path': config['file_path'],
                     'delimiter': config['delimiter'],
                     'indices': file_configs
                 }

                     file_configs.append(index_config)
                 all_configs[file_key] = {
+                    'file_name': config['file_name'],
+                    'content': config['content'],
                     'delimiter': config['delimiter'],
                     'indices': file_configs
                 }

web_app/session_manager.py CHANGED Viewed

@@ -20,7 +20,7 @@ class SessionManager:
             'pos_parser': None,
             'reference_lists': {},
             'uploaded_file_configs': {},
-            'temp_dir': None,
             'last_language_change': st.session_state.get('language', 'en'),
             'show_language_warning': False
         }
@@ -80,13 +80,17 @@ class SessionManager:
     @staticmethod
     def get_temp_dir() -> Optional[str]:
-        """Get or create temporary directory for uploaded files."""
-        import tempfile
-        if 'temp_dir' not in st.session_state or st.session_state.temp_dir is None:
-            st.session_state.temp_dir = tempfile.mkdtemp()
-        return st.session_state.temp_dir
     @staticmethod
     def is_custom_reference_list(name: str) -> bool:

             'pos_parser': None,
             'reference_lists': {},
             'uploaded_file_configs': {},
+            'uploaded_files_content': {},  # Store file contents in memory
             'last_language_change': st.session_state.get('language', 'en'),
             'show_language_warning': False
         }
     @staticmethod
     def get_temp_dir() -> Optional[str]:
+        """
+        DEPRECATED: This method is no longer used.
+        File handling is now done in-memory to support read-only filesystems.
+        """
+        import warnings
+        warnings.warn(
+            "get_temp_dir() is deprecated. Use in-memory file handling instead.",
+            DeprecationWarning,
+            stacklevel=2
+        )
+        return None
     @staticmethod
     def is_custom_reference_list(name: str) -> bool: