egumasa committed on
Commit
ca02ec3
·
1 Parent(s): 2dff38a

removed temporary file writing

Browse files
test_frequency_flexible.py → test/test_frequency_flexible.py RENAMED
File without changes
test_fugashi_diagnostic.py → test/test_fugashi_diagnostic.py RENAMED
File without changes
test_japanese_integration.py → test/test_japanese_integration.py RENAMED
File without changes
test/test_memory_handling.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Test script to verify in-memory file handling works correctly."""
3
+
4
+ import sys
5
+ sys.path.append('.')
6
+
7
+ from io import StringIO
8
+ import pandas as pd
9
+
10
+ # Test 1: Verify StringIO works with pandas
11
+ def test_stringio_pandas():
12
+ print("Test 1: StringIO with pandas")
13
+ csv_content = """word,frequency
14
+ apple,100
15
+ banana,80
16
+ cherry,60"""
17
+
18
+ content_io = StringIO(csv_content)
19
+ df = pd.read_csv(content_io)
20
+ print(f"✓ Successfully read CSV from StringIO: {len(df)} rows")
21
+ print(df)
22
+ print()
23
+
24
+ # Test 2: Test custom config parsing
25
+ def test_custom_config():
26
+ print("Test 2: Custom config parsing")
27
+ from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
28
+
29
+ analyzer = LexicalSophisticationAnalyzer()
30
+
31
+ config = {
32
+ 'content': """word\tfreq
33
+ hello\t500
34
+ world\t400
35
+ test\t300""",
36
+ 'word_column': 'word',
37
+ 'freq_column': 'freq',
38
+ 'delimiter': '\t',
39
+ 'is_custom_config': True
40
+ }
41
+
42
+ result = analyzer._parse_custom_config(config)
43
+ print(f"✓ Successfully parsed custom config: {len(result)} entries")
44
+ print(f"Sample entries: {list(result.items())[:3]}")
45
+ print()
46
+
47
+ # Test 3: Test file content extraction
48
+ def test_file_extraction():
49
+ print("Test 3: File content extraction")
50
+ from web_app.handlers.analysis_handlers import AnalysisHandlers
51
+
52
+ # Simulate uploaded file
53
+ class MockUploadedFile:
54
+ def __init__(self, name, content):
55
+ self.name = name
56
+ self._content = content.encode('utf-8')
57
+ self._position = 0
58
+
59
+ def read(self):
60
+ self._position = len(self._content)
61
+ return self._content
62
+
63
+ def seek(self, position):
64
+ self._position = position
65
+
66
+ mock_file = MockUploadedFile("test.txt", "This is a test file content.")
67
+ file_contents = AnalysisHandlers.extract_uploaded_files([mock_file])
68
+
69
+ print(f"✓ Successfully extracted {len(file_contents)} files")
70
+ for filename, content in file_contents:
71
+ print(f" - {filename}: {content[:50]}...")
72
+ print()
73
+
74
+ if __name__ == "__main__":
75
+ print("Testing in-memory file handling...\n")
76
+
77
+ try:
78
+ test_stringio_pandas()
79
+ test_custom_config()
80
+ test_file_extraction()
81
+ print("✅ All tests passed!")
82
+ except Exception as e:
83
+ print(f"❌ Test failed: {e}")
84
+ import traceback
85
+ traceback.print_exc()
test_unidic_diagnostic.py → test/test_unidic_diagnostic.py RENAMED
File without changes
text_analyzer/lexical_sophistication.py CHANGED
@@ -170,16 +170,26 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
170
  Parse custom frequency list using user-selected columns.
171
 
172
  Args:
173
- config: Dictionary with file_path, word_column, freq_column, delimiter
174
  """
175
- file_path = config['file_path']
176
  word_column = config['word_column']
177
  freq_column = config['freq_column']
178
  delimiter = config['delimiter']
179
 
180
  try:
181
- # Load the file
182
- df = pd.read_csv(file_path, delimiter=delimiter, header=0)
 
 
 
 
 
 
 
 
 
 
 
183
 
184
  # Validate columns exist
185
  if word_column not in df.columns:
 
170
  Parse custom frequency list using user-selected columns.
171
 
172
  Args:
173
+ config: Dictionary with file_name/content, word_column, freq_column, delimiter
174
  """
 
175
  word_column = config['word_column']
176
  freq_column = config['freq_column']
177
  delimiter = config['delimiter']
178
 
179
  try:
180
+ from io import StringIO
181
+
182
+ # Check if we have content directly or need to read from file
183
+ if 'content' in config:
184
+ # Use content directly
185
+ content_io = StringIO(config['content'])
186
+ df = pd.read_csv(content_io, delimiter=delimiter, header=0)
187
+ elif 'file_path' in config:
188
+ # Fallback to file path for backward compatibility
189
+ df = pd.read_csv(config['file_path'], delimiter=delimiter, header=0)
190
+ else:
191
+ logger.error("No content or file_path found in config")
192
+ return {}
193
 
194
  # Validate columns exist
195
  if word_column not in df.columns:
web_app/config_manager.py CHANGED
@@ -49,28 +49,43 @@ class ConfigManager:
49
  def process_uploaded_file(uploaded_file) -> Optional[Dict[str, Any]]:
50
  """Process a single uploaded file and return its configuration."""
51
  try:
52
- # Save file temporarily
53
- temp_dir = SessionManager.get_temp_dir()
54
- file_path = os.path.join(temp_dir, uploaded_file.name)
55
 
56
- with open(file_path, 'wb') as f:
57
- f.write(uploaded_file.getvalue())
 
58
 
59
- # Determine delimiter
60
- with open(file_path, 'r', encoding='utf-8') as f:
61
- sample = f.read(1024)
62
- delimiter = ',' if sample.count(',') > sample.count('\t') else '\t'
 
 
 
 
 
 
 
 
63
 
64
  # Load preview
65
- df_preview = pd.read_csv(file_path, delimiter=delimiter, header=0, nrows=5)
 
 
 
 
 
 
 
66
 
67
  return {
68
- 'file_path': file_path,
69
  'columns': list(df_preview.columns),
70
  'delimiter': delimiter,
71
  'preview': df_preview,
72
  'base_name': Path(uploaded_file.name).stem,
73
- 'configurations': []
 
74
  }
75
 
76
  except Exception as e:
@@ -78,10 +93,11 @@ class ConfigManager:
78
  return None
79
 
80
  @staticmethod
81
- def create_custom_config(file_path: str, delimiter: str, word_col: str, score_col: str) -> Dict[str, Any]:
82
  """Create custom configuration object for backend."""
83
  return {
84
- 'file_path': file_path,
 
85
  'word_column': word_col,
86
  'freq_column': score_col,
87
  'delimiter': delimiter,
@@ -109,9 +125,9 @@ class ConfigManager:
109
  errors = []
110
 
111
  for file_key, file_config in all_configs.items():
112
- # Validate file exists
113
- if not os.path.exists(file_config['file_path']):
114
- errors.append(f"File not found: {file_config['file_path']}")
115
  continue
116
 
117
  for index_config in file_config['indices']:
@@ -131,7 +147,8 @@ class ConfigManager:
131
 
132
  # Create and store configuration
133
  custom_data = ConfigManager.create_custom_config(
134
- file_config['file_path'],
 
135
  file_config['delimiter'],
136
  word_col,
137
  score_col
 
49
  def process_uploaded_file(uploaded_file) -> Optional[Dict[str, Any]]:
50
  """Process a single uploaded file and return its configuration."""
51
  try:
52
+ from io import StringIO
 
 
53
 
54
+ # Read file content directly from uploaded file
55
+ uploaded_file.seek(0)
56
+ content = uploaded_file.read()
57
 
58
+ # Decode content if it's bytes
59
+ if isinstance(content, bytes):
60
+ text_content = content.decode('utf-8')
61
+ else:
62
+ text_content = content
63
+
64
+ # Determine delimiter from first 1024 chars
65
+ sample = text_content[:1024]
66
+ delimiter = ',' if sample.count(',') > sample.count('\t') else '\t'
67
+
68
+ # Create StringIO for pandas to read
69
+ content_io = StringIO(text_content)
70
 
71
  # Load preview
72
+ df_preview = pd.read_csv(content_io, delimiter=delimiter, header=0, nrows=5)
73
+
74
+ # Store content in session state instead of file path
75
+ if 'uploaded_files_content' not in st.session_state:
76
+ st.session_state.uploaded_files_content = {}
77
+
78
+ # Use filename as key
79
+ st.session_state.uploaded_files_content[uploaded_file.name] = text_content
80
 
81
  return {
82
+ 'file_name': uploaded_file.name,
83
  'columns': list(df_preview.columns),
84
  'delimiter': delimiter,
85
  'preview': df_preview,
86
  'base_name': Path(uploaded_file.name).stem,
87
+ 'configurations': [],
88
+ 'content': text_content # Include content for immediate use
89
  }
90
 
91
  except Exception as e:
 
93
  return None
94
 
95
  @staticmethod
96
+ def create_custom_config(file_name: str, content: str, delimiter: str, word_col: str, score_col: str) -> Dict[str, Any]:
97
  """Create custom configuration object for backend."""
98
  return {
99
+ 'file_name': file_name,
100
+ 'content': content,
101
  'word_column': word_col,
102
  'freq_column': score_col,
103
  'delimiter': delimiter,
 
125
  errors = []
126
 
127
  for file_key, file_config in all_configs.items():
128
+ # Validate that we have content
129
+ if 'content' not in file_config or not file_config['content']:
130
+ errors.append(f"No content found for file: {file_key}")
131
  continue
132
 
133
  for index_config in file_config['indices']:
 
147
 
148
  # Create and store configuration
149
  custom_data = ConfigManager.create_custom_config(
150
+ file_config['file_name'],
151
+ file_config['content'],
152
  file_config['delimiter'],
153
  word_col,
154
  score_col
web_app/handlers/analysis_handlers.py CHANGED
@@ -10,7 +10,7 @@ import plotly.graph_objects as go
10
  from scipy import stats
11
  import tempfile
12
  import os
13
- from typing import Dict, List, Any, Optional
14
  from pathlib import Path
15
  import zipfile
16
  import time
@@ -138,13 +138,13 @@ class AnalysisHandlers:
138
  with st.spinner("Processing files..."):
139
  try:
140
  # Extract files
141
- file_paths = AnalysisHandlers.extract_uploaded_files(uploaded_files)
142
 
143
- if not file_paths:
144
  st.error("No valid .txt files found in uploaded files.")
145
  return
146
 
147
- st.info(f"Found {len(file_paths)} files to process.")
148
 
149
  # Load reference lists
150
  analyzer.load_reference_lists(reference_lists)
@@ -153,18 +153,36 @@ class AnalysisHandlers:
153
  progress_bar = st.progress(0)
154
  status_text = st.empty()
155
 
156
- def progress_callback(current, total):
157
- progress = current / total
 
 
 
 
 
158
  progress_bar.progress(progress)
159
- status_text.text(f"Processing file {current}/{total}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
- # Perform batch analysis
162
- results_df = analyzer.analyze_batch(
163
- file_paths,
164
- list(reference_lists.keys()),
165
- apply_log,
166
- progress_callback
167
- )
168
 
169
  # Display results
170
  st.success(f"Analysis complete! Processed {len(results_df)} files.")
@@ -245,10 +263,9 @@ class AnalysisHandlers:
245
  st.error(f"Error during comparison: {e}")
246
 
247
  @staticmethod
248
- def extract_uploaded_files(uploaded_files) -> List[str]:
249
- """Extract uploaded files and return list of file paths."""
250
- temp_dir = tempfile.mkdtemp()
251
- file_paths = []
252
 
253
  for uploaded_file in uploaded_files:
254
  if uploaded_file.name.endswith('.zip'):
@@ -256,25 +273,45 @@ class AnalysisHandlers:
256
  with zipfile.ZipFile(uploaded_file, 'r') as zip_ref:
257
  for file_info in zip_ref.infolist():
258
  if file_info.filename.endswith('.txt'):
259
- zip_ref.extract(file_info, temp_dir)
260
- file_paths.append(os.path.join(temp_dir, file_info.filename))
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  elif uploaded_file.name.endswith('.txt'):
262
  # Handle individual text files
263
- file_path = os.path.join(temp_dir, uploaded_file.name)
264
  try:
265
- content = uploaded_file.read().decode('utf-8')
266
- except UnicodeDecodeError:
267
- try:
268
- content = uploaded_file.read().decode('utf-16')
269
- except UnicodeDecodeError:
270
- st.error(f"Unable to decode file {uploaded_file.name}. Skipping.")
271
- continue
272
-
273
- with open(file_path, 'w', encoding='utf-8') as f:
274
- f.write(content)
275
- file_paths.append(file_path)
 
 
 
 
 
 
 
 
276
 
277
- return file_paths
278
 
279
  @staticmethod
280
  def display_single_text_results(results: Dict[str, Any]):
 
10
  from scipy import stats
11
  import tempfile
12
  import os
13
+ from typing import Dict, List, Any, Optional, Tuple
14
  from pathlib import Path
15
  import zipfile
16
  import time
 
138
  with st.spinner("Processing files..."):
139
  try:
140
  # Extract files
141
+ file_contents = AnalysisHandlers.extract_uploaded_files(uploaded_files)
142
 
143
+ if not file_contents:
144
  st.error("No valid .txt files found in uploaded files.")
145
  return
146
 
147
+ st.info(f"Found {len(file_contents)} files to process.")
148
 
149
  # Load reference lists
150
  analyzer.load_reference_lists(reference_lists)
 
153
  progress_bar = st.progress(0)
154
  status_text = st.empty()
155
 
156
+ # Process files in memory
157
+ batch_results = []
158
+ selected_indices = list(reference_lists.keys())
159
+
160
+ for i, (filename, text_content) in enumerate(file_contents):
161
+ # Update progress
162
+ progress = (i + 1) / len(file_contents)
163
  progress_bar.progress(progress)
164
+ status_text.text(f"Processing file {i + 1}/{len(file_contents)}: {filename}")
165
+
166
+ try:
167
+ # Analyze for both content and function words
168
+ result_row = {'filename': filename}
169
+
170
+ for word_type in ['CW', 'FW']:
171
+ analysis = analyzer.analyze_text(text_content, selected_indices, apply_log, word_type)
172
+
173
+ # Extract summary scores
174
+ if analysis and 'summary' in analysis:
175
+ for index, stats in analysis['summary'].items():
176
+ col_name = f"{index}_{word_type}"
177
+ result_row[col_name] = stats['mean']
178
+
179
+ batch_results.append(result_row)
180
+ except Exception as e:
181
+ st.warning(f"Error analyzing {filename}: {e}")
182
+ continue
183
 
184
+ # Convert to DataFrame
185
+ results_df = pd.DataFrame(batch_results)
 
 
 
 
 
186
 
187
  # Display results
188
  st.success(f"Analysis complete! Processed {len(results_df)} files.")
 
263
  st.error(f"Error during comparison: {e}")
264
 
265
  @staticmethod
266
+ def extract_uploaded_files(uploaded_files) -> List[Tuple[str, str]]:
267
+ """Extract uploaded files and return list of (filename, content) tuples."""
268
+ file_contents = []
 
269
 
270
  for uploaded_file in uploaded_files:
271
  if uploaded_file.name.endswith('.zip'):
 
273
  with zipfile.ZipFile(uploaded_file, 'r') as zip_ref:
274
  for file_info in zip_ref.infolist():
275
  if file_info.filename.endswith('.txt'):
276
+ try:
277
+ content = zip_ref.read(file_info.filename)
278
+ # Decode content
279
+ try:
280
+ text_content = content.decode('utf-8')
281
+ except UnicodeDecodeError:
282
+ try:
283
+ text_content = content.decode('utf-16')
284
+ except UnicodeDecodeError:
285
+ st.error(f"Unable to decode file {file_info.filename}. Skipping.")
286
+ continue
287
+ file_contents.append((file_info.filename, text_content))
288
+ except Exception as e:
289
+ st.error(f"Cannot read {file_info.filename}: {e}")
290
+ continue
291
  elif uploaded_file.name.endswith('.txt'):
292
  # Handle individual text files
 
293
  try:
294
+ # Reset file pointer to beginning
295
+ uploaded_file.seek(0)
296
+ content = uploaded_file.read()
297
+ if isinstance(content, bytes):
298
+ try:
299
+ text_content = content.decode('utf-8')
300
+ except UnicodeDecodeError:
301
+ try:
302
+ text_content = content.decode('utf-16')
303
+ except UnicodeDecodeError:
304
+ st.error(f"Unable to decode file {uploaded_file.name}. Skipping.")
305
+ continue
306
+ else:
307
+ text_content = content
308
+
309
+ file_contents.append((uploaded_file.name, text_content))
310
+ except Exception as e:
311
+ st.error(f"Cannot read file {uploaded_file.name}: {e}")
312
+ continue
313
 
314
+ return file_contents
315
 
316
  @staticmethod
317
  def display_single_text_results(results: Dict[str, Any]):
web_app/reference_manager.py CHANGED
@@ -147,7 +147,8 @@ class ReferenceManager:
147
  file_configs.append(index_config)
148
 
149
  all_configs[file_key] = {
150
- 'file_path': config['file_path'],
 
151
  'delimiter': config['delimiter'],
152
  'indices': file_configs
153
  }
 
147
  file_configs.append(index_config)
148
 
149
  all_configs[file_key] = {
150
+ 'file_name': config['file_name'],
151
+ 'content': config['content'],
152
  'delimiter': config['delimiter'],
153
  'indices': file_configs
154
  }
web_app/session_manager.py CHANGED
@@ -20,7 +20,7 @@ class SessionManager:
20
  'pos_parser': None,
21
  'reference_lists': {},
22
  'uploaded_file_configs': {},
23
- 'temp_dir': None,
24
  'last_language_change': st.session_state.get('language', 'en'),
25
  'show_language_warning': False
26
  }
@@ -80,13 +80,17 @@ class SessionManager:
80
 
81
  @staticmethod
82
  def get_temp_dir() -> Optional[str]:
83
- """Get or create temporary directory for uploaded files."""
84
- import tempfile
85
-
86
- if 'temp_dir' not in st.session_state or st.session_state.temp_dir is None:
87
- st.session_state.temp_dir = tempfile.mkdtemp()
88
-
89
- return st.session_state.temp_dir
 
 
 
 
90
 
91
  @staticmethod
92
  def is_custom_reference_list(name: str) -> bool:
 
20
  'pos_parser': None,
21
  'reference_lists': {},
22
  'uploaded_file_configs': {},
23
+ 'uploaded_files_content': {}, # Store file contents in memory
24
  'last_language_change': st.session_state.get('language', 'en'),
25
  'show_language_warning': False
26
  }
 
80
 
81
  @staticmethod
82
  def get_temp_dir() -> Optional[str]:
83
+ """
84
+ DEPRECATED: This method is no longer used.
85
+ File handling is now done in-memory to support read-only filesystems.
86
+ """
87
+ import warnings
88
+ warnings.warn(
89
+ "get_temp_dir() is deprecated. Use in-memory file handling instead.",
90
+ DeprecationWarning,
91
+ stacklevel=2
92
+ )
93
+ return None
94
 
95
  @staticmethod
96
  def is_custom_reference_list(name: str) -> bool: