egumasa committed on
Commit
385ead1
·
1 Parent(s): dcb572b
Files changed (3) hide show
  1. pyproject.toml +2 -2
  2. text_analyzer/lexical_sophistication.py +227 -157
  3. uv.lock +1 -1
pyproject.toml CHANGED
@@ -6,13 +6,13 @@ readme = "README.md"
6
  requires-python = ">=3.12"
7
  dependencies = [
8
  "streamlit>=1.28.0",
9
- "spacy>=3.7.0",
10
  "pandas>=2.0.0",
11
  "numpy>=1.24.0,<2.0",
12
  "plotly>=5.15.0",
13
  "pyyaml>=6.0",
14
  "scipy>=1.11.0",
15
- "torch", # PyTorch with automatic CUDA detection
16
  "spacy-curated-transformers>=0.1.0,<0.3.0",
17
  "spacy-transformers>=1.3.0",
18
  "en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.0/en_core_web_md-3.7.0-py3-none-any.whl",
 
6
  requires-python = ">=3.12"
7
  dependencies = [
8
  "streamlit>=1.28.0",
9
+ "spacy[cuda12x]>=3.7.0",
10
  "pandas>=2.0.0",
11
  "numpy>=1.24.0,<2.0",
12
  "plotly>=5.15.0",
13
  "pyyaml>=6.0",
14
  "scipy>=1.11.0",
15
+ "torch", # PyTorch with automatic CUDA detection
16
  "spacy-curated-transformers>=0.1.0,<0.3.0",
17
  "spacy-transformers>=1.3.0",
18
  "en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.0/en_core_web_md-3.7.0-py3-none-any.whl",
text_analyzer/lexical_sophistication.py CHANGED
@@ -68,10 +68,18 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
68
  logger.info(f"Loaded pre-loaded {file_type} reference list for {index_name}")
69
  continue
70
 
71
- # Check if it's a DataFrame (for n-grams)
72
  if isinstance(file_path_or_dict, pd.DataFrame):
73
- self.reference_lists[index_name][file_type] = file_path_or_dict
74
- logger.info(f"Loaded pre-loaded {file_type} DataFrame for {index_name}")
 
 
 
 
 
 
 
 
75
  continue
76
 
77
  # Otherwise, treat as file path
@@ -90,24 +98,20 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
90
  # Check if this is a custom frequency list format with specific columns
91
  if self._is_custom_frequency_format(df):
92
  processed_data = self._parse_custom_frequency_format(df)
93
- self.reference_lists[index_name][file_type] = processed_data
94
- # For standard unigram files, expect 2 columns: word, score
 
 
 
 
95
  elif df.shape[1] >= 2:
96
- word_col = df.columns[0]
97
- score_col = df.columns[1]
98
- # Clean and convert scores to numeric
99
- df[score_col] = pd.to_numeric(df[score_col], errors='coerce')
100
- # Remove rows with NaN scores
101
- df = df.dropna(subset=[score_col])
102
- self.reference_lists[index_name][file_type] = dict(
103
- zip(df[word_col].str.lower(), df[score_col])
104
- )
105
  else:
106
- # For n-gram files, store full dataframe for multi-column support
107
- # Clean numeric columns
108
- for col in df.columns[1:]:
109
- df[col] = pd.to_numeric(df[col], errors='coerce')
110
- self.reference_lists[index_name][file_type] = df
111
 
112
  logger.info(f"Loaded {file_type} reference list for {index_name}")
113
 
@@ -127,6 +131,97 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
127
 
128
  return all(col in df_columns_lower for col in expected_columns_lower)
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  def _parse_custom_frequency_format(self, df):
131
  """
132
  Parse custom frequency list format and return a dictionary mapping words to frequency scores.
@@ -269,13 +364,13 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
269
  def _lookup_score(self, word: str, index_name: str, file_type: str,
270
  measure_col: Optional[str] = None) -> Optional[float]:
271
  """
272
- Look up score for a word in reference lists.
273
 
274
  Args:
275
- word: Word to look up
276
  index_name: Name of the reference index
277
  file_type: Type of reference file ('token', 'lemma', 'bigram', 'trigram')
278
- measure_col: Column name for n-gram measures (optional)
279
 
280
  Returns:
281
  Score if found, None otherwise
@@ -287,20 +382,32 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
287
  if ref_data is None:
288
  return None
289
 
290
- if file_type in ['token', 'lemma']:
291
- # Check if this is Japanese corpus data
292
- if isinstance(ref_data, dict) and ref_data.get('is_japanese_corpus', False):
293
- # This should not be called directly for Japanese data
294
- # Use _lookup_japanese_score instead
295
- return None
296
-
297
- # Simple dictionary lookup for unigrams
 
298
  return ref_data.get(word.lower())
299
- else:
300
- # DataFrame lookup for n-grams
301
- if not isinstance(ref_data, pd.DataFrame):
 
 
302
  return None
303
 
 
 
 
 
 
 
 
 
 
304
  # Find matching row
305
  word_col = ref_data.columns[0]
306
  matching_rows = ref_data[ref_data[word_col].str.lower() == word.lower()]
@@ -324,6 +431,8 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
324
  except (ValueError, TypeError):
325
  return None
326
  return None
 
 
327
 
328
  def _lookup_with_unidic_fallback(self, token, index_name: str, file_type: str) -> Dict:
329
  """
@@ -761,9 +870,43 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
761
  continue
762
 
763
  ref_data = self.reference_lists[index_name].get(ngram_type)
764
- if ref_data is None or not isinstance(ref_data, pd.DataFrame):
 
 
 
 
 
 
 
 
 
765
  continue
766
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
767
  # Get columns config for proper measure naming from YAML config
768
  # We need to access the original YAML configuration to get proper measure names
769
  from web_app.config_manager import ConfigManager
@@ -778,55 +921,8 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
778
  config_entry = config[language_key][config_section][index_name]
779
  break
780
 
781
- if config_entry and 'columns' in config_entry:
782
- # Get columns config for this n-gram type
783
- columns_config = config_entry.get('columns', {})
784
-
785
- # Create mapping from column index to measure name
786
- measure_mapping = {}
787
- for measure_name, col_idx in columns_config.items():
788
- if isinstance(col_idx, int) and col_idx < len(ref_data.columns):
789
- measure_mapping[ref_data.columns[col_idx]] = measure_name
790
-
791
- # Use the measure mapping to get proper names
792
- for col_name, measure_name in measure_mapping.items():
793
- if col_name == ref_data.columns[0]: # Skip the n-gram text column
794
- continue
795
-
796
- # Check if this measure should be computed
797
- if not self._should_compute_measure(index_name, measure_name, selected_measures):
798
- continue
799
-
800
- score = self._lookup_score(ngram, index_name, ngram_type, col_name)
801
- if score is not None:
802
- # Check if this measure should be log-transformed
803
- should_log_transform = self._should_apply_log_transform(
804
- index_name, ngram_type, measure_name, log_transforms, apply_log
805
- )
806
- score_val = np.log10(score) if should_log_transform and score > 0 else score
807
- ngram_detail[f"{index_name}_{measure_name}"] = score_val
808
- else:
809
- ngram_detail[f"{index_name}_{measure_name}"] = None
810
- else:
811
- # Fallback to old logic
812
- available_measures = ref_data.columns[1:].tolist()
813
-
814
- # Filter measures based on selection
815
- for measure in available_measures:
816
- # Check if this measure should be computed
817
- if not self._should_compute_measure(index_name, measure, selected_measures):
818
- continue
819
-
820
- score = self._lookup_score(ngram, index_name, ngram_type, measure)
821
- if score is not None:
822
- # Check if this measure should be log-transformed
823
- should_log_transform = self._should_apply_log_transform(
824
- index_name, ngram_type, measure, log_transforms, apply_log
825
- )
826
- score_val = np.log10(score) if should_log_transform and score > 0 else score
827
- ngram_detail[f"{index_name}_{measure}"] = score_val
828
- else:
829
- ngram_detail[f"{index_name}_{measure}"] = None
830
 
831
  results[ngram_details_key].append(ngram_detail)
832
 
@@ -836,9 +932,54 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
836
  continue
837
 
838
  ref_data = self.reference_lists[index_name].get(ngram_type)
839
- if ref_data is None or not isinstance(ref_data, pd.DataFrame):
 
 
 
 
 
840
  continue
841
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
842
  # Get columns config for proper measure naming from YAML config
843
  # We need to access the original YAML configuration to get proper measure names
844
  from web_app.config_manager import ConfigManager
@@ -853,79 +994,8 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
853
  config_entry = config[language_key][config_section][index_name]
854
  break
855
 
856
- if config_entry and 'columns' in config_entry:
857
- # Get columns config for this n-gram type
858
- columns_config = config_entry.get('columns', {})
859
-
860
- # Create mapping from column index to measure name
861
- measure_mapping = {}
862
- for measure_name, col_idx in columns_config.items():
863
- if isinstance(col_idx, int) and col_idx < len(ref_data.columns):
864
- measure_mapping[ref_data.columns[col_idx]] = measure_name
865
-
866
- # Use the measure mapping to get proper names
867
- for col_name, measure_name in measure_mapping.items():
868
- if col_name == ref_data.columns[0]: # Skip the n-gram text column
869
- continue
870
-
871
- # Check if this measure should be computed
872
- if not self._should_compute_measure(index_name, measure_name, selected_measures):
873
- continue
874
-
875
- ngram_scores = []
876
- for ngram in ngrams:
877
- score = self._lookup_score(ngram, index_name, ngram_type, col_name)
878
- if score is not None:
879
- # Check if this measure should be log-transformed
880
- should_log_transform = self._should_apply_log_transform(
881
- index_name, ngram_type, measure_name, log_transforms, apply_log
882
- )
883
- score_val = np.log10(score) if should_log_transform and score > 0 else score
884
- ngram_scores.append(score_val)
885
-
886
- if ngram_scores:
887
- key = f"{index_name}_{ngram_type}_{measure_name}"
888
- results['summary'][key] = {
889
- 'mean': np.mean(ngram_scores),
890
- 'std': np.std(ngram_scores),
891
- 'count': len(ngram_scores),
892
- 'min': np.min(ngram_scores),
893
- 'max': np.max(ngram_scores)
894
- }
895
- # Store raw scores for plotting
896
- results['raw_scores'][key] = ngram_scores
897
- else:
898
- # Fallback to old logic if config not properly structured
899
- available_measures = ref_data.columns[1:].tolist()
900
-
901
- # Filter measures based on selection and compute summary statistics
902
- for measure in available_measures:
903
- # Check if this measure should be computed
904
- if not self._should_compute_measure(index_name, measure, selected_measures):
905
- continue
906
-
907
- ngram_scores = []
908
- for ngram in ngrams:
909
- score = self._lookup_score(ngram, index_name, ngram_type, measure)
910
- if score is not None:
911
- # Check if this measure should be log-transformed
912
- should_log_transform = self._should_apply_log_transform(
913
- index_name, ngram_type, measure, log_transforms, apply_log
914
- )
915
- score_val = np.log10(score) if should_log_transform and score > 0 else score
916
- ngram_scores.append(score_val)
917
-
918
- if ngram_scores:
919
- key = f"{index_name}_{ngram_type}_{measure}"
920
- results['summary'][key] = {
921
- 'mean': np.mean(ngram_scores),
922
- 'std': np.std(ngram_scores),
923
- 'count': len(ngram_scores),
924
- 'min': np.min(ngram_scores),
925
- 'max': np.max(ngram_scores)
926
- }
927
- # Store raw scores for plotting
928
- results['raw_scores'][key] = ngram_scores
929
 
930
  return results
931
 
 
68
  logger.info(f"Loaded pre-loaded {file_type} reference list for {index_name}")
69
  continue
70
 
71
+ # Check if it's a DataFrame (for n-grams) - convert to nested dict
72
  if isinstance(file_path_or_dict, pd.DataFrame):
73
+ if file_type in ['bigram', 'trigram']:
74
+ # Convert DataFrame to nested dictionary for better performance
75
+ nested_dict = self._convert_dataframe_to_nested_dict(
76
+ file_path_or_dict, index_name, file_type
77
+ )
78
+ self.reference_lists[index_name][file_type] = nested_dict
79
+ logger.info(f"Converted pre-loaded {file_type} DataFrame to nested dict for {index_name}")
80
+ else:
81
+ self.reference_lists[index_name][file_type] = file_path_or_dict
82
+ logger.info(f"Loaded pre-loaded {file_type} DataFrame for {index_name}")
83
  continue
84
 
85
  # Otherwise, treat as file path
 
98
  # Check if this is a custom frequency list format with specific columns
99
  if self._is_custom_frequency_format(df):
100
  processed_data = self._parse_custom_frequency_format(df)
101
+ # Convert to nested dict format for consistency
102
+ nested_dict = {}
103
+ for word, freq in processed_data.items():
104
+ nested_dict[word] = {'frequency': freq}
105
+ self.reference_lists[index_name][file_type] = nested_dict
106
+ # For standard unigram files, convert to nested dict format
107
  elif df.shape[1] >= 2:
108
+ # Convert all columns to nested dictionary
109
+ nested_dict = self._convert_dataframe_to_nested_dict(df, index_name, file_type)
110
+ self.reference_lists[index_name][file_type] = nested_dict
 
 
 
 
 
 
111
  else:
112
+ # For n-gram files, convert DataFrame to nested dictionary for better performance
113
+ nested_dict = self._convert_dataframe_to_nested_dict(df, index_name, file_type)
114
+ self.reference_lists[index_name][file_type] = nested_dict
 
 
115
 
116
  logger.info(f"Loaded {file_type} reference list for {index_name}")
117
 
 
131
 
132
  return all(col in df_columns_lower for col in expected_columns_lower)
133
 
134
+ def _convert_dataframe_to_nested_dict(self, df: pd.DataFrame, index_name: str, file_type: str) -> Dict[str, Dict[str, float]]:
135
+ """
136
+ Convert DataFrame to nested dictionary structure for fast O(1) lookups.
137
+
138
+ Args:
139
+ df: Source DataFrame
140
+ index_name: Name of the reference index
141
+ file_type: Type of reference file ('token', 'lemma', 'bigram', 'trigram')
142
+
143
+ Returns:
144
+ {item_text: {measure_name: value, ...}}
145
+ """
146
+ nested_dict = {}
147
+
148
+ if df.empty or len(df.columns) < 2:
149
+ logger.warning(f"Empty or invalid DataFrame for {index_name} {file_type}")
150
+ return nested_dict
151
+
152
+ # First column is always the text (word/n-gram)
153
+ text_col = df.columns[0]
154
+
155
+ # Get column configuration from YAML if available
156
+ try:
157
+ from web_app.config_manager import ConfigManager
158
+ config = ConfigManager.load_reference_config()
159
+ language_key = "english" if self.language == 'en' else "japanese"
160
+
161
+ # Find the config entry for this index
162
+ config_entry = None
163
+ if file_type in ['token', 'lemma']:
164
+ section_key = 'unigrams'
165
+ else:
166
+ section_key = f"{file_type}s" # bigrams/trigrams
167
+
168
+ if section_key in config.get(language_key, {}):
169
+ if index_name in config[language_key][section_key]:
170
+ config_entry = config[language_key][section_key][index_name]
171
+
172
+ # Create measure mapping
173
+ measure_mapping = {}
174
+ if config_entry and 'columns' in config_entry:
175
+ columns_config = config_entry.get('columns', {})
176
+ for measure_name, col_idx in columns_config.items():
177
+ if isinstance(col_idx, int) and col_idx < len(df.columns):
178
+ measure_mapping[measure_name] = df.columns[col_idx]
179
+ else:
180
+ # Fallback: use column names directly as measure names (skip first column)
181
+ for i, col_name in enumerate(df.columns[1:], 1):
182
+ measure_mapping[col_name] = col_name
183
+
184
+ except Exception as e:
185
+ logger.warning(f"Could not load YAML config for {index_name}, using fallback naming: {e}")
186
+ # Fallback: use column names directly as measure names (skip first column)
187
+ measure_mapping = {}
188
+ for i, col_name in enumerate(df.columns[1:], 1):
189
+ measure_mapping[col_name] = col_name
190
+
191
+ # Clean and convert data
192
+ df_clean = df.copy()
193
+
194
+ # Clean text column
195
+ df_clean[text_col] = df_clean[text_col].astype(str).str.strip().str.lower()
196
+ df_clean = df_clean[df_clean[text_col] != '']
197
+ df_clean = df_clean[df_clean[text_col] != 'nan']
198
+
199
+ # Clean numeric columns
200
+ for col in df_clean.columns[1:]:
201
+ df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')
202
+
203
+ # Remove rows with all NaN measures
204
+ df_clean = df_clean.dropna(subset=df_clean.columns[1:].tolist(), how='all')
205
+
206
+ # Convert to nested dictionary
207
+ for _, row in df_clean.iterrows():
208
+ text_key = row[text_col]
209
+ if pd.isna(text_key) or text_key == '':
210
+ continue
211
+
212
+ measures = {}
213
+ for measure_name, col_name in measure_mapping.items():
214
+ if col_name == text_col: # Skip the text column
215
+ continue
216
+ if col_name in row and not pd.isna(row[col_name]):
217
+ measures[measure_name] = float(row[col_name])
218
+
219
+ if measures: # Only add if we have at least one valid measure
220
+ nested_dict[text_key] = measures
221
+
222
+ logger.info(f"Converted {len(nested_dict)} entries from DataFrame to nested dict for {index_name} {file_type}")
223
+ return nested_dict
224
+
225
  def _parse_custom_frequency_format(self, df):
226
  """
227
  Parse custom frequency list format and return a dictionary mapping words to frequency scores.
 
364
  def _lookup_score(self, word: str, index_name: str, file_type: str,
365
  measure_col: Optional[str] = None) -> Optional[float]:
366
  """
367
+ Unified lookup for both words and N-grams using nested dictionaries for O(1) performance.
368
 
369
  Args:
370
+ word: Word/N-gram to look up
371
  index_name: Name of the reference index
372
  file_type: Type of reference file ('token', 'lemma', 'bigram', 'trigram')
373
+ measure_col: Specific measure to retrieve (e.g., 'frequency', 'mi_score')
374
 
375
  Returns:
376
  Score if found, None otherwise
 
382
  if ref_data is None:
383
  return None
384
 
385
+ # Handle Japanese corpus data (special case)
386
+ if isinstance(ref_data, dict) and ref_data.get('is_japanese_corpus', False):
387
+ # This should not be called directly for Japanese data
388
+ # Use _lookup_japanese_score instead
389
+ return None
390
+
391
+ # Handle legacy simple dictionaries (old unigram format: word -> single_score)
392
+ if isinstance(ref_data, dict) and not any(isinstance(v, dict) for v in ref_data.values()):
393
+ # Legacy simple dictionary format
394
  return ref_data.get(word.lower())
395
+
396
+ # Handle nested dictionary format (new unified format: word -> {measure: value, ...})
397
+ if isinstance(ref_data, dict):
398
+ word_data = ref_data.get(word.lower())
399
+ if word_data is None or not isinstance(word_data, dict):
400
  return None
401
 
402
+ # If measure specified, return that specific measure
403
+ if measure_col:
404
+ return word_data.get(measure_col)
405
+ else:
406
+ # Return first available measure for backward compatibility
407
+ return next(iter(word_data.values())) if word_data else None
408
+
409
+ # Fallback to DataFrame lookup (for compatibility during transition)
410
+ if isinstance(ref_data, pd.DataFrame):
411
  # Find matching row
412
  word_col = ref_data.columns[0]
413
  matching_rows = ref_data[ref_data[word_col].str.lower() == word.lower()]
 
431
  except (ValueError, TypeError):
432
  return None
433
  return None
434
+
435
+ return None
436
 
437
  def _lookup_with_unidic_fallback(self, token, index_name: str, file_type: str) -> Dict:
438
  """
 
870
  continue
871
 
872
  ref_data = self.reference_lists[index_name].get(ngram_type)
873
+ if ref_data is None:
874
+ continue
875
+
876
+ # Skip if using old DataFrame format (should be converted by now)
877
+ if isinstance(ref_data, pd.DataFrame):
878
+ logger.warning(f"Found unconverted DataFrame for {index_name} {ngram_type}, skipping")
879
+ continue
880
+
881
+ # Ensure we have the new nested dictionary format
882
+ if not isinstance(ref_data, dict):
883
  continue
884
 
885
+ # Get available measures from any N-gram entry
886
+ sample_ngram_data = next(iter(ref_data.values())) if ref_data else {}
887
+ if not isinstance(sample_ngram_data, dict):
888
+ continue
889
+
890
+ available_measures = list(sample_ngram_data.keys())
891
+
892
+ # Process each available measure
893
+ for measure_name in available_measures:
894
+ # Check if this measure should be computed
895
+ if not self._should_compute_measure(index_name, measure_name, selected_measures):
896
+ continue
897
+
898
+ # Use the unified lookup method for O(1) performance
899
+ score = self._lookup_score(ngram, index_name, ngram_type, measure_name)
900
+ if score is not None:
901
+ # Check if this measure should be log-transformed
902
+ should_log_transform = self._should_apply_log_transform(
903
+ index_name, ngram_type, measure_name, log_transforms, apply_log
904
+ )
905
+ score_val = np.log10(score) if should_log_transform and score > 0 else score
906
+ ngram_detail[f"{index_name}_{measure_name}"] = score_val
907
+ else:
908
+ ngram_detail[f"{index_name}_{measure_name}"] = None
909
+
910
  # Get columns config for proper measure naming from YAML config
911
  # We need to access the original YAML configuration to get proper measure names
912
  from web_app.config_manager import ConfigManager
 
921
  config_entry = config[language_key][config_section][index_name]
922
  break
923
 
924
+ # Note: With nested dictionary format, we already processed all measures above
925
+ # No additional processing needed here since measures are extracted directly from the dictionary
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
926
 
927
  results[ngram_details_key].append(ngram_detail)
928
 
 
932
  continue
933
 
934
  ref_data = self.reference_lists[index_name].get(ngram_type)
935
+ if ref_data is None:
936
+ continue
937
+
938
+ # Skip if using old DataFrame format (should be converted by now)
939
+ if isinstance(ref_data, pd.DataFrame):
940
+ logger.warning(f"Found unconverted DataFrame for {index_name} {ngram_type} in summary, skipping")
941
  continue
942
 
943
+ # Ensure we have the new nested dictionary format
944
+ if not isinstance(ref_data, dict):
945
+ continue
946
+
947
+ # Get available measures from any N-gram entry
948
+ sample_ngram_data = next(iter(ref_data.values())) if ref_data else {}
949
+ if not isinstance(sample_ngram_data, dict):
950
+ continue
951
+
952
+ available_measures = list(sample_ngram_data.keys())
953
+
954
+ # Process each available measure for summary statistics
955
+ for measure_name in available_measures:
956
+ # Check if this measure should be computed
957
+ if not self._should_compute_measure(index_name, measure_name, selected_measures):
958
+ continue
959
+
960
+ ngram_scores = []
961
+ for ngram in ngrams:
962
+ score = self._lookup_score(ngram, index_name, ngram_type, measure_name)
963
+ if score is not None:
964
+ # Check if this measure should be log-transformed
965
+ should_log_transform = self._should_apply_log_transform(
966
+ index_name, ngram_type, measure_name, log_transforms, apply_log
967
+ )
968
+ score_val = np.log10(score) if should_log_transform and score > 0 else score
969
+ ngram_scores.append(score_val)
970
+
971
+ if ngram_scores:
972
+ key = f"{index_name}_{ngram_type}_{measure_name}"
973
+ results['summary'][key] = {
974
+ 'mean': np.mean(ngram_scores),
975
+ 'std': np.std(ngram_scores),
976
+ 'count': len(ngram_scores),
977
+ 'min': np.min(ngram_scores),
978
+ 'max': np.max(ngram_scores)
979
+ }
980
+ # Store raw scores for plotting
981
+ results['raw_scores'][key] = ngram_scores
982
+
983
  # Get columns config for proper measure naming from YAML config
984
  # We need to access the original YAML configuration to get proper measure names
985
  from web_app.config_manager import ConfigManager
 
994
  config_entry = config[language_key][config_section][index_name]
995
  break
996
 
997
+ # Note: With nested dictionary format, summary statistics are already processed above
998
+ # No additional processing needed here since measures are extracted directly from the dictionary
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
999
 
1000
  return results
1001
 
uv.lock CHANGED
@@ -1751,7 +1751,7 @@ requires-dist = [
1751
  { name = "plotly", specifier = ">=5.15.0" },
1752
  { name = "pyyaml", specifier = ">=6.0" },
1753
  { name = "scipy", specifier = ">=1.11.0" },
1754
- { name = "spacy", specifier = ">=3.7.0" },
1755
  { name = "spacy-curated-transformers", specifier = ">=0.1.0,<0.3.0" },
1756
  { name = "spacy-transformers", specifier = ">=1.3.0" },
1757
  { name = "streamlit", specifier = ">=1.28.0" },
 
1751
  { name = "plotly", specifier = ">=5.15.0" },
1752
  { name = "pyyaml", specifier = ">=6.0" },
1753
  { name = "scipy", specifier = ">=1.11.0" },
1754
+ { name = "spacy", extras = ["cuda11", "cuda12"], specifier = ">=3.7.0" },
1755
  { name = "spacy-curated-transformers", specifier = ">=0.1.0,<0.3.0" },
1756
  { name = "spacy-transformers", specifier = ">=1.3.0" },
1757
  { name = "streamlit", specifier = ">=1.28.0" },