Spaces:

egumasa
/

simple-text-analyzer

Building

App Files Files Community

egumasa committed on Aug 6, 2025

Commit

385ead1

1 Parent(s): dcb572b

bug fix

Browse files

Files changed (3) hide show

pyproject.toml +2 -2
text_analyzer/lexical_sophistication.py +227 -157
uv.lock +1 -1

pyproject.toml CHANGED Viewed

@@ -6,13 +6,13 @@ readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
     "streamlit>=1.28.0",
-    "spacy>=3.7.0",
     "pandas>=2.0.0",
     "numpy>=1.24.0,<2.0",
     "plotly>=5.15.0",
     "pyyaml>=6.0",
     "scipy>=1.11.0",
-    "torch",  # PyTorch with automatic CUDA detection
     "spacy-curated-transformers>=0.1.0,<0.3.0",
     "spacy-transformers>=1.3.0",
     "en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.0/en_core_web_md-3.7.0-py3-none-any.whl",

 requires-python = ">=3.12"
 dependencies = [
     "streamlit>=1.28.0",
+    "spacy[cuda12x]>=3.7.0",
     "pandas>=2.0.0",
     "numpy>=1.24.0,<2.0",
     "plotly>=5.15.0",
     "pyyaml>=6.0",
     "scipy>=1.11.0",
+    "torch", # PyTorch with automatic CUDA detection
     "spacy-curated-transformers>=0.1.0,<0.3.0",
     "spacy-transformers>=1.3.0",
     "en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.0/en_core_web_md-3.7.0-py3-none-any.whl",

text_analyzer/lexical_sophistication.py CHANGED Viewed

@@ -68,10 +68,18 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
                             logger.info(f"Loaded pre-loaded {file_type} reference list for {index_name}")
                             continue
-                    # Check if it's a DataFrame (for n-grams)
                     if isinstance(file_path_or_dict, pd.DataFrame):
-                        self.reference_lists[index_name][file_type] = file_path_or_dict
-                        logger.info(f"Loaded pre-loaded {file_type} DataFrame for {index_name}")
                         continue
                     # Otherwise, treat as file path
@@ -90,24 +98,20 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
                         # Check if this is a custom frequency list format with specific columns
                         if self._is_custom_frequency_format(df):
                             processed_data = self._parse_custom_frequency_format(df)
-                            self.reference_lists[index_name][file_type] = processed_data
-                        # For standard unigram files, expect 2 columns: word, score
                         elif df.shape[1] >= 2:
-                            word_col = df.columns[0]
-                            score_col = df.columns[1]
-                            # Clean and convert scores to numeric
-                            df[score_col] = pd.to_numeric(df[score_col], errors='coerce')
-                            # Remove rows with NaN scores
-                            df = df.dropna(subset=[score_col])
-                            self.reference_lists[index_name][file_type] = dict(
-                                zip(df[word_col].str.lower(), df[score_col])
-                            )
                     else:
-                        # For n-gram files, store full dataframe for multi-column support
-                        # Clean numeric columns
-                        for col in df.columns[1:]:
-                            df[col] = pd.to_numeric(df[col], errors='coerce')
-                        self.reference_lists[index_name][file_type] = df
                     logger.info(f"Loaded {file_type} reference list for {index_name}")
@@ -127,6 +131,97 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
         return all(col in df_columns_lower for col in expected_columns_lower)
     def _parse_custom_frequency_format(self, df):
         """
         Parse custom frequency list format and return a dictionary mapping words to frequency scores.
@@ -269,13 +364,13 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
     def _lookup_score(self, word: str, index_name: str, file_type: str,
                      measure_col: Optional[str] = None) -> Optional[float]:
         """
-        Look up score for a word in reference lists.
         Args:
-            word: Word to look up
             index_name: Name of the reference index
             file_type: Type of reference file ('token', 'lemma', 'bigram', 'trigram')
-            measure_col: Column name for n-gram measures (optional)
         Returns:
             Score if found, None otherwise
@@ -287,20 +382,32 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
         if ref_data is None:
             return None
-        if file_type in ['token', 'lemma']:
-            # Check if this is Japanese corpus data
-            if isinstance(ref_data, dict) and ref_data.get('is_japanese_corpus', False):
-                # This should not be called directly for Japanese data
-                # Use _lookup_japanese_score instead
-                return None
-            # Simple dictionary lookup for unigrams
             return ref_data.get(word.lower())
-        else:
-            # DataFrame lookup for n-grams
-            if not isinstance(ref_data, pd.DataFrame):
                 return None
             # Find matching row
             word_col = ref_data.columns[0]
             matching_rows = ref_data[ref_data[word_col].str.lower() == word.lower()]
@@ -324,6 +431,8 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
                     except (ValueError, TypeError):
                         return None
                 return None
     def _lookup_with_unidic_fallback(self, token, index_name: str, file_type: str) -> Dict:
         """
@@ -761,9 +870,43 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
                         continue
                     ref_data = self.reference_lists[index_name].get(ngram_type)
-                    if ref_data is None or not isinstance(ref_data, pd.DataFrame):
                         continue
                     # Get columns config for proper measure naming from YAML config
                     # We need to access the original YAML configuration to get proper measure names
                     from web_app.config_manager import ConfigManager
@@ -778,55 +921,8 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
                                 config_entry = config[language_key][config_section][index_name]
                                 break
-                    if config_entry and 'columns' in config_entry:
-                        # Get columns config for this n-gram type
-                        columns_config = config_entry.get('columns', {})
-                        # Create mapping from column index to measure name
-                        measure_mapping = {}
-                        for measure_name, col_idx in columns_config.items():
-                            if isinstance(col_idx, int) and col_idx < len(ref_data.columns):
-                                measure_mapping[ref_data.columns[col_idx]] = measure_name
-                        # Use the measure mapping to get proper names
-                        for col_name, measure_name in measure_mapping.items():
-                            if col_name == ref_data.columns[0]:  # Skip the n-gram text column
-                                continue
-                            # Check if this measure should be computed
-                            if not self._should_compute_measure(index_name, measure_name, selected_measures):
-                                continue
-                            score = self._lookup_score(ngram, index_name, ngram_type, col_name)
-                            if score is not None:
-                                # Check if this measure should be log-transformed
-                                should_log_transform = self._should_apply_log_transform(
-                                    index_name, ngram_type, measure_name, log_transforms, apply_log
-                                )
-                                score_val = np.log10(score) if should_log_transform and score > 0 else score
-                                ngram_detail[f"{index_name}_{measure_name}"] = score_val
-                            else:
-                                ngram_detail[f"{index_name}_{measure_name}"] = None
-                    else:
-                        # Fallback to old logic
-                        available_measures = ref_data.columns[1:].tolist()
-                        # Filter measures based on selection
-                        for measure in available_measures:
-                            # Check if this measure should be computed
-                            if not self._should_compute_measure(index_name, measure, selected_measures):
-                                continue
-                            score = self._lookup_score(ngram, index_name, ngram_type, measure)
-                            if score is not None:
-                                # Check if this measure should be log-transformed
-                                should_log_transform = self._should_apply_log_transform(
-                                    index_name, ngram_type, measure, log_transforms, apply_log
-                                )
-                                score_val = np.log10(score) if should_log_transform and score > 0 else score
-                                ngram_detail[f"{index_name}_{measure}"] = score_val
-                            else:
-                                ngram_detail[f"{index_name}_{measure}"] = None
                 results[ngram_details_key].append(ngram_detail)
@@ -836,9 +932,54 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
                     continue
                 ref_data = self.reference_lists[index_name].get(ngram_type)
-                if ref_data is None or not isinstance(ref_data, pd.DataFrame):
                     continue
                 # Get columns config for proper measure naming from YAML config
                 # We need to access the original YAML configuration to get proper measure names
                 from web_app.config_manager import ConfigManager
@@ -853,79 +994,8 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
                             config_entry = config[language_key][config_section][index_name]
                             break
-                if config_entry and 'columns' in config_entry:
-                    # Get columns config for this n-gram type
-                    columns_config = config_entry.get('columns', {})
-                    # Create mapping from column index to measure name
-                    measure_mapping = {}
-                    for measure_name, col_idx in columns_config.items():
-                        if isinstance(col_idx, int) and col_idx < len(ref_data.columns):
-                            measure_mapping[ref_data.columns[col_idx]] = measure_name
-                    # Use the measure mapping to get proper names
-                    for col_name, measure_name in measure_mapping.items():
-                        if col_name == ref_data.columns[0]:  # Skip the n-gram text column
-                            continue
-                        # Check if this measure should be computed
-                        if not self._should_compute_measure(index_name, measure_name, selected_measures):
-                            continue
-                        ngram_scores = []
-                        for ngram in ngrams:
-                            score = self._lookup_score(ngram, index_name, ngram_type, col_name)
-                            if score is not None:
-                                # Check if this measure should be log-transformed
-                                should_log_transform = self._should_apply_log_transform(
-                                    index_name, ngram_type, measure_name, log_transforms, apply_log
-                                )
-                                score_val = np.log10(score) if should_log_transform and score > 0 else score
-                                ngram_scores.append(score_val)
-                        if ngram_scores:
-                            key = f"{index_name}_{ngram_type}_{measure_name}"
-                            results['summary'][key] = {
-                                'mean': np.mean(ngram_scores),
-                                'std': np.std(ngram_scores),
-                                'count': len(ngram_scores),
-                                'min': np.min(ngram_scores),
-                                'max': np.max(ngram_scores)
-                            }
-                            # Store raw scores for plotting
-                            results['raw_scores'][key] = ngram_scores
-                else:
-                    # Fallback to old logic if config not properly structured
-                    available_measures = ref_data.columns[1:].tolist()
-                    # Filter measures based on selection and compute summary statistics
-                    for measure in available_measures:
-                        # Check if this measure should be computed
-                        if not self._should_compute_measure(index_name, measure, selected_measures):
-                            continue
-                        ngram_scores = []
-                        for ngram in ngrams:
-                            score = self._lookup_score(ngram, index_name, ngram_type, measure)
-                            if score is not None:
-                                # Check if this measure should be log-transformed
-                                should_log_transform = self._should_apply_log_transform(
-                                    index_name, ngram_type, measure, log_transforms, apply_log
-                                )
-                                score_val = np.log10(score) if should_log_transform and score > 0 else score
-                                ngram_scores.append(score_val)
-                        if ngram_scores:
-                            key = f"{index_name}_{ngram_type}_{measure}"
-                            results['summary'][key] = {
-                                'mean': np.mean(ngram_scores),
-                                'std': np.std(ngram_scores),
-                                'count': len(ngram_scores),
-                                'min': np.min(ngram_scores),
-                                'max': np.max(ngram_scores)
-                            }
-                            # Store raw scores for plotting
-                            results['raw_scores'][key] = ngram_scores
         return results

                             logger.info(f"Loaded pre-loaded {file_type} reference list for {index_name}")
                             continue
+                    # Check if it's a DataFrame (for n-grams) - convert to nested dict
                     if isinstance(file_path_or_dict, pd.DataFrame):
+                        if file_type in ['bigram', 'trigram']:
+                            # Convert DataFrame to nested dictionary for better performance
+                            nested_dict = self._convert_dataframe_to_nested_dict(
+                                file_path_or_dict, index_name, file_type
+                            )
+                            self.reference_lists[index_name][file_type] = nested_dict
+                            logger.info(f"Converted pre-loaded {file_type} DataFrame to nested dict for {index_name}")
+                        else:
+                            self.reference_lists[index_name][file_type] = file_path_or_dict
+                            logger.info(f"Loaded pre-loaded {file_type} DataFrame for {index_name}")
                         continue
                     # Otherwise, treat as file path
                         # Check if this is a custom frequency list format with specific columns
                         if self._is_custom_frequency_format(df):
                             processed_data = self._parse_custom_frequency_format(df)
+                            # Convert to nested dict format for consistency
+                            nested_dict = {}
+                            for word, freq in processed_data.items():
+                                nested_dict[word] = {'frequency': freq}
+                            self.reference_lists[index_name][file_type] = nested_dict
+                        # For standard unigram files, convert to nested dict format
                         elif df.shape[1] >= 2:
+                            # Convert all columns to nested dictionary
+                            nested_dict = self._convert_dataframe_to_nested_dict(df, index_name, file_type)
+                            self.reference_lists[index_name][file_type] = nested_dict
                     else:
+                        # For n-gram files, convert DataFrame to nested dictionary for better performance
+                        nested_dict = self._convert_dataframe_to_nested_dict(df, index_name, file_type)
+                        self.reference_lists[index_name][file_type] = nested_dict
                     logger.info(f"Loaded {file_type} reference list for {index_name}")
         return all(col in df_columns_lower for col in expected_columns_lower)
+    def _convert_dataframe_to_nested_dict(self, df: pd.DataFrame, index_name: str, file_type: str) -> Dict[str, Dict[str, float]]:
+        """
+        Convert DataFrame to nested dictionary structure for fast O(1) lookups.
+        Args:
+            df: Source DataFrame
+            index_name: Name of the reference index
+            file_type: Type of reference file ('token', 'lemma', 'bigram', 'trigram')
+        Returns:
+            {item_text: {measure_name: value, ...}}
+        """
+        nested_dict = {}
+        if df.empty or len(df.columns) < 2:
+            logger.warning(f"Empty or invalid DataFrame for {index_name} {file_type}")
+            return nested_dict
+        # First column is always the text (word/n-gram)
+        text_col = df.columns[0]
+        # Get column configuration from YAML if available
+        try:
+            from web_app.config_manager import ConfigManager
+            config = ConfigManager.load_reference_config()
+            language_key = "english" if self.language == 'en' else "japanese"
+            # Find the config entry for this index
+            config_entry = None
+            if file_type in ['token', 'lemma']:
+                section_key = 'unigrams'
+            else:
+                section_key = f"{file_type}s"  # bigrams/trigrams
+            if section_key in config.get(language_key, {}):
+                if index_name in config[language_key][section_key]:
+                    config_entry = config[language_key][section_key][index_name]
+            # Create measure mapping
+            measure_mapping = {}
+            if config_entry and 'columns' in config_entry:
+                columns_config = config_entry.get('columns', {})
+                for measure_name, col_idx in columns_config.items():
+                    if isinstance(col_idx, int) and col_idx < len(df.columns):
+                        measure_mapping[measure_name] = df.columns[col_idx]
+            else:
+                # Fallback: use column names directly as measure names (skip first column)
+                for i, col_name in enumerate(df.columns[1:], 1):
+                    measure_mapping[col_name] = col_name
+        except Exception as e:
+            logger.warning(f"Could not load YAML config for {index_name}, using fallback naming: {e}")
+            # Fallback: use column names directly as measure names (skip first column)
+            measure_mapping = {}
+            for i, col_name in enumerate(df.columns[1:], 1):
+                measure_mapping[col_name] = col_name
+        # Clean and convert data
+        df_clean = df.copy()
+        # Clean text column
+        df_clean[text_col] = df_clean[text_col].astype(str).str.strip().str.lower()
+        df_clean = df_clean[df_clean[text_col] != '']
+        df_clean = df_clean[df_clean[text_col] != 'nan']
+        # Clean numeric columns
+        for col in df_clean.columns[1:]:
+            df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')
+        # Remove rows with all NaN measures
+        df_clean = df_clean.dropna(subset=df_clean.columns[1:].tolist(), how='all')
+        # Convert to nested dictionary
+        for _, row in df_clean.iterrows():
+            text_key = row[text_col]
+            if pd.isna(text_key) or text_key == '':
+                continue
+            measures = {}
+            for measure_name, col_name in measure_mapping.items():
+                if col_name == text_col:  # Skip the text column
+                    continue
+                if col_name in row and not pd.isna(row[col_name]):
+                    measures[measure_name] = float(row[col_name])
+            if measures:  # Only add if we have at least one valid measure
+                nested_dict[text_key] = measures
+        logger.info(f"Converted {len(nested_dict)} entries from DataFrame to nested dict for {index_name} {file_type}")
+        return nested_dict
     def _parse_custom_frequency_format(self, df):
         """
         Parse custom frequency list format and return a dictionary mapping words to frequency scores.
     def _lookup_score(self, word: str, index_name: str, file_type: str,
                      measure_col: Optional[str] = None) -> Optional[float]:
         """
+        Unified lookup for both words and N-grams using nested dictionaries for O(1) performance.
         Args:
+            word: Word/N-gram to look up
             index_name: Name of the reference index
             file_type: Type of reference file ('token', 'lemma', 'bigram', 'trigram')
+            measure_col: Specific measure to retrieve (e.g., 'frequency', 'mi_score')
         Returns:
             Score if found, None otherwise
         if ref_data is None:
             return None
+        # Handle Japanese corpus data (special case)
+        if isinstance(ref_data, dict) and ref_data.get('is_japanese_corpus', False):
+            # This should not be called directly for Japanese data
+            # Use _lookup_japanese_score instead
+            return None
+        # Handle legacy simple dictionaries (old unigram format: word -> single_score)
+        if isinstance(ref_data, dict) and not any(isinstance(v, dict) for v in ref_data.values()):
+            # Legacy simple dictionary format
             return ref_data.get(word.lower())
+        # Handle nested dictionary format (new unified format: word -> {measure: value, ...})
+        if isinstance(ref_data, dict):
+            word_data = ref_data.get(word.lower())
+            if word_data is None or not isinstance(word_data, dict):
                 return None
+            # If measure specified, return that specific measure
+            if measure_col:
+                return word_data.get(measure_col)
+            else:
+                # Return first available measure for backward compatibility
+                return next(iter(word_data.values())) if word_data else None
+        # Fallback to DataFrame lookup (for compatibility during transition)
+        if isinstance(ref_data, pd.DataFrame):
             # Find matching row
             word_col = ref_data.columns[0]
             matching_rows = ref_data[ref_data[word_col].str.lower() == word.lower()]
                     except (ValueError, TypeError):
                         return None
                 return None
+        return None
     def _lookup_with_unidic_fallback(self, token, index_name: str, file_type: str) -> Dict:
         """
                         continue
                     ref_data = self.reference_lists[index_name].get(ngram_type)
+                    if ref_data is None:
+                        continue
+                    # Skip if using old DataFrame format (should be converted by now)
+                    if isinstance(ref_data, pd.DataFrame):
+                        logger.warning(f"Found unconverted DataFrame for {index_name} {ngram_type}, skipping")
+                        continue
+                    # Ensure we have the new nested dictionary format
+                    if not isinstance(ref_data, dict):
                         continue
+                    # Get available measures from any N-gram entry
+                    sample_ngram_data = next(iter(ref_data.values())) if ref_data else {}
+                    if not isinstance(sample_ngram_data, dict):
+                        continue
+                    available_measures = list(sample_ngram_data.keys())
+                    # Process each available measure
+                    for measure_name in available_measures:
+                        # Check if this measure should be computed
+                        if not self._should_compute_measure(index_name, measure_name, selected_measures):
+                            continue
+                        # Use the unified lookup method for O(1) performance
+                        score = self._lookup_score(ngram, index_name, ngram_type, measure_name)
+                        if score is not None:
+                            # Check if this measure should be log-transformed
+                            should_log_transform = self._should_apply_log_transform(
+                                index_name, ngram_type, measure_name, log_transforms, apply_log
+                            )
+                            score_val = np.log10(score) if should_log_transform and score > 0 else score
+                            ngram_detail[f"{index_name}_{measure_name}"] = score_val
+                        else:
+                            ngram_detail[f"{index_name}_{measure_name}"] = None
                     # Get columns config for proper measure naming from YAML config
                     # We need to access the original YAML configuration to get proper measure names
                     from web_app.config_manager import ConfigManager
                                 config_entry = config[language_key][config_section][index_name]
                                 break
+                    # Note: With nested dictionary format, we already processed all measures above
+                    # No additional processing needed here since measures are extracted directly from the dictionary
                 results[ngram_details_key].append(ngram_detail)
                     continue
                 ref_data = self.reference_lists[index_name].get(ngram_type)
+                if ref_data is None:
+                    continue
+                # Skip if using old DataFrame format (should be converted by now)
+                if isinstance(ref_data, pd.DataFrame):
+                    logger.warning(f"Found unconverted DataFrame for {index_name} {ngram_type} in summary, skipping")
                     continue
+                # Ensure we have the new nested dictionary format
+                if not isinstance(ref_data, dict):
+                    continue
+                # Get available measures from any N-gram entry
+                sample_ngram_data = next(iter(ref_data.values())) if ref_data else {}
+                if not isinstance(sample_ngram_data, dict):
+                    continue
+                available_measures = list(sample_ngram_data.keys())
+                # Process each available measure for summary statistics
+                for measure_name in available_measures:
+                    # Check if this measure should be computed
+                    if not self._should_compute_measure(index_name, measure_name, selected_measures):
+                        continue
+                    ngram_scores = []
+                    for ngram in ngrams:
+                        score = self._lookup_score(ngram, index_name, ngram_type, measure_name)
+                        if score is not None:
+                            # Check if this measure should be log-transformed
+                            should_log_transform = self._should_apply_log_transform(
+                                index_name, ngram_type, measure_name, log_transforms, apply_log
+                            )
+                            score_val = np.log10(score) if should_log_transform and score > 0 else score
+                            ngram_scores.append(score_val)
+                    if ngram_scores:
+                        key = f"{index_name}_{ngram_type}_{measure_name}"
+                        results['summary'][key] = {
+                            'mean': np.mean(ngram_scores),
+                            'std': np.std(ngram_scores),
+                            'count': len(ngram_scores),
+                            'min': np.min(ngram_scores),
+                            'max': np.max(ngram_scores)
+                        }
+                        # Store raw scores for plotting
+                        results['raw_scores'][key] = ngram_scores
                 # Get columns config for proper measure naming from YAML config
                 # We need to access the original YAML configuration to get proper measure names
                 from web_app.config_manager import ConfigManager
                             config_entry = config[language_key][config_section][index_name]
                             break
+                # Note: With nested dictionary format, summary statistics are already processed above
+                # No additional processing needed here since measures are extracted directly from the dictionary
         return results

uv.lock CHANGED Viewed

@@ -1751,7 +1751,7 @@ requires-dist = [
     { name = "plotly", specifier = ">=5.15.0" },
     { name = "pyyaml", specifier = ">=6.0" },
     { name = "scipy", specifier = ">=1.11.0" },
-    { name = "spacy", specifier = ">=3.7.0" },
     { name = "spacy-curated-transformers", specifier = ">=0.1.0,<0.3.0" },
     { name = "spacy-transformers", specifier = ">=1.3.0" },
     { name = "streamlit", specifier = ">=1.28.0" },

     { name = "plotly", specifier = ">=5.15.0" },
     { name = "pyyaml", specifier = ">=6.0" },
     { name = "scipy", specifier = ">=1.11.0" },
+    { name = "spacy", extras = ["cuda11", "cuda12"], specifier = ">=3.7.0" },
     { name = "spacy-curated-transformers", specifier = ">=0.1.0,<0.3.0" },
     { name = "spacy-transformers", specifier = ">=1.3.0" },
     { name = "streamlit", specifier = ">=1.28.0" },