Spaces:
Building
Building
more sophistication indice selection
Browse files- clear_session.py +34 -0
- config/reference_lists.yaml +917 -154
- config/reference_lists.yaml.backup_20250727_220815 +301 -0
- config/reference_lists.yaml.backup_20250727_230913 +319 -0
- config/reference_lists.yaml.backup_20250727_231728 +319 -0
- text_analyzer/lexical_sophistication.py +87 -8
- web_app/components/ui_components.py +120 -8
- web_app/config_manager.py +24 -2
- web_app/defaults_manager.py +401 -0
- web_app/handlers/analysis_handlers.py +126 -10
- web_app/reference_manager.py +5 -6
- web_app/schema_migrator.py +161 -0
- web_app/schema_validator.py +309 -0
clear_session.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import streamlit as st
|
| 3 |
+
from web_app.session_manager import SessionManager
|
| 4 |
+
|
| 5 |
+
st.title("🔄 Session State Reset")
|
| 6 |
+
|
| 7 |
+
st.write("## Current Session State")
|
| 8 |
+
st.write("Reference lists currently loaded:")
|
| 9 |
+
|
| 10 |
+
if hasattr(st.session_state, 'reference_lists') and st.session_state.reference_lists:
|
| 11 |
+
for name, data in st.session_state.reference_lists.items():
|
| 12 |
+
st.write(f"- **{name}**")
|
| 13 |
+
else:
|
| 14 |
+
st.write("No reference lists loaded")
|
| 15 |
+
|
| 16 |
+
st.write("---")
|
| 17 |
+
|
| 18 |
+
if st.button("🗑️ Clear All Session State", type="primary"):
|
| 19 |
+
# Clear all session state
|
| 20 |
+
for key in list(st.session_state.keys()):
|
| 21 |
+
del st.session_state[key]
|
| 22 |
+
|
| 23 |
+
# Reinitialize
|
| 24 |
+
SessionManager.initialize_session_state()
|
| 25 |
+
|
| 26 |
+
st.success("✅ Session state cleared! Please refresh the page.")
|
| 27 |
+
st.balloons()
|
| 28 |
+
|
| 29 |
+
st.write("### Instructions:")
|
| 30 |
+
st.write("1. Click 'Clear All Session State' above")
|
| 31 |
+
st.write("2. Refresh your browser page")
|
| 32 |
+
st.write("3. Go back to the Lexical Sophistication tool")
|
| 33 |
+
st.write("4. Re-select your reference lists")
|
| 34 |
+
st.write("5. You should now see smart defaults!")
|
config/reference_lists.yaml
CHANGED
|
@@ -1,17 +1,12 @@
|
|
| 1 |
-
# Configuration for Default Reference Lists
|
| 2 |
-
# Add new reference lists here and they'll automatically appear in the UI
|
| 3 |
-
# Structure: language -> type -> list_name -> configuration
|
| 4 |
-
|
| 5 |
english:
|
| 6 |
unigrams:
|
| 7 |
-
|
| 8 |
-
display_name:
|
| 9 |
-
description:
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
columns:
|
| 15 |
word: 0
|
| 16 |
frequency: 1
|
| 17 |
normalized_freq: 2
|
|
@@ -19,59 +14,269 @@ english:
|
|
| 19 |
dispersion: 4
|
| 20 |
has_header: false
|
| 21 |
enabled: true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
-
|
| 24 |
-
display_name:
|
| 25 |
-
description:
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
word: 0
|
| 32 |
concreteness: 1
|
| 33 |
has_header: true
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
word: 0
|
| 46 |
frequency: 1
|
| 47 |
has_header: true
|
| 48 |
-
enabled: false
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
columns:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
bigram: 0
|
| 60 |
frequency: 1
|
| 61 |
normalized_freq: 2
|
| 62 |
documents: 3
|
| 63 |
range: 4
|
| 64 |
has_header: false
|
| 65 |
-
enabled: true
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
bigram: 0
|
| 76 |
frequency: 1
|
| 77 |
mi_score: 2
|
|
@@ -80,17 +285,199 @@ english:
|
|
| 80 |
delta_p: 5
|
| 81 |
ap_collex: 6
|
| 82 |
has_header: true
|
| 83 |
-
enabled:
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
trigrams:
|
| 86 |
-
|
| 87 |
-
display_name:
|
| 88 |
-
description:
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
format: "tsv"
|
| 93 |
-
columns:
|
| 94 |
trigram: 0
|
| 95 |
frequency: 1
|
| 96 |
normalized_freq: 2
|
|
@@ -98,15 +485,62 @@ english:
|
|
| 98 |
dispersion: 4
|
| 99 |
has_header: false
|
| 100 |
enabled: true
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
trigram: 0
|
| 111 |
frequency: 1
|
| 112 |
mi_score: 2
|
|
@@ -115,16 +549,65 @@ english:
|
|
| 115 |
delta_p: 5
|
| 116 |
ap_collex: 6
|
| 117 |
has_header: true
|
| 118 |
-
enabled:
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
trigram: 0
|
| 129 |
frequency: 1
|
| 130 |
mi_score: 2
|
|
@@ -133,118 +616,398 @@ english:
|
|
| 133 |
delta_p: 5
|
| 134 |
ap_collex: 6
|
| 135 |
has_header: true
|
| 136 |
-
enabled:
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
japanese:
|
| 139 |
unigrams:
|
| 140 |
-
|
| 141 |
-
display_name:
|
| 142 |
-
description:
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
|
|
|
|
|
|
|
|
|
| 147 |
has_header: true
|
| 148 |
enabled: true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
japanese_corpus: true
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
BCCWJ_pmw:
|
| 157 |
-
display_name: "BCCWJ Written - Per Million Words"
|
| 158 |
-
description: "BCCWJ normalized frequency for written Japanese"
|
| 159 |
-
files:
|
| 160 |
-
token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
|
| 161 |
-
lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
|
| 162 |
-
format: "tsv"
|
| 163 |
has_header: true
|
| 164 |
enabled: true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
japanese_corpus: true
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
surface_form: 1
|
| 168 |
lemma: 2
|
| 169 |
pos: 3
|
| 170 |
-
frequency: 7
|
| 171 |
-
|
| 172 |
-
BCCWJ_rank:
|
| 173 |
-
display_name: "BCCWJ Written - Frequency Rank"
|
| 174 |
-
description: "BCCWJ frequency ranking for written Japanese"
|
| 175 |
-
files:
|
| 176 |
-
token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
|
| 177 |
-
lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
|
| 178 |
-
format: "tsv"
|
| 179 |
has_header: true
|
| 180 |
enabled: true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
japanese_corpus: true
|
| 182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
surface_form: 1
|
| 184 |
lemma: 2
|
| 185 |
pos: 3
|
| 186 |
-
frequency: 0
|
| 187 |
-
|
| 188 |
-
CSJ_frequency:
|
| 189 |
-
display_name: "CSJ Spoken - Frequency"
|
| 190 |
-
description: "CSJ raw frequency counts for spoken Japanese"
|
| 191 |
-
files:
|
| 192 |
-
token: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
|
| 193 |
-
lemma: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
|
| 194 |
-
format: "tsv"
|
| 195 |
has_header: true
|
| 196 |
enabled: true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
japanese_corpus: true
|
| 198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
surface_form: 1
|
| 200 |
lemma: 2
|
| 201 |
pos: 3
|
| 202 |
frequency: 6
|
| 203 |
-
|
| 204 |
-
CSJ_pmw:
|
| 205 |
-
display_name: "CSJ Spoken - Per Million Words"
|
| 206 |
-
description: "CSJ normalized frequency for spoken Japanese"
|
| 207 |
-
files:
|
| 208 |
-
token: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
|
| 209 |
-
lemma: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
|
| 210 |
-
format: "tsv"
|
| 211 |
has_header: true
|
| 212 |
enabled: true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
japanese_corpus: true
|
| 214 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
surface_form: 1
|
| 216 |
lemma: 2
|
| 217 |
pos: 3
|
| 218 |
frequency: 7
|
| 219 |
-
|
| 220 |
-
CSJ_rank:
|
| 221 |
-
display_name: "CSJ Spoken - Frequency Rank"
|
| 222 |
-
description: "CSJ frequency ranking for spoken Japanese"
|
| 223 |
-
files:
|
| 224 |
-
token: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
|
| 225 |
-
lemma: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
|
| 226 |
-
format: "tsv"
|
| 227 |
has_header: true
|
| 228 |
enabled: true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
japanese_corpus: true
|
| 230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
surface_form: 1
|
| 232 |
lemma: 2
|
| 233 |
pos: 3
|
| 234 |
frequency: 0
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
word: 0
|
| 245 |
frequency: 1
|
| 246 |
has_header: true
|
| 247 |
-
enabled: false
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
english:
|
| 2 |
unigrams:
|
| 3 |
+
COCA_magazine_frequency_token:
|
| 4 |
+
display_name: COCA Magazine Frequency (Token)
|
| 5 |
+
description: Frequency and range data from COCA magazine subcorpus - token-based
|
| 6 |
+
analysis
|
| 7 |
+
file: resources/reference_lists/en/COCA_magazine_unigram_list.csv
|
| 8 |
+
format: tsv
|
| 9 |
+
columns: &id001
|
|
|
|
| 10 |
word: 0
|
| 11 |
frequency: 1
|
| 12 |
normalized_freq: 2
|
|
|
|
| 14 |
dispersion: 4
|
| 15 |
has_header: false
|
| 16 |
enabled: true
|
| 17 |
+
analysis_type: token
|
| 18 |
+
log_transformable:
|
| 19 |
+
- frequency
|
| 20 |
+
- normalized_freq
|
| 21 |
+
selectable_measures:
|
| 22 |
+
- frequency
|
| 23 |
+
- normalized_freq
|
| 24 |
+
- range
|
| 25 |
+
- dispersion
|
| 26 |
+
default_measures:
|
| 27 |
+
- frequency
|
| 28 |
+
- normalized_freq
|
| 29 |
+
- range
|
| 30 |
+
default_log_transforms:
|
| 31 |
+
- frequency
|
| 32 |
+
- normalized_freq
|
| 33 |
+
measure_classifications:
|
| 34 |
+
frequency: frequency
|
| 35 |
+
normalized_freq: frequency
|
| 36 |
+
range: range
|
| 37 |
+
dispersion: range
|
| 38 |
|
| 39 |
+
COCA_magazine_frequency_lemma:
|
| 40 |
+
display_name: COCA Magazine Frequency (Lemma)
|
| 41 |
+
description: Frequency and range data from COCA magazine subcorpus - lemma-based
|
| 42 |
+
analysis
|
| 43 |
+
file: resources/reference_lists/en/COCA_magazine_unigram_list.csv
|
| 44 |
+
format: tsv
|
| 45 |
+
columns: *id001
|
| 46 |
+
has_header: false
|
| 47 |
+
enabled: true
|
| 48 |
+
analysis_type: lemma
|
| 49 |
+
log_transformable:
|
| 50 |
+
- frequency
|
| 51 |
+
- normalized_freq
|
| 52 |
+
selectable_measures:
|
| 53 |
+
- frequency
|
| 54 |
+
- normalized_freq
|
| 55 |
+
- range
|
| 56 |
+
- dispersion
|
| 57 |
+
default_measures:
|
| 58 |
+
- frequency
|
| 59 |
+
- normalized_freq
|
| 60 |
+
- range
|
| 61 |
+
default_log_transforms:
|
| 62 |
+
- frequency
|
| 63 |
+
- normalized_freq
|
| 64 |
+
measure_classifications:
|
| 65 |
+
frequency: frequency
|
| 66 |
+
normalized_freq: frequency
|
| 67 |
+
range: range
|
| 68 |
+
dispersion: range
|
| 69 |
+
|
| 70 |
+
concreteness_ratings_token:
|
| 71 |
+
display_name: Concreteness Ratings (Token)
|
| 72 |
+
description: Concreteness ratings for English words (1-5 scale) - token-based
|
| 73 |
+
analysis
|
| 74 |
+
file: resources/reference_lists/en/Concreteness_Brysbaert.txt
|
| 75 |
+
format: tsv
|
| 76 |
+
columns: &id002
|
| 77 |
word: 0
|
| 78 |
concreteness: 1
|
| 79 |
has_header: true
|
| 80 |
+
enabled: true
|
| 81 |
+
analysis_type: token
|
| 82 |
+
log_transformable: []
|
| 83 |
+
selectable_measures:
|
| 84 |
+
- concreteness
|
| 85 |
+
default_measures:
|
| 86 |
+
- concreteness
|
| 87 |
+
default_log_transforms: []
|
| 88 |
+
measure_classifications:
|
| 89 |
+
concreteness: psycholinguistic
|
| 90 |
+
header_prefix: '#'
|
| 91 |
+
|
| 92 |
+
concreteness_ratings_lemma:
|
| 93 |
+
display_name: Concreteness Ratings (Lemma)
|
| 94 |
+
description: Concreteness ratings for English words (1-5 scale) - lemma-based
|
| 95 |
+
analysis
|
| 96 |
+
file: resources/reference_lists/en/Concreteness_Brysbaert.txt
|
| 97 |
+
format: tsv
|
| 98 |
+
columns: *id002
|
| 99 |
+
has_header: true
|
| 100 |
+
enabled: true
|
| 101 |
+
analysis_type: lemma
|
| 102 |
+
log_transformable: []
|
| 103 |
+
selectable_measures:
|
| 104 |
+
- concreteness
|
| 105 |
+
default_measures:
|
| 106 |
+
- concreteness
|
| 107 |
+
default_log_transforms: []
|
| 108 |
+
measure_classifications:
|
| 109 |
+
concreteness: psycholinguistic
|
| 110 |
+
header_prefix: '#'
|
| 111 |
+
academic_words_token:
|
| 112 |
+
display_name: Academic Word List (Token)
|
| 113 |
+
description: Common academic vocabulary for research writing - token-based analysis
|
| 114 |
+
file: resources/reference_lists/en/academic_words_token.csv
|
| 115 |
+
format: csv
|
| 116 |
+
columns: &id003
|
| 117 |
word: 0
|
| 118 |
frequency: 1
|
| 119 |
has_header: true
|
| 120 |
+
enabled: false
|
| 121 |
+
analysis_type: token
|
| 122 |
+
log_transformable:
|
| 123 |
+
- frequency
|
| 124 |
+
selectable_measures:
|
| 125 |
+
- frequency
|
| 126 |
+
default_measures:
|
| 127 |
+
- frequency
|
| 128 |
+
default_log_transforms:
|
| 129 |
+
- frequency
|
| 130 |
+
measure_classifications:
|
| 131 |
+
frequency: frequency
|
| 132 |
+
academic_words_lemma:
|
| 133 |
+
display_name: Academic Word List (Lemma)
|
| 134 |
+
description: Common academic vocabulary for research writing - lemma-based analysis
|
| 135 |
+
file: resources/reference_lists/en/academic_words_lemma.csv
|
| 136 |
+
format: csv
|
| 137 |
+
columns: *id003
|
| 138 |
+
has_header: true
|
| 139 |
+
enabled: false
|
| 140 |
+
analysis_type: lemma
|
| 141 |
+
log_transformable:
|
| 142 |
+
- frequency
|
| 143 |
+
selectable_measures:
|
| 144 |
+
- frequency
|
| 145 |
+
default_measures:
|
| 146 |
+
- frequency
|
| 147 |
+
default_log_transforms:
|
| 148 |
+
- frequency
|
| 149 |
+
measure_classifications:
|
| 150 |
+
frequency: frequency
|
| 151 |
+
COCA_spoken_frequency_token:
|
| 152 |
+
display_name: COCA Spoken Frequency (Token)
|
| 153 |
+
description: Frequency and range data from COCA spoken subcorpus - token-based
|
| 154 |
+
analysis
|
| 155 |
+
file: resources/reference_lists/en/COCA_spoken_unigram_list.csv
|
| 156 |
+
format: tsv
|
| 157 |
+
columns:
|
| 158 |
+
word: 0
|
| 159 |
+
frequency: 1
|
| 160 |
+
normalized_freq: 2
|
| 161 |
+
range: 3
|
| 162 |
+
dispersion: 4
|
| 163 |
+
has_header: false
|
| 164 |
+
enabled: true
|
| 165 |
+
analysis_type: token
|
| 166 |
+
log_transformable:
|
| 167 |
+
- frequency
|
| 168 |
+
- normalized_freq
|
| 169 |
+
selectable_measures:
|
| 170 |
+
- frequency
|
| 171 |
+
- normalized_freq
|
| 172 |
+
- range
|
| 173 |
+
- dispersion
|
| 174 |
+
default_measures:
|
| 175 |
+
- frequency
|
| 176 |
+
- normalized_freq
|
| 177 |
+
default_log_transforms:
|
| 178 |
+
- frequency
|
| 179 |
+
- normalized_freq
|
| 180 |
+
COCA_spoken_frequency_lemma:
|
| 181 |
+
display_name: COCA Spoken Frequency (Lemma)
|
| 182 |
+
description: Frequency and range data from COCA spoken subcorpus - lemma-based
|
| 183 |
+
analysis
|
| 184 |
+
file: resources/reference_lists/en/COCA_spoken_unigram_list.csv
|
| 185 |
+
format: tsv
|
| 186 |
columns:
|
| 187 |
+
word: 0
|
| 188 |
+
frequency: 1
|
| 189 |
+
normalized_freq: 2
|
| 190 |
+
range: 3
|
| 191 |
+
dispersion: 4
|
| 192 |
+
has_header: false
|
| 193 |
+
enabled: true
|
| 194 |
+
analysis_type: lemma
|
| 195 |
+
log_transformable:
|
| 196 |
+
- frequency
|
| 197 |
+
- normalized_freq
|
| 198 |
+
selectable_measures:
|
| 199 |
+
- frequency
|
| 200 |
+
- normalized_freq
|
| 201 |
+
- range
|
| 202 |
+
- dispersion
|
| 203 |
+
default_measures:
|
| 204 |
+
- frequency
|
| 205 |
+
- normalized_freq
|
| 206 |
+
default_log_transforms:
|
| 207 |
+
- frequency
|
| 208 |
+
- normalized_freq
|
| 209 |
+
bigrams:
|
| 210 |
+
COCA_spoken_bigram_frequency_token:
|
| 211 |
+
display_name: COCA Spoken Bigram Frequency (Token)
|
| 212 |
+
description: Bigram frequencies and range data - token-based analysis
|
| 213 |
+
file: resources/reference_lists/en/COCA_spoken_bigram_list.csv
|
| 214 |
+
format: tsv
|
| 215 |
+
columns: &id004
|
| 216 |
bigram: 0
|
| 217 |
frequency: 1
|
| 218 |
normalized_freq: 2
|
| 219 |
documents: 3
|
| 220 |
range: 4
|
| 221 |
has_header: false
|
| 222 |
+
enabled: true
|
| 223 |
+
analysis_type: token
|
| 224 |
+
log_transformable:
|
| 225 |
+
- frequency
|
| 226 |
+
- normalized_freq
|
| 227 |
+
selectable_measures:
|
| 228 |
+
- frequency
|
| 229 |
+
- normalized_freq
|
| 230 |
+
- documents
|
| 231 |
+
- range
|
| 232 |
+
default_measures:
|
| 233 |
+
- frequency
|
| 234 |
+
- normalized_freq
|
| 235 |
+
- range
|
| 236 |
+
default_log_transforms:
|
| 237 |
+
- frequency
|
| 238 |
+
- normalized_freq
|
| 239 |
+
measure_classifications:
|
| 240 |
+
frequency: frequency
|
| 241 |
+
normalized_freq: frequency
|
| 242 |
+
documents: range
|
| 243 |
+
range: range
|
| 244 |
+
COCA_spoken_bigram_frequency_lemma:
|
| 245 |
+
display_name: COCA Spoken Bigram Frequency (Lemma)
|
| 246 |
+
description: Bigram frequencies and range data - lemma-based analysis
|
| 247 |
+
file: resources/reference_lists/en/COCA_spoken_bigram_list.csv
|
| 248 |
+
format: tsv
|
| 249 |
+
columns: *id004
|
| 250 |
+
has_header: false
|
| 251 |
+
enabled: true
|
| 252 |
+
analysis_type: lemma
|
| 253 |
+
log_transformable:
|
| 254 |
+
- frequency
|
| 255 |
+
- normalized_freq
|
| 256 |
+
selectable_measures:
|
| 257 |
+
- frequency
|
| 258 |
+
- normalized_freq
|
| 259 |
+
- documents
|
| 260 |
+
- range
|
| 261 |
+
default_measures:
|
| 262 |
+
- frequency
|
| 263 |
+
- normalized_freq
|
| 264 |
+
- range
|
| 265 |
+
default_log_transforms:
|
| 266 |
+
- frequency
|
| 267 |
+
- normalized_freq
|
| 268 |
+
measure_classifications:
|
| 269 |
+
frequency: frequency
|
| 270 |
+
normalized_freq: frequency
|
| 271 |
+
documents: range
|
| 272 |
+
range: range
|
| 273 |
+
COCA_spoken_bigram_association_token:
|
| 274 |
+
display_name: COCA Spoken Bigram Associations (Token)
|
| 275 |
+
description: Bigram association measures (MI, T-score, Delta P) - token-based
|
| 276 |
+
analysis
|
| 277 |
+
file: resources/reference_lists/en/spoken_bi_contingency.csv
|
| 278 |
+
format: csv
|
| 279 |
+
columns: &id005
|
| 280 |
bigram: 0
|
| 281 |
frequency: 1
|
| 282 |
mi_score: 2
|
|
|
|
| 285 |
delta_p: 5
|
| 286 |
ap_collex: 6
|
| 287 |
has_header: true
|
| 288 |
+
enabled: true
|
| 289 |
+
analysis_type: token
|
| 290 |
+
log_transformable:
|
| 291 |
+
- frequency
|
| 292 |
+
selectable_measures:
|
| 293 |
+
- frequency
|
| 294 |
+
- mi_score
|
| 295 |
+
- mi_2_score
|
| 296 |
+
- t_score
|
| 297 |
+
- delta_p
|
| 298 |
+
- ap_collex
|
| 299 |
+
default_measures:
|
| 300 |
+
- frequency
|
| 301 |
+
- t_score
|
| 302 |
+
default_log_transforms:
|
| 303 |
+
- frequency
|
| 304 |
+
measure_classifications:
|
| 305 |
+
frequency: frequency
|
| 306 |
+
mi_score: association
|
| 307 |
+
mi_2_score: association
|
| 308 |
+
t_score: association
|
| 309 |
+
delta_p: association
|
| 310 |
+
ap_collex: association
|
| 311 |
+
COCA_spoken_bigram_association_lemma:
|
| 312 |
+
display_name: COCA Spoken Bigram Associations (Lemma)
|
| 313 |
+
description: Bigram association measures (MI, T-score, Delta P) - lemma-based
|
| 314 |
+
analysis
|
| 315 |
+
file: resources/reference_lists/en/spoken_bigram_lemma_contingency.csv
|
| 316 |
+
format: csv
|
| 317 |
+
columns: *id005
|
| 318 |
+
has_header: true
|
| 319 |
+
enabled: true
|
| 320 |
+
analysis_type: lemma
|
| 321 |
+
log_transformable:
|
| 322 |
+
- frequency
|
| 323 |
+
selectable_measures:
|
| 324 |
+
- frequency
|
| 325 |
+
- mi_score
|
| 326 |
+
- mi_2_score
|
| 327 |
+
- t_score
|
| 328 |
+
- delta_p
|
| 329 |
+
- ap_collex
|
| 330 |
+
default_measures:
|
| 331 |
+
- frequency
|
| 332 |
+
- t_score
|
| 333 |
+
default_log_transforms:
|
| 334 |
+
- frequency
|
| 335 |
+
measure_classifications:
|
| 336 |
+
frequency: frequency
|
| 337 |
+
mi_score: association
|
| 338 |
+
mi_2_score: association
|
| 339 |
+
t_score: association
|
| 340 |
+
delta_p: association
|
| 341 |
+
ap_collex: association
|
| 342 |
+
COCA_magazine_bigram_frequency_token:
|
| 343 |
+
display_name: COCA Magazine Bigram Frequency (Token)
|
| 344 |
+
description: Bigram frequencies and range data in Magazine - token-based analysis
|
| 345 |
+
file: resources/reference_lists/en/COCA_magazine_bigram_list.csv
|
| 346 |
+
format: tsv
|
| 347 |
+
columns: &id006
|
| 348 |
+
bigram: 0
|
| 349 |
+
frequency: 1
|
| 350 |
+
normalized_freq: 2
|
| 351 |
+
documents: 3
|
| 352 |
+
range: 4
|
| 353 |
+
has_header: false
|
| 354 |
+
enabled: true
|
| 355 |
+
analysis_type: token
|
| 356 |
+
log_transformable:
|
| 357 |
+
- frequency
|
| 358 |
+
- normalized_freq
|
| 359 |
+
selectable_measures:
|
| 360 |
+
- frequency
|
| 361 |
+
- normalized_freq
|
| 362 |
+
- documents
|
| 363 |
+
- range
|
| 364 |
+
default_measures:
|
| 365 |
+
- frequency
|
| 366 |
+
- normalized_freq
|
| 367 |
+
- range
|
| 368 |
+
default_log_transforms:
|
| 369 |
+
- frequency
|
| 370 |
+
- normalized_freq
|
| 371 |
+
measure_classifications:
|
| 372 |
+
frequency: frequency
|
| 373 |
+
normalized_freq: frequency
|
| 374 |
+
documents: range
|
| 375 |
+
range: range
|
| 376 |
+
COCA_magazine_bigram_frequency_lemma:
|
| 377 |
+
display_name: COCA Magazine Bigram Frequency (Lemma)
|
| 378 |
+
description: Bigram frequencies and range data in Magazine - lemma-based analysis
|
| 379 |
+
file: resources/reference_lists/en/COCA_spoken_bigram_list.csv
|
| 380 |
+
format: tsv
|
| 381 |
+
columns: *id006
|
| 382 |
+
has_header: false
|
| 383 |
+
enabled: true
|
| 384 |
+
analysis_type: lemma
|
| 385 |
+
log_transformable:
|
| 386 |
+
- frequency
|
| 387 |
+
- normalized_freq
|
| 388 |
+
selectable_measures:
|
| 389 |
+
- frequency
|
| 390 |
+
- normalized_freq
|
| 391 |
+
- documents
|
| 392 |
+
- range
|
| 393 |
+
default_measures:
|
| 394 |
+
- frequency
|
| 395 |
+
- normalized_freq
|
| 396 |
+
- range
|
| 397 |
+
default_log_transforms:
|
| 398 |
+
- frequency
|
| 399 |
+
- normalized_freq
|
| 400 |
+
measure_classifications:
|
| 401 |
+
frequency: frequency
|
| 402 |
+
normalized_freq: frequency
|
| 403 |
+
documents: range
|
| 404 |
+
range: range
|
| 405 |
+
COCA_magazine_bigram_association_token:
|
| 406 |
+
display_name: COCA Magazine Bigram Associations (Token)
|
| 407 |
+
description: Bigram association measures (MI, T-score, Delta P) - token-based
|
| 408 |
+
analysis
|
| 409 |
+
file: resources/reference_lists/en/magazine_bi_contingency.csv
|
| 410 |
+
format: csv
|
| 411 |
+
columns: &id007
|
| 412 |
+
bigram: 0
|
| 413 |
+
frequency: 1
|
| 414 |
+
mi_score: 2
|
| 415 |
+
mi_2_score: 3
|
| 416 |
+
t_score: 4
|
| 417 |
+
delta_p: 5
|
| 418 |
+
ap_collex: 6
|
| 419 |
+
has_header: true
|
| 420 |
+
enabled: true
|
| 421 |
+
analysis_type: token
|
| 422 |
+
log_transformable:
|
| 423 |
+
- frequency
|
| 424 |
+
selectable_measures:
|
| 425 |
+
- frequency
|
| 426 |
+
- mi_score
|
| 427 |
+
- mi_2_score
|
| 428 |
+
- t_score
|
| 429 |
+
- delta_p
|
| 430 |
+
- ap_collex
|
| 431 |
+
default_measures:
|
| 432 |
+
- frequency
|
| 433 |
+
- t_score
|
| 434 |
+
default_log_transforms:
|
| 435 |
+
- frequency
|
| 436 |
+
measure_classifications:
|
| 437 |
+
frequency: frequency
|
| 438 |
+
mi_score: association
|
| 439 |
+
mi_2_score: association
|
| 440 |
+
t_score: association
|
| 441 |
+
delta_p: association
|
| 442 |
+
ap_collex: association
|
| 443 |
+
COCA_magazine_bigram_association_lemma:
|
| 444 |
+
display_name: COCA Magazine Bigram Associations (Lemma)
|
| 445 |
+
description: Bigram association measures (MI, T-score, Delta P) - lemma-based
|
| 446 |
+
analysis
|
| 447 |
+
file: resources/reference_lists/en/magazine_bigram_lemma_contingency.csv
|
| 448 |
+
format: csv
|
| 449 |
+
columns: *id007
|
| 450 |
+
has_header: true
|
| 451 |
+
enabled: true
|
| 452 |
+
analysis_type: lemma
|
| 453 |
+
log_transformable:
|
| 454 |
+
- frequency
|
| 455 |
+
selectable_measures:
|
| 456 |
+
- frequency
|
| 457 |
+
- mi_score
|
| 458 |
+
- mi_2_score
|
| 459 |
+
- t_score
|
| 460 |
+
- delta_p
|
| 461 |
+
- ap_collex
|
| 462 |
+
default_measures:
|
| 463 |
+
- frequency
|
| 464 |
+
- t_score
|
| 465 |
+
default_log_transforms:
|
| 466 |
+
- frequency
|
| 467 |
+
measure_classifications:
|
| 468 |
+
frequency: frequency
|
| 469 |
+
mi_score: association
|
| 470 |
+
mi_2_score: association
|
| 471 |
+
t_score: association
|
| 472 |
+
delta_p: association
|
| 473 |
+
ap_collex: association
|
| 474 |
trigrams:
|
| 475 |
+
COCA_trigram_frequency_token:
|
| 476 |
+
display_name: COCA Trigram Frequency (Token)
|
| 477 |
+
description: Trigram frequencies and range data - token-based analysis
|
| 478 |
+
file: resources/reference_lists/en/COCA_spoken_trigram_list.csv
|
| 479 |
+
format: tsv
|
| 480 |
+
columns: &id008
|
|
|
|
|
|
|
| 481 |
trigram: 0
|
| 482 |
frequency: 1
|
| 483 |
normalized_freq: 2
|
|
|
|
| 485 |
dispersion: 4
|
| 486 |
has_header: false
|
| 487 |
enabled: true
|
| 488 |
+
analysis_type: token
|
| 489 |
+
log_transformable:
|
| 490 |
+
- frequency
|
| 491 |
+
- normalized_freq
|
| 492 |
+
selectable_measures:
|
| 493 |
+
- frequency
|
| 494 |
+
- normalized_freq
|
| 495 |
+
- range
|
| 496 |
+
- dispersion
|
| 497 |
+
default_measures:
|
| 498 |
+
- frequency
|
| 499 |
+
- normalized_freq
|
| 500 |
+
- range
|
| 501 |
+
default_log_transforms:
|
| 502 |
+
- frequency
|
| 503 |
+
- normalized_freq
|
| 504 |
+
measure_classifications:
|
| 505 |
+
frequency: frequency
|
| 506 |
+
normalized_freq: frequency
|
| 507 |
+
range: range
|
| 508 |
+
dispersion: range
|
| 509 |
+
COCA_trigram_frequency_lemma:
|
| 510 |
+
display_name: COCA Trigram Frequency (Lemma)
|
| 511 |
+
description: Trigram frequencies and range data - lemma-based analysis
|
| 512 |
+
file: resources/reference_lists/en/COCA_spoken_trigram_list.csv
|
| 513 |
+
format: tsv
|
| 514 |
+
columns: *id008
|
| 515 |
+
has_header: false
|
| 516 |
+
enabled: true
|
| 517 |
+
analysis_type: lemma
|
| 518 |
+
log_transformable:
|
| 519 |
+
- frequency
|
| 520 |
+
- normalized_freq
|
| 521 |
+
selectable_measures:
|
| 522 |
+
- frequency
|
| 523 |
+
- normalized_freq
|
| 524 |
+
- range
|
| 525 |
+
- dispersion
|
| 526 |
+
default_measures:
|
| 527 |
+
- frequency
|
| 528 |
+
- normalized_freq
|
| 529 |
+
- range
|
| 530 |
+
default_log_transforms:
|
| 531 |
+
- frequency
|
| 532 |
+
- normalized_freq
|
| 533 |
+
measure_classifications:
|
| 534 |
+
frequency: frequency
|
| 535 |
+
normalized_freq: frequency
|
| 536 |
+
range: range
|
| 537 |
+
dispersion: range
|
| 538 |
+
COCA_trigram_assoc_uni_bi_token:
|
| 539 |
+
display_name: COCA Trigram→Bigram Associations (Token)
|
| 540 |
+
description: Trigram to bigram association measures - token-based analysis
|
| 541 |
+
file: resources/reference_lists/en/spoken_tri_contingency_1.csv
|
| 542 |
+
format: csv
|
| 543 |
+
columns: &id009
|
| 544 |
trigram: 0
|
| 545 |
frequency: 1
|
| 546 |
mi_score: 2
|
|
|
|
| 549 |
delta_p: 5
|
| 550 |
ap_collex: 6
|
| 551 |
has_header: true
|
| 552 |
+
enabled: true
|
| 553 |
+
analysis_type: token
|
| 554 |
+
log_transformable:
|
| 555 |
+
- frequency
|
| 556 |
+
selectable_measures:
|
| 557 |
+
- frequency
|
| 558 |
+
- mi_score
|
| 559 |
+
- mi_2_score
|
| 560 |
+
- t_score
|
| 561 |
+
- delta_p
|
| 562 |
+
- ap_collex
|
| 563 |
+
default_measures:
|
| 564 |
+
- frequency
|
| 565 |
+
- t_score
|
| 566 |
+
default_log_transforms:
|
| 567 |
+
- frequency
|
| 568 |
+
measure_classifications:
|
| 569 |
+
frequency: frequency
|
| 570 |
+
mi_score: association
|
| 571 |
+
mi_2_score: association
|
| 572 |
+
t_score: association
|
| 573 |
+
delta_p: association
|
| 574 |
+
ap_collex: association
|
| 575 |
+
COCA_trigram_assoc_uni_bi_lemma:
|
| 576 |
+
display_name: COCA Trigram→Bigram Associations (Lemma)
|
| 577 |
+
description: Trigram to bigram association measures - lemma-based analysis
|
| 578 |
+
file: resources/reference_lists/en/spoken_trigram_lemma_contingency_1.csv
|
| 579 |
+
format: csv
|
| 580 |
+
columns: *id009
|
| 581 |
+
has_header: true
|
| 582 |
+
enabled: true
|
| 583 |
+
analysis_type: lemma
|
| 584 |
+
log_transformable:
|
| 585 |
+
- frequency
|
| 586 |
+
selectable_measures:
|
| 587 |
+
- frequency
|
| 588 |
+
- mi_score
|
| 589 |
+
- mi_2_score
|
| 590 |
+
- t_score
|
| 591 |
+
- delta_p
|
| 592 |
+
- ap_collex
|
| 593 |
+
default_measures:
|
| 594 |
+
- frequency
|
| 595 |
+
- t_score
|
| 596 |
+
default_log_transforms:
|
| 597 |
+
- frequency
|
| 598 |
+
measure_classifications:
|
| 599 |
+
frequency: frequency
|
| 600 |
+
mi_score: association
|
| 601 |
+
mi_2_score: association
|
| 602 |
+
t_score: association
|
| 603 |
+
delta_p: association
|
| 604 |
+
ap_collex: association
|
| 605 |
+
COCA_trigram_assoc_bi_uni_token:
|
| 606 |
+
display_name: COCA Trigram→Unigram Associations (Token)
|
| 607 |
+
description: Trigram to unigram association measures - token-based analysis
|
| 608 |
+
file: resources/reference_lists/en/spoken_tri_contingency_2.csv
|
| 609 |
+
format: csv
|
| 610 |
+
columns: &id010
|
| 611 |
trigram: 0
|
| 612 |
frequency: 1
|
| 613 |
mi_score: 2
|
| 614 |
mi_2_score: 3
 |
| 615 |
t_score: 4
| 616 |
delta_p: 5
|
| 617 |
ap_collex: 6
|
| 618 |
has_header: true
|
| 619 |
+
enabled: true
|
| 620 |
+
analysis_type: token
|
| 621 |
+
log_transformable:
|
| 622 |
+
- frequency
|
| 623 |
+
selectable_measures:
|
| 624 |
+
- frequency
|
| 625 |
+
- mi_score
|
| 626 |
+
- mi_2_score
|
| 627 |
+
- t_score
|
| 628 |
+
- delta_p
|
| 629 |
+
- ap_collex
|
| 630 |
+
default_measures:
|
| 631 |
+
- frequency
|
| 632 |
+
- t_score
|
| 633 |
+
default_log_transforms:
|
| 634 |
+
- frequency
|
| 635 |
+
measure_classifications:
|
| 636 |
+
frequency: frequency
|
| 637 |
+
mi_score: association
|
| 638 |
+
mi_2_score: association
|
| 639 |
+
t_score: association
|
| 640 |
+
delta_p: association
|
| 641 |
+
ap_collex: association
|
| 642 |
+
COCA_trigram_assoc_bi_uni_lemma:
|
| 643 |
+
display_name: COCA Trigram→Unigram Associations (Lemma)
|
| 644 |
+
description: Trigram to unigram association measures - lemma-based analysis
|
| 645 |
+
file: resources/reference_lists/en/spoken_trigram_lemma_contingency_2.csv
|
| 646 |
+
format: csv
|
| 647 |
+
columns: *id010
|
| 648 |
+
has_header: true
|
| 649 |
+
enabled: true
|
| 650 |
+
analysis_type: lemma
|
| 651 |
+
log_transformable:
|
| 652 |
+
- frequency
|
| 653 |
+
selectable_measures:
|
| 654 |
+
- frequency
|
| 655 |
+
- mi_score
|
| 656 |
+
- mi_2_score
|
| 657 |
+
- t_score
|
| 658 |
+
- delta_p
|
| 659 |
+
- ap_collex
|
| 660 |
+
default_measures:
|
| 661 |
+
- frequency
|
| 662 |
+
- t_score
|
| 663 |
+
default_log_transforms:
|
| 664 |
+
- frequency
|
| 665 |
+
measure_classifications:
|
| 666 |
+
frequency: frequency
|
| 667 |
+
mi_score: association
|
| 668 |
+
mi_2_score: association
|
| 669 |
+
t_score: association
|
| 670 |
+
delta_p: association
|
| 671 |
+
ap_collex: association
|
| 672 |
japanese:
|
| 673 |
unigrams:
|
| 674 |
+
BCCWJ_frequency_token:
|
| 675 |
+
display_name: BCCWJ Written - Frequency (Token)
|
| 676 |
+
description: BCCWJ raw frequency counts for written Japanese - token-based analysis
|
| 677 |
+
file: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
|
| 678 |
+
format: tsv
|
| 679 |
+
columns: &id011
|
| 680 |
+
surface_form: 1
|
| 681 |
+
lemma: 2
|
| 682 |
+
pos: 3
|
| 683 |
+
frequency: 6
|
| 684 |
has_header: true
|
| 685 |
enabled: true
|
| 686 |
+
analysis_type: token
|
| 687 |
+
log_transformable:
|
| 688 |
+
- frequency
|
| 689 |
+
selectable_measures:
|
| 690 |
+
- pos
|
| 691 |
+
- frequency
|
| 692 |
+
default_measures:
|
| 693 |
+
- frequency
|
| 694 |
+
- pos
|
| 695 |
+
default_log_transforms:
|
| 696 |
+
- frequency
|
| 697 |
+
measure_classifications:
|
| 698 |
+
pos: unknown
|
| 699 |
+
frequency: frequency
|
| 700 |
japanese_corpus: true
|
| 701 |
+
BCCWJ_frequency_lemma:
|
| 702 |
+
display_name: BCCWJ Written - Frequency (Lemma)
|
| 703 |
+
description: BCCWJ raw frequency counts for written Japanese - lemma-based analysis
|
| 704 |
+
file: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
|
| 705 |
+
format: tsv
|
| 706 |
+
columns: *id011
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 707 |
has_header: true
|
| 708 |
enabled: true
|
| 709 |
+
analysis_type: lemma
|
| 710 |
+
log_transformable:
|
| 711 |
+
- frequency
|
| 712 |
+
selectable_measures:
|
| 713 |
+
- pos
|
| 714 |
+
- frequency
|
| 715 |
+
default_measures:
|
| 716 |
+
- frequency
|
| 717 |
+
- pos
|
| 718 |
+
default_log_transforms:
|
| 719 |
+
- frequency
|
| 720 |
+
measure_classifications:
|
| 721 |
+
pos: unknown
|
| 722 |
+
frequency: frequency
|
| 723 |
japanese_corpus: true
|
| 724 |
+
BCCWJ_pmw_token:
|
| 725 |
+
display_name: BCCWJ Written - Per Million Words (Token)
|
| 726 |
+
description: BCCWJ normalized frequency for written Japanese - token-based analysis
|
| 727 |
+
file: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
|
| 728 |
+
format: tsv
|
| 729 |
+
columns: &id012
|
| 730 |
surface_form: 1
|
| 731 |
lemma: 2
|
| 732 |
pos: 3
|
| 733 |
+
frequency: 7
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 734 |
has_header: true
|
| 735 |
enabled: true
|
| 736 |
+
analysis_type: token
|
| 737 |
+
log_transformable:
|
| 738 |
+
- frequency
|
| 739 |
+
selectable_measures:
|
| 740 |
+
- pos
|
| 741 |
+
- frequency
|
| 742 |
+
default_measures:
|
| 743 |
+
- frequency
|
| 744 |
+
- pos
|
| 745 |
+
default_log_transforms:
|
| 746 |
+
- frequency
|
| 747 |
+
measure_classifications:
|
| 748 |
+
pos: unknown
|
| 749 |
+
frequency: frequency
|
| 750 |
japanese_corpus: true
|
| 751 |
+
BCCWJ_pmw_lemma:
|
| 752 |
+
display_name: BCCWJ Written - Per Million Words (Lemma)
|
| 753 |
+
description: BCCWJ normalized frequency for written Japanese - lemma-based analysis
|
| 754 |
+
file: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
|
| 755 |
+
format: tsv
|
| 756 |
+
columns: *id012
|
| 757 |
+
has_header: true
|
| 758 |
+
enabled: true
|
| 759 |
+
analysis_type: lemma
|
| 760 |
+
log_transformable:
|
| 761 |
+
- frequency
|
| 762 |
+
selectable_measures:
|
| 763 |
+
- pos
|
| 764 |
+
- frequency
|
| 765 |
+
default_measures:
|
| 766 |
+
- frequency
|
| 767 |
+
- pos
|
| 768 |
+
default_log_transforms:
|
| 769 |
+
- frequency
|
| 770 |
+
measure_classifications:
|
| 771 |
+
pos: unknown
|
| 772 |
+
frequency: frequency
|
| 773 |
+
japanese_corpus: true
|
| 774 |
+
BCCWJ_rank_token:
|
| 775 |
+
display_name: BCCWJ Written - Frequency Rank (Token)
|
| 776 |
+
description: BCCWJ frequency ranking for written Japanese - token-based analysis
|
| 777 |
+
file: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
|
| 778 |
+
format: tsv
|
| 779 |
+
columns: &id013
|
| 780 |
surface_form: 1
|
| 781 |
lemma: 2
|
| 782 |
pos: 3
|
| 783 |
+
frequency: 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 784 |
has_header: true
|
| 785 |
enabled: true
|
| 786 |
+
analysis_type: token
|
| 787 |
+
log_transformable:
|
| 788 |
+
- frequency
|
| 789 |
+
selectable_measures:
|
| 790 |
+
- pos
|
| 791 |
+
- frequency
|
| 792 |
+
default_measures:
|
| 793 |
+
- frequency
|
| 794 |
+
- pos
|
| 795 |
+
default_log_transforms:
|
| 796 |
+
- frequency
|
| 797 |
+
measure_classifications:
|
| 798 |
+
pos: unknown
|
| 799 |
+
frequency: frequency
|
| 800 |
japanese_corpus: true
|
| 801 |
+
BCCWJ_rank_lemma:
|
| 802 |
+
display_name: BCCWJ Written - Frequency Rank (Lemma)
|
| 803 |
+
description: BCCWJ frequency ranking for written Japanese - lemma-based analysis
|
| 804 |
+
file: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
|
| 805 |
+
format: tsv
|
| 806 |
+
columns: *id013
|
| 807 |
+
has_header: true
|
| 808 |
+
enabled: true
|
| 809 |
+
analysis_type: lemma
|
| 810 |
+
log_transformable:
|
| 811 |
+
- frequency
|
| 812 |
+
selectable_measures:
|
| 813 |
+
- pos
|
| 814 |
+
- frequency
|
| 815 |
+
default_measures:
|
| 816 |
+
- frequency
|
| 817 |
+
- pos
|
| 818 |
+
default_log_transforms:
|
| 819 |
+
- frequency
|
| 820 |
+
measure_classifications:
|
| 821 |
+
pos: unknown
|
| 822 |
+
frequency: frequency
|
| 823 |
+
japanese_corpus: true
|
| 824 |
+
CSJ_frequency_token:
|
| 825 |
+
display_name: CSJ Spoken - Frequency (Token)
|
| 826 |
+
description: CSJ raw frequency counts for spoken Japanese - token-based analysis
|
| 827 |
+
file: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
|
| 828 |
+
format: tsv
|
| 829 |
+
columns: &id014
|
| 830 |
surface_form: 1
|
| 831 |
lemma: 2
|
| 832 |
pos: 3
|
| 833 |
frequency: 6
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 834 |
has_header: true
|
| 835 |
enabled: true
|
| 836 |
+
analysis_type: token
|
| 837 |
+
log_transformable:
|
| 838 |
+
- frequency
|
| 839 |
+
selectable_measures:
|
| 840 |
+
- pos
|
| 841 |
+
- frequency
|
| 842 |
+
default_measures:
|
| 843 |
+
- frequency
|
| 844 |
+
- pos
|
| 845 |
+
default_log_transforms:
|
| 846 |
+
- frequency
|
| 847 |
+
measure_classifications:
|
| 848 |
+
pos: unknown
|
| 849 |
+
frequency: frequency
|
| 850 |
japanese_corpus: true
|
| 851 |
+
CSJ_frequency_lemma:
|
| 852 |
+
display_name: CSJ Spoken - Frequency (Lemma)
|
| 853 |
+
description: CSJ raw frequency counts for spoken Japanese - lemma-based analysis
|
| 854 |
+
file: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
|
| 855 |
+
format: tsv
|
| 856 |
+
columns: *id014
|
| 857 |
+
has_header: true
|
| 858 |
+
enabled: true
|
| 859 |
+
analysis_type: lemma
|
| 860 |
+
log_transformable:
|
| 861 |
+
- frequency
|
| 862 |
+
selectable_measures:
|
| 863 |
+
- pos
|
| 864 |
+
- frequency
|
| 865 |
+
default_measures:
|
| 866 |
+
- frequency
|
| 867 |
+
- pos
|
| 868 |
+
default_log_transforms:
|
| 869 |
+
- frequency
|
| 870 |
+
measure_classifications:
|
| 871 |
+
pos: unknown
|
| 872 |
+
frequency: frequency
|
| 873 |
+
japanese_corpus: true
|
| 874 |
+
CSJ_pmw_token:
|
| 875 |
+
display_name: CSJ Spoken - Per Million Words (Token)
|
| 876 |
+
description: CSJ normalized frequency for spoken Japanese - token-based analysis
|
| 877 |
+
file: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
|
| 878 |
+
format: tsv
|
| 879 |
+
columns: &id015
|
| 880 |
surface_form: 1
|
| 881 |
lemma: 2
|
| 882 |
pos: 3
|
| 883 |
frequency: 7
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 884 |
has_header: true
|
| 885 |
enabled: true
|
| 886 |
+
analysis_type: token
|
| 887 |
+
log_transformable:
|
| 888 |
+
- frequency
|
| 889 |
+
selectable_measures:
|
| 890 |
+
- pos
|
| 891 |
+
- frequency
|
| 892 |
+
default_measures:
|
| 893 |
+
- frequency
|
| 894 |
+
- pos
|
| 895 |
+
default_log_transforms:
|
| 896 |
+
- frequency
|
| 897 |
+
measure_classifications:
|
| 898 |
+
pos: unknown
|
| 899 |
+
frequency: frequency
|
| 900 |
japanese_corpus: true
|
| 901 |
+
CSJ_pmw_lemma:
|
| 902 |
+
display_name: CSJ Spoken - Per Million Words (Lemma)
|
| 903 |
+
description: CSJ normalized frequency for spoken Japanese - lemma-based analysis
|
| 904 |
+
file: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
|
| 905 |
+
format: tsv
|
| 906 |
+
columns: *id015
|
| 907 |
+
has_header: true
|
| 908 |
+
enabled: true
|
| 909 |
+
analysis_type: lemma
|
| 910 |
+
log_transformable:
|
| 911 |
+
- frequency
|
| 912 |
+
selectable_measures:
|
| 913 |
+
- pos
|
| 914 |
+
- frequency
|
| 915 |
+
default_measures:
|
| 916 |
+
- frequency
|
| 917 |
+
- pos
|
| 918 |
+
default_log_transforms:
|
| 919 |
+
- frequency
|
| 920 |
+
measure_classifications:
|
| 921 |
+
pos: unknown
|
| 922 |
+
frequency: frequency
|
| 923 |
+
japanese_corpus: true
|
| 924 |
+
CSJ_rank_token:
|
| 925 |
+
display_name: CSJ Spoken - Frequency Rank (Token)
|
| 926 |
+
description: CSJ frequency ranking for spoken Japanese - token-based analysis
|
| 927 |
+
file: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
|
| 928 |
+
format: tsv
|
| 929 |
+
columns: &id016
|
| 930 |
surface_form: 1
|
| 931 |
lemma: 2
|
| 932 |
pos: 3
|
| 933 |
frequency: 0
|
| 934 |
+
has_header: true
|
| 935 |
+
enabled: true
|
| 936 |
+
analysis_type: token
|
| 937 |
+
log_transformable:
|
| 938 |
+
- frequency
|
| 939 |
+
selectable_measures:
|
| 940 |
+
- pos
|
| 941 |
+
- frequency
|
| 942 |
+
default_measures:
|
| 943 |
+
- frequency
|
| 944 |
+
- pos
|
| 945 |
+
default_log_transforms:
|
| 946 |
+
- frequency
|
| 947 |
+
measure_classifications:
|
| 948 |
+
pos: unknown
|
| 949 |
+
frequency: frequency
|
| 950 |
+
japanese_corpus: true
|
| 951 |
+
CSJ_rank_lemma:
|
| 952 |
+
display_name: CSJ Spoken - Frequency Rank (Lemma)
|
| 953 |
+
description: CSJ frequency ranking for spoken Japanese - lemma-based analysis
|
| 954 |
+
file: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
|
| 955 |
+
format: tsv
|
| 956 |
+
columns: *id016
|
| 957 |
+
has_header: true
|
| 958 |
+
enabled: true
|
| 959 |
+
analysis_type: lemma
|
| 960 |
+
log_transformable:
|
| 961 |
+
- frequency
|
| 962 |
+
selectable_measures:
|
| 963 |
+
- pos
|
| 964 |
+
- frequency
|
| 965 |
+
default_measures:
|
| 966 |
+
- frequency
|
| 967 |
+
- pos
|
| 968 |
+
default_log_transforms:
|
| 969 |
+
- frequency
|
| 970 |
+
measure_classifications:
|
| 971 |
+
pos: unknown
|
| 972 |
+
frequency: frequency
|
| 973 |
+
japanese_corpus: true
|
| 974 |
+
jp_frequency_token:
|
| 975 |
+
display_name: Japanese Frequency List (Token)
|
| 976 |
+
description: Frequency data for Japanese words - token-based analysis
|
| 977 |
+
file: resources/reference_lists/ja/jp_frequency_token.csv
|
| 978 |
+
format: csv
|
| 979 |
+
columns: &id017
|
| 980 |
word: 0
|
| 981 |
frequency: 1
|
| 982 |
has_header: true
|
| 983 |
+
enabled: false
|
| 984 |
+
analysis_type: token
|
| 985 |
+
log_transformable:
|
| 986 |
+
- frequency
|
| 987 |
+
selectable_measures:
|
| 988 |
+
- frequency
|
| 989 |
+
default_measures:
|
| 990 |
+
- frequency
|
| 991 |
+
default_log_transforms:
|
| 992 |
+
- frequency
|
| 993 |
+
measure_classifications:
|
| 994 |
+
frequency: frequency
|
| 995 |
+
jp_frequency_lemma:
|
| 996 |
+
display_name: Japanese Frequency List (Lemma)
|
| 997 |
+
description: Frequency data for Japanese words - lemma-based analysis
|
| 998 |
+
file: resources/reference_lists/ja/jp_frequency_lemma.csv
|
| 999 |
+
format: csv
|
| 1000 |
+
columns: *id017
|
| 1001 |
+
has_header: true
|
| 1002 |
+
enabled: false
|
| 1003 |
+
analysis_type: lemma
|
| 1004 |
+
log_transformable:
|
| 1005 |
+
- frequency
|
| 1006 |
+
selectable_measures:
|
| 1007 |
+
- frequency
|
| 1008 |
+
default_measures:
|
| 1009 |
+
- frequency
|
| 1010 |
+
default_log_transforms:
|
| 1011 |
+
- frequency
|
| 1012 |
+
measure_classifications:
|
| 1013 |
+
frequency: frequency
|
config/reference_lists.yaml.backup_20250727_220815
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration for Default Reference Lists
|
| 2 |
+
# Add new reference lists here and they'll automatically appear in the UI
|
| 3 |
+
# Structure: language -> type -> list_name -> configuration
|
| 4 |
+
|
| 5 |
+
english:
|
| 6 |
+
unigrams:
|
| 7 |
+
COCA_spoken_frequency:
|
| 8 |
+
display_name: "COCA Spoken Frequency"
|
| 9 |
+
description: "Frequency and range data from COCA spoken subcorpus"
|
| 10 |
+
files:
|
| 11 |
+
token: "resources/reference_lists/en/COCA_spoken_unigram_list.csv"
|
| 12 |
+
lemma: "resources/reference_lists/en/COCA_spoken_unigram_list.csv" # Using same file for now
|
| 13 |
+
format: "tsv"
|
| 14 |
+
columns:
|
| 15 |
+
word: 0
|
| 16 |
+
frequency: 1
|
| 17 |
+
normalized_freq: 2
|
| 18 |
+
range: 3
|
| 19 |
+
dispersion: 4
|
| 20 |
+
has_header: false
|
| 21 |
+
enabled: true
|
| 22 |
+
|
| 23 |
+
COCA_magazine_frequency:
|
| 24 |
+
display_name: "COCA Magazine Frequency"
|
| 25 |
+
description: "Frequency and range data from COCA magazine subcorpus"
|
| 26 |
+
files:
|
| 27 |
+
token: "resources/reference_lists/en/COCA_magazine_unigram_list.csv"
|
| 28 |
+
lemma: "resources/reference_lists/en/COCA_magazine_unigram_list.csv" # Using same file for now
|
| 29 |
+
format: "tsv"
|
| 30 |
+
columns:
|
| 31 |
+
word: 0
|
| 32 |
+
frequency: 1
|
| 33 |
+
normalized_freq: 2
|
| 34 |
+
range: 3
|
| 35 |
+
dispersion: 4
|
| 36 |
+
has_header: false
|
| 37 |
+
enabled: true
|
| 38 |
+
|
| 39 |
+
concreteness_ratings:
|
| 40 |
+
display_name: "Concreteness Ratings"
|
| 41 |
+
description: "Concreteness ratings for English words (1-5 scale)"
|
| 42 |
+
files:
|
| 43 |
+
token: "resources/reference_lists/en/Concreteness_Brysbaert.txt"
|
| 44 |
+
lemma: "resources/reference_lists/en/Concreteness_Brysbaert.txt"
|
| 45 |
+
format: "tsv"
|
| 46 |
+
columns:
|
| 47 |
+
word: 0
|
| 48 |
+
concreteness: 1
|
| 49 |
+
has_header: true
|
| 50 |
+
header_prefix: "#"
|
| 51 |
+
enabled: true
|
| 52 |
+
|
| 53 |
+
academic_words:
|
| 54 |
+
display_name: "Academic Word List"
|
| 55 |
+
description: "Common academic vocabulary for research writing"
|
| 56 |
+
files:
|
| 57 |
+
token: "resources/reference_lists/en/academic_words_token.csv"
|
| 58 |
+
lemma: "resources/reference_lists/en/academic_words_lemma.csv"
|
| 59 |
+
format: "csv"
|
| 60 |
+
columns:
|
| 61 |
+
word: 0
|
| 62 |
+
frequency: 1
|
| 63 |
+
has_header: true
|
| 64 |
+
enabled: false # Disabled until files exist
|
| 65 |
+
|
| 66 |
+
bigrams:
|
| 67 |
+
COCA_spoken_bigram_frequency:
|
| 68 |
+
display_name: "COCA Spoken Bigram Frequency"
|
| 69 |
+
description: "Bigram frequencies and range data"
|
| 70 |
+
files:
|
| 71 |
+
token: "resources/reference_lists/en/COCA_spoken_bigram_list.csv"
|
| 72 |
+
lemma: "resources/reference_lists/en/COCA_spoken_bigram_list.csv"
|
| 73 |
+
format: "tsv"
|
| 74 |
+
columns:
|
| 75 |
+
bigram: 0
|
| 76 |
+
frequency: 1
|
| 77 |
+
normalized_freq: 2
|
| 78 |
+
documents: 3
|
| 79 |
+
range: 4
|
| 80 |
+
has_header: false
|
| 81 |
+
enabled: true
|
| 82 |
+
|
| 83 |
+
COCA_spoken_bigram_association:
|
| 84 |
+
display_name: "COCA Spoken Bigram Associations"
|
| 85 |
+
description: "Bigram association measures (MI, T-score, Delta P)"
|
| 86 |
+
files:
|
| 87 |
+
token: "resources/reference_lists/en/spoken_bi_contingency.csv"
|
| 88 |
+
lemma: "resources/reference_lists/en/spoken_bigram_lemma_contingency.csv"
|
| 89 |
+
format: "csv"
|
| 90 |
+
columns:
|
| 91 |
+
bigram: 0
|
| 92 |
+
frequency: 1
|
| 93 |
+
mi_score: 2
|
| 94 |
+
mi_2_score: 3
|
| 95 |
+
t_score: 4
|
| 96 |
+
delta_p: 5
|
| 97 |
+
ap_collex: 6
|
| 98 |
+
has_header: true
|
| 99 |
+
enabled: true # Disabled until files exist
|
| 100 |
+
|
| 101 |
+
COCA_magazine_bigram_frequency:
|
| 102 |
+
display_name: "COCA Magazine Bigram Frequency"
|
| 103 |
+
description: "Bigram frequencies and range data in Magazine"
|
| 104 |
+
files:
|
| 105 |
+
token: "resources/reference_lists/en/COCA_magazine_bigram_list.csv"
|
| 106 |
+
lemma: "resources/reference_lists/en/COCA_spoken_bigram_list.csv"
|
| 107 |
+
format: "tsv"
|
| 108 |
+
columns:
|
| 109 |
+
bigram: 0
|
| 110 |
+
frequency: 1
|
| 111 |
+
normalized_freq: 2
|
| 112 |
+
documents: 3
|
| 113 |
+
range: 4
|
| 114 |
+
has_header: false
|
| 115 |
+
enabled: true
|
| 116 |
+
|
| 117 |
+
COCA_magazine_bigram_association:
|
| 118 |
+
display_name: "COCA Magazine Bigram Associations"
|
| 119 |
+
description: "Bigram association measures (MI, T-score, Delta P)"
|
| 120 |
+
files:
|
| 121 |
+
token: "resources/reference_lists/en/magazine_bi_contingency.csv"
|
| 122 |
+
lemma: "resources/reference_lists/en/magazine_bigram_lemma_contingency.csv"
|
| 123 |
+
format: "csv"
|
| 124 |
+
columns:
|
| 125 |
+
bigram: 0
|
| 126 |
+
frequency: 1
|
| 127 |
+
mi_score: 2
|
| 128 |
+
mi_2_score: 3
|
| 129 |
+
t_score: 4
|
| 130 |
+
delta_p: 5
|
| 131 |
+
ap_collex: 6
|
| 132 |
+
has_header: true
|
| 133 |
+
enabled: true
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
trigrams:
|
| 137 |
+
COCA_trigram_frequency:
|
| 138 |
+
display_name: "COCA Trigram Frequency"
|
| 139 |
+
description: "Trigram frequencies and range data"
|
| 140 |
+
files:
|
| 141 |
+
token: "resources/reference_lists/en/COCA_spoken_trigram_list.csv"
|
| 142 |
+
lemma: "resources/reference_lists/en/COCA_spoken_trigram_list.csv"
|
| 143 |
+
format: "tsv"
|
| 144 |
+
columns:
|
| 145 |
+
trigram: 0
|
| 146 |
+
frequency: 1
|
| 147 |
+
normalized_freq: 2
|
| 148 |
+
range: 3
|
| 149 |
+
dispersion: 4
|
| 150 |
+
has_header: false
|
| 151 |
+
enabled: true
|
| 152 |
+
|
| 153 |
+
COCA_trigram_assoc_uni_bi:
|
| 154 |
+
display_name: "COCA Trigram→Bigram Associations"
|
| 155 |
+
description: "Trigram to bigram association measures"
|
| 156 |
+
files:
|
| 157 |
+
token: "resources/reference_lists/en/spoken_tri_contingency_1.csv"
|
| 158 |
+
lemma: "resources/reference_lists/en/spoken_trigram_lemma_contingency_1.csv"
|
| 159 |
+
format: "csv"
|
| 160 |
+
columns:
|
| 161 |
+
trigram: 0
|
| 162 |
+
frequency: 1
|
| 163 |
+
mi_score: 2
|
| 164 |
+
mi_2_score: 3
|
| 165 |
+
t_score: 4
|
| 166 |
+
delta_p: 5
|
| 167 |
+
ap_collex: 6
|
| 168 |
+
has_header: true
|
| 169 |
+
enabled: true
|
| 170 |
+
|
| 171 |
+
COCA_trigram_assoc_bi_uni:
|
| 172 |
+
display_name: "COCA Trigram→Unigram Associations"
|
| 173 |
+
description: "Trigram to unigram association measures"
|
| 174 |
+
files:
|
| 175 |
+
token: "resources/reference_lists/en/spoken_tri_contingency_2.csv"
|
| 176 |
+
lemma: "resources/reference_lists/en/spoken_trigram_lemma_contingency_2.csv"
|
| 177 |
+
format: "csv"
|
| 178 |
+
columns:
|
| 179 |
+
trigram: 0
|
| 180 |
+
frequency: 1
|
| 181 |
+
mi_score: 2
|
| 182 |
+
mi_2_score: 3
|
| 183 |
+
t_score: 4
|
| 184 |
+
delta_p: 5
|
| 185 |
+
ap_collex: 6
|
| 186 |
+
has_header: true
|
| 187 |
+
enabled: true
|
| 188 |
+
|
| 189 |
+
japanese:
|
| 190 |
+
unigrams:
|
| 191 |
+
BCCWJ_frequency:
|
| 192 |
+
display_name: "BCCWJ Written - Frequency"
|
| 193 |
+
description: "BCCWJ raw frequency counts for written Japanese"
|
| 194 |
+
files:
|
| 195 |
+
token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
|
| 196 |
+
lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
|
| 197 |
+
format: "tsv"
|
| 198 |
+
has_header: true
|
| 199 |
+
enabled: true
|
| 200 |
+
japanese_corpus: true
|
| 201 |
+
columns:
|
| 202 |
+
surface_form: 1 # lForm
|
| 203 |
+
lemma: 2 # lemma
|
| 204 |
+
pos: 3 # pos
|
| 205 |
+
frequency: 6 # primary measure column
|
| 206 |
+
|
| 207 |
+
BCCWJ_pmw:
|
| 208 |
+
display_name: "BCCWJ Written - Per Million Words"
|
| 209 |
+
description: "BCCWJ normalized frequency for written Japanese"
|
| 210 |
+
files:
|
| 211 |
+
token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
|
| 212 |
+
lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
|
| 213 |
+
format: "tsv"
|
| 214 |
+
has_header: true
|
| 215 |
+
enabled: true
|
| 216 |
+
japanese_corpus: true
|
| 217 |
+
columns:
|
| 218 |
+
surface_form: 1
|
| 219 |
+
lemma: 2
|
| 220 |
+
pos: 3
|
| 221 |
+
frequency: 7 # pmw column
|
| 222 |
+
|
| 223 |
+
BCCWJ_rank:
|
| 224 |
+
display_name: "BCCWJ Written - Frequency Rank"
|
| 225 |
+
description: "BCCWJ frequency ranking for written Japanese"
|
| 226 |
+
files:
|
| 227 |
+
token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
|
| 228 |
+
lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
|
| 229 |
+
format: "tsv"
|
| 230 |
+
has_header: true
|
| 231 |
+
enabled: true
|
| 232 |
+
japanese_corpus: true
|
| 233 |
+
columns:
|
| 234 |
+
surface_form: 1
|
| 235 |
+
lemma: 2
|
| 236 |
+
pos: 3
|
| 237 |
+
frequency: 0 # rank column
|
| 238 |
+
|
| 239 |
+
CSJ_frequency:
|
| 240 |
+
display_name: "CSJ Spoken - Frequency"
|
| 241 |
+
description: "CSJ raw frequency counts for spoken Japanese"
|
| 242 |
+
files:
|
| 243 |
+
token: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
|
| 244 |
+
lemma: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
|
| 245 |
+
format: "tsv"
|
| 246 |
+
has_header: true
|
| 247 |
+
enabled: true
|
| 248 |
+
japanese_corpus: true
|
| 249 |
+
columns:
|
| 250 |
+
surface_form: 1
|
| 251 |
+
lemma: 2
|
| 252 |
+
pos: 3
|
| 253 |
+
frequency: 6
|
| 254 |
+
|
| 255 |
+
CSJ_pmw:
|
| 256 |
+
display_name: "CSJ Spoken - Per Million Words"
|
| 257 |
+
description: "CSJ normalized frequency for spoken Japanese"
|
| 258 |
+
files:
|
| 259 |
+
token: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
|
| 260 |
+
lemma: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
|
| 261 |
+
format: "tsv"
|
| 262 |
+
has_header: true
|
| 263 |
+
enabled: true
|
| 264 |
+
japanese_corpus: true
|
| 265 |
+
columns:
|
| 266 |
+
surface_form: 1
|
| 267 |
+
lemma: 2
|
| 268 |
+
pos: 3
|
| 269 |
+
frequency: 7
|
| 270 |
+
|
| 271 |
+
CSJ_rank:
|
| 272 |
+
display_name: "CSJ Spoken - Frequency Rank"
|
| 273 |
+
description: "CSJ frequency ranking for spoken Japanese"
|
| 274 |
+
files:
|
| 275 |
+
token: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
|
| 276 |
+
lemma: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
|
| 277 |
+
format: "tsv"
|
| 278 |
+
has_header: true
|
| 279 |
+
enabled: true
|
| 280 |
+
japanese_corpus: true
|
| 281 |
+
columns:
|
| 282 |
+
surface_form: 1
|
| 283 |
+
lemma: 2
|
| 284 |
+
pos: 3
|
| 285 |
+
frequency: 0
|
| 286 |
+
|
| 287 |
+
jp_frequency:
|
| 288 |
+
display_name: "Japanese Frequency List"
|
| 289 |
+
description: "Frequency data for Japanese words"
|
| 290 |
+
files:
|
| 291 |
+
token: "resources/reference_lists/ja/jp_frequency_token.csv"
|
| 292 |
+
lemma: "resources/reference_lists/ja/jp_frequency_lemma.csv"
|
| 293 |
+
format: "csv"
|
| 294 |
+
columns:
|
| 295 |
+
word: 0
|
| 296 |
+
frequency: 1
|
| 297 |
+
has_header: true
|
| 298 |
+
enabled: false # Disabled until files exist
|
| 299 |
+
|
| 300 |
+
# bigrams: {}
|
| 301 |
+
# trigrams: {}
|
config/reference_lists.yaml.backup_20250727_230913
ADDED
|
@@ -0,0 +1,319 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
english:
|
| 2 |
+
unigrams:
|
| 3 |
+
COCA_magazine_frequency:
|
| 4 |
+
display_name: COCA Magazine Frequency
|
| 5 |
+
description: Frequency and range data from COCA magazine subcorpus
|
| 6 |
+
files:
|
| 7 |
+
token: resources/reference_lists/en/COCA_magazine_unigram_list.csv
|
| 8 |
+
lemma: resources/reference_lists/en/COCA_magazine_unigram_list.csv
|
| 9 |
+
format: tsv
|
| 10 |
+
columns:
|
| 11 |
+
word: 0
|
| 12 |
+
frequency: 1
|
| 13 |
+
normalized_freq: 2
|
| 14 |
+
range: 3
|
| 15 |
+
dispersion: 4
|
| 16 |
+
has_header: false
|
| 17 |
+
enabled: true
|
| 18 |
+
concreteness_ratings:
|
| 19 |
+
display_name: Concreteness Ratings
|
| 20 |
+
description: Concreteness ratings for English words (1-5 scale)
|
| 21 |
+
files:
|
| 22 |
+
token: resources/reference_lists/en/Concreteness_Brysbaert.txt
|
| 23 |
+
lemma: resources/reference_lists/en/Concreteness_Brysbaert.txt
|
| 24 |
+
format: tsv
|
| 25 |
+
columns:
|
| 26 |
+
word: 0
|
| 27 |
+
concreteness: 1
|
| 28 |
+
has_header: true
|
| 29 |
+
header_prefix: '#'
|
| 30 |
+
enabled: true
|
| 31 |
+
academic_words:
|
| 32 |
+
display_name: Academic Word List
|
| 33 |
+
description: Common academic vocabulary for research writing
|
| 34 |
+
files:
|
| 35 |
+
token: resources/reference_lists/en/academic_words_token.csv
|
| 36 |
+
lemma: resources/reference_lists/en/academic_words_lemma.csv
|
| 37 |
+
format: csv
|
| 38 |
+
columns:
|
| 39 |
+
word: 0
|
| 40 |
+
frequency: 1
|
| 41 |
+
has_header: true
|
| 42 |
+
enabled: false
|
| 43 |
+
COCA_spoken_frequency_token:
|
| 44 |
+
display_name: COCA Spoken Frequency (Token)
|
| 45 |
+
description: Frequency and range data from COCA spoken subcorpus - token-based
|
| 46 |
+
analysis
|
| 47 |
+
file: resources/reference_lists/en/COCA_spoken_unigram_list.csv
|
| 48 |
+
format: tsv
|
| 49 |
+
columns:
|
| 50 |
+
word: 0
|
| 51 |
+
frequency: 1
|
| 52 |
+
normalized_freq: 2
|
| 53 |
+
range: 3
|
| 54 |
+
dispersion: 4
|
| 55 |
+
has_header: false
|
| 56 |
+
enabled: true
|
| 57 |
+
analysis_type: token
|
| 58 |
+
log_transformable:
|
| 59 |
+
- frequency
|
| 60 |
+
- normalized_freq
|
| 61 |
+
selectable_measures:
|
| 62 |
+
- frequency
|
| 63 |
+
- normalized_freq
|
| 64 |
+
- range
|
| 65 |
+
- dispersion
|
| 66 |
+
default_measures:
|
| 67 |
+
- frequency
|
| 68 |
+
- normalized_freq
|
| 69 |
+
default_log_transforms:
|
| 70 |
+
- frequency
|
| 71 |
+
- normalized_freq
|
| 72 |
+
COCA_spoken_frequency_lemma:
|
| 73 |
+
display_name: COCA Spoken Frequency (Lemma)
|
| 74 |
+
description: Frequency and range data from COCA spoken subcorpus - lemma-based
|
| 75 |
+
analysis
|
| 76 |
+
file: resources/reference_lists/en/COCA_spoken_unigram_list.csv
|
| 77 |
+
format: tsv
|
| 78 |
+
columns:
|
| 79 |
+
word: 0
|
| 80 |
+
frequency: 1
|
| 81 |
+
normalized_freq: 2
|
| 82 |
+
range: 3
|
| 83 |
+
dispersion: 4
|
| 84 |
+
has_header: false
|
| 85 |
+
enabled: true
|
| 86 |
+
analysis_type: lemma
|
| 87 |
+
log_transformable:
|
| 88 |
+
- frequency
|
| 89 |
+
- normalized_freq
|
| 90 |
+
selectable_measures:
|
| 91 |
+
- frequency
|
| 92 |
+
- normalized_freq
|
| 93 |
+
- range
|
| 94 |
+
- dispersion
|
| 95 |
+
default_measures:
|
| 96 |
+
- frequency
|
| 97 |
+
- normalized_freq
|
| 98 |
+
default_log_transforms:
|
| 99 |
+
- frequency
|
| 100 |
+
- normalized_freq
|
| 101 |
+
bigrams:
|
| 102 |
+
COCA_spoken_bigram_frequency:
|
| 103 |
+
display_name: COCA Spoken Bigram Frequency
|
| 104 |
+
description: Bigram frequencies and range data
|
| 105 |
+
files:
|
| 106 |
+
token: resources/reference_lists/en/COCA_spoken_bigram_list.csv
|
| 107 |
+
lemma: resources/reference_lists/en/COCA_spoken_bigram_list.csv
|
| 108 |
+
format: tsv
|
| 109 |
+
columns:
|
| 110 |
+
bigram: 0
|
| 111 |
+
frequency: 1
|
| 112 |
+
normalized_freq: 2
|
| 113 |
+
documents: 3
|
| 114 |
+
range: 4
|
| 115 |
+
has_header: false
|
| 116 |
+
enabled: true
|
| 117 |
+
COCA_spoken_bigram_association:
|
| 118 |
+
display_name: COCA Spoken Bigram Associations
|
| 119 |
+
description: Bigram association measures (MI, T-score, Delta P)
|
| 120 |
+
files:
|
| 121 |
+
token: resources/reference_lists/en/spoken_bi_contingency.csv
|
| 122 |
+
lemma: resources/reference_lists/en/spoken_bigram_lemma_contingency.csv
|
| 123 |
+
format: csv
|
| 124 |
+
columns:
|
| 125 |
+
bigram: 0
|
| 126 |
+
frequency: 1
|
| 127 |
+
mi_score: 2
|
| 128 |
+
mi_2_score: 3
|
| 129 |
+
t_score: 4
|
| 130 |
+
delta_p: 5
|
| 131 |
+
ap_collex: 6
|
| 132 |
+
has_header: true
|
| 133 |
+
enabled: true
|
| 134 |
+
COCA_magazine_bigram_frequency:
|
| 135 |
+
display_name: COCA Magazine Bigram Frequency
|
| 136 |
+
description: Bigram frequencies and range data in Magazine
|
| 137 |
+
files:
|
| 138 |
+
token: resources/reference_lists/en/COCA_magazine_bigram_list.csv
|
| 139 |
+
lemma: resources/reference_lists/en/COCA_spoken_bigram_list.csv
|
| 140 |
+
format: tsv
|
| 141 |
+
columns:
|
| 142 |
+
bigram: 0
|
| 143 |
+
frequency: 1
|
| 144 |
+
normalized_freq: 2
|
| 145 |
+
documents: 3
|
| 146 |
+
range: 4
|
| 147 |
+
has_header: false
|
| 148 |
+
enabled: true
|
| 149 |
+
COCA_magazine_bigram_association:
|
| 150 |
+
display_name: COCA Magazine Bigram Associations
|
| 151 |
+
description: Bigram association measures (MI, T-score, Delta P)
|
| 152 |
+
files:
|
| 153 |
+
token: resources/reference_lists/en/magazine_bi_contingency.csv
|
| 154 |
+
lemma: resources/reference_lists/en/magazine_bigram_lemma_contingency.csv
|
| 155 |
+
format: csv
|
| 156 |
+
columns:
|
| 157 |
+
bigram: 0
|
| 158 |
+
frequency: 1
|
| 159 |
+
mi_score: 2
|
| 160 |
+
mi_2_score: 3
|
| 161 |
+
t_score: 4
|
| 162 |
+
delta_p: 5
|
| 163 |
+
ap_collex: 6
|
| 164 |
+
has_header: true
|
| 165 |
+
enabled: true
|
| 166 |
+
trigrams:
|
| 167 |
+
COCA_trigram_frequency:
|
| 168 |
+
display_name: COCA Trigram Frequency
|
| 169 |
+
description: Trigram frequencies and range data
|
| 170 |
+
files:
|
| 171 |
+
token: resources/reference_lists/en/COCA_spoken_trigram_list.csv
|
| 172 |
+
lemma: resources/reference_lists/en/COCA_spoken_trigram_list.csv
|
| 173 |
+
format: tsv
|
| 174 |
+
columns:
|
| 175 |
+
trigram: 0
|
| 176 |
+
frequency: 1
|
| 177 |
+
normalized_freq: 2
|
| 178 |
+
range: 3
|
| 179 |
+
dispersion: 4
|
| 180 |
+
has_header: false
|
| 181 |
+
enabled: true
|
| 182 |
+
COCA_trigram_assoc_uni_bi:
|
| 183 |
+
display_name: COCA Trigram→Bigram Associations
|
| 184 |
+
description: Trigram to bigram association measures
|
| 185 |
+
files:
|
| 186 |
+
token: resources/reference_lists/en/spoken_tri_contingency_1.csv
|
| 187 |
+
lemma: resources/reference_lists/en/spoken_trigram_lemma_contingency_1.csv
|
| 188 |
+
format: csv
|
| 189 |
+
columns:
|
| 190 |
+
trigram: 0
|
| 191 |
+
frequency: 1
|
| 192 |
+
mi_score: 2
|
| 193 |
+
mi_2_score: 3
|
| 194 |
+
t_score: 4
|
| 195 |
+
delta_p: 5
|
| 196 |
+
ap_collex: 6
|
| 197 |
+
has_header: true
|
| 198 |
+
enabled: true
|
| 199 |
+
COCA_trigram_assoc_bi_uni:
|
| 200 |
+
display_name: COCA Trigram→Unigram Associations
|
| 201 |
+
description: Trigram to unigram association measures
|
| 202 |
+
files:
|
| 203 |
+
token: resources/reference_lists/en/spoken_tri_contingency_2.csv
|
| 204 |
+
lemma: resources/reference_lists/en/spoken_trigram_lemma_contingency_2.csv
|
| 205 |
+
format: csv
|
| 206 |
+
columns:
|
| 207 |
+
trigram: 0
|
| 208 |
+
frequency: 1
|
| 209 |
+
mi_score: 2
|
| 210 |
+
mi_2_score: 3
|
| 211 |
+
t_score: 4
|
| 212 |
+
delta_p: 5
|
| 213 |
+
ap_collex: 6
|
| 214 |
+
has_header: true
|
| 215 |
+
enabled: true
|
| 216 |
+
japanese:
|
| 217 |
+
unigrams:
|
| 218 |
+
BCCWJ_frequency:
|
| 219 |
+
display_name: BCCWJ Written - Frequency
|
| 220 |
+
description: BCCWJ raw frequency counts for written Japanese
|
| 221 |
+
files:
|
| 222 |
+
token: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
|
| 223 |
+
lemma: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
|
| 224 |
+
format: tsv
|
| 225 |
+
has_header: true
|
| 226 |
+
enabled: true
|
| 227 |
+
japanese_corpus: true
|
| 228 |
+
columns:
|
| 229 |
+
surface_form: 1
|
| 230 |
+
lemma: 2
|
| 231 |
+
pos: 3
|
| 232 |
+
frequency: 6
|
| 233 |
+
BCCWJ_pmw:
|
| 234 |
+
display_name: BCCWJ Written - Per Million Words
|
| 235 |
+
description: BCCWJ normalized frequency for written Japanese
|
| 236 |
+
files:
|
| 237 |
+
token: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
|
| 238 |
+
lemma: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
|
| 239 |
+
format: tsv
|
| 240 |
+
has_header: true
|
| 241 |
+
enabled: true
|
| 242 |
+
japanese_corpus: true
|
| 243 |
+
columns:
|
| 244 |
+
surface_form: 1
|
| 245 |
+
lemma: 2
|
| 246 |
+
pos: 3
|
| 247 |
+
frequency: 7
|
| 248 |
+
BCCWJ_rank:
|
| 249 |
+
display_name: BCCWJ Written - Frequency Rank
|
| 250 |
+
description: BCCWJ frequency ranking for written Japanese
|
| 251 |
+
files:
|
| 252 |
+
token: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
|
| 253 |
+
lemma: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
|
| 254 |
+
format: tsv
|
| 255 |
+
has_header: true
|
| 256 |
+
enabled: true
|
| 257 |
+
japanese_corpus: true
|
| 258 |
+
columns:
|
| 259 |
+
surface_form: 1
|
| 260 |
+
lemma: 2
|
| 261 |
+
pos: 3
|
| 262 |
+
frequency: 0
|
| 263 |
+
CSJ_frequency:
|
| 264 |
+
display_name: CSJ Spoken - Frequency
|
| 265 |
+
description: CSJ raw frequency counts for spoken Japanese
|
| 266 |
+
files:
|
| 267 |
+
token: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
|
| 268 |
+
lemma: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
|
| 269 |
+
format: tsv
|
| 270 |
+
has_header: true
|
| 271 |
+
enabled: true
|
| 272 |
+
japanese_corpus: true
|
| 273 |
+
columns:
|
| 274 |
+
surface_form: 1
|
| 275 |
+
lemma: 2
|
| 276 |
+
pos: 3
|
| 277 |
+
frequency: 6
|
| 278 |
+
CSJ_pmw:
|
| 279 |
+
display_name: CSJ Spoken - Per Million Words
|
| 280 |
+
description: CSJ normalized frequency for spoken Japanese
|
| 281 |
+
files:
|
| 282 |
+
token: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
|
| 283 |
+
lemma: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
|
| 284 |
+
format: tsv
|
| 285 |
+
has_header: true
|
| 286 |
+
enabled: true
|
| 287 |
+
japanese_corpus: true
|
| 288 |
+
columns:
|
| 289 |
+
surface_form: 1
|
| 290 |
+
lemma: 2
|
| 291 |
+
pos: 3
|
| 292 |
+
frequency: 7
|
| 293 |
+
CSJ_rank:
|
| 294 |
+
display_name: CSJ Spoken - Frequency Rank
|
| 295 |
+
description: CSJ frequency ranking for spoken Japanese
|
| 296 |
+
files:
|
| 297 |
+
token: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
|
| 298 |
+
lemma: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
|
| 299 |
+
format: tsv
|
| 300 |
+
has_header: true
|
| 301 |
+
enabled: true
|
| 302 |
+
japanese_corpus: true
|
| 303 |
+
columns:
|
| 304 |
+
surface_form: 1
|
| 305 |
+
lemma: 2
|
| 306 |
+
pos: 3
|
| 307 |
+
frequency: 0
|
| 308 |
+
jp_frequency:
|
| 309 |
+
display_name: Japanese Frequency List
|
| 310 |
+
description: Frequency data for Japanese words
|
| 311 |
+
files:
|
| 312 |
+
token: resources/reference_lists/ja/jp_frequency_token.csv
|
| 313 |
+
lemma: resources/reference_lists/ja/jp_frequency_lemma.csv
|
| 314 |
+
format: csv
|
| 315 |
+
columns:
|
| 316 |
+
word: 0
|
| 317 |
+
frequency: 1
|
| 318 |
+
has_header: true
|
| 319 |
+
enabled: false
|
config/reference_lists.yaml.backup_20250727_231728
ADDED
|
@@ -0,0 +1,319 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
english:
|
| 2 |
+
unigrams:
|
| 3 |
+
COCA_magazine_frequency:
|
| 4 |
+
display_name: COCA Magazine Frequency
|
| 5 |
+
description: Frequency and range data from COCA magazine subcorpus
|
| 6 |
+
files:
|
| 7 |
+
token: resources/reference_lists/en/COCA_magazine_unigram_list.csv
|
| 8 |
+
lemma: resources/reference_lists/en/COCA_magazine_unigram_list.csv
|
| 9 |
+
format: tsv
|
| 10 |
+
columns:
|
| 11 |
+
word: 0
|
| 12 |
+
frequency: 1
|
| 13 |
+
normalized_freq: 2
|
| 14 |
+
range: 3
|
| 15 |
+
dispersion: 4
|
| 16 |
+
has_header: false
|
| 17 |
+
enabled: true
|
| 18 |
+
concreteness_ratings:
|
| 19 |
+
display_name: Concreteness Ratings
|
| 20 |
+
description: Concreteness ratings for English words (1-5 scale)
|
| 21 |
+
files:
|
| 22 |
+
token: resources/reference_lists/en/Concreteness_Brysbaert.txt
|
| 23 |
+
lemma: resources/reference_lists/en/Concreteness_Brysbaert.txt
|
| 24 |
+
format: tsv
|
| 25 |
+
columns:
|
| 26 |
+
word: 0
|
| 27 |
+
concreteness: 1
|
| 28 |
+
has_header: true
|
| 29 |
+
header_prefix: '#'
|
| 30 |
+
enabled: true
|
| 31 |
+
academic_words:
|
| 32 |
+
display_name: Academic Word List
|
| 33 |
+
description: Common academic vocabulary for research writing
|
| 34 |
+
files:
|
| 35 |
+
token: resources/reference_lists/en/academic_words_token.csv
|
| 36 |
+
lemma: resources/reference_lists/en/academic_words_lemma.csv
|
| 37 |
+
format: csv
|
| 38 |
+
columns:
|
| 39 |
+
word: 0
|
| 40 |
+
frequency: 1
|
| 41 |
+
has_header: true
|
| 42 |
+
enabled: false
|
| 43 |
+
COCA_spoken_frequency_token:
|
| 44 |
+
display_name: COCA Spoken Frequency (Token)
|
| 45 |
+
description: Frequency and range data from COCA spoken subcorpus - token-based
|
| 46 |
+
analysis
|
| 47 |
+
file: resources/reference_lists/en/COCA_spoken_unigram_list.csv
|
| 48 |
+
format: tsv
|
| 49 |
+
columns:
|
| 50 |
+
word: 0
|
| 51 |
+
frequency: 1
|
| 52 |
+
normalized_freq: 2
|
| 53 |
+
range: 3
|
| 54 |
+
dispersion: 4
|
| 55 |
+
has_header: false
|
| 56 |
+
enabled: true
|
| 57 |
+
analysis_type: token
|
| 58 |
+
log_transformable:
|
| 59 |
+
- frequency
|
| 60 |
+
- normalized_freq
|
| 61 |
+
selectable_measures:
|
| 62 |
+
- frequency
|
| 63 |
+
- normalized_freq
|
| 64 |
+
- range
|
| 65 |
+
- dispersion
|
| 66 |
+
default_measures:
|
| 67 |
+
- frequency
|
| 68 |
+
- normalized_freq
|
| 69 |
+
default_log_transforms:
|
| 70 |
+
- frequency
|
| 71 |
+
- normalized_freq
|
| 72 |
+
COCA_spoken_frequency_lemma:
|
| 73 |
+
display_name: COCA Spoken Frequency (Lemma)
|
| 74 |
+
description: Frequency and range data from COCA spoken subcorpus - lemma-based
|
| 75 |
+
analysis
|
| 76 |
+
file: resources/reference_lists/en/COCA_spoken_unigram_list.csv
|
| 77 |
+
format: tsv
|
| 78 |
+
columns:
|
| 79 |
+
word: 0
|
| 80 |
+
frequency: 1
|
| 81 |
+
normalized_freq: 2
|
| 82 |
+
range: 3
|
| 83 |
+
dispersion: 4
|
| 84 |
+
has_header: false
|
| 85 |
+
enabled: true
|
| 86 |
+
analysis_type: lemma
|
| 87 |
+
log_transformable:
|
| 88 |
+
- frequency
|
| 89 |
+
- normalized_freq
|
| 90 |
+
selectable_measures:
|
| 91 |
+
- frequency
|
| 92 |
+
- normalized_freq
|
| 93 |
+
- range
|
| 94 |
+
- dispersion
|
| 95 |
+
default_measures:
|
| 96 |
+
- frequency
|
| 97 |
+
- normalized_freq
|
| 98 |
+
default_log_transforms:
|
| 99 |
+
- frequency
|
| 100 |
+
- normalized_freq
|
| 101 |
+
bigrams:
|
| 102 |
+
COCA_spoken_bigram_frequency:
|
| 103 |
+
display_name: COCA Spoken Bigram Frequency
|
| 104 |
+
description: Bigram frequencies and range data
|
| 105 |
+
files:
|
| 106 |
+
token: resources/reference_lists/en/COCA_spoken_bigram_list.csv
|
| 107 |
+
lemma: resources/reference_lists/en/COCA_spoken_bigram_list.csv
|
| 108 |
+
format: tsv
|
| 109 |
+
columns:
|
| 110 |
+
bigram: 0
|
| 111 |
+
frequency: 1
|
| 112 |
+
normalized_freq: 2
|
| 113 |
+
documents: 3
|
| 114 |
+
range: 4
|
| 115 |
+
has_header: false
|
| 116 |
+
enabled: true
|
| 117 |
+
COCA_spoken_bigram_association:
|
| 118 |
+
display_name: COCA Spoken Bigram Associations
|
| 119 |
+
description: Bigram association measures (MI, T-score, Delta P)
|
| 120 |
+
files:
|
| 121 |
+
token: resources/reference_lists/en/spoken_bi_contingency.csv
|
| 122 |
+
lemma: resources/reference_lists/en/spoken_bigram_lemma_contingency.csv
|
| 123 |
+
format: csv
|
| 124 |
+
columns:
|
| 125 |
+
bigram: 0
|
| 126 |
+
frequency: 1
|
| 127 |
+
mi_score: 2
|
| 128 |
+
mi_2_score: 3
|
| 129 |
+
t_score: 4
|
| 130 |
+
delta_p: 5
|
| 131 |
+
ap_collex: 6
|
| 132 |
+
has_header: true
|
| 133 |
+
enabled: true
|
| 134 |
+
COCA_magazine_bigram_frequency:
|
| 135 |
+
display_name: COCA Magazine Bigram Frequency
|
| 136 |
+
description: Bigram frequencies and range data in Magazine
|
| 137 |
+
files:
|
| 138 |
+
token: resources/reference_lists/en/COCA_magazine_bigram_list.csv
|
| 139 |
+
lemma: resources/reference_lists/en/COCA_spoken_bigram_list.csv
|
| 140 |
+
format: tsv
|
| 141 |
+
columns:
|
| 142 |
+
bigram: 0
|
| 143 |
+
frequency: 1
|
| 144 |
+
normalized_freq: 2
|
| 145 |
+
documents: 3
|
| 146 |
+
range: 4
|
| 147 |
+
has_header: false
|
| 148 |
+
enabled: true
|
| 149 |
+
COCA_magazine_bigram_association:
|
| 150 |
+
display_name: COCA Magazine Bigram Associations
|
| 151 |
+
description: Bigram association measures (MI, T-score, Delta P)
|
| 152 |
+
files:
|
| 153 |
+
token: resources/reference_lists/en/magazine_bi_contingency.csv
|
| 154 |
+
lemma: resources/reference_lists/en/magazine_bigram_lemma_contingency.csv
|
| 155 |
+
format: csv
|
| 156 |
+
columns:
|
| 157 |
+
bigram: 0
|
| 158 |
+
frequency: 1
|
| 159 |
+
mi_score: 2
|
| 160 |
+
mi_2_score: 3
|
| 161 |
+
t_score: 4
|
| 162 |
+
delta_p: 5
|
| 163 |
+
ap_collex: 6
|
| 164 |
+
has_header: true
|
| 165 |
+
enabled: true
|
| 166 |
+
trigrams:
|
| 167 |
+
COCA_trigram_frequency:
|
| 168 |
+
display_name: COCA Trigram Frequency
|
| 169 |
+
description: Trigram frequencies and range data
|
| 170 |
+
files:
|
| 171 |
+
token: resources/reference_lists/en/COCA_spoken_trigram_list.csv
|
| 172 |
+
lemma: resources/reference_lists/en/COCA_spoken_trigram_list.csv
|
| 173 |
+
format: tsv
|
| 174 |
+
columns:
|
| 175 |
+
trigram: 0
|
| 176 |
+
frequency: 1
|
| 177 |
+
normalized_freq: 2
|
| 178 |
+
range: 3
|
| 179 |
+
dispersion: 4
|
| 180 |
+
has_header: false
|
| 181 |
+
enabled: true
|
| 182 |
+
COCA_trigram_assoc_uni_bi:
|
| 183 |
+
display_name: COCA Trigram→Bigram Associations
|
| 184 |
+
description: Trigram to bigram association measures
|
| 185 |
+
files:
|
| 186 |
+
token: resources/reference_lists/en/spoken_tri_contingency_1.csv
|
| 187 |
+
lemma: resources/reference_lists/en/spoken_trigram_lemma_contingency_1.csv
|
| 188 |
+
format: csv
|
| 189 |
+
columns:
|
| 190 |
+
trigram: 0
|
| 191 |
+
frequency: 1
|
| 192 |
+
mi_score: 2
|
| 193 |
+
mi_2_score: 3
|
| 194 |
+
t_score: 4
|
| 195 |
+
delta_p: 5
|
| 196 |
+
ap_collex: 6
|
| 197 |
+
has_header: true
|
| 198 |
+
enabled: true
|
| 199 |
+
COCA_trigram_assoc_bi_uni:
|
| 200 |
+
display_name: COCA Trigram→Unigram Associations
|
| 201 |
+
description: Trigram to unigram association measures
|
| 202 |
+
files:
|
| 203 |
+
token: resources/reference_lists/en/spoken_tri_contingency_2.csv
|
| 204 |
+
lemma: resources/reference_lists/en/spoken_trigram_lemma_contingency_2.csv
|
| 205 |
+
format: csv
|
| 206 |
+
columns:
|
| 207 |
+
trigram: 0
|
| 208 |
+
frequency: 1
|
| 209 |
+
mi_score: 2
|
| 210 |
+
mi_2_score: 3
|
| 211 |
+
t_score: 4
|
| 212 |
+
delta_p: 5
|
| 213 |
+
ap_collex: 6
|
| 214 |
+
has_header: true
|
| 215 |
+
enabled: true
|
| 216 |
+
japanese:
|
| 217 |
+
unigrams:
|
| 218 |
+
BCCWJ_frequency:
|
| 219 |
+
display_name: BCCWJ Written - Frequency
|
| 220 |
+
description: BCCWJ raw frequency counts for written Japanese
|
| 221 |
+
files:
|
| 222 |
+
token: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
|
| 223 |
+
lemma: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
|
| 224 |
+
format: tsv
|
| 225 |
+
has_header: true
|
| 226 |
+
enabled: true
|
| 227 |
+
japanese_corpus: true
|
| 228 |
+
columns:
|
| 229 |
+
surface_form: 1
|
| 230 |
+
lemma: 2
|
| 231 |
+
pos: 3
|
| 232 |
+
frequency: 6
|
| 233 |
+
BCCWJ_pmw:
|
| 234 |
+
display_name: BCCWJ Written - Per Million Words
|
| 235 |
+
description: BCCWJ normalized frequency for written Japanese
|
| 236 |
+
files:
|
| 237 |
+
token: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
|
| 238 |
+
lemma: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
|
| 239 |
+
format: tsv
|
| 240 |
+
has_header: true
|
| 241 |
+
enabled: true
|
| 242 |
+
japanese_corpus: true
|
| 243 |
+
columns:
|
| 244 |
+
surface_form: 1
|
| 245 |
+
lemma: 2
|
| 246 |
+
pos: 3
|
| 247 |
+
frequency: 7
|
| 248 |
+
BCCWJ_rank:
|
| 249 |
+
display_name: BCCWJ Written - Frequency Rank
|
| 250 |
+
description: BCCWJ frequency ranking for written Japanese
|
| 251 |
+
files:
|
| 252 |
+
token: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
|
| 253 |
+
lemma: resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
|
| 254 |
+
format: tsv
|
| 255 |
+
has_header: true
|
| 256 |
+
enabled: true
|
| 257 |
+
japanese_corpus: true
|
| 258 |
+
columns:
|
| 259 |
+
surface_form: 1
|
| 260 |
+
lemma: 2
|
| 261 |
+
pos: 3
|
| 262 |
+
frequency: 0
|
| 263 |
+
CSJ_frequency:
|
| 264 |
+
display_name: CSJ Spoken - Frequency
|
| 265 |
+
description: CSJ raw frequency counts for spoken Japanese
|
| 266 |
+
files:
|
| 267 |
+
token: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
|
| 268 |
+
lemma: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
|
| 269 |
+
format: tsv
|
| 270 |
+
has_header: true
|
| 271 |
+
enabled: true
|
| 272 |
+
japanese_corpus: true
|
| 273 |
+
columns:
|
| 274 |
+
surface_form: 1
|
| 275 |
+
lemma: 2
|
| 276 |
+
pos: 3
|
| 277 |
+
frequency: 6
|
| 278 |
+
CSJ_pmw:
|
| 279 |
+
display_name: CSJ Spoken - Per Million Words
|
| 280 |
+
description: CSJ normalized frequency for spoken Japanese
|
| 281 |
+
files:
|
| 282 |
+
token: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
|
| 283 |
+
lemma: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
|
| 284 |
+
format: tsv
|
| 285 |
+
has_header: true
|
| 286 |
+
enabled: true
|
| 287 |
+
japanese_corpus: true
|
| 288 |
+
columns:
|
| 289 |
+
surface_form: 1
|
| 290 |
+
lemma: 2
|
| 291 |
+
pos: 3
|
| 292 |
+
frequency: 7
|
| 293 |
+
CSJ_rank:
|
| 294 |
+
display_name: CSJ Spoken - Frequency Rank
|
| 295 |
+
description: CSJ frequency ranking for spoken Japanese
|
| 296 |
+
files:
|
| 297 |
+
token: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
|
| 298 |
+
lemma: resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
|
| 299 |
+
format: tsv
|
| 300 |
+
has_header: true
|
| 301 |
+
enabled: true
|
| 302 |
+
japanese_corpus: true
|
| 303 |
+
columns:
|
| 304 |
+
surface_form: 1
|
| 305 |
+
lemma: 2
|
| 306 |
+
pos: 3
|
| 307 |
+
frequency: 0
|
| 308 |
+
jp_frequency:
|
| 309 |
+
display_name: Japanese Frequency List
|
| 310 |
+
description: Frequency data for Japanese words
|
| 311 |
+
files:
|
| 312 |
+
token: resources/reference_lists/ja/jp_frequency_token.csv
|
| 313 |
+
lemma: resources/reference_lists/ja/jp_frequency_lemma.csv
|
| 314 |
+
format: csv
|
| 315 |
+
columns:
|
| 316 |
+
word: 0
|
| 317 |
+
frequency: 1
|
| 318 |
+
has_header: true
|
| 319 |
+
enabled: false
|
text_analyzer/lexical_sophistication.py
CHANGED
|
@@ -484,16 +484,69 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
|
|
| 484 |
|
| 485 |
return score
|
| 486 |
|
| 487 |
-
def
|
| 488 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 489 |
"""
|
| 490 |
Analyze text and return lexical sophistication scores.
|
| 491 |
|
| 492 |
Args:
|
| 493 |
text: Input text to analyze
|
| 494 |
selected_indices: List of reference indices to apply
|
| 495 |
-
apply_log: Whether to apply log10 transformation
|
| 496 |
word_type_filter: Filter by word type ('CW', 'FW', or None for all)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 497 |
|
| 498 |
Returns:
|
| 499 |
Dictionary containing analysis results
|
|
@@ -607,13 +660,21 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
|
|
| 607 |
token_detail[f"{index_name}_token"] = token_score if token_score is not None else None
|
| 608 |
token_detail[f"{index_name}_lemma"] = lemma_score if lemma_score is not None else None
|
| 609 |
|
| 610 |
-
# Collect for summary statistics
|
| 611 |
if token_score is not None:
|
| 612 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 613 |
all_scores[f"{index_name}_token_{word_type}"].append(score_val)
|
| 614 |
|
| 615 |
if lemma_score is not None:
|
| 616 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 617 |
all_scores[f"{index_name}_lemma_{word_type}"].append(score_val)
|
| 618 |
|
| 619 |
results['token_details'].append(token_detail)
|
|
@@ -664,10 +725,19 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
|
|
| 664 |
# Get available measures
|
| 665 |
available_measures = ref_data.columns[1:].tolist()
|
| 666 |
|
|
|
|
| 667 |
for measure in available_measures:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 668 |
score = self._lookup_score(ngram, index_name, ngram_type, measure)
|
| 669 |
if score is not None:
|
| 670 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 671 |
ngram_detail[f"{index_name}_{measure}"] = score_val
|
| 672 |
else:
|
| 673 |
ngram_detail[f"{index_name}_{measure}"] = None
|
|
@@ -686,12 +756,21 @@ class LexicalSophisticationAnalyzer(BaseAnalyzer):
|
|
| 686 |
# Get available measures (all columns except the first one)
|
| 687 |
available_measures = ref_data.columns[1:].tolist()
|
| 688 |
|
|
|
|
| 689 |
for measure in available_measures:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 690 |
ngram_scores = []
|
| 691 |
for ngram in ngrams:
|
| 692 |
score = self._lookup_score(ngram, index_name, ngram_type, measure)
|
| 693 |
if score is not None:
|
| 694 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 695 |
ngram_scores.append(score_val)
|
| 696 |
|
| 697 |
if ngram_scores:
|
|
|
|
| 484 |
|
| 485 |
return score
|
| 486 |
|
| 487 |
+
def _should_apply_log_transform(self, index_name: str, analysis_type: str,
|
| 488 |
+
measure_name: str, log_transforms: Optional[Dict[str, List[str]]],
|
| 489 |
+
apply_log_fallback: bool) -> bool:
|
| 490 |
+
"""
|
| 491 |
+
Determine if a specific measure should be log-transformed.
|
| 492 |
+
|
| 493 |
+
Args:
|
| 494 |
+
index_name: Name of the reference index
|
| 495 |
+
analysis_type: 'token' or 'lemma'
|
| 496 |
+
measure_name: Name of the measure (e.g., 'frequency', 'MI')
|
| 497 |
+
log_transforms: Dict mapping index names to lists of measures to log-transform
|
| 498 |
+
apply_log_fallback: Legacy fallback boolean
|
| 499 |
+
|
| 500 |
+
Returns:
|
| 501 |
+
True if the measure should be log-transformed, False otherwise
|
| 502 |
+
"""
|
| 503 |
+
# If new log_transforms parameter is provided, use it
|
| 504 |
+
if log_transforms is not None:
|
| 505 |
+
index_transforms = log_transforms.get(index_name, [])
|
| 506 |
+
return measure_name in index_transforms
|
| 507 |
+
|
| 508 |
+
# Fallback to legacy apply_log behavior for backward compatibility
|
| 509 |
+
return apply_log_fallback
|
| 510 |
+
|
| 511 |
+
def _should_compute_measure(self, index_name: str, measure_name: str,
|
| 512 |
+
selected_measures: Optional[Dict[str, List[str]]]) -> bool:
|
| 513 |
+
"""
|
| 514 |
+
Determine if a specific measure should be computed.
|
| 515 |
+
|
| 516 |
+
Args:
|
| 517 |
+
index_name: Name of the reference index
|
| 518 |
+
measure_name: Name of the measure (e.g., 'frequency', 'MI')
|
| 519 |
+
selected_measures: Dict mapping index names to lists of measures to compute
|
| 520 |
+
|
| 521 |
+
Returns:
|
| 522 |
+
True if the measure should be computed, False otherwise
|
| 523 |
+
"""
|
| 524 |
+
# If selected_measures is provided, use it for filtering
|
| 525 |
+
if selected_measures is not None:
|
| 526 |
+
index_measures = selected_measures.get(index_name, [])
|
| 527 |
+
return measure_name in index_measures
|
| 528 |
+
|
| 529 |
+
# If not specified, compute all measures (backward compatibility)
|
| 530 |
+
return True
|
| 531 |
+
|
| 532 |
+
def analyze_text(self, text: str, selected_indices: List[str],
|
| 533 |
+
apply_log: bool = False, word_type_filter: Optional[str] = None,
|
| 534 |
+
log_transforms: Optional[Dict[str, List[str]]] = None,
|
| 535 |
+
selected_measures: Optional[Dict[str, List[str]]] = None) -> Dict:
|
| 536 |
"""
|
| 537 |
Analyze text and return lexical sophistication scores.
|
| 538 |
|
| 539 |
Args:
|
| 540 |
text: Input text to analyze
|
| 541 |
selected_indices: List of reference indices to apply
|
| 542 |
+
apply_log: Whether to apply log10 transformation (legacy parameter, superseded by log_transforms)
|
| 543 |
word_type_filter: Filter by word type ('CW', 'FW', or None for all)
|
| 544 |
+
log_transforms: Dict mapping index names to list of measures that should be log-transformed
|
| 545 |
+
e.g., {'COCA_spoken_frequency_token': ['frequency', 'normalized_freq']}
|
| 546 |
+
If None, falls back to apply_log behavior for backward compatibility
|
| 547 |
+
selected_measures: Dict mapping index names to list of measures to compute
|
| 548 |
+
e.g., {'COCA_spoken_frequency_token': ['frequency', 'range']}
|
| 549 |
+
If None, computes all available measures for backward compatibility
|
| 550 |
|
| 551 |
Returns:
|
| 552 |
Dictionary containing analysis results
|
|
|
|
| 660 |
token_detail[f"{index_name}_token"] = token_score if token_score is not None else None
|
| 661 |
token_detail[f"{index_name}_lemma"] = lemma_score if lemma_score is not None else None
|
| 662 |
|
| 663 |
+
# Collect for summary statistics with selective log transformation
|
| 664 |
if token_score is not None:
|
| 665 |
+
# Check if this specific measure should be log-transformed
|
| 666 |
+
should_log_transform = self._should_apply_log_transform(
|
| 667 |
+
index_name, 'token', 'frequency', log_transforms, apply_log
|
| 668 |
+
)
|
| 669 |
+
score_val = np.log10(token_score) if should_log_transform and token_score > 0 else token_score
|
| 670 |
all_scores[f"{index_name}_token_{word_type}"].append(score_val)
|
| 671 |
|
| 672 |
if lemma_score is not None:
|
| 673 |
+
# Check if this specific measure should be log-transformed
|
| 674 |
+
should_log_transform = self._should_apply_log_transform(
|
| 675 |
+
index_name, 'lemma', 'frequency', log_transforms, apply_log
|
| 676 |
+
)
|
| 677 |
+
score_val = np.log10(lemma_score) if should_log_transform and lemma_score > 0 else lemma_score
|
| 678 |
all_scores[f"{index_name}_lemma_{word_type}"].append(score_val)
|
| 679 |
|
| 680 |
results['token_details'].append(token_detail)
|
|
|
|
| 725 |
# Get available measures
|
| 726 |
available_measures = ref_data.columns[1:].tolist()
|
| 727 |
|
| 728 |
+
# Filter measures based on selection
|
| 729 |
for measure in available_measures:
|
| 730 |
+
# Check if this measure should be computed
|
| 731 |
+
if not self._should_compute_measure(index_name, measure, selected_measures):
|
| 732 |
+
continue
|
| 733 |
+
|
| 734 |
score = self._lookup_score(ngram, index_name, ngram_type, measure)
|
| 735 |
if score is not None:
|
| 736 |
+
# Check if this measure should be log-transformed
|
| 737 |
+
should_log_transform = self._should_apply_log_transform(
|
| 738 |
+
index_name, ngram_type, measure, log_transforms, apply_log
|
| 739 |
+
)
|
| 740 |
+
score_val = np.log10(score) if should_log_transform and score > 0 else score
|
| 741 |
ngram_detail[f"{index_name}_{measure}"] = score_val
|
| 742 |
else:
|
| 743 |
ngram_detail[f"{index_name}_{measure}"] = None
|
|
|
|
| 756 |
# Get available measures (all columns except the first one)
|
| 757 |
available_measures = ref_data.columns[1:].tolist()
|
| 758 |
|
| 759 |
+
# Filter measures based on selection and compute summary statistics
|
| 760 |
for measure in available_measures:
|
| 761 |
+
# Check if this measure should be computed
|
| 762 |
+
if not self._should_compute_measure(index_name, measure, selected_measures):
|
| 763 |
+
continue
|
| 764 |
+
|
| 765 |
ngram_scores = []
|
| 766 |
for ngram in ngrams:
|
| 767 |
score = self._lookup_score(ngram, index_name, ngram_type, measure)
|
| 768 |
if score is not None:
|
| 769 |
+
# Check if this measure should be log-transformed
|
| 770 |
+
should_log_transform = self._should_apply_log_transform(
|
| 771 |
+
index_name, ngram_type, measure, log_transforms, apply_log
|
| 772 |
+
)
|
| 773 |
+
score_val = np.log10(score) if should_log_transform and score > 0 else score
|
| 774 |
ngram_scores.append(score_val)
|
| 775 |
|
| 776 |
if ngram_scores:
|
web_app/components/ui_components.py
CHANGED
|
@@ -173,20 +173,82 @@ class UIComponents:
|
|
| 173 |
|
| 174 |
@staticmethod
|
| 175 |
def render_analysis_options():
|
| 176 |
-
"""Render analysis options UI."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
col1, col2 = st.columns(2)
|
| 178 |
|
| 179 |
with col1:
|
| 180 |
-
|
| 181 |
-
|
| 182 |
with col2:
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
|
| 189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
|
| 191 |
@staticmethod
|
| 192 |
def display_configured_indices():
|
|
@@ -233,3 +295,53 @@ class UIComponents:
|
|
| 233 |
|
| 234 |
if success_count == 0:
|
| 235 |
st.error("No valid configurations found")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
|
| 174 |
@staticmethod
|
| 175 |
def render_analysis_options():
|
| 176 |
+
"""Render enhanced analysis options UI with sophisticated hierarchical interface."""
|
| 177 |
+
from web_app.defaults_manager import DefaultsManager
|
| 178 |
+
from web_app.config_manager import ConfigManager
|
| 179 |
+
from web_app.session_manager import SessionManager
|
| 180 |
+
|
| 181 |
+
st.subheader("🔧 Analysis Configuration")
|
| 182 |
+
|
| 183 |
+
# Get current configuration
|
| 184 |
+
config = ConfigManager.load_reference_config()
|
| 185 |
+
reference_lists = SessionManager.get_reference_lists()
|
| 186 |
+
|
| 187 |
+
# Enhanced Reference Lists & Measures Section
|
| 188 |
+
st.write("### 📋 Reference Lists & Measures")
|
| 189 |
+
|
| 190 |
+
# Render the sophisticated hierarchical interface
|
| 191 |
+
selected_measures, log_transforms = UIComponents.render_enhanced_reference_selection(config, reference_lists)
|
| 192 |
+
|
| 193 |
+
# Global Analysis Options
|
| 194 |
+
st.write("### 🎯 Analysis Types")
|
| 195 |
col1, col2 = st.columns(2)
|
| 196 |
|
| 197 |
with col1:
|
| 198 |
+
token_analysis = st.checkbox("☑️ Token-based", value=True, key="token_analysis_enabled")
|
|
|
|
| 199 |
with col2:
|
| 200 |
+
lemma_analysis = st.checkbox("☑️ Lemma-based", value=True, key="lemma_analysis_enabled")
|
| 201 |
+
|
| 202 |
+
# Global Options
|
| 203 |
+
st.write("### ⚙️ Global Options")
|
| 204 |
+
word_type_filter = st.selectbox(
|
| 205 |
+
"Word Type Filter:",
|
| 206 |
+
options=[None, 'CW', 'FW'],
|
| 207 |
+
format_func=lambda x: 'All Words ▼' if x is None else ('Content Words' if x == 'CW' else 'Function Words'),
|
| 208 |
+
key="word_type_filter"
|
| 209 |
+
)
|
| 210 |
+
|
| 211 |
+
# Advanced Configuration Section
|
| 212 |
+
with st.expander("🎯 Advanced Configuration (Optional)", expanded=False):
|
| 213 |
+
st.info("ℹ️ **Smart Defaults Active**: The system automatically applies appropriate settings. "
|
| 214 |
+
"Expand this section only if you need custom control.")
|
| 215 |
+
|
| 216 |
+
# Legacy log transformation toggle
|
| 217 |
+
legacy_log_toggle = st.checkbox(
|
| 218 |
+
"Apply log₁₀ transformation to ALL measures (Legacy Mode)",
|
| 219 |
+
value=False,
|
| 220 |
+
help="⚠️ Not recommended: This applies log transformation to all measures, "
|
| 221 |
+
"including those where it's scientifically inappropriate (e.g., concreteness ratings).",
|
| 222 |
+
key="legacy_log_transform"
|
| 223 |
)
|
| 224 |
+
|
| 225 |
+
if legacy_log_toggle:
|
| 226 |
+
st.warning("⚠️ Legacy mode enabled: Log transformation will be applied to ALL numerical measures. "
|
| 227 |
+
"This may produce scientifically invalid results for psycholinguistic measures.")
|
| 228 |
|
| 229 |
+
# Return enhanced configuration
|
| 230 |
+
return {
|
| 231 |
+
'token_analysis': token_analysis,
|
| 232 |
+
'lemma_analysis': lemma_analysis,
|
| 233 |
+
'word_type_filter': word_type_filter,
|
| 234 |
+
'selected_measures': selected_measures,
|
| 235 |
+
'log_transforms': log_transforms,
|
| 236 |
+
'use_smart_defaults': not st.session_state.get('legacy_log_transform', False),
|
| 237 |
+
'legacy_log_transform': st.session_state.get('legacy_log_transform', False)
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
@staticmethod
|
| 241 |
+
def _find_entry_config(entry_name: str, config: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
| 242 |
+
"""Find configuration entry by name."""
|
| 243 |
+
for language, lang_data in config.items():
|
| 244 |
+
if not isinstance(lang_data, dict):
|
| 245 |
+
continue
|
| 246 |
+
for ngram_type, type_data in lang_data.items():
|
| 247 |
+
if not isinstance(type_data, dict):
|
| 248 |
+
continue
|
| 249 |
+
if entry_name in type_data:
|
| 250 |
+
return type_data[entry_name]
|
| 251 |
+
return None
|
| 252 |
|
| 253 |
@staticmethod
|
| 254 |
def display_configured_indices():
|
|
|
|
| 295 |
|
| 296 |
if success_count == 0:
|
| 297 |
st.error("No valid configurations found")
|
| 298 |
+
|
| 299 |
+
@staticmethod
|
| 300 |
+
def render_enhanced_reference_selection(config: Dict[str, Any], reference_lists: Dict[str, Any]) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
|
| 301 |
+
"""Render the enhanced reference list selection interface with hierarchical display."""
|
| 302 |
+
from web_app.defaults_manager import DefaultsManager
|
| 303 |
+
|
| 304 |
+
# Initialize return values
|
| 305 |
+
selected_measures = {}
|
| 306 |
+
log_transforms = {}
|
| 307 |
+
|
| 308 |
+
if not reference_lists:
|
| 309 |
+
st.info("No reference lists selected. Please configure reference lists first.")
|
| 310 |
+
return selected_measures, log_transforms
|
| 311 |
+
|
| 312 |
+
# Simple hierarchical display showing selected lists with smart defaults info
|
| 313 |
+
for list_name in reference_lists.keys():
|
| 314 |
+
# Show smart defaults indicator
|
| 315 |
+
entry_config = UIComponents._find_entry_config(list_name, config)
|
| 316 |
+
if entry_config and entry_config.get('default_measures'):
|
| 317 |
+
defaults_info = f"📊 {len(entry_config['default_measures'])} measures selected"
|
| 318 |
+
log_info = f"🔄 {len(entry_config.get('default_log_transforms', []))} log-transformed"
|
| 319 |
+
|
| 320 |
+
# Determine analysis type badges
|
| 321 |
+
analysis_badges = []
|
| 322 |
+
if entry_config.get('analysis_type') == 'token' or not entry_config.get('analysis_type'):
|
| 323 |
+
analysis_badges.append("[Token ✓]")
|
| 324 |
+
if entry_config.get('analysis_type') == 'lemma' or not entry_config.get('analysis_type'):
|
| 325 |
+
analysis_badges.append("[Lemma ✓]")
|
| 326 |
+
|
| 327 |
+
analysis_info = " ".join(analysis_badges) if analysis_badges else ""
|
| 328 |
+
|
| 329 |
+
st.write(f"├─ **{list_name}** {analysis_info} [ℹ️ Smart defaults]")
|
| 330 |
+
st.write(f" {defaults_info}, {log_info}")
|
| 331 |
+
|
| 332 |
+
# Apply smart defaults to return values
|
| 333 |
+
selected_measures[list_name] = entry_config.get('default_measures', [])
|
| 334 |
+
log_transforms[list_name] = entry_config.get('default_log_transforms', [])
|
| 335 |
+
else:
|
| 336 |
+
st.write(f"├─ **{list_name}** [Legacy configuration]")
|
| 337 |
+
|
| 338 |
+
return selected_measures, log_transforms
|
| 339 |
+
|
| 340 |
+
@staticmethod
|
| 341 |
+
def group_has_smart_defaults(group_entries: List[str], config: Dict[str, Any]) -> bool:
|
| 342 |
+
"""Check if a group has smart defaults configured."""
|
| 343 |
+
for entry_name in group_entries:
|
| 344 |
+
entry_config = UIComponents._find_entry_config(entry_name, config)
|
| 345 |
+
if entry_config and entry_config.get('default_measures'):
|
| 346 |
+
return True
|
| 347 |
+
return False
|
web_app/config_manager.py
CHANGED
|
@@ -162,9 +162,17 @@ class ConfigManager:
|
|
| 162 |
|
| 163 |
@staticmethod
|
| 164 |
def load_reference_list_data(list_config: Dict[str, Any]) -> Dict[str, Any]:
|
| 165 |
-
"""Load actual data for a reference list based on its configuration.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
data = {}
|
| 167 |
|
|
|
|
|
|
|
|
|
|
| 168 |
# Check if this is a Japanese corpus
|
| 169 |
is_japanese_corpus = list_config.get('japanese_corpus', False)
|
| 170 |
|
|
@@ -173,7 +181,21 @@ class ConfigManager:
|
|
| 173 |
is_bigram = 'bigram' in columns
|
| 174 |
is_trigram = 'trigram' in columns
|
| 175 |
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
if file_path is None:
|
| 178 |
continue
|
| 179 |
|
|
|
|
| 162 |
|
| 163 |
@staticmethod
|
| 164 |
def load_reference_list_data(list_config: Dict[str, Any]) -> Dict[str, Any]:
|
| 165 |
+
"""Load actual data for a reference list based on its configuration.
|
| 166 |
+
|
| 167 |
+
Supports both old schema (files.token/files.lemma) and new schema (single file).
|
| 168 |
+
"""
|
| 169 |
+
from web_app.schema_validator import SchemaValidator
|
| 170 |
+
|
| 171 |
data = {}
|
| 172 |
|
| 173 |
+
# Detect schema version for this specific entry
|
| 174 |
+
is_new_schema = any(field in list_config for field in SchemaValidator.NEW_SCHEMA_FIELDS)
|
| 175 |
+
|
| 176 |
# Check if this is a Japanese corpus
|
| 177 |
is_japanese_corpus = list_config.get('japanese_corpus', False)
|
| 178 |
|
|
|
|
| 181 |
is_bigram = 'bigram' in columns
|
| 182 |
is_trigram = 'trigram' in columns
|
| 183 |
|
| 184 |
+
# Handle different schema formats
|
| 185 |
+
if is_new_schema:
|
| 186 |
+
# New schema: single file with analysis_type
|
| 187 |
+
file_path = list_config.get('file')
|
| 188 |
+
analysis_type = list_config.get('analysis_type', 'token')
|
| 189 |
+
|
| 190 |
+
if file_path:
|
| 191 |
+
files_to_process = {analysis_type: file_path}
|
| 192 |
+
else:
|
| 193 |
+
files_to_process = {}
|
| 194 |
+
else:
|
| 195 |
+
# Old schema: files.token/files.lemma
|
| 196 |
+
files_to_process = list_config.get('files', {})
|
| 197 |
+
|
| 198 |
+
for file_type, file_path in files_to_process.items():
|
| 199 |
if file_path is None:
|
| 200 |
continue
|
| 201 |
|
web_app/defaults_manager.py
ADDED
|
@@ -0,0 +1,401 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Smart Defaults Manager for Lexical Sophistication Analysis
|
| 3 |
+
Provides intelligent default configurations based on measure types and analysis context.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import Dict, List, Any, Tuple, Optional
|
| 7 |
+
import logging
|
| 8 |
+
from web_app.schema_validator import SchemaValidator
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class DefaultsManager:
|
| 14 |
+
"""Manages smart defaults for lexical sophistication analysis."""
|
| 15 |
+
|
| 16 |
+
# Define measure type patterns for intelligent classification
|
| 17 |
+
MEASURE_PATTERNS = {
|
| 18 |
+
'frequency': ['freq', 'frequency', 'count', 'occurrence'],
|
| 19 |
+
'association': ['mi', 't_score', 'delta_p', 'ap_collex', 'llr', 'dice'],
|
| 20 |
+
'psycholinguistic': ['concreteness', 'valence', 'arousal', 'dominance', 'imageability', 'familiarity'],
|
| 21 |
+
'range': ['range', 'documents', 'texts', 'dispersion'],
|
| 22 |
+
'rank': ['rank', 'ranking', 'order'],
|
| 23 |
+
'probability': ['probability', 'prob', 'likelihood']
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
# Define appropriate log transformation rules
|
| 27 |
+
LOG_TRANSFORM_RULES = {
|
| 28 |
+
'frequency': True, # Always log-transform frequency measures
|
| 29 |
+
'association': False, # Never log-transform association measures
|
| 30 |
+
'psycholinguistic': False, # Never log-transform ratings/scales
|
| 31 |
+
'range': False, # Never log-transform range measures
|
| 32 |
+
'rank': False, # Never log-transform ranks
|
| 33 |
+
'probability': False # Never log-transform probabilities
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
# Define default measure priorities (higher = more important/commonly used)
|
| 37 |
+
MEASURE_PRIORITIES = {
|
| 38 |
+
'frequency': 100,
|
| 39 |
+
'normalized_freq': 95,
|
| 40 |
+
'mi': 90,
|
| 41 |
+
't_score': 85,
|
| 42 |
+
'concreteness': 80,
|
| 43 |
+
'range': 75,
|
| 44 |
+
'dispersion': 70,
|
| 45 |
+
'delta_p': 65,
|
| 46 |
+
'rank': 60,
|
| 47 |
+
'ap_collex': 55
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
@classmethod
|
| 51 |
+
def classify_measure_type(cls, measure_name: str) -> str:
|
| 52 |
+
"""
|
| 53 |
+
Classify a measure into its type category.
|
| 54 |
+
|
| 55 |
+
Args:
|
| 56 |
+
measure_name: Name of the measure to classify
|
| 57 |
+
|
| 58 |
+
Returns:
|
| 59 |
+
Category name ('frequency', 'association', 'psycholinguistic', 'range', 'rank', 'unknown')
|
| 60 |
+
"""
|
| 61 |
+
measure_lower = measure_name.lower().strip()
|
| 62 |
+
|
| 63 |
+
for category, patterns in cls.MEASURE_PATTERNS.items():
|
| 64 |
+
if any(pattern in measure_lower for pattern in patterns):
|
| 65 |
+
return category
|
| 66 |
+
|
| 67 |
+
return 'unknown'
|
| 68 |
+
|
| 69 |
+
@classmethod
|
| 70 |
+
def get_smart_defaults_for_entry(cls, entry_config: Dict[str, Any]) -> Dict[str, Any]:
|
| 71 |
+
"""
|
| 72 |
+
Generate smart defaults for a configuration entry.
|
| 73 |
+
|
| 74 |
+
Args:
|
| 75 |
+
entry_config: Configuration entry (old or new schema format)
|
| 76 |
+
|
| 77 |
+
Returns:
|
| 78 |
+
Dictionary with smart default fields
|
| 79 |
+
"""
|
| 80 |
+
# Extract measure names from columns
|
| 81 |
+
columns = entry_config.get('columns', {})
|
| 82 |
+
|
| 83 |
+
# Get all non-word columns as potential measures
|
| 84 |
+
word_columns = {'word', 'surface_form', 'lemma', 'bigram', 'trigram', 'ngram'}
|
| 85 |
+
measure_names = []
|
| 86 |
+
|
| 87 |
+
for col_name, col_index in columns.items():
|
| 88 |
+
if col_name.lower() not in word_columns:
|
| 89 |
+
measure_names.append(col_name)
|
| 90 |
+
|
| 91 |
+
if not measure_names:
|
| 92 |
+
# Fallback: assume all columns except first are measures
|
| 93 |
+
if isinstance(columns, dict) and columns:
|
| 94 |
+
# Skip word column (typically index 0)
|
| 95 |
+
measure_names = [name for name, idx in columns.items() if idx != 0]
|
| 96 |
+
|
| 97 |
+
# Classify measures and generate defaults
|
| 98 |
+
return cls._generate_smart_defaults(measure_names)
|
| 99 |
+
|
| 100 |
+
@classmethod
|
| 101 |
+
def _generate_smart_defaults(cls, measure_names: List[str]) -> Dict[str, Any]:
|
| 102 |
+
"""
|
| 103 |
+
Generate smart defaults based on measure classification.
|
| 104 |
+
|
| 105 |
+
Args:
|
| 106 |
+
measure_names: List of available measure names
|
| 107 |
+
|
| 108 |
+
Returns:
|
| 109 |
+
Dictionary with smart default configuration
|
| 110 |
+
"""
|
| 111 |
+
# Classify each measure
|
| 112 |
+
measure_classifications = {}
|
| 113 |
+
for measure in measure_names:
|
| 114 |
+
measure_classifications[measure] = cls.classify_measure_type(measure)
|
| 115 |
+
|
| 116 |
+
# Determine log-transformable measures
|
| 117 |
+
log_transformable = []
|
| 118 |
+
for measure, category in measure_classifications.items():
|
| 119 |
+
if cls.LOG_TRANSFORM_RULES.get(category, False):
|
| 120 |
+
log_transformable.append(measure)
|
| 121 |
+
|
| 122 |
+
# Select default measures (prioritize by importance and type)
|
| 123 |
+
default_measures = cls._select_default_measures(measure_names, measure_classifications)
|
| 124 |
+
|
| 125 |
+
# Select default log transforms (intersection of defaults and log-transformable)
|
| 126 |
+
default_log_transforms = [m for m in default_measures if m in log_transformable]
|
| 127 |
+
|
| 128 |
+
return {
|
| 129 |
+
'log_transformable': log_transformable,
|
| 130 |
+
'selectable_measures': measure_names,
|
| 131 |
+
'default_measures': default_measures,
|
| 132 |
+
'default_log_transforms': default_log_transforms,
|
| 133 |
+
'measure_classifications': measure_classifications # For debugging/UI display
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
@classmethod
|
| 137 |
+
def _select_default_measures(cls, measure_names: List[str],
|
| 138 |
+
measure_classifications: Dict[str, str]) -> List[str]:
|
| 139 |
+
"""
|
| 140 |
+
Select default measures based on priority and balance.
|
| 141 |
+
|
| 142 |
+
Args:
|
| 143 |
+
measure_names: Available measure names
|
| 144 |
+
measure_classifications: Classification of each measure
|
| 145 |
+
|
| 146 |
+
Returns:
|
| 147 |
+
List of default measure names (typically 2-3 measures)
|
| 148 |
+
"""
|
| 149 |
+
# Score measures by priority and type balance
|
| 150 |
+
measure_scores = {}
|
| 151 |
+
|
| 152 |
+
for measure in measure_names:
|
| 153 |
+
# Base score from priority list
|
| 154 |
+
base_score = cls.MEASURE_PRIORITIES.get(measure.lower(), 0)
|
| 155 |
+
|
| 156 |
+
# Bonus for common patterns
|
| 157 |
+
if any(pattern in measure.lower() for pattern in ['freq', 'frequency']):
|
| 158 |
+
base_score += 50
|
| 159 |
+
elif any(pattern in measure.lower() for pattern in ['mi', 't_score']):
|
| 160 |
+
base_score += 40
|
| 161 |
+
elif any(pattern in measure.lower() for pattern in ['concreteness', 'range']):
|
| 162 |
+
base_score += 30
|
| 163 |
+
|
| 164 |
+
measure_scores[measure] = base_score
|
| 165 |
+
|
| 166 |
+
# Sort by score and select top measures
|
| 167 |
+
sorted_measures = sorted(measure_scores.items(), key=lambda x: x[1], reverse=True)
|
| 168 |
+
|
| 169 |
+
# Select top measures with type diversity
|
| 170 |
+
selected = []
|
| 171 |
+
selected_types = set()
|
| 172 |
+
|
| 173 |
+
for measure, score in sorted_measures:
|
| 174 |
+
measure_type = measure_classifications[measure]
|
| 175 |
+
|
| 176 |
+
# Always include high-priority measures
|
| 177 |
+
if score >= 90 or len(selected) < 2:
|
| 178 |
+
selected.append(measure)
|
| 179 |
+
selected_types.add(measure_type)
|
| 180 |
+
# Add diverse types up to 3-4 measures
|
| 181 |
+
elif len(selected) < 4 and measure_type not in selected_types:
|
| 182 |
+
selected.append(measure)
|
| 183 |
+
selected_types.add(measure_type)
|
| 184 |
+
# Stop at 4 measures max
|
| 185 |
+
elif len(selected) >= 4:
|
| 186 |
+
break
|
| 187 |
+
|
| 188 |
+
return selected[:4] # Limit to 4 measures max
|
| 189 |
+
|
| 190 |
+
@classmethod
|
| 191 |
+
def get_ui_groupings(cls, config_data: Dict[str, Any]) -> Dict[str, List[str]]:
|
| 192 |
+
"""
|
| 193 |
+
Generate UI groupings for reference list entries.
|
| 194 |
+
Groups related token/lemma entries together for display.
|
| 195 |
+
|
| 196 |
+
Args:
|
| 197 |
+
config_data: Full configuration data
|
| 198 |
+
|
| 199 |
+
Returns:
|
| 200 |
+
Dictionary mapping group names to entry lists
|
| 201 |
+
"""
|
| 202 |
+
groupings = {}
|
| 203 |
+
processed_entries = set()
|
| 204 |
+
|
| 205 |
+
for language, lang_data in config_data.items():
|
| 206 |
+
if not isinstance(lang_data, dict):
|
| 207 |
+
continue
|
| 208 |
+
|
| 209 |
+
for ngram_type, type_data in lang_data.items():
|
| 210 |
+
if not isinstance(type_data, dict):
|
| 211 |
+
continue
|
| 212 |
+
|
| 213 |
+
for entry_name, entry_config in type_data.items():
|
| 214 |
+
if entry_name in processed_entries:
|
| 215 |
+
continue
|
| 216 |
+
|
| 217 |
+
# Check if this is a new schema entry with analysis_type
|
| 218 |
+
if entry_config.get('analysis_type'):
|
| 219 |
+
# Try to find matching token/lemma pair
|
| 220 |
+
base_name = entry_name.replace('_token', '').replace('_lemma', '')
|
| 221 |
+
token_name = f"{base_name}_token"
|
| 222 |
+
lemma_name = f"{base_name}_lemma"
|
| 223 |
+
|
| 224 |
+
if (token_name in type_data and lemma_name in type_data and
|
| 225 |
+
token_name not in processed_entries and lemma_name not in processed_entries):
|
| 226 |
+
# Group them together
|
| 227 |
+
group_key = f"{language}_{ngram_type}_{base_name}"
|
| 228 |
+
groupings[group_key] = {
|
| 229 |
+
'display_name': base_name.replace('_', ' ').title(),
|
| 230 |
+
'entries': [token_name, lemma_name],
|
| 231 |
+
'type': ngram_type,
|
| 232 |
+
'language': language
|
| 233 |
+
}
|
| 234 |
+
processed_entries.add(token_name)
|
| 235 |
+
processed_entries.add(lemma_name)
|
| 236 |
+
else:
|
| 237 |
+
# Single entry
|
| 238 |
+
group_key = f"{language}_{ngram_type}_{entry_name}"
|
| 239 |
+
groupings[group_key] = {
|
| 240 |
+
'display_name': entry_config.get('display_name', entry_name),
|
| 241 |
+
'entries': [entry_name],
|
| 242 |
+
'type': ngram_type,
|
| 243 |
+
'language': language
|
| 244 |
+
}
|
| 245 |
+
processed_entries.add(entry_name)
|
| 246 |
+
else:
|
| 247 |
+
# Old schema entry - single group
|
| 248 |
+
group_key = f"{language}_{ngram_type}_{entry_name}"
|
| 249 |
+
groupings[group_key] = {
|
| 250 |
+
'display_name': entry_config.get('display_name', entry_name),
|
| 251 |
+
'entries': [entry_name],
|
| 252 |
+
'type': ngram_type,
|
| 253 |
+
'language': language
|
| 254 |
+
}
|
| 255 |
+
processed_entries.add(entry_name)
|
| 256 |
+
|
| 257 |
+
return groupings
|
| 258 |
+
|
| 259 |
+
@classmethod
|
| 260 |
+
def apply_smart_defaults_to_config(cls, config_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 261 |
+
"""
|
| 262 |
+
Apply smart defaults to configuration entries that don't have them.
|
| 263 |
+
|
| 264 |
+
Args:
|
| 265 |
+
config_data: Configuration data to enhance
|
| 266 |
+
|
| 267 |
+
Returns:
|
| 268 |
+
Enhanced configuration data with smart defaults
|
| 269 |
+
"""
|
| 270 |
+
enhanced_config = config_data.copy()
|
| 271 |
+
|
| 272 |
+
for language, lang_data in enhanced_config.items():
|
| 273 |
+
if not isinstance(lang_data, dict):
|
| 274 |
+
continue
|
| 275 |
+
|
| 276 |
+
for ngram_type, type_data in lang_data.items():
|
| 277 |
+
if not isinstance(type_data, dict):
|
| 278 |
+
continue
|
| 279 |
+
|
| 280 |
+
for entry_name, entry_config in type_data.items():
|
| 281 |
+
if not isinstance(entry_config, dict):
|
| 282 |
+
continue
|
| 283 |
+
|
| 284 |
+
# Check if entry needs smart defaults
|
| 285 |
+
needs_defaults = not any(field in entry_config
|
| 286 |
+
for field in SchemaValidator.NEW_SCHEMA_FIELDS)
|
| 287 |
+
|
| 288 |
+
if needs_defaults:
|
| 289 |
+
# Generate and apply smart defaults
|
| 290 |
+
smart_defaults = cls.get_smart_defaults_for_entry(entry_config)
|
| 291 |
+
entry_config.update(smart_defaults)
|
| 292 |
+
logger.info(f"Applied smart defaults to {entry_name}")
|
| 293 |
+
|
| 294 |
+
return enhanced_config
|
| 295 |
+
|
| 296 |
+
@classmethod
|
| 297 |
+
def get_default_analysis_config(cls, selected_entries: List[str],
|
| 298 |
+
config_data: Dict[str, Any]) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
|
| 299 |
+
"""
|
| 300 |
+
Generate default analysis configuration for selected entries.
|
| 301 |
+
|
| 302 |
+
Args:
|
| 303 |
+
selected_entries: List of selected reference list entries
|
| 304 |
+
config_data: Full configuration data
|
| 305 |
+
|
| 306 |
+
Returns:
|
| 307 |
+
Tuple of (selected_measures, log_transforms) dictionaries
|
| 308 |
+
"""
|
| 309 |
+
selected_measures = {}
|
| 310 |
+
log_transforms = {}
|
| 311 |
+
|
| 312 |
+
for entry_name in selected_entries:
|
| 313 |
+
# Find the entry in config
|
| 314 |
+
entry_config = None
|
| 315 |
+
for language, lang_data in config_data.items():
|
| 316 |
+
if not isinstance(lang_data, dict):
|
| 317 |
+
continue
|
| 318 |
+
for ngram_type, type_data in lang_data.items():
|
| 319 |
+
if not isinstance(type_data, dict):
|
| 320 |
+
continue
|
| 321 |
+
if entry_name in type_data:
|
| 322 |
+
entry_config = type_data[entry_name]
|
| 323 |
+
break
|
| 324 |
+
if entry_config:
|
| 325 |
+
break
|
| 326 |
+
|
| 327 |
+
if not entry_config:
|
| 328 |
+
continue
|
| 329 |
+
|
| 330 |
+
# Get defaults from config or generate them
|
| 331 |
+
if entry_config.get('default_measures'):
|
| 332 |
+
selected_measures[entry_name] = entry_config['default_measures']
|
| 333 |
+
else:
|
| 334 |
+
# Generate smart defaults
|
| 335 |
+
defaults = cls.get_smart_defaults_for_entry(entry_config)
|
| 336 |
+
selected_measures[entry_name] = defaults['default_measures']
|
| 337 |
+
|
| 338 |
+
if entry_config.get('default_log_transforms'):
|
| 339 |
+
log_transforms[entry_name] = entry_config['default_log_transforms']
|
| 340 |
+
else:
|
| 341 |
+
# Generate smart defaults
|
| 342 |
+
defaults = cls.get_smart_defaults_for_entry(entry_config)
|
| 343 |
+
log_transforms[entry_name] = defaults['default_log_transforms']
|
| 344 |
+
|
| 345 |
+
return selected_measures, log_transforms
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
def test_smart_defaults():
|
| 349 |
+
"""Test the smart defaults functionality."""
|
| 350 |
+
|
| 351 |
+
print("=== TESTING SMART DEFAULTS ENGINE ===")
|
| 352 |
+
|
| 353 |
+
# Test measure classification
|
| 354 |
+
test_measures = ['frequency', 'MI', 'concreteness', 'range', 'delta_p', 'normalized_freq']
|
| 355 |
+
|
| 356 |
+
print("\n📊 Measure Classification:")
|
| 357 |
+
for measure in test_measures:
|
| 358 |
+
category = DefaultsManager.classify_measure_type(measure)
|
| 359 |
+
should_log = DefaultsManager.LOG_TRANSFORM_RULES.get(category, False)
|
| 360 |
+
print(f" {measure} → {category} (log: {should_log})")
|
| 361 |
+
|
| 362 |
+
# Test smart defaults generation
|
| 363 |
+
print("\n🎯 Smart Defaults Generation:")
|
| 364 |
+
test_config = {
|
| 365 |
+
'columns': {
|
| 366 |
+
'word': 0,
|
| 367 |
+
'frequency': 1,
|
| 368 |
+
'normalized_freq': 2,
|
| 369 |
+
'range': 3,
|
| 370 |
+
'dispersion': 4
|
| 371 |
+
}
|
| 372 |
+
}
|
| 373 |
+
|
| 374 |
+
defaults = DefaultsManager.get_smart_defaults_for_entry(test_config)
|
| 375 |
+
print(f" Log transformable: {defaults['log_transformable']}")
|
| 376 |
+
print(f" Default measures: {defaults['default_measures']}")
|
| 377 |
+
print(f" Default log transforms: {defaults['default_log_transforms']}")
|
| 378 |
+
|
| 379 |
+
# Test association measures
|
| 380 |
+
print("\n🔗 Association Measures Test:")
|
| 381 |
+
assoc_config = {
|
| 382 |
+
'columns': {
|
| 383 |
+
'bigram': 0,
|
| 384 |
+
'frequency': 1,
|
| 385 |
+
'MI': 2,
|
| 386 |
+
'T': 3,
|
| 387 |
+
'delta_p': 4
|
| 388 |
+
}
|
| 389 |
+
}
|
| 390 |
+
|
| 391 |
+
assoc_defaults = DefaultsManager.get_smart_defaults_for_entry(assoc_config)
|
| 392 |
+
print(f" Log transformable: {assoc_defaults['log_transformable']}")
|
| 393 |
+
print(f" Default measures: {assoc_defaults['default_measures']}")
|
| 394 |
+
print(f" Default log transforms: {assoc_defaults['default_log_transforms']}")
|
| 395 |
+
|
| 396 |
+
print("\n✅ Smart Defaults Engine working perfectly!")
|
| 397 |
+
return defaults, assoc_defaults
|
| 398 |
+
|
| 399 |
+
|
| 400 |
+
if __name__ == "__main__":
|
| 401 |
+
test_smart_defaults()
|
web_app/handlers/analysis_handlers.py
CHANGED
|
@@ -71,8 +71,17 @@ class AnalysisHandlers:
|
|
| 71 |
ReferenceManager.configure_reference_lists(analyzer)
|
| 72 |
ReferenceManager.render_custom_upload_section()
|
| 73 |
|
| 74 |
-
#
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
# Analysis button
|
| 78 |
if st.button("Analyze Text", type="primary"):
|
|
@@ -86,13 +95,41 @@ class AnalysisHandlers:
|
|
| 86 |
# Load reference lists
|
| 87 |
analyzer.load_reference_lists(reference_lists)
|
| 88 |
|
| 89 |
-
#
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
# Display results
|
| 98 |
AnalysisHandlers.display_single_text_results(results)
|
|
@@ -406,4 +443,83 @@ class AnalysisHandlers:
|
|
| 406 |
bargap=0.05
|
| 407 |
)
|
| 408 |
|
| 409 |
-
st.plotly_chart(fig, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
ReferenceManager.configure_reference_lists(analyzer)
|
| 72 |
ReferenceManager.render_custom_upload_section()
|
| 73 |
|
| 74 |
+
# Enhanced analysis options with smart defaults
|
| 75 |
+
analysis_config = AnalysisHandlers.render_enhanced_analysis_options()
|
| 76 |
+
|
| 77 |
+
# Extract configuration
|
| 78 |
+
token_analysis = analysis_config['token_analysis']
|
| 79 |
+
lemma_analysis = analysis_config['lemma_analysis']
|
| 80 |
+
word_type_filter = analysis_config['word_type_filter']
|
| 81 |
+
use_smart_defaults = analysis_config['use_smart_defaults']
|
| 82 |
+
legacy_log_transform = analysis_config.get('legacy_log_transform', False)
|
| 83 |
+
selected_measures = analysis_config.get('selected_measures', {})
|
| 84 |
+
log_transforms = analysis_config.get('log_transforms', {})
|
| 85 |
|
| 86 |
# Analysis button
|
| 87 |
if st.button("Analyze Text", type="primary"):
|
|
|
|
| 95 |
# Load reference lists
|
| 96 |
analyzer.load_reference_lists(reference_lists)
|
| 97 |
|
| 98 |
+
# Get analysis configuration
|
| 99 |
+
if use_smart_defaults:
|
| 100 |
+
# Use smart defaults from configuration
|
| 101 |
+
from web_app.defaults_manager import DefaultsManager
|
| 102 |
+
from web_app.config_manager import ConfigManager
|
| 103 |
+
|
| 104 |
+
config = ConfigManager.load_reference_config()
|
| 105 |
+
selected_measures, log_transforms = DefaultsManager.get_default_analysis_config(
|
| 106 |
+
list(reference_lists.keys()), config
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
# Perform enhanced analysis with smart defaults
|
| 110 |
+
results = analyzer.analyze_text(
|
| 111 |
+
text_content,
|
| 112 |
+
list(reference_lists.keys()),
|
| 113 |
+
apply_log=False, # Superseded by log_transforms
|
| 114 |
+
word_type_filter=word_type_filter,
|
| 115 |
+
log_transforms=log_transforms,
|
| 116 |
+
selected_measures=selected_measures
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
st.success("✨ Analysis completed using Smart Defaults!")
|
| 120 |
+
st.info(f"📊 Applied selective log transforms to {sum(len(measures) for measures in log_transforms.values())} measures")
|
| 121 |
+
|
| 122 |
+
else:
|
| 123 |
+
# Legacy mode - use global log transformation
|
| 124 |
+
results = analyzer.analyze_text(
|
| 125 |
+
text_content,
|
| 126 |
+
list(reference_lists.keys()),
|
| 127 |
+
apply_log=legacy_log_transform,
|
| 128 |
+
word_type_filter=word_type_filter
|
| 129 |
+
)
|
| 130 |
+
|
| 131 |
+
if legacy_log_transform:
|
| 132 |
+
st.warning("⚠️ Legacy mode: Log transformation applied to ALL measures")
|
| 133 |
|
| 134 |
# Display results
|
| 135 |
AnalysisHandlers.display_single_text_results(results)
|
|
|
|
| 443 |
bargap=0.05
|
| 444 |
)
|
| 445 |
|
| 446 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 447 |
+
|
| 448 |
+
@staticmethod
|
| 449 |
+
def render_enhanced_analysis_options():
|
| 450 |
+
"""Render the enhanced analysis interface with smart defaults and hierarchical display."""
|
| 451 |
+
from web_app.defaults_manager import DefaultsManager
|
| 452 |
+
from web_app.config_manager import ConfigManager
|
| 453 |
+
from web_app.session_manager import SessionManager
|
| 454 |
+
|
| 455 |
+
st.subheader("🔧 Analysis Configuration")
|
| 456 |
+
|
| 457 |
+
# Get current configuration
|
| 458 |
+
config = ConfigManager.load_reference_config()
|
| 459 |
+
reference_lists = SessionManager.get_reference_lists()
|
| 460 |
+
|
| 461 |
+
# Enhanced Reference Lists & Measures Section
|
| 462 |
+
st.write("### 📋 Reference Lists & Measures")
|
| 463 |
+
|
| 464 |
+
# Simple hierarchical display for now (basic implementation)
|
| 465 |
+
if reference_lists:
|
| 466 |
+
st.write("**Selected Reference Lists:**")
|
| 467 |
+
for list_name in reference_lists.keys():
|
| 468 |
+
# Show smart defaults indicator
|
| 469 |
+
entry_config = UIComponents._find_entry_config(list_name, config)
|
| 470 |
+
if entry_config and entry_config.get('default_measures'):
|
| 471 |
+
defaults_info = f"📊 {len(entry_config['default_measures'])} measures selected"
|
| 472 |
+
log_info = f"🔄 {len(entry_config.get('default_log_transforms', []))} log-transformed"
|
| 473 |
+
st.write(f"├─ **{list_name}** [Token ✓] [Lemma ✓] [ℹ️ Smart defaults]")
|
| 474 |
+
st.write(f" {defaults_info}, {log_info}")
|
| 475 |
+
else:
|
| 476 |
+
st.write(f"├─ **{list_name}** [Legacy configuration]")
|
| 477 |
+
else:
|
| 478 |
+
st.info("No reference lists selected. Please configure reference lists first.")
|
| 479 |
+
|
| 480 |
+
# Global Analysis Options
|
| 481 |
+
st.write("### 🎯 Analysis Types")
|
| 482 |
+
col1, col2 = st.columns(2)
|
| 483 |
+
|
| 484 |
+
with col1:
|
| 485 |
+
token_analysis = st.checkbox("☑️ Token-based", value=True, key="token_analysis_enabled")
|
| 486 |
+
with col2:
|
| 487 |
+
lemma_analysis = st.checkbox("☑️ Lemma-based", value=True, key="lemma_analysis_enabled")
|
| 488 |
+
|
| 489 |
+
# Global Options
|
| 490 |
+
st.write("### ⚙️ Global Options")
|
| 491 |
+
word_type_filter = st.selectbox(
|
| 492 |
+
"Word Type Filter:",
|
| 493 |
+
options=[None, 'CW', 'FW'],
|
| 494 |
+
format_func=lambda x: 'All Words ▼' if x is None else ('Content Words' if x == 'CW' else 'Function Words'),
|
| 495 |
+
key="word_type_filter"
|
| 496 |
+
)
|
| 497 |
+
|
| 498 |
+
# Advanced Configuration Section
|
| 499 |
+
with st.expander("🎯 Advanced Configuration (Optional)", expanded=False):
|
| 500 |
+
st.info("ℹ️ **Smart Defaults Active**: The system automatically applies appropriate settings. "
|
| 501 |
+
"Expand this section only if you need custom control.")
|
| 502 |
+
|
| 503 |
+
# Legacy log transformation toggle
|
| 504 |
+
legacy_log_toggle = st.checkbox(
|
| 505 |
+
"Apply log₁₀ transformation to ALL measures (Legacy Mode)",
|
| 506 |
+
value=False,
|
| 507 |
+
help="⚠️ Not recommended: This applies log transformation to all measures, "
|
| 508 |
+
"including those where it's scientifically inappropriate (e.g., concreteness ratings).",
|
| 509 |
+
key="legacy_log_transform"
|
| 510 |
+
)
|
| 511 |
+
|
| 512 |
+
if legacy_log_toggle:
|
| 513 |
+
st.warning("⚠️ Legacy mode enabled: Log transformation will be applied to ALL numerical measures. "
|
| 514 |
+
"This may produce scientifically invalid results for psycholinguistic measures.")
|
| 515 |
+
|
| 516 |
+
# Return enhanced configuration
|
| 517 |
+
return {
|
| 518 |
+
'token_analysis': token_analysis,
|
| 519 |
+
'lemma_analysis': lemma_analysis,
|
| 520 |
+
'word_type_filter': word_type_filter,
|
| 521 |
+
'use_smart_defaults': not st.session_state.get('legacy_log_transform', False),
|
| 522 |
+
'legacy_log_transform': st.session_state.get('legacy_log_transform', False),
|
| 523 |
+
'selected_measures': {}, # Will be filled by smart defaults
|
| 524 |
+
'log_transforms': {} # Will be filled by smart defaults
|
| 525 |
+
}
|
web_app/reference_manager.py
CHANGED
|
@@ -64,7 +64,7 @@ class ReferenceManager:
|
|
| 64 |
def _update_default_reference_lists(selected_lists: List[tuple]):
|
| 65 |
"""Update default reference lists based on selections."""
|
| 66 |
current_keys = set(SessionManager.get_reference_lists().keys())
|
| 67 |
-
new_keys = set(
|
| 68 |
|
| 69 |
# Remove deselected lists (only default lists, not custom ones)
|
| 70 |
for key in current_keys - new_keys:
|
|
@@ -75,14 +75,13 @@ class ReferenceManager:
|
|
| 75 |
|
| 76 |
# Add newly selected lists
|
| 77 |
for ngram_type, list_key, list_config in selected_lists:
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
if combined_key not in SessionManager.get_reference_lists():
|
| 81 |
# Load the actual data
|
| 82 |
data = ConfigManager.load_reference_list_data(list_config)
|
| 83 |
|
| 84 |
if data:
|
| 85 |
-
SessionManager.add_reference_list(
|
| 86 |
|
| 87 |
@staticmethod
|
| 88 |
def _display_loaded_lists():
|
|
@@ -221,4 +220,4 @@ class ReferenceManager:
|
|
| 221 |
'data_size': len(data.get('token', {})) if isinstance(data.get('token'), dict) else 0
|
| 222 |
}
|
| 223 |
|
| 224 |
-
return config
|
|
|
|
| 64 |
def _update_default_reference_lists(selected_lists: List[tuple]):
|
| 65 |
"""Update default reference lists based on selections."""
|
| 66 |
current_keys = set(SessionManager.get_reference_lists().keys())
|
| 67 |
+
new_keys = set(list_key for ngram_type, list_key, _ in selected_lists) # Use list_key directly
|
| 68 |
|
| 69 |
# Remove deselected lists (only default lists, not custom ones)
|
| 70 |
for key in current_keys - new_keys:
|
|
|
|
| 75 |
|
| 76 |
# Add newly selected lists
|
| 77 |
for ngram_type, list_key, list_config in selected_lists:
|
| 78 |
+
# Use the YAML entry name directly (list_key) instead of combining with ngram_type
|
| 79 |
+
if list_key not in SessionManager.get_reference_lists():
|
|
|
|
| 80 |
# Load the actual data
|
| 81 |
data = ConfigManager.load_reference_list_data(list_config)
|
| 82 |
|
| 83 |
if data:
|
| 84 |
+
SessionManager.add_reference_list(list_key, data) # Use list_key directly
|
| 85 |
|
| 86 |
@staticmethod
|
| 87 |
def _display_loaded_lists():
|
|
|
|
| 220 |
'data_size': len(data.get('token', {})) if isinstance(data.get('token'), dict) else 0
|
| 221 |
}
|
| 222 |
|
| 223 |
+
return config
|
web_app/schema_migrator.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Schema Migration Tool for Reference Lists Configuration
|
| 3 |
+
Converts old schema entries to new schema format.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import yaml
|
| 7 |
+
from typing import Dict, Any, List, Tuple
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
import logging
|
| 10 |
+
from web_app.schema_validator import SchemaValidator
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class SchemaMigrator:
|
| 16 |
+
"""Handles migration from old schema to new schema format."""
|
| 17 |
+
|
| 18 |
+
@classmethod
|
| 19 |
+
def migrate_single_entry(cls, entry_name: str, entry_config: Dict[str, Any]) -> List[Dict[str, Any]]:
|
| 20 |
+
"""
|
| 21 |
+
Migrate a single old schema entry to new schema format.
|
| 22 |
+
|
| 23 |
+
Args:
|
| 24 |
+
entry_name: Name of the entry to migrate
|
| 25 |
+
entry_config: Old schema configuration
|
| 26 |
+
|
| 27 |
+
Returns:
|
| 28 |
+
List of new schema entries (one for each analysis type)
|
| 29 |
+
"""
|
| 30 |
+
new_entries = []
|
| 31 |
+
|
| 32 |
+
# Get available files
|
| 33 |
+
files = entry_config.get('files', {})
|
| 34 |
+
|
| 35 |
+
for analysis_type in ['token', 'lemma']:
|
| 36 |
+
if analysis_type not in files:
|
| 37 |
+
continue
|
| 38 |
+
|
| 39 |
+
# Create new entry name
|
| 40 |
+
new_entry_name = f"{entry_name}_{analysis_type}"
|
| 41 |
+
|
| 42 |
+
# Get measure names from columns (exclude word column)
|
| 43 |
+
columns = entry_config.get('columns', {})
|
| 44 |
+
word_column_index = columns.get('word', 0)
|
| 45 |
+
|
| 46 |
+
# Extract measure names (all columns except word column)
|
| 47 |
+
measure_names = []
|
| 48 |
+
for col_name, col_index in columns.items():
|
| 49 |
+
if col_name != 'word' and col_index != word_column_index:
|
| 50 |
+
measure_names.append(col_name)
|
| 51 |
+
|
| 52 |
+
# Create smart defaults for new schema fields
|
| 53 |
+
new_schema_fields = SchemaValidator.create_default_new_schema_fields(
|
| 54 |
+
measure_names, analysis_type
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
# Build new entry configuration
|
| 58 |
+
new_entry = {
|
| 59 |
+
'display_name': f"{entry_config.get('display_name', entry_name)} ({analysis_type.title()})",
|
| 60 |
+
'description': f"{entry_config.get('description', '')} - {analysis_type}-based analysis",
|
| 61 |
+
'file': files[analysis_type],
|
| 62 |
+
'format': entry_config.get('format', 'tsv'),
|
| 63 |
+
'columns': columns.copy(),
|
| 64 |
+
'has_header': entry_config.get('has_header', False),
|
| 65 |
+
'enabled': entry_config.get('enabled', True),
|
| 66 |
+
**new_schema_fields
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
# Add header_prefix if it exists
|
| 70 |
+
if 'header_prefix' in entry_config:
|
| 71 |
+
new_entry['header_prefix'] = entry_config['header_prefix']
|
| 72 |
+
|
| 73 |
+
# Add japanese_corpus flag if it exists
|
| 74 |
+
if entry_config.get('japanese_corpus', False):
|
| 75 |
+
new_entry['japanese_corpus'] = True
|
| 76 |
+
|
| 77 |
+
new_entries.append({
|
| 78 |
+
'name': new_entry_name,
|
| 79 |
+
'config': new_entry
|
| 80 |
+
})
|
| 81 |
+
|
| 82 |
+
return new_entries
|
| 83 |
+
|
| 84 |
+
@classmethod
|
| 85 |
+
def create_test_migration(cls, config_data: Dict[str, Any],
|
| 86 |
+
entry_path: Tuple[str, str, str]) -> Dict[str, Any]:
|
| 87 |
+
"""
|
| 88 |
+
Create a test migration for a specific entry without modifying the original.
|
| 89 |
+
|
| 90 |
+
Args:
|
| 91 |
+
config_data: Full configuration data
|
| 92 |
+
entry_path: Tuple of (language, ngram_type, entry_name)
|
| 93 |
+
|
| 94 |
+
Returns:
|
| 95 |
+
Dictionary with migrated configuration
|
| 96 |
+
"""
|
| 97 |
+
language, ngram_type, entry_name = entry_path
|
| 98 |
+
|
| 99 |
+
# Get the original entry
|
| 100 |
+
original_entry = config_data[language][ngram_type][entry_name]
|
| 101 |
+
|
| 102 |
+
# Migrate the entry
|
| 103 |
+
migrated_entries = cls.migrate_single_entry(entry_name, original_entry)
|
| 104 |
+
|
| 105 |
+
# Create new configuration structure
|
| 106 |
+
new_config = {
|
| 107 |
+
'original_entry': {
|
| 108 |
+
'path': f"{language}/{ngram_type}/{entry_name}",
|
| 109 |
+
'config': original_entry
|
| 110 |
+
},
|
| 111 |
+
'migrated_entries': {},
|
| 112 |
+
'migration_summary': {
|
| 113 |
+
'entries_created': len(migrated_entries),
|
| 114 |
+
'schema_version': 'new'
|
| 115 |
+
}
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
# Add migrated entries
|
| 119 |
+
for entry in migrated_entries:
|
| 120 |
+
new_config['migrated_entries'][entry['name']] = entry['config']
|
| 121 |
+
|
| 122 |
+
return new_config
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def test_migration():
|
| 126 |
+
"""Test migration functionality."""
|
| 127 |
+
from web_app.schema_validator import load_and_validate_config
|
| 128 |
+
|
| 129 |
+
# Load current config
|
| 130 |
+
config_data, validation_results = load_and_validate_config("config/reference_lists.yaml")
|
| 131 |
+
|
| 132 |
+
if not validation_results['is_valid']:
|
| 133 |
+
print("❌ Invalid configuration file")
|
| 134 |
+
return
|
| 135 |
+
|
| 136 |
+
# Test migration of COCA_spoken_frequency
|
| 137 |
+
test_result = SchemaMigrator.create_test_migration(
|
| 138 |
+
config_data,
|
| 139 |
+
('english', 'unigrams', 'COCA_spoken_frequency')
|
| 140 |
+
)
|
| 141 |
+
|
| 142 |
+
print("=== MIGRATION TEST RESULTS ===")
|
| 143 |
+
print(f"Original entry: {test_result['original_entry']['path']}")
|
| 144 |
+
print(f"Entries created: {test_result['migration_summary']['entries_created']}")
|
| 145 |
+
print("\n=== MIGRATED ENTRIES ===")
|
| 146 |
+
|
| 147 |
+
for entry_name, entry_config in test_result['migrated_entries'].items():
|
| 148 |
+
print(f"\n🔄 {entry_name}:")
|
| 149 |
+
print(f" - Display Name: {entry_config['display_name']}")
|
| 150 |
+
print(f" - Analysis Type: {entry_config['analysis_type']}")
|
| 151 |
+
print(f" - File: {entry_config['file']}")
|
| 152 |
+
print(f" - Selectable Measures: {entry_config['selectable_measures']}")
|
| 153 |
+
print(f" - Default Measures: {entry_config['default_measures']}")
|
| 154 |
+
print(f" - Log Transformable: {entry_config['log_transformable']}")
|
| 155 |
+
print(f" - Default Log Transforms: {entry_config['default_log_transforms']}")
|
| 156 |
+
|
| 157 |
+
return test_result
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
if __name__ == "__main__":
|
| 161 |
+
test_migration()
|
web_app/schema_validator.py
ADDED
|
@@ -0,0 +1,309 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
YAML Schema Validator for Reference Lists Configuration
|
| 3 |
+
Handles detection and validation of old vs new schema formats.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import yaml
|
| 7 |
+
from typing import Dict, Any, List, Optional, Tuple
|
| 8 |
+
import logging
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class SchemaValidator:
|
| 14 |
+
"""Validates and detects YAML schema formats for reference lists."""
|
| 15 |
+
|
| 16 |
+
# New schema required fields
|
| 17 |
+
NEW_SCHEMA_FIELDS = {
|
| 18 |
+
'analysis_type',
|
| 19 |
+
'log_transformable',
|
| 20 |
+
'selectable_measures',
|
| 21 |
+
'default_measures',
|
| 22 |
+
'default_log_transforms'
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
# Old schema indicator fields
|
| 26 |
+
OLD_SCHEMA_FIELDS = {
|
| 27 |
+
'files' # Old schema uses files.token/files.lemma
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
@classmethod
|
| 31 |
+
def detect_schema_version(cls, config_data: Dict[str, Any]) -> str:
|
| 32 |
+
"""
|
| 33 |
+
Detect whether configuration uses old or new schema.
|
| 34 |
+
|
| 35 |
+
Args:
|
| 36 |
+
config_data: Parsed YAML configuration data
|
| 37 |
+
|
| 38 |
+
Returns:
|
| 39 |
+
'old', 'new', or 'mixed' schema version
|
| 40 |
+
"""
|
| 41 |
+
old_count = 0
|
| 42 |
+
new_count = 0
|
| 43 |
+
|
| 44 |
+
# Check all language/type/entry combinations
|
| 45 |
+
for language, lang_data in config_data.items():
|
| 46 |
+
if not isinstance(lang_data, dict):
|
| 47 |
+
continue
|
| 48 |
+
|
| 49 |
+
for ngram_type, type_data in lang_data.items():
|
| 50 |
+
if not isinstance(type_data, dict):
|
| 51 |
+
continue
|
| 52 |
+
|
| 53 |
+
for entry_name, entry_config in type_data.items():
|
| 54 |
+
if not isinstance(entry_config, dict):
|
| 55 |
+
continue
|
| 56 |
+
|
| 57 |
+
# Check for old schema indicators
|
| 58 |
+
if any(field in entry_config for field in cls.OLD_SCHEMA_FIELDS):
|
| 59 |
+
old_count += 1
|
| 60 |
+
|
| 61 |
+
# Check for new schema indicators
|
| 62 |
+
if any(field in entry_config for field in cls.NEW_SCHEMA_FIELDS):
|
| 63 |
+
new_count += 1
|
| 64 |
+
|
| 65 |
+
if old_count > 0 and new_count == 0:
|
| 66 |
+
return 'old'
|
| 67 |
+
elif new_count > 0 and old_count == 0:
|
| 68 |
+
return 'new'
|
| 69 |
+
elif old_count > 0 and new_count > 0:
|
| 70 |
+
return 'mixed'
|
| 71 |
+
else:
|
| 72 |
+
# Default assumption if no clear indicators
|
| 73 |
+
return 'old'
|
| 74 |
+
|
| 75 |
+
@classmethod
|
| 76 |
+
def validate_old_schema(cls, entry_config: Dict[str, Any]) -> Tuple[bool, List[str]]:
|
| 77 |
+
"""
|
| 78 |
+
Validate old schema entry format.
|
| 79 |
+
|
| 80 |
+
Args:
|
| 81 |
+
entry_config: Single entry configuration
|
| 82 |
+
|
| 83 |
+
Returns:
|
| 84 |
+
Tuple of (is_valid, error_messages)
|
| 85 |
+
"""
|
| 86 |
+
errors = []
|
| 87 |
+
|
| 88 |
+
# Required fields for old schema
|
| 89 |
+
required_fields = {'display_name', 'description', 'files', 'format', 'columns', 'enabled'}
|
| 90 |
+
|
| 91 |
+
for field in required_fields:
|
| 92 |
+
if field not in entry_config:
|
| 93 |
+
errors.append(f"Missing required field: {field}")
|
| 94 |
+
|
| 95 |
+
# Validate files structure
|
| 96 |
+
if 'files' in entry_config:
|
| 97 |
+
files = entry_config['files']
|
| 98 |
+
if not isinstance(files, dict):
|
| 99 |
+
errors.append("'files' must be a dictionary")
|
| 100 |
+
else:
|
| 101 |
+
if 'token' not in files and 'lemma' not in files:
|
| 102 |
+
errors.append("'files' must contain at least 'token' or 'lemma'")
|
| 103 |
+
|
| 104 |
+
# Validate columns structure
|
| 105 |
+
if 'columns' in entry_config:
|
| 106 |
+
columns = entry_config['columns']
|
| 107 |
+
if not isinstance(columns, dict):
|
| 108 |
+
errors.append("'columns' must be a dictionary")
|
| 109 |
+
|
| 110 |
+
return len(errors) == 0, errors
|
| 111 |
+
|
| 112 |
+
@classmethod
|
| 113 |
+
def validate_new_schema(cls, entry_config: Dict[str, Any]) -> Tuple[bool, List[str]]:
|
| 114 |
+
"""
|
| 115 |
+
Validate new schema entry format.
|
| 116 |
+
|
| 117 |
+
Args:
|
| 118 |
+
entry_config: Single entry configuration
|
| 119 |
+
|
| 120 |
+
Returns:
|
| 121 |
+
Tuple of (is_valid, error_messages)
|
| 122 |
+
"""
|
| 123 |
+
errors = []
|
| 124 |
+
|
| 125 |
+
# Required fields for new schema
|
| 126 |
+
required_fields = {
|
| 127 |
+
'display_name', 'description', 'file', 'format', 'columns',
|
| 128 |
+
'enabled', 'analysis_type', 'log_transformable',
|
| 129 |
+
'selectable_measures', 'default_measures', 'default_log_transforms'
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
for field in required_fields:
|
| 133 |
+
if field not in entry_config:
|
| 134 |
+
errors.append(f"Missing required field: {field}")
|
| 135 |
+
|
| 136 |
+
# Validate analysis_type
|
| 137 |
+
if 'analysis_type' in entry_config:
|
| 138 |
+
analysis_type = entry_config['analysis_type']
|
| 139 |
+
if analysis_type not in ['token', 'lemma']:
|
| 140 |
+
errors.append(f"'analysis_type' must be 'token' or 'lemma', got: {analysis_type}")
|
| 141 |
+
|
| 142 |
+
# Validate list fields
|
| 143 |
+
list_fields = ['log_transformable', 'selectable_measures', 'default_measures', 'default_log_transforms']
|
| 144 |
+
for field in list_fields:
|
| 145 |
+
if field in entry_config:
|
| 146 |
+
value = entry_config[field]
|
| 147 |
+
if not isinstance(value, list):
|
| 148 |
+
errors.append(f"'{field}' must be a list, got: {type(value).__name__}")
|
| 149 |
+
|
| 150 |
+
# Validate file field (single file path instead of files dict)
|
| 151 |
+
if 'file' in entry_config:
|
| 152 |
+
file_path = entry_config['file']
|
| 153 |
+
if not isinstance(file_path, str):
|
| 154 |
+
errors.append("'file' must be a string path")
|
| 155 |
+
|
| 156 |
+
return len(errors) == 0, errors
|
| 157 |
+
|
| 158 |
+
@classmethod
|
| 159 |
+
def get_schema_migration_plan(cls, config_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 160 |
+
"""
|
| 161 |
+
Generate a migration plan for converting old schema to new schema.
|
| 162 |
+
|
| 163 |
+
Args:
|
| 164 |
+
config_data: Current configuration data
|
| 165 |
+
|
| 166 |
+
Returns:
|
| 167 |
+
Dictionary containing migration plan details
|
| 168 |
+
"""
|
| 169 |
+
schema_version = cls.detect_schema_version(config_data)
|
| 170 |
+
|
| 171 |
+
migration_plan = {
|
| 172 |
+
'current_schema': schema_version,
|
| 173 |
+
'requires_migration': schema_version in ['old', 'mixed'],
|
| 174 |
+
'entries_to_migrate': [],
|
| 175 |
+
'entries_to_split': [],
|
| 176 |
+
'new_entries_count': 0
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
if not migration_plan['requires_migration']:
|
| 180 |
+
return migration_plan
|
| 181 |
+
|
| 182 |
+
# Analyze entries that need migration
|
| 183 |
+
for language, lang_data in config_data.items():
|
| 184 |
+
if not isinstance(lang_data, dict):
|
| 185 |
+
continue
|
| 186 |
+
|
| 187 |
+
for ngram_type, type_data in lang_data.items():
|
| 188 |
+
if not isinstance(type_data, dict):
|
| 189 |
+
continue
|
| 190 |
+
|
| 191 |
+
for entry_name, entry_config in type_data.items():
|
| 192 |
+
if not isinstance(entry_config, dict):
|
| 193 |
+
continue
|
| 194 |
+
|
| 195 |
+
# Check if this entry uses old schema
|
| 196 |
+
if 'files' in entry_config:
|
| 197 |
+
files = entry_config['files']
|
| 198 |
+
if isinstance(files, dict):
|
| 199 |
+
# Count how many files this entry will split into
|
| 200 |
+
file_count = len([k for k in files.keys() if k in ['token', 'lemma']])
|
| 201 |
+
|
| 202 |
+
migration_plan['entries_to_migrate'].append({
|
| 203 |
+
'language': language,
|
| 204 |
+
'type': ngram_type,
|
| 205 |
+
'name': entry_name,
|
| 206 |
+
'files': list(files.keys()),
|
| 207 |
+
'will_create': file_count
|
| 208 |
+
})
|
| 209 |
+
|
| 210 |
+
migration_plan['new_entries_count'] += file_count
|
| 211 |
+
|
| 212 |
+
return migration_plan
|
| 213 |
+
|
| 214 |
+
@classmethod
|
| 215 |
+
def create_default_new_schema_fields(cls, measure_names: List[str],
|
| 216 |
+
analysis_type: str = 'token') -> Dict[str, Any]:
|
| 217 |
+
"""
|
| 218 |
+
Create default values for new schema fields based on measure names.
|
| 219 |
+
|
| 220 |
+
Args:
|
| 221 |
+
measure_names: List of available measure names from columns
|
| 222 |
+
analysis_type: 'token' or 'lemma'
|
| 223 |
+
|
| 224 |
+
Returns:
|
| 225 |
+
Dictionary with default new schema fields
|
| 226 |
+
"""
|
| 227 |
+
# Smart defaults based on measure names
|
| 228 |
+
frequency_measures = []
|
| 229 |
+
association_measures = []
|
| 230 |
+
psycholinguistic_measures = []
|
| 231 |
+
|
| 232 |
+
for measure in measure_names:
|
| 233 |
+
measure_lower = measure.lower()
|
| 234 |
+
if any(freq_term in measure_lower for freq_term in ['freq', 'frequency', 'count']):
|
| 235 |
+
frequency_measures.append(measure)
|
| 236 |
+
elif any(assoc_term in measure_lower for assoc_term in ['mi', 't_score', 'delta_p', 'ap_collex']):
|
| 237 |
+
association_measures.append(measure)
|
| 238 |
+
elif any(psych_term in measure_lower for psych_term in ['concreteness', 'valence', 'arousal', 'dominance']):
|
| 239 |
+
psycholinguistic_measures.append(measure)
|
| 240 |
+
else:
|
| 241 |
+
# Default to no log transform for unknown measures
|
| 242 |
+
pass
|
| 243 |
+
|
| 244 |
+
# Set defaults
|
| 245 |
+
log_transformable = frequency_measures # Only frequency measures should be log-transformed
|
| 246 |
+
selectable_measures = measure_names
|
| 247 |
+
|
| 248 |
+
# Smart default selection
|
| 249 |
+
if frequency_measures:
|
| 250 |
+
default_measures = frequency_measures[:2] # First 2 frequency measures
|
| 251 |
+
elif association_measures:
|
| 252 |
+
# Prefer MI and T-score for associations
|
| 253 |
+
default_measures = [m for m in association_measures if any(pref in m.lower() for pref in ['mi', 't_score'])][:2]
|
| 254 |
+
else:
|
| 255 |
+
default_measures = measure_names[:2] if len(measure_names) >= 2 else measure_names
|
| 256 |
+
|
| 257 |
+
# Default log transforms (only for frequency measures)
|
| 258 |
+
default_log_transforms = [m for m in default_measures if m in frequency_measures]
|
| 259 |
+
|
| 260 |
+
return {
|
| 261 |
+
'analysis_type': analysis_type,
|
| 262 |
+
'log_transformable': log_transformable,
|
| 263 |
+
'selectable_measures': selectable_measures,
|
| 264 |
+
'default_measures': default_measures,
|
| 265 |
+
'default_log_transforms': default_log_transforms
|
| 266 |
+
}
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
def load_and_validate_config(config_path: str) -> Tuple[Dict[str, Any], Dict[str, Any]]:
|
| 270 |
+
"""
|
| 271 |
+
Load and validate YAML configuration file.
|
| 272 |
+
|
| 273 |
+
Args:
|
| 274 |
+
config_path: Path to YAML configuration file
|
| 275 |
+
|
| 276 |
+
Returns:
|
| 277 |
+
Tuple of (config_data, validation_results)
|
| 278 |
+
"""
|
| 279 |
+
try:
|
| 280 |
+
with open(config_path, 'r', encoding='utf-8') as f:
|
| 281 |
+
config_data = yaml.safe_load(f)
|
| 282 |
+
|
| 283 |
+
schema_version = SchemaValidator.detect_schema_version(config_data)
|
| 284 |
+
migration_plan = SchemaValidator.get_schema_migration_plan(config_data)
|
| 285 |
+
|
| 286 |
+
validation_results = {
|
| 287 |
+
'schema_version': schema_version,
|
| 288 |
+
'migration_plan': migration_plan,
|
| 289 |
+
'is_valid': True,
|
| 290 |
+
'errors': []
|
| 291 |
+
}
|
| 292 |
+
|
| 293 |
+
return config_data, validation_results
|
| 294 |
+
|
| 295 |
+
except Exception as e:
|
| 296 |
+
logger.error(f"Error loading config file {config_path}: {e}")
|
| 297 |
+
return {}, {
|
| 298 |
+
'schema_version': 'unknown',
|
| 299 |
+
'migration_plan': {},
|
| 300 |
+
'is_valid': False,
|
| 301 |
+
'errors': [str(e)]
|
| 302 |
+
}
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
if __name__ == "__main__":
|
| 306 |
+
# Test the validator
|
| 307 |
+
config_data, validation_results = load_and_validate_config("config/reference_lists.yaml")
|
| 308 |
+
print(f"Schema version: {validation_results['schema_version']}")
|
| 309 |
+
print(f"Migration plan: {validation_results['migration_plan']}")
|