Japanese language support
- .gitignore +388 -0
- README.md +238 -5
- config/reference_lists.yaml +97 -1
- japanese-nlp-test.ipynb +819 -0
- pyproject.toml +4 -0
- resources/reference_lists/ja/BCCWJ_frequencylist_luw2_ver1_1.tsv +3 -0
- resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1 copy.tsv +3 -0
- resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv +3 -0
- resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv +3 -0
- test/test_app.py +8 -6
- test/test_functionality.py +6 -4
- test/test_multi_index.py +2 -3
- test/test_yaml_config.py +2 -3
- test_frequency_flexible.py +1 -0
- test_fugashi_diagnostic.py +134 -0
- test_japanese_integration.py +135 -0
- test_unidic_diagnostic.py +201 -0
- text_analyzer/__pycache__/__init__.cpython-312.pyc +0 -0
- text_analyzer/__pycache__/lexical_sophistication.cpython-312.pyc +0 -0
- text_analyzer/__pycache__/pos_parser.cpython-312.pyc +0 -0
- text_analyzer/app_config.py +183 -0
- text_analyzer/base_analyzer.py +308 -0
- text_analyzer/frequency_analyzer.py +653 -0
- text_analyzer/lexical_sophistication.py +245 -62
- text_analyzer/pos_parser.py +11 -36
- text_analyzer/text_utility.py +289 -0
- text_analyzer/unidic_enricher.py +256 -0
- text_analyzer/unidic_extensions.py +25 -0
- uv.lock +420 -6
- web_app/__pycache__/analysis_handlers.cpython-312.pyc +0 -0
- web_app/__pycache__/app.cpython-312.pyc +0 -0
- web_app/__pycache__/comparison_functions.cpython-312.pyc +0 -0
- web_app/__pycache__/config_manager.cpython-312.pyc +0 -0
- web_app/__pycache__/pos_handlers.cpython-312.pyc +0 -0
- web_app/__pycache__/reference_manager.cpython-312.pyc +0 -0
- web_app/__pycache__/session_manager.cpython-312.pyc +0 -0
- web_app/__pycache__/ui_components.cpython-312.pyc +0 -0
- web_app/app.py +15 -3
- web_app/components/__pycache__/__init__.cpython-312.pyc +0 -0
- web_app/components/__pycache__/comparison_functions.cpython-312.pyc +0 -0
- web_app/components/__pycache__/ui_components.cpython-312.pyc +0 -0
- web_app/components/comparison_functions.py +2 -1
- web_app/components/ui_components.py +2 -2
- web_app/config_manager.py +110 -3
- web_app/handlers/__pycache__/__init__.cpython-312.pyc +0 -0
- web_app/handlers/__pycache__/analysis_handlers.cpython-312.pyc +0 -0
- web_app/handlers/__pycache__/pos_handlers.cpython-312.pyc +0 -0
- web_app/handlers/frequency_handlers.py +635 -0
.gitignore
ADDED
@@ -0,0 +1,388 @@
```text
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
Pipfile.lock

# poetry
poetry.lock

# pdm
.pdm.toml
.pdm-python

# PEP 582
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
.idea/

# VS Code
.vscode/
*.code-workspace

# Local History for Visual Studio Code
.history/

# macOS
.DS_Store
.AppleDouble
.LSOverride

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

# Windows
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db

# Dump file
*.stackdump

# Folder config file
[Dd]esktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp

# Windows shortcuts
*.lnk

# Linux
*~

# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*

# KDE directory preferences
.directory

# Linux trash folder which might appear on any partition or disk
.Trash-*

# .nfs files are created when an open file is removed but is still being accessed
.nfs*

# Editor backups
*.bak
*.swp
*.swo
*~

# Logs
logs/
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# Dependency directories
node_modules/
jspm_packages/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Optional stylelint cache
.stylelintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local

# parcel-bundler cache
.cache
.parcel-cache

# Next.js build output
.next
out

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
.cache/
public

# vuepress build output
.vuepress/dist

# vuepress v2.x temp and cache directory
.temp
.cache

# Docusaurus cache and generated files
.docusaurus

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# TernJS port file
.tern-port

# Stores VSCode versions used for testing VSCode extensions
.vscode-test

# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*

# Temporary files
tmp/
temp/
*.tmp
*.temp

# Database files
*.db
*.sqlite
*.sqlite3

# Secret files
secrets/
*.key
*.pem
*.cert
*.crt

# Config files with sensitive data
config.local.js
config.local.json
settings.local.json

# Build artifacts
bin/
obj/

# Package files
*.jar
*.war
*.nar
*.ear
*.zip
*.tar.gz
*.rar

# Virtual machine crash logs
hs_err_pid*

# Core dumps
core.*

# Compiled source
*.com
*.class
*.dll
*.exe
*.o
*.out

# Ignore all dotfiles except .gitignore
.*
!.gitignore
!.gitkeep
!.github/
!.gitlab-ci.yml
!.travis.yml
!.editorconfig
!.prettierrc
!.eslintrc*
!.stylelintrc*
!.babelrc*
```
README.md
CHANGED
@@ -7,14 +7,247 @@ sdk: docker
 app_port: 8501
 tags:
 - streamlit
+- nlp
+- linguistics
+- japanese
+- corpus-linguistics
 pinned: false
-short_description:
+short_description: Advanced lexical sophistication analyzer for English and Japanese texts
 license: cc-by-nc-4.0
 ---

The previously near-empty README body (a bare "#" heading and blank lines) is replaced with the following content:
# Simple Text Analyzer

A comprehensive web-based application for lexical sophistication analysis supporting both English and Japanese languages. This tool provides detailed linguistic analysis using corpus-based frequency data and advanced NLP techniques.

## 🌟 Features

### Multi-Language Support
- **English**: COCA corpus frequency analysis with unigrams, bigrams, and trigrams
- **Japanese**: BCCWJ (written) and CSJ (spoken) corpus integration with POS-aware frequency matching

### Analysis Capabilities
- **Lexical Sophistication**: Frequency-based lexical complexity analysis
- **Part-of-Speech Analysis**: Detailed POS tagging and classification
- **N-gram Analysis**: Bigram and trigram frequency analysis
- **Content vs Function Words**: Automatic classification and separate analysis
- **Batch Processing**: Multiple file analysis with comparative results

### Japanese Language Features ✨ **NEW**
- **BCCWJ Integration**: Balanced Corpus of Contemporary Written Japanese
  - Raw frequency counts
  - Normalized frequency (per million words)
  - Frequency rankings
- **CSJ Integration**: Corpus of Spontaneous Japanese (spoken data)
  - Academic and conversational speech patterns
  - Multiple speech style analysis
- **POS-Aware Matching**: Composite key lookup using `lemma + POS` for accurate frequency matching
- **Robust Fallback System**: Three-tier lookup strategy (sketched after this list):
  1. Primary: `lemma_pos` composite key (e.g., "行く_動詞-自立")
  2. Fallback 1: `lemma` only lookup
  3. Fallback 2: `surface_form` lookup

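A minimal sketch of this three-tier strategy (an illustration only, not the analyzer's actual implementation; the `frequency_lookup` function and dictionary layout are hypothetical):

```python
# Sketch of the three-tier fallback lookup described above.
# `lookup` maps lemma_pos composite keys, bare lemmas, and surface forms to
# frequency values; the real analyzer's data structures may differ.
from typing import Dict, Optional


def frequency_lookup(lookup: Dict[str, float], lemma: str, pos: str,
                     surface: str) -> Optional[float]:
    """Try lemma_pos, then lemma, then surface form; None if uncovered."""
    for key in (f"{lemma}_{pos}", lemma, surface):
        value = lookup.get(key)
        if value is not None:
            return value
    return None


# Example with a toy frequency value: "行きます" analyzed as lemma 行く, POS 動詞
print(frequency_lookup({"行く_動詞": 123.0}, "行く", "動詞", "行き"))  # 123.0
```
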
## 🚀 Quick Start

### Prerequisites
- Python 3.8+
- uv (recommended) or pip for package management

### Installation

```bash
# Clone the repository
git clone https://github.com/your-repo/simple-text-analyzer.git
cd simple-text-analyzer

# Install dependencies using uv
uv sync

# Or using pip
pip install -r requirements.txt

# Install required SpaCy models
python -m spacy download en_core_web_trf
python -m spacy download ja_core_news_md  # For Japanese support
```

### Running the Application

```bash
# Using uv
uv run streamlit run web_app/app.py

# Or directly
streamlit run web_app/app.py
```

## 📊 Supported Corpora

### English
- **COCA Spoken**: Corpus of Contemporary American English (spoken subcorpus)
- **COCA Magazine**: Magazine text frequency data
- **Bigram/Trigram Analysis**: Multi-word expression frequency and association measures

### Japanese
- **BCCWJ (Balanced Corpus of Contemporary Written Japanese)**
  - 182,604 unique word forms with POS tags
  - Multiple text registers (books, newspapers, magazines, etc.)
  - Comprehensive written language coverage

- **CSJ (Corpus of Spontaneous Japanese)**
  - 41,892 unique word forms from spoken data
  - Academic presentations and casual conversations
  - Natural speech pattern analysis

## 🔧 Architecture

### Core Components
- **LexicalSophisticationAnalyzer**: Main analysis engine with multi-language support
- **ConfigManager**: Flexible configuration system for corpus integration
- **ReferenceManager**: Dynamic reference list management
- **SessionManager**: State management for web interface

### Japanese Integration Features
- **Composite Key Matching**: Precision matching using lemma and POS combinations
- **Extensible Design**: Easy addition of new subcorpora via YAML configuration
- **Fallback Mechanisms**: Robust lookup strategies for maximum coverage
- **Performance Optimized**: Pre-computed lookup dictionaries for fast analysis (see the sketch below)

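To illustrate what such pre-computed dictionaries could look like, here is a small sketch that builds composite-key, lemma-only, and surface-form tables from parsed reference rows; the row format, function name, and toy values are placeholders, not the project's actual loader:

```python
# Sketch only: pre-computing lookup dictionaries so each token lookup is O(1).
# Each row is assumed to be (surface_form, lemma, pos, frequency).
def build_lookups(rows):
    by_lemma_pos, by_lemma, by_surface = {}, {}, {}
    for surface, lemma, pos, freq in rows:
        by_lemma_pos.setdefault(f"{lemma}_{pos}", freq)  # primary composite key
        by_lemma.setdefault(lemma, freq)                 # fallback 1
        by_surface.setdefault(surface, freq)             # fallback 2
    return by_lemma_pos, by_lemma, by_surface


lemma_pos, lemma_only, surface = build_lookups([
    ("イッ", "行く", "動詞", 123.0),     # toy frequency values
    ("ガッコー", "学校", "名詞", 456.0),
])
print(lemma_pos["行く_動詞"])  # 123.0
```
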
## 📁 File Structure

```
simple-text-analyzer/
├── web_app/                       # Streamlit web application
│   ├── app.py                     # Main application entry
│   ├── config_manager.py          # Configuration management
│   ├── reference_manager.py       # Reference list handling
│   └── components/                # UI components
├── text_analyzer/                 # Core analysis modules
│   ├── lexical_sophistication.py  # Main analyzer
│   ├── frequency_analyzer.py      # Frequency analysis
│   └── pos_parser.py              # POS tagging utilities
├── config/                        # Configuration files
│   └── reference_lists.yaml       # Corpus configurations
├── resources/                     # Corpus data files
│   └── reference_lists/
│       ├── en/                    # English corpus files
│       └── ja/                    # Japanese corpus files
└── test/                          # Test modules
```

## 🧪 Testing

Test the Japanese integration:

```bash
uv run python test_japanese_integration.py
```

Expected output:
- ✅ SpaCy model loading
- ✅ Reference data loading (182K+ BCCWJ entries, 41K+ CSJ entries)
- ✅ Composite key lookup functionality
- ✅ Fallback mechanism verification
- ✅ Complete text analysis pipeline

## 📈 Usage Examples

### Japanese Text Analysis
```python
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer

# Initialize Japanese analyzer
analyzer = LexicalSophisticationAnalyzer(language="ja", model_size="md")

# Load Japanese corpus references
selected_indices = ["BCCWJ_frequency", "CSJ_frequency"]

# Analyze Japanese text
results = analyzer.analyze_text(
    "私は毎日学校に行きます。",
    selected_indices
)

# Access frequency scores
for token in results['token_details']:
    print(f"{token['token']}: BCCWJ={token.get('BCCWJ_frequency_lemma', 'NA')}")
```

### English Text Analysis
```python
# Initialize English analyzer
analyzer = LexicalSophisticationAnalyzer(language="en", model_size="trf")

# Analyze with COCA frequency data
results = analyzer.analyze_text(
    "The students studied linguistics carefully.",
    ["COCA_spoken_frequency"]
)
```

## 🔧 Configuration

### Adding New Japanese Subcorpora

The system is designed for easy expansion. To add a new subcorpus (e.g., BCCWJ Books):

```yaml
# config/reference_lists.yaml
japanese:
  unigrams:
    BCCWJ_books_frequency:
      display_name: "BCCWJ Books - Frequency"
      description: "BCCWJ books subcorpus frequency data"
      files:
        token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
        lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
      format: "tsv"
      has_header: true
      enabled: true
      japanese_corpus: true
      columns:
        surface_form: 1   # lForm column
        lemma: 2          # lemma column
        pos: 3            # pos column
        frequency: 10     # PB_frequency column (books subcorpus)
```

No code changes required - the system automatically detects and integrates new configurations!

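One way such automatic detection could work is sketched below; this is a hypothetical reading of `config/reference_lists.yaml` and not necessarily how `ConfigManager` implements it:

```python
# Hypothetical sketch: discover enabled Japanese subcorpora from the YAML config.
import yaml

with open("config/reference_lists.yaml", encoding="utf-8") as fh:
    config = yaml.safe_load(fh)

# Any entry under japanese -> unigrams with enabled: true is picked up.
enabled = {
    name: entry
    for name, entry in config.get("japanese", {}).get("unigrams", {}).items()
    if entry.get("enabled")
}
print(sorted(enabled))
```
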
## 📚 Research Applications

This tool is ideal for:
- **Language Learning Research**: Analyzing text complexity for Japanese learners
- **Corpus Linguistics**: Cross-linguistic frequency analysis
- **Computational Linguistics**: Lexical sophistication measurement
- **Educational Assessment**: Text difficulty evaluation
- **Translation Studies**: Comparative lexical analysis

## 🤝 Contributing

1. Fork the repository
2. Create a feature branch (`git checkout -b feature/amazing-feature`)
3. Commit your changes (`git commit -m 'Add amazing feature'`)
4. Push to the branch (`git push origin feature/amazing-feature`)
5. Open a Pull Request

## 📄 License

This project is licensed under the CC BY-NC 4.0 License - see the [LICENSE](LICENSE) file for details.

## 🙏 Acknowledgments

- **BCCWJ**: National Institute for Japanese Language and Linguistics
- **CSJ**: National Institute for Japanese Language and Linguistics
- **COCA**: Mark Davies, Brigham Young University
- **SpaCy**: Explosion AI for robust NLP models

## 📞 Support

For questions, issues, or contributions:
- Open an issue on GitHub
- Contact: [Your contact information]

---

**Happy analyzing!** 🚀📊
config/reference_lists.yaml
CHANGED
@@ -137,6 +137,102 @@ english:

 japanese:
   unigrams:
+    BCCWJ_frequency:
+      display_name: "BCCWJ Written - Frequency"
+      description: "BCCWJ raw frequency counts for written Japanese"
+      files:
+        token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
+        lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
+      format: "tsv"
+      has_header: true
+      enabled: true
+      japanese_corpus: true
+      columns:
+        surface_form: 1   # lForm
+        lemma: 2          # lemma
+        pos: 3            # pos
+        frequency: 6      # primary measure column
+
+    BCCWJ_pmw:
+      display_name: "BCCWJ Written - Per Million Words"
+      description: "BCCWJ normalized frequency for written Japanese"
+      files:
+        token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
+        lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
+      format: "tsv"
+      has_header: true
+      enabled: true
+      japanese_corpus: true
+      columns:
+        surface_form: 1
+        lemma: 2
+        pos: 3
+        frequency: 7      # pmw column
+
+    BCCWJ_rank:
+      display_name: "BCCWJ Written - Frequency Rank"
+      description: "BCCWJ frequency ranking for written Japanese"
+      files:
+        token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
+        lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
+      format: "tsv"
+      has_header: true
+      enabled: true
+      japanese_corpus: true
+      columns:
+        surface_form: 1
+        lemma: 2
+        pos: 3
+        frequency: 0      # rank column
+
+    CSJ_frequency:
+      display_name: "CSJ Spoken - Frequency"
+      description: "CSJ raw frequency counts for spoken Japanese"
+      files:
+        token: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
+        lemma: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
+      format: "tsv"
+      has_header: true
+      enabled: true
+      japanese_corpus: true
+      columns:
+        surface_form: 1
+        lemma: 2
+        pos: 3
+        frequency: 6
+
+    CSJ_pmw:
+      display_name: "CSJ Spoken - Per Million Words"
+      description: "CSJ normalized frequency for spoken Japanese"
+      files:
+        token: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
+        lemma: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
+      format: "tsv"
+      has_header: true
+      enabled: true
+      japanese_corpus: true
+      columns:
+        surface_form: 1
+        lemma: 2
+        pos: 3
+        frequency: 7
+
+    CSJ_rank:
+      display_name: "CSJ Spoken - Frequency Rank"
+      description: "CSJ frequency ranking for spoken Japanese"
+      files:
+        token: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
+        lemma: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
+      format: "tsv"
+      has_header: true
+      enabled: true
+      japanese_corpus: true
+      columns:
+        surface_form: 1
+        lemma: 2
+        pos: 3
+        frequency: 0
+
     jp_frequency:
       display_name: "Japanese Frequency List"
       description: "Frequency data for Japanese words"

@@ -151,4 +247,4 @@ japanese:
       enabled: false  # Disabled until files exist

   # bigrams: {}
-  # trigrams: {}
+  # trigrams: {}
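A small sketch of how one of these column mappings might be applied when loading a reference TSV (indices taken from the config above, which appear to be 0-based; the actual loader in `text_analyzer/frequency_analyzer.py` may differ):

```python
# Sketch: read a frequency TSV using a "columns" mapping like the config above
# and build a composite-key table for the analyzer's lookups.
import csv

columns = {"surface_form": 1, "lemma": 2, "pos": 3, "frequency": 6}

def read_reference(path, columns, has_header=True):
    entries = {}
    with open(path, encoding="utf-8") as fh:
        reader = csv.reader(fh, delimiter="\t")
        if has_header:
            next(reader)
        for row in reader:
            try:
                freq = float(row[columns["frequency"]])
            except (ValueError, IndexError):
                continue  # skip malformed rows
            lemma, pos = row[columns["lemma"]], row[columns["pos"]]
            entries[f"{lemma}_{pos}"] = freq
    return entries
```
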
japanese-nlp-test.ipynb
ADDED
@@ -0,0 +1,819 @@
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# Japanese NLP Analysis: Comparative Study of UniDic-based Approaches\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"This notebook implements and compares two approaches for Japanese morphological analysis with BCCWJ frequency matching:\n",
|
| 10 |
+
"\n",
|
| 11 |
+
"- **Plan A**: MeCab (fugashi) + UniDic direct pipeline\n",
|
| 12 |
+
"- **Plan B**: GiNZA (Sudachi) + UniDic alignment pipeline\n",
|
| 13 |
+
"\n",
|
| 14 |
+
"Each approach is designed for reproducible setup, implementation, validation, and operational use."
|
| 15 |
+
]
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "markdown",
|
| 19 |
+
"metadata": {},
|
| 20 |
+
"source": [
|
| 21 |
+
"## 1. Environment Setup & Verification\n",
|
| 22 |
+
"\n",
|
| 23 |
+
"First, let's verify and set up our environment with all required packages."
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"cell_type": "code",
|
| 28 |
+
"execution_count": 10,
|
| 29 |
+
"metadata": {},
|
| 30 |
+
"outputs": [
|
| 31 |
+
{
|
| 32 |
+
"name": "stdout",
|
| 33 |
+
"output_type": "stream",
|
| 34 |
+
"text": [
|
| 35 |
+
"Python version: 3.12.2 (main, Feb 25 2024, 03:55:42) [Clang 17.0.6 ]\n",
|
| 36 |
+
"Working directory: /Users/eguchi/Dropbox/teaching/Tohoku-2025/linguistic-data-analysis-I/2025/notebooks\n",
|
| 37 |
+
"\n",
|
| 38 |
+
"Checking package availability:\n",
|
| 39 |
+
"✓ fugashi\n",
|
| 40 |
+
"✓ unidic\n",
|
| 41 |
+
"✗ unidic-lite - NOT FOUND\n",
|
| 42 |
+
"✓ spacy\n",
|
| 43 |
+
"✓ ginza\n",
|
| 44 |
+
"✗ ja-ginza - NOT FOUND\n",
|
| 45 |
+
"✓ sudachipy\n",
|
| 46 |
+
"✓ pandas\n",
|
| 47 |
+
"✓ numpy\n",
|
| 48 |
+
"✓ matplotlib\n",
|
| 49 |
+
"✓ collections (built-in)\n"
|
| 50 |
+
]
|
| 51 |
+
}
|
| 52 |
+
],
|
| 53 |
+
"source": [
|
| 54 |
+
"# Environment verification and setup\n",
|
| 55 |
+
"import sys\n",
|
| 56 |
+
"import subprocess\n",
|
| 57 |
+
"from pathlib import Path\n",
|
| 58 |
+
"\n",
|
| 59 |
+
"print(f\"Python version: {sys.version}\")\n",
|
| 60 |
+
"print(f\"Working directory: {Path.cwd()}\")\n",
|
| 61 |
+
"\n",
|
| 62 |
+
"# Required packages\n",
|
| 63 |
+
"required_packages = [\n",
|
| 64 |
+
" 'fugashi', 'unidic', 'unidic-lite', 'spacy', 'ginza', \n",
|
| 65 |
+
" 'ja-ginza', 'sudachipy', 'pandas', 'numpy', 'matplotlib', 'collections'\n",
|
| 66 |
+
"]\n",
|
| 67 |
+
"\n",
|
| 68 |
+
"print(\"\\nChecking package availability:\")\n",
|
| 69 |
+
"for package in required_packages:\n",
|
| 70 |
+
" try:\n",
|
| 71 |
+
" if package == 'collections':\n",
|
| 72 |
+
" import collections\n",
|
| 73 |
+
" print(f\"✓ {package} (built-in)\")\n",
|
| 74 |
+
" else:\n",
|
| 75 |
+
" __import__(package)\n",
|
| 76 |
+
" print(f\"✓ {package}\")\n",
|
| 77 |
+
" except ImportError:\n",
|
| 78 |
+
" print(f\"✗ {package} - NOT FOUND\")"
|
| 79 |
+
]
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"cell_type": "code",
|
| 83 |
+
"execution_count": 11,
|
| 84 |
+
"metadata": {},
|
| 85 |
+
"outputs": [
|
| 86 |
+
{
|
| 87 |
+
"name": "stdout",
|
| 88 |
+
"output_type": "stream",
|
| 89 |
+
"text": [
|
| 90 |
+
"scipy not available - will use numpy for correlation\n",
|
| 91 |
+
"All imports successful!\n"
|
| 92 |
+
]
|
| 93 |
+
}
|
| 94 |
+
],
|
| 95 |
+
"source": [
|
| 96 |
+
"# Import all necessary libraries\n",
|
| 97 |
+
"import pandas as pd\n",
|
| 98 |
+
"import numpy as np\n",
|
| 99 |
+
"import matplotlib.pyplot as plt\n",
|
| 100 |
+
"from collections import Counter, defaultdict\n",
|
| 101 |
+
"import time\n",
|
| 102 |
+
"import warnings\n",
|
| 103 |
+
"from typing import List, Tuple, Dict, Optional\n",
|
| 104 |
+
"\n",
|
| 105 |
+
"# Japanese NLP libraries\n",
|
| 106 |
+
"import fugashi\n",
|
| 107 |
+
"import unidic\n",
|
| 108 |
+
"import spacy\n",
|
| 109 |
+
"from spacy.tokens import Token, Doc\n",
|
| 110 |
+
"\n",
|
| 111 |
+
"# Statistical analysis\n",
|
| 112 |
+
"try:\n",
|
| 113 |
+
" from scipy.stats import spearmanr\n",
|
| 114 |
+
" scipy_available = True\n",
|
| 115 |
+
"except ImportError:\n",
|
| 116 |
+
" print(\"scipy not available - will use numpy for correlation\")\n",
|
| 117 |
+
" scipy_available = False\n",
|
| 118 |
+
"\n",
|
| 119 |
+
"print(\"All imports successful!\")\n",
|
| 120 |
+
"warnings.filterwarnings('ignore')"
|
| 121 |
+
]
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"cell_type": "code",
|
| 125 |
+
"execution_count": 12,
|
| 126 |
+
"metadata": {},
|
| 127 |
+
"outputs": [
|
| 128 |
+
{
|
| 129 |
+
"name": "stdout",
|
| 130 |
+
"output_type": "stream",
|
| 131 |
+
"text": [
|
| 132 |
+
"UniDic directory: /Users/eguchi/Dropbox/teaching/Tohoku-2025/linguistic-data-analysis-I/.venv/lib/python3.12/site-packages/unidic/dicdir\n",
|
| 133 |
+
"UniDic is properly installed\n",
|
| 134 |
+
"Fugashi + UniDic test successful: テスト\n"
|
| 135 |
+
]
|
| 136 |
+
}
|
| 137 |
+
],
|
| 138 |
+
"source": [
|
| 139 |
+
"# Check UniDic installation and download if needed\n",
|
| 140 |
+
"try:\n",
|
| 141 |
+
" print(f\"UniDic directory: {unidic.DICDIR}\")\n",
|
| 142 |
+
" print(\"UniDic is properly installed\")\n",
|
| 143 |
+
"except Exception as e:\n",
|
| 144 |
+
" print(f\"UniDic issue: {e}\")\n",
|
| 145 |
+
" print(\"You may need to run: python -m unidic download\")\n",
|
| 146 |
+
"\n",
|
| 147 |
+
"# Test basic fugashi functionality\n",
|
| 148 |
+
"try:\n",
|
| 149 |
+
" tagger = fugashi.Tagger(f'-d \"{unidic.DICDIR}\"')\n",
|
| 150 |
+
" test_result = list(tagger(\"テスト\"))\n",
|
| 151 |
+
" print(f\"Fugashi + UniDic test successful: {test_result[0].surface}\")\n",
|
| 152 |
+
"except Exception as e:\n",
|
| 153 |
+
" print(f\"Fugashi test failed: {e}\")"
|
| 154 |
+
]
|
| 155 |
+
},
|
| 156 |
+
{
|
| 157 |
+
"cell_type": "markdown",
|
| 158 |
+
"metadata": {},
|
| 159 |
+
"source": [
|
| 160 |
+
"## 2. Sample Data Preparation\n",
|
| 161 |
+
"\n",
|
| 162 |
+
"Let's create realistic Japanese text samples for testing our pipelines."
|
| 163 |
+
]
|
| 164 |
+
},
|
| 165 |
+
{
|
| 166 |
+
"cell_type": "code",
|
| 167 |
+
"execution_count": 13,
|
| 168 |
+
"metadata": {},
|
| 169 |
+
"outputs": [
|
| 170 |
+
{
|
| 171 |
+
"name": "stdout",
|
| 172 |
+
"output_type": "stream",
|
| 173 |
+
"text": [
|
| 174 |
+
"Sample texts prepared:\n",
|
| 175 |
+
" 1. 彼は日ごろから本を読むのが好きです。\n",
|
| 176 |
+
" 2. ひごろの勉強が大切だと思います。\n",
|
| 177 |
+
" 3. 日頃の努力が実を結ぶでしょう。\n",
|
| 178 |
+
" 4. 彼女は書きあらわすことが得意です。\n",
|
| 179 |
+
" 5. その問題を書き表すのは難しい。\n",
|
| 180 |
+
" 6. 今日は東京オリンピックについて話しましょう。\n",
|
| 181 |
+
" 7. コーヒーを飲んで、呑み込んで、また飲んでしまった。\n",
|
| 182 |
+
" 8. 国際的な協力が必要不可欠です。\n",
|
| 183 |
+
" 9. 機械学習の技術が進歩している。\n",
|
| 184 |
+
"10. 自然言語処理は興味深い分野だ。\n",
|
| 185 |
+
"\n",
|
| 186 |
+
"Extended corpus: 30 texts\n"
|
| 187 |
+
]
|
| 188 |
+
}
|
| 189 |
+
],
|
| 190 |
+
"source": [
|
| 191 |
+
"# Sample Japanese texts for testing\n",
|
| 192 |
+
"sample_texts = [\n",
|
| 193 |
+
" \"彼は日ごろから本を読むのが好きです。\",\n",
|
| 194 |
+
" \"ひごろの勉強が大切だと思います。\",\n",
|
| 195 |
+
" \"日頃の努力が実を結ぶでしょう。\",\n",
|
| 196 |
+
" \"彼女は書きあらわすことが得意です。\",\n",
|
| 197 |
+
" \"その問題を書き表すのは難しい。\",\n",
|
| 198 |
+
" \"今日は東京オリンピックについて話しましょう。\",\n",
|
| 199 |
+
" \"コーヒーを飲んで、呑み込んで、また飲んでしまった。\",\n",
|
| 200 |
+
" \"国際的な協力が必要不可欠です。\",\n",
|
| 201 |
+
" \"機械学習の技術が進歩している。\",\n",
|
| 202 |
+
" \"自然言語処理は興味深い分野だ。\"\n",
|
| 203 |
+
"]\n",
|
| 204 |
+
"\n",
|
| 205 |
+
"print(\"Sample texts prepared:\")\n",
|
| 206 |
+
"for i, text in enumerate(sample_texts, 1):\n",
|
| 207 |
+
" print(f\"{i:2d}. {text}\")\n",
|
| 208 |
+
"\n",
|
| 209 |
+
"# Create a larger corpus by repeating and slightly modifying texts\n",
|
| 210 |
+
"extended_corpus = sample_texts * 3 # Simulate frequency variations\n",
|
| 211 |
+
"print(f\"\\nExtended corpus: {len(extended_corpus)} texts\")"
|
| 212 |
+
]
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"cell_type": "code",
|
| 216 |
+
"execution_count": 14,
|
| 217 |
+
"metadata": {},
|
| 218 |
+
"outputs": [
|
| 219 |
+
{
|
| 220 |
+
"name": "stdout",
|
| 221 |
+
"output_type": "stream",
|
| 222 |
+
"text": [
|
| 223 |
+
"Mock BCCWJ frequency data:\n",
|
| 224 |
+
" lemma reading pos freq_bccwj key\n",
|
| 225 |
+
"0 日頃 ヒゴロ 名詞 1250 (日頃, ヒゴロ, 名詞)\n",
|
| 226 |
+
"1 本 ホン 名詞 8500 (本, ホン, 名詞)\n",
|
| 227 |
+
"2 読む ヨム 動詞 3200 (読む, ヨム, 動詞)\n",
|
| 228 |
+
"3 好き スキ 形容動詞 2100 (好き, スキ, 形容動詞)\n",
|
| 229 |
+
"4 勉強 ベンキョウ 名詞 4200 (勉強, ベンキョウ, 名詞)\n",
|
| 230 |
+
"5 大切 タイセツ 形容動詞 1800 (大切, タイセツ, 形容動詞)\n",
|
| 231 |
+
"6 思う オモウ 動詞 9500 (思う, オモウ, 動詞)\n",
|
| 232 |
+
"7 努力 ドリョク 名詞 2200 (努力, ドリョク, 名詞)\n",
|
| 233 |
+
"8 実 ミ 名詞 1100 (実, ミ, 名詞)\n",
|
| 234 |
+
"9 結ぶ ムスブ 動詞 800 (結ぶ, ムスブ, 動詞)\n",
|
| 235 |
+
"\n",
|
| 236 |
+
"Total entries: 25\n"
|
| 237 |
+
]
|
| 238 |
+
}
|
| 239 |
+
],
|
| 240 |
+
"source": [
|
| 241 |
+
"# Create mock BCCWJ frequency data for testing\n",
|
| 242 |
+
"# In real usage, this would be loaded from an actual BCCWJ frequency file\n",
|
| 243 |
+
"\n",
|
| 244 |
+
"mock_bccwj_data = [\n",
|
| 245 |
+
" ('日頃', 'ヒゴロ', '名詞', 1250),\n",
|
| 246 |
+
" ('本', 'ホン', '名詞', 8500),\n",
|
| 247 |
+
" ('読む', 'ヨム', '動詞', 3200),\n",
|
| 248 |
+
" ('好き', 'スキ', '形容動詞', 2100),\n",
|
| 249 |
+
" ('勉強', 'ベンキョウ', '名詞', 4200),\n",
|
| 250 |
+
" ('大切', 'タイセツ', '形容動詞', 1800),\n",
|
| 251 |
+
" ('思う', 'オモウ', '動詞', 9500),\n",
|
| 252 |
+
" ('努力', 'ドリョク', '名詞', 2200),\n",
|
| 253 |
+
" ('実', 'ミ', '名詞', 1100),\n",
|
| 254 |
+
" ('結ぶ', 'ムスブ', '動詞', 800),\n",
|
| 255 |
+
" ('書く', 'カク', '動詞', 4100),\n",
|
| 256 |
+
" ('表す', 'アラワス', '動詞', 1500),\n",
|
| 257 |
+
" ('得意', 'トクイ', '形容動詞', 1300),\n",
|
| 258 |
+
" ('問題', 'モンダイ', '名詞', 6200),\n",
|
| 259 |
+
" ('難しい', 'ムズカシイ', '形容詞', 3800),\n",
|
| 260 |
+
" ('今日', 'キョウ', '名詞', 5500),\n",
|
| 261 |
+
" ('東京', 'トウキョウ', '名詞', 4800),\n",
|
| 262 |
+
" ('話す', 'ハナス', '動詞', 3600),\n",
|
| 263 |
+
" ('飲む', 'ノム', '動詞', 2400),\n",
|
| 264 |
+
" ('呑む', 'ノム', '動詞', 150),\n",
|
| 265 |
+
" ('国際', 'コクサイ', '名詞', 2800),\n",
|
| 266 |
+
" ('協力', 'キョウリョク', '名詞', 1900),\n",
|
| 267 |
+
" ('必要', 'ヒツヨウ', '形容動詞', 4500),\n",
|
| 268 |
+
" ('技術', 'ギジュツ', '名詞', 3900),\n",
|
| 269 |
+
" ('進歩', 'シンポ', '名詞', 1100)\n",
|
| 270 |
+
"]\n",
|
| 271 |
+
"\n",
|
| 272 |
+
"# Create DataFrame\n",
|
| 273 |
+
"df_bccwj = pd.DataFrame(mock_bccwj_data, columns=['lemma', 'reading', 'pos', 'freq_bccwj'])\n",
|
| 274 |
+
"df_bccwj['key'] = list(zip(df_bccwj.lemma, df_bccwj.reading, df_bccwj.pos))\n",
|
| 275 |
+
"\n",
|
| 276 |
+
"print(\"Mock BCCWJ frequency data:\")\n",
|
| 277 |
+
"print(df_bccwj.head(10))\n",
|
| 278 |
+
"print(f\"\\nTotal entries: {len(df_bccwj)}\")"
|
| 279 |
+
]
|
| 280 |
+
},
|
| 281 |
+
{
|
| 282 |
+
"cell_type": "markdown",
|
| 283 |
+
"metadata": {},
|
| 284 |
+
"source": [
|
| 285 |
+
"## 3. Plan A: MeCab (fugashi) + UniDic Direct Pipeline\n",
|
| 286 |
+
"\n",
|
| 287 |
+
"### A-1 to A-3: Setup and Configuration\n",
|
| 288 |
+
"\n",
|
| 289 |
+
"UniDic provides the morphological analysis system used in BCCWJ, making it ideal for frequency matching."
|
| 290 |
+
]
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"cell_type": "code",
|
| 294 |
+
"execution_count": 14,
|
| 295 |
+
"metadata": {},
|
| 296 |
+
"outputs": [
|
| 297 |
+
{
|
| 298 |
+
"name": "stdout",
|
| 299 |
+
"output_type": "stream",
|
| 300 |
+
"text": [
|
| 301 |
+
"Initializing Plan A: fugashi + UniDic pipeline\n",
|
| 302 |
+
"Tagger initialized with UniDic dictionary: /Users/eguchi/Dropbox/teaching/Tohoku-2025/linguistic-data-analysis-I/.venv/lib/python3.12/site-packages/unidic/dicdir\n",
|
| 303 |
+
"\n",
|
| 304 |
+
"Test analysis of '日ごろから勉強している。':\n",
|
| 305 |
+
" 日ごろ -> 日頃 [名,詞,,,普,通,名,詞,,,副,詞,可,能,,,*]\n",
|
| 306 |
+
" から -> から [助,詞,,,格,助,詞,,,*,,,*]\n",
|
| 307 |
+
" 勉強 -> 勉強 [名,詞,,,普,通,名,詞,,,サ,変,可,能,,,*]\n",
|
| 308 |
+
" し -> 為る [動,詞,,,非,自,立,可,能,,,*,,,*]\n",
|
| 309 |
+
" て -> て [助,詞,,,接,続,助,詞,,,*,,,*]\n",
|
| 310 |
+
" いる -> 居る [動,詞,,,非,自,立,可,能,,,*,,,*]\n",
|
| 311 |
+
" 。 -> 。 [補,助,記,号,,,句,点,,,*,,,*]\n"
|
| 312 |
+
]
|
| 313 |
+
}
|
| 314 |
+
],
|
| 315 |
+
"source": [
|
| 316 |
+
"# A-3: Initialize fugashi with UniDic\n",
|
| 317 |
+
"print(\"Initializing Plan A: fugashi + UniDic pipeline\")\n",
|
| 318 |
+
"\n",
|
| 319 |
+
"# Initialize tagger with explicit UniDic path\n",
|
| 320 |
+
"tagger_a = fugashi.Tagger(f'-d \"{unidic.DICDIR}\"')\n",
|
| 321 |
+
"print(f\"Tagger initialized with UniDic dictionary: {unidic.DICDIR}\")\n",
|
| 322 |
+
"\n",
|
| 323 |
+
"# Test the tagger\n",
|
| 324 |
+
"test_text = \"日ごろから勉強している。\"\n",
|
| 325 |
+
"tokens = list(tagger_a(test_text))\n",
|
| 326 |
+
"print(f\"\\nTest analysis of '{test_text}':\")\n",
|
| 327 |
+
"for token in tokens:\n",
|
| 328 |
+
" print(f\" {token.surface} -> {token.feature.lemma} [{','.join(token.pos)}]\")"
|
| 329 |
+
]
|
| 330 |
+
},
|
| 331 |
+
{
|
| 332 |
+
"cell_type": "code",
|
| 333 |
+
"execution_count": 18,
|
| 334 |
+
"metadata": {},
|
| 335 |
+
"outputs": [
|
| 336 |
+
{
|
| 337 |
+
"name": "stdout",
|
| 338 |
+
"output_type": "stream",
|
| 339 |
+
"text": [
|
| 340 |
+
"Extracted keys from '日ごろから勉強している。':\n",
|
| 341 |
+
" (日ごろ, ヒゴロ, 名)\n",
|
| 342 |
+
" (から, カラ, 助)\n",
|
| 343 |
+
" (勉強, ベンキョー, 名)\n",
|
| 344 |
+
" (する, スル, 動)\n",
|
| 345 |
+
" (て, テ, 助)\n",
|
| 346 |
+
" (いる, イル, 動)\n",
|
| 347 |
+
" (。, *, 補)\n"
|
| 348 |
+
]
|
| 349 |
+
}
|
| 350 |
+
],
|
| 351 |
+
"source": [
|
| 352 |
+
"# A-4: Morphological field extraction function\n",
|
| 353 |
+
"def iter_lemma_keys_plan_a(text: str, tagger) -> List[Tuple[str, str, str]]:\n",
|
| 354 |
+
" \"\"\"\n",
|
| 355 |
+
" Extract (lemma, reading, pos_major) tuples from text using UniDic.\n",
|
| 356 |
+
" \n",
|
| 357 |
+
" Args:\n",
|
| 358 |
+
" text: Input Japanese text\n",
|
| 359 |
+
" tagger: fugashi Tagger instance\n",
|
| 360 |
+
" \n",
|
| 361 |
+
" Returns:\n",
|
| 362 |
+
" List of (dictionary_form, reading, pos_major) tuples\n",
|
| 363 |
+
" \"\"\"\n",
|
| 364 |
+
" keys = []\n",
|
| 365 |
+
" for m in tagger(text):\n",
|
| 366 |
+
" if m.surface.strip(): # Skip empty tokens\n",
|
| 367 |
+
" # UniDic POS is hierarchical; use major category (pos[0])\n",
|
| 368 |
+
" pos_major = m.pos[0] if m.pos else 'UNKNOWN'\n",
|
| 369 |
+
" lemma = m.feature[10] if m.feature[10] else m.surface\n",
|
| 370 |
+
" reading = m.feature[11] if m.feature[11] else ''\n",
|
| 371 |
+
" keys.append((lemma, reading, pos_major))\n",
|
| 372 |
+
" return keys\n",
|
| 373 |
+
"\n",
|
| 374 |
+
"# Test the extraction function\n",
|
| 375 |
+
"test_keys = iter_lemma_keys_plan_a(test_text, tagger_a)\n",
|
| 376 |
+
"print(f\"Extracted keys from '{test_text}':\")\n",
|
| 377 |
+
"for lemma, reading, pos in test_keys:\n",
|
| 378 |
+
" print(f\" ({lemma}, {reading}, {pos})\")"
|
| 379 |
+
]
|
| 380 |
+
},
|
| 381 |
+
{
|
| 382 |
+
"cell_type": "code",
|
| 383 |
+
"execution_count": 19,
|
| 384 |
+
"metadata": {},
|
| 385 |
+
"outputs": [
|
| 386 |
+
{
|
| 387 |
+
"name": "stdout",
|
| 388 |
+
"output_type": "stream",
|
| 389 |
+
"text": [
|
| 390 |
+
"Extracted keys from '日ごろから勉強している。' (fixed version):\n",
|
| 391 |
+
" (日ごろ, ヒゴロ, 名)\n",
|
| 392 |
+
" (から, カラ, 助)\n",
|
| 393 |
+
" (勉強, ベンキョー, 名)\n",
|
| 394 |
+
" (する, シ, 動)\n",
|
| 395 |
+
" (て, テ, 助)\n",
|
| 396 |
+
" (いる, イル, 動)\n",
|
| 397 |
+
" (。, *, 補)\n"
|
| 398 |
+
]
|
| 399 |
+
}
|
| 400 |
+
],
|
| 401 |
+
"source": [
|
| 402 |
+
"# Fixed version with proper fugashi/UniDic attribute handling\n",
|
| 403 |
+
"def iter_lemma_keys_fixed(text: str, tagger) -> List[Tuple[str, str, str]]:\n",
|
| 404 |
+
" \"\"\"\n",
|
| 405 |
+
" Extract (lemma, reading, pos_major) tuples from text using UniDic.\n",
|
| 406 |
+
" Fixed version that handles fugashi attribute variations.\n",
|
| 407 |
+
" \"\"\"\n",
|
| 408 |
+
" keys = []\n",
|
| 409 |
+
" for m in tagger(text):\n",
|
| 410 |
+
" if m.surface.strip(): # Skip empty tokens\n",
|
| 411 |
+
" # UniDic POS is hierarchical; use major category (pos[0])\n",
|
| 412 |
+
" pos_major = m.pos[0] if m.pos else 'UNKNOWN'\n",
|
| 413 |
+
" \n",
|
| 414 |
+
" # Handle different attribute names for lemma\n",
|
| 415 |
+
" try:\n",
|
| 416 |
+
" lemma = m.lemma if hasattr(m, 'lemma') else m.feature[10]\n",
|
| 417 |
+
" except:\n",
|
| 418 |
+
" lemma = m.surface # fallback\n",
|
| 419 |
+
" \n",
|
| 420 |
+
" # Handle different attribute names for reading\n",
|
| 421 |
+
" try:\n",
|
| 422 |
+
" reading = m.feature[9] if len(m.feature) > 9 else ''\n",
|
| 423 |
+
" except:\n",
|
| 424 |
+
" reading = '' # fallback\n",
|
| 425 |
+
" \n",
|
| 426 |
+
" keys.append((lemma, reading, pos_major))\n",
|
| 427 |
+
" return keys\n",
|
| 428 |
+
"\n",
|
| 429 |
+
"# Use the fixed function\n",
|
| 430 |
+
"iter_lemma_keys_plan_a = iter_lemma_keys_fixed\n",
|
| 431 |
+
"\n",
|
| 432 |
+
"# Test the fixed function\n",
|
| 433 |
+
"test_keys = iter_lemma_keys_plan_a(test_text, tagger_a)\n",
|
| 434 |
+
"print(f\"Extracted keys from '{test_text}' (fixed version):\")\n",
|
| 435 |
+
"for lemma, reading, pos in test_keys:\n",
|
| 436 |
+
" print(f\" ({lemma}, {reading}, {pos})\")"
|
| 437 |
+
]
|
| 438 |
+
},
|
| 439 |
+
{
|
| 440 |
+
"cell_type": "code",
|
| 441 |
+
"execution_count": 20,
|
| 442 |
+
"metadata": {},
|
| 443 |
+
"outputs": [
|
| 444 |
+
{
|
| 445 |
+
"name": "stdout",
|
| 446 |
+
"output_type": "stream",
|
| 447 |
+
"text": [
|
| 448 |
+
"Analyzing 30 texts with Plan A...\n",
|
| 449 |
+
"\n",
|
| 450 |
+
"Plan A Results (top 15):\n",
|
| 451 |
+
" lemma reading pos freq_local freq_bccwj\n",
|
| 452 |
+
"11 。 * 補 30 NaN\n",
|
| 453 |
+
"8 が ガ 助 18 NaN\n",
|
| 454 |
+
"1 は ワ 助 15 NaN\n",
|
| 455 |
+
"7 の ノ 助 15 NaN\n",
|
| 456 |
+
"5 を オ 助 12 NaN\n",
|
| 457 |
+
"10 です デス 助 9 NaN\n",
|
| 458 |
+
"42 で デ 助 9 NaN\n",
|
| 459 |
+
"15 だ ダ 助 6 NaN\n",
|
| 460 |
+
"37 て テ 助 6 NaN\n",
|
| 461 |
+
"41 飲む ノン 動 6 NaN\n",
|
| 462 |
+
"43 、 * 補 6 NaN\n",
|
| 463 |
+
"48 国際 コクサイ 名 3 NaN\n",
|
| 464 |
+
"47 た タ 助 3 NaN\n",
|
| 465 |
+
"46 しまう シマッ 動 3 NaN\n",
|
| 466 |
+
"0 彼 カレ 代 3 NaN\n"
|
| 467 |
+
]
|
| 468 |
+
}
|
| 469 |
+
],
|
| 470 |
+
"source": [
|
| 471 |
+
"# A-5: Frequency analysis with BCCWJ matching\n",
|
| 472 |
+
"def analyze_corpus_plan_a(corpus: List[str], tagger, bccwj_df: pd.DataFrame) -> pd.DataFrame:\n",
|
| 473 |
+
" \"\"\"Analyze corpus using Plan A and match with BCCWJ frequencies.\"\"\"\n",
|
| 474 |
+
" freq = Counter()\n",
|
| 475 |
+
" \n",
|
| 476 |
+
" print(f\"Analyzing {len(corpus)} texts with Plan A...\")\n",
|
| 477 |
+
" for text in corpus:\n",
|
| 478 |
+
" for key in iter_lemma_keys_plan_a(text, tagger):\n",
|
| 479 |
+
" freq[key] += 1\n",
|
| 480 |
+
" \n",
|
| 481 |
+
" # Convert to DataFrame\n",
|
| 482 |
+
" rows = []\n",
|
| 483 |
+
" for (lemma, reading, pos), count in freq.items():\n",
|
| 484 |
+
" rows.append((lemma, reading, pos, count))\n",
|
| 485 |
+
" \n",
|
| 486 |
+
" df_local = pd.DataFrame(rows, columns=['lemma', 'reading', 'pos', 'freq_local'])\n",
|
| 487 |
+
" df_local['key'] = list(zip(df_local.lemma, df_local.reading, df_local.pos))\n",
|
| 488 |
+
" \n",
|
| 489 |
+
" # Merge with BCCWJ data\n",
|
| 490 |
+
" merged = df_local.merge(bccwj_df[['key', 'freq_bccwj']], on='key', how='left')\n",
|
| 491 |
+
" \n",
|
| 492 |
+
" return merged.sort_values('freq_local', ascending=False)\n",
|
| 493 |
+
"\n",
|
| 494 |
+
"# Run Plan A analysis\n",
|
| 495 |
+
"results_a = analyze_corpus_plan_a(extended_corpus, tagger_a, df_bccwj)\n",
|
| 496 |
+
"print(f\"\\nPlan A Results (top 15):\")\n",
|
| 497 |
+
"print(results_a.head(15)[['lemma', 'reading', 'pos', 'freq_local', 'freq_bccwj']])"
|
| 498 |
+
]
|
| 499 |
+
},
|
| 500 |
+
{
|
| 501 |
+
"cell_type": "code",
|
| 502 |
+
"execution_count": 21,
|
| 503 |
+
"metadata": {},
|
| 504 |
+
"outputs": [
|
| 505 |
+
{
|
| 506 |
+
"name": "stdout",
|
| 507 |
+
"output_type": "stream",
|
| 508 |
+
"text": [
|
| 509 |
+
"Plan A Evaluation Metrics:\n",
|
| 510 |
+
" type_coverage: 0.000\n",
|
| 511 |
+
" token_coverage: 0.000\n",
|
| 512 |
+
" correlation: None\n",
|
| 513 |
+
" p_value: None\n",
|
| 514 |
+
" total_types: 66\n",
|
| 515 |
+
" matched_types: 0\n",
|
| 516 |
+
" total_tokens: 297\n",
|
| 517 |
+
" matched_tokens: 0\n"
|
| 518 |
+
]
|
| 519 |
+
}
|
| 520 |
+
],
|
| 521 |
+
"source": [
|
| 522 |
+
"# A-6: Evaluation metrics for Plan A\n",
|
| 523 |
+
"def calculate_metrics(df: pd.DataFrame) -> Dict[str, float]:\n",
|
| 524 |
+
" \"\"\"Calculate coverage and correlation metrics.\"\"\"\n",
|
| 525 |
+
" # Coverage: percentage of local tokens found in BCCWJ\n",
|
| 526 |
+
" matched = df.dropna(subset=['freq_bccwj'])\n",
|
| 527 |
+
" coverage = len(matched) / len(df) * 100\n",
|
| 528 |
+
" \n",
|
| 529 |
+
" # Token coverage (by frequency)\n",
|
| 530 |
+
" total_tokens = df['freq_local'].sum()\n",
|
| 531 |
+
" matched_tokens = matched['freq_local'].sum()\n",
|
| 532 |
+
" token_coverage = matched_tokens / total_tokens * 100\n",
|
| 533 |
+
" \n",
|
| 534 |
+
" # Spearman correlation for matched items\n",
|
| 535 |
+
" if len(matched) > 1:\n",
|
| 536 |
+
" if scipy_available:\n",
|
| 537 |
+
" correlation, p_value = spearmanr(matched['freq_local'], matched['freq_bccwj'])\n",
|
| 538 |
+
" else:\n",
|
| 539 |
+
" correlation = np.corrcoef(matched['freq_local'].rank(), matched['freq_bccwj'].rank())[0,1]\n",
|
| 540 |
+
" p_value = None\n",
|
| 541 |
+
" else:\n",
|
| 542 |
+
" correlation, p_value = None, None\n",
|
| 543 |
+
" \n",
|
| 544 |
+
" return {\n",
|
| 545 |
+
" 'type_coverage': coverage,\n",
|
| 546 |
+
" 'token_coverage': token_coverage,\n",
|
| 547 |
+
" 'correlation': correlation,\n",
|
| 548 |
+
" 'p_value': p_value,\n",
|
| 549 |
+
" 'total_types': len(df),\n",
|
| 550 |
+
" 'matched_types': len(matched),\n",
|
| 551 |
+
" 'total_tokens': total_tokens,\n",
|
| 552 |
+
" 'matched_tokens': matched_tokens\n",
|
| 553 |
+
" }\n",
|
| 554 |
+
"\n",
|
| 555 |
+
"metrics_a = calculate_metrics(results_a)\n",
|
| 556 |
+
"print(\"Plan A Evaluation Metrics:\")\n",
|
| 557 |
+
"for key, value in metrics_a.items():\n",
|
| 558 |
+
" if isinstance(value, float) and value is not None:\n",
|
| 559 |
+
" print(f\" {key}: {value:.3f}\")\n",
|
| 560 |
+
" else:\n",
|
| 561 |
+
" print(f\" {key}: {value}\")"
|
| 562 |
+
]
|
| 563 |
+
},
|
| 564 |
+
{
|
| 565 |
+
"cell_type": "markdown",
|
| 566 |
+
"metadata": {},
|
| 567 |
+
"source": [
|
| 568 |
+
"# Using Fugashi"
|
| 569 |
+
]
|
| 570 |
+
},
|
| 571 |
+
{
|
| 572 |
+
"cell_type": "code",
|
| 573 |
+
"execution_count": 17,
|
| 574 |
+
"metadata": {},
|
| 575 |
+
"outputs": [
|
| 576 |
+
{
|
| 577 |
+
"name": "stdout",
|
| 578 |
+
"output_type": "stream",
|
| 579 |
+
"text": [
|
| 580 |
+
"彼 [('彼', '代名詞', '代')]\n",
|
| 581 |
+
"は [('は', '助詞', '助')]\n",
|
| 582 |
+
"日ごろ [('日頃', '名詞', '名')]\n",
|
| 583 |
+
"本 [('本', '名詞', '名')]\n",
|
| 584 |
+
"を [('を', '助詞', '助')]\n",
|
| 585 |
+
"読む [('読む', '動詞', '動')]\n",
|
| 586 |
+
"。 [('。', '補助記号', '補')]\n"
|
| 587 |
+
]
|
| 588 |
+
}
|
| 589 |
+
],
|
| 590 |
+
"source": [
|
| 591 |
+
"import fugashi, unidic\n",
|
| 592 |
+
"from spacy.tokens import Token\n",
|
| 593 |
+
"tagger = fugashi.Tagger()\n",
|
| 594 |
+
"tagger = fugashi.Tagger(f'-d \"{unidic.DICDIR}\"')\n",
|
| 595 |
+
"\n",
|
| 596 |
+
"if not Token.has_extension(\"unidic_lemmas\"):\n",
|
| 597 |
+
" Token.set_extension(\"unidic_lemmas\", default=None)\n",
|
| 598 |
+
"\n",
|
| 599 |
+
"def enrich_with_unidic(doc):\n",
|
| 600 |
+
" text = doc.text\n",
|
| 601 |
+
" # GiNZA token start index -> token\n",
|
| 602 |
+
" start_map = {tok.idx: tok for tok in doc}\n",
|
| 603 |
+
" cursor = 0\n",
|
| 604 |
+
" for m in tagger(text):\n",
|
| 605 |
+
" surf = m.surface\n",
|
| 606 |
+
" start = text.find(surf, cursor)\n",
|
| 607 |
+
" if start < 0:\n",
|
| 608 |
+
" continue\n",
|
| 609 |
+
" cursor = start + len(surf)\n",
|
| 610 |
+
" tok = start_map.get(start)\n",
|
| 611 |
+
" if tok:\n",
|
| 612 |
+
" if tok._.unidic_lemmas is None:\n",
|
| 613 |
+
" tok._.unidic_lemmas = []\n",
|
| 614 |
+
" tok._.unidic_lemmas.append(\n",
|
| 615 |
+
" (m.feature.lemma, m.feature.pos1, m.pos[0])\n",
|
| 616 |
+
" )\n",
|
| 617 |
+
" return doc\n",
|
| 618 |
+
"\n",
|
| 619 |
+
"doc = enrich_with_unidic(doc)\n",
|
| 620 |
+
"for t in doc:\n",
|
| 621 |
+
" print(t.text, t._.unidic_lemmas)"
|
| 622 |
+
]
|
| 623 |
+
},
|
| 624 |
+
{
|
| 625 |
+
"cell_type": "code",
|
| 626 |
+
"execution_count": 5,
|
| 627 |
+
"metadata": {},
|
| 628 |
+
"outputs": [],
|
| 629 |
+
"source": [
|
| 630 |
+
"text = \"日頃からの日ごろをてっていする。\""
|
| 631 |
+
]
|
| 632 |
+
},
|
| 633 |
+
{
|
| 634 |
+
"cell_type": "code",
|
| 635 |
+
"execution_count": 6,
|
| 636 |
+
"metadata": {},
|
| 637 |
+
"outputs": [],
|
| 638 |
+
"source": [
|
| 639 |
+
"import spacy\n",
|
| 640 |
+
"from fugashi import Tagger\n",
|
| 641 |
+
"import unidic # or unidic_lite\n",
|
| 642 |
+
"\n",
|
| 643 |
+
"nlp = spacy.load(\"ja_ginza\")\n",
|
| 644 |
+
"tagger = Tagger(f'-d \"{unidic.DICDIR}\"') # フル UniDic\n",
|
| 645 |
+
"doc = nlp(text)\n",
|
| 646 |
+
"mecab_tokens = list(tagger(text))\n",
|
| 647 |
+
"# → 文字オフセットでアライメントして doc の token に UniDic 情報を付与"
|
| 648 |
+
]
|
| 649 |
+
},
|
| 650 |
+
{
|
| 651 |
+
"cell_type": "code",
|
| 652 |
+
"execution_count": 7,
|
| 653 |
+
"metadata": {},
|
| 654 |
+
"outputs": [
|
| 655 |
+
{
|
| 656 |
+
"data": {
|
| 657 |
+
"text/plain": [
|
| 658 |
+
"[日頃, から, の, 日ごろ, を, てってい, する, 。]"
|
| 659 |
+
]
|
| 660 |
+
},
|
| 661 |
+
"execution_count": 7,
|
| 662 |
+
"metadata": {},
|
| 663 |
+
"output_type": "execute_result"
|
| 664 |
+
}
|
| 665 |
+
],
|
| 666 |
+
"source": [
|
| 667 |
+
"mecab_tokens"
|
| 668 |
+
]
|
| 669 |
+
},
|
| 670 |
+
{
|
| 671 |
+
"cell_type": "code",
|
| 672 |
+
"execution_count": 8,
|
| 673 |
+
"metadata": {},
|
| 674 |
+
"outputs": [
|
| 675 |
+
{
|
| 676 |
+
"name": "stdout",
|
| 677 |
+
"output_type": "stream",
|
| 678 |
+
"text": [
|
| 679 |
+
"<fugashi.fugashi.Tagger object at 0x1183bad80>\n"
|
| 680 |
+
]
|
| 681 |
+
}
|
| 682 |
+
],
|
| 683 |
+
"source": [
|
| 684 |
+
"print(tagger)"
|
| 685 |
+
]
|
| 686 |
+
},
|
| 687 |
+
{
|
| 688 |
+
"cell_type": "code",
|
| 689 |
+
"execution_count": 9,
|
| 690 |
+
"metadata": {},
|
| 691 |
+
"outputs": [
|
| 692 |
+
{
|
| 693 |
+
"name": "stdout",
|
| 694 |
+
"output_type": "stream",
|
| 695 |
+
"text": [
|
| 696 |
+
"Using unidic at: /Users/eguchi/Dropbox/teaching/Tohoku-2025/linguistic-data-analysis-I/.venv/lib/python3.12/site-packages/unidic/dicdir\n"
|
| 697 |
+
]
|
| 698 |
+
}
|
| 699 |
+
],
|
| 700 |
+
"source": [
|
| 701 |
+
"import unidic\n",
|
| 702 |
+
"print(\"Using unidic at:\", unidic.DICDIR)"
|
| 703 |
+
]
|
| 704 |
+
},
|
| 705 |
+
{
|
| 706 |
+
"cell_type": "code",
|
| 707 |
+
"execution_count": 10,
|
| 708 |
+
"metadata": {},
|
| 709 |
+
"outputs": [
|
| 710 |
+
{
|
| 711 |
+
"name": "stdout",
|
| 712 |
+
"output_type": "stream",
|
| 713 |
+
"text": [
|
| 714 |
+
"feature_len: 29\n"
|
| 715 |
+
]
|
| 716 |
+
}
|
| 717 |
+
],
|
| 718 |
+
"source": [
|
| 719 |
+
"sample = next(iter(tagger(\"テスト\")))\n",
|
| 720 |
+
"print(\"feature_len:\", len(sample.feature))\n",
|
| 721 |
+
"# 17 = unidic-lite (2.1.2), 29前後 = フル UniDic 3.x"
|
| 722 |
+
]
|
| 723 |
+
},
|
| 724 |
+
{
|
| 725 |
+
"cell_type": "code",
|
| 726 |
+
"execution_count": null,
|
| 727 |
+
"metadata": {},
|
| 728 |
+
"outputs": [
|
| 729 |
+
{
|
| 730 |
+
"name": "stdout",
|
| 731 |
+
"output_type": "stream",
|
| 732 |
+
"text": [
|
| 733 |
+
"['dictionary_info']\n"
|
| 734 |
+
]
|
| 735 |
+
}
|
| 736 |
+
],
|
| 737 |
+
"source": [
|
| 738 |
+
"print([a for a in dir(tagger) if 'dic' in a.lower()])"
|
| 739 |
+
]
|
| 740 |
+
},
|
| 741 |
+
{
|
| 742 |
+
"cell_type": "code",
|
| 743 |
+
"execution_count": 12,
|
| 744 |
+
"metadata": {},
|
| 745 |
+
"outputs": [
|
| 746 |
+
{
|
| 747 |
+
"name": "stdout",
|
| 748 |
+
"output_type": "stream",
|
| 749 |
+
"text": [
|
| 750 |
+
"Available attrs: ['char_type', 'feature', 'feature_raw', 'is_unk', 'length', 'pos', 'posid', 'rlength', 'stat', 'surface', 'white_space']\n"
|
| 751 |
+
]
|
| 752 |
+
}
|
| 753 |
+
],
|
| 754 |
+
"source": [
|
| 755 |
+
"import fugashi\n",
|
| 756 |
+
"from fugashi import Tagger\n",
|
| 757 |
+
"\n",
|
| 758 |
+
"tagger = Tagger() # まずオプションなし\n",
|
| 759 |
+
"m = next(iter(tagger(\"日ごろ\")))\n",
|
| 760 |
+
"print(\"Available attrs:\", [a for a in dir(m) if not a.startswith('_')][:25])"
|
| 761 |
+
]
|
| 762 |
+
},
|
| 763 |
+
{
|
| 764 |
+
"cell_type": "code",
|
| 765 |
+
"execution_count": 13,
|
| 766 |
+
"metadata": {},
|
| 767 |
+
"outputs": [
|
| 768 |
+
{
|
| 769 |
+
"name": "stdout",
|
| 770 |
+
"output_type": "stream",
|
| 771 |
+
"text": [
|
| 772 |
+
"Tagger repr: <fugashi.fugashi.Tagger object at 0x13f33b5c0>\n",
|
| 773 |
+
"surface: 日ごろ\n",
|
| 774 |
+
"feature_len: 29\n",
|
| 775 |
+
"raw feature: UnidicFeatures29(pos1='名詞', pos2='普通名詞', pos3='副詞可能', pos4='*', cType='*', cForm='*', lForm='ヒゴロ', lemma='日頃', orth='日ごろ', pron='ヒゴロ', orthBase='日ごろ', pronBase='ヒゴロ', goshu='和', iType='*', iForm='*', fType='*', fForm='*', iConType='*', fConType='*', type='体', kana='ヒゴロ', kanaBase='ヒゴロ', form='ヒゴロ', formBase='ヒゴロ', aType='0', aConType='C2', aModType='*', lid='8605061500510720', lemma_id='31305')\n"
|
| 776 |
+
]
|
| 777 |
+
}
|
| 778 |
+
],
|
| 779 |
+
"source": [
|
| 780 |
+
"import fugashi\n",
|
| 781 |
+
"t = fugashi.Tagger()\n",
|
| 782 |
+
"print(\"Tagger repr:\", t) # ここに 'ipa' や 'unidic' などヒントが出ることが多い\n",
|
| 783 |
+
"\n",
|
| 784 |
+
"w = next(iter(t(\"日ごろ\")))\n",
|
| 785 |
+
"print(\"surface:\", w.surface)\n",
|
| 786 |
+
"print(\"feature_len:\", len(w.feature))\n",
|
| 787 |
+
"print(\"raw feature:\", w.feature) # まず 1語分"
|
| 788 |
+
]
|
| 789 |
+
},
|
| 790 |
+
{
|
| 791 |
+
"cell_type": "code",
|
| 792 |
+
"execution_count": null,
|
| 793 |
+
"metadata": {},
|
| 794 |
+
"outputs": [],
|
| 795 |
+
"source": []
|
| 796 |
+
}
|
| 797 |
+
],
|
| 798 |
+
"metadata": {
|
| 799 |
+
"kernelspec": {
|
| 800 |
+
"display_name": "Python 3",
|
| 801 |
+
"language": "python",
|
| 802 |
+
"name": "python3"
|
| 803 |
+
},
|
| 804 |
+
"language_info": {
|
| 805 |
+
"codemirror_mode": {
|
| 806 |
+
"name": "ipython",
|
| 807 |
+
"version": 3
|
| 808 |
+
},
|
| 809 |
+
"file_extension": ".py",
|
| 810 |
+
"mimetype": "text/x-python",
|
| 811 |
+
"name": "python",
|
| 812 |
+
"nbconvert_exporter": "python",
|
| 813 |
+
"pygments_lexer": "ipython3",
|
| 814 |
+
"version": "3.12.0"
|
| 815 |
+
}
|
| 816 |
+
},
|
| 817 |
+
"nbformat": 4,
|
| 818 |
+
"nbformat_minor": 4
|
| 819 |
+
}
|
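A note on the Plan A results above: matching against BCCWJ produced zero type and token coverage (0 of 66 types, 0 of 297 tokens), which means the local (lemma, reading, pos) keys never lined up with the reference keys; for example, the single-character POS used above ('動', '代') will not match a full UniDic POS such as '動詞'. The following is a minimal, hypothetical sketch of building the reference side of the lookup with an explicit key shape; the column names (lemma, lForm, pos, frequency) are assumed from the sample BCCWJ rows used elsewhere in this commit.

import pandas as pd

def load_bccwj_lookup(path: str) -> dict:
    # Read only the columns needed for the composite key (assumed header names).
    df = pd.read_csv(path, sep="\t", usecols=["lemma", "lForm", "pos", "frequency"])
    # Reduce the hierarchical POS to its first level, e.g. "助詞-格助詞" -> "助詞".
    df["pos1"] = df["pos"].str.split("-").str[0]
    # The key shape must match whatever the local analysis produces: (lemma, reading, pos1).
    return {(r.lemma, r.lForm, r.pos1): r.frequency for r in df.itertuples(index=False)}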
pyproject.toml
CHANGED
|
@@ -18,4 +18,8 @@ dependencies = [
|
|
| 18 |
"ja-core-news-md @ https://github.com/explosion/spacy-models/releases/download/ja_core_news_md-3.7.0/ja_core_news_md-3.7.0-py3-none-any.whl",
|
| 19 |
"ja-core-news-trf @ https://github.com/explosion/spacy-models/releases/download/ja_core_news_trf-3.7.2/ja_core_news_trf-3.7.2-py3-none-any.whl",
|
| 20 |
"huggingface-hub[cli]>=0.33.4",
|
| 21 |
]
|
|
|
|
| 18 |
"ja-core-news-md @ https://github.com/explosion/spacy-models/releases/download/ja_core_news_md-3.7.0/ja_core_news_md-3.7.0-py3-none-any.whl",
|
| 19 |
"ja-core-news-trf @ https://github.com/explosion/spacy-models/releases/download/ja_core_news_trf-3.7.2/ja_core_news_trf-3.7.2-py3-none-any.whl",
|
| 20 |
"huggingface-hub[cli]>=0.33.4",
|
| 21 |
+
"chardet>=5.2.0",
|
| 22 |
+
"fugashi>=1.3.0",
|
| 23 |
+
"unidic>=1.1.0",
|
| 24 |
+
"ipykernel>=6.29.5",
|
| 25 |
]
|
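The new dependencies pull in the fugashi wrapper and the unidic package, but the full UniDic dictionary itself is fetched in a separate step (python -m unidic download). A small sanity check, mirroring the DICDIR probe in the notebook above:

import os
import unidic

# unidic.DICDIR points at the dictionary directory; it stays empty until
# the dictionary has been fetched with: python -m unidic download
if os.path.isdir(unidic.DICDIR) and os.listdir(unidic.DICDIR):
    print("Using UniDic at:", unidic.DICDIR)
else:
    print("UniDic dictionary missing; run: python -m unidic download")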
resources/reference_lists/ja/BCCWJ_frequencylist_luw2_ver1_1.tsv
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:abdfe3f5c6383be148809f615834a8f8890d6acab1415428ca350cff08438908
|
| 3 |
+
size 355289031
|
resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1 copy.tsv
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:51c38228ac27858cf3fa35c71cddd54f2290b86f9ca5e705e360b2f849350179
|
| 3 |
+
size 5123687
|
resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:59cc5d3e0961f130b073a17736e8ff4c5f0f63bd759e27e3c7cd0d96e79f4443
|
| 3 |
+
size 76573321
|
resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:84fa50dd87a9094f85006c81d78d14afab54bfad55e4a7137c1beab89b7200a4
|
| 3 |
+
size 17713132
|
test/test_app.py
CHANGED
|
@@ -4,13 +4,15 @@ Basic test script to validate the application components.
|
|
| 4 |
|
| 5 |
import sys
|
| 6 |
import os
|
| 7 |
-
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def test_imports():
|
| 10 |
"""Test that all required modules can be imported."""
|
| 11 |
try:
|
| 12 |
-
from lexical_sophistication import LexicalSophisticationAnalyzer
|
| 13 |
-
from pos_parser import POSParser
|
| 14 |
print("✓ Backend modules imported successfully")
|
| 15 |
return True
|
| 16 |
except ImportError as e:
|
|
@@ -20,8 +22,8 @@ def test_imports():
|
|
| 20 |
def test_basic_functionality():
|
| 21 |
"""Test basic functionality with SpaCy models."""
|
| 22 |
try:
|
| 23 |
-
from lexical_sophistication import LexicalSophisticationAnalyzer
|
| 24 |
-
from pos_parser import POSParser
|
| 25 |
|
| 26 |
print("Testing basic class instantiation...")
|
| 27 |
print("Note: This will fail without SpaCy models installed")
|
|
@@ -64,4 +66,4 @@ def main():
|
|
| 64 |
return True
|
| 65 |
|
| 66 |
if __name__ == "__main__":
|
| 67 |
-
main()
|
|
|
|
| 4 |
|
| 5 |
import sys
|
| 6 |
import os
|
| 7 |
+
|
| 8 |
+
# Add the parent directory to the Python path for imports
|
| 9 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 10 |
|
| 11 |
def test_imports():
|
| 12 |
"""Test that all required modules can be imported."""
|
| 13 |
try:
|
| 14 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 15 |
+
from text_analyzer.pos_parser import POSParser
|
| 16 |
print("✓ Backend modules imported successfully")
|
| 17 |
return True
|
| 18 |
except ImportError as e:
|
|
|
|
| 22 |
def test_basic_functionality():
|
| 23 |
"""Test basic functionality with SpaCy models."""
|
| 24 |
try:
|
| 25 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 26 |
+
from text_analyzer.pos_parser import POSParser
|
| 27 |
|
| 28 |
print("Testing basic class instantiation...")
|
| 29 |
print("Note: This will fail without SpaCy models installed")
|
|
|
|
| 66 |
return True
|
| 67 |
|
| 68 |
if __name__ == "__main__":
|
| 69 |
+
main()
|
test/test_functionality.py
CHANGED
|
@@ -6,10 +6,12 @@ Extended test script to validate application functionality.
|
|
| 6 |
import sys
|
| 7 |
import os
|
| 8 |
import tempfile
|
| 9 |
-
sys.path.append(os.path.join(os.path.dirname(__file__), 'backend'))
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
| 13 |
import pandas as pd
|
| 14 |
|
| 15 |
def test_lexical_sophistication():
|
|
@@ -122,4 +124,4 @@ def main():
|
|
| 122 |
|
| 123 |
if __name__ == "__main__":
|
| 124 |
success = main()
|
| 125 |
-
sys.exit(0 if success else 1)
|
|
|
|
| 6 |
import sys
|
| 7 |
import os
|
| 8 |
import tempfile
|
|
|
|
| 9 |
|
| 10 |
+
# Add the parent directory to the Python path for imports
|
| 11 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 12 |
+
|
| 13 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 14 |
+
from text_analyzer.pos_parser import POSParser
|
| 15 |
import pandas as pd
|
| 16 |
|
| 17 |
def test_lexical_sophistication():
|
|
|
|
| 124 |
|
| 125 |
if __name__ == "__main__":
|
| 126 |
success = main()
|
| 127 |
+
sys.exit(0 if success else 1)
|
test/test_multi_index.py
CHANGED
|
@@ -3,9 +3,8 @@
|
|
| 3 |
import sys
|
| 4 |
import os
|
| 5 |
import tempfile
|
| 6 |
-
sys.path.append(os.path.join(os.path.dirname(__file__), 'backend'))
|
| 7 |
|
| 8 |
-
from lexical_sophistication import LexicalSophisticationAnalyzer
|
| 9 |
|
| 10 |
def test_multi_index_functionality():
|
| 11 |
print("Testing multi-index functionality...")
|
|
@@ -130,4 +129,4 @@ that,,,7,12279,500,12063.320,1.000"""
|
|
| 130 |
traceback.print_exc()
|
| 131 |
|
| 132 |
if __name__ == "__main__":
|
| 133 |
-
test_multi_index_functionality()
|
|
|
|
| 3 |
import sys
|
| 4 |
import os
|
| 5 |
import tempfile
|
|
|
|
| 6 |
|
| 7 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 8 |
|
| 9 |
def test_multi_index_functionality():
|
| 10 |
print("Testing multi-index functionality...")
|
|
|
|
| 129 |
traceback.print_exc()
|
| 130 |
|
| 131 |
if __name__ == "__main__":
|
| 132 |
+
test_multi_index_functionality()
|
test/test_yaml_config.py
CHANGED
|
@@ -5,9 +5,8 @@ Test script to validate YAML configuration system.
|
|
| 5 |
|
| 6 |
import sys
|
| 7 |
import os
|
| 8 |
-
sys.path.append(os.path.join(os.path.dirname(__file__), 'backend'))
|
| 9 |
|
| 10 |
-
from lexical_sophistication import LexicalSophisticationAnalyzer
|
| 11 |
import yaml
|
| 12 |
from pathlib import Path
|
| 13 |
|
|
@@ -153,4 +152,4 @@ def main():
|
|
| 153 |
|
| 154 |
if __name__ == "__main__":
|
| 155 |
success = main()
|
| 156 |
-
sys.exit(0 if success else 1)
|
|
|
|
| 5 |
|
| 6 |
import sys
|
| 7 |
import os
|
|
|
|
| 8 |
|
| 9 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 10 |
import yaml
|
| 11 |
from pathlib import Path
|
| 12 |
|
|
|
|
| 152 |
|
| 153 |
if __name__ == "__main__":
|
| 154 |
success = main()
|
| 155 |
+
sys.exit(0 if success else 1)
|
test_frequency_flexible.py
ADDED
|
@@ -0,0 +1 @@
|
| 1 |
+
\n#!/usr/bin/env python3\n\"\"\"\nTest script for the enhanced FrequencyAnalyzer with flexible column mapping.\nThis demonstrates the new functionality with sample data.\n\"\"\"\n\nimport pandas as pd\nimport numpy as np\nfrom io import StringIO\nimport sys\nimport os\n\n# Add the text_analyzer to path\nsys.path.append('text_analyzer')\n\nfrom frequency_analyzer import FrequencyAnalyzer\n\ndef create_sample_data():\n \"\"\"Create sample frequency data in the new format.\"\"\"\n sample_data = \"\"\"rank\tlForm\tlemma\tpos\tsubLemma\twType\tfrequency\tpmw\tPB_frequency\tPB_pmw\tPM_frequency\tPM_pmw\tcore_frequency\tcore_pmw\n1\tノ\tの\t助詞-格助詞\t\t和\t5061558\t48383.9\t1473494\t51791.5\t208748\t47179.3\t1398950\t51737.2\n2\tニ\tに\t助詞-格助詞\t\t和\t3576558\t34188.7\t1036653\t36437.1\t140178\t31681.7\t985766\t36456.5\n3\tテ\tて\t助詞-接続助詞\t\t和\t3493117\t33391.0\t948430\t33336.1\t124241\t28079.8\t902379\t33372.6\n4\tハ\tは\t助詞-係助詞\t\t和\t3289932\t31448.8\t945084\t33218.5\t129378\t29240.8\t899776\t33276.3\n5\tガ\tが\t助詞-格助詞\t\t和\t2518164\t24070.6\t743621\t26131.8\t103456\t23390.2\t707331\t26139.9\"\"\"\n return sample_data\n\ndef test_file_format_detection():\n \"\"\"Test file format detection functionality.\"\"\"\n print(\"=== Testing File Format Detection ===\")\n \n analyzer = FrequencyAnalyzer(file_size_limit_mb=300)\n sample_data = create_sample_data()\n \n format_info = analyzer.detect_file_format(sample_data)\n print(f\"Detected separator: '{format_info['separator']}'\")\n print(f\"Has header: {format_info['has_header']}\")\n print(f\"Estimated columns: {format_info['estimated_columns']}\")\n print(f\"Sample lines: {format_info['sample_lines'][:2]}\")\n print()\n\ndef test_column_detection():\n \"\"\"Test column detection and categorization.\"\"\"\n print(\"=== Testing Column Detection ===\")\n \n analyzer = FrequencyAnalyzer()\n sample_data = create_sample_data()\n \n # Read sample data for column detection\n df = pd.read_csv(StringIO(sample_data), sep='\\t')\n detected_cols = analyzer.detect_columns(df)\n \n print(\"Detected columns:\")\n for category, columns in detected_cols.items():\n print(f\" {category}: {columns}\")\n print()\n\ndef test_flexible_loading():\n \"\"\"Test flexible data loading with column configuration.\"\"\"\n print(\"=== Testing Flexible Data Loading ===\")\n \n analyzer = FrequencyAnalyzer()\n sample_data = create_sample_data()\n \n # Test with different column configurations\n configs = [\n {\n 'word_column': 'lForm',\n 'frequency_column': 'frequency',\n 'pos_column': 'pos'\n },\n {\n 'word_column': 'lemma',\n 'frequency_column': 'pmw',\n 'pos_column': 'pos'\n },\n {\n 'word_column': 'lForm',\n 'frequency_column': 'PB_frequency'\n }\n ]\n \n for i, config in enumerate(configs, 1):\n print(f\"Configuration {i}: {config}\")\n try:\n df = analyzer.load_frequency_data(sample_data, config)\n print(f\" ✓ Successfully loaded {len(df)} entries\")\n print(f\" ✓ Available frequency columns: {analyzer.get_available_frequency_columns()}\")\n print(f\" ✓ Available word columns: {analyzer.get_available_word_columns()}\")\n except Exception as e:\n print(f\" ✗ Error: {e}\")\n print()\n\ndef test_multi_frequency_analysis():\n \"\"\"Test multi-frequency analysis functionality.\"\"\"\n print(\"=== Testing Multi-Frequency Analysis ===\")\n \n analyzer = FrequencyAnalyzer()\n sample_data = create_sample_data()\n \n config = {\n 'word_column': 'lForm',\n 'frequency_column': 'frequency',\n 'pos_column': 'pos'\n }\n \n df = analyzer.load_frequency_data(sample_data, config)\n \n # Test analysis with 
multiple frequency columns\n freq_columns = ['frequency', 'pmw', 'PB_frequency']\n \n try:\n results = analyzer.create_multi_frequency_analysis(freq_columns, bin_size=2)\n \n print(f\"Multi-frequency analysis results:\")\n for col, result in results.items():\n print(f\" {col}: {len(result['group_labels'])} groups\")\n print(f\" Sample frequencies: {result['avg_frequencies'][:3]}\")\n \n except Exception as e:\n print(f\"Error in multi-frequency analysis: {e}\")\n print()\n\ndef test_rank_based_visualization():\n \"\"\"Test rank-based visualization with flexible columns.\"\"\"\n print(\"=== Testing Rank-Based Visualization ===\")\n \n analyzer = FrequencyAnalyzer()\n sample_data = create_sample_data()\n \n config = {\n 'word_column': 'lForm',\n 'frequency_column': 'frequency'\n }\n \n df = analyzer.load_frequency_data(sample_data, config)\n \n try:\n # Test with different frequency columns\n for col in ['frequency', 'pmw', 'PB_frequency']:\n result = analyzer.create_rank_based_visualization_flexible(\n column=col, \n bin_size=2, \n log_transform=False\n )\n \n print(f\"Analysis for column '{col}':\")\n print(f\" Groups: {len(result['group_labels'])}\")\n print(f\" Sample words: {[w['word'] for w in result['sample_words'].get(0, [])]}\")\n print(f\" Avg frequencies: {result['avg_frequencies']}\")\n \n except Exception as e:\n print(f\"Error in rank-based visualization: {e}\")\n print()\n\ndef test_backward_compatibility():\n \"\"\"Test backward compatibility with legacy interface.\"\"\"\n print(\"=== Testing Backward Compatibility ===\")\n \n analyzer = FrequencyAnalyzer()\n sample_data = create_sample_data()\n \n # Test with flexible loading first\n config = {\n 'word_column': 'lForm',\n 'frequency_column': 'frequency'\n }\n \n df = analyzer.load_frequency_data(sample_data, config)\n \n # Then test legacy methods\n try:\n legacy_cols = analyzer.get_available_columns()\n print(f\"Legacy available columns: {legacy_cols}\")\n \n if legacy_cols:\n stats = analyzer.calculate_statistics(legacy_cols[0])\n print(f\"Statistics for {legacy_cols[0]}: mean={stats['mean']:.1f}, count={stats['count']}\")\n \n top_words = analyzer.get_top_words(legacy_cols[0], n=3)\n print(f\"Top 3 words: {[w['word'] for w in top_words]}\")\n \n except Exception as e:\n print(f\"Error in backward compatibility test: {e}\")\n print()\n\nif __name__ == \"__main__\":\n print(\"Testing Enhanced FrequencyAnalyzer with Flexible Column Mapping\")\n print(\"=\" * 60)\n \n test_file_format_detection()\n test_column_detection()\n test_flexible_loading()\n test_multi_frequency_analysis()\n test_rank_based_visualization()\n test_backward_compatibility()\n \n print(\"All tests completed!\")\n
|
test_fugashi_diagnostic.py
ADDED
|
@@ -0,0 +1,134 @@
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Diagnostic test to check if fugashi is working and what matching methods are being used.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import sys
|
| 7 |
+
sys.path.append('.')
|
| 8 |
+
|
| 9 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 10 |
+
from web_app.config_manager import ConfigManager
|
| 11 |
+
|
| 12 |
+
def test_fugashi_diagnostic():
|
| 13 |
+
"""Test what matching methods are actually being used."""
|
| 14 |
+
|
| 15 |
+
print("=== Fugashi Diagnostic Test ===\n")
|
| 16 |
+
|
| 17 |
+
# Initialize Japanese analyzer
|
| 18 |
+
print("1. Initializing Japanese analyzer...")
|
| 19 |
+
try:
|
| 20 |
+
analyzer = LexicalSophisticationAnalyzer(language="ja", model_size="md")
|
| 21 |
+
print("✓ Japanese SpaCy model loaded successfully")
|
| 22 |
+
|
| 23 |
+
# Check if UniDic enricher is available
|
| 24 |
+
if hasattr(analyzer, 'unidic_enricher') and analyzer.unidic_enricher:
|
| 25 |
+
print("✓ UniDic enricher initialized successfully")
|
| 26 |
+
else:
|
| 27 |
+
print("⚠ UniDic enricher not available - using legacy mode")
|
| 28 |
+
except Exception as e:
|
| 29 |
+
print(f"✗ Failed to load Japanese model: {e}")
|
| 30 |
+
return False
|
| 31 |
+
|
| 32 |
+
# Load reference configuration
|
| 33 |
+
print("\n2. Loading BCCWJ frequency data only...")
|
| 34 |
+
config = ConfigManager.load_reference_config()
|
| 35 |
+
japanese_config = config.get('japanese', {}).get('unigrams', {})
|
| 36 |
+
|
| 37 |
+
# Load just BCCWJ frequency for testing
|
| 38 |
+
bccwj_config = japanese_config.get('BCCWJ_frequency')
|
| 39 |
+
if not bccwj_config:
|
| 40 |
+
print("✗ BCCWJ configuration not found")
|
| 41 |
+
return False
|
| 42 |
+
|
| 43 |
+
print("✓ BCCWJ configuration found")
|
| 44 |
+
|
| 45 |
+
# Load the data
|
| 46 |
+
bccwj_data = ConfigManager.load_reference_list_data(bccwj_config)
|
| 47 |
+
if not bccwj_data:
|
| 48 |
+
print("✗ Failed to load BCCWJ data")
|
| 49 |
+
return False
|
| 50 |
+
|
| 51 |
+
print(f"✓ BCCWJ data loaded successfully")
|
| 52 |
+
|
| 53 |
+
# Load into analyzer
|
| 54 |
+
reference_data = {"unigrams_BCCWJ_frequency": bccwj_data}
|
| 55 |
+
analyzer.load_reference_lists(reference_data)
|
| 56 |
+
print("✓ Reference data loaded into analyzer")
|
| 57 |
+
|
| 58 |
+
# Test with a simple Japanese sentence
|
| 59 |
+
print("\n3. Testing token matching methods...")
|
| 60 |
+
test_text = "私は学校に行きます。"
|
| 61 |
+
|
| 62 |
+
try:
|
| 63 |
+
results = analyzer.analyze_text(test_text, ["unigrams_BCCWJ_frequency"])
|
| 64 |
+
|
| 65 |
+
print(f"\nAnalysis completed for: '{test_text}'")
|
| 66 |
+
print(f"Total tokens analyzed: {len(results['token_details'])}")
|
| 67 |
+
|
| 68 |
+
print("\nDetailed token matching results:")
|
| 69 |
+
for i, token in enumerate(results['token_details']):
|
| 70 |
+
print(f"\nToken {i+1}: '{token['token']}' (lemma: '{token['lemma']}')")
|
| 71 |
+
print(f" POS: {token['pos']}, Tag: {token['tag']}")
|
| 72 |
+
|
| 73 |
+
# Check matching methods
|
| 74 |
+
token_method = token.get('unigrams_BCCWJ_frequency_token_match_method', 'unknown')
|
| 75 |
+
lemma_method = token.get('unigrams_BCCWJ_frequency_lemma_match_method', 'unknown')
|
| 76 |
+
|
| 77 |
+
token_score = token.get('unigrams_BCCWJ_frequency_token')
|
| 78 |
+
lemma_score = token.get('unigrams_BCCWJ_frequency_lemma')
|
| 79 |
+
|
| 80 |
+
print(f" Token matching method: {token_method}")
|
| 81 |
+
print(f" Lemma matching method: {lemma_method}")
|
| 82 |
+
print(f" Token score: {token_score}")
|
| 83 |
+
print(f" Lemma score: {lemma_score}")
|
| 84 |
+
|
| 85 |
+
# Show UniDic features if available
|
| 86 |
+
if 'unidic_features' in token:
|
| 87 |
+
unidic = token['unidic_features']
|
| 88 |
+
print(f" UniDic features available:")
|
| 89 |
+
print(f" lemma: '{unidic.get('lemma', '')}'")
|
| 90 |
+
print(f" lForm: '{unidic.get('lForm', '')}'")
|
| 91 |
+
print(f" pos1: '{unidic.get('pos1', '')}'")
|
| 92 |
+
print(f" pos2: '{unidic.get('pos2', '')}'")
|
| 93 |
+
print(f" alignment_confidence: {unidic.get('alignment_confidence', 0.0)}")
|
| 94 |
+
else:
|
| 95 |
+
print(" No UniDic features available")
|
| 96 |
+
|
| 97 |
+
# Summary
|
| 98 |
+
print("\n4. Summary:")
|
| 99 |
+
methods_used = {}
|
| 100 |
+
for token in results['token_details']:
|
| 101 |
+
token_method = token.get('unigrams_BCCWJ_frequency_token_match_method', 'unknown')
|
| 102 |
+
lemma_method = token.get('unigrams_BCCWJ_frequency_lemma_match_method', 'unknown')
|
| 103 |
+
methods_used[token_method] = methods_used.get(token_method, 0) + 1
|
| 104 |
+
if token_method != lemma_method:
|
| 105 |
+
methods_used[lemma_method] = methods_used.get(lemma_method, 0) + 1
|
| 106 |
+
|
| 107 |
+
print("Matching methods used:")
|
| 108 |
+
for method, count in methods_used.items():
|
| 109 |
+
print(f" {method}: {count} matches")
|
| 110 |
+
|
| 111 |
+
if 'legacy_spacy' in methods_used and len(methods_used) == 1:
|
| 112 |
+
print("\n❌ ALL tokens are using legacy_spacy - fugashi is NOT being used!")
|
| 113 |
+
return False
|
| 114 |
+
elif any('unidic' in method for method in methods_used):
|
| 115 |
+
print("\n✅ Some tokens are using UniDic-based matching - fugashi is working!")
|
| 116 |
+
return True
|
| 117 |
+
else:
|
| 118 |
+
print("\n⚠ Mixed or unexpected matching methods")
|
| 119 |
+
return False
|
| 120 |
+
|
| 121 |
+
except Exception as e:
|
| 122 |
+
print(f"✗ Error during analysis: {e}")
|
| 123 |
+
import traceback
|
| 124 |
+
traceback.print_exc()
|
| 125 |
+
return False
|
| 126 |
+
|
| 127 |
+
if __name__ == "__main__":
|
| 128 |
+
success = test_fugashi_diagnostic()
|
| 129 |
+
if success:
|
| 130 |
+
print("\n🎉 Fugashi diagnostic test indicates fugashi is working!")
|
| 131 |
+
else:
|
| 132 |
+
print("\n❌ Fugashi diagnostic test indicates fugashi is NOT working!")
|
| 133 |
+
|
| 134 |
+
sys.exit(0 if success else 1)
|
test_japanese_integration.py
ADDED
|
@@ -0,0 +1,135 @@
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script for Japanese lexical sophistication integration.
|
| 4 |
+
Tests the BCCWJ and CSJ frequency analysis with composite key lookup.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
sys.path.append('.')
|
| 10 |
+
|
| 11 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 12 |
+
from web_app.config_manager import ConfigManager
|
| 13 |
+
|
| 14 |
+
def test_japanese_integration():
|
| 15 |
+
"""Test Japanese corpus integration with sample text."""
|
| 16 |
+
|
| 17 |
+
print("=== Japanese Lexical Sophistication Integration Test ===\n")
|
| 18 |
+
|
| 19 |
+
# Initialize Japanese analyzer
|
| 20 |
+
print("1. Initializing Japanese analyzer...")
|
| 21 |
+
try:
|
| 22 |
+
analyzer = LexicalSophisticationAnalyzer(language="ja", model_size="md")
|
| 23 |
+
print("✓ Japanese SpaCy model loaded successfully")
|
| 24 |
+
|
| 25 |
+
# Check if UniDic enricher is available
|
| 26 |
+
if hasattr(analyzer, 'unidic_enricher') and analyzer.unidic_enricher:
|
| 27 |
+
print("✓ UniDic enricher initialized successfully")
|
| 28 |
+
else:
|
| 29 |
+
print("⚠ UniDic enricher not available - using legacy mode")
|
| 30 |
+
except Exception as e:
|
| 31 |
+
print(f"✗ Failed to load Japanese model: {e}")
|
| 32 |
+
print("Please install: python -m spacy download ja_core_news_md")
|
| 33 |
+
return False
|
| 34 |
+
|
| 35 |
+
# Load reference configuration
|
| 36 |
+
print("\n2. Loading reference configuration...")
|
| 37 |
+
config = ConfigManager.load_reference_config()
|
| 38 |
+
japanese_config = config.get('japanese', {}).get('unigrams', {})
|
| 39 |
+
|
| 40 |
+
if not japanese_config:
|
| 41 |
+
print("✗ No Japanese configuration found")
|
| 42 |
+
return False
|
| 43 |
+
|
| 44 |
+
print(f"✓ Found {len(japanese_config)} Japanese reference lists")
|
| 45 |
+
|
| 46 |
+
# Test data loading for available files
|
| 47 |
+
print("\n3. Testing data loading...")
|
| 48 |
+
reference_data = {}
|
| 49 |
+
|
| 50 |
+
for list_name, list_config in japanese_config.items():
|
| 51 |
+
if not list_config.get('enabled', False):
|
| 52 |
+
continue
|
| 53 |
+
|
| 54 |
+
file_path = list_config.get('files', {}).get('token', '')
|
| 55 |
+
if not os.path.exists(file_path):
|
| 56 |
+
print(f"⚠ File not found: {file_path}")
|
| 57 |
+
continue
|
| 58 |
+
|
| 59 |
+
print(f" Loading {list_name}...")
|
| 60 |
+
try:
|
| 61 |
+
data = ConfigManager.load_reference_list_data(list_config)
|
| 62 |
+
if data:
|
| 63 |
+
reference_data[f"unigrams_{list_name}"] = data
|
| 64 |
+
|
| 65 |
+
# Check if Japanese corpus data was created correctly
|
| 66 |
+
for file_type, file_data in data.items():
|
| 67 |
+
if isinstance(file_data, dict) and file_data.get('is_japanese_corpus'):
|
| 68 |
+
composite_count = len(file_data.get('composite_dict', {}))
|
| 69 |
+
lemma_count = len(file_data.get('lemma_dict', {}))
|
| 70 |
+
surface_count = len(file_data.get('surface_dict', {}))
|
| 71 |
+
print(f" ✓ {list_name}: {composite_count} composite keys, {lemma_count} lemmas, {surface_count} surface forms")
|
| 72 |
+
|
| 73 |
+
except Exception as e:
|
| 74 |
+
print(f" ✗ Error loading {list_name}: {e}")
|
| 75 |
+
|
| 76 |
+
if not reference_data:
|
| 77 |
+
print("✗ No reference data loaded successfully")
|
| 78 |
+
return False
|
| 79 |
+
|
| 80 |
+
# Load reference data into analyzer
|
| 81 |
+
print("\n4. Loading reference data into analyzer...")
|
| 82 |
+
analyzer.load_reference_lists(reference_data)
|
| 83 |
+
print(f"✓ Loaded {len(reference_data)} reference lists")
|
| 84 |
+
|
| 85 |
+
# Test with Japanese text
|
| 86 |
+
print("\n5. Testing Japanese text analysis...")
|
| 87 |
+
japanese_text = """
|
| 88 |
+
私は毎日学校に行きます。
|
| 89 |
+
友達と一緒に勉強して、とても楽しいです。
|
| 90 |
+
日本語の文法は少し難しいですが、頑張って覚えています。
|
| 91 |
+
"""
|
| 92 |
+
|
| 93 |
+
selected_indices = list(reference_data.keys())
|
| 94 |
+
print(f" Using indices: {', '.join(selected_indices)}")
|
| 95 |
+
|
| 96 |
+
try:
|
| 97 |
+
results = analyzer.analyze_text(japanese_text, selected_indices)
|
| 98 |
+
|
| 99 |
+
# Display results
|
| 100 |
+
print(f"\n6. Analysis Results:")
|
| 101 |
+
print(f" Total tokens: {results['text_stats']['total_tokens']}")
|
| 102 |
+
print(f" Content words: {results['text_stats']['content_words']}")
|
| 103 |
+
print(f" Function words: {results['text_stats']['function_words']}")
|
| 104 |
+
|
| 105 |
+
# Show some token details
|
| 106 |
+
print(f"\n Sample token analysis:")
|
| 107 |
+
for i, token in enumerate(results['token_details'][:5]): # First 5 tokens
|
| 108 |
+
print(f" {i+1}. {token['token']} (lemma: {token['lemma']}, pos: {token['pos']})")
|
| 109 |
+
for key, value in token.items():
|
| 110 |
+
if key.endswith('_token') or key.endswith('_lemma'):
|
| 111 |
+
if value != 'NA':
|
| 112 |
+
print(f" {key}: {value}")
|
| 113 |
+
|
| 114 |
+
# Show summary statistics
|
| 115 |
+
print(f"\n Summary statistics:")
|
| 116 |
+
for key, stats in results['summary'].items():
|
| 117 |
+
print(f" {key}: mean={stats['mean']:.2f}, count={stats['count']}")
|
| 118 |
+
|
| 119 |
+
print(f"\n✓ Japanese text analysis completed successfully!")
|
| 120 |
+
return True
|
| 121 |
+
|
| 122 |
+
except Exception as e:
|
| 123 |
+
print(f"✗ Error during analysis: {e}")
|
| 124 |
+
import traceback
|
| 125 |
+
traceback.print_exc()
|
| 126 |
+
return False
|
| 127 |
+
|
| 128 |
+
if __name__ == "__main__":
|
| 129 |
+
success = test_japanese_integration()
|
| 130 |
+
if success:
|
| 131 |
+
print("\n🎉 Japanese integration test PASSED!")
|
| 132 |
+
else:
|
| 133 |
+
print("\n❌ Japanese integration test FAILED!")
|
| 134 |
+
|
| 135 |
+
sys.exit(0 if success else 1)
|
test_unidic_diagnostic.py
ADDED
|
@@ -0,0 +1,201 @@
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Diagnostic test for UniDic integration functionality.
|
| 4 |
+
Tests both the fallback mechanism and enhanced features.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
sys.path.append('.')
|
| 10 |
+
|
| 11 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 12 |
+
from web_app.config_manager import ConfigManager
|
| 13 |
+
|
| 14 |
+
def test_unidic_diagnostic():
|
| 15 |
+
"""Test UniDic integration with diagnostic information."""
|
| 16 |
+
|
| 17 |
+
print("=== UniDic Integration Diagnostic Test ===\n")
|
| 18 |
+
|
| 19 |
+
# Initialize Japanese analyzer
|
| 20 |
+
print("1. Initializing Japanese analyzer...")
|
| 21 |
+
try:
|
| 22 |
+
analyzer = LexicalSophisticationAnalyzer(language="ja", model_size="md")
|
| 23 |
+
print("✓ Japanese SpaCy model loaded successfully")
|
| 24 |
+
|
| 25 |
+
# Check UniDic availability
|
| 26 |
+
unidic_available = hasattr(analyzer, 'unidic_enricher') and analyzer.unidic_enricher
|
| 27 |
+
if unidic_available:
|
| 28 |
+
print("✓ UniDic enricher initialized - enhanced mode available")
|
| 29 |
+
else:
|
| 30 |
+
print("⚠ UniDic enricher not available - using legacy fallback mode")
|
| 31 |
+
except Exception as e:
|
| 32 |
+
print(f"✗ Failed to initialize analyzer: {e}")
|
| 33 |
+
return False
|
| 34 |
+
|
| 35 |
+
# Load reference data
|
| 36 |
+
print("\n2. Loading reference configuration...")
|
| 37 |
+
config = ConfigManager.load_reference_config()
|
| 38 |
+
japanese_config = config.get('japanese', {}).get('unigrams', {})
|
| 39 |
+
|
| 40 |
+
# Get first available Japanese corpus
|
| 41 |
+
reference_data = {}
|
| 42 |
+
for list_name, list_config in japanese_config.items():
|
| 43 |
+
if list_config.get('enabled', False):
|
| 44 |
+
file_path = list_config.get('files', {}).get('token', '')
|
| 45 |
+
if os.path.exists(file_path):
|
| 46 |
+
data = ConfigManager.load_reference_list_data(list_config)
|
| 47 |
+
if data:
|
| 48 |
+
reference_data[f"unigrams_{list_name}"] = data
|
| 49 |
+
print(f"✓ Loaded {list_name} for testing")
|
| 50 |
+
break
|
| 51 |
+
|
| 52 |
+
if not reference_data:
|
| 53 |
+
print("✗ No reference data available")
|
| 54 |
+
return False
|
| 55 |
+
|
| 56 |
+
# Load into analyzer
|
| 57 |
+
analyzer.load_reference_lists(reference_data)
|
| 58 |
+
|
| 59 |
+
# Test with sample Japanese text
|
| 60 |
+
print("\n3. Testing Japanese text analysis...")
|
| 61 |
+
test_text = "私は学校に行く。"
|
| 62 |
+
selected_indices = list(reference_data.keys())
|
| 63 |
+
|
| 64 |
+
try:
|
| 65 |
+
results = analyzer.analyze_text(test_text, selected_indices)
|
| 66 |
+
|
| 67 |
+
print(f"\n4. Analysis Results:")
|
| 68 |
+
print(f" Total tokens: {results['text_stats']['total_tokens']}")
|
| 69 |
+
|
| 70 |
+
# Show detailed token analysis with diagnostic information
|
| 71 |
+
print(f"\n Token Details with Diagnostics:")
|
| 72 |
+
for i, token_detail in enumerate(results['token_details'][:4]): # First 4 tokens
|
| 73 |
+
print(f"\n Token {i+1}: '{token_detail['token']}'")
|
| 74 |
+
print(f" SpaCy: lemma='{token_detail['lemma']}', pos='{token_detail['pos']}', tag='{token_detail['tag']}'")
|
| 75 |
+
|
| 76 |
+
# Look for UniDic features
|
| 77 |
+
if 'unidic_features' in token_detail:
|
| 78 |
+
unidic_feat = token_detail['unidic_features']
|
| 79 |
+
print(f" UniDic: lemma='{unidic_feat.get('lemma', '')}', lForm='{unidic_feat.get('lForm', '')}', pos1='{unidic_feat.get('pos1', '')}', goshu='{unidic_feat.get('goshu', '')}'")
|
| 80 |
+
print(f" Alignment confidence: {unidic_feat.get('alignment_confidence', 0.0):.2f}")
|
| 81 |
+
|
| 82 |
+
# Show matching methods for each index
|
| 83 |
+
for idx_name in selected_indices:
|
| 84 |
+
token_method = token_detail.get(f"{idx_name}_token_match_method", "N/A")
|
| 85 |
+
lemma_method = token_detail.get(f"{idx_name}_lemma_match_method", "N/A")
|
| 86 |
+
token_score = token_detail.get(f"{idx_name}_token", "N/A")
|
| 87 |
+
lemma_score = token_detail.get(f"{idx_name}_lemma", "N/A")
|
| 88 |
+
|
| 89 |
+
print(f" {idx_name}:")
|
| 90 |
+
print(f" Token: score={token_score}, method={token_method}")
|
| 91 |
+
print(f" Lemma: score={lemma_score}, method={lemma_method}")
|
| 92 |
+
|
| 93 |
+
# Show summary
|
| 94 |
+
print(f"\n Summary Statistics:")
|
| 95 |
+
matching_methods = {}
|
| 96 |
+
for token_detail in results['token_details']:
|
| 97 |
+
for key, value in token_detail.items():
|
| 98 |
+
if key.endswith('_match_method'):
|
| 99 |
+
method = value
|
| 100 |
+
matching_methods[method] = matching_methods.get(method, 0) + 1
|
| 101 |
+
|
| 102 |
+
print(f" Matching method distribution:")
|
| 103 |
+
for method, count in matching_methods.items():
|
| 104 |
+
print(f" {method}: {count} matches")
|
| 105 |
+
|
| 106 |
+
return True
|
| 107 |
+
|
| 108 |
+
except Exception as e:
|
| 109 |
+
print(f"✗ Error during analysis: {e}")
|
| 110 |
+
import traceback
|
| 111 |
+
traceback.print_exc()
|
| 112 |
+
return False
|
| 113 |
+
|
| 114 |
+
def test_unidic_fallback_levels():
|
| 115 |
+
"""Test the 3-level UniDic fallback strategy simulation."""
|
| 116 |
+
print("\n=== UniDic Fallback Strategy Test ===\n")
|
| 117 |
+
|
| 118 |
+
# Simulate UniDic features for different fallback levels
|
| 119 |
+
test_cases = [
|
| 120 |
+
{
|
| 121 |
+
'name': 'Complete UniDic features (Level 1)',
|
| 122 |
+
'features': {
|
| 123 |
+
'lemma': '行く', 'lForm': 'イク', 'pos1': '動詞', 'pos2': '一般', 'goshu': '和'
|
| 124 |
+
},
|
| 125 |
+
'expected_keys': [
|
| 126 |
+
'行く_イク_動詞_一般_和', # Level 1
|
| 127 |
+
'行く_イク_動詞_一般', # Level 2
|
| 128 |
+
'行く_イク_動詞' # Level 3
|
| 129 |
+
]
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
'name': 'Partial features (Level 2)',
|
| 133 |
+
'features': {
|
| 134 |
+
'lemma': '学校', 'lForm': 'ガッコウ', 'pos1': '名詞', 'pos2': '一般', 'goshu': ''
|
| 135 |
+
},
|
| 136 |
+
'expected_keys': [
|
| 137 |
+
'学校_ガッコウ_名詞_一般', # Level 2
|
| 138 |
+
'学校_ガッコウ_名詞' # Level 3
|
| 139 |
+
]
|
| 140 |
+
},
|
| 141 |
+
{
|
| 142 |
+
'name': 'Minimal features (Level 3)',
|
| 143 |
+
'features': {
|
| 144 |
+
'lemma': '私', 'lForm': 'ワタシ', 'pos1': '代名詞', 'pos2': '', 'goshu': ''
|
| 145 |
+
},
|
| 146 |
+
'expected_keys': [
|
| 147 |
+
'私_ワタシ_代名詞' # Level 3 only
|
| 148 |
+
]
|
| 149 |
+
}
|
| 150 |
+
]
|
| 151 |
+
|
| 152 |
+
for case in test_cases:
|
| 153 |
+
print(f"Testing: {case['name']}")
|
| 154 |
+
features = case['features']
|
| 155 |
+
expected = case['expected_keys']
|
| 156 |
+
|
| 157 |
+
# Generate actual keys that would be attempted
|
| 158 |
+
actual_keys = []
|
| 159 |
+
|
| 160 |
+
# Level 1: {lemma}_{lForm}_{pos1}_{pos2}_{goshu}
|
| 161 |
+
if all([features['lemma'], features['lForm'], features['pos1'], features['pos2'], features['goshu']]):
|
| 162 |
+
level1_key = f"{features['lemma']}_{features['lForm']}_{features['pos1']}_{features['pos2']}_{features['goshu']}"
|
| 163 |
+
actual_keys.append(level1_key)
|
| 164 |
+
|
| 165 |
+
# Level 2: {lemma}_{lForm}_{pos1}_{pos2}
|
| 166 |
+
if all([features['lemma'], features['lForm'], features['pos1'], features['pos2']]):
|
| 167 |
+
level2_key = f"{features['lemma']}_{features['lForm']}_{features['pos1']}_{features['pos2']}"
|
| 168 |
+
actual_keys.append(level2_key)
|
| 169 |
+
|
| 170 |
+
# Level 3: {lemma}_{lForm}_{pos1}
|
| 171 |
+
if all([features['lemma'], features['lForm'], features['pos1']]):
|
| 172 |
+
level3_key = f"{features['lemma']}_{features['lForm']}_{features['pos1']}"
|
| 173 |
+
actual_keys.append(level3_key)
|
| 174 |
+
|
| 175 |
+
# Check if matches expected
|
| 176 |
+
match = actual_keys == expected
|
| 177 |
+
status = "✓" if match else "✗"
|
| 178 |
+
print(f" {status} Generated keys: {actual_keys}")
|
| 179 |
+
if not match:
|
| 180 |
+
print(f" Expected: {expected}")
|
| 181 |
+
print()
|
| 182 |
+
|
| 183 |
+
return True
|
| 184 |
+
|
| 185 |
+
if __name__ == "__main__":
|
| 186 |
+
print("Running UniDic integration diagnostics...\n")
|
| 187 |
+
|
| 188 |
+
success1 = test_unidic_diagnostic()
|
| 189 |
+
success2 = test_unidic_fallback_levels()
|
| 190 |
+
|
| 191 |
+
if success1 and success2:
|
| 192 |
+
print("\n🎉 All UniDic diagnostic tests PASSED!")
|
| 193 |
+
print("\nSystem Status:")
|
| 194 |
+
print("- Legacy Japanese analysis: ✓ Working")
|
| 195 |
+
print("- Fallback strategy: ✓ Implemented")
|
| 196 |
+
print("- Diagnostic tracking: ✓ Available")
|
| 197 |
+
print("- UniDic integration: ⚠ Ready (requires MeCab setup)")
|
| 198 |
+
else:
|
| 199 |
+
print("\n❌ Some diagnostic tests FAILED!")
|
| 200 |
+
|
| 201 |
+
sys.exit(0 if success1 and success2 else 1)
|
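The fallback-strategy test above enumerates the three composite-key levels by hand; an equivalent helper (illustrative only, not part of the committed modules) makes the most-specific-first order explicit:

def fallback_keys(lemma: str, lform: str, pos1: str, pos2: str = "", goshu: str = "") -> list:
    """Return candidate lookup keys from most to least specific."""
    keys = []
    if all([lemma, lform, pos1, pos2, goshu]):
        keys.append(f"{lemma}_{lform}_{pos1}_{pos2}_{goshu}")  # Level 1
    if all([lemma, lform, pos1, pos2]):
        keys.append(f"{lemma}_{lform}_{pos1}_{pos2}")          # Level 2
    if all([lemma, lform, pos1]):
        keys.append(f"{lemma}_{lform}_{pos1}")                 # Level 3
    return keys

# fallback_keys("行く", "イク", "動詞", "一般", "和")
# -> ["行く_イク_動詞_一般_和", "行く_イク_動詞_一般", "行く_イク_動詞"]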
text_analyzer/__pycache__/__init__.cpython-312.pyc
DELETED
|
Binary file (297 Bytes)
|
|
|
text_analyzer/__pycache__/lexical_sophistication.cpython-312.pyc
DELETED
|
Binary file (24.3 kB)
|
|
|
text_analyzer/__pycache__/pos_parser.cpython-312.pyc
DELETED
|
Binary file (9.72 kB)
|
|
|
text_analyzer/app_config.py
ADDED
|
@@ -0,0 +1,183 @@
|
| 1 |
+
"""
|
| 2 |
+
Centralized configuration module for the text analysis application.
|
| 3 |
+
Contains all constants, settings, and configuration loading utilities.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import yaml
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Dict, Any, Optional
|
| 9 |
+
import logging
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class AppConfig:
|
| 15 |
+
"""Centralized configuration management for the text analysis application."""
|
| 16 |
+
|
| 17 |
+
# SpaCy Model Mappings
|
| 18 |
+
SPACY_MODELS = {
|
| 19 |
+
("en", "md"): "en_core_web_md",
|
| 20 |
+
("en", "trf"): "en_core_web_trf",
|
| 21 |
+
("ja", "md"): "ja_core_news_md",
|
| 22 |
+
("ja", "trf"): "ja_core_news_trf"
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
# Default Settings
|
| 26 |
+
DEFAULT_LANGUAGE = "en"
|
| 27 |
+
DEFAULT_MODEL_SIZE = "md" # Changed from "trf" to be more accessible
|
| 28 |
+
|
| 29 |
+
# Analysis Limits (shared constants)
|
| 30 |
+
MAX_TOKENS_FOR_VISUALIZATION = 30
|
| 31 |
+
DEFAULT_HISTOGRAM_BINS = 25
|
| 32 |
+
DEFAULT_RANK_BIN_SIZE = 500
|
| 33 |
+
MAX_NGRAM_SENTENCE_LENGTH = 100
|
| 34 |
+
|
| 35 |
+
# File Processing (generic utilities)
|
| 36 |
+
SUPPORTED_ENCODINGS = ['utf-8', 'utf-16', 'latin-1']
|
| 37 |
+
SUPPORTED_DELIMITERS = [',', '\t', ';']
|
| 38 |
+
|
| 39 |
+
# Configuration Paths
|
| 40 |
+
REFERENCE_LISTS_CONFIG = "config/reference_lists.yaml"
|
| 41 |
+
|
| 42 |
+
@classmethod
|
| 43 |
+
def get_spacy_model_name(cls, language: str, model_size: str) -> Optional[str]:
|
| 44 |
+
"""
|
| 45 |
+
Get the SpaCy model name for given language and size.
|
| 46 |
+
|
| 47 |
+
Args:
|
| 48 |
+
language: Language code ('en' or 'ja')
|
| 49 |
+
model_size: Model size ('md' or 'trf')
|
| 50 |
+
|
| 51 |
+
Returns:
|
| 52 |
+
SpaCy model name or None if not found
|
| 53 |
+
"""
|
| 54 |
+
return cls.SPACY_MODELS.get((language, model_size))
|
| 55 |
+
|
| 56 |
+
@classmethod
|
| 57 |
+
def get_supported_languages(cls) -> list[str]:
|
| 58 |
+
"""Get list of supported languages."""
|
| 59 |
+
return list(set(lang for lang, _ in cls.SPACY_MODELS.keys()))
|
| 60 |
+
|
| 61 |
+
@classmethod
|
| 62 |
+
def get_supported_model_sizes(cls) -> list[str]:
|
| 63 |
+
"""Get list of supported model sizes."""
|
| 64 |
+
return list(set(size for _, size in cls.SPACY_MODELS.keys()))
|
| 65 |
+
|
| 66 |
+
@classmethod
|
| 67 |
+
def load_reference_config(cls) -> Dict[str, Any]:
|
| 68 |
+
"""
|
| 69 |
+
Load reference lists configuration from YAML file.
|
| 70 |
+
|
| 71 |
+
Returns:
|
| 72 |
+
Configuration dictionary loaded from YAML
|
| 73 |
+
"""
|
| 74 |
+
config_path = Path(cls.REFERENCE_LISTS_CONFIG)
|
| 75 |
+
|
| 76 |
+
if not config_path.exists():
|
| 77 |
+
logger.warning(f"Reference config file not found: {config_path}")
|
| 78 |
+
return cls._get_default_config()
|
| 79 |
+
|
| 80 |
+
try:
|
| 81 |
+
with open(config_path, 'r', encoding='utf-8') as f:
|
| 82 |
+
config = yaml.safe_load(f)
|
| 83 |
+
if config is None:
|
| 84 |
+
logger.warning("Empty YAML configuration, using defaults")
|
| 85 |
+
return cls._get_default_config()
|
| 86 |
+
return config
|
| 87 |
+
except Exception as e:
|
| 88 |
+
logger.error(f"Error loading reference configuration: {e}")
|
| 89 |
+
return cls._get_default_config()
|
| 90 |
+
|
| 91 |
+
@classmethod
|
| 92 |
+
def get_corpus_configuration(cls, corpus_name: str) -> Dict[str, Any]:
|
| 93 |
+
"""
|
| 94 |
+
Get configuration for a specific corpus from YAML.
|
| 95 |
+
|
| 96 |
+
Args:
|
| 97 |
+
corpus_name: Name of the corpus to find
|
| 98 |
+
|
| 99 |
+
Returns:
|
| 100 |
+
Corpus configuration dictionary
|
| 101 |
+
"""
|
| 102 |
+
config = cls.load_reference_config()
|
| 103 |
+
|
| 104 |
+
# Search through all languages and ngram types
|
| 105 |
+
for lang_config in config.values():
|
| 106 |
+
if not isinstance(lang_config, dict):
|
| 107 |
+
continue
|
| 108 |
+
for ngram_type_config in lang_config.values():
|
| 109 |
+
if not isinstance(ngram_type_config, dict):
|
| 110 |
+
continue
|
| 111 |
+
if corpus_name in ngram_type_config:
|
| 112 |
+
return ngram_type_config[corpus_name]
|
| 113 |
+
|
| 114 |
+
logger.warning(f"Corpus configuration not found: {corpus_name}")
|
| 115 |
+
return {}
|
| 116 |
+
|
| 117 |
+
@classmethod
|
| 118 |
+
def get_corpus_columns(cls, corpus_name: str) -> Dict[str, int]:
|
| 119 |
+
"""
|
| 120 |
+
Get column mappings for a specific corpus.
|
| 121 |
+
|
| 122 |
+
Args:
|
| 123 |
+
corpus_name: Name of the corpus
|
| 124 |
+
|
| 125 |
+
Returns:
|
| 126 |
+
Dictionary mapping column names to indices
|
| 127 |
+
"""
|
| 128 |
+
corpus_config = cls.get_corpus_configuration(corpus_name)
|
| 129 |
+
return corpus_config.get('columns', {})
|
| 130 |
+
|
| 131 |
+
@classmethod
|
| 132 |
+
def is_japanese_corpus(cls, corpus_name: str) -> bool:
|
| 133 |
+
"""
|
| 134 |
+
Check if a corpus is marked as Japanese corpus.
|
| 135 |
+
|
| 136 |
+
Args:
|
| 137 |
+
corpus_name: Name of the corpus
|
| 138 |
+
|
| 139 |
+
Returns:
|
| 140 |
+
True if it's a Japanese corpus
|
| 141 |
+
"""
|
| 142 |
+
corpus_config = cls.get_corpus_configuration(corpus_name)
|
| 143 |
+
return corpus_config.get('japanese_corpus', False)
|
| 144 |
+
|
| 145 |
+
@classmethod
|
| 146 |
+
def _get_default_config(cls) -> Dict[str, Any]:
|
| 147 |
+
"""Get default configuration structure if YAML fails to load."""
|
| 148 |
+
return {
|
| 149 |
+
"english": {
|
| 150 |
+
"unigrams": {},
|
| 151 |
+
"bigrams": {},
|
| 152 |
+
"trigrams": {}
|
| 153 |
+
},
|
| 154 |
+
"japanese": {
|
| 155 |
+
"unigrams": {},
|
| 156 |
+
"bigrams": {},
|
| 157 |
+
"trigrams": {}
|
| 158 |
+
}
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
@classmethod
|
| 162 |
+
def validate_language_model_combination(cls, language: str, model_size: str) -> bool:
|
| 163 |
+
"""
|
| 164 |
+
Validate that a language/model combination is supported.
|
| 165 |
+
|
| 166 |
+
Args:
|
| 167 |
+
language: Language code
|
| 168 |
+
model_size: Model size
|
| 169 |
+
|
| 170 |
+
Returns:
|
| 171 |
+
True if combination is supported
|
| 172 |
+
"""
|
| 173 |
+
return (language, model_size) in cls.SPACY_MODELS
|
| 174 |
+
|
| 175 |
+
@classmethod
|
| 176 |
+
def get_processing_limits(cls) -> Dict[str, int]:
|
| 177 |
+
"""Get all processing limits as a dictionary."""
|
| 178 |
+
return {
|
| 179 |
+
'max_tokens_visualization': cls.MAX_TOKENS_FOR_VISUALIZATION,
|
| 180 |
+
'default_histogram_bins': cls.DEFAULT_HISTOGRAM_BINS,
|
| 181 |
+
'default_rank_bin_size': cls.DEFAULT_RANK_BIN_SIZE,
|
| 182 |
+
'max_ngram_sentence_length': cls.MAX_NGRAM_SENTENCE_LENGTH
|
| 183 |
+
}
|
text_analyzer/base_analyzer.py
ADDED
|
@@ -0,0 +1,308 @@
|
| 1 |
+
"""
|
| 2 |
+
Base analyzer module providing shared SpaCy infrastructure.
|
| 3 |
+
Eliminates code duplication and provides common functionality for all SpaCy-based analyzers.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import spacy
|
| 7 |
+
from typing import Dict, List, Any, Optional, Iterator, Tuple, TYPE_CHECKING
|
| 8 |
+
import logging
|
| 9 |
+
import tempfile
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from .app_config import AppConfig
|
| 12 |
+
from .text_utility import TextUtility
|
| 13 |
+
|
| 14 |
+
# Import UniDic extensions and enricher
|
| 15 |
+
try:
|
| 16 |
+
from . import unidic_extensions # This registers the token extensions
|
| 17 |
+
from .unidic_enricher import UniDicEnricher
|
| 18 |
+
UNIDIC_AVAILABLE = True
|
| 19 |
+
except ImportError as e:
|
| 20 |
+
logger.warning(f"UniDic integration not available: {e}")
|
| 21 |
+
UNIDIC_AVAILABLE = False
|
| 22 |
+
UniDicEnricher = None
|
| 23 |
+
|
| 24 |
+
if TYPE_CHECKING:
|
| 25 |
+
import spacy
|
| 26 |
+
|
| 27 |
+
logger = logging.getLogger(__name__)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class BaseAnalyzer:
|
| 31 |
+
"""
|
| 32 |
+
Base class for all SpaCy-based text analyzers.
|
| 33 |
+
Provides shared model loading, document processing, and utility functions.
|
| 34 |
+
"""
|
| 35 |
+
|
| 36 |
+
def __init__(self, language: str = None, model_size: str = None):
|
| 37 |
+
"""
|
| 38 |
+
Initialize the base analyzer.
|
| 39 |
+
|
| 40 |
+
Args:
|
| 41 |
+
language: Language code ('en' or 'ja')
|
| 42 |
+
model_size: Model size ('md' or 'trf')
|
| 43 |
+
"""
|
| 44 |
+
self.language = language or AppConfig.DEFAULT_LANGUAGE
|
| 45 |
+
self.model_size = model_size or AppConfig.DEFAULT_MODEL_SIZE
|
| 46 |
+
self.nlp = None
|
| 47 |
+
self._model_info = {}
|
| 48 |
+
self.unidic_enricher = None
|
| 49 |
+
|
| 50 |
+
self._load_spacy_model()
|
| 51 |
+
|
| 52 |
+
# Initialize UniDic enricher for Japanese
|
| 53 |
+
if self.language == 'ja' and UNIDIC_AVAILABLE:
|
| 54 |
+
try:
|
| 55 |
+
self.unidic_enricher = UniDicEnricher()
|
| 56 |
+
logger.info("UniDic enricher initialized for Japanese analysis")
|
| 57 |
+
except Exception as e:
|
| 58 |
+
logger.warning(f"Failed to initialize UniDic enricher: {e}")
|
| 59 |
+
self.unidic_enricher = None
|
| 60 |
+
|
| 61 |
+
def _load_spacy_model(self) -> None:
|
| 62 |
+
"""Load appropriate SpaCy model based on language and size."""
|
| 63 |
+
# Validate combination
|
| 64 |
+
if not AppConfig.validate_language_model_combination(self.language, self.model_size):
|
| 65 |
+
raise ValueError(f"Unsupported language/model combination: {self.language}/{self.model_size}")
|
| 66 |
+
|
| 67 |
+
model_name = AppConfig.get_spacy_model_name(self.language, self.model_size)
|
| 68 |
+
if not model_name:
|
| 69 |
+
raise ValueError(f"No model found for language '{self.language}' and size '{self.model_size}'")
|
| 70 |
+
|
| 71 |
+
try:
|
| 72 |
+
self.nlp = spacy.load(model_name)
|
| 73 |
+
self._model_info = {
|
| 74 |
+
'name': model_name,
|
| 75 |
+
'language': self.language,
|
| 76 |
+
'model_size': self.model_size,
|
| 77 |
+
'version': spacy.__version__
|
| 78 |
+
}
|
| 79 |
+
logger.info(f"Loaded SpaCy model: {model_name}")
|
| 80 |
+
except OSError as e:
|
| 81 |
+
error_msg = f"SpaCy model {model_name} not found. Please install it first."
|
| 82 |
+
logger.error(error_msg)
|
| 83 |
+
raise OSError(error_msg) from e
|
| 84 |
+
|
| 85 |
+
def get_model_info(self) -> Dict[str, str]:
|
| 86 |
+
"""
|
| 87 |
+
Get information about the loaded model.
|
| 88 |
+
|
| 89 |
+
Returns:
|
| 90 |
+
Dictionary with model information
|
| 91 |
+
"""
|
| 92 |
+
return self._model_info.copy()
|
| 93 |
+
|
| 94 |
+
def process_document(self, text: str) -> "spacy.Doc":
|
| 95 |
+
"""
|
| 96 |
+
Process text into a SpaCy document.
|
| 97 |
+
|
| 98 |
+
Args:
|
| 99 |
+
text: Input text to process
|
| 100 |
+
|
| 101 |
+
Returns:
|
| 102 |
+
Processed SpaCy document
|
| 103 |
+
|
| 104 |
+
Raises:
|
| 105 |
+
ValueError: If model not loaded or text processing fails
|
| 106 |
+
"""
|
| 107 |
+
if not self.nlp:
|
| 108 |
+
raise ValueError("SpaCy model not loaded")
|
| 109 |
+
|
| 110 |
+
if not text or not text.strip():
|
| 111 |
+
raise ValueError("Empty text provided")
|
| 112 |
+
|
| 113 |
+
try:
|
| 114 |
+
# Clean text before processing
|
| 115 |
+
cleaned_text = TextUtility.clean_text_input(text)
|
| 116 |
+
|
| 117 |
+
# Process with SpaCy
|
| 118 |
+
doc = self.nlp(cleaned_text)
|
| 119 |
+
|
| 120 |
+
# Add UniDic enrichment for Japanese
|
| 121 |
+
if self.unidic_enricher and self.language == 'ja':
|
| 122 |
+
try:
|
| 123 |
+
self.unidic_enricher.enrich_spacy_doc(doc, cleaned_text)
|
| 124 |
+
logger.debug("UniDic enrichment completed")
|
| 125 |
+
except Exception as e:
|
| 126 |
+
logger.warning(f"UniDic enrichment failed: {e}")
|
| 127 |
+
|
| 128 |
+
return doc
|
| 129 |
+
|
| 130 |
+
except Exception as e:
|
| 131 |
+
self.handle_processing_error(e, f"processing text of length {len(text)}")
|
| 132 |
+
raise
|
| 133 |
+
|
| 134 |
+
def handle_processing_error(self, error: Exception, context: str) -> None:
|
| 135 |
+
"""
|
| 136 |
+
Handle processing errors with appropriate logging.
|
| 137 |
+
|
| 138 |
+
Args:
|
| 139 |
+
error: The exception that occurred
|
| 140 |
+
context: Context description for the error
|
| 141 |
+
"""
|
| 142 |
+
error_msg = f"Error {context}: {error}"
|
| 143 |
+
logger.error(error_msg)
|
| 144 |
+
|
| 145 |
+
def filter_tokens(self,
|
| 146 |
+
doc: "spacy.Doc",
|
| 147 |
+
exclude_punct: bool = True,
|
| 148 |
+
exclude_space: bool = True,
|
| 149 |
+
word_type_filter: Optional[str] = None) -> List["spacy.Token"]:
|
| 150 |
+
"""
|
| 151 |
+
Filter tokens based on various criteria.
|
| 152 |
+
|
| 153 |
+
Args:
|
| 154 |
+
doc: SpaCy document
|
| 155 |
+
exclude_punct: Whether to exclude punctuation
|
| 156 |
+
exclude_space: Whether to exclude spaces
|
| 157 |
+
word_type_filter: Filter by word type ('CW', 'FW', or None)
|
| 158 |
+
|
| 159 |
+
Returns:
|
| 160 |
+
List of filtered tokens
|
| 161 |
+
"""
|
| 162 |
+
filtered_tokens = []
|
| 163 |
+
|
| 164 |
+
for token in doc:
|
| 165 |
+
# Basic filtering
|
| 166 |
+
if exclude_space and token.is_space:
|
| 167 |
+
continue
|
| 168 |
+
if exclude_punct and token.is_punct:
|
| 169 |
+
continue
|
| 170 |
+
|
| 171 |
+
# Word type filtering
|
| 172 |
+
if word_type_filter:
|
| 173 |
+
word_type = self._classify_pos(token)
|
| 174 |
+
if word_type != word_type_filter:
|
| 175 |
+
continue
|
| 176 |
+
|
| 177 |
+
filtered_tokens.append(token)
|
| 178 |
+
|
| 179 |
+
return filtered_tokens
|
| 180 |
+
|
| 181 |
+
def _classify_pos(self, token: "spacy.Token") -> str:
|
| 182 |
+
"""
|
| 183 |
+
Classify token as content word (CW) or function word (FW).
|
| 184 |
+
|
| 185 |
+
Args:
|
| 186 |
+
token: SpaCy token object
|
| 187 |
+
|
| 188 |
+
Returns:
|
| 189 |
+
'CW' for content words, 'FW' for function words
|
| 190 |
+
"""
|
| 191 |
+
content_pos = {'NOUN', 'VERB', 'ADJ', 'ADV'}
|
| 192 |
+
function_pos = {'DET', 'PRON', 'ADP', 'CONJ', 'CCONJ', 'SCONJ'}
|
| 193 |
+
|
| 194 |
+
if token.pos_ in content_pos:
|
| 195 |
+
return 'CW'
|
| 196 |
+
elif token.pos_ in function_pos:
|
| 197 |
+
return 'FW'
|
| 198 |
+
else:
|
| 199 |
+
# Default classification for ambiguous cases
|
| 200 |
+
return 'CW' if token.pos_ not in {'PUNCT', 'SPACE', 'X'} else 'FW'
|
| 201 |
+
|
| 202 |
+
def format_token_for_display(self, token: "spacy.Token", include_syntax: bool = True) -> Dict[str, Any]:
|
| 203 |
+
"""
|
| 204 |
+
Format token for UI display - only call when needed for output.
|
| 205 |
+
|
| 206 |
+
Args:
|
| 207 |
+
token: SpaCy token
|
| 208 |
+
include_syntax: Whether to include syntactic information (dep_, head, etc.)
|
| 209 |
+
|
| 210 |
+
Returns:
|
| 211 |
+
Formatted token data dictionary for display
|
| 212 |
+
"""
|
| 213 |
+
result = {
|
| 214 |
+
'token': token.text,
|
| 215 |
+
'lemma': token.lemma_,
|
| 216 |
+
'pos': token.pos_,
|
| 217 |
+
'tag': token.tag_,
|
| 218 |
+
'word_type': self._classify_pos(token)
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
if include_syntax:
|
| 222 |
+
result.update({
|
| 223 |
+
'dep_': token.dep_,
|
| 224 |
+
'head_text': token.head.text,
|
| 225 |
+
'head_pos': token.head.pos_,
|
| 226 |
+
})
|
| 227 |
+
|
| 228 |
+
return result
|
| 229 |
+
|
| 230 |
+
def get_syntactic_context(self, token: "spacy.Token") -> Dict[str, Any]:
|
| 231 |
+
"""
|
| 232 |
+
Get comprehensive syntactic relationships for a token.
|
| 233 |
+
|
| 234 |
+
Args:
|
| 235 |
+
token: SpaCy token
|
| 236 |
+
|
| 237 |
+
Returns:
|
| 238 |
+
Dictionary with syntactic context information
|
| 239 |
+
"""
|
| 240 |
+
return {
|
| 241 |
+
'dep_': token.dep_,
|
| 242 |
+
'head': token.head,
|
| 243 |
+
'children': list(token.children),
|
| 244 |
+
'ancestors': list(token.ancestors),
|
| 245 |
+
'subtree_span': token.subtree,
|
| 246 |
+
'left_edge': token.left_edge,
|
| 247 |
+
'right_edge': token.right_edge
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
def process_sentences(self,
|
| 251 |
+
doc: "spacy.Doc",
|
| 252 |
+
max_tokens: Optional[int] = None) -> List["spacy.Span"]:
|
| 253 |
+
"""
|
| 254 |
+
Process sentences with optional token limits.
|
| 255 |
+
|
| 256 |
+
Args:
|
| 257 |
+
doc: SpaCy document
|
| 258 |
+
max_tokens: Maximum tokens per sentence (uses config default if None)
|
| 259 |
+
|
| 260 |
+
Returns:
|
| 261 |
+
List of sentence spans
|
| 262 |
+
"""
|
| 263 |
+
max_tokens = max_tokens or AppConfig.MAX_TOKENS_FOR_VISUALIZATION
|
| 264 |
+
|
| 265 |
+
processed_sentences = []
|
| 266 |
+
for sent in doc.sents:
|
| 267 |
+
# Filter tokens (exclude spaces for counting)
|
| 268 |
+
sent_tokens = [token for token in sent if not token.is_space]
|
| 269 |
+
|
| 270 |
+
if len(sent_tokens) > max_tokens:
|
| 271 |
+
# Truncate sentence
|
| 272 |
+
truncated_tokens = sent_tokens[:max_tokens]
|
| 273 |
+
# Create new span with truncated tokens
|
| 274 |
+
start_idx = truncated_tokens[0].i
|
| 275 |
+
end_idx = truncated_tokens[-1].i + 1
|
| 276 |
+
truncated_span = doc[start_idx:end_idx]
|
| 277 |
+
processed_sentences.append(truncated_span)
|
| 278 |
+
else:
|
| 279 |
+
processed_sentences.append(sent)
|
| 280 |
+
|
| 281 |
+
return processed_sentences
|
| 282 |
+
|
| 283 |
+
def setup_batch_processing(self, file_paths: List[str]) -> Iterator[Tuple[str, str]]:
|
| 284 |
+
"""
|
| 285 |
+
Set up batch processing for multiple files.
|
| 286 |
+
|
| 287 |
+
Args:
|
| 288 |
+
file_paths: List of file paths to process
|
| 289 |
+
|
| 290 |
+
Yields:
|
| 291 |
+
Tuples of (file_path, text_content)
|
| 292 |
+
"""
|
| 293 |
+
for file_path in file_paths:
|
| 294 |
+
try:
|
| 295 |
+
text_content = TextUtility.extract_text_from_file(file_path)
|
| 296 |
+
yield file_path, text_content
|
| 297 |
+
except Exception as e:
|
| 298 |
+
logger.error(f"Error processing file {file_path}: {e}")
|
| 299 |
+
yield file_path, f"ERROR: {e}"
|
| 300 |
+
|
| 301 |
+
def cleanup_batch_processing(self, temp_files: List[str]) -> None:
|
| 302 |
+
"""
|
| 303 |
+
Clean up temporary files from batch processing.
|
| 304 |
+
|
| 305 |
+
Args:
|
| 306 |
+
temp_files: List of temporary file paths
|
| 307 |
+
"""
|
| 308 |
+
TextUtility.cleanup_temp_files(temp_files)
|
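A short usage sketch for the BaseAnalyzer added above (assumptions: the text_analyzer package is importable, the configured spaCy model such as ja_core_news_md is installed, the text_utility helpers behave as their names suggest, and the sample sentence is illustrative):

    from text_analyzer.base_analyzer import BaseAnalyzer

    analyzer = BaseAnalyzer(language='ja', model_size='md')
    doc = analyzer.process_document('猫が静かに歩いた。')
    content_words = analyzer.filter_tokens(doc, word_type_filter='CW')
    print(analyzer.get_model_info())
    print([t.text for t in content_words])

Subclasses such as the lexical sophistication analyzer changed later in this commit reuse this model loading, document processing, and token filtering instead of re-implementing it.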
text_analyzer/frequency_analyzer.py
ADDED
|
@@ -0,0 +1,653 @@
| 1 |
+
"""
|
| 2 |
+
Frequency Analysis Module for Word Frequency Visualization
|
| 3 |
+
|
| 4 |
+
This module provides functionality to analyze word frequency data from various file formats,
|
| 5 |
+
create histogram data, and sample representative words for each frequency bin.
|
| 6 |
+
Supports flexible column mapping for diverse frequency data formats.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import numpy as np
|
| 11 |
+
from typing import Any, Dict, List, Tuple, Optional, Union
|
| 12 |
+
import logging
|
| 13 |
+
import random
|
| 14 |
+
from io import StringIO
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class FrequencyAnalyzer:
|
| 20 |
+
"""
|
| 21 |
+
Analyzes word frequency data and provides visualization-ready outputs.
|
| 22 |
+
|
| 23 |
+
Supports flexible column mapping for various frequency data formats.
|
| 24 |
+
Can handle both traditional 'Type'/'Freq' format and modern multi-column formats.
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
# Default column names to try for auto-detection
|
| 28 |
+
DEFAULT_WORD_COLUMNS = ['lForm', 'lemma', 'word', 'Type', 'surface_form']
|
| 29 |
+
DEFAULT_FREQUENCY_COLUMNS = ['frequency', 'freq', 'Freq', 'pmw', 'NormFreq']
|
| 30 |
+
DEFAULT_POS_COLUMNS = ['pos', 'POS', 'tag']
|
| 31 |
+
|
| 32 |
+
def __init__(self, file_size_limit_mb: int = 300):
|
| 33 |
+
"""
|
| 34 |
+
Initialize the frequency analyzer.
|
| 35 |
+
|
| 36 |
+
Args:
|
| 37 |
+
file_size_limit_mb: Maximum file size limit in MB for uploads
|
| 38 |
+
"""
|
| 39 |
+
self.data = None
|
| 40 |
+
self.original_data = None
|
| 41 |
+
self.column_config = None
|
| 42 |
+
self.file_size_limit = file_size_limit_mb * 1024 * 1024
|
| 43 |
+
self.detected_columns = None
|
| 44 |
+
|
| 45 |
+
def detect_file_format(self, content: Union[str, bytes]) -> Dict[str, Any]:
|
| 46 |
+
"""
|
| 47 |
+
Detect file format and separator.
|
| 48 |
+
|
| 49 |
+
Args:
|
| 50 |
+
content: File content as string or bytes
|
| 51 |
+
|
| 52 |
+
Returns:
|
| 53 |
+
Dict with format information
|
| 54 |
+
"""
|
| 55 |
+
if isinstance(content, bytes):
|
| 56 |
+
content = content.decode('utf-8')
|
| 57 |
+
|
| 58 |
+
# Check file size
|
| 59 |
+
if len(content.encode('utf-8')) > self.file_size_limit:
|
| 60 |
+
raise ValueError(f"File too large. Maximum size is {self.file_size_limit // (1024*1024)}MB")
|
| 61 |
+
|
| 62 |
+
# Detect separator by checking first few lines
|
| 63 |
+
lines = content.strip().split('\n')[:5]
|
| 64 |
+
separators = ['\t', ',', ';', '|']
|
| 65 |
+
best_sep = '\t'
|
| 66 |
+
max_columns = 0
|
| 67 |
+
|
| 68 |
+
for sep in separators:
|
| 69 |
+
avg_cols = np.mean([len(line.split(sep)) for line in lines])
|
| 70 |
+
if avg_cols > max_columns:
|
| 71 |
+
max_columns = avg_cols
|
| 72 |
+
best_sep = sep
|
| 73 |
+
|
| 74 |
+
# Detect if first row is header
|
| 75 |
+
first_line = lines[0].split(best_sep)
|
| 76 |
+
second_line = lines[1].split(best_sep) if len(lines) > 1 else []
|
| 77 |
+
|
| 78 |
+
# Simple heuristic: if first row contains mostly strings and second row has numbers
|
| 79 |
+
has_header = True
|
| 80 |
+
if len(second_line) > 0:
|
| 81 |
+
try:
|
| 82 |
+
# Try to convert second row elements to numbers
|
| 83 |
+
numeric_count = sum(1 for x in second_line if self._is_numeric(x.strip()))
|
| 84 |
+
if numeric_count > len(second_line) * 0.3: # If >30% are numeric
|
| 85 |
+
has_header = True
|
| 86 |
+
except Exception:
|
| 87 |
+
has_header = False
|
| 88 |
+
|
| 89 |
+
return {
|
| 90 |
+
'separator': best_sep,
|
| 91 |
+
'has_header': has_header,
|
| 92 |
+
'estimated_columns': int(max_columns),
|
| 93 |
+
'sample_lines': lines[:3]
|
| 94 |
+
}
|
| 95 |
+
|
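    # Illustrative note (hypothetical sample, not part of the original file): for a
    # small tab-separated file whose second row is mostly numeric, detect_file_format()
    # returns a dict shaped like
    #   {'separator': '\t', 'has_header': True, 'estimated_columns': 3, 'sample_lines': [...]}
    # which load_frequency_data() falls back to when no 'separator' is supplied in
    # the column configuration.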
| 96 |
+
def _is_numeric(self, value: str) -> bool:
|
| 97 |
+
"""Check if a string value is numeric."""
|
| 98 |
+
try:
|
| 99 |
+
float(value)
|
| 100 |
+
return True
|
| 101 |
+
except (ValueError, TypeError):
|
| 102 |
+
return False
|
| 103 |
+
|
| 104 |
+
def detect_columns(self, df: pd.DataFrame) -> Dict[str, List[str]]:
|
| 105 |
+
"""
|
| 106 |
+
Detect and categorize columns by data type and content.
|
| 107 |
+
|
| 108 |
+
Args:
|
| 109 |
+
df: DataFrame to analyze
|
| 110 |
+
|
| 111 |
+
Returns:
|
| 112 |
+
Dict with categorized column lists
|
| 113 |
+
"""
|
| 114 |
+
word_candidates = []
|
| 115 |
+
frequency_candidates = []
|
| 116 |
+
pos_candidates = []
|
| 117 |
+
other_columns = []
|
| 118 |
+
|
| 119 |
+
for col in df.columns:
|
| 120 |
+
col_str = str(col).lower()
|
| 121 |
+
|
| 122 |
+
# Check if column contains string data (potential word column)
|
| 123 |
+
if df[col].dtype == 'object':
|
| 124 |
+
# Check if it looks like words (not mostly numbers)
|
| 125 |
+
sample_values = df[col].dropna().head(100)
|
| 126 |
+
if len(sample_values) > 0:
|
| 127 |
+
non_numeric_ratio = sum(1 for x in sample_values if not self._is_numeric(str(x))) / len(sample_values)
|
| 128 |
+
if non_numeric_ratio > 0.8: # >80% non-numeric
|
| 129 |
+
if any(word in col_str for word in ['form', 'lemma', 'word', 'type']):
|
| 130 |
+
word_candidates.append(col)
|
| 131 |
+
elif any(pos in col_str for pos in ['pos', 'tag', 'part']):
|
| 132 |
+
pos_candidates.append(col)
|
| 133 |
+
else:
|
| 134 |
+
word_candidates.append(col) # Default string columns to word candidates
|
| 135 |
+
|
| 136 |
+
# Check if column contains numeric data (potential frequency column)
|
| 137 |
+
elif pd.api.types.is_numeric_dtype(df[col]):
|
| 138 |
+
# Skip rank columns (usually sequential integers starting from 1)
|
| 139 |
+
if col_str in ['rank', 'index'] or (df[col].equals(pd.Series(range(1, len(df) + 1)))):
|
| 140 |
+
other_columns.append(col)
|
| 141 |
+
else:
|
| 142 |
+
frequency_candidates.append(col)
|
| 143 |
+
|
| 144 |
+
else:
|
| 145 |
+
other_columns.append(col)
|
| 146 |
+
|
| 147 |
+
# Sort candidates by preference based on common naming patterns
|
| 148 |
+
word_candidates = self._sort_by_preference(word_candidates, self.DEFAULT_WORD_COLUMNS)
|
| 149 |
+
frequency_candidates = self._sort_by_preference(frequency_candidates, self.DEFAULT_FREQUENCY_COLUMNS)
|
| 150 |
+
pos_candidates = self._sort_by_preference(pos_candidates, self.DEFAULT_POS_COLUMNS)
|
| 151 |
+
|
| 152 |
+
return {
|
| 153 |
+
'word_columns': word_candidates,
|
| 154 |
+
'frequency_columns': frequency_candidates,
|
| 155 |
+
'pos_columns': pos_candidates,
|
| 156 |
+
'other_columns': other_columns
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
def _sort_by_preference(self, columns: List[str], preferred_order: List[str]) -> List[str]:
|
| 160 |
+
"""Sort columns by preference order."""
|
| 161 |
+
sorted_cols = []
|
| 162 |
+
remaining_cols = columns.copy()
|
| 163 |
+
|
| 164 |
+
# Add preferred columns first
|
| 165 |
+
for pref in preferred_order:
|
| 166 |
+
for col in columns:
|
| 167 |
+
if pref.lower() in str(col).lower() and col in remaining_cols:
|
| 168 |
+
sorted_cols.append(col)
|
| 169 |
+
remaining_cols.remove(col)
|
| 170 |
+
break
|
| 171 |
+
|
| 172 |
+
# Add remaining columns
|
| 173 |
+
sorted_cols.extend(remaining_cols)
|
| 174 |
+
return sorted_cols
|
| 175 |
+
|
| 176 |
+
def load_frequency_data(self, content: Union[str, bytes], column_config: Dict[str, str]) -> pd.DataFrame:
|
| 177 |
+
"""
|
| 178 |
+
Load and validate frequency data with flexible column mapping.
|
| 179 |
+
|
| 180 |
+
Args:
|
| 181 |
+
content: File content as string or bytes
|
| 182 |
+
column_config: Column mapping configuration
|
| 183 |
+
{
|
| 184 |
+
'word_column': 'lForm',
|
| 185 |
+
'frequency_column': 'frequency',
|
| 186 |
+
'pos_column': 'pos', # optional
|
| 187 |
+
'separator': '\t' # optional, will auto-detect if not provided
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
Returns:
|
| 191 |
+
pd.DataFrame: Loaded and validated frequency data
|
| 192 |
+
|
| 193 |
+
Raises:
|
| 194 |
+
ValueError: If data format is invalid or columns not found
|
| 195 |
+
"""
|
| 196 |
+
try:
|
| 197 |
+
# Handle both string and bytes input
|
| 198 |
+
if isinstance(content, bytes):
|
| 199 |
+
content = content.decode('utf-8')
|
| 200 |
+
|
| 201 |
+
# Auto-detect format if separator not provided
|
| 202 |
+
if 'separator' not in column_config:
|
| 203 |
+
format_info = self.detect_file_format(content)
|
| 204 |
+
separator = format_info['separator']
|
| 205 |
+
has_header = format_info['has_header']
|
| 206 |
+
else:
|
| 207 |
+
separator = column_config['separator']
|
| 208 |
+
has_header = column_config.get('has_header', True)
|
| 209 |
+
|
| 210 |
+
# Read data
|
| 211 |
+
df = pd.read_csv(StringIO(content), sep=separator, header=0 if has_header else None)
|
| 212 |
+
|
| 213 |
+
# Store column configuration
|
| 214 |
+
self.column_config = column_config.copy()
|
| 215 |
+
self.column_config['separator'] = separator
|
| 216 |
+
self.column_config['has_header'] = has_header
|
| 217 |
+
|
| 218 |
+
# Detect available columns
|
| 219 |
+
self.detected_columns = self.detect_columns(df)
|
| 220 |
+
|
| 221 |
+
# Validate column configuration
|
| 222 |
+
if not self.validate_column_config(df, column_config):
|
| 223 |
+
raise ValueError("Invalid column configuration")
|
| 224 |
+
|
| 225 |
+
# Clean and prepare data with flexible column mapping
|
| 226 |
+
df = self._clean_data_flexible(df, column_config)
|
| 227 |
+
|
| 228 |
+
# Store data
|
| 229 |
+
self.original_data = df.copy()
|
| 230 |
+
self.data = df
|
| 231 |
+
|
| 232 |
+
logger.info(f"Loaded {len(df)} frequency entries with columns: {list(df.columns)}")
|
| 233 |
+
return df
|
| 234 |
+
|
| 235 |
+
except Exception as e:
|
| 236 |
+
logger.error(f"Error loading frequency data: {str(e)}")
|
| 237 |
+
raise ValueError(f"Failed to load frequency data: {str(e)}")
|
| 238 |
+
|
| 239 |
+
def validate_column_config(self, df: pd.DataFrame, column_config: Dict[str, str]) -> bool:
|
| 240 |
+
"""
|
| 241 |
+
Validate that the specified columns exist and contain appropriate data.
|
| 242 |
+
|
| 243 |
+
Args:
|
| 244 |
+
df: DataFrame to validate
|
| 245 |
+
column_config: Column configuration
|
| 246 |
+
|
| 247 |
+
Returns:
|
| 248 |
+
bool: True if configuration is valid
|
| 249 |
+
"""
|
| 250 |
+
# Check required columns exist
|
| 251 |
+
word_col = column_config.get('word_column')
|
| 252 |
+
freq_col = column_config.get('frequency_column')
|
| 253 |
+
|
| 254 |
+
if not word_col or word_col not in df.columns:
|
| 255 |
+
logger.error(f"Word column '{word_col}' not found in data")
|
| 256 |
+
return False
|
| 257 |
+
|
| 258 |
+
if not freq_col or freq_col not in df.columns:
|
| 259 |
+
logger.error(f"Frequency column '{freq_col}' not found in data")
|
| 260 |
+
return False
|
| 261 |
+
|
| 262 |
+
# Check that word column contains string data
|
| 263 |
+
if df[word_col].dtype != 'object':
|
| 264 |
+
logger.error(f"Word column '{word_col}' must contain text data")
|
| 265 |
+
return False
|
| 266 |
+
|
| 267 |
+
# Check that frequency column contains numeric data
|
| 268 |
+
if not pd.api.types.is_numeric_dtype(df[freq_col]):
|
| 269 |
+
logger.error(f"Frequency column '{freq_col}' must contain numeric data")
|
| 270 |
+
return False
|
| 271 |
+
|
| 272 |
+
# Check optional POS column if specified
|
| 273 |
+
pos_col = column_config.get('pos_column')
|
| 274 |
+
if pos_col and pos_col not in df.columns:
|
| 275 |
+
logger.warning(f"POS column '{pos_col}' not found in data, skipping")
|
| 276 |
+
|
| 277 |
+
return True
|
| 278 |
+
|
| 279 |
+
def _clean_data_flexible(self, df: pd.DataFrame, column_config: Dict[str, str]) -> pd.DataFrame:
|
| 280 |
+
"""
|
| 281 |
+
Clean and prepare the frequency data with flexible column mapping.
|
| 282 |
+
|
| 283 |
+
Args:
|
| 284 |
+
df: Raw DataFrame
|
| 285 |
+
column_config: Column configuration
|
| 286 |
+
|
| 287 |
+
Returns:
|
| 288 |
+
pd.DataFrame: Cleaned DataFrame with standardized column names
|
| 289 |
+
"""
|
| 290 |
+
word_col = column_config['word_column']
|
| 291 |
+
freq_col = column_config['frequency_column']
|
| 292 |
+
pos_col = column_config.get('pos_column')
|
| 293 |
+
|
| 294 |
+
# Create a copy and rename columns to standard names for compatibility
|
| 295 |
+
df_clean = df.copy()
|
| 296 |
+
|
| 297 |
+
# Remove rows with missing word or frequency data
|
| 298 |
+
df_clean = df_clean.dropna(subset=[word_col, freq_col])
|
| 299 |
+
|
| 300 |
+
# Ensure frequency is numeric
|
| 301 |
+
df_clean[freq_col] = pd.to_numeric(df_clean[freq_col], errors='coerce')
|
| 302 |
+
df_clean = df_clean.dropna(subset=[freq_col])
|
| 303 |
+
|
| 304 |
+
# Remove zero or negative frequencies
|
| 305 |
+
df_clean = df_clean[df_clean[freq_col] > 0]
|
| 306 |
+
|
| 307 |
+
# Clean word column (remove extra whitespace)
|
| 308 |
+
df_clean[word_col] = df_clean[word_col].astype(str).str.strip()
|
| 309 |
+
|
| 310 |
+
# Add standardized column names for backward compatibility
|
| 311 |
+
df_clean['Type'] = df_clean[word_col]
|
| 312 |
+
df_clean['Freq'] = df_clean[freq_col]
|
| 313 |
+
|
| 314 |
+
# Add POS column if available
|
| 315 |
+
if pos_col and pos_col in df_clean.columns:
|
| 316 |
+
df_clean['POS'] = df_clean[pos_col]
|
| 317 |
+
|
| 318 |
+
# Sort by frequency (descending) for better analysis
|
| 319 |
+
df_clean = df_clean.sort_values(freq_col, ascending=False).reset_index(drop=True)
|
| 320 |
+
|
| 321 |
+
return df_clean
|
| 322 |
+
|
| 323 |
+
def get_available_frequency_columns(self) -> List[str]:
|
| 324 |
+
"""
|
| 325 |
+
Get list of available frequency columns for analysis.
|
| 326 |
+
|
| 327 |
+
Returns:
|
| 328 |
+
List[str]: Available frequency columns from the detected columns
|
| 329 |
+
"""
|
| 330 |
+
if self.detected_columns is None:
|
| 331 |
+
return []
|
| 332 |
+
|
| 333 |
+
return self.detected_columns.get('frequency_columns', [])
|
| 334 |
+
|
| 335 |
+
def get_available_word_columns(self) -> List[str]:
|
| 336 |
+
"""
|
| 337 |
+
Get list of available word columns.
|
| 338 |
+
|
| 339 |
+
Returns:
|
| 340 |
+
List[str]: Available word columns from the detected columns
|
| 341 |
+
"""
|
| 342 |
+
if self.detected_columns is None:
|
| 343 |
+
return []
|
| 344 |
+
|
| 345 |
+
return self.detected_columns.get('word_columns', [])
|
| 346 |
+
|
| 347 |
+
def create_multi_frequency_analysis(self, frequency_columns: List[str], bin_size: int = 500, log_transform: bool = False) -> Dict[str, Dict]:
|
| 348 |
+
"""
|
| 349 |
+
Create rank-based analysis for multiple frequency columns.
|
| 350 |
+
|
| 351 |
+
Args:
|
| 352 |
+
frequency_columns: List of frequency column names to analyze
|
| 353 |
+
bin_size: Number of words per rank group
|
| 354 |
+
log_transform: Whether to apply log10 transformation
|
| 355 |
+
|
| 356 |
+
Returns:
|
| 357 |
+
Dict mapping column names to their analysis results
|
| 358 |
+
"""
|
| 359 |
+
if self.original_data is None:
|
| 360 |
+
raise ValueError("No data loaded")
|
| 361 |
+
|
| 362 |
+
results = {}
|
| 363 |
+
|
| 364 |
+
for freq_col in frequency_columns:
|
| 365 |
+
if freq_col not in self.original_data.columns:
|
| 366 |
+
logger.warning(f"Frequency column '{freq_col}' not found, skipping")
|
| 367 |
+
continue
|
| 368 |
+
|
| 369 |
+
try:
|
| 370 |
+
# Create analysis for this frequency column
|
| 371 |
+
analysis = self.create_rank_based_visualization_flexible(
|
| 372 |
+
column=freq_col,
|
| 373 |
+
bin_size=bin_size,
|
| 374 |
+
log_transform=log_transform
|
| 375 |
+
)
|
| 376 |
+
results[freq_col] = analysis
|
| 377 |
+
except Exception as e:
|
| 378 |
+
logger.error(f"Error analyzing column '{freq_col}': {e}")
|
| 379 |
+
continue
|
| 380 |
+
|
| 381 |
+
return results
|
| 382 |
+
|
| 383 |
+
def create_rank_based_visualization_flexible(self, column: str, bin_size: int = 500, log_transform: bool = False, max_words_to_retain: Optional[int] = None) -> Dict:
|
| 384 |
+
"""
|
| 385 |
+
Create rank-based visualization with flexible column support.
|
| 386 |
+
|
| 387 |
+
Args:
|
| 388 |
+
column: Column name to analyze (can be any numeric column)
|
| 389 |
+
bin_size: Number of words per rank group
|
| 390 |
+
log_transform: Whether to apply log10 transformation
|
| 391 |
+
max_words_to_retain: Maximum number of top frequent words to retain for analysis
|
| 392 |
+
|
| 393 |
+
Returns:
|
| 394 |
+
Dict: Rank-based visualization data
|
| 395 |
+
"""
|
| 396 |
+
if self.original_data is None:
|
| 397 |
+
raise ValueError("No data loaded")
|
| 398 |
+
|
| 399 |
+
if column not in self.original_data.columns:
|
| 400 |
+
raise ValueError(f"Column '{column}' not found in data")
|
| 401 |
+
|
| 402 |
+
# Get word column from config or use default
|
| 403 |
+
word_col = self.column_config.get('word_column', 'Type') if self.column_config else 'Type'
|
| 404 |
+
if word_col not in self.original_data.columns:
|
| 405 |
+
word_col = 'Type' # Fallback to standardized column
|
| 406 |
+
|
| 407 |
+
# Sort by the specified frequency column (descending)
|
| 408 |
+
sorted_data = self.original_data.sort_values(column, ascending=False).reset_index(drop=True)
|
| 409 |
+
|
| 410 |
+
# Apply word limit if specified
|
| 411 |
+
if max_words_to_retain and max_words_to_retain < len(sorted_data):
|
| 412 |
+
sorted_data = sorted_data.head(max_words_to_retain)
|
| 413 |
+
logger.info(f"Limited analysis to top {max_words_to_retain} words")
|
| 414 |
+
|
| 415 |
+
# Create bins by slicing exactly bin_size words
|
| 416 |
+
group_labels = []
|
| 417 |
+
group_centers = []
|
| 418 |
+
avg_frequencies = []
|
| 419 |
+
sample_words = {}
|
| 420 |
+
group_stats_list = []
|
| 421 |
+
|
| 422 |
+
# Limit to top 20 bins for better UI performance
|
| 423 |
+
max_display_bins = 20
|
| 424 |
+
|
| 425 |
+
for i in range(0, len(sorted_data), bin_size):
|
| 426 |
+
if len(group_labels) >= max_display_bins:
|
| 427 |
+
break
|
| 428 |
+
|
| 429 |
+
end_idx = min(i + bin_size, len(sorted_data))
|
| 430 |
+
bin_data = sorted_data[i:end_idx]
|
| 431 |
+
|
| 432 |
+
# Calculate group boundaries
|
| 433 |
+
start_rank = i + 1
|
| 434 |
+
end_rank = end_idx
|
| 435 |
+
group_label = f"{start_rank}-{end_rank}"
|
| 436 |
+
group_labels.append(group_label)
|
| 437 |
+
group_centers.append((start_rank + end_rank) / 2)
|
| 438 |
+
|
| 439 |
+
# Calculate average frequency
|
| 440 |
+
avg_freq = bin_data[column].mean()
|
| 441 |
+
if log_transform:
|
| 442 |
+
avg_freq = np.log10(avg_freq + 1e-10)
|
| 443 |
+
avg_frequencies.append(avg_freq)
|
| 444 |
+
|
| 445 |
+
# Get sample words (5 randomly sampled from this bin)
|
| 446 |
+
n_samples = min(5, len(bin_data))
|
| 447 |
+
if n_samples > 0:
|
| 448 |
+
if n_samples == len(bin_data):
|
| 449 |
+
# If fewer than 5 words, take all
|
| 450 |
+
sample_word_list = bin_data[word_col].tolist()
|
| 451 |
+
else:
|
| 452 |
+
# Randomly sample 5 words
|
| 453 |
+
sample_indices = random.sample(range(len(bin_data)), n_samples)
|
| 454 |
+
sample_word_list = [bin_data.iloc[idx][word_col] for idx in sample_indices]
|
| 455 |
+
else:
|
| 456 |
+
sample_word_list = []
|
| 457 |
+
|
| 458 |
+
group_idx = len(group_labels) - 1
|
| 459 |
+
sample_words[group_idx] = [{'word': word, 'group': group_label} for word in sample_word_list]
|
| 460 |
+
|
| 461 |
+
# Store group statistics
|
| 462 |
+
group_stats_list.append({
|
| 463 |
+
'group_idx': group_idx,
|
| 464 |
+
f'{column}_mean': bin_data[column].mean(),
|
| 465 |
+
f'{column}_count': len(bin_data),
|
| 466 |
+
f'{column}_min': bin_data[column].min(),
|
| 467 |
+
f'{column}_max': bin_data[column].max(),
|
| 468 |
+
'start_rank': start_rank,
|
| 469 |
+
'end_rank': end_rank
|
| 470 |
+
})
|
| 471 |
+
|
| 472 |
+
# Create a DataFrame for group stats
|
| 473 |
+
group_stats = pd.DataFrame(group_stats_list)
|
| 474 |
+
|
| 475 |
+
# Create title suffix with word limit info
|
| 476 |
+
title_parts = [f"Bin Size: {bin_size}"]
|
| 477 |
+
if max_words_to_retain:
|
| 478 |
+
title_parts.append(f"Top {max_words_to_retain:,} words")
|
| 479 |
+
title_parts.append(f"{'Log₁₀ ' if log_transform else ''}{column}")
|
| 480 |
+
title_suffix = " (" + ", ".join(title_parts) + ")"
|
| 481 |
+
|
| 482 |
+
return {
|
| 483 |
+
'group_labels': group_labels,
|
| 484 |
+
'group_centers': group_centers,
|
| 485 |
+
'avg_frequencies': avg_frequencies,
|
| 486 |
+
'group_stats': group_stats,
|
| 487 |
+
'sample_words': sample_words,
|
| 488 |
+
'bin_size': bin_size,
|
| 489 |
+
'column': column,
|
| 490 |
+
'log_transform': log_transform,
|
| 491 |
+
'max_words_to_retain': max_words_to_retain,
|
| 492 |
+
'total_groups': len(group_labels),
|
| 493 |
+
'title_suffix': title_suffix,
|
| 494 |
+
'x_label': f"Rank Groups (bin size: {bin_size})",
|
| 495 |
+
'y_label': f"{'Log₁₀ ' if log_transform else ''}Average {column}"
|
| 496 |
+
}
|
| 497 |
+
|
| 498 |
+
# Legacy methods for backward compatibility
|
| 499 |
+
def validate_format(self, df: pd.DataFrame) -> bool:
|
| 500 |
+
"""Legacy method for backward compatibility."""
|
| 501 |
+
return 'Type' in df.columns and 'Freq' in df.columns
|
| 502 |
+
|
| 503 |
+
def get_available_columns(self) -> List[str]:
|
| 504 |
+
"""Legacy method for backward compatibility."""
|
| 505 |
+
if self.data is None:
|
| 506 |
+
return []
|
| 507 |
+
|
| 508 |
+
freq_columns = []
|
| 509 |
+
if 'Freq' in self.data.columns:
|
| 510 |
+
freq_columns.append('Freq')
|
| 511 |
+
if 'NormFreq' in self.data.columns:
|
| 512 |
+
freq_columns.append('NormFreq')
|
| 513 |
+
|
| 514 |
+
return freq_columns
|
| 515 |
+
|
| 516 |
+
def create_histogram_data(self, column: str = 'Freq', bins: int = 25, log_transform: bool = False) -> Dict:
|
| 517 |
+
"""Legacy histogram method for backward compatibility."""
|
| 518 |
+
if self.data is None:
|
| 519 |
+
raise ValueError("No data loaded")
|
| 520 |
+
|
| 521 |
+
if column not in self.data.columns:
|
| 522 |
+
raise ValueError(f"Column '{column}' not found in data")
|
| 523 |
+
|
| 524 |
+
# Get frequency values
|
| 525 |
+
freq_values = self.data[column].copy()
|
| 526 |
+
|
| 527 |
+
# Apply log transformation if requested
|
| 528 |
+
if log_transform:
|
| 529 |
+
freq_values = np.log10(freq_values + 1e-10)
|
| 530 |
+
title_suffix = f" (Log₁₀ {column})"
|
| 531 |
+
x_label = f"Log₁₀ {column}"
|
| 532 |
+
else:
|
| 533 |
+
title_suffix = f" ({column})"
|
| 534 |
+
x_label = column
|
| 535 |
+
|
| 536 |
+
# Create histogram
|
| 537 |
+
counts, bin_edges = np.histogram(freq_values, bins=bins)
|
| 538 |
+
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
|
| 539 |
+
bin_widths = bin_edges[1:] - bin_edges[:-1]
|
| 540 |
+
|
| 541 |
+
return {
|
| 542 |
+
'counts': counts,
|
| 543 |
+
'bin_edges': bin_edges,
|
| 544 |
+
'bin_centers': bin_centers,
|
| 545 |
+
'bin_widths': bin_widths,
|
| 546 |
+
'freq_values': freq_values,
|
| 547 |
+
'original_column': column,
|
| 548 |
+
'log_transform': log_transform,
|
| 549 |
+
'title_suffix': title_suffix,
|
| 550 |
+
'x_label': x_label,
|
| 551 |
+
'total_words': len(self.data)
|
| 552 |
+
}
|
| 553 |
+
|
| 554 |
+
def sample_words_per_bin(self, histogram_data: Dict, samples_per_bin: int = 5) -> Dict[int, List[Dict]]:
|
| 555 |
+
"""Legacy word sampling method for backward compatibility."""
|
| 556 |
+
if self.data is None:
|
| 557 |
+
raise ValueError("No data loaded")
|
| 558 |
+
|
| 559 |
+
bin_edges = histogram_data['bin_edges']
|
| 560 |
+
freq_values = histogram_data['freq_values']
|
| 561 |
+
original_column = histogram_data['original_column']
|
| 562 |
+
|
| 563 |
+
sampled_words = {}
|
| 564 |
+
|
| 565 |
+
for i in range(len(bin_edges) - 1):
|
| 566 |
+
bin_start = bin_edges[i]
|
| 567 |
+
bin_end = bin_edges[i + 1]
|
| 568 |
+
|
| 569 |
+
# Find words in this bin
|
| 570 |
+
if i == len(bin_edges) - 2: # Last bin, include right edge
|
| 571 |
+
mask = (freq_values >= bin_start) & (freq_values <= bin_end)
|
| 572 |
+
else:
|
| 573 |
+
mask = (freq_values >= bin_start) & (freq_values < bin_end)
|
| 574 |
+
|
| 575 |
+
bin_words = self.data[mask]
|
| 576 |
+
|
| 577 |
+
if len(bin_words) > 0:
|
| 578 |
+
# Sample words (up to samples_per_bin)
|
| 579 |
+
n_samples = min(samples_per_bin, len(bin_words))
|
| 580 |
+
sampled = bin_words.sample(n=n_samples, random_state=42)
|
| 581 |
+
|
| 582 |
+
# Create word info list
|
| 583 |
+
word_list = []
|
| 584 |
+
for _, word_row in sampled.iterrows():
|
| 585 |
+
word_info = {
|
| 586 |
+
'word': word_row['Type'],
|
| 587 |
+
'freq': word_row[original_column],
|
| 588 |
+
'rank': word_row.get('Rank', 'N/A'),
|
| 589 |
+
'original_freq': word_row['Freq']
|
| 590 |
+
}
|
| 591 |
+
word_list.append(word_info)
|
| 592 |
+
|
| 593 |
+
sampled_words[i] = word_list
|
| 594 |
+
else:
|
| 595 |
+
sampled_words[i] = []
|
| 596 |
+
|
| 597 |
+
return sampled_words
|
| 598 |
+
|
| 599 |
+
def create_rank_based_visualization(self, column: str = 'Freq', bin_size: int = 500, log_transform: bool = False) -> Dict:
|
| 600 |
+
"""Legacy rank-based visualization method for backward compatibility."""
|
| 601 |
+
return self.create_rank_based_visualization_flexible(column, bin_size, log_transform)
|
| 602 |
+
|
| 603 |
+
def calculate_statistics(self, column: str = 'Freq') -> Dict:
|
| 604 |
+
"""Calculate descriptive statistics for the frequency data."""
|
| 605 |
+
if self.data is None:
|
| 606 |
+
raise ValueError("No data loaded")
|
| 607 |
+
|
| 608 |
+
if column not in self.data.columns:
|
| 609 |
+
raise ValueError(f"Column '{column}' not found in data")
|
| 610 |
+
|
| 611 |
+
freq_values = self.data[column]
|
| 612 |
+
|
| 613 |
+
stats = {
|
| 614 |
+
'count': len(freq_values),
|
| 615 |
+
'mean': float(freq_values.mean()),
|
| 616 |
+
'median': float(freq_values.median()),
|
| 617 |
+
'std': float(freq_values.std()),
|
| 618 |
+
'min': float(freq_values.min()),
|
| 619 |
+
'max': float(freq_values.max()),
|
| 620 |
+
'q25': float(freq_values.quantile(0.25)),
|
| 621 |
+
'q75': float(freq_values.quantile(0.75)),
|
| 622 |
+
'skewness': float(freq_values.skew()),
|
| 623 |
+
'column_name': column
|
| 624 |
+
}
|
| 625 |
+
|
| 626 |
+
# Add some additional insights
|
| 627 |
+
stats['range'] = stats['max'] - stats['min']
|
| 628 |
+
stats['iqr'] = stats['q75'] - stats['q25']
|
| 629 |
+
stats['cv'] = stats['std'] / stats['mean'] if stats['mean'] != 0 else 0
|
| 630 |
+
|
| 631 |
+
return stats
|
| 632 |
+
|
| 633 |
+
def get_top_words(self, column: str = 'Freq', n: int = 10) -> List[Dict]:
|
| 634 |
+
"""Get the top N words by frequency."""
|
| 635 |
+
if self.data is None:
|
| 636 |
+
raise ValueError("No data loaded")
|
| 637 |
+
|
| 638 |
+
if column not in self.data.columns:
|
| 639 |
+
raise ValueError(f"Column '{column}' not found in data")
|
| 640 |
+
|
| 641 |
+
top_words = self.data.nlargest(n, column)
|
| 642 |
+
|
| 643 |
+
result = []
|
| 644 |
+
for _, row in top_words.iterrows():
|
| 645 |
+
word_info = {
|
| 646 |
+
'word': row['Type'],
|
| 647 |
+
'freq': row[column],
|
| 648 |
+
'rank': row.get('Rank', 'N/A'),
|
| 649 |
+
'original_freq': row['Freq']
|
| 650 |
+
}
|
| 651 |
+
result.append(word_info)
|
| 652 |
+
|
| 653 |
+
return result
|
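A minimal end-to-end sketch of the flexible column-mapping workflow defined above (the TSV content, words, and frequencies are made up for illustration; only methods shown in this file are used):

    from text_analyzer.frequency_analyzer import FrequencyAnalyzer

    tsv = "lForm\tfrequency\tpos\n猫\t1200\t名詞\n走る\t800\t動詞\n歩く\t650\t動詞\n"
    fa = FrequencyAnalyzer()
    fa.load_frequency_data(tsv, {'word_column': 'lForm',
                                 'frequency_column': 'frequency',
                                 'pos_column': 'pos'})
    viz = fa.create_rank_based_visualization_flexible(column='frequency', bin_size=2)
    print(viz['group_labels'])                                   # e.g. ['1-2', '3-3']
    print(fa.calculate_statistics(column='frequency')['mean'])   # mean of the loaded frequencies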
text_analyzer/lexical_sophistication.py
CHANGED
|
@@ -13,50 +13,30 @@ import logging
|
|
| 13 |
from collections import defaultdict
|
| 14 |
import re
|
| 15 |
|
|
|
|
|
|
|
|
|
|
| 16 |
# Configure logging
|
| 17 |
logging.basicConfig(level=logging.INFO)
|
| 18 |
logger = logging.getLogger(__name__)
|
| 19 |
|
| 20 |
|
| 21 |
-
class LexicalSophisticationAnalyzer:
|
| 22 |
"""
|
| 23 |
Main class for lexical sophistication analysis.
|
| 24 |
Handles tokenization, n-gram generation, and score calculation.
|
| 25 |
"""
|
| 26 |
|
| 27 |
-
def __init__(self, language: str =
|
| 28 |
"""
|
| 29 |
Initialize analyzer with specified language and model.
|
| 30 |
|
| 31 |
Args:
|
| 32 |
language (str): Language code ('en' for English, 'ja' for Japanese)
|
| 33 |
-
model_size (str): SpaCy model size ('
|
| 34 |
"""
|
| 35 |
-
|
| 36 |
-
self.model_size = model_size
|
| 37 |
-
self.nlp = None
|
| 38 |
self.reference_lists = {}
|
| 39 |
-
self._load_spacy_model()
|
| 40 |
-
|
| 41 |
-
def _load_spacy_model(self):
|
| 42 |
-
"""Load appropriate SpaCy model based on language and size."""
|
| 43 |
-
model_map = {
|
| 44 |
-
("en", "md"): "en_core_web_md",
|
| 45 |
-
("en", "trf"): "en_core_web_trf",
|
| 46 |
-
("ja", "md"): "ja_core_news_md",
|
| 47 |
-
("ja", "trf"): "ja_core_news_trf"
|
| 48 |
-
}
|
| 49 |
-
|
| 50 |
-
model_name = model_map.get((self.language, self.model_size))
|
| 51 |
-
if not model_name:
|
| 52 |
-
raise ValueError(f"Unsupported language/model combination: {self.language}/{self.model_size}")
|
| 53 |
-
|
| 54 |
-
try:
|
| 55 |
-
self.nlp = spacy.load(model_name)
|
| 56 |
-
logger.info(f"Loaded SpaCy model: {model_name}")
|
| 57 |
-
except OSError:
|
| 58 |
-
logger.error(f"SpaCy model {model_name} not found. Please install it first.")
|
| 59 |
-
raise
|
| 60 |
|
| 61 |
def load_reference_lists(self, reference_files: Dict[str, Dict[str, Union[str, dict]]]):
|
| 62 |
"""
|
|
@@ -235,26 +215,6 @@ class LexicalSophisticationAnalyzer:
|
|
| 235 |
logger.error(f"Error parsing custom config: {e}")
|
| 236 |
return {}
|
| 237 |
|
| 238 |
-
def _classify_pos(self, token) -> str:
|
| 239 |
-
"""
|
| 240 |
-
Classify token as content word (CW) or function word (FW).
|
| 241 |
-
|
| 242 |
-
Args:
|
| 243 |
-
token: SpaCy token object
|
| 244 |
-
|
| 245 |
-
Returns:
|
| 246 |
-
str: 'CW' for content words, 'FW' for function words
|
| 247 |
-
"""
|
| 248 |
-
content_pos = {'NOUN', 'VERB', 'ADJ', 'ADV'}
|
| 249 |
-
function_pos = {'DET', 'PRON', 'ADP', 'CONJ', 'CCONJ', 'SCONJ'}
|
| 250 |
-
|
| 251 |
-
if token.pos_ in content_pos:
|
| 252 |
-
return 'CW'
|
| 253 |
-
elif token.pos_ in function_pos:
|
| 254 |
-
return 'FW'
|
| 255 |
-
else:
|
| 256 |
-
# Default classification for ambiguous cases
|
| 257 |
-
return 'CW' if token.pos_ not in {'PUNCT', 'SPACE', 'X'} else 'FW'
|
| 258 |
|
| 259 |
def _generate_ngrams(self, tokens: List, n: int, sep: str = " ") -> List[str]:
|
| 260 |
"""
|
|
@@ -296,7 +256,7 @@ class LexicalSophisticationAnalyzer:
|
|
| 296 |
measure_col: Optional[str] = None) -> Optional[float]:
|
| 297 |
"""
|
| 298 |
Look up score for a word in reference lists.
|
| 299 |
-
|
| 300 |
Args:
|
| 301 |
word: Word to look up
|
| 302 |
index_name: Name of the reference index
|
|
@@ -314,6 +274,12 @@ class LexicalSophisticationAnalyzer:
|
|
| 314 |
return None
|
| 315 |
|
| 316 |
if file_type in ['token', 'lemma']:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
# Simple dictionary lookup for unigrams
|
| 318 |
return ref_data.get(word.lower())
|
| 319 |
else:
|
|
@@ -344,6 +310,169 @@ class LexicalSophisticationAnalyzer:
|
|
| 344 |
except (ValueError, TypeError):
|
| 345 |
return None
|
| 346 |
return None
|
| 347 |
|
| 348 |
def analyze_text(self, text: str, selected_indices: List[str],
|
| 349 |
apply_log: bool = False, word_type_filter: Optional[str] = None) -> Dict:
|
|
@@ -359,12 +488,9 @@ class LexicalSophisticationAnalyzer:
|
|
| 359 |
Returns:
|
| 360 |
Dictionary containing analysis results
|
| 361 |
"""
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
# Process text
|
| 366 |
-
doc = self.nlp(text)
|
| 367 |
-
tokens = [token for token in doc if not token.is_punct and not token.is_space]
|
| 368 |
|
| 369 |
# Generate n-grams
|
| 370 |
bigrams = self._generate_ngrams(tokens, 2)
|
|
@@ -382,7 +508,9 @@ class LexicalSophisticationAnalyzer:
|
|
| 382 |
'content_words': len([t for t in tokens if self._classify_pos(t) == 'CW']),
|
| 383 |
'function_words': len([t for t in tokens if self._classify_pos(t) == 'FW'])
|
| 384 |
},
|
| 385 |
-
'raw_scores': {} #
|
|
|
|
|
|
|
| 386 |
}
|
| 387 |
|
| 388 |
# Initialize score collections
|
|
@@ -396,23 +524,78 @@ class LexicalSophisticationAnalyzer:
|
|
| 396 |
if word_type_filter and word_type != word_type_filter:
|
| 397 |
continue
|
| 398 |
|
|
|
|
| 399 |
token_detail = {
|
| 400 |
'id': i + 1,
|
| 401 |
'token': token.text,
|
| 402 |
'lemma': token.lemma_,
|
| 403 |
'pos': token.pos_,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
'word_type': word_type
|
| 405 |
}
|
| 406 |
|
| 407 |
# Look up scores for each selected index
|
| 408 |
for index_name in selected_indices:
|
| 409 |
-
#
|
| 410 |
-
|
| 411 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
|
| 417 |
# Collect for summary statistics
|
| 418 |
if token_score is not None:
|
|
@@ -477,7 +660,7 @@ class LexicalSophisticationAnalyzer:
|
|
| 477 |
score_val = np.log10(score) if apply_log and score > 0 else score
|
| 478 |
ngram_detail[f"{index_name}_{measure}"] = score_val
|
| 479 |
else:
|
| 480 |
-
ngram_detail[f"{index_name}_{measure}"] =
|
| 481 |
|
| 482 |
results[ngram_details_key].append(ngram_detail)
|
| 483 |
|
|
|
|
| 13 |
from collections import defaultdict
|
| 14 |
import re
|
| 15 |
|
| 16 |
+
from .base_analyzer import BaseAnalyzer
|
| 17 |
+
from .app_config import AppConfig
|
| 18 |
+
|
| 19 |
# Configure logging
|
| 20 |
logging.basicConfig(level=logging.INFO)
|
| 21 |
logger = logging.getLogger(__name__)
|
| 22 |
|
| 23 |
|
| 24 |
+
class LexicalSophisticationAnalyzer(BaseAnalyzer):
|
| 25 |
"""
|
| 26 |
Main class for lexical sophistication analysis.
|
| 27 |
Handles tokenization, n-gram generation, and score calculation.
|
| 28 |
"""
|
| 29 |
|
| 30 |
+
def __init__(self, language: str = None, model_size: str = None):
|
| 31 |
"""
|
| 32 |
Initialize analyzer with specified language and model.
|
| 33 |
|
| 34 |
Args:
|
| 35 |
language (str): Language code ('en' for English, 'ja' for Japanese)
|
| 36 |
+
model_size (str): SpaCy model size ('md' or 'trf')
|
| 37 |
"""
|
| 38 |
+
super().__init__(language, model_size)
|
|
|
|
|
|
|
| 39 |
self.reference_lists = {}
|
| 40 |
|
| 41 |
def load_reference_lists(self, reference_files: Dict[str, Dict[str, Union[str, dict]]]):
|
| 42 |
"""
|
|
|
|
| 215 |
logger.error(f"Error parsing custom config: {e}")
|
| 216 |
return {}
|
| 217 |
|
| 218 |
|
| 219 |
def _generate_ngrams(self, tokens: List, n: int, sep: str = " ") -> List[str]:
|
| 220 |
"""
|
|
|
|
| 256 |
measure_col: Optional[str] = None) -> Optional[float]:
|
| 257 |
"""
|
| 258 |
Look up score for a word in reference lists.
|
| 259 |
+
|
| 260 |
Args:
|
| 261 |
word: Word to look up
|
| 262 |
index_name: Name of the reference index
|
|
|
|
| 274 |
return None
|
| 275 |
|
| 276 |
if file_type in ['token', 'lemma']:
|
| 277 |
+
# Check if this is Japanese corpus data
|
| 278 |
+
if isinstance(ref_data, dict) and ref_data.get('is_japanese_corpus', False):
|
| 279 |
+
# This should not be called directly for Japanese data
|
| 280 |
+
# Use _lookup_japanese_score instead
|
| 281 |
+
return None
|
| 282 |
+
|
| 283 |
# Simple dictionary lookup for unigrams
|
| 284 |
return ref_data.get(word.lower())
|
| 285 |
else:
|
|
|
|
| 310 |
except (ValueError, TypeError):
|
| 311 |
return None
|
| 312 |
return None
|
| 313 |
+
|
| 314 |
+
def _lookup_with_unidic_fallback(self, token, index_name: str, file_type: str) -> Dict:
|
| 315 |
+
"""
|
| 316 |
+
Enhanced Japanese lookup with UniDic 3-level fallback using corpus-compatible keys.
|
| 317 |
+
|
| 318 |
+
Args:
|
| 319 |
+
token: SpaCy token object with UniDic extensions
|
| 320 |
+
index_name: Name of the reference index
|
| 321 |
+
file_type: Type of reference file ('token', 'lemma')
|
| 322 |
+
|
| 323 |
+
Returns:
|
| 324 |
+
Dictionary with score, method, key, and diagnostic information
|
| 325 |
+
"""
|
| 326 |
+
# Initialize diagnostic tracking
|
| 327 |
+
attempted_keys = []
|
| 328 |
+
diagnostic_info = {
|
| 329 |
+
'attempted_keys': attempted_keys,
|
| 330 |
+
'unidic_features': {},
|
| 331 |
+
'alignment_confidence': getattr(token._, 'alignment_confidence', 0.0),
|
| 332 |
+
'spacy_fallback_used': False,
|
| 333 |
+
'no_match': False
|
| 334 |
+
}
|
| 335 |
+
|
| 336 |
+
# Get UniDic features from token extensions
|
| 337 |
+
unidic_features = {
|
| 338 |
+
'lemma': getattr(token._, 'unidic_lemma', '') or '',
|
| 339 |
+
'lForm': getattr(token._, 'unidic_lform', '') or '',
|
| 340 |
+
'pos1': getattr(token._, 'unidic_pos1', '') or '',
|
| 341 |
+
'pos2': getattr(token._, 'unidic_pos2', '') or '',
|
| 342 |
+
'pos3': getattr(token._, 'unidic_pos3', '') or '',
|
| 343 |
+
'goshu': getattr(token._, 'unidic_goshu', '') or ''
|
| 344 |
+
}
|
| 345 |
+
diagnostic_info['unidic_features'] = unidic_features
|
| 346 |
+
|
| 347 |
+
# Only proceed with UniDic matching if we have good alignment and features
|
| 348 |
+
if diagnostic_info['alignment_confidence'] > 0.5 and any(unidic_features.values()):
|
| 349 |
+
|
| 350 |
+
# Try corpus-compatible keys using the hierarchical lookup dictionaries
|
| 351 |
+
# Level 1: {lemma}_{lForm}_{pos1}_{pos2}_{pos3} (when pos3 exists)
|
| 352 |
+
if all([unidic_features['lemma'], unidic_features['lForm'],
|
| 353 |
+
unidic_features['pos1'], unidic_features['pos2'], unidic_features['pos3']]):
|
| 354 |
+
level1_key = f"{unidic_features['lemma']}_{unidic_features['lForm']}_{unidic_features['pos1']}_{unidic_features['pos2']}_{unidic_features['pos3']}"
|
| 355 |
+
attempted_keys.append(level1_key)
|
| 356 |
+
score = self._lookup_japanese_corpus_level(level1_key, index_name, file_type, 'level1_dict')
|
| 357 |
+
if score is not None:
|
| 358 |
+
return {
|
| 359 |
+
'score': score,
|
| 360 |
+
'match_method': 'unidic_corpus_level_1',
|
| 361 |
+
'match_key': level1_key,
|
| 362 |
+
'diagnostic_info': diagnostic_info
|
| 363 |
+
}
|
| 364 |
+
|
| 365 |
+
# Level 2: {lemma}_{lForm}_{pos1}_{pos2}
|
| 366 |
+
if all([unidic_features['lemma'], unidic_features['lForm'],
|
| 367 |
+
unidic_features['pos1'], unidic_features['pos2']]):
|
| 368 |
+
level2_key = f"{unidic_features['lemma']}_{unidic_features['lForm']}_{unidic_features['pos1']}_{unidic_features['pos2']}"
|
| 369 |
+
attempted_keys.append(level2_key)
|
| 370 |
+
score = self._lookup_japanese_corpus_level(level2_key, index_name, file_type, 'level2_dict')
|
| 371 |
+
if score is not None:
|
| 372 |
+
return {
|
| 373 |
+
'score': score,
|
| 374 |
+
'match_method': 'unidic_corpus_level_2',
|
| 375 |
+
+                    'match_key': level2_key,
+                    'diagnostic_info': diagnostic_info
+                }
+
+        # Level 3: {lemma}_{lForm}_{pos1}
+        if all([unidic_features['lemma'], unidic_features['lForm'], unidic_features['pos1']]):
+            level3_key = f"{unidic_features['lemma']}_{unidic_features['lForm']}_{unidic_features['pos1']}"
+            attempted_keys.append(level3_key)
+            score = self._lookup_japanese_corpus_level(level3_key, index_name, file_type, 'level3_dict')
+            if score is not None:
+                return {
+                    'score': score,
+                    'match_method': 'unidic_corpus_level_3',
+                    'match_key': level3_key,
+                    'diagnostic_info': diagnostic_info
+                }
+
+        # Fallback to legacy spaCy-based matching
+        diagnostic_info['spacy_fallback_used'] = True
+        legacy_score = self._lookup_japanese_score(token, index_name, file_type, fallback=True)
+        if legacy_score is not None:
+            legacy_key = f"{token.lemma_}_{token.tag_}"
+            attempted_keys.append(f"legacy: {legacy_key}")
+            return {
+                'score': legacy_score,
+                'match_method': 'legacy_spacy',
+                'match_key': legacy_key,
+                'diagnostic_info': diagnostic_info
+            }
+
+        # No match found
+        diagnostic_info['no_match'] = True
+        return {
+            'score': None,
+            'match_method': 'none',
+            'match_key': None,
+            'diagnostic_info': diagnostic_info
+        }
+
+    def _lookup_japanese_corpus_level(self, key: str, index_name: str, file_type: str, level_dict_name: str) -> Optional[float]:
+        """
+        Look up score in a specific level dictionary of Japanese corpus data.
+
+        Args:
+            key: Composite key to look up
+            index_name: Name of the reference index
+            file_type: Type of reference file ('token', 'lemma')
+            level_dict_name: Name of the level dictionary ('level1_dict', 'level2_dict', 'level3_dict')
+
+        Returns:
+            Score if found, None otherwise
+        """
+        if index_name not in self.reference_lists:
+            return None
+
+        ref_data = self.reference_lists[index_name].get(file_type)
+        if ref_data is None or not isinstance(ref_data, dict):
+            return None
+
+        if not ref_data.get('is_japanese_corpus', False):
+            return None
+
+        level_dict = ref_data.get(level_dict_name, {})
+        return level_dict.get(key)
+
+    def _lookup_japanese_score(self, token, index_name: str, file_type: str, fallback: bool = False) -> Optional[float]:
+        """
+        Look up score for a Japanese word using composite key approach.
+
+        Args:
+            token: SpaCy token object
+            index_name: Name of the reference index
+            file_type: Type of reference file ('token', 'lemma')
+            fallback: Whether to use fallback search strategies
+
+        Returns:
+            Score if found, None otherwise
+        """
+        if index_name not in self.reference_lists:
+            return None
+
+        ref_data = self.reference_lists[index_name].get(file_type)
+        if ref_data is None or not isinstance(ref_data, dict):
+            return None
+
+        if not ref_data.get('is_japanese_corpus', False):
+            return None
+
+        # Try composite key first (lemma_pos)
+        composite_key = f"{token.lemma_}_{token.tag_}"
+        score = ref_data.get('composite_dict', {}).get(composite_key)
+
+        if score is None and fallback:
+            # Fallback to lemma only
+            score = ref_data.get('lemma_dict', {}).get(token.lemma_.lower())
+
+        if score is None and fallback:
+            # Final fallback to surface form
+            score = ref_data.get('surface_dict', {}).get(token.text.lower())
+
+        return score
 
     def analyze_text(self, text: str, selected_indices: List[str],
                      apply_log: bool = False, word_type_filter: Optional[str] = None) -> Dict:
 
         Returns:
             Dictionary containing analysis results
         """
+        # Process text using base class
+        doc = self.process_document(text)
+        tokens = self.filter_tokens(doc, exclude_punct=True, exclude_space=True)
 
         # Generate n-grams
         bigrams = self._generate_ngrams(tokens, 2)
 
                 'content_words': len([t for t in tokens if self._classify_pos(t) == 'CW']),
                 'function_words': len([t for t in tokens if self._classify_pos(t) == 'FW'])
             },
+            'raw_scores': {},  # Raw scores for plotting
+            'tokens': tokens,  # Raw spaCy tokens for advanced analysis
+            'doc': doc  # Full spaCy doc for complex operations
         }
 
         # Initialize score collections
 
             if word_type_filter and word_type != word_type_filter:
                 continue
 
+            # Work directly with spaCy token - include syntactic information
             token_detail = {
                 'id': i + 1,
                 'token': token.text,
                 'lemma': token.lemma_,
                 'pos': token.pos_,
+                'tag': token.tag_,
+                'dep_': token.dep_,  # Add dependency relation
+                'head_text': token.head.text,  # Add head word
+                'head_pos': token.head.pos_,  # Add head POS
                 'word_type': word_type
             }
 
             # Look up scores for each selected index
             for index_name in selected_indices:
+                # Check if this is a Japanese corpus reference list
+                ref_data = self.reference_lists.get(index_name, {})
+                is_japanese_corpus = False
+                for file_type in ['token', 'lemma']:
+                    data = ref_data.get(file_type, {})
+                    if isinstance(data, dict) and data.get('is_japanese_corpus', False):
+                        is_japanese_corpus = True
+                        break
 
+                if is_japanese_corpus and self.language == 'ja':
+                    # Use enhanced UniDic lookup with 3-level fallback and diagnostics
+                    token_result = self._lookup_with_unidic_fallback(token, index_name, 'token')
+                    lemma_result = self._lookup_with_unidic_fallback(token, index_name, 'lemma')
+
+                    # Extract scores and diagnostic information
+                    token_score = token_result['score']
+                    lemma_score = lemma_result['score']
+
+                    # Store enhanced details with diagnostic information
+                    token_detail[f"{index_name}_token"] = token_score if token_score is not None else None
+                    token_detail[f"{index_name}_lemma"] = lemma_score if lemma_score is not None else None
+
+                    # Add diagnostic information for debugging
+                    token_detail[f"{index_name}_token_match_method"] = token_result['match_method']
+                    token_detail[f"{index_name}_lemma_match_method"] = lemma_result['match_method']
+                    token_detail[f"{index_name}_token_match_key"] = token_result['match_key'] or None
+                    token_detail[f"{index_name}_lemma_match_key"] = lemma_result['match_key'] or None
+
+                    # Store UniDic features for display
+                    if hasattr(token, '_') and hasattr(token._, 'unidic_lemma'):
+                        token_detail['unidic_features'] = {
+                            'lemma': getattr(token._, 'unidic_lemma', ''),
+                            'lForm': getattr(token._, 'unidic_lform', ''),
+                            'pos1': getattr(token._, 'unidic_pos1', ''),
+                            'pos2': getattr(token._, 'unidic_pos2', ''),
+                            'goshu': getattr(token._, 'unidic_goshu', ''),
+                            'alignment_confidence': getattr(token._, 'alignment_confidence', 0.0)
+                        }
+
+                elif is_japanese_corpus:
+                    # Fallback to legacy Japanese lookup if UniDic not available
+                    token_score = self._lookup_japanese_score(token, index_name, 'token', fallback=True)
+                    lemma_score = self._lookup_japanese_score(token, index_name, 'lemma', fallback=True)
+
+                    # Store scores
+                    token_detail[f"{index_name}_token"] = token_score if token_score is not None else None
+                    token_detail[f"{index_name}_lemma"] = lemma_score if lemma_score is not None else None
+                    token_detail[f"{index_name}_token_match_method"] = "legacy_spacy"
+                    token_detail[f"{index_name}_lemma_match_method"] = "legacy_spacy"
+                else:
+                    # Standard lookup for non-Japanese data
+                    token_score = self._lookup_score(token.text, index_name, 'token')
+                    lemma_score = self._lookup_score(token.lemma_, index_name, 'lemma')
+
+                    # Store scores
+                    token_detail[f"{index_name}_token"] = token_score if token_score is not None else None
+                    token_detail[f"{index_name}_lemma"] = lemma_score if lemma_score is not None else None
 
                 # Collect for summary statistics
                 if token_score is not None:
 
                         score_val = np.log10(score) if apply_log and score > 0 else score
                         ngram_detail[f"{index_name}_{measure}"] = score_val
                     else:
+                        ngram_detail[f"{index_name}_{measure}"] = None
 
             results[ngram_details_key].append(ngram_detail)
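For orientation, the lookup chain above tries progressively coarser UniDic composite keys and only then falls back to the spaCy lemma/tag key. The sketch below is illustrative and not part of this commit: the level-3 key format ({lemma}_{lForm}_{pos1}) is taken from the code above, while the helper name and the sample dictionary contents are invented.

from typing import Dict, List, Optional, Tuple

def first_hit(candidates: List[Tuple[str, str]], corpus: Dict[str, Dict[str, float]]) -> Optional[Dict]:
    """Return the first (level, key, score) match, mirroring the cascade above."""
    for level_name, key in candidates:
        score = corpus.get(level_name, {}).get(key)
        if score is not None:
            return {'score': score, 'match_method': level_name, 'match_key': key}
    return None

# Hypothetical corpus data, keyed the way level3_dict is keyed above: lemma_lForm_pos1.
corpus = {'level3_dict': {'食べる_タベル_動詞': 3.72}}
print(first_hit([('level3_dict', '食べる_タベル_動詞')], corpus))
# -> {'score': 3.72, 'match_method': 'level3_dict', 'match_key': '食べる_タベル_動詞'}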
text_analyzer/pos_parser.py
CHANGED
@@ -13,15 +13,18 @@ import base64
 from io import BytesIO
 import zipfile
 
+from .base_analyzer import BaseAnalyzer
+
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
-class POSParser:
+class POSParser(BaseAnalyzer):
     """
     Main class for POS tagging and dependency parsing.
     Handles multilingual analysis and visualization.
+    Inherits from BaseAnalyzer for consistent SpaCy model management.
     """
 
     def __init__(self, language: str = "en", model_size: str = "trf"):
@@ -30,32 +33,9 @@ class POSParser:
 
         Args:
             language (str): Language code ('en' for English, 'ja' for Japanese)
-            model_size (str): SpaCy model size ('trf' or '
+            model_size (str): SpaCy model size ('trf' or 'md')
         """
-
-        self.model_size = model_size
-        self.nlp = None
-        self._load_spacy_model()
-
-    def _load_spacy_model(self):
-        """Load appropriate SpaCy model based on language and size."""
-        model_map = {
-            ("en", "md"): "en_core_web_md",
-            ("en", "trf"): "en_core_web_trf",
-            ("ja", "md"): "ja_core_news_md",
-            ("ja", "trf"): "ja_core_news_trf"
-        }
-
-        model_name = model_map.get((self.language, self.model_size))
-        if not model_name:
-            raise ValueError(f"Unsupported language/model combination: {self.language}/{self.model_size}")
-
-        try:
-            self.nlp = spacy.load(model_name)
-            logger.info(f"Loaded SpaCy model: {model_name}")
-        except OSError:
-            logger.error(f"SpaCy model {model_name} not found. Please install it first.")
-            raise
+        super().__init__(language, model_size)
 
     def analyze_text(self, text: str) -> Dict:
         """
@@ -67,11 +47,8 @@ class POSParser:
         Returns:
             Dictionary containing analysis results
         """
-
-
-
-        # Process text
-        doc = self.nlp(text)
+        # Process text using base class method
+        doc = self.process_document(text)
 
         # Extract token information
         token_data = []
@@ -131,10 +108,8 @@ class POSParser:
         Returns:
             List of HTML strings, one per sentence
         """
-
-
-
-        doc = self.nlp(text)
+        # Process text using base class method
+        doc = self.process_document(text)
         html_outputs = []
 
         for sent in doc.sents:
@@ -235,4 +210,4 @@ class POSParser:
                 zip_file.write(file_path, file_path.name)
 
         zip_buffer.seek(0)
-        return zip_buffer.getvalue()
+        return zip_buffer.getvalue()
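Usage stays the same from the caller's side after this refactor; model selection and loading simply move into BaseAnalyzer. A minimal sketch (not part of the diff), assuming the pinned ja_core_news_md model is installed:

from text_analyzer.pos_parser import POSParser

# BaseAnalyzer now resolves and loads the spaCy model from (language, model_size).
parser = POSParser(language="ja", model_size="md")
results = parser.analyze_text("猫が魚を食べた。")  # returns the analysis Dict described in the docstring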
text_analyzer/text_utility.py
ADDED
@@ -0,0 +1,289 @@
"""
Text processing utilities module.
Contains reusable functions for file handling, encoding detection, and text cleaning.
"""

import os
import tempfile
import chardet
from pathlib import Path
from typing import Union, Tuple, List, Dict, Any, Optional
import logging
import re
from .app_config import AppConfig

logger = logging.getLogger(__name__)


class TextUtility:
    """Collection of text processing and file handling utilities."""

    @staticmethod
    def detect_encoding(content: bytes) -> str:
        """
        Detect encoding of byte content.

        Args:
            content: Byte content to analyze

        Returns:
            Detected encoding string
        """
        try:
            # Try chardet for automatic detection
            result = chardet.detect(content)
            encoding = result.get('encoding', 'utf-8')

            # Validate detected encoding against supported list
            if encoding and encoding.lower() in [enc.lower() for enc in AppConfig.SUPPORTED_ENCODINGS]:
                return encoding

            # Fall back to trying supported encodings
            for enc in AppConfig.SUPPORTED_ENCODINGS:
                try:
                    content.decode(enc)
                    return enc
                except UnicodeDecodeError:
                    continue

            # Final fallback
            return 'utf-8'

        except Exception as e:
            logger.warning(f"Error detecting encoding: {e}, defaulting to utf-8")
            return 'utf-8'

    @staticmethod
    def detect_delimiter(text: str) -> str:
        """
        Detect delimiter in text content.

        Args:
            text: Text content to analyze

        Returns:
            Detected delimiter
        """
        # Count occurrences of each supported delimiter
        delimiter_counts = {}
        for delimiter in AppConfig.SUPPORTED_DELIMITERS:
            delimiter_counts[delimiter] = text.count(delimiter)

        # Return the most frequent delimiter, or tab as default
        if delimiter_counts:
            return max(delimiter_counts, key=delimiter_counts.get)
        return '\t'

    @staticmethod
    def clean_text_input(text: str) -> str:
        """
        Clean text input by normalizing whitespace and removing problematic characters.

        Args:
            text: Raw text input

        Returns:
            Cleaned text
        """
        if not text:
            return ""

        # Normalize whitespace
        text = TextUtility.normalize_whitespace(text)

        # Remove or replace problematic characters
        # Remove null bytes
        text = text.replace('\x00', '')

        # Normalize unicode
        text = text.encode('utf-8', errors='ignore').decode('utf-8')

        return text.strip()

    @staticmethod
    def normalize_whitespace(text: str) -> str:
        """
        Normalize whitespace in text.

        Args:
            text: Text to normalize

        Returns:
            Text with normalized whitespace
        """
        if not text:
            return ""

        # Replace multiple whitespace with single space
        text = re.sub(r'\s+', ' ', text)

        # Remove leading/trailing whitespace from each line
        lines = text.split('\n')
        lines = [line.strip() for line in lines]

        # Remove empty lines at beginning and end
        while lines and not lines[0]:
            lines.pop(0)
        while lines and not lines[-1]:
            lines.pop()

        return '\n'.join(lines)

    @staticmethod
    def validate_text_length(text: str, max_length: int = None) -> bool:
        """
        Validate text length against limits.

        Args:
            text: Text to validate
            max_length: Maximum allowed length (optional)

        Returns:
            True if text length is valid
        """
        if not text:
            return False

        if max_length and len(text) > max_length:
            return False

        return True

    @staticmethod
    def extract_text_from_file(file_path: str) -> str:
        """
        Extract text content from a file with encoding detection.

        Args:
            file_path: Path to the file

        Returns:
            Extracted text content
        """
        try:
            # Read as bytes first for encoding detection
            with open(file_path, 'rb') as f:
                content = f.read()

            # Detect encoding
            encoding = TextUtility.detect_encoding(content)

            # Decode with detected encoding
            text = content.decode(encoding)

            # Clean the text
            return TextUtility.clean_text_input(text)

        except Exception as e:
            logger.error(f"Error extracting text from {file_path}: {e}")
            raise ValueError(f"Failed to extract text from file: {e}")

    @staticmethod
    def prepare_batch_files(file_paths: List[str]) -> List[Tuple[str, str]]:
        """
        Prepare batch files for processing by extracting text content.

        Args:
            file_paths: List of file paths

        Returns:
            List of tuples (file_path, text_content)
        """
        prepared_files = []

        for file_path in file_paths:
            try:
                text_content = TextUtility.extract_text_from_file(file_path)
                prepared_files.append((file_path, text_content))
            except Exception as e:
                logger.error(f"Error preparing file {file_path}: {e}")
                # Add error entry
                prepared_files.append((file_path, f"ERROR: {e}"))

        return prepared_files

    @staticmethod
    def sanitize_filename(filename: str) -> str:
        """
        Sanitize filename by removing problematic characters.

        Args:
            filename: Original filename

        Returns:
            Sanitized filename
        """
        # Remove or replace problematic characters
        filename = re.sub(r'[<>:"/\\|?*]', '_', filename)

        # Remove control characters
        filename = ''.join(char for char in filename if ord(char) >= 32)

        # Limit length
        if len(filename) > 255:
            name, ext = os.path.splitext(filename)
            filename = name[:255-len(ext)] + ext

        return filename or "unnamed_file"

    @staticmethod
    def create_safe_temp_file(content: str, suffix: str = '.txt') -> str:
        """
        Create a temporary file with given content safely.

        Args:
            content: Content to write to file
            suffix: File suffix

        Returns:
            Path to created temporary file
        """
        try:
            with tempfile.NamedTemporaryFile(mode='w', suffix=suffix, delete=False, encoding='utf-8') as f:
                f.write(content)
                return f.name
        except Exception as e:
            logger.error(f"Error creating temporary file: {e}")
            raise ValueError(f"Failed to create temporary file: {e}")

    @staticmethod
    def load_corpus_config(corpus_name: str) -> Dict[str, Any]:
        """
        Load specific corpus configuration from reference_lists.yaml

        Args:
            corpus_name: Name of the corpus

        Returns:
            Corpus configuration dictionary
        """
        return AppConfig.get_corpus_configuration(corpus_name)

    @staticmethod
    def get_column_mapping(config: Dict, corpus_type: str = 'columns') -> Dict[str, int]:
        """
        Extract column mappings from corpus configuration

        Args:
            config: Corpus configuration dictionary
            corpus_type: Type of mapping to extract

        Returns:
            Dictionary mapping column names to indices
        """
        return config.get(corpus_type, {})

    @staticmethod
    def cleanup_temp_files(file_paths: List[str]) -> None:
        """
        Clean up temporary files safely.

        Args:
            file_paths: List of temporary file paths to clean up
        """
        for file_path in file_paths:
            try:
                if os.path.exists(file_path):
                    os.unlink(file_path)
            except Exception as e:
                logger.warning(f"Error cleaning up temporary file {file_path}: {e}")
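A short usage sketch for TextUtility (not part of the diff); it assumes AppConfig.SUPPORTED_ENCODINGS includes 'utf-8', and the sample bytes are invented:

from text_analyzer.text_utility import TextUtility

raw = "彼は 本を   読んだ。".encode("utf-8")
enc = TextUtility.detect_encoding(raw)                # expected to report 'utf-8' here
text = TextUtility.clean_text_input(raw.decode(enc))  # runs of whitespace collapsed to single spaces
print(enc, text)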
text_analyzer/unidic_enricher.py
ADDED
@@ -0,0 +1,256 @@
"""
UniDic morphological enricher for Japanese text analysis.
Provides fugashi/UniDic integration with character offset alignment.
"""

import fugashi
from unidic import DICDIR
from typing import List, Dict, Optional, Tuple
import logging

logger = logging.getLogger(__name__)


class UniDicEnricher:
    """
    Enriches spaCy tokens with UniDic morphological features using fugashi.
    Handles character offset alignment and provides comprehensive feature extraction.
    """

    def __init__(self, unidic_path: Optional[str] = None):
        """
        Initialize with full UniDic dictionary.

        Args:
            unidic_path: Path to UniDic dictionary. Uses default if None.
        """
        if unidic_path is None:
            unidic_path = DICDIR

        try:
            # Initialize tagger with full UniDic
            self.tagger = fugashi.Tagger(f'-Owakati -d {unidic_path}')
            logger.info(f"UniDicEnricher initialized with dictionary: {unidic_path}")
        except Exception as e:
            logger.error(f"Failed to initialize UniDic tagger: {e}")
            raise

    def extract_full_features(self, word_node) -> Dict[str, str]:
        """
        Extract complete UniDic features using proper fugashi API.

        Args:
            word_node: Fugashi word node object

        Returns:
            Dictionary of UniDic morphological features
        """
        try:
            features = {
                'surface': word_node.surface,
                'pos1': word_node.feature.pos1,          # 品詞大分類
                'pos2': word_node.feature.pos2,          # 品詞中分類
                'pos3': word_node.feature.pos3,          # 品詞小分類
                'pos4': word_node.feature.pos4,          # 品詞細分類
                'cType': word_node.feature.cType,        # 活用型
                'cForm': word_node.feature.cForm,        # 活用形
                'lemma': word_node.feature.lemma,        # 基本形
                'lForm': word_node.feature.lForm,        # 読み
                'orth': word_node.feature.orth,          # 表記
                'orthBase': word_node.feature.orthBase,  # 表記基本形
                'goshu': word_node.feature.goshu,        # 語種 (和/漢/外/混)
            }

            # Handle None values by converting to empty strings
            for key, value in features.items():
                if value is None:
                    features[key] = ""

            return features

        except Exception as e:
            logger.warning(f"Error extracting features from word node: {e}")
            return self._get_empty_features(word_node.surface if hasattr(word_node, 'surface') else "")

    def _get_empty_features(self, surface: str) -> Dict[str, str]:
        """Return empty feature dictionary with surface form."""
        return {
            'surface': surface,
            'pos1': '', 'pos2': '', 'pos3': '', 'pos4': '',
            'cType': '', 'cForm': '', 'lemma': surface, 'lForm': '',
            'orth': '', 'orthBase': '', 'goshu': ''
        }

    def parse_text(self, text: str) -> List[Dict[str, any]]:
        """
        Parse text with fugashi and extract character positions.

        Args:
            text: Input text to parse

        Returns:
            List of dictionaries containing features and character positions
        """
        result = []
        char_pos = 0

        try:
            for word_node in self.tagger(text):
                surface = word_node.surface
                features = self.extract_full_features(word_node)

                # Find character position in original text
                start_pos = text.find(surface, char_pos)
                if start_pos == -1:
                    # Fallback: assume consecutive positioning
                    start_pos = char_pos

                end_pos = start_pos + len(surface)

                result.append({
                    'surface': surface,
                    'start': start_pos,
                    'end': end_pos,
                    'features': features
                })

                char_pos = end_pos

        except Exception as e:
            logger.error(f"Error parsing text with fugashi: {e}")

        return result

    def align_with_spacy_tokens(self, text: str, spacy_tokens) -> List[Dict]:
        """
        Align fugashi tokens with spaCy tokens using character offsets.

        Args:
            text: Original input text
            spacy_tokens: List of spaCy token objects

        Returns:
            List of alignment results with confidence scores
        """
        fugashi_tokens = self.parse_text(text)
        alignments = []

        for spacy_token in spacy_tokens:
            spacy_start = spacy_token.idx
            spacy_end = spacy_token.idx + len(spacy_token.text)

            best_match = None
            best_confidence = 0.0

            # Find best overlapping fugashi token
            for fugashi_token in fugashi_tokens:
                overlap = self._calculate_overlap(
                    spacy_start, spacy_end,
                    fugashi_token['start'], fugashi_token['end']
                )

                if overlap > best_confidence:
                    best_confidence = overlap
                    best_match = fugashi_token

            alignment = {
                'spacy_token': spacy_token,
                'fugashi_token': best_match,
                'confidence': best_confidence,
                'aligned': best_confidence > 0.5  # Threshold for successful alignment
            }

            alignments.append(alignment)

        return alignments

    def _calculate_overlap(self, start1: int, end1: int, start2: int, end2: int) -> float:
        """
        Calculate overlap ratio between two character ranges.

        Args:
            start1, end1: First range
            start2, end2: Second range

        Returns:
            Overlap ratio (0.0 to 1.0)
        """
        if end1 <= start2 or end2 <= start1:
            return 0.0  # No overlap

        overlap_start = max(start1, start2)
        overlap_end = min(end1, end2)
        overlap_length = overlap_end - overlap_start

        total_length = max(end1 - start1, end2 - start2)

        return overlap_length / total_length if total_length > 0 else 0.0

    def enrich_spacy_doc(self, doc, text: str):
        """
        Add UniDic features to spaCy tokens via extensions.

        Args:
            doc: spaCy document object
            text: Original input text
        """
        try:
            # Get token alignments
            alignments = self.align_with_spacy_tokens(text, doc)

            # Apply UniDic features to spaCy tokens
            for alignment in alignments:
                token = alignment['spacy_token']
                fugashi_token = alignment['fugashi_token']
                confidence = alignment['confidence']

                # Set alignment confidence
                token._.alignment_confidence = confidence

                if fugashi_token and alignment['aligned']:
                    features = fugashi_token['features']

                    # Set UniDic features on token extensions
                    token._.unidic_surface = features.get('surface', '')
                    token._.unidic_lemma = features.get('lemma', '')
                    token._.unidic_lform = features.get('lForm', '')
                    token._.unidic_pos1 = features.get('pos1', '')
                    token._.unidic_pos2 = features.get('pos2', '')
                    token._.unidic_pos3 = features.get('pos3', '')
                    token._.unidic_pos4 = features.get('pos4', '')
                    token._.unidic_goshu = features.get('goshu', '')
                    token._.unidic_orth = features.get('orth', '')
                    token._.unidic_ctype = features.get('cType', '')
                    token._.unidic_cform = features.get('cForm', '')
                    token._.unidic_orthbase = features.get('orthBase', '')

                    # Store full entry for debugging
                    token._.unidic_entries = [features]
                else:
                    # No alignment found - set empty values
                    self._set_empty_unidic_features(token)

            logger.debug(f"Enriched {len(alignments)} tokens with UniDic features")

        except Exception as e:
            logger.error(f"Error enriching spaCy doc: {e}")
            # Set empty features for all tokens on error
            for token in doc:
                self._set_empty_unidic_features(token)

    def _set_empty_unidic_features(self, token):
        """Set empty UniDic features on a token."""
        token._.unidic_surface = ""
        token._.unidic_lemma = ""
        token._.unidic_lform = ""
        token._.unidic_pos1 = ""
        token._.unidic_pos2 = ""
        token._.unidic_pos3 = ""
        token._.unidic_pos4 = ""
        token._.unidic_goshu = ""
        token._.unidic_orth = ""
        token._.unidic_ctype = ""
        token._.unidic_cform = ""
        token._.unidic_orthbase = ""
        token._.unidic_entries = []
        token._.alignment_confidence = 0.0
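The alignment confidence above is simply the character-range overlap divided by the longer of the two spans. A standalone check of that arithmetic (not part of the diff; values invented, helper name hypothetical):

def overlap_ratio(start1: int, end1: int, start2: int, end2: int) -> float:
    # Same rule as UniDicEnricher._calculate_overlap: shared characters / longer span.
    if end1 <= start2 or end2 <= start1:
        return 0.0
    shared = min(end1, end2) - max(start1, start2)
    longest = max(end1 - start1, end2 - start2)
    return shared / longest if longest > 0 else 0.0

print(overlap_ratio(0, 3, 0, 3))  # 1.0  -> identical spans, counted as aligned (> 0.5)
print(overlap_ratio(0, 3, 2, 6))  # 0.25 -> weak overlap, treated as unaligned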
text_analyzer/unidic_extensions.py
ADDED
@@ -0,0 +1,25 @@
"""
spaCy token extensions for UniDic morphological features.
This module defines custom token extensions to store UniDic analysis results.
"""

from spacy.tokens import Token

# Comprehensive UniDic feature extensions
Token.set_extension("unidic_entries", default=[], force=True)
Token.set_extension("unidic_lemma", default=None, force=True)
Token.set_extension("unidic_lform", default=None, force=True)
Token.set_extension("unidic_pos1", default=None, force=True)
Token.set_extension("unidic_pos2", default=None, force=True)
Token.set_extension("unidic_pos3", default=None, force=True)
Token.set_extension("unidic_sublemma", default=None, force=True)
Token.set_extension("unidic_goshu", default=None, force=True)
Token.set_extension("unidic_orth", default=None, force=True)
Token.set_extension("alignment_confidence", default=1.0, force=True)

# Additional extensions for diagnostic tracking
Token.set_extension("unidic_surface", default=None, force=True)
Token.set_extension("unidic_pos4", default=None, force=True)
Token.set_extension("unidic_ctype", default=None, force=True)
Token.set_extension("unidic_cform", default=None, force=True)
Token.set_extension("unidic_orthbase", default=None, force=True)
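Once this module is imported, the registered attributes are readable on any spaCy token through the underscore namespace, returning the declared defaults until UniDicEnricher fills them in. A small sketch (not part of the diff), assuming a blank Japanese pipeline can be built in the current environment:

import spacy
import text_analyzer.unidic_extensions  # noqa: F401  (importing registers the extensions)

nlp = spacy.blank("ja")
doc = nlp("犬が走る")
print(doc[0]._.unidic_lemma)           # None until UniDicEnricher.enrich_spacy_doc runs
print(doc[0]._.alignment_confidence)   # 1.0, the declared default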
uv.lock
CHANGED
|
@@ -27,6 +27,24 @@ wheels = [
|
|
| 27 |
{ url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
|
| 28 |
]
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
[[package]]
|
| 31 |
name = "attrs"
|
| 32 |
version = "25.3.0"
|
|
@@ -88,6 +106,48 @@ wheels = [
|
|
| 88 |
{ url = "https://files.pythonhosted.org/packages/4f/52/34c6cf5bb9285074dc3531c437b3919e825d976fde097a7a73f79e726d03/certifi-2025.7.14-py3-none-any.whl", hash = "sha256:6b31f564a415d79ee77df69d757bb49a5bb53bd9f756cbbe24394ffd6fc1f4b2", size = 162722, upload-time = "2025-07-14T03:29:26.863Z" },
|
| 89 |
]
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
[[package]]
|
| 92 |
name = "charset-normalizer"
|
| 93 |
version = "3.4.2"
|
|
@@ -153,6 +213,18 @@ wheels = [
|
|
| 153 |
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
|
| 154 |
]
|
| 155 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
[[package]]
|
| 157 |
name = "confection"
|
| 158 |
version = "0.1.5"
|
|
@@ -216,6 +288,32 @@ wheels = [
|
|
| 216 |
{ url = "https://files.pythonhosted.org/packages/74/65/c162fbac63e867a055240b6600b92ef96c0eb7a1895312ac53c4be93d056/cymem-2.0.11-cp313-cp313-win_amd64.whl", hash = "sha256:25da111adf425c29af0cfd9fecfec1c71c8d82e2244a85166830a0817a66ada7", size = 39090, upload-time = "2025-01-16T21:50:24.239Z" },
|
| 217 |
]
|
| 218 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
[[package]]
|
| 220 |
name = "en-core-web-md"
|
| 221 |
version = "3.7.0"
|
|
@@ -248,6 +346,15 @@ requires-dist = [
|
|
| 248 |
{ name = "spacy-curated-transformers", specifier = ">=0.2.0,<0.3.0" },
|
| 249 |
]
|
| 250 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
[[package]]
|
| 252 |
name = "filelock"
|
| 253 |
version = "3.18.0"
|
|
@@ -266,6 +373,26 @@ wheels = [
|
|
| 266 |
{ url = "https://files.pythonhosted.org/packages/2f/e0/014d5d9d7a4564cf1c40b5039bc882db69fd881111e03ab3657ac0b218e2/fsspec-2025.7.0-py3-none-any.whl", hash = "sha256:8b012e39f63c7d5f10474de957f3ab793b47b45ae7d39f2fb735f8bbe25c0e21", size = 199597, upload-time = "2025-07-15T16:05:19.529Z" },
|
| 267 |
]
|
| 268 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
[[package]]
|
| 270 |
name = "gitdb"
|
| 271 |
version = "4.0.12"
|
|
@@ -351,6 +478,63 @@ wheels = [
|
|
| 351 |
{ url = "https://files.pythonhosted.org/packages/ce/ff/3b59672c47c6284e8005b42e84ceba13864aa0f39f067c973d1af02f5d91/InquirerPy-0.3.4-py3-none-any.whl", hash = "sha256:c65fdfbac1fa00e3ee4fb10679f4d3ed7a012abf4833910e63c295827fe2a7d4", size = 67677, upload-time = "2022-06-27T23:11:17.723Z" },
|
| 352 |
]
|
| 353 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 354 |
[[package]]
|
| 355 |
name = "ja-core-news-md"
|
| 356 |
version = "3.7.0"
|
|
@@ -393,6 +577,18 @@ requires-dist = [
|
|
| 393 |
{ name = "sudachipy", specifier = ">=0.5.2,!=0.6.1" },
|
| 394 |
]
|
| 395 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 396 |
[[package]]
|
| 397 |
name = "jinja2"
|
| 398 |
version = "3.1.6"
|
|
@@ -432,6 +628,36 @@ wheels = [
|
|
| 432 |
{ url = "https://files.pythonhosted.org/packages/01/0e/b27cdbaccf30b890c40ed1da9fd4a3593a5cf94dae54fb34f8a4b74fcd3f/jsonschema_specifications-2025.4.1-py3-none-any.whl", hash = "sha256:4653bffbd6584f7de83a67e0d620ef16900b390ddc7939d56684d6c81e33f1af", size = 18437, upload-time = "2025-04-23T12:34:05.422Z" },
|
| 433 |
]
|
| 434 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 435 |
[[package]]
|
| 436 |
name = "langcodes"
|
| 437 |
version = "3.5.0"
|
|
@@ -539,6 +765,18 @@ wheels = [
|
|
| 539 |
{ url = "https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739, upload-time = "2024-10-18T15:21:42.784Z" },
|
| 540 |
]
|
| 541 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 542 |
[[package]]
|
| 543 |
name = "mdurl"
|
| 544 |
version = "0.1.2"
|
|
@@ -588,6 +826,15 @@ wheels = [
|
|
| 588 |
{ url = "https://files.pythonhosted.org/packages/c0/15/278693412221859a0159719878e51a79812a189edceef2fe325160a8e661/narwhals-1.47.1-py3-none-any.whl", hash = "sha256:b9f2b2557aba054231361a00f6fcabc5017e338575e810e82155eb34e38ace93", size = 375506, upload-time = "2025-07-17T18:23:02.492Z" },
|
| 589 |
]
|
| 590 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 591 |
[[package]]
|
| 592 |
name = "networkx"
|
| 593 |
version = "3.5"
|
|
@@ -789,6 +1036,27 @@ wheels = [
|
|
| 789 |
{ url = "https://files.pythonhosted.org/packages/d5/f9/07086f5b0f2a19872554abeea7658200824f5835c58a106fa8f2ae96a46c/pandas-2.3.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5db9637dbc24b631ff3707269ae4559bce4b7fd75c1c4d7e13f40edc42df4444", size = 13189044, upload-time = "2025-07-07T19:19:39.999Z" },
|
| 790 |
]
|
| 791 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 792 |
[[package]]
|
| 793 |
name = "pfzy"
|
| 794 |
version = "0.3.4"
|
|
@@ -864,6 +1132,24 @@ wheels = [
|
|
| 864 |
{ url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835, upload-time = "2025-07-01T09:15:50.399Z" },
|
| 865 |
]
|
| 866 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 867 |
[[package]]
|
| 868 |
name = "plotly"
|
| 869 |
version = "6.2.0"
|
|
@@ -929,6 +1215,39 @@ wheels = [
|
|
| 929 |
{ url = "https://files.pythonhosted.org/packages/f7/af/ab3c51ab7507a7325e98ffe691d9495ee3d3aa5f589afad65ec920d39821/protobuf-6.31.1-py3-none-any.whl", hash = "sha256:720a6c7e6b77288b85063569baae8536671b39f15cc22037ec7045658d80489e", size = 168724, upload-time = "2025-05-28T19:25:53.926Z" },
|
| 930 |
]
|
| 931 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 932 |
[[package]]
|
| 933 |
name = "pyarrow"
|
| 934 |
version = "21.0.0"
|
|
@@ -958,6 +1277,15 @@ wheels = [
|
|
| 958 |
{ url = "https://files.pythonhosted.org/packages/e5/4e/519c1bc1876625fe6b71e9a28287c43ec2f20f73c658b9ae1d485c0c206e/pyarrow-21.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:222c39e2c70113543982c6b34f3077962b44fca38c0bd9e68bb6781534425c10", size = 26371006, upload-time = "2025-07-18T00:56:56.379Z" },
|
| 959 |
]
|
| 960 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 961 |
[[package]]
|
| 962 |
name = "pydantic"
|
| 963 |
version = "2.11.7"
|
|
@@ -1058,6 +1386,22 @@ wheels = [
|
|
| 1058 |
{ url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" },
|
| 1059 |
]
|
| 1060 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1061 |
[[package]]
|
| 1062 |
name = "pyyaml"
|
| 1063 |
version = "6.0.2"
|
|
@@ -1084,6 +1428,36 @@ wheels = [
|
|
| 1084 |
{ url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446, upload-time = "2024-08-06T20:33:04.33Z" },
|
| 1085 |
]
|
| 1086 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1087 |
[[package]]
|
| 1088 |
name = "referencing"
|
| 1089 |
version = "0.36.2"
|
|
@@ -1301,9 +1675,12 @@ name = "simple-text-analyzer"
|
|
| 1301 |
version = "0.1.0"
|
| 1302 |
source = { virtual = "." }
|
| 1303 |
dependencies = [
|
|
|
|
| 1304 |
{ name = "en-core-web-md" },
|
| 1305 |
{ name = "en-core-web-trf" },
|
|
|
|
| 1306 |
{ name = "huggingface-hub", extra = ["cli"] },
|
|
|
|
| 1307 |
{ name = "ja-core-news-md" },
|
| 1308 |
{ name = "ja-core-news-trf" },
|
| 1309 |
{ name = "numpy" },
|
|
@@ -1314,13 +1691,17 @@ dependencies = [
|
|
| 1314 |
{ name = "spacy" },
|
| 1315 |
{ name = "spacy-curated-transformers" },
|
| 1316 |
{ name = "streamlit" },
|
|
|
|
| 1317 |
]
|
| 1318 |
|
| 1319 |
[package.metadata]
|
| 1320 |
requires-dist = [
|
|
|
|
| 1321 |
{ name = "en-core-web-md", url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.0/en_core_web_md-3.7.0-py3-none-any.whl" },
|
| 1322 |
{ name = "en-core-web-trf", url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.2/en_core_web_trf-3.7.2-py3-none-any.whl" },
|
|
|
|
| 1323 |
{ name = "huggingface-hub", extras = ["cli"], specifier = ">=0.33.4" },
|
|
|
|
| 1324 |
{ name = "ja-core-news-md", url = "https://github.com/explosion/spacy-models/releases/download/ja_core_news_md-3.7.0/ja_core_news_md-3.7.0-py3-none-any.whl" },
|
| 1325 |
{ name = "ja-core-news-trf", url = "https://github.com/explosion/spacy-models/releases/download/ja_core_news_trf-3.7.2/ja_core_news_trf-3.7.2-py3-none-any.whl" },
|
| 1326 |
{ name = "numpy", specifier = ">=1.24.0,<2.0" },
|
|
@@ -1331,6 +1712,7 @@ requires-dist = [
|
|
| 1331 |
{ name = "spacy", specifier = ">=3.7.0" },
|
| 1332 |
{ name = "spacy-curated-transformers", specifier = ">=0.1.0,<0.3.0" },
|
| 1333 |
{ name = "streamlit", specifier = ">=1.28.0" },
|
|
|
|
| 1334 |
]
|
| 1335 |
|
| 1336 |
[[package]]
|
|
@@ -1455,6 +1837,20 @@ wheels = [
|
|
| 1455 |
{ url = "https://files.pythonhosted.org/packages/3a/e2/745aeba88a8513017fbac2fd2f9f07b8a36065e51695f818541eb795ec0c/srsly-2.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:e73712be1634b5e1de6f81c273a7d47fe091ad3c79dc779c03d3416a5c117cee", size = 630634, upload-time = "2025-01-17T09:26:10.018Z" },
|
| 1456 |
]
|
| 1457 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1458 |
[[package]]
|
| 1459 |
name = "streamlit"
|
| 1460 |
version = "1.47.0"
|
|
@@ -1643,6 +2039,15 @@ wheels = [
|
|
| 1643 |
{ url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
|
| 1644 |
]
|
| 1645 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1646 |
[[package]]
|
| 1647 |
name = "triton"
|
| 1648 |
version = "3.3.1"
|
|
@@ -1701,6 +2106,18 @@ wheels = [
|
|
| 1701 |
{ url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
|
| 1702 |
]
|
| 1703 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1704 |
[[package]]
|
| 1705 |
name = "urllib3"
|
| 1706 |
version = "2.5.0"
|
|
@@ -1712,14 +2129,11 @@ wheels = [
|
|
| 1712 |
|
| 1713 |
[[package]]
|
| 1714 |
name = "wasabi"
|
| 1715 |
-
version = "
|
| 1716 |
source = { registry = "https://pypi.org/simple" }
|
| 1717 |
-
|
| 1718 |
-
{ name = "colorama", marker = "sys_platform == 'win32'" },
|
| 1719 |
-
]
|
| 1720 |
-
sdist = { url = "https://files.pythonhosted.org/packages/ac/f9/054e6e2f1071e963b5e746b48d1e3727470b2a490834d18ad92364929db3/wasabi-1.1.3.tar.gz", hash = "sha256:4bb3008f003809db0c3e28b4daf20906ea871a2bb43f9914197d540f4f2e0878", size = 30391, upload-time = "2024-05-31T16:56:18.99Z" }
|
| 1721 |
wheels = [
|
| 1722 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1723 |
]
|
| 1724 |
|
| 1725 |
[[package]]
|
|
|
|
| 27 |
{ url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
|
| 28 |
]
|
| 29 |
|
| 30 |
+
[[package]]
|
| 31 |
+
name = "appnope"
|
| 32 |
+
version = "0.1.4"
|
| 33 |
+
source = { registry = "https://pypi.org/simple" }
|
| 34 |
+
sdist = { url = "https://files.pythonhosted.org/packages/35/5d/752690df9ef5b76e169e68d6a129fa6d08a7100ca7f754c89495db3c6019/appnope-0.1.4.tar.gz", hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee", size = 4170, upload-time = "2024-02-06T09:43:11.258Z" }
|
| 35 |
+
wheels = [
|
| 36 |
+
{ url = "https://files.pythonhosted.org/packages/81/29/5ecc3a15d5a33e31b26c11426c45c501e439cb865d0bff96315d86443b78/appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c", size = 4321, upload-time = "2024-02-06T09:43:09.663Z" },
|
| 37 |
+
]
|
| 38 |
+
|
| 39 |
+
[[package]]
|
| 40 |
+
name = "asttokens"
|
| 41 |
+
version = "3.0.0"
|
| 42 |
+
source = { registry = "https://pypi.org/simple" }
|
| 43 |
+
sdist = { url = "https://files.pythonhosted.org/packages/4a/e7/82da0a03e7ba5141f05cce0d302e6eed121ae055e0456ca228bf693984bc/asttokens-3.0.0.tar.gz", hash = "sha256:0dcd8baa8d62b0c1d118b399b2ddba3c4aff271d0d7a9e0d4c1681c79035bbc7", size = 61978, upload-time = "2024-11-30T04:30:14.439Z" }
|
| 44 |
+
wheels = [
|
| 45 |
+
{ url = "https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2", size = 26918, upload-time = "2024-11-30T04:30:10.946Z" },
|
| 46 |
+
]
|
| 47 |
+
|
| 48 |
[[package]]
|
| 49 |
name = "attrs"
|
| 50 |
version = "25.3.0"
|
|
|
|
| 106 |
{ url = "https://files.pythonhosted.org/packages/4f/52/34c6cf5bb9285074dc3531c437b3919e825d976fde097a7a73f79e726d03/certifi-2025.7.14-py3-none-any.whl", hash = "sha256:6b31f564a415d79ee77df69d757bb49a5bb53bd9f756cbbe24394ffd6fc1f4b2", size = 162722, upload-time = "2025-07-14T03:29:26.863Z" },
|
| 107 |
]
|
| 108 |
|
| 109 |
+
[[package]]
|
| 110 |
+
name = "cffi"
|
| 111 |
+
version = "1.17.1"
|
| 112 |
+
source = { registry = "https://pypi.org/simple" }
|
| 113 |
+
dependencies = [
|
| 114 |
+
{ name = "pycparser" },
|
| 115 |
+
]
|
| 116 |
+
sdist = { url = "https://files.pythonhosted.org/packages/fc/97/c783634659c2920c3fc70419e3af40972dbaf758daa229a7d6ea6135c90d/cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824", size = 516621, upload-time = "2024-09-04T20:45:21.852Z" }
|
| 117 |
+
wheels = [
|
| 118 |
+
{ url = "https://files.pythonhosted.org/packages/5a/84/e94227139ee5fb4d600a7a4927f322e1d4aea6fdc50bd3fca8493caba23f/cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4", size = 183178, upload-time = "2024-09-04T20:44:12.232Z" },
|
| 119 |
+
{ url = "https://files.pythonhosted.org/packages/da/ee/fb72c2b48656111c4ef27f0f91da355e130a923473bf5ee75c5643d00cca/cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c", size = 178840, upload-time = "2024-09-04T20:44:13.739Z" },
|
| 120 |
+
{ url = "https://files.pythonhosted.org/packages/cc/b6/db007700f67d151abadf508cbfd6a1884f57eab90b1bb985c4c8c02b0f28/cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36", size = 454803, upload-time = "2024-09-04T20:44:15.231Z" },
|
| 121 |
+
{ url = "https://files.pythonhosted.org/packages/1a/df/f8d151540d8c200eb1c6fba8cd0dfd40904f1b0682ea705c36e6c2e97ab3/cffi-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5", size = 478850, upload-time = "2024-09-04T20:44:17.188Z" },
|
| 122 |
+
{ url = "https://files.pythonhosted.org/packages/28/c0/b31116332a547fd2677ae5b78a2ef662dfc8023d67f41b2a83f7c2aa78b1/cffi-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff", size = 485729, upload-time = "2024-09-04T20:44:18.688Z" },
|
| 123 |
+
{ url = "https://files.pythonhosted.org/packages/91/2b/9a1ddfa5c7f13cab007a2c9cc295b70fbbda7cb10a286aa6810338e60ea1/cffi-1.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99", size = 471256, upload-time = "2024-09-04T20:44:20.248Z" },
|
| 124 |
+
{ url = "https://files.pythonhosted.org/packages/b2/d5/da47df7004cb17e4955df6a43d14b3b4ae77737dff8bf7f8f333196717bf/cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93", size = 479424, upload-time = "2024-09-04T20:44:21.673Z" },
|
| 125 |
+
{ url = "https://files.pythonhosted.org/packages/0b/ac/2a28bcf513e93a219c8a4e8e125534f4f6db03e3179ba1c45e949b76212c/cffi-1.17.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3", size = 484568, upload-time = "2024-09-04T20:44:23.245Z" },
|
| 126 |
+
{ url = "https://files.pythonhosted.org/packages/d4/38/ca8a4f639065f14ae0f1d9751e70447a261f1a30fa7547a828ae08142465/cffi-1.17.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8", size = 488736, upload-time = "2024-09-04T20:44:24.757Z" },
|
| 127 |
+
{ url = "https://files.pythonhosted.org/packages/86/c5/28b2d6f799ec0bdecf44dced2ec5ed43e0eb63097b0f58c293583b406582/cffi-1.17.1-cp312-cp312-win32.whl", hash = "sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65", size = 172448, upload-time = "2024-09-04T20:44:26.208Z" },
|
| 128 |
+
{ url = "https://files.pythonhosted.org/packages/50/b9/db34c4755a7bd1cb2d1603ac3863f22bcecbd1ba29e5ee841a4bc510b294/cffi-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903", size = 181976, upload-time = "2024-09-04T20:44:27.578Z" },
|
| 129 |
+
{ url = "https://files.pythonhosted.org/packages/8d/f8/dd6c246b148639254dad4d6803eb6a54e8c85c6e11ec9df2cffa87571dbe/cffi-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e", size = 182989, upload-time = "2024-09-04T20:44:28.956Z" },
|
| 130 |
+
{ url = "https://files.pythonhosted.org/packages/8b/f1/672d303ddf17c24fc83afd712316fda78dc6fce1cd53011b839483e1ecc8/cffi-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2", size = 178802, upload-time = "2024-09-04T20:44:30.289Z" },
|
| 131 |
+
{ url = "https://files.pythonhosted.org/packages/0e/2d/eab2e858a91fdff70533cab61dcff4a1f55ec60425832ddfdc9cd36bc8af/cffi-1.17.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3", size = 454792, upload-time = "2024-09-04T20:44:32.01Z" },
|
| 132 |
+
{ url = "https://files.pythonhosted.org/packages/75/b2/fbaec7c4455c604e29388d55599b99ebcc250a60050610fadde58932b7ee/cffi-1.17.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683", size = 478893, upload-time = "2024-09-04T20:44:33.606Z" },
|
| 133 |
+
{ url = "https://files.pythonhosted.org/packages/4f/b7/6e4a2162178bf1935c336d4da8a9352cccab4d3a5d7914065490f08c0690/cffi-1.17.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5", size = 485810, upload-time = "2024-09-04T20:44:35.191Z" },
|
| 134 |
+
{ url = "https://files.pythonhosted.org/packages/c7/8a/1d0e4a9c26e54746dc08c2c6c037889124d4f59dffd853a659fa545f1b40/cffi-1.17.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4", size = 471200, upload-time = "2024-09-04T20:44:36.743Z" },
|
| 135 |
+
{ url = "https://files.pythonhosted.org/packages/26/9f/1aab65a6c0db35f43c4d1b4f580e8df53914310afc10ae0397d29d697af4/cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd", size = 479447, upload-time = "2024-09-04T20:44:38.492Z" },
|
| 136 |
+
{ url = "https://files.pythonhosted.org/packages/5f/e4/fb8b3dd8dc0e98edf1135ff067ae070bb32ef9d509d6cb0f538cd6f7483f/cffi-1.17.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed", size = 484358, upload-time = "2024-09-04T20:44:40.046Z" },
|
| 137 |
+
{ url = "https://files.pythonhosted.org/packages/f1/47/d7145bf2dc04684935d57d67dff9d6d795b2ba2796806bb109864be3a151/cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9", size = 488469, upload-time = "2024-09-04T20:44:41.616Z" },
|
| 138 |
+
{ url = "https://files.pythonhosted.org/packages/bf/ee/f94057fa6426481d663b88637a9a10e859e492c73d0384514a17d78ee205/cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d", size = 172475, upload-time = "2024-09-04T20:44:43.733Z" },
|
| 139 |
+
{ url = "https://files.pythonhosted.org/packages/7c/fc/6a8cb64e5f0324877d503c854da15d76c1e50eb722e320b15345c4d0c6de/cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a", size = 182009, upload-time = "2024-09-04T20:44:45.309Z" },
|
| 140 |
+
]
|
| 141 |
+
|
| 142 |
+
[[package]]
|
| 143 |
+
name = "chardet"
|
| 144 |
+
version = "5.2.0"
|
| 145 |
+
source = { registry = "https://pypi.org/simple" }
|
| 146 |
+
sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/f7b6ab21ec75897ed80c17d79b15951a719226b9fababf1e40ea74d69079/chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7", size = 2069618, upload-time = "2023-08-01T19:23:02.662Z" }
|
| 147 |
+
wheels = [
|
| 148 |
+
{ url = "https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970", size = 199385, upload-time = "2023-08-01T19:23:00.661Z" },
|
| 149 |
+
]
|
| 150 |
+
|
| 151 |
[[package]]
|
| 152 |
name = "charset-normalizer"
|
| 153 |
version = "3.4.2"
|
|
|
|
| 213 |
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
|
| 214 |
]
|
| 215 |
|
| 216 |
+
[[package]]
|
| 217 |
+
name = "comm"
|
| 218 |
+
version = "0.2.2"
|
| 219 |
+
source = { registry = "https://pypi.org/simple" }
|
| 220 |
+
dependencies = [
|
| 221 |
+
{ name = "traitlets" },
|
| 222 |
+
]
|
| 223 |
+
sdist = { url = "https://files.pythonhosted.org/packages/e9/a8/fb783cb0abe2b5fded9f55e5703015cdf1c9c85b3669087c538dd15a6a86/comm-0.2.2.tar.gz", hash = "sha256:3fd7a84065306e07bea1773df6eb8282de51ba82f77c72f9c85716ab11fe980e", size = 6210, upload-time = "2024-03-12T16:53:41.133Z" }
|
| 224 |
+
wheels = [
|
| 225 |
+
{ url = "https://files.pythonhosted.org/packages/e6/75/49e5bfe642f71f272236b5b2d2691cf915a7283cc0ceda56357b61daa538/comm-0.2.2-py3-none-any.whl", hash = "sha256:e6fb86cb70ff661ee8c9c14e7d36d6de3b4066f1441be4063df9c5009f0a64d3", size = 7180, upload-time = "2024-03-12T16:53:39.226Z" },
|
| 226 |
+
]
|
| 227 |
+
|
| 228 |
[[package]]
|
| 229 |
name = "confection"
|
| 230 |
version = "0.1.5"
|
|
|
|
| 288 |
{ url = "https://files.pythonhosted.org/packages/74/65/c162fbac63e867a055240b6600b92ef96c0eb7a1895312ac53c4be93d056/cymem-2.0.11-cp313-cp313-win_amd64.whl", hash = "sha256:25da111adf425c29af0cfd9fecfec1c71c8d82e2244a85166830a0817a66ada7", size = 39090, upload-time = "2025-01-16T21:50:24.239Z" },
|
| 289 |
]
|
| 290 |
|
| 291 |
+
[[package]]
|
| 292 |
+
name = "debugpy"
|
| 293 |
+
version = "1.8.15"
|
| 294 |
+
source = { registry = "https://pypi.org/simple" }
|
| 295 |
+
sdist = { url = "https://files.pythonhosted.org/packages/8c/8b/3a9a28ddb750a76eaec445c7f4d3147ea2c579a97dbd9e25d39001b92b21/debugpy-1.8.15.tar.gz", hash = "sha256:58d7a20b7773ab5ee6bdfb2e6cf622fdf1e40c9d5aef2857d85391526719ac00", size = 1643279, upload-time = "2025-07-15T16:43:29.135Z" }
|
| 296 |
+
wheels = [
|
| 297 |
+
{ url = "https://files.pythonhosted.org/packages/ab/4a/4508d256e52897f5cdfee6a6d7580974811e911c6d01321df3264508a5ac/debugpy-1.8.15-cp312-cp312-macosx_14_0_universal2.whl", hash = "sha256:3dcc7225cb317469721ab5136cda9ff9c8b6e6fb43e87c9e15d5b108b99d01ba", size = 2511197, upload-time = "2025-07-15T16:43:42.343Z" },
|
| 298 |
+
{ url = "https://files.pythonhosted.org/packages/99/8d/7f6ef1097e7fecf26b4ef72338d08e41644a41b7ee958a19f494ffcffc29/debugpy-1.8.15-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:047a493ca93c85ccede1dbbaf4e66816794bdc214213dde41a9a61e42d27f8fc", size = 4229517, upload-time = "2025-07-15T16:43:44.14Z" },
|
| 299 |
+
{ url = "https://files.pythonhosted.org/packages/3f/e8/e8c6a9aa33a9c9c6dacbf31747384f6ed2adde4de2e9693c766bdf323aa3/debugpy-1.8.15-cp312-cp312-win32.whl", hash = "sha256:b08e9b0bc260cf324c890626961dad4ffd973f7568fbf57feb3c3a65ab6b6327", size = 5276132, upload-time = "2025-07-15T16:43:45.529Z" },
|
| 300 |
+
{ url = "https://files.pythonhosted.org/packages/e9/ad/231050c6177b3476b85fcea01e565dac83607b5233d003ff067e2ee44d8f/debugpy-1.8.15-cp312-cp312-win_amd64.whl", hash = "sha256:e2a4fe357c92334272eb2845fcfcdbec3ef9f22c16cf613c388ac0887aed15fa", size = 5317645, upload-time = "2025-07-15T16:43:46.968Z" },
|
| 301 |
+
{ url = "https://files.pythonhosted.org/packages/28/70/2928aad2310726d5920b18ed9f54b9f06df5aa4c10cf9b45fa18ff0ab7e8/debugpy-1.8.15-cp313-cp313-macosx_14_0_universal2.whl", hash = "sha256:f5e01291ad7d6649aed5773256c5bba7a1a556196300232de1474c3c372592bf", size = 2495538, upload-time = "2025-07-15T16:43:48.927Z" },
|
| 302 |
+
{ url = "https://files.pythonhosted.org/packages/9e/c6/9b8ffb4ca91fac8b2877eef63c9cc0e87dd2570b1120054c272815ec4cd0/debugpy-1.8.15-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94dc0f0d00e528d915e0ce1c78e771475b2335b376c49afcc7382ee0b146bab6", size = 4221874, upload-time = "2025-07-15T16:43:50.282Z" },
|
| 303 |
+
{ url = "https://files.pythonhosted.org/packages/55/8a/9b8d59674b4bf489318c7c46a1aab58e606e583651438084b7e029bf3c43/debugpy-1.8.15-cp313-cp313-win32.whl", hash = "sha256:fcf0748d4f6e25f89dc5e013d1129ca6f26ad4da405e0723a4f704583896a709", size = 5275949, upload-time = "2025-07-15T16:43:52.079Z" },
|
| 304 |
+
{ url = "https://files.pythonhosted.org/packages/72/83/9e58e6fdfa8710a5e6ec06c2401241b9ad48b71c0a7eb99570a1f1edb1d3/debugpy-1.8.15-cp313-cp313-win_amd64.whl", hash = "sha256:73c943776cb83e36baf95e8f7f8da765896fd94b05991e7bc162456d25500683", size = 5317720, upload-time = "2025-07-15T16:43:53.703Z" },
|
| 305 |
+
{ url = "https://files.pythonhosted.org/packages/07/d5/98748d9860e767a1248b5e31ffa7ce8cb7006e97bf8abbf3d891d0a8ba4e/debugpy-1.8.15-py2.py3-none-any.whl", hash = "sha256:bce2e6c5ff4f2e00b98d45e7e01a49c7b489ff6df5f12d881c67d2f1ac635f3d", size = 5282697, upload-time = "2025-07-15T16:44:07.996Z" },
|
| 306 |
+
]
|
| 307 |
+
|
| 308 |
+
[[package]]
|
| 309 |
+
name = "decorator"
|
| 310 |
+
version = "5.2.1"
|
| 311 |
+
source = { registry = "https://pypi.org/simple" }
|
| 312 |
+
sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711, upload-time = "2025-02-24T04:41:34.073Z" }
|
| 313 |
+
wheels = [
|
| 314 |
+
{ url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190, upload-time = "2025-02-24T04:41:32.565Z" },
|
| 315 |
+
]
|
| 316 |
+
|
| 317 |
[[package]]
|
| 318 |
name = "en-core-web-md"
|
| 319 |
version = "3.7.0"
|
|
|
|
| 346 |
{ name = "spacy-curated-transformers", specifier = ">=0.2.0,<0.3.0" },
|
| 347 |
]
|
| 348 |
|
| 349 |
+
[[package]]
|
| 350 |
+
name = "executing"
|
| 351 |
+
version = "2.2.0"
|
| 352 |
+
source = { registry = "https://pypi.org/simple" }
|
| 353 |
+
sdist = { url = "https://files.pythonhosted.org/packages/91/50/a9d80c47ff289c611ff12e63f7c5d13942c65d68125160cefd768c73e6e4/executing-2.2.0.tar.gz", hash = "sha256:5d108c028108fe2551d1a7b2e8b713341e2cb4fc0aa7dcf966fa4327a5226755", size = 978693, upload-time = "2025-01-22T15:41:29.403Z" }
|
| 354 |
+
wheels = [
|
| 355 |
+
{ url = "https://files.pythonhosted.org/packages/7b/8f/c4d9bafc34ad7ad5d8dc16dd1347ee0e507a52c3adb6bfa8887e1c6a26ba/executing-2.2.0-py2.py3-none-any.whl", hash = "sha256:11387150cad388d62750327a53d3339fad4888b39a6fe233c3afbb54ecffd3aa", size = 26702, upload-time = "2025-01-22T15:41:25.929Z" },
|
| 356 |
+
]
|
| 357 |
+
|
| 358 |
[[package]]
|
| 359 |
name = "filelock"
|
| 360 |
version = "3.18.0"
|
|
|
|
| 373 |
{ url = "https://files.pythonhosted.org/packages/2f/e0/014d5d9d7a4564cf1c40b5039bc882db69fd881111e03ab3657ac0b218e2/fsspec-2025.7.0-py3-none-any.whl", hash = "sha256:8b012e39f63c7d5f10474de957f3ab793b47b45ae7d39f2fb735f8bbe25c0e21", size = 199597, upload-time = "2025-07-15T16:05:19.529Z" },
|
| 374 |
]
|
| 375 |
|
| 376 |
+
[[package]]
|
| 377 |
+
name = "fugashi"
|
| 378 |
+
version = "1.5.1"
|
| 379 |
+
source = { registry = "https://pypi.org/simple" }
|
| 380 |
+
sdist = { url = "https://files.pythonhosted.org/packages/5e/09/e41bb13152e591f3dd5984be112a97927f6a1ae73ab0301f3cbd1c38db20/fugashi-1.5.1.tar.gz", hash = "sha256:3ff9b4d0e40e04d56d7ced906ae8fba6c6fa41aac46f5210de1b56d6626e7a1f", size = 339745, upload-time = "2025-06-05T10:29:49.158Z" }
|
| 381 |
+
wheels = [
|
| 382 |
+
{ url = "https://files.pythonhosted.org/packages/0f/03/cb79fcc4ec503e39e4aec9878aa4ee2038f56794f418de7e5dccc127b6c3/fugashi-1.5.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9d6e6398a1dd8d704dbd26790195455166f6f93d0fdbebf5d1913a69d15adb22", size = 562515, upload-time = "2025-06-05T10:35:16.458Z" },
|
| 383 |
+
{ url = "https://files.pythonhosted.org/packages/17/6d/cf637e80350e2127d682593ba51916c19dbea9eb7abc5f69b58c5cbbd0d6/fugashi-1.5.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a02a8e4ab7758c29d8b217c8d7b019079220846fdeb04b7e1ddd4dfdb2570b7e", size = 507454, upload-time = "2025-06-05T10:35:17.982Z" },
|
| 384 |
+
{ url = "https://files.pythonhosted.org/packages/51/a1/41eeea4f5e71615b60f0ad39037dbbd787b9376e383219a2cc48e94b3733/fugashi-1.5.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a01c97af19a378545d7600bcb10552bebb4fe70b54a66032cc78cee1be328d66", size = 503416, upload-time = "2025-06-05T10:35:19.041Z" },
|
| 385 |
+
{ url = "https://files.pythonhosted.org/packages/a6/c1/02fa1c2bcdbb661cc618d11ef23aef5ed243a8f2e680cbf7398ae913961e/fugashi-1.5.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:97906d1c7c56907b87c3fcf587a4990504784d7beecb67673c78c8dd608644c1", size = 675822, upload-time = "2025-06-05T10:54:33.357Z" },
|
| 386 |
+
{ url = "https://files.pythonhosted.org/packages/ee/be/e5723a9c3a6866c14207e7dbb6d06bc49d55ea97e1784bf1096c86f0d954/fugashi-1.5.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:823e6db97d57079da4c3fcc26f04943b894974af5a22f4762e6f6ba2ed63f212", size = 697875, upload-time = "2025-06-05T10:30:50.634Z" },
|
| 387 |
+
{ url = "https://files.pythonhosted.org/packages/f4/bc/a65acd05eca1e5583f34f215df866635a232e6345a80d965ed23d1af0718/fugashi-1.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:5afa5a2bf11d039a8e45eac0ba5c2bff54ed9ef9379cb9ad7f67c987a7f6dfc0", size = 513282, upload-time = "2025-06-05T10:29:38.667Z" },
|
| 388 |
+
{ url = "https://files.pythonhosted.org/packages/0d/2c/684cd6bb8d0a988f1d4b7e41c8eebe0385417113b2a18006c3d032df7139/fugashi-1.5.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:e6f69766af17299635fa5c2ee9fe99476482003126ee1769f565a661ebd4cfb1", size = 560845, upload-time = "2025-06-05T10:35:20.042Z" },
|
| 389 |
+
{ url = "https://files.pythonhosted.org/packages/96/c8/e8ce5efa5a7a80a5ad75770f1944c4b22694408e956b7d8a5780cda879dd/fugashi-1.5.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2055a0e0993261906f3010522ccc94b8bb9278b35a726ed58b314a5b539b9511", size = 506664, upload-time = "2025-06-05T10:35:21.015Z" },
|
| 390 |
+
{ url = "https://files.pythonhosted.org/packages/1a/5d/46a06d2ed06cccf8a553ba0c6d723bb9863b0a02ba81463a425e30eab082/fugashi-1.5.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f0f3e269bfd9ba92c64086d9e6963a0bd81a3dffb9b6eeb981f33902738b7956", size = 502687, upload-time = "2025-06-05T10:35:22.298Z" },
|
| 391 |
+
{ url = "https://files.pythonhosted.org/packages/14/89/7f90847fd65ea1ef50a070b0cb63a8fad12b18f54d95627cf4ac57af3a41/fugashi-1.5.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:53ff43069ed46bd0d5dec4140115f7883bf4a590d70f3c90a422c61260be342b", size = 672332, upload-time = "2025-06-05T10:54:34.757Z" },
|
| 392 |
+
{ url = "https://files.pythonhosted.org/packages/72/6e/b92fec651f430e258c9fd0a82b924be2fcc23d0defd74e76ad6a5bbd97f6/fugashi-1.5.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:53ce31df44b4e95904793280eda0e9895646828859801d457314efc1d535cb4f", size = 693962, upload-time = "2025-06-05T10:30:52.246Z" },
|
| 393 |
+
{ url = "https://files.pythonhosted.org/packages/84/a9/72a7c8261ddceb0fbaee8fe075d4acd9023504c8fa8cbea2cf6140892040/fugashi-1.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:adf1646103151af5c0b78f11fd01e145c506774609243935c0978606e4a96ad3", size = 513083, upload-time = "2025-06-05T10:29:30.189Z" },
|
| 394 |
+
]
|
| 395 |
+
|
| 396 |
[[package]]
|
| 397 |
name = "gitdb"
|
| 398 |
version = "4.0.12"
|
|
|
|
| 478 |
{ url = "https://files.pythonhosted.org/packages/ce/ff/3b59672c47c6284e8005b42e84ceba13864aa0f39f067c973d1af02f5d91/InquirerPy-0.3.4-py3-none-any.whl", hash = "sha256:c65fdfbac1fa00e3ee4fb10679f4d3ed7a012abf4833910e63c295827fe2a7d4", size = 67677, upload-time = "2022-06-27T23:11:17.723Z" },
|
| 479 |
]
|
| 480 |
|
| 481 |
+
[[package]]
|
| 482 |
+
name = "ipykernel"
|
| 483 |
+
version = "6.29.5"
|
| 484 |
+
source = { registry = "https://pypi.org/simple" }
|
| 485 |
+
dependencies = [
|
| 486 |
+
{ name = "appnope", marker = "sys_platform == 'darwin'" },
|
| 487 |
+
{ name = "comm" },
|
| 488 |
+
{ name = "debugpy" },
|
| 489 |
+
{ name = "ipython" },
|
| 490 |
+
{ name = "jupyter-client" },
|
| 491 |
+
{ name = "jupyter-core" },
|
| 492 |
+
{ name = "matplotlib-inline" },
|
| 493 |
+
{ name = "nest-asyncio" },
|
| 494 |
+
{ name = "packaging" },
|
| 495 |
+
{ name = "psutil" },
|
| 496 |
+
{ name = "pyzmq" },
|
| 497 |
+
{ name = "tornado" },
|
| 498 |
+
{ name = "traitlets" },
|
| 499 |
+
]
|
| 500 |
+
sdist = { url = "https://files.pythonhosted.org/packages/e9/5c/67594cb0c7055dc50814b21731c22a601101ea3b1b50a9a1b090e11f5d0f/ipykernel-6.29.5.tar.gz", hash = "sha256:f093a22c4a40f8828f8e330a9c297cb93dcab13bd9678ded6de8e5cf81c56215", size = 163367, upload-time = "2024-07-01T14:07:22.543Z" }
|
| 501 |
+
wheels = [
|
| 502 |
+
{ url = "https://files.pythonhosted.org/packages/94/5c/368ae6c01c7628438358e6d337c19b05425727fbb221d2a3c4303c372f42/ipykernel-6.29.5-py3-none-any.whl", hash = "sha256:afdb66ba5aa354b09b91379bac28ae4afebbb30e8b39510c9690afb7a10421b5", size = 117173, upload-time = "2024-07-01T14:07:19.603Z" },
|
| 503 |
+
]
|
| 504 |
+
|
| 505 |
+
[[package]]
|
| 506 |
+
name = "ipython"
|
| 507 |
+
version = "9.4.0"
|
| 508 |
+
source = { registry = "https://pypi.org/simple" }
|
| 509 |
+
dependencies = [
|
| 510 |
+
{ name = "colorama", marker = "sys_platform == 'win32'" },
|
| 511 |
+
{ name = "decorator" },
|
| 512 |
+
{ name = "ipython-pygments-lexers" },
|
| 513 |
+
{ name = "jedi" },
|
| 514 |
+
{ name = "matplotlib-inline" },
|
| 515 |
+
{ name = "pexpect", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
|
| 516 |
+
{ name = "prompt-toolkit" },
|
| 517 |
+
{ name = "pygments" },
|
| 518 |
+
{ name = "stack-data" },
|
| 519 |
+
{ name = "traitlets" },
|
| 520 |
+
]
|
| 521 |
+
sdist = { url = "https://files.pythonhosted.org/packages/54/80/406f9e3bde1c1fd9bf5a0be9d090f8ae623e401b7670d8f6fdf2ab679891/ipython-9.4.0.tar.gz", hash = "sha256:c033c6d4e7914c3d9768aabe76bbe87ba1dc66a92a05db6bfa1125d81f2ee270", size = 4385338, upload-time = "2025-07-01T11:11:30.606Z" }
|
| 522 |
+
wheels = [
|
| 523 |
+
{ url = "https://files.pythonhosted.org/packages/63/f8/0031ee2b906a15a33d6bfc12dd09c3dfa966b3cb5b284ecfb7549e6ac3c4/ipython-9.4.0-py3-none-any.whl", hash = "sha256:25850f025a446d9b359e8d296ba175a36aedd32e83ca9b5060430fe16801f066", size = 611021, upload-time = "2025-07-01T11:11:27.85Z" },
|
| 524 |
+
]
|
| 525 |
+
|
| 526 |
+
[[package]]
|
| 527 |
+
name = "ipython-pygments-lexers"
|
| 528 |
+
version = "1.1.1"
|
| 529 |
+
source = { registry = "https://pypi.org/simple" }
|
| 530 |
+
dependencies = [
|
| 531 |
+
{ name = "pygments" },
|
| 532 |
+
]
|
| 533 |
+
sdist = { url = "https://files.pythonhosted.org/packages/ef/4c/5dd1d8af08107f88c7f741ead7a40854b8ac24ddf9ae850afbcf698aa552/ipython_pygments_lexers-1.1.1.tar.gz", hash = "sha256:09c0138009e56b6854f9535736f4171d855c8c08a563a0dcd8022f78355c7e81", size = 8393, upload-time = "2025-01-17T11:24:34.505Z" }
|
| 534 |
+
wheels = [
|
| 535 |
+
{ url = "https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c", size = 8074, upload-time = "2025-01-17T11:24:33.271Z" },
|
| 536 |
+
]
|
| 537 |
+
|
| 538 |
[[package]]
|
| 539 |
name = "ja-core-news-md"
|
| 540 |
version = "3.7.0"
|
|
|
|
| 577 |
{ name = "sudachipy", specifier = ">=0.5.2,!=0.6.1" },
|
| 578 |
]
|
| 579 |
|
| 580 |
+
[[package]]
|
| 581 |
+
name = "jedi"
|
| 582 |
+
version = "0.19.2"
|
| 583 |
+
source = { registry = "https://pypi.org/simple" }
|
| 584 |
+
dependencies = [
|
| 585 |
+
{ name = "parso" },
|
| 586 |
+
]
|
| 587 |
+
sdist = { url = "https://files.pythonhosted.org/packages/72/3a/79a912fbd4d8dd6fbb02bf69afd3bb72cf0c729bb3063c6f4498603db17a/jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0", size = 1231287, upload-time = "2024-11-11T01:41:42.873Z" }
|
| 588 |
+
wheels = [
|
| 589 |
+
{ url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278, upload-time = "2024-11-11T01:41:40.175Z" },
|
| 590 |
+
]
|
| 591 |
+
|
| 592 |
[[package]]
|
| 593 |
name = "jinja2"
|
| 594 |
version = "3.1.6"
|
|
|
|
| 628 |
{ url = "https://files.pythonhosted.org/packages/01/0e/b27cdbaccf30b890c40ed1da9fd4a3593a5cf94dae54fb34f8a4b74fcd3f/jsonschema_specifications-2025.4.1-py3-none-any.whl", hash = "sha256:4653bffbd6584f7de83a67e0d620ef16900b390ddc7939d56684d6c81e33f1af", size = 18437, upload-time = "2025-04-23T12:34:05.422Z" },
|
| 629 |
]
|
| 630 |
|
| 631 |
+
[[package]]
|
| 632 |
+
name = "jupyter-client"
|
| 633 |
+
version = "8.6.3"
|
| 634 |
+
source = { registry = "https://pypi.org/simple" }
|
| 635 |
+
dependencies = [
|
| 636 |
+
{ name = "jupyter-core" },
|
| 637 |
+
{ name = "python-dateutil" },
|
| 638 |
+
{ name = "pyzmq" },
|
| 639 |
+
{ name = "tornado" },
|
| 640 |
+
{ name = "traitlets" },
|
| 641 |
+
]
|
| 642 |
+
sdist = { url = "https://files.pythonhosted.org/packages/71/22/bf9f12fdaeae18019a468b68952a60fe6dbab5d67cd2a103cac7659b41ca/jupyter_client-8.6.3.tar.gz", hash = "sha256:35b3a0947c4a6e9d589eb97d7d4cd5e90f910ee73101611f01283732bd6d9419", size = 342019, upload-time = "2024-09-17T10:44:17.613Z" }
|
| 643 |
+
wheels = [
|
| 644 |
+
{ url = "https://files.pythonhosted.org/packages/11/85/b0394e0b6fcccd2c1eeefc230978a6f8cb0c5df1e4cd3e7625735a0d7d1e/jupyter_client-8.6.3-py3-none-any.whl", hash = "sha256:e8a19cc986cc45905ac3362915f410f3af85424b4c0905e94fa5f2cb08e8f23f", size = 106105, upload-time = "2024-09-17T10:44:15.218Z" },
|
| 645 |
+
]
|
| 646 |
+
|
| 647 |
+
[[package]]
|
| 648 |
+
name = "jupyter-core"
|
| 649 |
+
version = "5.8.1"
|
| 650 |
+
source = { registry = "https://pypi.org/simple" }
|
| 651 |
+
dependencies = [
|
| 652 |
+
{ name = "platformdirs" },
|
| 653 |
+
{ name = "pywin32", marker = "platform_python_implementation != 'PyPy' and sys_platform == 'win32'" },
|
| 654 |
+
{ name = "traitlets" },
|
| 655 |
+
]
|
| 656 |
+
sdist = { url = "https://files.pythonhosted.org/packages/99/1b/72906d554acfeb588332eaaa6f61577705e9ec752ddb486f302dafa292d9/jupyter_core-5.8.1.tar.gz", hash = "sha256:0a5f9706f70e64786b75acba995988915ebd4601c8a52e534a40b51c95f59941", size = 88923, upload-time = "2025-05-27T07:38:16.655Z" }
|
| 657 |
+
wheels = [
|
| 658 |
+
{ url = "https://files.pythonhosted.org/packages/2f/57/6bffd4b20b88da3800c5d691e0337761576ee688eb01299eae865689d2df/jupyter_core-5.8.1-py3-none-any.whl", hash = "sha256:c28d268fc90fb53f1338ded2eb410704c5449a358406e8a948b75706e24863d0", size = 28880, upload-time = "2025-05-27T07:38:15.137Z" },
|
| 659 |
+
]
|
| 660 |
+
|
| 661 |
[[package]]
|
| 662 |
name = "langcodes"
|
| 663 |
version = "3.5.0"
|
|
|
|
| 765 |
{ url = "https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739, upload-time = "2024-10-18T15:21:42.784Z" },
|
| 766 |
]
|
| 767 |
|
| 768 |
+
[[package]]
|
| 769 |
+
name = "matplotlib-inline"
|
| 770 |
+
version = "0.1.7"
|
| 771 |
+
source = { registry = "https://pypi.org/simple" }
|
| 772 |
+
dependencies = [
|
| 773 |
+
{ name = "traitlets" },
|
| 774 |
+
]
|
| 775 |
+
sdist = { url = "https://files.pythonhosted.org/packages/99/5b/a36a337438a14116b16480db471ad061c36c3694df7c2084a0da7ba538b7/matplotlib_inline-0.1.7.tar.gz", hash = "sha256:8423b23ec666be3d16e16b60bdd8ac4e86e840ebd1dd11a30b9f117f2fa0ab90", size = 8159, upload-time = "2024-04-15T13:44:44.803Z" }
|
| 776 |
+
wheels = [
|
| 777 |
+
{ url = "https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca", size = 9899, upload-time = "2024-04-15T13:44:43.265Z" },
|
| 778 |
+
]
|
| 779 |
+
|
| 780 |
[[package]]
|
| 781 |
name = "mdurl"
|
| 782 |
version = "0.1.2"
|
|
|
|
| 826 |
{ url = "https://files.pythonhosted.org/packages/c0/15/278693412221859a0159719878e51a79812a189edceef2fe325160a8e661/narwhals-1.47.1-py3-none-any.whl", hash = "sha256:b9f2b2557aba054231361a00f6fcabc5017e338575e810e82155eb34e38ace93", size = 375506, upload-time = "2025-07-17T18:23:02.492Z" },
|
| 827 |
]
|
| 828 |
|
| 829 |
+
[[package]]
|
| 830 |
+
name = "nest-asyncio"
|
| 831 |
+
version = "1.6.0"
|
| 832 |
+
source = { registry = "https://pypi.org/simple" }
|
| 833 |
+
sdist = { url = "https://files.pythonhosted.org/packages/83/f8/51569ac65d696c8ecbee95938f89d4abf00f47d58d48f6fbabfe8f0baefe/nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe", size = 7418, upload-time = "2024-01-21T14:25:19.227Z" }
|
| 834 |
+
wheels = [
|
| 835 |
+
{ url = "https://files.pythonhosted.org/packages/a0/c4/c2971a3ba4c6103a3d10c4b0f24f461ddc027f0f09763220cf35ca1401b3/nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c", size = 5195, upload-time = "2024-01-21T14:25:17.223Z" },
|
| 836 |
+
]
|
| 837 |
+
|
| 838 |
[[package]]
|
| 839 |
name = "networkx"
|
| 840 |
version = "3.5"
|
|
|
|
| 1036 |
{ url = "https://files.pythonhosted.org/packages/d5/f9/07086f5b0f2a19872554abeea7658200824f5835c58a106fa8f2ae96a46c/pandas-2.3.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5db9637dbc24b631ff3707269ae4559bce4b7fd75c1c4d7e13f40edc42df4444", size = 13189044, upload-time = "2025-07-07T19:19:39.999Z" },
|
| 1037 |
]
|
| 1038 |
|
| 1039 |
+
[[package]]
|
| 1040 |
+
name = "parso"
|
| 1041 |
+
version = "0.8.4"
|
| 1042 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1043 |
+
sdist = { url = "https://files.pythonhosted.org/packages/66/94/68e2e17afaa9169cf6412ab0f28623903be73d1b32e208d9e8e541bb086d/parso-0.8.4.tar.gz", hash = "sha256:eb3a7b58240fb99099a345571deecc0f9540ea5f4dd2fe14c2a99d6b281ab92d", size = 400609, upload-time = "2024-04-05T09:43:55.897Z" }
|
| 1044 |
+
wheels = [
|
| 1045 |
+
{ url = "https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18", size = 103650, upload-time = "2024-04-05T09:43:53.299Z" },
|
| 1046 |
+
]
|
| 1047 |
+
|
| 1048 |
+
[[package]]
|
| 1049 |
+
name = "pexpect"
|
| 1050 |
+
version = "4.9.0"
|
| 1051 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1052 |
+
dependencies = [
|
| 1053 |
+
{ name = "ptyprocess" },
|
| 1054 |
+
]
|
| 1055 |
+
sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450, upload-time = "2023-11-25T09:07:26.339Z" }
|
| 1056 |
+
wheels = [
|
| 1057 |
+
{ url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772, upload-time = "2023-11-25T06:56:14.81Z" },
|
| 1058 |
+
]
|
| 1059 |
+
|
| 1060 |
[[package]]
|
| 1061 |
name = "pfzy"
|
| 1062 |
version = "0.3.4"
|
|
|
|
| 1132 |
{ url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835, upload-time = "2025-07-01T09:15:50.399Z" },
|
| 1133 |
]
|
| 1134 |
|
| 1135 |
+
[[package]]
|
| 1136 |
+
name = "plac"
|
| 1137 |
+
version = "1.4.5"
|
| 1138 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1139 |
+
sdist = { url = "https://files.pythonhosted.org/packages/23/09/26ef2d614cabdcc52a7f383d0dc7967bf46be3c9700898c594e37b710c3d/plac-1.4.5.tar.gz", hash = "sha256:5f05bf85235c017fcd76c73c8101d4ff8e96beb3dc58b9a37de49cac7de82d14", size = 38988, upload-time = "2025-04-04T14:03:25.651Z" }
|
| 1140 |
+
wheels = [
|
| 1141 |
+
{ url = "https://files.pythonhosted.org/packages/15/36/38676114a0dbee137ec366daa86603d667a07e9a52667d5ebf5c580100ba/plac-1.4.5-py2.py3-none-any.whl", hash = "sha256:87187786b4e446688b1cf5112e18fed8a23ab3b316c25fe91266a10bd1736b16", size = 22468, upload-time = "2025-04-04T14:03:24.761Z" },
|
| 1142 |
+
]
|
| 1143 |
+
|
| 1144 |
+
[[package]]
|
| 1145 |
+
name = "platformdirs"
|
| 1146 |
+
version = "4.3.8"
|
| 1147 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1148 |
+
sdist = { url = "https://files.pythonhosted.org/packages/fe/8b/3c73abc9c759ecd3f1f7ceff6685840859e8070c4d947c93fae71f6a0bf2/platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc", size = 21362, upload-time = "2025-05-07T22:47:42.121Z" }
|
| 1149 |
+
wheels = [
|
| 1150 |
+
{ url = "https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4", size = 18567, upload-time = "2025-05-07T22:47:40.376Z" },
|
| 1151 |
+
]
|
| 1152 |
+
|
| 1153 |
[[package]]
|
| 1154 |
name = "plotly"
|
| 1155 |
version = "6.2.0"
|
|
|
|
| 1215 |
{ url = "https://files.pythonhosted.org/packages/f7/af/ab3c51ab7507a7325e98ffe691d9495ee3d3aa5f589afad65ec920d39821/protobuf-6.31.1-py3-none-any.whl", hash = "sha256:720a6c7e6b77288b85063569baae8536671b39f15cc22037ec7045658d80489e", size = 168724, upload-time = "2025-05-28T19:25:53.926Z" },
|
| 1216 |
]
|
| 1217 |
|
| 1218 |
+
[[package]]
|
| 1219 |
+
name = "psutil"
|
| 1220 |
+
version = "7.0.0"
|
| 1221 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1222 |
+
sdist = { url = "https://files.pythonhosted.org/packages/2a/80/336820c1ad9286a4ded7e845b2eccfcb27851ab8ac6abece774a6ff4d3de/psutil-7.0.0.tar.gz", hash = "sha256:7be9c3eba38beccb6495ea33afd982a44074b78f28c434a1f51cc07fd315c456", size = 497003, upload-time = "2025-02-13T21:54:07.946Z" }
|
| 1223 |
+
wheels = [
|
| 1224 |
+
{ url = "https://files.pythonhosted.org/packages/ed/e6/2d26234410f8b8abdbf891c9da62bee396583f713fb9f3325a4760875d22/psutil-7.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:101d71dc322e3cffd7cea0650b09b3d08b8e7c4109dd6809fe452dfd00e58b25", size = 238051, upload-time = "2025-02-13T21:54:12.36Z" },
|
| 1225 |
+
{ url = "https://files.pythonhosted.org/packages/04/8b/30f930733afe425e3cbfc0e1468a30a18942350c1a8816acfade80c005c4/psutil-7.0.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:39db632f6bb862eeccf56660871433e111b6ea58f2caea825571951d4b6aa3da", size = 239535, upload-time = "2025-02-13T21:54:16.07Z" },
|
| 1226 |
+
{ url = "https://files.pythonhosted.org/packages/2a/ed/d362e84620dd22876b55389248e522338ed1bf134a5edd3b8231d7207f6d/psutil-7.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fcee592b4c6f146991ca55919ea3d1f8926497a713ed7faaf8225e174581e91", size = 275004, upload-time = "2025-02-13T21:54:18.662Z" },
|
| 1227 |
+
{ url = "https://files.pythonhosted.org/packages/bf/b9/b0eb3f3cbcb734d930fdf839431606844a825b23eaf9a6ab371edac8162c/psutil-7.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b1388a4f6875d7e2aff5c4ca1cc16c545ed41dd8bb596cefea80111db353a34", size = 277986, upload-time = "2025-02-13T21:54:21.811Z" },
|
| 1228 |
+
{ url = "https://files.pythonhosted.org/packages/eb/a2/709e0fe2f093556c17fbafda93ac032257242cabcc7ff3369e2cb76a97aa/psutil-7.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5f098451abc2828f7dc6b58d44b532b22f2088f4999a937557b603ce72b1993", size = 279544, upload-time = "2025-02-13T21:54:24.68Z" },
|
| 1229 |
+
{ url = "https://files.pythonhosted.org/packages/50/e6/eecf58810b9d12e6427369784efe814a1eec0f492084ce8eb8f4d89d6d61/psutil-7.0.0-cp37-abi3-win32.whl", hash = "sha256:ba3fcef7523064a6c9da440fc4d6bd07da93ac726b5733c29027d7dc95b39d99", size = 241053, upload-time = "2025-02-13T21:54:34.31Z" },
|
| 1230 |
+
{ url = "https://files.pythonhosted.org/packages/50/1b/6921afe68c74868b4c9fa424dad3be35b095e16687989ebbb50ce4fceb7c/psutil-7.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:4cf3d4eb1aa9b348dec30105c55cd9b7d4629285735a102beb4441e38db90553", size = 244885, upload-time = "2025-02-13T21:54:37.486Z" },
|
| 1231 |
+
]
|
| 1232 |
+
|
| 1233 |
+
[[package]]
|
| 1234 |
+
name = "ptyprocess"
|
| 1235 |
+
version = "0.7.0"
|
| 1236 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1237 |
+
sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762, upload-time = "2020-12-28T15:15:30.155Z" }
|
| 1238 |
+
wheels = [
|
| 1239 |
+
{ url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993, upload-time = "2020-12-28T15:15:28.35Z" },
|
| 1240 |
+
]
|
| 1241 |
+
|
| 1242 |
+
[[package]]
|
| 1243 |
+
name = "pure-eval"
|
| 1244 |
+
version = "0.2.3"
|
| 1245 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1246 |
+
sdist = { url = "https://files.pythonhosted.org/packages/cd/05/0a34433a064256a578f1783a10da6df098ceaa4a57bbeaa96a6c0352786b/pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42", size = 19752, upload-time = "2024-07-21T12:58:21.801Z" }
|
| 1247 |
+
wheels = [
|
| 1248 |
+
{ url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" },
|
| 1249 |
+
]
|
| 1250 |
+
|
| 1251 |
[[package]]
|
| 1252 |
name = "pyarrow"
|
| 1253 |
version = "21.0.0"
|
|
|
|
| 1277 |
{ url = "https://files.pythonhosted.org/packages/e5/4e/519c1bc1876625fe6b71e9a28287c43ec2f20f73c658b9ae1d485c0c206e/pyarrow-21.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:222c39e2c70113543982c6b34f3077962b44fca38c0bd9e68bb6781534425c10", size = 26371006, upload-time = "2025-07-18T00:56:56.379Z" },
|
| 1278 |
]
|
| 1279 |
|
| 1280 |
+
[[package]]
|
| 1281 |
+
name = "pycparser"
|
| 1282 |
+
version = "2.22"
|
| 1283 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1284 |
+
sdist = { url = "https://files.pythonhosted.org/packages/1d/b2/31537cf4b1ca988837256c910a668b553fceb8f069bedc4b1c826024b52c/pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6", size = 172736, upload-time = "2024-03-30T13:22:22.564Z" }
|
| 1285 |
+
wheels = [
|
| 1286 |
+
{ url = "https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc", size = 117552, upload-time = "2024-03-30T13:22:20.476Z" },
|
| 1287 |
+
]
|
| 1288 |
+
|
| 1289 |
[[package]]
|
| 1290 |
name = "pydantic"
|
| 1291 |
version = "2.11.7"
|
|
|
|
| 1386 |
{ url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" },
|
| 1387 |
]
|
| 1388 |
|
| 1389 |
+
[[package]]
|
| 1390 |
+
name = "pywin32"
|
| 1391 |
+
version = "311"
|
| 1392 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1393 |
+
wheels = [
|
| 1394 |
+
{ url = "https://files.pythonhosted.org/packages/e7/ab/01ea1943d4eba0f850c3c61e78e8dd59757ff815ff3ccd0a84de5f541f42/pywin32-311-cp312-cp312-win32.whl", hash = "sha256:750ec6e621af2b948540032557b10a2d43b0cee2ae9758c54154d711cc852d31", size = 8706543, upload-time = "2025-07-14T20:13:20.765Z" },
|
| 1395 |
+
{ url = "https://files.pythonhosted.org/packages/d1/a8/a0e8d07d4d051ec7502cd58b291ec98dcc0c3fff027caad0470b72cfcc2f/pywin32-311-cp312-cp312-win_amd64.whl", hash = "sha256:b8c095edad5c211ff31c05223658e71bf7116daa0ecf3ad85f3201ea3190d067", size = 9495040, upload-time = "2025-07-14T20:13:22.543Z" },
|
| 1396 |
+
{ url = "https://files.pythonhosted.org/packages/ba/3a/2ae996277b4b50f17d61f0603efd8253cb2d79cc7ae159468007b586396d/pywin32-311-cp312-cp312-win_arm64.whl", hash = "sha256:e286f46a9a39c4a18b319c28f59b61de793654af2f395c102b4f819e584b5852", size = 8710102, upload-time = "2025-07-14T20:13:24.682Z" },
|
| 1397 |
+
{ url = "https://files.pythonhosted.org/packages/a5/be/3fd5de0979fcb3994bfee0d65ed8ca9506a8a1260651b86174f6a86f52b3/pywin32-311-cp313-cp313-win32.whl", hash = "sha256:f95ba5a847cba10dd8c4d8fefa9f2a6cf283b8b88ed6178fa8a6c1ab16054d0d", size = 8705700, upload-time = "2025-07-14T20:13:26.471Z" },
|
| 1398 |
+
{ url = "https://files.pythonhosted.org/packages/e3/28/e0a1909523c6890208295a29e05c2adb2126364e289826c0a8bc7297bd5c/pywin32-311-cp313-cp313-win_amd64.whl", hash = "sha256:718a38f7e5b058e76aee1c56ddd06908116d35147e133427e59a3983f703a20d", size = 9494700, upload-time = "2025-07-14T20:13:28.243Z" },
|
| 1399 |
+
{ url = "https://files.pythonhosted.org/packages/04/bf/90339ac0f55726dce7d794e6d79a18a91265bdf3aa70b6b9ca52f35e022a/pywin32-311-cp313-cp313-win_arm64.whl", hash = "sha256:7b4075d959648406202d92a2310cb990fea19b535c7f4a78d3f5e10b926eeb8a", size = 8709318, upload-time = "2025-07-14T20:13:30.348Z" },
|
| 1400 |
+
{ url = "https://files.pythonhosted.org/packages/c9/31/097f2e132c4f16d99a22bfb777e0fd88bd8e1c634304e102f313af69ace5/pywin32-311-cp314-cp314-win32.whl", hash = "sha256:b7a2c10b93f8986666d0c803ee19b5990885872a7de910fc460f9b0c2fbf92ee", size = 8840714, upload-time = "2025-07-14T20:13:32.449Z" },
|
| 1401 |
+
{ url = "https://files.pythonhosted.org/packages/90/4b/07c77d8ba0e01349358082713400435347df8426208171ce297da32c313d/pywin32-311-cp314-cp314-win_amd64.whl", hash = "sha256:3aca44c046bd2ed8c90de9cb8427f581c479e594e99b5c0bb19b29c10fd6cb87", size = 9656800, upload-time = "2025-07-14T20:13:34.312Z" },
|
| 1402 |
+
{ url = "https://files.pythonhosted.org/packages/c0/d2/21af5c535501a7233e734b8af901574572da66fcc254cb35d0609c9080dd/pywin32-311-cp314-cp314-win_arm64.whl", hash = "sha256:a508e2d9025764a8270f93111a970e1d0fbfc33f4153b388bb649b7eec4f9b42", size = 8932540, upload-time = "2025-07-14T20:13:36.379Z" },
|
| 1403 |
+
]
|
| 1404 |
+
|
| 1405 |
[[package]]
|
| 1406 |
name = "pyyaml"
|
| 1407 |
version = "6.0.2"
|
|
|
|
| 1428 |
{ url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446, upload-time = "2024-08-06T20:33:04.33Z" },
|
| 1429 |
]
|
| 1430 |
|
| 1431 |
+
[[package]]
|
| 1432 |
+
name = "pyzmq"
|
| 1433 |
+
version = "27.0.0"
|
| 1434 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1435 |
+
dependencies = [
|
| 1436 |
+
{ name = "cffi", marker = "implementation_name == 'pypy'" },
|
| 1437 |
+
]
|
| 1438 |
+
sdist = { url = "https://files.pythonhosted.org/packages/f1/06/50a4e9648b3e8b992bef8eb632e457307553a89d294103213cfd47b3da69/pyzmq-27.0.0.tar.gz", hash = "sha256:b1f08eeb9ce1510e6939b6e5dcd46a17765e2333daae78ecf4606808442e52cf", size = 280478, upload-time = "2025-06-13T14:09:07.087Z" }
|
| 1439 |
+
wheels = [
|
| 1440 |
+
{ url = "https://files.pythonhosted.org/packages/93/a7/9ad68f55b8834ede477842214feba6a4c786d936c022a67625497aacf61d/pyzmq-27.0.0-cp312-abi3-macosx_10_15_universal2.whl", hash = "sha256:cbabc59dcfaac66655c040dfcb8118f133fb5dde185e5fc152628354c1598e52", size = 1305438, upload-time = "2025-06-13T14:07:31.676Z" },
|
| 1441 |
+
{ url = "https://files.pythonhosted.org/packages/ba/ee/26aa0f98665a22bc90ebe12dced1de5f3eaca05363b717f6fb229b3421b3/pyzmq-27.0.0-cp312-abi3-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:cb0ac5179cba4b2f94f1aa208fbb77b62c4c9bf24dd446278b8b602cf85fcda3", size = 895095, upload-time = "2025-06-13T14:07:33.104Z" },
|
| 1442 |
+
{ url = "https://files.pythonhosted.org/packages/cf/85/c57e7ab216ecd8aa4cc7e3b83b06cc4e9cf45c87b0afc095f10cd5ce87c1/pyzmq-27.0.0-cp312-abi3-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:53a48f0228eab6cbf69fde3aa3c03cbe04e50e623ef92ae395fce47ef8a76152", size = 651826, upload-time = "2025-06-13T14:07:34.831Z" },
|
| 1443 |
+
{ url = "https://files.pythonhosted.org/packages/69/9a/9ea7e230feda9400fb0ae0d61d7d6ddda635e718d941c44eeab22a179d34/pyzmq-27.0.0-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:111db5f395e09f7e775f759d598f43cb815fc58e0147623c4816486e1a39dc22", size = 839750, upload-time = "2025-06-13T14:07:36.553Z" },
|
| 1444 |
+
{ url = "https://files.pythonhosted.org/packages/08/66/4cebfbe71f3dfbd417011daca267539f62ed0fbc68105357b68bbb1a25b7/pyzmq-27.0.0-cp312-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:c8878011653dcdc27cc2c57e04ff96f0471e797f5c19ac3d7813a245bcb24371", size = 1641357, upload-time = "2025-06-13T14:07:38.21Z" },
|
| 1445 |
+
{ url = "https://files.pythonhosted.org/packages/ac/f6/b0f62578c08d2471c791287149cb8c2aaea414ae98c6e995c7dbe008adfb/pyzmq-27.0.0-cp312-abi3-musllinux_1_2_i686.whl", hash = "sha256:c0ed2c1f335ba55b5fdc964622254917d6b782311c50e138863eda409fbb3b6d", size = 2020281, upload-time = "2025-06-13T14:07:39.599Z" },
|
| 1446 |
+
{ url = "https://files.pythonhosted.org/packages/37/b9/4f670b15c7498495da9159edc374ec09c88a86d9cd5a47d892f69df23450/pyzmq-27.0.0-cp312-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:e918d70862d4cfd4b1c187310015646a14e1f5917922ab45b29f28f345eeb6be", size = 1877110, upload-time = "2025-06-13T14:07:41.027Z" },
|
| 1447 |
+
{ url = "https://files.pythonhosted.org/packages/66/31/9dee25c226295b740609f0d46db2fe972b23b6f5cf786360980524a3ba92/pyzmq-27.0.0-cp312-abi3-win32.whl", hash = "sha256:88b4e43cab04c3c0f0d55df3b1eef62df2b629a1a369b5289a58f6fa8b07c4f4", size = 559297, upload-time = "2025-06-13T14:07:42.533Z" },
|
| 1448 |
+
{ url = "https://files.pythonhosted.org/packages/9b/12/52da5509800f7ff2d287b2f2b4e636e7ea0f001181cba6964ff6c1537778/pyzmq-27.0.0-cp312-abi3-win_amd64.whl", hash = "sha256:dce4199bf5f648a902ce37e7b3afa286f305cd2ef7a8b6ec907470ccb6c8b371", size = 619203, upload-time = "2025-06-13T14:07:43.843Z" },
|
| 1449 |
+
{ url = "https://files.pythonhosted.org/packages/93/6d/7f2e53b19d1edb1eb4f09ec7c3a1f945ca0aac272099eab757d15699202b/pyzmq-27.0.0-cp312-abi3-win_arm64.whl", hash = "sha256:56e46bbb85d52c1072b3f809cc1ce77251d560bc036d3a312b96db1afe76db2e", size = 551927, upload-time = "2025-06-13T14:07:45.51Z" },
|
| 1450 |
+
{ url = "https://files.pythonhosted.org/packages/19/62/876b27c4ff777db4ceba1c69ea90d3c825bb4f8d5e7cd987ce5802e33c55/pyzmq-27.0.0-cp313-cp313t-macosx_10_15_universal2.whl", hash = "sha256:c36ad534c0c29b4afa088dc53543c525b23c0797e01b69fef59b1a9c0e38b688", size = 1340826, upload-time = "2025-06-13T14:07:46.881Z" },
|
| 1451 |
+
{ url = "https://files.pythonhosted.org/packages/43/69/58ef8f4f59d3bcd505260c73bee87b008850f45edca40ddaba54273c35f4/pyzmq-27.0.0-cp313-cp313t-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:67855c14173aec36395d7777aaba3cc527b393821f30143fd20b98e1ff31fd38", size = 897283, upload-time = "2025-06-13T14:07:49.562Z" },
|
| 1452 |
+
{ url = "https://files.pythonhosted.org/packages/43/15/93a0d0396700a60475ad3c5d42c5f1c308d3570bc94626b86c71ef9953e0/pyzmq-27.0.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8617c7d43cd8ccdb62aebe984bfed77ca8f036e6c3e46dd3dddda64b10f0ab7a", size = 660567, upload-time = "2025-06-13T14:07:51.364Z" },
|
| 1453 |
+
{ url = "https://files.pythonhosted.org/packages/0e/b3/fe055513e498ca32f64509abae19b9c9eb4d7c829e02bd8997dd51b029eb/pyzmq-27.0.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:67bfbcbd0a04c575e8103a6061d03e393d9f80ffdb9beb3189261e9e9bc5d5e9", size = 847681, upload-time = "2025-06-13T14:07:52.77Z" },
|
| 1454 |
+
{ url = "https://files.pythonhosted.org/packages/b6/4f/ff15300b00b5b602191f3df06bbc8dd4164e805fdd65bb77ffbb9c5facdc/pyzmq-27.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:5cd11d46d7b7e5958121b3eaf4cd8638eff3a720ec527692132f05a57f14341d", size = 1650148, upload-time = "2025-06-13T14:07:54.178Z" },
|
| 1455 |
+
{ url = "https://files.pythonhosted.org/packages/c4/6f/84bdfff2a224a6f26a24249a342e5906993c50b0761e311e81b39aef52a7/pyzmq-27.0.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:b801c2e40c5aa6072c2f4876de8dccd100af6d9918d4d0d7aa54a1d982fd4f44", size = 2023768, upload-time = "2025-06-13T14:07:55.714Z" },
|
| 1456 |
+
{ url = "https://files.pythonhosted.org/packages/64/39/dc2db178c26a42228c5ac94a9cc595030458aa64c8d796a7727947afbf55/pyzmq-27.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:20d5cb29e8c5f76a127c75b6e7a77e846bc4b655c373baa098c26a61b7ecd0ef", size = 1885199, upload-time = "2025-06-13T14:07:57.166Z" },
|
| 1457 |
+
{ url = "https://files.pythonhosted.org/packages/c7/21/dae7b06a1f8cdee5d8e7a63d99c5d129c401acc40410bef2cbf42025e26f/pyzmq-27.0.0-cp313-cp313t-win32.whl", hash = "sha256:a20528da85c7ac7a19b7384e8c3f8fa707841fd85afc4ed56eda59d93e3d98ad", size = 575439, upload-time = "2025-06-13T14:07:58.959Z" },
|
| 1458 |
+
{ url = "https://files.pythonhosted.org/packages/eb/bc/1709dc55f0970cf4cb8259e435e6773f9946f41a045c2cb90e870b7072da/pyzmq-27.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:d8229f2efece6a660ee211d74d91dbc2a76b95544d46c74c615e491900dc107f", size = 639933, upload-time = "2025-06-13T14:08:00.777Z" },
|
| 1459 |
+
]
|
| 1460 |
+
|
| 1461 |
[[package]]
|
| 1462 |
name = "referencing"
|
| 1463 |
version = "0.36.2"
|
|
|
|
| 1675 |
version = "0.1.0"
|
| 1676 |
source = { virtual = "." }
|
| 1677 |
dependencies = [
|
| 1678 |
+
{ name = "chardet" },
|
| 1679 |
{ name = "en-core-web-md" },
|
| 1680 |
{ name = "en-core-web-trf" },
|
| 1681 |
+
{ name = "fugashi" },
|
| 1682 |
{ name = "huggingface-hub", extra = ["cli"] },
|
| 1683 |
+
{ name = "ipykernel" },
|
| 1684 |
{ name = "ja-core-news-md" },
|
| 1685 |
{ name = "ja-core-news-trf" },
|
| 1686 |
{ name = "numpy" },
|
|
|
|
| 1691 |
{ name = "spacy" },
|
| 1692 |
{ name = "spacy-curated-transformers" },
|
| 1693 |
{ name = "streamlit" },
|
| 1694 |
+
{ name = "unidic" },
|
| 1695 |
]
|
| 1696 |
|
| 1697 |
[package.metadata]
|
| 1698 |
requires-dist = [
|
| 1699 |
+
{ name = "chardet", specifier = ">=5.2.0" },
|
| 1700 |
{ name = "en-core-web-md", url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.0/en_core_web_md-3.7.0-py3-none-any.whl" },
|
| 1701 |
{ name = "en-core-web-trf", url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.2/en_core_web_trf-3.7.2-py3-none-any.whl" },
|
| 1702 |
+
{ name = "fugashi", specifier = ">=1.3.0" },
|
| 1703 |
{ name = "huggingface-hub", extras = ["cli"], specifier = ">=0.33.4" },
|
| 1704 |
+
{ name = "ipykernel", specifier = ">=6.29.5" },
|
| 1705 |
{ name = "ja-core-news-md", url = "https://github.com/explosion/spacy-models/releases/download/ja_core_news_md-3.7.0/ja_core_news_md-3.7.0-py3-none-any.whl" },
|
| 1706 |
{ name = "ja-core-news-trf", url = "https://github.com/explosion/spacy-models/releases/download/ja_core_news_trf-3.7.2/ja_core_news_trf-3.7.2-py3-none-any.whl" },
|
| 1707 |
{ name = "numpy", specifier = ">=1.24.0,<2.0" },
|
|
|
|
| 1712 |
{ name = "spacy", specifier = ">=3.7.0" },
|
| 1713 |
{ name = "spacy-curated-transformers", specifier = ">=0.1.0,<0.3.0" },
|
| 1714 |
{ name = "streamlit", specifier = ">=1.28.0" },
|
| 1715 |
+
{ name = "unidic", specifier = ">=1.1.0" },
|
| 1716 |
]
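
fugashi (a MeCab wrapper) and unidic (which fetches the full UniDic dictionary) are the new Japanese tokenization dependencies, added alongside chardet for encoding detection and ipykernel for notebook support. A minimal usage sketch, not part of this commit, showing how the two packages are typically combined (the dictionary must first be downloaded once with "python -m unidic download"):

import fugashi
import unidic

# Point the MeCab-based tagger at the downloaded UniDic dictionary
tagger = fugashi.Tagger(f'-d "{unidic.DICDIR}"')

for word in tagger("国立国語研究所のコーパスを分析する"):
    # surface form, lemma, and coarse part of speech from the UniDic features
    print(word.surface, word.feature.lemma, word.feature.pos1)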
|
| 1717 |
|
| 1718 |
[[package]]
|
|
|
|
| 1837 |
{ url = "https://files.pythonhosted.org/packages/3a/e2/745aeba88a8513017fbac2fd2f9f07b8a36065e51695f818541eb795ec0c/srsly-2.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:e73712be1634b5e1de6f81c273a7d47fe091ad3c79dc779c03d3416a5c117cee", size = 630634, upload-time = "2025-01-17T09:26:10.018Z" },
|
| 1838 |
]
|
| 1839 |
|
| 1840 |
+
[[package]]
|
| 1841 |
+
name = "stack-data"
|
| 1842 |
+
version = "0.6.3"
|
| 1843 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1844 |
+
dependencies = [
|
| 1845 |
+
{ name = "asttokens" },
|
| 1846 |
+
{ name = "executing" },
|
| 1847 |
+
{ name = "pure-eval" },
|
| 1848 |
+
]
|
| 1849 |
+
sdist = { url = "https://files.pythonhosted.org/packages/28/e3/55dcc2cfbc3ca9c29519eb6884dd1415ecb53b0e934862d3559ddcb7e20b/stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", size = 44707, upload-time = "2023-09-30T13:58:05.479Z" }
|
| 1850 |
+
wheels = [
|
| 1851 |
+
{ url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521, upload-time = "2023-09-30T13:58:03.53Z" },
|
| 1852 |
+
]
|
| 1853 |
+
|
| 1854 |
[[package]]
|
| 1855 |
name = "streamlit"
|
| 1856 |
version = "1.47.0"
|
|
|
|
| 2039 |
{ url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
|
| 2040 |
]
|
| 2041 |
|
| 2042 |
+
[[package]]
|
| 2043 |
+
name = "traitlets"
|
| 2044 |
+
version = "5.14.3"
|
| 2045 |
+
source = { registry = "https://pypi.org/simple" }
|
| 2046 |
+
sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", size = 161621, upload-time = "2024-04-19T11:11:49.746Z" }
|
| 2047 |
+
wheels = [
|
| 2048 |
+
{ url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" },
|
| 2049 |
+
]
|
| 2050 |
+
|
| 2051 |
[[package]]
|
| 2052 |
name = "triton"
|
| 2053 |
version = "3.3.1"
|
|
|
|
| 2106 |
{ url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
|
| 2107 |
]
|
| 2108 |
|
| 2109 |
+
[[package]]
|
| 2110 |
+
name = "unidic"
|
| 2111 |
+
version = "1.1.0"
|
| 2112 |
+
source = { registry = "https://pypi.org/simple" }
|
| 2113 |
+
dependencies = [
|
| 2114 |
+
{ name = "plac" },
|
| 2115 |
+
{ name = "requests" },
|
| 2116 |
+
{ name = "tqdm" },
|
| 2117 |
+
{ name = "wasabi" },
|
| 2118 |
+
]
|
| 2119 |
+
sdist = { url = "https://files.pythonhosted.org/packages/5a/09/271dfbf8d5b56adddc70e30fa94249f5d3ab35f615bf278d65258045564a/unidic-1.1.0.tar.gz", hash = "sha256:0ab91c05de342c84d2a6314901fd3afb9061ecd7534dd4a0431dccbb87d921b7", size = 7688, upload-time = "2021-10-10T08:56:44.301Z" }
|
| 2120 |
+
|
| 2121 |
[[package]]
|
| 2122 |
name = "urllib3"
|
| 2123 |
version = "2.5.0"
|
|
|
|
| 2129 |
|
| 2130 |
[[package]]
|
| 2131 |
name = "wasabi"
|
| 2132 |
+
version = "0.10.1"
|
| 2133 |
source = { registry = "https://pypi.org/simple" }
|
| 2134 |
+
sdist = { url = "https://files.pythonhosted.org/packages/69/41/0c31737ee1a29c8b829690ebb4ab988b1f489aa2c3efa115a732a9dd7997/wasabi-0.10.1.tar.gz", hash = "sha256:c8e372781be19272942382b14d99314d175518d7822057cb7a97010c4259d249", size = 28380, upload-time = "2022-07-28T08:17:54.968Z" }
|
| 2135 |
wheels = [
|
| 2136 |
+
{ url = "https://files.pythonhosted.org/packages/34/74/bd566f876c2de097e75d525c2696fb9829009987a0d93a4fb3576778a0a8/wasabi-0.10.1-py3-none-any.whl", hash = "sha256:fe862cc24034fbc9f04717cd312ab884f71f51a8ecabebc3449b751c2a649d83", size = 26075, upload-time = "2022-07-28T08:17:53.504Z" },
|
| 2137 |
]
|
| 2138 |
|
| 2139 |
[[package]]
|
web_app/__pycache__/analysis_handlers.cpython-312.pyc DELETED (binary file, 17.9 kB)
web_app/__pycache__/app.cpython-312.pyc DELETED (binary file, 4.4 kB)
web_app/__pycache__/comparison_functions.cpython-312.pyc DELETED (binary file, 13.2 kB)
web_app/__pycache__/config_manager.cpython-312.pyc DELETED (binary file, 9.89 kB)
web_app/__pycache__/pos_handlers.cpython-312.pyc DELETED (binary file, 7.49 kB)
web_app/__pycache__/reference_manager.cpython-312.pyc DELETED (binary file, 10.6 kB)
web_app/__pycache__/session_manager.cpython-312.pyc DELETED (binary file, 6.81 kB)
web_app/__pycache__/ui_components.cpython-312.pyc DELETED (binary file, 11.8 kB)

web_app/app.py CHANGED

@@ -19,6 +19,7 @@ from web_app.components.ui_components import UIComponents
from web_app.handlers.analysis_handlers import AnalysisHandlers
from web_app.reference_manager import ReferenceManager
from web_app.handlers.pos_handlers import POSHandlers
+from web_app.handlers.frequency_handlers import FrequencyHandlers

# Configure Streamlit page
st.set_page_config(
@@ -32,7 +33,7 @@ st.set_page_config(
def main():
    """Main application entry point."""
    st.title("📊 Linguistic Data Analysis I - Text Analysis Tools")
-    st.markdown("*Educational tools for lexical sophistication analysis
+    st.markdown("*Educational tools for lexical sophistication analysis, POS/dependency parsing, and word frequency visualization*")

    # Initialize session state
    SessionManager.initialize_session_state()
@@ -46,8 +47,10 @@ def main():
    # Route to appropriate interface
    if tool_choice == 'Lexical Sophistication':
        render_lexical_sophistication_interface()
-
+    elif tool_choice == 'POS Parser':
        render_pos_parser_interface()
+    else:  # Frequency Analysis
+        render_frequency_analysis_interface()


def render_sidebar():
@@ -117,5 +120,14 @@ def render_pos_parser_interface():
    POSHandlers.handle_batch_pos_analysis(parser)


+def render_frequency_analysis_interface():
+    """Render frequency analysis interface."""
+    st.header("📊 Word Frequency Analysis")
+    st.markdown("Analyze and visualize word frequency distributions from TSV data files.")
+
+    # Handle frequency analysis
+    FrequencyHandlers.handle_frequency_analysis()
+
+
if __name__ == "__main__":
-    main()
+    main()
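
The new Frequency Analysis option is handed off to FrequencyHandlers.handle_frequency_analysis() in web_app/handlers/frequency_handlers.py, which is not reproduced in this view. As a rough, hypothetical sketch of the kind of flow such a handler implements (all names, widgets, and column assumptions below are illustrative, not the actual module):

import pandas as pd
import streamlit as st


class FrequencyHandlers:
    @staticmethod
    def handle_frequency_analysis():
        """Hypothetical: upload a TSV frequency list and chart the top entries."""
        uploaded = st.file_uploader("Upload a frequency TSV", type=["tsv", "txt"])
        if uploaded is None:
            return
        df = pd.read_csv(uploaded, sep="\t")
        # Assumption: first column holds the word, second its raw frequency
        word_col, freq_col = df.columns[0], df.columns[1]
        top_n = st.slider("Words to display", 10, 100, 25)
        top = df.nlargest(top_n, freq_col).set_index(word_col)
        st.bar_chart(top[freq_col])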
web_app/components/__pycache__/__init__.cpython-312.pyc DELETED (binary file, 244 Bytes)
web_app/components/__pycache__/comparison_functions.cpython-312.pyc DELETED (binary file, 13.2 kB)
web_app/components/__pycache__/ui_components.cpython-312.pyc DELETED (binary file, 11.9 kB)

web_app/components/comparison_functions.py CHANGED

@@ -260,12 +260,13 @@ def display_token_comparison(results_a, results_b):
            'Token': token.get('token', ''),
            'Lemma': token.get('lemma', ''),
            'POS': token.get('pos', ''),
+            "TAG": token.get('tag', ''),
            'Type': token.get('word_type', '')
        }

        # Add scores for each measure (skip basic fields)
        for key, value in token.items():
-            if key not in ['id', 'token', 'lemma', 'pos', 'word_type']:
+            if key not in ['id', 'token', 'lemma', 'pos', 'tag', 'word_type']:
                row[key] = value if value != 'NA' else 'N/A'

        token_data.append(row)
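
The new TAG column presumably surfaces spaCy's fine-grained token.tag_ next to the coarse Universal Dependencies token.pos_; for the bundled ja_core_news_* models the fine-grained tag carries the UniDic-style part-of-speech string, which is what makes the extra column useful for Japanese. A small illustrative sketch (assumed, not taken from this repository) of where those two values come from when the token dictionaries are built:

import spacy

nlp = spacy.load("ja_core_news_md")
doc = nlp("言語データを分析します")
tokens = [
    {
        "token": t.text,
        "lemma": t.lemma_,
        "pos": t.pos_,   # coarse UD tag, e.g. NOUN
        "tag": t.tag_,   # fine-grained tag; UniDic-style POS string for Japanese models
        # word_type heuristic is part of this assumed sketch only
        "word_type": "content" if t.pos_ in {"NOUN", "VERB", "ADJ", "ADV"} else "function",
    }
    for t in doc
]
print(tokens)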
web_app/components/ui_components.py
CHANGED

@@ -121,7 +121,7 @@ class UIComponents:
         st.subheader("Analysis Tools")
         return st.radio(
             "Select Tool",
-            options=['Lexical Sophistication', 'POS Parser'],
+            options=['Lexical Sophistication', 'POS Parser', 'Frequency Analysis'],
             key='tool_choice'
         )

@@ -229,4 +229,4 @@ class UIComponents:
             st.write(f"- {error}")

         if success_count == 0:
-            st.error("No valid configurations found")
+            st.error("No valid configurations found")
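Together with the `web_app/app.py` changes above, this new radio option is what drives the routing: `st.radio` returns the chosen tool and, because `key='tool_choice'` is set, mirrors the selection into `st.session_state`, which `main()` then dispatches on. A minimal sketch of the same pattern, using stand-alone placeholder render functions rather than the app's real interfaces:

```python
import streamlit as st

def render_frequency_analysis_interface():
    # Placeholder; the real app delegates to FrequencyHandlers.handle_frequency_analysis().
    st.header("📊 Word Frequency Analysis")

# key='tool_choice' also mirrors the selection into st.session_state['tool_choice'].
tool_choice = st.radio(
    "Select Tool",
    options=['Lexical Sophistication', 'POS Parser', 'Frequency Analysis'],
    key='tool_choice',
)

if tool_choice == 'Frequency Analysis':
    render_frequency_analysis_interface()
else:
    st.write(f"{tool_choice} interface would render here.")
```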
web_app/config_manager.py
CHANGED

@@ -147,6 +147,9 @@ class ConfigManager:
         """Load actual data for a reference list based on its configuration."""
         data = {}

+        # Check if this is a Japanese corpus
+        is_japanese_corpus = list_config.get('japanese_corpus', False)
+
         # Check if this is a bigram or trigram configuration
         columns = list_config.get('columns', {})
         is_bigram = 'bigram' in columns

@@ -173,8 +176,12 @@
         # Get column mapping
         columns = list_config.get('columns', {})

-        if file_type in ['token', 'lemma'] and not is_bigram and not is_trigram:
-            # For standard unigrams
+        if is_japanese_corpus and file_type in ['token', 'lemma']:
+            # Handle Japanese corpus format with composite keys
+            processed_data = ConfigManager._parse_japanese_corpus_data(df, columns)
+            data[file_type] = processed_data
+        elif file_type in ['token', 'lemma'] and not is_bigram and not is_trigram:
+            # For standard unigrams
             word_col = columns.get('word', 0)
             score_col = columns.get('frequency', 1)

@@ -208,9 +215,109 @@

         return data

+    @staticmethod
+    def _parse_japanese_corpus_data(df: pd.DataFrame, columns: Dict[str, int]) -> Dict[str, Any]:
+        """Parse Japanese corpus data and create multiple lookup dictionaries with hierarchical POS splitting."""
+        try:
+            # Get column indices
+            surface_col_idx = columns.get('surface_form', 1)
+            lemma_col_idx = columns.get('lemma', 2)
+            pos_col_idx = columns.get('pos', 3)
+            freq_col_idx = columns.get('frequency', 6)
+
+            # Get actual column names
+            df_columns = list(df.columns)
+            surface_col = df_columns[surface_col_idx] if surface_col_idx < len(df_columns) else None
+            lemma_col = df_columns[lemma_col_idx] if lemma_col_idx < len(df_columns) else None
+            pos_col = df_columns[pos_col_idx] if pos_col_idx < len(df_columns) else None
+            freq_col = df_columns[freq_col_idx] if freq_col_idx < len(df_columns) else None
+
+            if not all([surface_col, lemma_col, pos_col, freq_col]):
+                raise ValueError("Missing required columns for Japanese corpus")
+
+            # Clean the data
+            df_clean = df.copy()
+
+            # Clean text columns
+            for col in [surface_col, lemma_col, pos_col]:
+                df_clean[col] = df_clean[col].astype(str).str.strip()
+                df_clean = df_clean[df_clean[col] != '']
+                df_clean = df_clean[df_clean[col] != 'nan']
+
+            # Clean and convert frequency column
+            df_clean[freq_col] = pd.to_numeric(df_clean[freq_col], errors='coerce')
+            df_clean = df_clean.dropna(subset=[freq_col])
+            df_clean = df_clean[df_clean[freq_col] > 0]  # Only positive frequencies
+
+            # Split POS column by hyphen to extract pos1, pos2, pos3
+            def split_pos(pos_str):
+                parts = str(pos_str).split('-')
+                return {
+                    'pos1': parts[0] if len(parts) > 0 else '',
+                    'pos2': parts[1] if len(parts) > 1 else '',
+                    'pos3': parts[2] if len(parts) > 2 else ''
+                }
+
+            pos_split = df_clean[pos_col].apply(split_pos)
+            df_clean['pos1'] = [p['pos1'] for p in pos_split]
+            df_clean['pos2'] = [p['pos2'] for p in pos_split]
+            df_clean['pos3'] = [p['pos3'] for p in pos_split]
+
+            # Create multiple levels of composite keys to match UniDic lookup hierarchy
+            # Level 1: lemma_lForm_pos1_pos2_pos3 (when pos3 exists)
+            df_clean['level1_key'] = df_clean.apply(
+                lambda row: f"{row[lemma_col]}_{row[surface_col]}_{row['pos1']}_{row['pos2']}_{row['pos3']}"
+                if row['pos3'] else None, axis=1
+            )
+
+            # Level 2: lemma_lForm_pos1_pos2
+            df_clean['level2_key'] = df_clean.apply(
+                lambda row: f"{row[lemma_col]}_{row[surface_col]}_{row['pos1']}_{row['pos2']}"
+                if row['pos2'] else None, axis=1
+            )
+
+            # Level 3: lemma_lForm_pos1
+            df_clean['level3_key'] = df_clean.apply(
+                lambda row: f"{row[lemma_col]}_{row[surface_col]}_{row['pos1']}"
+                if row['pos1'] else None, axis=1
+            )
+
+            # Legacy composite key for backward compatibility
+            df_clean['legacy_key'] = df_clean[lemma_col] + '_' + df_clean[pos_col]
+
+            # Create lookup dictionaries for each level
+            level1_dict = {}
+            level2_dict = {}
+            level3_dict = {}
+
+            for _, row in df_clean.iterrows():
+                freq = row[freq_col]
+
+                if row['level1_key']:
+                    level1_dict[row['level1_key']] = freq
+                if row['level2_key']:
+                    level2_dict[row['level2_key']] = freq
+                if row['level3_key']:
+                    level3_dict[row['level3_key']] = freq
+
+            # Return enhanced Japanese corpus data structure
+            return {
+                'level1_dict': level1_dict, # Most specific UniDic-compatible keys
+                'level2_dict': level2_dict,
+                'level3_dict': level3_dict,
+                'composite_dict': dict(zip(df_clean['legacy_key'], df_clean[freq_col])), # Legacy format
+                'lemma_dict': dict(zip(df_clean[lemma_col].str.lower(), df_clean[freq_col])),
+                'surface_dict': dict(zip(df_clean[surface_col].str.lower(), df_clean[freq_col])),
+                'is_japanese_corpus': True
+            }
+
+        except Exception as e:
+            st.error(f"Error parsing Japanese corpus data: {e}")
+            return {}
+
     @staticmethod
     def clean_default_reference_lists():
         """Clean up default reference lists that are no longer selected."""
         # This would be called by the UI when managing default reference lists
         # Implementation depends on how default lists are managed
-        pass
+        pass
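The `_parse_japanese_corpus_data` helper added above returns several lookup dictionaries keyed at decreasing levels of specificity (lemma + surface form + POS levels, then a legacy lemma_POS key, then plain lemma and surface maps), so a consumer is expected to probe them from most to least specific. The sketch below illustrates that fallback order; the `lookup_japanese_frequency` helper and the sample entry are assumptions for illustration, not code from this commit:

```python
from typing import Any, Dict, Optional

def lookup_japanese_frequency(data: Dict[str, Any], lemma: str, surface: str,
                              pos1: str, pos2: str = '', pos3: str = '') -> Optional[float]:
    """Probe the composite-key dictionaries from most to least specific."""
    candidates = [
        ('level1_dict', f"{lemma}_{surface}_{pos1}_{pos2}_{pos3}" if pos3 else None),
        ('level2_dict', f"{lemma}_{surface}_{pos1}_{pos2}" if pos2 else None),
        ('level3_dict', f"{lemma}_{surface}_{pos1}" if pos1 else None),
        ('lemma_dict', lemma.lower()),
        ('surface_dict', surface.lower()),
    ]
    for dict_name, key in candidates:
        bucket = data.get(dict_name, {})
        if key is not None and key in bucket:
            return bucket[key]
    return None

# Illustrative entry shaped like the BCCWJ-style rows shown elsewhere in this commit.
corpus_data = {
    'level1_dict': {},
    'level2_dict': {'の_ノ_助詞_格助詞': 5061558.0},
    'level3_dict': {'の_ノ_助詞': 5061558.0},
    'lemma_dict': {'の': 5061558.0},
    'surface_dict': {'ノ': 5061558.0},
}
print(lookup_japanese_frequency(corpus_data, 'の', 'ノ', '助詞', '格助詞'))  # -> 5061558.0
```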
web_app/handlers/__pycache__/__init__.cpython-312.pyc
DELETED
Binary file (245 Bytes)

web_app/handlers/__pycache__/analysis_handlers.cpython-312.pyc
DELETED
Binary file (17.9 kB)

web_app/handlers/__pycache__/pos_handlers.cpython-312.pyc
DELETED
Binary file (7.52 kB)
web_app/handlers/frequency_handlers.py
ADDED

@@ -0,0 +1,635 @@
+"""
+Frequency Analysis Handlers for Streamlit Interface
+
+This module provides Streamlit interface handlers for word frequency visualization,
+including file upload, visualization controls, and results display.
+Supports flexible column mapping for diverse frequency data formats.
+"""
+
+import streamlit as st
+import pandas as pd
+import plotly.graph_objects as go
+import plotly.express as px
+import numpy as np
+from typing import Dict, List, Optional
+import sys
+import os
+from pathlib import Path
+from io import StringIO
+
+# Add parent directory to path for imports
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
+
+from text_analyzer.frequency_analyzer import FrequencyAnalyzer
+
+
+class FrequencyHandlers:
+    """
+    Streamlit interface handlers for frequency analysis functionality.
+    """
+
+    @staticmethod
+    def handle_frequency_analysis():
+        """
+        Enhanced frequency analysis interface handler with persistent column selection.
+        """
+        st.markdown("Upload a frequency data file (TSV/CSV) with flexible column mapping support. "
+                    "The system will automatically detect columns and let you choose which ones to use for analysis.")
+
+        # Initialize session state variables
+        if 'uploaded_file_name' not in st.session_state:
+            st.session_state.uploaded_file_name = None
+        if 'column_config' not in st.session_state:
+            st.session_state.column_config = None
+        if 'analyzer' not in st.session_state:
+            st.session_state.analyzer = None
+        if 'format_info' not in st.session_state:
+            st.session_state.format_info = None
+        if 'detected_cols' not in st.session_state:
+            st.session_state.detected_cols = None
+        if 'uploaded_file_content' not in st.session_state:
+            st.session_state.uploaded_file_content = None
+
+        # File upload section
+        uploaded_file = FrequencyHandlers.render_file_upload()
+
+        # Check if a new file was uploaded
+        if uploaded_file is not None:
+            current_file_name = uploaded_file.name
+
+            # Reset state if new file is uploaded
+            if st.session_state.uploaded_file_name != current_file_name:
+                st.session_state.uploaded_file_name = current_file_name
+                st.session_state.column_config = None
+                st.session_state.analyzer = None
+                st.session_state.format_info = None
+                st.session_state.detected_cols = None
+                st.session_state.uploaded_file_content = uploaded_file.getvalue()
+
+            try:
+                # Initialize analyzer and process file (only if needed)
+                if st.session_state.analyzer is None or st.session_state.format_info is None:
+                    st.session_state.analyzer = FrequencyAnalyzer(file_size_limit_mb=300)
+                    st.session_state.format_info = st.session_state.analyzer.detect_file_format(uploaded_file.getvalue())
+
+                # Show format detection results
+                st.success(f"✅ File format detected: {st.session_state.format_info['separator']}-separated, "
+                           f"{'with' if st.session_state.format_info['has_header'] else 'without'} header, "
+                           f"~{st.session_state.format_info['estimated_columns']} columns")
+
+                # Prepare data for column detection
+                content = uploaded_file.getvalue()
+                if isinstance(content, bytes):
+                    content = content.decode('utf-8')
+
+                # Read data for preview and column detection
+                df_preview = pd.read_csv(StringIO(content),
+                                         sep=st.session_state.format_info['separator'],
+                                         header=0 if st.session_state.format_info['has_header'] else None,
+                                         nrows=100)
+
+                # Detect available columns
+                st.session_state.detected_cols = st.session_state.analyzer.detect_columns(df_preview)
+
+                # Show data preview
+                FrequencyHandlers.render_data_preview(df_preview, st.session_state.detected_cols)
+
+                # ALWAYS show column selection if we have detected columns (persistent interface)
+                if st.session_state.detected_cols is not None:
+                    with st.expander("🎯 Column Selection", expanded=True):
+                        column_config = FrequencyHandlers.render_persistent_column_selection(
+                            st.session_state.detected_cols,
+                            st.session_state.format_info,
+                            st.session_state.column_config
+                        )
+
+                        # Check if column configuration changed
+                        if column_config != st.session_state.column_config:
+                            st.session_state.column_config = column_config
+                            # Reload data with new configuration
+                            df = st.session_state.analyzer.load_frequency_data(st.session_state.uploaded_file_content, column_config)
+                            st.session_state.loaded_data = df
+                            st.rerun()
+
+                # ALWAYS show visualization controls if we have a column config
+                if st.session_state.column_config is not None:
+                    viz_config = FrequencyHandlers.render_enhanced_visualization_controls(st.session_state.analyzer, st.session_state.column_config)
+
+                    if viz_config:
+                        # Generate analysis
+                        FrequencyHandlers.render_enhanced_rank_based_analysis(st.session_state.analyzer, viz_config)
+
+            except Exception as e:
+                st.error(f"Error processing file: {str(e)}")
+                with st.expander("Error Details"):
+                    st.code(str(e))
+                st.info("Please ensure your file is a valid TSV/CSV with appropriate columns.")
+
+        elif st.session_state.column_config is not None and st.session_state.uploaded_file_content is not None:
+            # Show persistent interface even when no file is currently selected (using cached data)
+            with st.expander("🎯 Column Selection", expanded=False):
+                column_config = FrequencyHandlers.render_persistent_column_selection(
+                    st.session_state.detected_cols,
+                    st.session_state.format_info,
+                    st.session_state.column_config
+                )
+
+                # Check if column configuration changed
+                if column_config != st.session_state.column_config:
+                    st.session_state.column_config = column_config
+                    # Reload data with new configuration
+                    df = st.session_state.analyzer.load_frequency_data(st.session_state.uploaded_file_content, column_config)
+                    st.session_state.loaded_data = df
+                    st.rerun()
+
+            viz_config = FrequencyHandlers.render_enhanced_visualization_controls(st.session_state.analyzer, st.session_state.column_config)
+
+            if viz_config:
+                # Generate analysis
+                FrequencyHandlers.render_enhanced_rank_based_analysis(st.session_state.analyzer, viz_config)
+
+    @staticmethod
+    def render_file_upload():
+        """
+        Render enhanced file upload interface with flexible format support.
+
+        Returns:
+            Uploaded file object or None
+        """
+        st.subheader("📄 Upload Frequency Data")
+
+        uploaded_file = st.file_uploader(
+            "Choose a frequency data file",
+            type=['tsv', 'csv', 'txt'],
+            help="Upload a TSV or CSV file with frequency data. Supports flexible column mapping.",
+            accept_multiple_files=False
+        )
+
+        if uploaded_file is None:
+            # Show example formats
+            st.info("**Supported formats:**")
+            col1, col2 = st.columns(2)
+
+            with col1:
+                st.write("**Traditional format:**")
+                example_traditional = """Type\tFreq\tRank
+the\t69868\t1
+of\t36426\t2
+and\t28891\t3"""
+                st.code(example_traditional, language="text")
+
+            with col2:
+                st.write("**Rich corpus format:**")
+                example_rich = """rank\tlForm\tlemma\tpos\tfrequency\tpmw
+1\tノ\tの\t助詞\t5061558\t48383.9
+2\tニ\tに\t助詞\t3576558\t34188.7
+3\tテ\tて\t助詞\t3493117\t33391.0"""
+                st.code(example_rich, language="text")
+
+            st.write("**File size limit:** 300MB")
+
+        return uploaded_file
+
+    @staticmethod
+    def render_data_preview(df: pd.DataFrame, detected_cols: Dict[str, List[str]]):
+        """
+        Render enhanced data preview section with column detection results.
+
+        Args:
+            df: Preview DataFrame
+            detected_cols: Detected column categorization
+        """
+        st.subheader("📊 Data Preview")
+
+        # Basic metrics
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            st.metric("Total Rows", len(df))
+        with col2:
+            st.metric("Total Columns", len(df.columns))
+        with col3:
+            word_cols = len(detected_cols.get('word_columns', []))
+            freq_cols = len(detected_cols.get('frequency_columns', []))
+            st.metric("Detected", f"{word_cols} word, {freq_cols} freq")
+
+        # Show sample data
+        st.write("**First 5 rows:**")
+        st.dataframe(df.head(), use_container_width=True)
+
+        # Show detected column categories
+        with st.expander("🔍 Column Detection Results", expanded=True):
+            col1, col2 = st.columns(2)
+
+            with col1:
+                st.write("**Word Columns (text data):**")
+                word_cols = detected_cols.get('word_columns', [])
+                if word_cols:
+                    for col in word_cols:
+                        st.write(f"- `{col}` ({df[col].dtype})")
+                else:
+                    st.write("None detected")
+
+                st.write("**POS Columns:**")
+                pos_cols = detected_cols.get('pos_columns', [])
+                if pos_cols:
+                    for col in pos_cols:
+                        st.write(f"- `{col}` ({df[col].dtype})")
+                else:
+                    st.write("None detected")
+
+            with col2:
+                st.write("**Frequency Columns (numeric data):**")
+                freq_cols = detected_cols.get('frequency_columns', [])
+                if freq_cols:
+                    for col in freq_cols:
+                        sample_vals = df[col].dropna().head(3).tolist()
+                        st.write(f"- `{col}` ({df[col].dtype}) - e.g., {sample_vals}")
+                else:
+                    st.write("None detected")
+
+                st.write("**Other Columns:**")
+                other_cols = detected_cols.get('other_columns', [])
+                if other_cols:
+                    for col in other_cols[:5]: # Show max 5
+                        st.write(f"- `{col}` ({df[col].dtype})")
+                    if len(other_cols) > 5:
+                        st.write(f"... and {len(other_cols) - 5} more")
+                else:
+                    st.write("None")
+
+    @staticmethod
+    def render_column_selection_simplified(detected_cols: Dict[str, List[str]], format_info: Dict) -> Optional[Dict[str, str]]:
+        """
+        Render simplified column selection interface without multi-frequency complexity.
+
+        Args:
+            detected_cols: Detected column categorization
+            format_info: File format information
+
+        Returns:
+            Column configuration dict or None
+        """
+        st.subheader("🎯 Column Mapping")
+        st.write("Select which columns to use for your frequency analysis:")
+
+        word_cols = detected_cols.get('word_columns', [])
+        freq_cols = detected_cols.get('frequency_columns', [])
+        pos_cols = detected_cols.get('pos_columns', [])
+
+        if not word_cols or not freq_cols:
+            st.error("❌ Required columns not detected. Please ensure your file has:")
+            st.write("- At least one text column (for words)")
+            st.write("- At least one numeric column (for frequencies)")
+            return None
+
+        col1, col2 = st.columns(2)
+
+        with col1:
+            # Word column selection
+            word_column = st.selectbox(
+                "Word Column",
+                options=word_cols,
+                index=0,
+                help="Column containing word forms or lemmas"
+            )
+
+            # POS column selection (optional)
+            pos_column = None
+            if pos_cols:
+                use_pos = st.checkbox("Include POS column", value=False)
+                if use_pos:
+                    pos_column = st.selectbox(
+                        "POS Column",
+                        options=pos_cols,
+                        index=0,
+                        help="Column containing part-of-speech tags (optional)"
+                    )
+
+        with col2:
+            # Frequency column selection
+            frequency_column = st.selectbox(
+                "Frequency Column",
+                options=freq_cols,
+                index=0,
+                help="Column containing frequency values for analysis"
+            )
+
+        # Confirm button
+        if st.button("🚀 Start Analysis", type="primary"):
+            config = {
+                'word_column': word_column,
+                'frequency_column': frequency_column,
+                'separator': format_info['separator'],
+                'has_header': format_info['has_header']
+            }
+
+            if pos_column:
+                config['pos_column'] = pos_column
+
+            return config
+
+        return None
+
+    @staticmethod
+    def render_visualization_controls_simplified(analyzer: FrequencyAnalyzer, column_config: Dict) -> Optional[Dict]:
+        """
+        Legacy method - redirects to enhanced controls for backward compatibility.
+        """
+        return FrequencyHandlers.render_enhanced_visualization_controls(analyzer, column_config)
+
+    @staticmethod
+    def render_rank_based_analysis_simplified(analyzer: FrequencyAnalyzer, viz_config: Dict):
+        """
+        Legacy method - redirects to enhanced analysis for backward compatibility.
+        """
+        return FrequencyHandlers.render_enhanced_rank_based_analysis(analyzer, viz_config)
+
+    @staticmethod
+    def render_persistent_column_selection(detected_cols: Dict[str, List[str]],
+                                           format_info: Dict,
+                                           current_config: Optional[Dict] = None) -> Dict[str, str]:
+        """
+        Render persistent column selection interface that doesn't disappear.
+
+        Args:
+            detected_cols: Detected column categorization
+            format_info: File format information
+            current_config: Current column configuration (for preserving selections)
+
+        Returns:
+            Column configuration dict
+        """
+        st.write("Select which columns to use for your frequency analysis:")
+
+        word_cols = detected_cols.get('word_columns', [])
+        freq_cols = detected_cols.get('frequency_columns', [])
+        pos_cols = detected_cols.get('pos_columns', [])
+
+        # Determine default selections
+        default_word_idx = 0
+        default_freq_idx = 0
+        default_use_pos = False
+        default_pos_idx = 0
+
+        if current_config:
+            # Preserve current selections
+            if current_config['word_column'] in word_cols:
+                default_word_idx = word_cols.index(current_config['word_column'])
+            if current_config['frequency_column'] in freq_cols:
+                default_freq_idx = freq_cols.index(current_config['frequency_column'])
+            if 'pos_column' in current_config and current_config['pos_column'] in pos_cols:
+                default_use_pos = True
+                default_pos_idx = pos_cols.index(current_config['pos_column'])
+
+        col1, col2 = st.columns(2)
+
+        with col1:
+            word_column = st.selectbox(
+                "Word Column",
+                options=word_cols,
+                index=default_word_idx,
+                help="Column containing word forms or lemmas",
+                key="persistent_word_col"
+            )
+
+            # POS column selection (optional)
+            pos_column = None
+            if pos_cols:
+                use_pos = st.checkbox("Include POS column", value=default_use_pos, key="persistent_use_pos")
+                if use_pos:
+                    pos_column = st.selectbox(
+                        "POS Column",
+                        options=pos_cols,
+                        index=default_pos_idx,
+                        help="Column containing part-of-speech tags (optional)",
+                        key="persistent_pos_col"
+                    )
+
+        with col2:
+            frequency_column = st.selectbox(
+                "Frequency Column",
+                options=freq_cols,
+                index=default_freq_idx,
+                help="Column containing frequency values for analysis",
+                key="persistent_freq_col"
+            )
+
+        # Show quick info about selected columns
+        st.write("**Selected Configuration:**")
+        st.write(f"• Words: `{word_column}`")
+        st.write(f"• Frequencies: `{frequency_column}`")
+        if pos_column:
+            st.write(f"• POS: `{pos_column}`")
+
+        # Always return configuration (no button needed)
+        config = {
+            'word_column': word_column,
+            'frequency_column': frequency_column,
+            'separator': format_info['separator'],
+            'has_header': format_info['has_header']
+        }
+
+        if pos_column:
+            config['pos_column'] = pos_column
+
+        return config
+
+    @staticmethod
+    def render_enhanced_visualization_controls(analyzer: FrequencyAnalyzer, column_config: Dict) -> Optional[Dict]:
+        """
+        Render enhanced visualization controls with max words limit.
+
+        Args:
+            analyzer: FrequencyAnalyzer instance with loaded data
+            column_config: Column configuration from user selection
+
+        Returns:
+            Dict with visualization configuration or None
+        """
+        st.subheader("🎛️ Enhanced Visualization Controls")
+
+        # Get the frequency column
+        frequency_column = column_config['frequency_column']
+
+        col1, col2, col3 = st.columns(3)
+
+        with col1:
+            # Bin size controls
+            bin_size = st.slider(
+                "Bin Size (words per group)",
+                min_value=100,
+                max_value=2000,
+                value=500,
+                step=100,
+                help="Number of words to group together for rank-based analysis"
+            )
+
+        with col2:
+            # Log transformation option
+            log_transform = st.checkbox(
+                "Apply log₁₀ transformation",
+                value=False,
+                help="Transform frequency values using log₁₀ for better visualization"
+            )
+
+        with col3:
+            # Max words control
+            max_words = st.number_input(
+                "Max words to analyze",
+                min_value=1000,
+                max_value=200000,
+                value=None,
+                step=1000,
+                help="Limit analysis to top N most frequent words (leave empty for no limit)",
+                key="max_words_input"
+            )
+
+        # Quick preset buttons
+        st.write("**Quick Presets:**")
+        preset_cols = st.columns(4)
+        if preset_cols[0].button("10K", key="preset_10k"):
+            st.session_state.max_words_preset = 10000
+        if preset_cols[1].button("25K", key="preset_25k"):
+            st.session_state.max_words_preset = 25000
+        if preset_cols[2].button("50K", key="preset_50k"):
+            st.session_state.max_words_preset = 50000
+        if preset_cols[3].button("All", key="preset_all"):
+            st.session_state.max_words_preset = None
+
+        # Use preset value if set
+        if 'max_words_preset' in st.session_state:
+            max_words = st.session_state.max_words_preset
+            del st.session_state.max_words_preset
+
+        # Generate visualization button
+        if st.button("📊 Generate Enhanced Visualization", type="primary", key="generate_viz"):
+            return {
+                'frequency_column': frequency_column,
+                'bin_size': bin_size,
+                'log_transform': log_transform,
+                'max_words_to_retain': max_words
+            }
+
+        return None
+
+    @staticmethod
+    def render_enhanced_rank_based_analysis(analyzer: FrequencyAnalyzer, viz_config: Dict):
+        """
+        Render enhanced rank-based analysis with improved sample words display.
+
+        Args:
+            analyzer: FrequencyAnalyzer instance with loaded data
+            viz_config: Visualization configuration
+        """
+        st.subheader("📊 Enhanced Rank-Based Frequency Analysis")
+
+        frequency_column = viz_config['frequency_column']
+        bin_size = viz_config['bin_size']
+        log_transform = viz_config['log_transform']
+        max_words_to_retain = viz_config.get('max_words_to_retain')
+
+        try:
+            # Calculate statistics
+            stats = analyzer.calculate_statistics(frequency_column)
+
+            # Display basic statistics with word limit info
+            col1, col2, col3, col4 = st.columns(4)
+            with col1:
+                words_analyzed = max_words_to_retain if max_words_to_retain and max_words_to_retain < stats['count'] else stats['count']
+                st.metric("Words Analyzed", f"{words_analyzed:,}")
+            with col2:
+                st.metric("Mean Frequency", f"{stats['mean']:.2f}")
+            with col3:
+                st.metric("Median Frequency", f"{stats['median']:.2f}")
+            with col4:
+                st.metric("Std Deviation", f"{stats['std']:.2f}")
+
+            # Show word limit info if applied
+            if max_words_to_retain and max_words_to_retain < stats['count']:
+                st.info(f"📊 Analysis limited to top {max_words_to_retain:,} most frequent words (out of {stats['count']:,} total)")
+
+            # Create rank-based visualization with enhanced parameters
+            result = analyzer.create_rank_based_visualization_flexible(
+                column=frequency_column,
+                bin_size=bin_size,
+                log_transform=log_transform,
+                max_words_to_retain=max_words_to_retain
+            )
+
+            # Create the main visualization
+            fig = go.Figure()
+
+            fig.add_trace(go.Bar(
+                x=result['group_centers'],
+                y=result['avg_frequencies'],
+                name=f"Avg {frequency_column}",
+                marker_color='steelblue',
+                hovertemplate=(
+                    f"<b>Group %{{x}}</b><br>"
+                    f"Avg {'Log₁₀ ' if log_transform else ''}{frequency_column}: %{{y:.3f}}<br>"
+                    "<extra></extra>"
+                )
+            ))
+
+            fig.update_layout(
+                title=result.get('title_suffix', f"Enhanced Rank-Based Analysis - {frequency_column}"),
+                xaxis_title=result.get('x_label', f"Rank Groups (bin size: {bin_size})"),
+                yaxis_title=result.get('y_label', f"{'Log₁₀ ' if log_transform else ''}Average {frequency_column}"),
+                showlegend=False,
+                height=500
+            )
+
+            st.plotly_chart(fig, use_container_width=True)
+
+            # Enhanced sample words display (up to 20 bins with 5 random samples each)
+            st.write("### 🎯 Sample Words by Rank Group (5 Random Samples)")
+
+            sample_words = result.get('sample_words', {})
+            if sample_words:
+                # Display up to 20 groups in a more organized layout
+                num_groups = min(20, len(sample_words))
+
+                if num_groups > 0:
+                    st.write(f"Showing sample words from top {num_groups} rank groups:")
+
+                    # Display in rows of 4 groups each
+                    for row_start in range(0, num_groups, 4):
+                        cols = st.columns(4)
+                        for col_idx in range(4):
+                            group_idx = row_start + col_idx
+                            if group_idx < num_groups and group_idx in sample_words:
+                                with cols[col_idx]:
+                                    group_label = result['group_labels'][group_idx]
+                                    words = sample_words[group_idx]
+
+                                    st.write(f"**Group {group_label}:**")
+                                    word_list = [w['word'] for w in words]
+                                    # Display as bullet points for better readability
+                                    for word in word_list:
+                                        st.write(f"• {word}")
+
+                                    # Add spacing between groups
+                                    st.write("")
+            else:
+                st.write("No sample words available")
+
+            # Show enhanced group statistics
+            with st.expander("📈 Detailed Group Statistics"):
+                group_stats = result.get('group_stats')
+                if group_stats is not None and not group_stats.empty:
+                    display_stats = group_stats.copy()
+
+                    # Format numeric columns
+                    numeric_cols = display_stats.select_dtypes(include=[np.number]).columns
+                    for col in numeric_cols:
+                        if 'count' not in col.lower():
+                            display_stats[col] = display_stats[col].round(2)
+
+                    st.dataframe(display_stats, use_container_width=True)
+                else:
+                    st.write("No detailed statistics available")
+
+        except Exception as e:
+            st.error(f"Error in enhanced rank-based analysis: {str(e)}")
+            with st.expander("Error Details"):
+                st.code(str(e))
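The handler above leaves the actual binning to `FrequencyAnalyzer.create_rank_based_visualization_flexible`, which is not part of this diff; the UI only consumes the returned `group_centers`, `avg_frequencies`, `group_labels`, `sample_words`, and `group_stats`. As a rough sketch of what such rank-based grouping involves (sort by frequency, optionally keep only the top N words, cut the ranks into fixed-size bins, average each bin, optionally on a log₁₀ scale); the `rank_based_bins` function below is an assumption for illustration, not the analyzer's actual implementation:

```python
from typing import Optional

import numpy as np
import pandas as pd

def rank_based_bins(df: pd.DataFrame, word_col: str, freq_col: str,
                    bin_size: int = 500, log_transform: bool = False,
                    max_words_to_retain: Optional[int] = None) -> pd.DataFrame:
    """Group words into fixed-size rank bins and average their (optionally log10) frequencies."""
    ranked = df.sort_values(freq_col, ascending=False).reset_index(drop=True)
    if max_words_to_retain:
        ranked = ranked.head(max_words_to_retain)

    values = ranked[freq_col].astype(float)
    if log_transform:
        values = np.log10(values)  # assumes strictly positive frequencies

    ranked = ranked.assign(value=values, group=ranked.index // bin_size)
    return (ranked.groupby('group')
                  .agg(avg_frequency=('value', 'mean'),
                       n_words=(word_col, 'size'),
                       sample_words=(word_col, lambda s: s.sample(min(5, len(s)), random_state=0).tolist()))
                  .reset_index())

# Toy usage: 2,000 artificial words with strictly decreasing frequencies.
toy = pd.DataFrame({'word': [f"w{i}" for i in range(2000)],
                    'freq': np.arange(2000, 0, -1)})
print(rank_based_bins(toy, 'word', 'freq', bin_size=500, log_transform=True))
```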