egumasa commited on
Commit
dbc9105
·
1 Parent(s): fd3d382

Japanese language support

Browse files
Files changed (48) hide show
  1. .gitignore +388 -0
  2. README.md +238 -5
  3. config/reference_lists.yaml +97 -1
  4. japanese-nlp-test.ipynb +819 -0
  5. pyproject.toml +4 -0
  6. resources/reference_lists/ja/BCCWJ_frequencylist_luw2_ver1_1.tsv +3 -0
  7. resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1 copy.tsv +3 -0
  8. resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv +3 -0
  9. resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv +3 -0
  10. test/test_app.py +8 -6
  11. test/test_functionality.py +6 -4
  12. test/test_multi_index.py +2 -3
  13. test/test_yaml_config.py +2 -3
  14. test_frequency_flexible.py +1 -0
  15. test_fugashi_diagnostic.py +134 -0
  16. test_japanese_integration.py +135 -0
  17. test_unidic_diagnostic.py +201 -0
  18. text_analyzer/__pycache__/__init__.cpython-312.pyc +0 -0
  19. text_analyzer/__pycache__/lexical_sophistication.cpython-312.pyc +0 -0
  20. text_analyzer/__pycache__/pos_parser.cpython-312.pyc +0 -0
  21. text_analyzer/app_config.py +183 -0
  22. text_analyzer/base_analyzer.py +308 -0
  23. text_analyzer/frequency_analyzer.py +653 -0
  24. text_analyzer/lexical_sophistication.py +245 -62
  25. text_analyzer/pos_parser.py +11 -36
  26. text_analyzer/text_utility.py +289 -0
  27. text_analyzer/unidic_enricher.py +256 -0
  28. text_analyzer/unidic_extensions.py +25 -0
  29. uv.lock +420 -6
  30. web_app/__pycache__/analysis_handlers.cpython-312.pyc +0 -0
  31. web_app/__pycache__/app.cpython-312.pyc +0 -0
  32. web_app/__pycache__/comparison_functions.cpython-312.pyc +0 -0
  33. web_app/__pycache__/config_manager.cpython-312.pyc +0 -0
  34. web_app/__pycache__/pos_handlers.cpython-312.pyc +0 -0
  35. web_app/__pycache__/reference_manager.cpython-312.pyc +0 -0
  36. web_app/__pycache__/session_manager.cpython-312.pyc +0 -0
  37. web_app/__pycache__/ui_components.cpython-312.pyc +0 -0
  38. web_app/app.py +15 -3
  39. web_app/components/__pycache__/__init__.cpython-312.pyc +0 -0
  40. web_app/components/__pycache__/comparison_functions.cpython-312.pyc +0 -0
  41. web_app/components/__pycache__/ui_components.cpython-312.pyc +0 -0
  42. web_app/components/comparison_functions.py +2 -1
  43. web_app/components/ui_components.py +2 -2
  44. web_app/config_manager.py +110 -3
  45. web_app/handlers/__pycache__/__init__.cpython-312.pyc +0 -0
  46. web_app/handlers/__pycache__/analysis_handlers.cpython-312.pyc +0 -0
  47. web_app/handlers/__pycache__/pos_handlers.cpython-312.pyc +0 -0
  48. web_app/handlers/frequency_handlers.py +635 -0
.gitignore ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ *.manifest
31
+ *.spec
32
+
33
+ # Installer logs
34
+ pip-log.txt
35
+ pip-delete-this-directory.txt
36
+
37
+ # Unit test / coverage reports
38
+ htmlcov/
39
+ .tox/
40
+ .nox/
41
+ .coverage
42
+ .coverage.*
43
+ .cache
44
+ nosetests.xml
45
+ coverage.xml
46
+ *.cover
47
+ *.py,cover
48
+ .hypothesis/
49
+ .pytest_cache/
50
+ cover/
51
+
52
+ # Translations
53
+ *.mo
54
+ *.pot
55
+
56
+ # Django stuff:
57
+ *.log
58
+ local_settings.py
59
+ db.sqlite3
60
+ db.sqlite3-journal
61
+
62
+ # Flask stuff:
63
+ instance/
64
+ .webassets-cache
65
+
66
+ # Scrapy stuff:
67
+ .scrapy
68
+
69
+ # Sphinx documentation
70
+ docs/_build/
71
+
72
+ # PyBuilder
73
+ .pybuilder/
74
+ target/
75
+
76
+ # Jupyter Notebook
77
+ .ipynb_checkpoints
78
+
79
+ # IPython
80
+ profile_default/
81
+ ipython_config.py
82
+
83
+ # pyenv
84
+ .python-version
85
+
86
+ # pipenv
87
+ Pipfile.lock
88
+
89
+ # poetry
90
+ poetry.lock
91
+
92
+ # pdm
93
+ .pdm.toml
94
+ .pdm-python
95
+
96
+ # PEP 582
97
+ __pypackages__/
98
+
99
+ # Celery stuff
100
+ celerybeat-schedule
101
+ celerybeat.pid
102
+
103
+ # SageMath parsed files
104
+ *.sage.py
105
+
106
+ # Environments
107
+ .env
108
+ .venv
109
+ env/
110
+ venv/
111
+ ENV/
112
+ env.bak/
113
+ venv.bak/
114
+
115
+ # Spyder project settings
116
+ .spyderproject
117
+ .spyproject
118
+
119
+ # Rope project settings
120
+ .ropeproject
121
+
122
+ # mkdocs documentation
123
+ /site
124
+
125
+ # mypy
126
+ .mypy_cache/
127
+ .dmypy.json
128
+ dmypy.json
129
+
130
+ # Pyre type checker
131
+ .pyre/
132
+
133
+ # pytype static type analyzer
134
+ .pytype/
135
+
136
+ # Cython debug symbols
137
+ cython_debug/
138
+
139
+ # PyCharm
140
+ .idea/
141
+
142
+ # VS Code
143
+ .vscode/
144
+ *.code-workspace
145
+
146
+ # Local History for Visual Studio Code
147
+ .history/
148
+
149
+ # macOS
150
+ .DS_Store
151
+ .AppleDouble
152
+ .LSOverride
153
+
154
+ # Thumbnails
155
+ ._*
156
+
157
+ # Files that might appear in the root of a volume
158
+ .DocumentRevisions-V100
159
+ .fseventsd
160
+ .Spotlight-V100
161
+ .TemporaryItems
162
+ .Trashes
163
+ .VolumeIcon.icns
164
+ .com.apple.timemachine.donotpresent
165
+
166
+ # Directories potentially created on remote AFP share
167
+ .AppleDB
168
+ .AppleDesktop
169
+ Network Trash Folder
170
+ Temporary Items
171
+ .apdisk
172
+
173
+ # Windows
174
+ Thumbs.db
175
+ Thumbs.db:encryptable
176
+ ehthumbs.db
177
+ ehthumbs_vista.db
178
+
179
+ # Dump file
180
+ *.stackdump
181
+
182
+ # Folder config file
183
+ [Dd]esktop.ini
184
+
185
+ # Recycle Bin used on file shares
186
+ $RECYCLE.BIN/
187
+
188
+ # Windows Installer files
189
+ *.cab
190
+ *.msi
191
+ *.msix
192
+ *.msm
193
+ *.msp
194
+
195
+ # Windows shortcuts
196
+ *.lnk
197
+
198
+ # Linux
199
+ *~
200
+
201
+ # temporary files which can be created if a process still has a handle open of a deleted file
202
+ .fuse_hidden*
203
+
204
+ # KDE directory preferences
205
+ .directory
206
+
207
+ # Linux trash folder which might appear on any partition or disk
208
+ .Trash-*
209
+
210
+ # .nfs files are created when an open file is removed but is still being accessed
211
+ .nfs*
212
+
213
+ # Editor backups
214
+ *.bak
215
+ *.swp
216
+ *.swo
217
+ *~
218
+
219
+ # Logs
220
+ logs/
221
+ *.log
222
+ npm-debug.log*
223
+ yarn-debug.log*
224
+ yarn-error.log*
225
+ lerna-debug.log*
226
+ .pnpm-debug.log*
227
+
228
+ # Runtime data
229
+ pids
230
+ *.pid
231
+ *.seed
232
+ *.pid.lock
233
+
234
+ # Directory for instrumented libs generated by jscoverage/JSCover
235
+ lib-cov
236
+
237
+ # Coverage directory used by tools like istanbul
238
+ coverage
239
+ *.lcov
240
+
241
+ # Dependency directories
242
+ node_modules/
243
+ jspm_packages/
244
+
245
+ # TypeScript cache
246
+ *.tsbuildinfo
247
+
248
+ # Optional npm cache directory
249
+ .npm
250
+
251
+ # Optional eslint cache
252
+ .eslintcache
253
+
254
+ # Optional stylelint cache
255
+ .stylelintcache
256
+
257
+ # Microbundle cache
258
+ .rpt2_cache/
259
+ .rts2_cache_cjs/
260
+ .rts2_cache_es/
261
+ .rts2_cache_umd/
262
+
263
+ # Optional REPL history
264
+ .node_repl_history
265
+
266
+ # Output of 'npm pack'
267
+ *.tgz
268
+
269
+ # Yarn Integrity file
270
+ .yarn-integrity
271
+
272
+ # dotenv environment variable files
273
+ .env
274
+ .env.development.local
275
+ .env.test.local
276
+ .env.production.local
277
+ .env.local
278
+
279
+ # parcel-bundler cache
280
+ .cache
281
+ .parcel-cache
282
+
283
+ # Next.js build output
284
+ .next
285
+ out
286
+
287
+ # Nuxt.js build / generate output
288
+ .nuxt
289
+ dist
290
+
291
+ # Gatsby files
292
+ .cache/
293
+ public
294
+
295
+ # vuepress build output
296
+ .vuepress/dist
297
+
298
+ # vuepress v2.x temp and cache directory
299
+ .temp
300
+ .cache
301
+
302
+ # Docusaurus cache and generated files
303
+ .docusaurus
304
+
305
+ # Serverless directories
306
+ .serverless/
307
+
308
+ # FuseBox cache
309
+ .fusebox/
310
+
311
+ # DynamoDB Local files
312
+ .dynamodb/
313
+
314
+ # TernJS port file
315
+ .tern-port
316
+
317
+ # Stores VSCode versions used for testing VSCode extensions
318
+ .vscode-test
319
+
320
+ # yarn v2
321
+ .yarn/cache
322
+ .yarn/unplugged
323
+ .yarn/build-state.yml
324
+ .yarn/install-state.gz
325
+ .pnp.*
326
+
327
+ # Temporary files
328
+ tmp/
329
+ temp/
330
+ *.tmp
331
+ *.temp
332
+
333
+ # Database files
334
+ *.db
335
+ *.sqlite
336
+ *.sqlite3
337
+
338
+ # Secret files
339
+ secrets/
340
+ *.key
341
+ *.pem
342
+ *.cert
343
+ *.crt
344
+
345
+ # Config files with sensitive data
346
+ config.local.js
347
+ config.local.json
348
+ settings.local.json
349
+
350
+ # Build artifacts
351
+ bin/
352
+ obj/
353
+
354
+ # Package files
355
+ *.jar
356
+ *.war
357
+ *.nar
358
+ *.ear
359
+ *.zip
360
+ *.tar.gz
361
+ *.rar
362
+
363
+ # Virtual machine crash logs
364
+ hs_err_pid*
365
+
366
+ # Core dumps
367
+ core.*
368
+
369
+ # Compiled source
370
+ *.com
371
+ *.class
372
+ *.dll
373
+ *.exe
374
+ *.o
375
+ *.out
376
+
377
+ # Ignore all dotfiles except .gitignore
378
+ .*
379
+ !.gitignore
380
+ !.gitkeep
381
+ !.github/
382
+ !.gitlab-ci.yml
383
+ !.travis.yml
384
+ !.editorconfig
385
+ !.prettierrc
386
+ !.eslintrc*
387
+ !.stylelintrc*
388
+ !.babelrc*
README.md CHANGED
@@ -7,14 +7,247 @@ sdk: docker
7
  app_port: 8501
8
  tags:
9
  - streamlit
 
 
 
 
10
  pinned: false
11
- short_description: This is a web app for linguistic-data-analysis-I .
12
  license: cc-by-nc-4.0
13
  ---
14
 
15
- # Welcome to Streamlit!
16
 
17
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
18
 
19
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
20
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  app_port: 8501
8
  tags:
9
  - streamlit
10
+ - nlp
11
+ - linguistics
12
+ - japanese
13
+ - corpus-linguistics
14
  pinned: false
15
+ short_description: Advanced lexical sophistication analyzer for English and Japanese texts
16
  license: cc-by-nc-4.0
17
  ---
18
 
19
+ # Simple Text Analyzer
20
 
21
+ A comprehensive web-based application for lexical sophistication analysis supporting both English and Japanese. This tool provides detailed linguistic analysis using corpus-based frequency data and advanced NLP techniques.
22
 
23
+ ## 🌟 Features
24
+
25
+ ### Multi-Language Support
26
+ - **English**: COCA corpus frequency analysis with unigrams, bigrams, and trigrams
27
+ - **Japanese**: BCCWJ (written) and CSJ (spoken) corpus integration with POS-aware frequency matching
28
+
29
+ ### Analysis Capabilities
30
+ - **Lexical Sophistication**: Frequency-based lexical complexity analysis
31
+ - **Part-of-Speech Analysis**: Detailed POS tagging and classification
32
+ - **N-gram Analysis**: Bigram and trigram frequency analysis
33
+ - **Content vs Function Words**: Automatic classification and separate analysis
34
+ - **Batch Processing**: Multiple file analysis with comparative results
35
+
36
+ ### Japanese Language Features ✨ **NEW**
37
+ - **BCCWJ Integration**: Balanced Corpus of Contemporary Written Japanese
38
+ - Raw frequency counts
39
+ - Normalized frequency (per million words)
40
+ - Frequency rankings
41
+ - **CSJ Integration**: Corpus of Spontaneous Japanese (spoken data)
42
+ - Academic and conversational speech patterns
43
+ - Multiple speech style analysis
44
+ - **POS-Aware Matching**: Composite key lookup using `lemma + POS` for accurate frequency matching
45
+ - **Robust Fallback System**: Three-tier lookup strategy:
46
+ 1. Primary: `lemma_pos` composite key (e.g., "行く_動詞-自立")
47
+ 2. Fallback 1: `lemma` only lookup
48
+ 3. Fallback 2: `surface_form` lookup
49
+
50
+ ## 🚀 Quick Start
51
+
52
+ ### Prerequisites
53
+ - Python 3.8+
54
+ - uv (recommended) or pip for package management
55
+
56
+ ### Installation
57
+
58
+ ```bash
59
+ # Clone the repository
60
+ git clone https://github.com/your-repo/simple-text-analyzer.git
61
+ cd simple-text-analyzer
62
+
63
+ # Install dependencies using uv
64
+ uv sync
65
+
66
+ # Or using pip
67
+ pip install -r requirements.txt
68
+
69
+ # Install required spaCy models
70
+ python -m spacy download en_core_web_trf
71
+ python -m spacy download ja_core_news_md # For Japanese support
72
+ ```
73
+
74
+ ### Running the Application
75
+
76
+ ```bash
77
+ # Using uv
78
+ uv run streamlit run web_app/app.py
79
+
80
+ # Or directly
81
+ streamlit run web_app/app.py
82
+ ```
83
+
84
+ ## 📊 Supported Corpora
85
+
86
+ ### English
87
+ - **COCA Spoken**: Corpus of Contemporary American English (spoken subcorpus)
88
+ - **COCA Magazine**: Magazine text frequency data
89
+ - **Bigram/Trigram Analysis**: Multi-word expression frequency and association measures
90
+
91
+ ### Japanese
92
+ - **BCCWJ (Balanced Corpus of Contemporary Written Japanese)**
93
+ - 182,604 unique word forms with POS tags
94
+ - Multiple text registers (books, newspapers, magazines, etc.)
95
+ - Comprehensive written language coverage
96
+
97
+ - **CSJ (Corpus of Spontaneous Japanese)**
98
+ - 41,892 unique word forms from spoken data
99
+ - Academic presentations and casual conversations
100
+ - Natural speech pattern analysis
101
+
102
+ ## 🔧 Architecture
103
+
104
+ ### Core Components
105
+ - **LexicalSophisticationAnalyzer**: Main analysis engine with multi-language support
106
+ - **ConfigManager**: Flexible configuration system for corpus integration
107
+ - **ReferenceManager**: Dynamic reference list management
108
+ - **SessionManager**: State management for web interface
109
+
110
+ ### Japanese Integration Features
111
+ - **Composite Key Matching**: Precision matching using lemma and POS combinations
112
+ - **Extensible Design**: Easy addition of new subcorpora via YAML configuration
113
+ - **Fallback Mechanisms**: Robust lookup strategies for maximum coverage
114
+ - **Performance Optimized**: Pre-computed lookup dictionaries for fast analysis
115
+
116
+ ## 📁 File Structure
117
+
118
+ ```
119
+ simple-text-analyzer/
120
+ ├── web_app/ # Streamlit web application
121
+ │ ├── app.py # Main application entry
122
+ │ ├── config_manager.py # Configuration management
123
+ │ ├── reference_manager.py # Reference list handling
124
+ │ └── components/ # UI components
125
+ ├── text_analyzer/ # Core analysis modules
126
+ │ ├── lexical_sophistication.py # Main analyzer
127
+ │ ├── frequency_analyzer.py # Frequency analysis
128
+ │ └── pos_parser.py # POS tagging utilities
129
+ ├── config/ # Configuration files
130
+ │ └── reference_lists.yaml # Corpus configurations
131
+ ├── resources/ # Corpus data files
132
+ │ └── reference_lists/
133
+ │ ├── en/ # English corpus files
134
+ │ └── ja/ # Japanese corpus files
135
+ └── test/ # Test modules
136
+ ```
137
+
138
+ ## 🧪 Testing
139
+
140
+ Test the Japanese integration:
141
+
142
+ ```bash
143
+ uv run python test_japanese_integration.py
144
+ ```
145
+
146
+ Expected output:
147
+ - ✅ spaCy model loading
148
+ - ✅ Reference data loading (182K+ BCCWJ entries, 41K+ CSJ entries)
149
+ - ✅ Composite key lookup functionality
150
+ - ✅ Fallback mechanism verification
151
+ - ✅ Complete text analysis pipeline
152
+
153
+ ## 📈 Usage Examples
154
+
155
+ ### Japanese Text Analysis
156
+ ```python
157
+ from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
158
+
159
+ # Initialize Japanese analyzer
160
+ analyzer = LexicalSophisticationAnalyzer(language="ja", model_size="md")
161
+
162
+ # Load Japanese corpus references
163
+ selected_indices = ["BCCWJ_frequency", "CSJ_frequency"]
164
+
165
+ # Analyze Japanese text
166
+ results = analyzer.analyze_text(
167
+ "私は毎日学校に行きます。",
168
+ selected_indices
169
+ )
170
+
171
+ # Access frequency scores
172
+ for token in results['token_details']:
173
+ print(f"{token['token']}: BCCWJ={token.get('BCCWJ_frequency_lemma', 'NA')}")
174
+ ```
175
+
176
+ ### English Text Analysis
177
+ ```python
178
+ # Initialize English analyzer
179
+ analyzer = LexicalSophisticationAnalyzer(language="en", model_size="trf")
180
+
181
+ # Analyze with COCA frequency data
182
+ results = analyzer.analyze_text(
183
+ "The students studied linguistics carefully.",
184
+ ["COCA_spoken_frequency"]
185
+ )
186
+ ```
187
+
188
+ ## 🔧 Configuration
189
+
190
+ ### Adding New Japanese Subcorpora
191
+
192
+ The system is designed for easy expansion. To add a new subcorpus (e.g., BCCWJ Books):
193
+
194
+ ```yaml
195
+ # config/reference_lists.yaml
196
+ japanese:
197
+ unigrams:
198
+ BCCWJ_books_frequency:
199
+ display_name: "BCCWJ Books - Frequency"
200
+ description: "BCCWJ books subcorpus frequency data"
201
+ files:
202
+ token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
203
+ lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
204
+ format: "tsv"
205
+ has_header: true
206
+ enabled: true
207
+ japanese_corpus: true
208
+ columns:
209
+ surface_form: 1 # lForm column
210
+ lemma: 2 # lemma column
211
+ pos: 3 # pos column
212
+ frequency: 10 # PB_frequency column (books subcorpus)
213
+ ```
214
+
215
+ No code changes required - the system automatically detects and integrates new configurations!
216
+
217
+ ## 📚 Research Applications
218
+
219
+ This tool is ideal for:
220
+ - **Language Learning Research**: Analyzing text complexity for Japanese learners
221
+ - **Corpus Linguistics**: Cross-linguistic frequency analysis
222
+ - **Computational Linguistics**: Lexical sophistication measurement
223
+ - **Educational Assessment**: Text difficulty evaluation
224
+ - **Translation Studies**: Comparative lexical analysis
225
+
226
+ ## 🤝 Contributing
227
+
228
+ 1. Fork the repository
229
+ 2. Create a feature branch (`git checkout -b feature/amazing-feature`)
230
+ 3. Commit your changes (`git commit -m 'Add amazing feature'`)
231
+ 4. Push to the branch (`git push origin feature/amazing-feature`)
232
+ 5. Open a Pull Request
233
+
234
+ ## 📄 License
235
+
236
+ This project is licensed under the CC BY-NC 4.0 License - see the [LICENSE](LICENSE) file for details.
237
+
238
+ ## 🙏 Acknowledgments
239
+
240
+ - **BCCWJ**: National Institute for Japanese Language and Linguistics
241
+ - **CSJ**: National Institute for Japanese Language and Linguistics
242
+ - **COCA**: Mark Davies, Brigham Young University
243
+ - **spaCy**: Explosion AI for robust NLP models
244
+
245
+ ## 📞 Support
246
+
247
+ For questions, issues, or contributions:
248
+ - Open an issue on GitHub
249
+ - Contact: [Your contact information]
250
+
251
+ ---
252
+
253
+ **Happy analyzing!** 🚀📊
config/reference_lists.yaml CHANGED
@@ -137,6 +137,102 @@ english:
137
 
138
  japanese:
139
  unigrams:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  jp_frequency:
141
  display_name: "Japanese Frequency List"
142
  description: "Frequency data for Japanese words"
@@ -151,4 +247,4 @@ japanese:
151
  enabled: false # Disabled until files exist
152
 
153
  # bigrams: {}
154
- # trigrams: {}
 
137
 
138
  japanese:
139
  unigrams:
140
+ BCCWJ_frequency:
141
+ display_name: "BCCWJ Written - Frequency"
142
+ description: "BCCWJ raw frequency counts for written Japanese"
143
+ files:
144
+ token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
145
+ lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
146
+ format: "tsv"
147
+ has_header: true
148
+ enabled: true
149
+ japanese_corpus: true
150
+ columns:
151
+ surface_form: 1 # lForm
152
+ lemma: 2 # lemma
153
+ pos: 3 # pos
154
+ frequency: 6 # primary measure column
155
+
156
+ BCCWJ_pmw:
157
+ display_name: "BCCWJ Written - Per Million Words"
158
+ description: "BCCWJ normalized frequency for written Japanese"
159
+ files:
160
+ token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
161
+ lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
162
+ format: "tsv"
163
+ has_header: true
164
+ enabled: true
165
+ japanese_corpus: true
166
+ columns:
167
+ surface_form: 1
168
+ lemma: 2
169
+ pos: 3
170
+ frequency: 7 # pmw column
171
+
172
+ BCCWJ_rank:
173
+ display_name: "BCCWJ Written - Frequency Rank"
174
+ description: "BCCWJ frequency ranking for written Japanese"
175
+ files:
176
+ token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
177
+ lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
178
+ format: "tsv"
179
+ has_header: true
180
+ enabled: true
181
+ japanese_corpus: true
182
+ columns:
183
+ surface_form: 1
184
+ lemma: 2
185
+ pos: 3
186
+ frequency: 0 # rank column
187
+
188
+ CSJ_frequency:
189
+ display_name: "CSJ Spoken - Frequency"
190
+ description: "CSJ raw frequency counts for spoken Japanese"
191
+ files:
192
+ token: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
193
+ lemma: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
194
+ format: "tsv"
195
+ has_header: true
196
+ enabled: true
197
+ japanese_corpus: true
198
+ columns:
199
+ surface_form: 1
200
+ lemma: 2
201
+ pos: 3
202
+ frequency: 6
203
+
204
+ CSJ_pmw:
205
+ display_name: "CSJ Spoken - Per Million Words"
206
+ description: "CSJ normalized frequency for spoken Japanese"
207
+ files:
208
+ token: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
209
+ lemma: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
210
+ format: "tsv"
211
+ has_header: true
212
+ enabled: true
213
+ japanese_corpus: true
214
+ columns:
215
+ surface_form: 1
216
+ lemma: 2
217
+ pos: 3
218
+ frequency: 7
219
+
220
+ CSJ_rank:
221
+ display_name: "CSJ Spoken - Frequency Rank"
222
+ description: "CSJ frequency ranking for spoken Japanese"
223
+ files:
224
+ token: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
225
+ lemma: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
226
+ format: "tsv"
227
+ has_header: true
228
+ enabled: true
229
+ japanese_corpus: true
230
+ columns:
231
+ surface_form: 1
232
+ lemma: 2
233
+ pos: 3
234
+ frequency: 0
235
+
236
  jp_frequency:
237
  display_name: "Japanese Frequency List"
238
  description: "Frequency data for Japanese words"
 
247
  enabled: false # Disabled until files exist
248
 
249
  # bigrams: {}
250
+ # trigrams: {}
japanese-nlp-test.ipynb ADDED
@@ -0,0 +1,819 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Japanese NLP Analysis: Comparative Study of UniDic-based Approaches\n",
8
+ "\n",
9
+ "This notebook implements and compares two approaches for Japanese morphological analysis with BCCWJ frequency matching:\n",
10
+ "\n",
11
+ "- **Plan A**: MeCab (fugashi) + UniDic direct pipeline\n",
12
+ "- **Plan B**: GiNZA (Sudachi) + UniDic alignment pipeline\n",
13
+ "\n",
14
+ "Each approach is designed for reproducible setup, implementation, validation, and operational use."
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "markdown",
19
+ "metadata": {},
20
+ "source": [
21
+ "## 1. Environment Setup & Verification\n",
22
+ "\n",
23
+ "First, let's verify and set up our environment with all required packages."
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 10,
29
+ "metadata": {},
30
+ "outputs": [
31
+ {
32
+ "name": "stdout",
33
+ "output_type": "stream",
34
+ "text": [
35
+ "Python version: 3.12.2 (main, Feb 25 2024, 03:55:42) [Clang 17.0.6 ]\n",
36
+ "Working directory: /Users/eguchi/Dropbox/teaching/Tohoku-2025/linguistic-data-analysis-I/2025/notebooks\n",
37
+ "\n",
38
+ "Checking package availability:\n",
39
+ "✓ fugashi\n",
40
+ "✓ unidic\n",
41
+ "✗ unidic-lite - NOT FOUND\n",
42
+ "✓ spacy\n",
43
+ "✓ ginza\n",
44
+ "✗ ja-ginza - NOT FOUND\n",
45
+ "✓ sudachipy\n",
46
+ "✓ pandas\n",
47
+ "✓ numpy\n",
48
+ "✓ matplotlib\n",
49
+ "✓ collections (built-in)\n"
50
+ ]
51
+ }
52
+ ],
53
+ "source": [
54
+ "# Environment verification and setup\n",
55
+ "import sys\n",
56
+ "import subprocess\n",
57
+ "from pathlib import Path\n",
58
+ "\n",
59
+ "print(f\"Python version: {sys.version}\")\n",
60
+ "print(f\"Working directory: {Path.cwd()}\")\n",
61
+ "\n",
62
+ "# Required packages\n",
63
+ "required_packages = [\n",
64
+ " 'fugashi', 'unidic', 'unidic-lite', 'spacy', 'ginza', \n",
65
+ " 'ja-ginza', 'sudachipy', 'pandas', 'numpy', 'matplotlib', 'collections'\n",
66
+ "]\n",
67
+ "\n",
68
+ "print(\"\\nChecking package availability:\")\n",
69
+ "for package in required_packages:\n",
70
+ " try:\n",
71
+ " if package == 'collections':\n",
72
+ " import collections\n",
73
+ " print(f\"✓ {package} (built-in)\")\n",
74
+ " else:\n",
75
+ " __import__(package)\n",
76
+ " print(f\"✓ {package}\")\n",
77
+ " except ImportError:\n",
78
+ " print(f\"✗ {package} - NOT FOUND\")"
79
+ ]
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": 11,
84
+ "metadata": {},
85
+ "outputs": [
86
+ {
87
+ "name": "stdout",
88
+ "output_type": "stream",
89
+ "text": [
90
+ "scipy not available - will use numpy for correlation\n",
91
+ "All imports successful!\n"
92
+ ]
93
+ }
94
+ ],
95
+ "source": [
96
+ "# Import all necessary libraries\n",
97
+ "import pandas as pd\n",
98
+ "import numpy as np\n",
99
+ "import matplotlib.pyplot as plt\n",
100
+ "from collections import Counter, defaultdict\n",
101
+ "import time\n",
102
+ "import warnings\n",
103
+ "from typing import List, Tuple, Dict, Optional\n",
104
+ "\n",
105
+ "# Japanese NLP libraries\n",
106
+ "import fugashi\n",
107
+ "import unidic\n",
108
+ "import spacy\n",
109
+ "from spacy.tokens import Token, Doc\n",
110
+ "\n",
111
+ "# Statistical analysis\n",
112
+ "try:\n",
113
+ " from scipy.stats import spearmanr\n",
114
+ " scipy_available = True\n",
115
+ "except ImportError:\n",
116
+ " print(\"scipy not available - will use numpy for correlation\")\n",
117
+ " scipy_available = False\n",
118
+ "\n",
119
+ "print(\"All imports successful!\")\n",
120
+ "warnings.filterwarnings('ignore')"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": 12,
126
+ "metadata": {},
127
+ "outputs": [
128
+ {
129
+ "name": "stdout",
130
+ "output_type": "stream",
131
+ "text": [
132
+ "UniDic directory: /Users/eguchi/Dropbox/teaching/Tohoku-2025/linguistic-data-analysis-I/.venv/lib/python3.12/site-packages/unidic/dicdir\n",
133
+ "UniDic is properly installed\n",
134
+ "Fugashi + UniDic test successful: テスト\n"
135
+ ]
136
+ }
137
+ ],
138
+ "source": [
139
+ "# Check UniDic installation and download if needed\n",
140
+ "try:\n",
141
+ " print(f\"UniDic directory: {unidic.DICDIR}\")\n",
142
+ " print(\"UniDic is properly installed\")\n",
143
+ "except Exception as e:\n",
144
+ " print(f\"UniDic issue: {e}\")\n",
145
+ " print(\"You may need to run: python -m unidic download\")\n",
146
+ "\n",
147
+ "# Test basic fugashi functionality\n",
148
+ "try:\n",
149
+ " tagger = fugashi.Tagger(f'-d \"{unidic.DICDIR}\"')\n",
150
+ " test_result = list(tagger(\"テスト\"))\n",
151
+ " print(f\"Fugashi + UniDic test successful: {test_result[0].surface}\")\n",
152
+ "except Exception as e:\n",
153
+ " print(f\"Fugashi test failed: {e}\")"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "markdown",
158
+ "metadata": {},
159
+ "source": [
160
+ "## 2. Sample Data Preparation\n",
161
+ "\n",
162
+ "Let's create realistic Japanese text samples for testing our pipelines."
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": 13,
168
+ "metadata": {},
169
+ "outputs": [
170
+ {
171
+ "name": "stdout",
172
+ "output_type": "stream",
173
+ "text": [
174
+ "Sample texts prepared:\n",
175
+ " 1. 彼は日ごろから本を読むのが好きです。\n",
176
+ " 2. ひごろの勉強が大切だと思います。\n",
177
+ " 3. 日頃の努力が実を結ぶでしょう。\n",
178
+ " 4. 彼女は書きあらわすことが得意です。\n",
179
+ " 5. その問題を書き表すのは難しい。\n",
180
+ " 6. 今日は東京オリンピックについて話しましょう。\n",
181
+ " 7. コーヒーを飲んで、呑み込んで、また飲んでしまった。\n",
182
+ " 8. 国際的な協力が必要不可欠です。\n",
183
+ " 9. 機械学習の技術が進歩している。\n",
184
+ "10. 自然言語処理は興味深い分野だ。\n",
185
+ "\n",
186
+ "Extended corpus: 30 texts\n"
187
+ ]
188
+ }
189
+ ],
190
+ "source": [
191
+ "# Sample Japanese texts for testing\n",
192
+ "sample_texts = [\n",
193
+ " \"彼は日ごろから本を読むのが好きです。\",\n",
194
+ " \"ひごろの勉強が大切だと思います。\",\n",
195
+ " \"日頃の努力が実を結ぶでしょう。\",\n",
196
+ " \"彼女は書きあらわすことが得意です。\",\n",
197
+ " \"その問題を書き表すのは難しい。\",\n",
198
+ " \"今日は東京オリンピックについて話しましょう。\",\n",
199
+ " \"コーヒーを飲んで、呑み込んで、また飲んでしまった。\",\n",
200
+ " \"国際的な協力が必要不可欠です。\",\n",
201
+ " \"機械学習の技術が進歩している。\",\n",
202
+ " \"自然言語処理は興味深い分野だ。\"\n",
203
+ "]\n",
204
+ "\n",
205
+ "print(\"Sample texts prepared:\")\n",
206
+ "for i, text in enumerate(sample_texts, 1):\n",
207
+ " print(f\"{i:2d}. {text}\")\n",
208
+ "\n",
209
+ "# Create a larger corpus by repeating and slightly modifying texts\n",
210
+ "extended_corpus = sample_texts * 3 # Simulate frequency variations\n",
211
+ "print(f\"\\nExtended corpus: {len(extended_corpus)} texts\")"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": 14,
217
+ "metadata": {},
218
+ "outputs": [
219
+ {
220
+ "name": "stdout",
221
+ "output_type": "stream",
222
+ "text": [
223
+ "Mock BCCWJ frequency data:\n",
224
+ " lemma reading pos freq_bccwj key\n",
225
+ "0 日頃 ヒゴロ 名詞 1250 (日頃, ヒゴロ, 名詞)\n",
226
+ "1 本 ホン 名詞 8500 (本, ホン, 名詞)\n",
227
+ "2 読む ヨム 動詞 3200 (読む, ヨム, 動詞)\n",
228
+ "3 好き スキ 形容動詞 2100 (好き, スキ, 形容動詞)\n",
229
+ "4 勉強 ベンキョウ 名詞 4200 (勉強, ベンキョウ, 名詞)\n",
230
+ "5 大切 タイセツ 形容動詞 1800 (大切, タイセツ, 形容動詞)\n",
231
+ "6 思う オモウ 動詞 9500 (思う, オモウ, 動詞)\n",
232
+ "7 努力 ドリョク 名詞 2200 (努力, ドリョク, 名詞)\n",
233
+ "8 実 ミ 名詞 1100 (実, ミ, 名詞)\n",
234
+ "9 結ぶ ムスブ 動詞 800 (結ぶ, ムスブ, 動詞)\n",
235
+ "\n",
236
+ "Total entries: 25\n"
237
+ ]
238
+ }
239
+ ],
240
+ "source": [
241
+ "# Create mock BCCWJ frequency data for testing\n",
242
+ "# In real usage, this would be loaded from an actual BCCWJ frequency file\n",
243
+ "\n",
244
+ "mock_bccwj_data = [\n",
245
+ " ('日頃', 'ヒゴロ', '名詞', 1250),\n",
246
+ " ('本', 'ホン', '名詞', 8500),\n",
247
+ " ('読む', 'ヨム', '動詞', 3200),\n",
248
+ " ('好き', 'スキ', '形容動詞', 2100),\n",
249
+ " ('勉強', 'ベンキョウ', '名詞', 4200),\n",
250
+ " ('大切', 'タイセツ', '形容動詞', 1800),\n",
251
+ " ('思う', 'オモウ', '動詞', 9500),\n",
252
+ " ('努力', 'ドリョク', '名詞', 2200),\n",
253
+ " ('実', 'ミ', '名詞', 1100),\n",
254
+ " ('結ぶ', 'ムスブ', '動詞', 800),\n",
255
+ " ('書く', 'カク', '動詞', 4100),\n",
256
+ " ('表す', 'アラワス', '動詞', 1500),\n",
257
+ " ('得意', 'トクイ', '形容動詞', 1300),\n",
258
+ " ('問題', 'モンダイ', '名詞', 6200),\n",
259
+ " ('難しい', 'ムズカシイ', '形容詞', 3800),\n",
260
+ " ('今日', 'キョウ', '名詞', 5500),\n",
261
+ " ('東京', 'トウキョウ', '名詞', 4800),\n",
262
+ " ('話す', 'ハナス', '動詞', 3600),\n",
263
+ " ('飲む', 'ノム', '動詞', 2400),\n",
264
+ " ('呑む', 'ノム', '動詞', 150),\n",
265
+ " ('国際', 'コクサイ', '名詞', 2800),\n",
266
+ " ('協力', 'キョウリョク', '名詞', 1900),\n",
267
+ " ('必要', 'ヒツヨウ', '形容動詞', 4500),\n",
268
+ " ('技術', 'ギジュツ', '名詞', 3900),\n",
269
+ " ('進歩', 'シンポ', '名詞', 1100)\n",
270
+ "]\n",
271
+ "\n",
272
+ "# Create DataFrame\n",
273
+ "df_bccwj = pd.DataFrame(mock_bccwj_data, columns=['lemma', 'reading', 'pos', 'freq_bccwj'])\n",
274
+ "df_bccwj['key'] = list(zip(df_bccwj.lemma, df_bccwj.reading, df_bccwj.pos))\n",
275
+ "\n",
276
+ "print(\"Mock BCCWJ frequency data:\")\n",
277
+ "print(df_bccwj.head(10))\n",
278
+ "print(f\"\\nTotal entries: {len(df_bccwj)}\")"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "markdown",
283
+ "metadata": {},
284
+ "source": [
285
+ "## 3. Plan A: MeCab (fugashi) + UniDic Direct Pipeline\n",
286
+ "\n",
287
+ "### A-1 to A-3: Setup and Configuration\n",
288
+ "\n",
289
+ "UniDic provides the morphological analysis system used in BCCWJ, making it ideal for frequency matching."
290
+ ]
291
+ },
292
+ {
293
+ "cell_type": "code",
294
+ "execution_count": 14,
295
+ "metadata": {},
296
+ "outputs": [
297
+ {
298
+ "name": "stdout",
299
+ "output_type": "stream",
300
+ "text": [
301
+ "Initializing Plan A: fugashi + UniDic pipeline\n",
302
+ "Tagger initialized with UniDic dictionary: /Users/eguchi/Dropbox/teaching/Tohoku-2025/linguistic-data-analysis-I/.venv/lib/python3.12/site-packages/unidic/dicdir\n",
303
+ "\n",
304
+ "Test analysis of '日ごろから勉強している。':\n",
305
+ " 日ごろ -> 日頃 [名,詞,,,普,通,名,詞,,,副,詞,可,能,,,*]\n",
306
+ " から -> から [助,詞,,,格,助,詞,,,*,,,*]\n",
307
+ " 勉強 -> 勉強 [名,詞,,,普,通,名,詞,,,サ,変,可,能,,,*]\n",
308
+ " し -> 為る [動,詞,,,非,自,立,可,能,,,*,,,*]\n",
309
+ " て -> て [助,詞,,,接,続,助,詞,,,*,,,*]\n",
310
+ " いる -> 居る [動,詞,,,非,自,立,可,能,,,*,,,*]\n",
311
+ " 。 -> 。 [補,助,記,号,,,句,点,,,*,,,*]\n"
312
+ ]
313
+ }
314
+ ],
315
+ "source": [
316
+ "# A-3: Initialize fugashi with UniDic\n",
317
+ "print(\"Initializing Plan A: fugashi + UniDic pipeline\")\n",
318
+ "\n",
319
+ "# Initialize tagger with explicit UniDic path\n",
320
+ "tagger_a = fugashi.Tagger(f'-d \"{unidic.DICDIR}\"')\n",
321
+ "print(f\"Tagger initialized with UniDic dictionary: {unidic.DICDIR}\")\n",
322
+ "\n",
323
+ "# Test the tagger\n",
324
+ "test_text = \"日ごろから勉強している。\"\n",
325
+ "tokens = list(tagger_a(test_text))\n",
326
+ "print(f\"\\nTest analysis of '{test_text}':\")\n",
327
+ "for token in tokens:\n",
328
+ " print(f\" {token.surface} -> {token.feature.lemma} [{','.join(token.pos)}]\")"
329
+ ]
330
+ },
331
+ {
332
+ "cell_type": "code",
333
+ "execution_count": 18,
334
+ "metadata": {},
335
+ "outputs": [
336
+ {
337
+ "name": "stdout",
338
+ "output_type": "stream",
339
+ "text": [
340
+ "Extracted keys from '日ごろから勉強している。':\n",
341
+ " (日ごろ, ヒゴロ, 名)\n",
342
+ " (から, カラ, 助)\n",
343
+ " (勉強, ベンキョー, 名)\n",
344
+ " (する, スル, 動)\n",
345
+ " (て, テ, 助)\n",
346
+ " (いる, イル, 動)\n",
347
+ " (。, *, 補)\n"
348
+ ]
349
+ }
350
+ ],
351
+ "source": [
352
+ "# A-4: Morphological field extraction function\n",
353
+ "def iter_lemma_keys_plan_a(text: str, tagger) -> List[Tuple[str, str, str]]:\n",
354
+ " \"\"\"\n",
355
+ " Extract (lemma, reading, pos_major) tuples from text using UniDic.\n",
356
+ " \n",
357
+ " Args:\n",
358
+ " text: Input Japanese text\n",
359
+ " tagger: fugashi Tagger instance\n",
360
+ " \n",
361
+ " Returns:\n",
362
+ " List of (dictionary_form, reading, pos_major) tuples\n",
363
+ " \"\"\"\n",
364
+ " keys = []\n",
365
+ " for m in tagger(text):\n",
366
+ " if m.surface.strip(): # Skip empty tokens\n",
367
+ " # UniDic POS is hierarchical; use major category (pos[0])\n",
368
+ " pos_major = m.pos[0] if m.pos else 'UNKNOWN'\n",
369
+ " lemma = m.feature[10] if m.feature[10] else m.surface\n",
370
+ " reading = m.feature[11] if m.feature[11] else ''\n",
371
+ " keys.append((lemma, reading, pos_major))\n",
372
+ " return keys\n",
373
+ "\n",
374
+ "# Test the extraction function\n",
375
+ "test_keys = iter_lemma_keys_plan_a(test_text, tagger_a)\n",
376
+ "print(f\"Extracted keys from '{test_text}':\")\n",
377
+ "for lemma, reading, pos in test_keys:\n",
378
+ " print(f\" ({lemma}, {reading}, {pos})\")"
379
+ ]
380
+ },
381
+ {
382
+ "cell_type": "code",
383
+ "execution_count": 19,
384
+ "metadata": {},
385
+ "outputs": [
386
+ {
387
+ "name": "stdout",
388
+ "output_type": "stream",
389
+ "text": [
390
+ "Extracted keys from '日ごろから勉強している。' (fixed version):\n",
391
+ " (日ごろ, ヒゴロ, 名)\n",
392
+ " (から, カラ, 助)\n",
393
+ " (勉強, ベンキョー, 名)\n",
394
+ " (する, シ, 動)\n",
395
+ " (て, テ, 助)\n",
396
+ " (いる, イル, 動)\n",
397
+ " (。, *, 補)\n"
398
+ ]
399
+ }
400
+ ],
401
+ "source": [
402
+ "# Fixed version with proper fugashi/UniDic attribute handling\n",
403
+ "def iter_lemma_keys_fixed(text: str, tagger) -> List[Tuple[str, str, str]]:\n",
404
+ " \"\"\"\n",
405
+ " Extract (lemma, reading, pos_major) tuples from text using UniDic.\n",
406
+ " Fixed version that handles fugashi attribute variations.\n",
407
+ " \"\"\"\n",
408
+ " keys = []\n",
409
+ " for m in tagger(text):\n",
410
+ " if m.surface.strip(): # Skip empty tokens\n",
411
+ " # UniDic POS is hierarchical; use major category (pos[0])\n",
412
+ " pos_major = m.pos[0] if m.pos else 'UNKNOWN'\n",
413
+ " \n",
414
+ " # Handle different attribute names for lemma\n",
415
+ " try:\n",
416
+ " lemma = m.lemma if hasattr(m, 'lemma') else m.feature[10]\n",
417
+ " except:\n",
418
+ " lemma = m.surface # fallback\n",
419
+ " \n",
420
+ " # Handle different attribute names for reading\n",
421
+ " try:\n",
422
+ " reading = m.feature[9] if len(m.feature) > 9 else ''\n",
423
+ " except:\n",
424
+ " reading = '' # fallback\n",
425
+ " \n",
426
+ " keys.append((lemma, reading, pos_major))\n",
427
+ " return keys\n",
428
+ "\n",
429
+ "# Use the fixed function\n",
430
+ "iter_lemma_keys_plan_a = iter_lemma_keys_fixed\n",
431
+ "\n",
432
+ "# Test the fixed function\n",
433
+ "test_keys = iter_lemma_keys_plan_a(test_text, tagger_a)\n",
434
+ "print(f\"Extracted keys from '{test_text}' (fixed version):\")\n",
435
+ "for lemma, reading, pos in test_keys:\n",
436
+ " print(f\" ({lemma}, {reading}, {pos})\")"
437
+ ]
438
+ },
439
+ {
440
+ "cell_type": "code",
441
+ "execution_count": 20,
442
+ "metadata": {},
443
+ "outputs": [
444
+ {
445
+ "name": "stdout",
446
+ "output_type": "stream",
447
+ "text": [
448
+ "Analyzing 30 texts with Plan A...\n",
449
+ "\n",
450
+ "Plan A Results (top 15):\n",
451
+ " lemma reading pos freq_local freq_bccwj\n",
452
+ "11 。 * 補 30 NaN\n",
453
+ "8 が ガ 助 18 NaN\n",
454
+ "1 は ワ 助 15 NaN\n",
455
+ "7 の ノ 助 15 NaN\n",
456
+ "5 を オ 助 12 NaN\n",
457
+ "10 です デス 助 9 NaN\n",
458
+ "42 で デ 助 9 NaN\n",
459
+ "15 だ ダ 助 6 NaN\n",
460
+ "37 て テ 助 6 NaN\n",
461
+ "41 飲む ノン 動 6 NaN\n",
462
+ "43 、 * 補 6 NaN\n",
463
+ "48 国際 コクサイ 名 3 NaN\n",
464
+ "47 た タ 助 3 NaN\n",
465
+ "46 しまう シマッ 動 3 NaN\n",
466
+ "0 彼 カレ 代 3 NaN\n"
467
+ ]
468
+ }
469
+ ],
470
+ "source": [
471
+ "# A-5: Frequency analysis with BCCWJ matching\n",
472
+ "def analyze_corpus_plan_a(corpus: List[str], tagger, bccwj_df: pd.DataFrame) -> pd.DataFrame:\n",
473
+ " \"\"\"Analyze corpus using Plan A and match with BCCWJ frequencies.\"\"\"\n",
474
+ " freq = Counter()\n",
475
+ " \n",
476
+ " print(f\"Analyzing {len(corpus)} texts with Plan A...\")\n",
477
+ " for text in corpus:\n",
478
+ " for key in iter_lemma_keys_plan_a(text, tagger):\n",
479
+ " freq[key] += 1\n",
480
+ " \n",
481
+ " # Convert to DataFrame\n",
482
+ " rows = []\n",
483
+ " for (lemma, reading, pos), count in freq.items():\n",
484
+ " rows.append((lemma, reading, pos, count))\n",
485
+ " \n",
486
+ " df_local = pd.DataFrame(rows, columns=['lemma', 'reading', 'pos', 'freq_local'])\n",
487
+ " df_local['key'] = list(zip(df_local.lemma, df_local.reading, df_local.pos))\n",
488
+ " \n",
489
+ " # Merge with BCCWJ data\n",
490
+ " merged = df_local.merge(bccwj_df[['key', 'freq_bccwj']], on='key', how='left')\n",
491
+ " \n",
492
+ " return merged.sort_values('freq_local', ascending=False)\n",
493
+ "\n",
494
+ "# Run Plan A analysis\n",
495
+ "results_a = analyze_corpus_plan_a(extended_corpus, tagger_a, df_bccwj)\n",
496
+ "print(f\"\\nPlan A Results (top 15):\")\n",
497
+ "print(results_a.head(15)[['lemma', 'reading', 'pos', 'freq_local', 'freq_bccwj']])"
498
+ ]
499
+ },
500
+ {
501
+ "cell_type": "code",
502
+ "execution_count": 21,
503
+ "metadata": {},
504
+ "outputs": [
505
+ {
506
+ "name": "stdout",
507
+ "output_type": "stream",
508
+ "text": [
509
+ "Plan A Evaluation Metrics:\n",
510
+ " type_coverage: 0.000\n",
511
+ " token_coverage: 0.000\n",
512
+ " correlation: None\n",
513
+ " p_value: None\n",
514
+ " total_types: 66\n",
515
+ " matched_types: 0\n",
516
+ " total_tokens: 297\n",
517
+ " matched_tokens: 0\n"
518
+ ]
519
+ }
520
+ ],
521
+ "source": [
522
+ "# A-6: Evaluation metrics for Plan A\n",
523
+ "def calculate_metrics(df: pd.DataFrame) -> Dict[str, float]:\n",
524
+ " \"\"\"Calculate coverage and correlation metrics.\"\"\"\n",
525
+ " # Coverage: percentage of local tokens found in BCCWJ\n",
526
+ " matched = df.dropna(subset=['freq_bccwj'])\n",
527
+ " coverage = len(matched) / len(df) * 100\n",
528
+ " \n",
529
+ " # Token coverage (by frequency)\n",
530
+ " total_tokens = df['freq_local'].sum()\n",
531
+ " matched_tokens = matched['freq_local'].sum()\n",
532
+ " token_coverage = matched_tokens / total_tokens * 100\n",
533
+ " \n",
534
+ " # Spearman correlation for matched items\n",
535
+ " if len(matched) > 1:\n",
536
+ " if scipy_available:\n",
537
+ " correlation, p_value = spearmanr(matched['freq_local'], matched['freq_bccwj'])\n",
538
+ " else:\n",
539
+ " correlation = np.corrcoef(matched['freq_local'].rank(), matched['freq_bccwj'].rank())[0,1]\n",
540
+ " p_value = None\n",
541
+ " else:\n",
542
+ " correlation, p_value = None, None\n",
543
+ " \n",
544
+ " return {\n",
545
+ " 'type_coverage': coverage,\n",
546
+ " 'token_coverage': token_coverage,\n",
547
+ " 'correlation': correlation,\n",
548
+ " 'p_value': p_value,\n",
549
+ " 'total_types': len(df),\n",
550
+ " 'matched_types': len(matched),\n",
551
+ " 'total_tokens': total_tokens,\n",
552
+ " 'matched_tokens': matched_tokens\n",
553
+ " }\n",
554
+ "\n",
555
+ "metrics_a = calculate_metrics(results_a)\n",
556
+ "print(\"Plan A Evaluation Metrics:\")\n",
557
+ "for key, value in metrics_a.items():\n",
558
+ " if isinstance(value, float) and value is not None:\n",
559
+ " print(f\" {key}: {value:.3f}\")\n",
560
+ " else:\n",
561
+ " print(f\" {key}: {value}\")"
562
+ ]
563
+ },
564
+ {
565
+ "cell_type": "markdown",
566
+ "metadata": {},
567
+ "source": [
568
+ "# Using Fugashi"
569
+ ]
570
+ },
571
+ {
572
+ "cell_type": "code",
573
+ "execution_count": 17,
574
+ "metadata": {},
575
+ "outputs": [
576
+ {
577
+ "name": "stdout",
578
+ "output_type": "stream",
579
+ "text": [
580
+ "彼 [('彼', '代名詞', '代')]\n",
581
+ "は [('は', '助詞', '助')]\n",
582
+ "日ごろ [('日頃', '名詞', '名')]\n",
583
+ "本 [('本', '名詞', '名')]\n",
584
+ "を [('を', '助詞', '助')]\n",
585
+ "読む [('読む', '動詞', '動')]\n",
586
+ "。 [('。', '補助記号', '補')]\n"
587
+ ]
588
+ }
589
+ ],
590
+ "source": [
591
+ "import fugashi, unidic\n",
592
+ "from spacy.tokens import Token\n",
593
+ "tagger = fugashi.Tagger()\n",
594
+ "tagger = fugashi.Tagger(f'-d \"{unidic.DICDIR}\"')\n",
595
+ "\n",
596
+ "if not Token.has_extension(\"unidic_lemmas\"):\n",
597
+ " Token.set_extension(\"unidic_lemmas\", default=None)\n",
598
+ "\n",
599
+ "def enrich_with_unidic(doc):\n",
600
+ " text = doc.text\n",
601
+ " # GiNZA token start index -> token\n",
602
+ " start_map = {tok.idx: tok for tok in doc}\n",
603
+ " cursor = 0\n",
604
+ " for m in tagger(text):\n",
605
+ " surf = m.surface\n",
606
+ " start = text.find(surf, cursor)\n",
607
+ " if start < 0:\n",
608
+ " continue\n",
609
+ " cursor = start + len(surf)\n",
610
+ " tok = start_map.get(start)\n",
611
+ " if tok:\n",
612
+ " if tok._.unidic_lemmas is None:\n",
613
+ " tok._.unidic_lemmas = []\n",
614
+ " tok._.unidic_lemmas.append(\n",
615
+ " (m.feature.lemma, m.feature.pos1, m.pos[0])\n",
616
+ " )\n",
617
+ " return doc\n",
618
+ "\n",
619
+ "doc = enrich_with_unidic(doc)\n",
620
+ "for t in doc:\n",
621
+ " print(t.text, t._.unidic_lemmas)"
622
+ ]
623
+ },
624
+ {
625
+ "cell_type": "code",
626
+ "execution_count": 5,
627
+ "metadata": {},
628
+ "outputs": [],
629
+ "source": [
630
+ "text = \"日頃からの日ごろをてっていする。\""
631
+ ]
632
+ },
633
+ {
634
+ "cell_type": "code",
635
+ "execution_count": 6,
636
+ "metadata": {},
637
+ "outputs": [],
638
+ "source": [
639
+ "import spacy\n",
640
+ "from fugashi import Tagger\n",
641
+ "import unidic # or unidic_lite\n",
642
+ "\n",
643
+ "nlp = spacy.load(\"ja_ginza\")\n",
644
+ "tagger = Tagger(f'-d \"{unidic.DICDIR}\"') # フル UniDic\n",
645
+ "doc = nlp(text)\n",
646
+ "mecab_tokens = list(tagger(text))\n",
647
+ "# → 文字オフセットでアライメントして doc の token に UniDic 情報を付与"
648
+ ]
649
+ },
650
+ {
651
+ "cell_type": "code",
652
+ "execution_count": 7,
653
+ "metadata": {},
654
+ "outputs": [
655
+ {
656
+ "data": {
657
+ "text/plain": [
658
+ "[日頃, から, の, 日ごろ, を, てってい, する, 。]"
659
+ ]
660
+ },
661
+ "execution_count": 7,
662
+ "metadata": {},
663
+ "output_type": "execute_result"
664
+ }
665
+ ],
666
+ "source": [
667
+ "mecab_tokens"
668
+ ]
669
+ },
670
+ {
671
+ "cell_type": "code",
672
+ "execution_count": 8,
673
+ "metadata": {},
674
+ "outputs": [
675
+ {
676
+ "name": "stdout",
677
+ "output_type": "stream",
678
+ "text": [
679
+ "<fugashi.fugashi.Tagger object at 0x1183bad80>\n"
680
+ ]
681
+ }
682
+ ],
683
+ "source": [
684
+ "print(tagger)"
685
+ ]
686
+ },
687
+ {
688
+ "cell_type": "code",
689
+ "execution_count": 9,
690
+ "metadata": {},
691
+ "outputs": [
692
+ {
693
+ "name": "stdout",
694
+ "output_type": "stream",
695
+ "text": [
696
+ "Using unidic at: /Users/eguchi/Dropbox/teaching/Tohoku-2025/linguistic-data-analysis-I/.venv/lib/python3.12/site-packages/unidic/dicdir\n"
697
+ ]
698
+ }
699
+ ],
700
+ "source": [
701
+ "import unidic\n",
702
+ "print(\"Using unidic at:\", unidic.DICDIR)"
703
+ ]
704
+ },
705
+ {
706
+ "cell_type": "code",
707
+ "execution_count": 10,
708
+ "metadata": {},
709
+ "outputs": [
710
+ {
711
+ "name": "stdout",
712
+ "output_type": "stream",
713
+ "text": [
714
+ "feature_len: 29\n"
715
+ ]
716
+ }
717
+ ],
718
+ "source": [
719
+ "sample = next(iter(tagger(\"テスト\")))\n",
720
+ "print(\"feature_len:\", len(sample.feature))\n",
721
+ "# 17 = unidic-lite (2.1.2), 29前後 = フル UniDic 3.x"
722
+ ]
723
+ },
724
+ {
725
+ "cell_type": "code",
726
+ "execution_count": null,
727
+ "metadata": {},
728
+ "outputs": [
729
+ {
730
+ "name": "stdout",
731
+ "output_type": "stream",
732
+ "text": [
733
+ "['dictionary_info']\n"
734
+ ]
735
+ }
736
+ ],
737
+ "source": [
738
+ "print([a for a in dir(tagger) if 'dic' in a.lower()])"
739
+ ]
740
+ },
741
+ {
742
+ "cell_type": "code",
743
+ "execution_count": 12,
744
+ "metadata": {},
745
+ "outputs": [
746
+ {
747
+ "name": "stdout",
748
+ "output_type": "stream",
749
+ "text": [
750
+ "Available attrs: ['char_type', 'feature', 'feature_raw', 'is_unk', 'length', 'pos', 'posid', 'rlength', 'stat', 'surface', 'white_space']\n"
751
+ ]
752
+ }
753
+ ],
754
+ "source": [
755
+ "import fugashi\n",
756
+ "from fugashi import Tagger\n",
757
+ "\n",
758
+ "tagger = Tagger() # まずオプションなし\n",
759
+ "m = next(iter(tagger(\"日ごろ\")))\n",
760
+ "print(\"Available attrs:\", [a for a in dir(m) if not a.startswith('_')][:25])"
761
+ ]
762
+ },
763
+ {
764
+ "cell_type": "code",
765
+ "execution_count": 13,
766
+ "metadata": {},
767
+ "outputs": [
768
+ {
769
+ "name": "stdout",
770
+ "output_type": "stream",
771
+ "text": [
772
+ "Tagger repr: <fugashi.fugashi.Tagger object at 0x13f33b5c0>\n",
773
+ "surface: 日ごろ\n",
774
+ "feature_len: 29\n",
775
+ "raw feature: UnidicFeatures29(pos1='名詞', pos2='普通名詞', pos3='副詞可能', pos4='*', cType='*', cForm='*', lForm='ヒゴロ', lemma='日頃', orth='日ごろ', pron='ヒゴロ', orthBase='日ごろ', pronBase='ヒゴロ', goshu='和', iType='*', iForm='*', fType='*', fForm='*', iConType='*', fConType='*', type='体', kana='ヒゴロ', kanaBase='ヒゴロ', form='ヒゴロ', formBase='ヒゴロ', aType='0', aConType='C2', aModType='*', lid='8605061500510720', lemma_id='31305')\n"
776
+ ]
777
+ }
778
+ ],
779
+ "source": [
780
+ "import fugashi\n",
781
+ "t = fugashi.Tagger()\n",
782
+ "print(\"Tagger repr:\", t) # ここに 'ipa' や 'unidic' などヒントが出ることが多い\n",
783
+ "\n",
784
+ "w = next(iter(t(\"日ごろ\")))\n",
785
+ "print(\"surface:\", w.surface)\n",
786
+ "print(\"feature_len:\", len(w.feature))\n",
787
+ "print(\"raw feature:\", w.feature) # まず 1語分"
788
+ ]
789
+ },
790
+ {
791
+ "cell_type": "code",
792
+ "execution_count": null,
793
+ "metadata": {},
794
+ "outputs": [],
795
+ "source": []
796
+ }
797
+ ],
798
+ "metadata": {
799
+ "kernelspec": {
800
+ "display_name": "Python 3",
801
+ "language": "python",
802
+ "name": "python3"
803
+ },
804
+ "language_info": {
805
+ "codemirror_mode": {
806
+ "name": "ipython",
807
+ "version": 3
808
+ },
809
+ "file_extension": ".py",
810
+ "mimetype": "text/x-python",
811
+ "name": "python",
812
+ "nbconvert_exporter": "python",
813
+ "pygments_lexer": "ipython3",
814
+ "version": "3.12.0"
815
+ }
816
+ },
817
+ "nbformat": 4,
818
+ "nbformat_minor": 4
819
+ }
pyproject.toml CHANGED
@@ -18,4 +18,8 @@ dependencies = [
18
  "ja-core-news-md @ https://github.com/explosion/spacy-models/releases/download/ja_core_news_md-3.7.0/ja_core_news_md-3.7.0-py3-none-any.whl",
19
  "ja-core-news-trf @ https://github.com/explosion/spacy-models/releases/download/ja_core_news_trf-3.7.2/ja_core_news_trf-3.7.2-py3-none-any.whl",
20
  "huggingface-hub[cli]>=0.33.4",
 
 
 
 
21
  ]
 
18
  "ja-core-news-md @ https://github.com/explosion/spacy-models/releases/download/ja_core_news_md-3.7.0/ja_core_news_md-3.7.0-py3-none-any.whl",
19
  "ja-core-news-trf @ https://github.com/explosion/spacy-models/releases/download/ja_core_news_trf-3.7.2/ja_core_news_trf-3.7.2-py3-none-any.whl",
20
  "huggingface-hub[cli]>=0.33.4",
21
+ "chardet>=5.2.0",
22
+ "fugashi>=1.3.0",
23
+ "unidic>=1.1.0",
24
+ "ipykernel>=6.29.5",
25
  ]
resources/reference_lists/ja/BCCWJ_frequencylist_luw2_ver1_1.tsv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abdfe3f5c6383be148809f615834a8f8890d6acab1415428ca350cff08438908
3
+ size 355289031
resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1 copy.tsv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51c38228ac27858cf3fa35c71cddd54f2290b86f9ca5e705e360b2f849350179
3
+ size 5123687
resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59cc5d3e0961f130b073a17736e8ff4c5f0f63bd759e27e3c7cd0d96e79f4443
3
+ size 76573321
resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84fa50dd87a9094f85006c81d78d14afab54bfad55e4a7137c1beab89b7200a4
3
+ size 17713132
test/test_app.py CHANGED
@@ -4,13 +4,15 @@ Basic test script to validate the application components.
4
 
5
  import sys
6
  import os
7
- sys.path.append(os.path.join(os.path.dirname(__file__), 'backend'))
 
 
8
 
9
  def test_imports():
10
  """Test that all required modules can be imported."""
11
  try:
12
- from lexical_sophistication import LexicalSophisticationAnalyzer
13
- from pos_parser import POSParser
14
  print("✓ Backend modules imported successfully")
15
  return True
16
  except ImportError as e:
@@ -20,8 +22,8 @@ def test_imports():
20
  def test_basic_functionality():
21
  """Test basic functionality with SpaCy models."""
22
  try:
23
- from lexical_sophistication import LexicalSophisticationAnalyzer
24
- from pos_parser import POSParser
25
 
26
  print("Testing basic class instantiation...")
27
  print("Note: This will fail without SpaCy models installed")
@@ -64,4 +66,4 @@ def main():
64
  return True
65
 
66
  if __name__ == "__main__":
67
- main()
 
4
 
5
  import sys
6
  import os
7
+
8
+ # Add the parent directory to the Python path for imports
9
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
10
 
11
  def test_imports():
12
  """Test that all required modules can be imported."""
13
  try:
14
+ from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
15
+ from text_analyzer.pos_parser import POSParser
16
  print("✓ Backend modules imported successfully")
17
  return True
18
  except ImportError as e:
 
22
  def test_basic_functionality():
23
  """Test basic functionality with SpaCy models."""
24
  try:
25
+ from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
26
+ from text_analyzer.pos_parser import POSParser
27
 
28
  print("Testing basic class instantiation...")
29
  print("Note: This will fail without SpaCy models installed")
 
66
  return True
67
 
68
  if __name__ == "__main__":
69
+ main()
test/test_functionality.py CHANGED
@@ -6,10 +6,12 @@ Extended test script to validate application functionality.
6
  import sys
7
  import os
8
  import tempfile
9
- sys.path.append(os.path.join(os.path.dirname(__file__), 'backend'))
10
 
11
- from lexical_sophistication import LexicalSophisticationAnalyzer
12
- from pos_parser import POSParser
 
 
 
13
  import pandas as pd
14
 
15
  def test_lexical_sophistication():
@@ -122,4 +124,4 @@ def main():
122
 
123
  if __name__ == "__main__":
124
  success = main()
125
- sys.exit(0 if success else 1)
 
6
  import sys
7
  import os
8
  import tempfile
 
9
 
10
+ # Add the parent directory to the Python path for imports
11
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
12
+
13
+ from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
14
+ from text_analyzer.pos_parser import POSParser
15
  import pandas as pd
16
 
17
  def test_lexical_sophistication():
 
124
 
125
  if __name__ == "__main__":
126
  success = main()
127
+ sys.exit(0 if success else 1)
test/test_multi_index.py CHANGED
@@ -3,9 +3,8 @@
3
  import sys
4
  import os
5
  import tempfile
6
- sys.path.append(os.path.join(os.path.dirname(__file__), 'backend'))
7
 
8
- from lexical_sophistication import LexicalSophisticationAnalyzer
9
 
10
  def test_multi_index_functionality():
11
  print("Testing multi-index functionality...")
@@ -130,4 +129,4 @@ that,,,7,12279,500,12063.320,1.000"""
130
  traceback.print_exc()
131
 
132
  if __name__ == "__main__":
133
- test_multi_index_functionality()
 
3
  import sys
4
  import os
5
  import tempfile
 
6
 
7
+ from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
8
 
9
  def test_multi_index_functionality():
10
  print("Testing multi-index functionality...")
 
129
  traceback.print_exc()
130
 
131
  if __name__ == "__main__":
132
+ test_multi_index_functionality()
test/test_yaml_config.py CHANGED
@@ -5,9 +5,8 @@ Test script to validate YAML configuration system.
5
 
6
  import sys
7
  import os
8
- sys.path.append(os.path.join(os.path.dirname(__file__), 'backend'))
9
 
10
- from lexical_sophistication import LexicalSophisticationAnalyzer
11
  import yaml
12
  from pathlib import Path
13
 
@@ -153,4 +152,4 @@ def main():
153
 
154
  if __name__ == "__main__":
155
  success = main()
156
- sys.exit(0 if success else 1)
 
5
 
6
  import sys
7
  import os
 
8
 
9
+ from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
10
  import yaml
11
  from pathlib import Path
12
 
 
152
 
153
  if __name__ == "__main__":
154
  success = main()
155
+ sys.exit(0 if success else 1)
test_frequency_flexible.py ADDED
@@ -0,0 +1 @@
 
 
1
+ \n#!/usr/bin/env python3\n\"\"\"\nTest script for the enhanced FrequencyAnalyzer with flexible column mapping.\nThis demonstrates the new functionality with sample data.\n\"\"\"\n\nimport pandas as pd\nimport numpy as np\nfrom io import StringIO\nimport sys\nimport os\n\n# Add the text_analyzer to path\nsys.path.append('text_analyzer')\n\nfrom frequency_analyzer import FrequencyAnalyzer\n\ndef create_sample_data():\n \"\"\"Create sample frequency data in the new format.\"\"\"\n sample_data = \"\"\"rank\tlForm\tlemma\tpos\tsubLemma\twType\tfrequency\tpmw\tPB_frequency\tPB_pmw\tPM_frequency\tPM_pmw\tcore_frequency\tcore_pmw\n1\tノ\tの\t助詞-格助詞\t\t和\t5061558\t48383.9\t1473494\t51791.5\t208748\t47179.3\t1398950\t51737.2\n2\tニ\tに\t助詞-格助詞\t\t和\t3576558\t34188.7\t1036653\t36437.1\t140178\t31681.7\t985766\t36456.5\n3\tテ\tて\t助詞-接続助詞\t\t和\t3493117\t33391.0\t948430\t33336.1\t124241\t28079.8\t902379\t33372.6\n4\tハ\tは\t助詞-係助詞\t\t和\t3289932\t31448.8\t945084\t33218.5\t129378\t29240.8\t899776\t33276.3\n5\tガ\tが\t助詞-格助詞\t\t和\t2518164\t24070.6\t743621\t26131.8\t103456\t23390.2\t707331\t26139.9\"\"\"\n return sample_data\n\ndef test_file_format_detection():\n \"\"\"Test file format detection functionality.\"\"\"\n print(\"=== Testing File Format Detection ===\")\n \n analyzer = FrequencyAnalyzer(file_size_limit_mb=300)\n sample_data = create_sample_data()\n \n format_info = analyzer.detect_file_format(sample_data)\n print(f\"Detected separator: '{format_info['separator']}'\")\n print(f\"Has header: {format_info['has_header']}\")\n print(f\"Estimated columns: {format_info['estimated_columns']}\")\n print(f\"Sample lines: {format_info['sample_lines'][:2]}\")\n print()\n\ndef test_column_detection():\n \"\"\"Test column detection and categorization.\"\"\"\n print(\"=== Testing Column Detection ===\")\n \n analyzer = FrequencyAnalyzer()\n sample_data = create_sample_data()\n \n # Read sample data for column detection\n df = pd.read_csv(StringIO(sample_data), sep='\\t')\n detected_cols = 
analyzer.detect_columns(df)\n \n print(\"Detected columns:\")\n for category, columns in detected_cols.items():\n print(f\" {category}: {columns}\")\n print()\n\ndef test_flexible_loading():\n \"\"\"Test flexible data loading with column configuration.\"\"\"\n print(\"=== Testing Flexible Data Loading ===\")\n \n analyzer = FrequencyAnalyzer()\n sample_data = create_sample_data()\n \n # Test with different column configurations\n configs = [\n {\n 'word_column': 'lForm',\n 'frequency_column': 'frequency',\n 'pos_column': 'pos'\n },\n {\n 'word_column': 'lemma',\n 'frequency_column': 'pmw',\n 'pos_column': 'pos'\n },\n {\n 'word_column': 'lForm',\n 'frequency_column': 'PB_frequency'\n }\n ]\n \n for i, config in enumerate(configs, 1):\n print(f\"Configuration {i}: {config}\")\n try:\n df = analyzer.load_frequency_data(sample_data, config)\n print(f\" ✓ Successfully loaded {len(df)} entries\")\n print(f\" ✓ Available frequency columns: {analyzer.get_available_frequency_columns()}\")\n print(f\" ✓ Available word columns: {analyzer.get_available_word_columns()}\")\n except Exception as e:\n print(f\" ✗ Error: {e}\")\n print()\n\ndef test_multi_frequency_analysis():\n \"\"\"Test multi-frequency analysis functionality.\"\"\"\n print(\"=== Testing Multi-Frequency Analysis ===\")\n \n analyzer = FrequencyAnalyzer()\n sample_data = create_sample_data()\n \n config = {\n 'word_column': 'lForm',\n 'frequency_column': 'frequency',\n 'pos_column': 'pos'\n }\n \n df = analyzer.load_frequency_data(sample_data, config)\n \n # Test analysis with multiple frequency columns\n freq_columns = ['frequency', 'pmw', 'PB_frequency']\n \n try:\n results = analyzer.create_multi_frequency_analysis(freq_columns, bin_size=2)\n \n print(f\"Multi-frequency analysis results:\")\n for col, result in results.items():\n print(f\" {col}: {len(result['group_labels'])} groups\")\n print(f\" Sample frequencies: {result['avg_frequencies'][:3]}\")\n \n except Exception as e:\n print(f\"Error in 
multi-frequency analysis: {e}\")\n print()\n\ndef test_rank_based_visualization():\n \"\"\"Test rank-based visualization with flexible columns.\"\"\"\n print(\"=== Testing Rank-Based Visualization ===\")\n \n analyzer = FrequencyAnalyzer()\n sample_data = create_sample_data()\n \n config = {\n 'word_column': 'lForm',\n 'frequency_column': 'frequency'\n }\n \n df = analyzer.load_frequency_data(sample_data, config)\n \n try:\n # Test with different frequency columns\n for col in ['frequency', 'pmw', 'PB_frequency']:\n result = analyzer.create_rank_based_visualization_flexible(\n column=col, \n bin_size=2, \n log_transform=False\n )\n \n print(f\"Analysis for column '{col}':\")\n print(f\" Groups: {len(result['group_labels'])}\")\n print(f\" Sample words: {[w['word'] for w in result['sample_words'].get(0, [])]}\")\n print(f\" Avg frequencies: {result['avg_frequencies']}\")\n \n except Exception as e:\n print(f\"Error in rank-based visualization: {e}\")\n print()\n\ndef test_backward_compatibility():\n \"\"\"Test backward compatibility with legacy interface.\"\"\"\n print(\"=== Testing Backward Compatibility ===\")\n \n analyzer = FrequencyAnalyzer()\n sample_data = create_sample_data()\n \n # Test with flexible loading first\n config = {\n 'word_column': 'lForm',\n 'frequency_column': 'frequency'\n }\n \n df = analyzer.load_frequency_data(sample_data, config)\n \n # Then test legacy methods\n try:\n legacy_cols = analyzer.get_available_columns()\n print(f\"Legacy available columns: {legacy_cols}\")\n \n if legacy_cols:\n stats = analyzer.calculate_statistics(legacy_cols[0])\n print(f\"Statistics for {legacy_cols[0]}: mean={stats['mean']:.1f}, count={stats['count']}\")\n \n top_words = analyzer.get_top_words(legacy_cols[0], n=3)\n print(f\"Top 3 words: {[w['word'] for w in top_words]}\")\n \n except Exception as e:\n print(f\"Error in backward compatibility test: {e}\")\n print()\n\nif __name__ == \"__main__\":\n print(\"Testing Enhanced FrequencyAnalyzer with Flexible 
Column Mapping\")\n print(\"=\" * 60)\n \n test_file_format_detection()\n test_column_detection()\n test_flexible_loading()\n test_multi_frequency_analysis()\n test_rank_based_visualization()\n test_backward_compatibility()\n \n print(\"All tests completed!\")\n
test_fugashi_diagnostic.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Diagnostic test to check if fugashi is working and what matching methods are being used.
4
+ """
5
+
6
+ import sys
7
+ sys.path.append('.')
8
+
9
+ from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
10
+ from web_app.config_manager import ConfigManager
11
+
12
def test_fugashi_diagnostic():
    """Test what matching methods are actually being used.

    Runs a single Japanese sentence through the analyzer with only the
    BCCWJ frequency list loaded, then prints, per token, which matching
    method (UniDic-based vs. legacy SpaCy) produced the token/lemma scores.

    Returns:
        bool: True when at least one token was matched via a UniDic-based
        method; False when everything fell back to legacy_spacy or an
        error occurred along the way.
    """

    print("=== Fugashi Diagnostic Test ===\n")

    # Initialize Japanese analyzer
    print("1. Initializing Japanese analyzer...")
    try:
        analyzer = LexicalSophisticationAnalyzer(language="ja", model_size="md")
        print("✓ Japanese SpaCy model loaded successfully")

        # Check if UniDic enricher is available
        if hasattr(analyzer, 'unidic_enricher') and analyzer.unidic_enricher:
            print("✓ UniDic enricher initialized successfully")
        else:
            print("⚠ UniDic enricher not available - using legacy mode")
    except Exception as e:
        print(f"✗ Failed to load Japanese model: {e}")
        return False

    # Load reference configuration
    print("\n2. Loading BCCWJ frequency data only...")
    config = ConfigManager.load_reference_config()
    japanese_config = config.get('japanese', {}).get('unigrams', {})

    # Load just BCCWJ frequency for testing
    bccwj_config = japanese_config.get('BCCWJ_frequency')
    if not bccwj_config:
        print("✗ BCCWJ configuration not found")
        return False

    print("✓ BCCWJ configuration found")

    # Load the data
    bccwj_data = ConfigManager.load_reference_list_data(bccwj_config)
    if not bccwj_data:
        print("✗ Failed to load BCCWJ data")
        return False

    print(f"✓ BCCWJ data loaded successfully")

    # Load into analyzer
    reference_data = {"unigrams_BCCWJ_frequency": bccwj_data}
    analyzer.load_reference_lists(reference_data)
    print("✓ Reference data loaded into analyzer")

    # Test with a simple Japanese sentence
    print("\n3. Testing token matching methods...")
    test_text = "私は学校に行きます。"

    try:
        results = analyzer.analyze_text(test_text, ["unigrams_BCCWJ_frequency"])

        print(f"\nAnalysis completed for: '{test_text}'")
        print(f"Total tokens analyzed: {len(results['token_details'])}")

        print("\nDetailed token matching results:")
        for i, token in enumerate(results['token_details']):
            print(f"\nToken {i+1}: '{token['token']}' (lemma: '{token['lemma']}')")
            print(f"  POS: {token['pos']}, Tag: {token['tag']}")

            # Check matching methods
            # NOTE(review): '..._match_method' keys are presumably written by
            # the analyzer for each reference index — confirm against
            # lexical_sophistication.py; 'unknown' covers their absence.
            token_method = token.get('unigrams_BCCWJ_frequency_token_match_method', 'unknown')
            lemma_method = token.get('unigrams_BCCWJ_frequency_lemma_match_method', 'unknown')

            token_score = token.get('unigrams_BCCWJ_frequency_token')
            lemma_score = token.get('unigrams_BCCWJ_frequency_lemma')

            print(f"  Token matching method: {token_method}")
            print(f"  Lemma matching method: {lemma_method}")
            print(f"  Token score: {token_score}")
            print(f"  Lemma score: {lemma_score}")

            # Show UniDic features if available
            if 'unidic_features' in token:
                unidic = token['unidic_features']
                print(f"  UniDic features available:")
                print(f"    lemma: '{unidic.get('lemma', '')}'")
                print(f"    lForm: '{unidic.get('lForm', '')}'")
                print(f"    pos1: '{unidic.get('pos1', '')}'")
                print(f"    pos2: '{unidic.get('pos2', '')}'")
                print(f"    alignment_confidence: {unidic.get('alignment_confidence', 0.0)}")
            else:
                print("  No UniDic features available")

        # Summary: count how often each matching method was used.
        # When token and lemma methods are identical only one count is added,
        # so the totals reflect distinct method usages, not 2x the tokens.
        print("\n4. Summary:")
        methods_used = {}
        for token in results['token_details']:
            token_method = token.get('unigrams_BCCWJ_frequency_token_match_method', 'unknown')
            lemma_method = token.get('unigrams_BCCWJ_frequency_lemma_match_method', 'unknown')
            methods_used[token_method] = methods_used.get(token_method, 0) + 1
            if token_method != lemma_method:
                methods_used[lemma_method] = methods_used.get(lemma_method, 0) + 1

        print("Matching methods used:")
        for method, count in methods_used.items():
            print(f"  {method}: {count} matches")

        # Verdict: any method name containing 'unidic' means fugashi-based
        # enrichment reached the matching layer.
        if 'legacy_spacy' in methods_used and len(methods_used) == 1:
            print("\n❌ ALL tokens are using legacy_spacy - fugashi is NOT being used!")
            return False
        elif any('unidic' in method for method in methods_used):
            print("\n✅ Some tokens are using UniDic-based matching - fugashi is working!")
            return True
        else:
            print("\n⚠ Mixed or unexpected matching methods")
            return False

    except Exception as e:
        print(f"✗ Error during analysis: {e}")
        import traceback
        traceback.print_exc()
        return False
126
+
127
if __name__ == "__main__":
    # Run the diagnostic and exit 0 on success, 1 on failure.
    ok = test_fugashi_diagnostic()
    verdict = (
        "\n🎉 Fugashi diagnostic test indicates fugashi is working!"
        if ok
        else "\n❌ Fugashi diagnostic test indicates fugashi is NOT working!"
    )
    print(verdict)
    sys.exit(0 if ok else 1)
test_japanese_integration.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script for Japanese lexical sophistication integration.
4
+ Tests the BCCWJ and CSJ frequency analysis with composite key lookup.
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ sys.path.append('.')
10
+
11
+ from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
12
+ from web_app.config_manager import ConfigManager
13
+
14
def test_japanese_integration():
    """Test Japanese corpus integration with sample text.

    End-to-end smoke test: loads every enabled Japanese unigram reference
    list from the YAML config, feeds a short multi-sentence Japanese text
    through the analyzer, and prints token-level and summary statistics.

    Returns:
        bool: True when the full pipeline (model, data loading, analysis)
        completed; False on any missing prerequisite or error.
    """

    print("=== Japanese Lexical Sophistication Integration Test ===\n")

    # Initialize Japanese analyzer
    print("1. Initializing Japanese analyzer...")
    try:
        analyzer = LexicalSophisticationAnalyzer(language="ja", model_size="md")
        print("✓ Japanese SpaCy model loaded successfully")

        # Check if UniDic enricher is available
        if hasattr(analyzer, 'unidic_enricher') and analyzer.unidic_enricher:
            print("✓ UniDic enricher initialized successfully")
        else:
            print("⚠ UniDic enricher not available - using legacy mode")
    except Exception as e:
        print(f"✗ Failed to load Japanese model: {e}")
        print("Please install: python -m spacy download ja_core_news_md")
        return False

    # Load reference configuration
    print("\n2. Loading reference configuration...")
    config = ConfigManager.load_reference_config()
    japanese_config = config.get('japanese', {}).get('unigrams', {})

    if not japanese_config:
        print("✗ No Japanese configuration found")
        return False

    print(f"✓ Found {len(japanese_config)} Japanese reference lists")

    # Test data loading for available files.
    # Only enabled lists whose 'token' file exists on disk are attempted.
    print("\n3. Testing data loading...")
    reference_data = {}

    for list_name, list_config in japanese_config.items():
        if not list_config.get('enabled', False):
            continue

        file_path = list_config.get('files', {}).get('token', '')
        if not os.path.exists(file_path):
            print(f"⚠ File not found: {file_path}")
            continue

        print(f"  Loading {list_name}...")
        try:
            data = ConfigManager.load_reference_list_data(list_config)
            if data:
                reference_data[f"unigrams_{list_name}"] = data

                # Check if Japanese corpus data was created correctly:
                # Japanese corpora carry composite/lemma/surface lookup dicts.
                for file_type, file_data in data.items():
                    if isinstance(file_data, dict) and file_data.get('is_japanese_corpus'):
                        composite_count = len(file_data.get('composite_dict', {}))
                        lemma_count = len(file_data.get('lemma_dict', {}))
                        surface_count = len(file_data.get('surface_dict', {}))
                        print(f"    ✓ {list_name}: {composite_count} composite keys, {lemma_count} lemmas, {surface_count} surface forms")

        except Exception as e:
            print(f"    ✗ Error loading {list_name}: {e}")

    if not reference_data:
        print("✗ No reference data loaded successfully")
        return False

    # Load reference data into analyzer
    print("\n4. Loading reference data into analyzer...")
    analyzer.load_reference_lists(reference_data)
    print(f"✓ Loaded {len(reference_data)} reference lists")

    # Test with Japanese text
    print("\n5. Testing Japanese text analysis...")
    japanese_text = """
    私は毎日学校に行きます。
    友達と一緒に勉強して、とても楽しいです。
    日本語の文法は少し難しいですが、頑張って覚えています。
    """

    selected_indices = list(reference_data.keys())
    print(f"  Using indices: {', '.join(selected_indices)}")

    try:
        results = analyzer.analyze_text(japanese_text, selected_indices)

        # Display results
        print(f"\n6. Analysis Results:")
        print(f"  Total tokens: {results['text_stats']['total_tokens']}")
        print(f"  Content words: {results['text_stats']['content_words']}")
        print(f"  Function words: {results['text_stats']['function_words']}")

        # Show some token details
        print(f"\n  Sample token analysis:")
        for i, token in enumerate(results['token_details'][:5]):  # First 5 tokens
            print(f"    {i+1}. {token['token']} (lemma: {token['lemma']}, pos: {token['pos']})")
            # Only show per-index scores that actually matched ('NA' = no hit).
            for key, value in token.items():
                if key.endswith('_token') or key.endswith('_lemma'):
                    if value != 'NA':
                        print(f"      {key}: {value}")

        # Show summary statistics
        print(f"\n  Summary statistics:")
        for key, stats in results['summary'].items():
            print(f"    {key}: mean={stats['mean']:.2f}, count={stats['count']}")

        print(f"\n✓ Japanese text analysis completed successfully!")
        return True

    except Exception as e:
        print(f"✗ Error during analysis: {e}")
        import traceback
        traceback.print_exc()
        return False
127
+
128
if __name__ == "__main__":
    # Run the integration test and translate the boolean into an exit code.
    passed = test_japanese_integration()
    print(
        "\n🎉 Japanese integration test PASSED!"
        if passed
        else "\n❌ Japanese integration test FAILED!"
    )
    sys.exit(0 if passed else 1)
test_unidic_diagnostic.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Diagnostic test for UniDic integration functionality.
4
+ Tests both the fallback mechanism and enhanced features.
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ sys.path.append('.')
10
+
11
+ from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
12
+ from web_app.config_manager import ConfigManager
13
+
14
def test_unidic_diagnostic():
    """Test UniDic integration with diagnostic information.

    Loads the first available enabled Japanese corpus, analyzes one short
    sentence, and prints per-token SpaCy vs. UniDic features plus the
    distribution of matching methods across all reference indices.

    Returns:
        bool: True when analysis completed; False on missing data or error.
    """

    print("=== UniDic Integration Diagnostic Test ===\n")

    # Initialize Japanese analyzer
    print("1. Initializing Japanese analyzer...")
    try:
        analyzer = LexicalSophisticationAnalyzer(language="ja", model_size="md")
        print("✓ Japanese SpaCy model loaded successfully")

        # Check UniDic availability
        unidic_available = hasattr(analyzer, 'unidic_enricher') and analyzer.unidic_enricher
        if unidic_available:
            print("✓ UniDic enricher initialized - enhanced mode available")
        else:
            print("⚠ UniDic enricher not available - using legacy fallback mode")
    except Exception as e:
        print(f"✗ Failed to initialize analyzer: {e}")
        return False

    # Load reference data
    print("\n2. Loading reference configuration...")
    config = ConfigManager.load_reference_config()
    japanese_config = config.get('japanese', {}).get('unigrams', {})

    # Get first available Japanese corpus (break after the first success —
    # one corpus is enough for the diagnostic).
    reference_data = {}
    for list_name, list_config in japanese_config.items():
        if list_config.get('enabled', False):
            file_path = list_config.get('files', {}).get('token', '')
            if os.path.exists(file_path):
                data = ConfigManager.load_reference_list_data(list_config)
                if data:
                    reference_data[f"unigrams_{list_name}"] = data
                    print(f"✓ Loaded {list_name} for testing")
                    break

    if not reference_data:
        print("✗ No reference data available")
        return False

    # Load into analyzer
    analyzer.load_reference_lists(reference_data)

    # Test with sample Japanese text
    print("\n3. Testing Japanese text analysis...")
    test_text = "私は学校に行く。"
    selected_indices = list(reference_data.keys())

    try:
        results = analyzer.analyze_text(test_text, selected_indices)

        print(f"\n4. Analysis Results:")
        print(f"  Total tokens: {results['text_stats']['total_tokens']}")

        # Show detailed token analysis with diagnostic information
        print(f"\n  Token Details with Diagnostics:")
        for i, token_detail in enumerate(results['token_details'][:4]):  # First 4 tokens
            print(f"\n  Token {i+1}: '{token_detail['token']}'")
            print(f"    SpaCy: lemma='{token_detail['lemma']}', pos='{token_detail['pos']}', tag='{token_detail['tag']}'")

            # Look for UniDic features (present only when enrichment ran)
            if 'unidic_features' in token_detail:
                unidic_feat = token_detail['unidic_features']
                print(f"    UniDic: lemma='{unidic_feat.get('lemma', '')}', lForm='{unidic_feat.get('lForm', '')}', pos1='{unidic_feat.get('pos1', '')}', goshu='{unidic_feat.get('goshu', '')}'")
                print(f"    Alignment confidence: {unidic_feat.get('alignment_confidence', 0.0):.2f}")

            # Show matching methods for each index
            for idx_name in selected_indices:
                token_method = token_detail.get(f"{idx_name}_token_match_method", "N/A")
                lemma_method = token_detail.get(f"{idx_name}_lemma_match_method", "N/A")
                token_score = token_detail.get(f"{idx_name}_token", "N/A")
                lemma_score = token_detail.get(f"{idx_name}_lemma", "N/A")

                print(f"    {idx_name}:")
                print(f"      Token: score={token_score}, method={token_method}")
                print(f"      Lemma: score={lemma_score}, method={lemma_method}")

        # Show summary: tally every '*_match_method' value across all tokens
        print(f"\n  Summary Statistics:")
        matching_methods = {}
        for token_detail in results['token_details']:
            for key, value in token_detail.items():
                if key.endswith('_match_method'):
                    method = value
                    matching_methods[method] = matching_methods.get(method, 0) + 1

        print(f"  Matching method distribution:")
        for method, count in matching_methods.items():
            print(f"    {method}: {count} matches")

        return True

    except Exception as e:
        print(f"✗ Error during analysis: {e}")
        import traceback
        traceback.print_exc()
        return False
113
+
114
def test_unidic_fallback_levels():
    """Test the 3-level UniDic fallback strategy simulation.

    Simulates the composite-key fallback used for UniDic lookups: each
    level drops the least-specific trailing field, and a level is only
    attempted when every one of its fields is non-empty. The generated
    key list for each case is compared against the expected keys.

    Returns:
        bool: always True (failures are reported via printed '✗' lines).
    """
    print("\n=== UniDic Fallback Strategy Test ===\n")

    # Simulate UniDic features for different fallback levels
    test_cases = [
        {
            'name': 'Complete UniDic features (Level 1)',
            'features': {
                'lemma': '行く', 'lForm': 'イク', 'pos1': '動詞', 'pos2': '一般', 'goshu': '和'
            },
            'expected_keys': [
                '行く_イク_動詞_一般_和',  # Level 1
                '行く_イク_動詞_一般',      # Level 2
                '行く_イク_動詞'            # Level 3
            ]
        },
        {
            'name': 'Partial features (Level 2)',
            'features': {
                'lemma': '学校', 'lForm': 'ガッコウ', 'pos1': '名詞', 'pos2': '一般', 'goshu': ''
            },
            'expected_keys': [
                '学校_ガッコウ_名詞_一般',  # Level 2
                '学校_ガッコウ_名詞'        # Level 3
            ]
        },
        {
            'name': 'Minimal features (Level 3)',
            'features': {
                'lemma': '私', 'lForm': 'ワタシ', 'pos1': '代名詞', 'pos2': '', 'goshu': ''
            },
            'expected_keys': [
                '私_ワタシ_代名詞'  # Level 3 only
            ]
        }
    ]

    # The fallback levels, most specific first. Previously each level's key
    # construction was copy-pasted three times; this table drives a single
    # loop and guarantees the levels stay consistent with each other.
    fallback_levels = [
        ('lemma', 'lForm', 'pos1', 'pos2', 'goshu'),  # Level 1
        ('lemma', 'lForm', 'pos1', 'pos2'),           # Level 2
        ('lemma', 'lForm', 'pos1'),                   # Level 3
    ]

    for case in test_cases:
        print(f"Testing: {case['name']}")
        features = case['features']
        expected = case['expected_keys']

        # Generate actual keys that would be attempted: a level qualifies
        # only when all of its fields are non-empty strings.
        actual_keys = [
            "_".join(features[field] for field in level)
            for level in fallback_levels
            if all(features[field] for field in level)
        ]

        # Check if matches expected
        match = actual_keys == expected
        status = "✓" if match else "✗"
        print(f"  {status} Generated keys: {actual_keys}")
        if not match:
            print(f"    Expected: {expected}")
        print()

    return True
184
+
185
if __name__ == "__main__":
    # Run both diagnostics; exit 0 only when both succeed.
    print("Running UniDic integration diagnostics...\n")

    diagnostic_ok = test_unidic_diagnostic()
    fallback_ok = test_unidic_fallback_levels()
    all_ok = diagnostic_ok and fallback_ok

    if all_ok:
        print("\n🎉 All UniDic diagnostic tests PASSED!")
        print("\nSystem Status:")
        print("- Legacy Japanese analysis: ✓ Working")
        print("- Fallback strategy: ✓ Implemented")
        print("- Diagnostic tracking: ✓ Available")
        print("- UniDic integration: ⚠ Ready (requires MeCab setup)")
    else:
        print("\n❌ Some diagnostic tests FAILED!")

    sys.exit(0 if all_ok else 1)
text_analyzer/__pycache__/__init__.cpython-312.pyc DELETED
Binary file (297 Bytes)
 
text_analyzer/__pycache__/lexical_sophistication.cpython-312.pyc DELETED
Binary file (24.3 kB)
 
text_analyzer/__pycache__/pos_parser.cpython-312.pyc DELETED
Binary file (9.72 kB)
 
text_analyzer/app_config.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Centralized configuration module for the text analysis application.
3
+ Contains all constants, settings, and configuration loading utilities.
4
+ """
5
+
6
+ import yaml
7
+ from pathlib import Path
8
+ from typing import Dict, Any, Optional
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class AppConfig:
    """Centralized configuration management for the text analysis application."""

    # SpaCy Model Mappings: (language, size) -> installed model package name
    SPACY_MODELS = {
        ("en", "md"): "en_core_web_md",
        ("en", "trf"): "en_core_web_trf",
        ("ja", "md"): "ja_core_news_md",
        ("ja", "trf"): "ja_core_news_trf"
    }

    # Default Settings
    DEFAULT_LANGUAGE = "en"
    DEFAULT_MODEL_SIZE = "md"  # Changed from "trf" to be more accessible

    # Analysis Limits (shared constants)
    MAX_TOKENS_FOR_VISUALIZATION = 30
    DEFAULT_HISTOGRAM_BINS = 25
    DEFAULT_RANK_BIN_SIZE = 500
    MAX_NGRAM_SENTENCE_LENGTH = 100

    # File Processing (generic utilities)
    SUPPORTED_ENCODINGS = ['utf-8', 'utf-16', 'latin-1']
    SUPPORTED_DELIMITERS = [',', '\t', ';']

    # Configuration Paths
    REFERENCE_LISTS_CONFIG = "config/reference_lists.yaml"

    @classmethod
    def get_spacy_model_name(cls, language: str, model_size: str) -> Optional[str]:
        """
        Get the SpaCy model name for given language and size.

        Args:
            language: Language code ('en' or 'ja')
            model_size: Model size ('md' or 'trf')

        Returns:
            SpaCy model name or None if not found
        """
        return cls.SPACY_MODELS.get((language, model_size))

    @classmethod
    def get_supported_languages(cls) -> list[str]:
        """Get sorted list of supported languages.

        Sorted for deterministic output: ``list(set(...))`` order is
        arbitrary, which previously made UI dropdowns and tests flaky.
        """
        return sorted({lang for lang, _ in cls.SPACY_MODELS.keys()})

    @classmethod
    def get_supported_model_sizes(cls) -> list[str]:
        """Get sorted list of supported model sizes (deterministic order)."""
        return sorted({size for _, size in cls.SPACY_MODELS.keys()})

    @classmethod
    def load_reference_config(cls) -> Dict[str, Any]:
        """
        Load reference lists configuration from YAML file.

        Falls back to a minimal default structure when the file is missing,
        empty, or unparseable, so callers never have to handle None.

        Returns:
            Configuration dictionary loaded from YAML
        """
        config_path = Path(cls.REFERENCE_LISTS_CONFIG)

        if not config_path.exists():
            logger.warning(f"Reference config file not found: {config_path}")
            return cls._get_default_config()

        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                config = yaml.safe_load(f)
            if config is None:
                # yaml.safe_load returns None for an empty document
                logger.warning("Empty YAML configuration, using defaults")
                return cls._get_default_config()
            return config
        except Exception as e:
            # Broad catch is deliberate: a broken config file must never
            # crash the app; we log and degrade to defaults.
            logger.error(f"Error loading reference configuration: {e}")
            return cls._get_default_config()

    @classmethod
    def get_corpus_configuration(cls, corpus_name: str) -> Dict[str, Any]:
        """
        Get configuration for a specific corpus from YAML.

        Args:
            corpus_name: Name of the corpus to find

        Returns:
            Corpus configuration dictionary ({} when not found)
        """
        config = cls.load_reference_config()

        # Search through all languages and ngram types; the YAML layout is
        # language -> ngram_type -> corpus_name -> settings.
        for lang_config in config.values():
            if not isinstance(lang_config, dict):
                continue
            for ngram_type_config in lang_config.values():
                if not isinstance(ngram_type_config, dict):
                    continue
                if corpus_name in ngram_type_config:
                    return ngram_type_config[corpus_name]

        logger.warning(f"Corpus configuration not found: {corpus_name}")
        return {}

    @classmethod
    def get_corpus_columns(cls, corpus_name: str) -> Dict[str, int]:
        """
        Get column mappings for a specific corpus.

        Args:
            corpus_name: Name of the corpus

        Returns:
            Dictionary mapping column names to indices ({} when unknown)
        """
        corpus_config = cls.get_corpus_configuration(corpus_name)
        return corpus_config.get('columns', {})

    @classmethod
    def is_japanese_corpus(cls, corpus_name: str) -> bool:
        """
        Check if a corpus is marked as Japanese corpus.

        Args:
            corpus_name: Name of the corpus

        Returns:
            True if it's a Japanese corpus
        """
        corpus_config = cls.get_corpus_configuration(corpus_name)
        return corpus_config.get('japanese_corpus', False)

    @classmethod
    def _get_default_config(cls) -> Dict[str, Any]:
        """Get default configuration structure if YAML fails to load."""
        return {
            "english": {
                "unigrams": {},
                "bigrams": {},
                "trigrams": {}
            },
            "japanese": {
                "unigrams": {},
                "bigrams": {},
                "trigrams": {}
            }
        }

    @classmethod
    def validate_language_model_combination(cls, language: str, model_size: str) -> bool:
        """
        Validate that a language/model combination is supported.

        Args:
            language: Language code
            model_size: Model size

        Returns:
            True if combination is supported
        """
        return (language, model_size) in cls.SPACY_MODELS

    @classmethod
    def get_processing_limits(cls) -> Dict[str, int]:
        """Get all processing limits as a dictionary."""
        return {
            'max_tokens_visualization': cls.MAX_TOKENS_FOR_VISUALIZATION,
            'default_histogram_bins': cls.DEFAULT_HISTOGRAM_BINS,
            'default_rank_bin_size': cls.DEFAULT_RANK_BIN_SIZE,
            'max_ngram_sentence_length': cls.MAX_NGRAM_SENTENCE_LENGTH
        }
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Base analyzer module providing shared SpaCy infrastructure.
3
+ Eliminates code duplication and provides common functionality for all SpaCy-based analyzers.
4
+ """
5
+
6
+ import spacy
7
+ from typing import Dict, List, Any, Optional, Iterator, Tuple, TYPE_CHECKING
8
+ import logging
9
+ import tempfile
10
+ from pathlib import Path
11
+ from .app_config import AppConfig
12
+ from .text_utility import TextUtility
13
+
14
# Module logger must be defined BEFORE the optional-import guard below:
# the except branch calls logger.warning, which previously raised a
# NameError (logger was only assigned further down) whenever the UniDic
# imports actually failed — masking the real ImportError.
logger = logging.getLogger(__name__)

# Import UniDic extensions and enricher (optional dependency: fall back
# to legacy mode when fugashi/UniDic is not installed).
try:
    from . import unidic_extensions  # This registers the token extensions
    from .unidic_enricher import UniDicEnricher
    UNIDIC_AVAILABLE = True
except ImportError as e:
    logger.warning(f"UniDic integration not available: {e}")
    UNIDIC_AVAILABLE = False
    UniDicEnricher = None

if TYPE_CHECKING:
    import spacy
28
+
29
+
30
class BaseAnalyzer:
    """
    Base class for all SpaCy-based text analyzers.
    Provides shared model loading, document processing, and utility functions.

    Attributes set in __init__:
    - language / model_size: resolved from args or AppConfig defaults
    - nlp: the loaded SpaCy Language pipeline
    - unidic_enricher: UniDicEnricher instance for Japanese, else None
    """

    def __init__(self, language: Optional[str] = None, model_size: Optional[str] = None):
        """
        Initialize the base analyzer.

        Args:
            language: Language code ('en' or 'ja'); defaults to AppConfig.DEFAULT_LANGUAGE
            model_size: Model size ('md' or 'trf'); defaults to AppConfig.DEFAULT_MODEL_SIZE

        Raises:
            ValueError: if the language/model combination is unsupported
            OSError: if the SpaCy model package is not installed
        """
        self.language = language or AppConfig.DEFAULT_LANGUAGE
        self.model_size = model_size or AppConfig.DEFAULT_MODEL_SIZE
        self.nlp = None
        self._model_info = {}
        self.unidic_enricher = None

        self._load_spacy_model()

        # Initialize UniDic enricher for Japanese. Failure here is
        # non-fatal: analysis degrades to SpaCy-only ("legacy") mode.
        if self.language == 'ja' and UNIDIC_AVAILABLE:
            try:
                self.unidic_enricher = UniDicEnricher()
                logger.info("UniDic enricher initialized for Japanese analysis")
            except Exception as e:
                logger.warning(f"Failed to initialize UniDic enricher: {e}")
                self.unidic_enricher = None

    def _load_spacy_model(self) -> None:
        """Load appropriate SpaCy model based on language and size.

        Raises:
            ValueError: for an unsupported language/size combination
            OSError: when the model package is not installed locally
        """
        # Validate combination
        if not AppConfig.validate_language_model_combination(self.language, self.model_size):
            raise ValueError(f"Unsupported language/model combination: {self.language}/{self.model_size}")

        model_name = AppConfig.get_spacy_model_name(self.language, self.model_size)
        if not model_name:
            raise ValueError(f"No model found for language '{self.language}' and size '{self.model_size}'")

        try:
            self.nlp = spacy.load(model_name)
            # NOTE: 'version' records the spaCy library version, not the
            # model package's own version.
            self._model_info = {
                'name': model_name,
                'language': self.language,
                'model_size': self.model_size,
                'version': spacy.__version__
            }
            logger.info(f"Loaded SpaCy model: {model_name}")
        except OSError as e:
            error_msg = f"SpaCy model {model_name} not found. Please install it first."
            logger.error(error_msg)
            raise OSError(error_msg) from e

    def get_model_info(self) -> Dict[str, str]:
        """
        Get information about the loaded model.

        Returns:
            Dictionary with model information (copy — safe for callers to mutate)
        """
        return self._model_info.copy()

    def process_document(self, text: str) -> "spacy.Doc":
        """
        Process text into a SpaCy document.

        Args:
            text: Input text to process

        Returns:
            Processed SpaCy document (UniDic-enriched for Japanese when available)

        Raises:
            ValueError: If model not loaded or text processing fails
        """
        if not self.nlp:
            raise ValueError("SpaCy model not loaded")

        if not text or not text.strip():
            raise ValueError("Empty text provided")

        try:
            # Clean text before processing
            cleaned_text = TextUtility.clean_text_input(text)

            # Process with SpaCy
            doc = self.nlp(cleaned_text)

            # Add UniDic enrichment for Japanese. Enrichment failures are
            # logged but do not fail the whole document.
            if self.unidic_enricher and self.language == 'ja':
                try:
                    self.unidic_enricher.enrich_spacy_doc(doc, cleaned_text)
                    logger.debug("UniDic enrichment completed")
                except Exception as e:
                    logger.warning(f"UniDic enrichment failed: {e}")

            return doc

        except Exception as e:
            # Log with context, then propagate to the caller unchanged.
            self.handle_processing_error(e, f"processing text of length {len(text)}")
            raise

    def handle_processing_error(self, error: Exception, context: str) -> None:
        """
        Handle processing errors with appropriate logging.

        Args:
            error: The exception that occurred
            context: Context description for the error
        """
        error_msg = f"Error {context}: {error}"
        logger.error(error_msg)

    def filter_tokens(self,
                      doc: "spacy.Doc",
                      exclude_punct: bool = True,
                      exclude_space: bool = True,
                      word_type_filter: Optional[str] = None) -> List["spacy.Token"]:
        """
        Filter tokens based on various criteria.

        Args:
            doc: SpaCy document
            exclude_punct: Whether to exclude punctuation
            exclude_space: Whether to exclude spaces
            word_type_filter: Filter by word type ('CW', 'FW', or None for all)

        Returns:
            List of filtered tokens (document order preserved)
        """
        filtered_tokens = []

        for token in doc:
            # Basic filtering
            if exclude_space and token.is_space:
                continue
            if exclude_punct and token.is_punct:
                continue

            # Word type filtering (content vs. function word)
            if word_type_filter:
                word_type = self._classify_pos(token)
                if word_type != word_type_filter:
                    continue

            filtered_tokens.append(token)

        return filtered_tokens

    def _classify_pos(self, token: "spacy.Token") -> str:
        """
        Classify token as content word (CW) or function word (FW).

        Classification is by coarse Universal POS tag; POS values outside
        both explicit sets (e.g. NUM, PROPN, INTJ) default to 'CW' unless
        they are PUNCT/SPACE/X.

        Args:
            token: SpaCy token object

        Returns:
            'CW' for content words, 'FW' for function words
        """
        content_pos = {'NOUN', 'VERB', 'ADJ', 'ADV'}
        function_pos = {'DET', 'PRON', 'ADP', 'CONJ', 'CCONJ', 'SCONJ'}

        if token.pos_ in content_pos:
            return 'CW'
        elif token.pos_ in function_pos:
            return 'FW'
        else:
            # Default classification for ambiguous cases
            return 'CW' if token.pos_ not in {'PUNCT', 'SPACE', 'X'} else 'FW'

    def format_token_for_display(self, token: "spacy.Token", include_syntax: bool = True) -> Dict[str, Any]:
        """
        Format token for UI display - only call when needed for output.

        Args:
            token: SpaCy token
            include_syntax: Whether to include syntactic information (dep_, head, etc.)

        Returns:
            Formatted token data dictionary for display
        """
        result = {
            'token': token.text,
            'lemma': token.lemma_,
            'pos': token.pos_,
            'tag': token.tag_,
            'word_type': self._classify_pos(token)
        }

        if include_syntax:
            result.update({
                'dep_': token.dep_,
                'head_text': token.head.text,
                'head_pos': token.head.pos_,
            })

        return result

    def get_syntactic_context(self, token: "spacy.Token") -> Dict[str, Any]:
        """
        Get comprehensive syntactic relationships for a token.

        Args:
            token: SpaCy token

        Returns:
            Dictionary with syntactic context information.
            NOTE: 'subtree_span' is the raw token.subtree generator
            (single-use), unlike 'children'/'ancestors' which are lists.
        """
        return {
            'dep_': token.dep_,
            'head': token.head,
            'children': list(token.children),
            'ancestors': list(token.ancestors),
            'subtree_span': token.subtree,
            'left_edge': token.left_edge,
            'right_edge': token.right_edge
        }

    def process_sentences(self,
                          doc: "spacy.Doc",
                          max_tokens: Optional[int] = None) -> List["spacy.Span"]:
        """
        Process sentences with optional token limits.

        Sentences longer than max_tokens (counting non-space tokens) are
        truncated to a span covering the first max_tokens tokens.

        Args:
            doc: SpaCy document
            max_tokens: Maximum tokens per sentence (uses config default if None)

        Returns:
            List of sentence spans
        """
        max_tokens = max_tokens or AppConfig.MAX_TOKENS_FOR_VISUALIZATION

        processed_sentences = []
        for sent in doc.sents:
            # Filter tokens (exclude spaces for counting)
            sent_tokens = [token for token in sent if not token.is_space]

            if len(sent_tokens) > max_tokens:
                # Truncate sentence: rebuild a span from the first and last
                # kept tokens' document indices (may re-include interior
                # space tokens that sit between them).
                truncated_tokens = sent_tokens[:max_tokens]
                start_idx = truncated_tokens[0].i
                end_idx = truncated_tokens[-1].i + 1
                truncated_span = doc[start_idx:end_idx]
                processed_sentences.append(truncated_span)
            else:
                processed_sentences.append(sent)

        return processed_sentences

    def setup_batch_processing(self, file_paths: List[str]) -> Iterator[Tuple[str, str]]:
        """
        Set up batch processing for multiple files.

        Per-file read errors are reported inline as an "ERROR: ..." text
        payload rather than aborting the whole batch.

        Args:
            file_paths: List of file paths to process

        Yields:
            Tuples of (file_path, text_content)
        """
        for file_path in file_paths:
            try:
                text_content = TextUtility.extract_text_from_file(file_path)
                yield file_path, text_content
            except Exception as e:
                logger.error(f"Error processing file {file_path}: {e}")
                yield file_path, f"ERROR: {e}"

    def cleanup_batch_processing(self, temp_files: List[str]) -> None:
        """
        Clean up temporary files from batch processing.

        Args:
            temp_files: List of temporary file paths
        """
        TextUtility.cleanup_temp_files(temp_files)
text_analyzer/frequency_analyzer.py ADDED
@@ -0,0 +1,653 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Frequency Analysis Module for Word Frequency Visualization
3
+
4
+ This module provides functionality to analyze word frequency data from various file formats,
5
+ create histogram data, and sample representative words for each frequency bin.
6
+ Supports flexible column mapping for diverse frequency data formats.
7
+ """
8
+
9
import logging
import random
from io import StringIO
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class FrequencyAnalyzer:
    """
    Analyzes word frequency data and provides visualization-ready outputs.

    Supports flexible column mapping for various frequency data formats.
    Can handle both the traditional 'Type'/'Freq' format and modern
    multi-column formats. Typical workflow: call ``load_frequency_data()``
    once, then use the analysis / visualization helpers on the stored data.
    """

    # Column names tried first during auto-detection, in preference order.
    DEFAULT_WORD_COLUMNS = ['lForm', 'lemma', 'word', 'Type', 'surface_form']
    DEFAULT_FREQUENCY_COLUMNS = ['frequency', 'freq', 'Freq', 'pmw', 'NormFreq']
    DEFAULT_POS_COLUMNS = ['pos', 'POS', 'tag']

    def __init__(self, file_size_limit_mb: int = 300):
        """
        Initialize the frequency analyzer.

        Args:
            file_size_limit_mb: Maximum file size limit in MB for uploads
        """
        self.data = None              # cleaned DataFrame (legacy helpers read this)
        self.original_data = None     # copy of cleaned data (rank-based analysis reads this)
        self.column_config = None     # column mapping recorded by load_frequency_data()
        self.file_size_limit = file_size_limit_mb * 1024 * 1024  # bytes
        self.detected_columns = None  # cached result of detect_columns()

    def detect_file_format(self, content: Union[str, bytes]) -> Dict[str, Any]:
        """
        Detect file format: the column separator and whether a header row exists.

        Args:
            content: File content as string or bytes

        Returns:
            Dict with 'separator', 'has_header', 'estimated_columns' and
            'sample_lines' keys.

        Raises:
            ValueError: If the content exceeds the configured size limit.
        """
        if isinstance(content, bytes):
            content = content.decode('utf-8')

        # Enforce the upload size limit before doing any parsing work.
        if len(content.encode('utf-8')) > self.file_size_limit:
            raise ValueError(f"File too large. Maximum size is {self.file_size_limit // (1024*1024)}MB")

        # Pick the separator that yields the most columns over the first lines.
        lines = content.strip().split('\n')[:5]
        separators = ['\t', ',', ';', '|']
        best_sep = '\t'
        max_columns = 0

        for sep in separators:
            avg_cols = np.mean([len(line.split(sep)) for line in lines])
            if avg_cols > max_columns:
                max_columns = avg_cols
                best_sep = sep

        # Header heuristic: the first row is a header when it is mostly
        # non-numeric while the second row is mostly numeric.
        # NOTE: the previous implementation always reported has_header=True —
        # its success branch re-assigned True and _is_numeric never raises,
        # so the except-branch was unreachable. This version actually
        # distinguishes headerless numeric data.
        first_line = lines[0].split(best_sep)
        second_line = lines[1].split(best_sep) if len(lines) > 1 else []

        has_header = True
        if second_line:
            first_numeric = sum(1 for x in first_line if self._is_numeric(x.strip()))
            second_numeric = sum(1 for x in second_line if self._is_numeric(x.strip()))
            if (second_numeric > len(second_line) * 0.3
                    and first_numeric > len(first_line) * 0.3):
                # Both rows look numeric -> data starts on row one, no header.
                has_header = False

        return {
            'separator': best_sep,
            'has_header': has_header,
            'estimated_columns': int(max_columns),
            'sample_lines': lines[:3]
        }

    def _is_numeric(self, value: str) -> bool:
        """Check if a string value is numeric (parseable as float)."""
        try:
            float(value)
            return True
        except (ValueError, TypeError):
            return False

    def detect_columns(self, df: pd.DataFrame) -> Dict[str, List[str]]:
        """
        Detect and categorize columns by data type and content.

        Args:
            df: DataFrame to analyze

        Returns:
            Dict with 'word_columns', 'frequency_columns', 'pos_columns' and
            'other_columns' lists, each sorted by naming preference.
        """
        word_candidates = []
        frequency_candidates = []
        pos_candidates = []
        other_columns = []

        for col in df.columns:
            col_str = str(col).lower()

            # String-typed columns are potential word / POS columns.
            if df[col].dtype == 'object':
                # Check if it looks like words (not mostly numbers).
                sample_values = df[col].dropna().head(100)
                if len(sample_values) > 0:
                    non_numeric_ratio = sum(1 for x in sample_values if not self._is_numeric(str(x))) / len(sample_values)
                    if non_numeric_ratio > 0.8:  # >80% non-numeric
                        if any(word in col_str for word in ['form', 'lemma', 'word', 'type']):
                            word_candidates.append(col)
                        elif any(pos in col_str for pos in ['pos', 'tag', 'part']):
                            pos_candidates.append(col)
                        else:
                            # Default string columns to word candidates.
                            word_candidates.append(col)

            # Numeric columns are potential frequency columns.
            elif pd.api.types.is_numeric_dtype(df[col]):
                # Skip rank columns (usually sequential integers starting from 1).
                if col_str in ['rank', 'index'] or (df[col].equals(pd.Series(range(1, len(df) + 1)))):
                    other_columns.append(col)
                else:
                    frequency_candidates.append(col)

            else:
                other_columns.append(col)

        # Sort candidates by preference based on common naming patterns.
        word_candidates = self._sort_by_preference(word_candidates, self.DEFAULT_WORD_COLUMNS)
        frequency_candidates = self._sort_by_preference(frequency_candidates, self.DEFAULT_FREQUENCY_COLUMNS)
        pos_candidates = self._sort_by_preference(pos_candidates, self.DEFAULT_POS_COLUMNS)

        return {
            'word_columns': word_candidates,
            'frequency_columns': frequency_candidates,
            'pos_columns': pos_candidates,
            'other_columns': other_columns
        }

    def _sort_by_preference(self, columns: List[str], preferred_order: List[str]) -> List[str]:
        """Sort columns so preferred name patterns come first, keeping the rest in order."""
        sorted_cols = []
        remaining_cols = columns.copy()

        # Add preferred columns first (substring match, case-insensitive).
        for pref in preferred_order:
            for col in columns:
                if pref.lower() in str(col).lower() and col in remaining_cols:
                    sorted_cols.append(col)
                    remaining_cols.remove(col)
                    break

        # Add remaining columns.
        sorted_cols.extend(remaining_cols)
        return sorted_cols

    def load_frequency_data(self, content: Union[str, bytes], column_config: Dict[str, str]) -> pd.DataFrame:
        """
        Load and validate frequency data with flexible column mapping.

        Args:
            content: File content as string or bytes
            column_config: Column mapping configuration, e.g.
                {
                    'word_column': 'lForm',
                    'frequency_column': 'frequency',
                    'pos_column': 'pos',   # optional
                    'separator': '\\t'      # optional, auto-detected if absent
                }

        Returns:
            pd.DataFrame: Loaded and validated frequency data

        Raises:
            ValueError: If data format is invalid or columns not found
        """
        try:
            if isinstance(content, bytes):
                content = content.decode('utf-8')

            # Auto-detect format if separator not provided.
            if 'separator' not in column_config:
                format_info = self.detect_file_format(content)
                separator = format_info['separator']
                has_header = format_info['has_header']
            else:
                separator = column_config['separator']
                has_header = column_config.get('has_header', True)

            df = pd.read_csv(StringIO(content), sep=separator, header=0 if has_header else None)

            # Record the effective configuration for later lookups.
            self.column_config = column_config.copy()
            self.column_config['separator'] = separator
            self.column_config['has_header'] = has_header

            # Detect available columns for UI dropdowns etc.
            self.detected_columns = self.detect_columns(df)

            if not self.validate_column_config(df, column_config):
                raise ValueError("Invalid column configuration")

            df = self._clean_data_flexible(df, column_config)

            self.original_data = df.copy()
            self.data = df

            logger.info(f"Loaded {len(df)} frequency entries with columns: {list(df.columns)}")
            return df

        except Exception as e:
            logger.error(f"Error loading frequency data: {str(e)}")
            raise ValueError(f"Failed to load frequency data: {str(e)}")

    def validate_column_config(self, df: pd.DataFrame, column_config: Dict[str, str]) -> bool:
        """
        Validate that the specified columns exist and contain appropriate data.

        Args:
            df: DataFrame to validate
            column_config: Column configuration

        Returns:
            bool: True if configuration is valid
        """
        word_col = column_config.get('word_column')
        freq_col = column_config.get('frequency_column')

        if not word_col or word_col not in df.columns:
            logger.error(f"Word column '{word_col}' not found in data")
            return False

        if not freq_col or freq_col not in df.columns:
            logger.error(f"Frequency column '{freq_col}' not found in data")
            return False

        # The word column must hold text.
        if df[word_col].dtype != 'object':
            logger.error(f"Word column '{word_col}' must contain text data")
            return False

        # The frequency column must hold numbers.
        if not pd.api.types.is_numeric_dtype(df[freq_col]):
            logger.error(f"Frequency column '{freq_col}' must contain numeric data")
            return False

        # POS column is optional; a missing one is only a warning.
        pos_col = column_config.get('pos_column')
        if pos_col and pos_col not in df.columns:
            logger.warning(f"POS column '{pos_col}' not found in data, skipping")

        return True

    def _clean_data_flexible(self, df: pd.DataFrame, column_config: Dict[str, str]) -> pd.DataFrame:
        """
        Clean and prepare the frequency data with flexible column mapping.

        Args:
            df: Raw DataFrame
            column_config: Column configuration

        Returns:
            pd.DataFrame: Cleaned DataFrame with standardized 'Type'/'Freq'
            (and optionally 'POS') columns added for backward compatibility,
            sorted by descending frequency.
        """
        word_col = column_config['word_column']
        freq_col = column_config['frequency_column']
        pos_col = column_config.get('pos_column')

        df_clean = df.copy()

        # Drop rows missing either of the two required fields.
        df_clean = df_clean.dropna(subset=[word_col, freq_col])

        # Coerce frequency to numeric, dropping unparseable rows.
        df_clean[freq_col] = pd.to_numeric(df_clean[freq_col], errors='coerce')
        df_clean = df_clean.dropna(subset=[freq_col])

        # Zero / negative frequencies are meaningless for this analysis.
        df_clean = df_clean[df_clean[freq_col] > 0]

        # Normalize the word column (string type, trimmed whitespace).
        df_clean[word_col] = df_clean[word_col].astype(str).str.strip()

        # Standardized aliases kept for the legacy API surface.
        df_clean['Type'] = df_clean[word_col]
        df_clean['Freq'] = df_clean[freq_col]

        if pos_col and pos_col in df_clean.columns:
            df_clean['POS'] = df_clean[pos_col]

        # Sort by frequency (descending) for rank-based analysis.
        df_clean = df_clean.sort_values(freq_col, ascending=False).reset_index(drop=True)

        return df_clean

    def get_available_frequency_columns(self) -> List[str]:
        """Return detected frequency columns, or [] before any data is loaded."""
        if self.detected_columns is None:
            return []
        return self.detected_columns.get('frequency_columns', [])

    def get_available_word_columns(self) -> List[str]:
        """Return detected word columns, or [] before any data is loaded."""
        if self.detected_columns is None:
            return []
        return self.detected_columns.get('word_columns', [])

    def create_multi_frequency_analysis(self, frequency_columns: List[str], bin_size: int = 500, log_transform: bool = False) -> Dict[str, Dict]:
        """
        Create rank-based analysis for multiple frequency columns.

        Args:
            frequency_columns: List of frequency column names to analyze
            bin_size: Number of words per rank group
            log_transform: Whether to apply log10 transformation

        Returns:
            Dict mapping column names to their analysis results; columns that
            are missing or fail to analyze are skipped (and logged).
        """
        if self.original_data is None:
            raise ValueError("No data loaded")

        results = {}

        for freq_col in frequency_columns:
            if freq_col not in self.original_data.columns:
                logger.warning(f"Frequency column '{freq_col}' not found, skipping")
                continue

            try:
                results[freq_col] = self.create_rank_based_visualization_flexible(
                    column=freq_col,
                    bin_size=bin_size,
                    log_transform=log_transform
                )
            except Exception as e:
                logger.error(f"Error analyzing column '{freq_col}': {e}")
                continue

        return results

    def create_rank_based_visualization_flexible(self, column: str, bin_size: int = 500, log_transform: bool = False, max_words_to_retain: Optional[int] = None) -> Dict:
        """
        Create rank-based visualization data with flexible column support.

        Words are sorted by the chosen column and sliced into consecutive
        rank groups of `bin_size`; up to 5 sample words are drawn per group.

        Args:
            column: Column name to analyze (any numeric column)
            bin_size: Number of words per rank group
            log_transform: Whether to apply log10 transformation
            max_words_to_retain: Optionally keep only the top-N words

        Returns:
            Dict with group labels/centers, average frequencies, per-group
            stats, sampled words and plotting metadata.
        """
        if self.original_data is None:
            raise ValueError("No data loaded")

        if column not in self.original_data.columns:
            raise ValueError(f"Column '{column}' not found in data")

        # Resolve the word column from config, falling back to the alias.
        word_col = self.column_config.get('word_column', 'Type') if self.column_config else 'Type'
        if word_col not in self.original_data.columns:
            word_col = 'Type'

        # Rank words by the requested column (descending).
        sorted_data = self.original_data.sort_values(column, ascending=False).reset_index(drop=True)

        if max_words_to_retain and max_words_to_retain < len(sorted_data):
            sorted_data = sorted_data.head(max_words_to_retain)
            logger.info(f"Limited analysis to top {max_words_to_retain} words")

        group_labels = []
        group_centers = []
        avg_frequencies = []
        sample_words = {}
        group_stats_list = []

        # Cap the number of displayed bins for UI performance.
        max_display_bins = 20

        for i in range(0, len(sorted_data), bin_size):
            if len(group_labels) >= max_display_bins:
                break

            end_idx = min(i + bin_size, len(sorted_data))
            bin_data = sorted_data[i:end_idx]

            # 1-based rank boundaries for labeling.
            start_rank = i + 1
            end_rank = end_idx
            group_label = f"{start_rank}-{end_rank}"
            group_labels.append(group_label)
            group_centers.append((start_rank + end_rank) / 2)

            avg_freq = bin_data[column].mean()
            if log_transform:
                # Small epsilon guards against log10(0).
                avg_freq = np.log10(avg_freq + 1e-10)
            avg_frequencies.append(avg_freq)

            # Up to 5 sample words per bin (all of them if the bin is small).
            n_samples = min(5, len(bin_data))
            if n_samples > 0:
                if n_samples == len(bin_data):
                    sample_word_list = bin_data[word_col].tolist()
                else:
                    sample_indices = random.sample(range(len(bin_data)), n_samples)
                    sample_word_list = [bin_data.iloc[idx][word_col] for idx in sample_indices]
            else:
                sample_word_list = []

            group_idx = len(group_labels) - 1
            sample_words[group_idx] = [{'word': word, 'group': group_label} for word in sample_word_list]

            group_stats_list.append({
                'group_idx': group_idx,
                f'{column}_mean': bin_data[column].mean(),
                f'{column}_count': len(bin_data),
                f'{column}_min': bin_data[column].min(),
                f'{column}_max': bin_data[column].max(),
                'start_rank': start_rank,
                'end_rank': end_rank
            })

        group_stats = pd.DataFrame(group_stats_list)

        # Compose a human-readable title suffix.
        title_parts = [f"Bin Size: {bin_size}"]
        if max_words_to_retain:
            title_parts.append(f"Top {max_words_to_retain:,} words")
        title_parts.append(f"{'Log₁₀ ' if log_transform else ''}{column}")
        title_suffix = " (" + ", ".join(title_parts) + ")"

        return {
            'group_labels': group_labels,
            'group_centers': group_centers,
            'avg_frequencies': avg_frequencies,
            'group_stats': group_stats,
            'sample_words': sample_words,
            'bin_size': bin_size,
            'column': column,
            'log_transform': log_transform,
            'max_words_to_retain': max_words_to_retain,
            'total_groups': len(group_labels),
            'title_suffix': title_suffix,
            'x_label': f"Rank Groups (bin size: {bin_size})",
            'y_label': f"{'Log₁₀ ' if log_transform else ''}Average {column}"
        }

    # ------------------------------------------------------------------
    # Legacy methods for backward compatibility
    # ------------------------------------------------------------------

    def validate_format(self, df: pd.DataFrame) -> bool:
        """Legacy method: check for the classic 'Type'/'Freq' columns."""
        return 'Type' in df.columns and 'Freq' in df.columns

    def get_available_columns(self) -> List[str]:
        """Legacy method: report which of 'Freq'/'NormFreq' are present."""
        if self.data is None:
            return []

        freq_columns = []
        if 'Freq' in self.data.columns:
            freq_columns.append('Freq')
        if 'NormFreq' in self.data.columns:
            freq_columns.append('NormFreq')

        return freq_columns

    def create_histogram_data(self, column: str = 'Freq', bins: int = 25, log_transform: bool = False) -> Dict:
        """Legacy histogram method: bin raw (or log10) frequency values."""
        if self.data is None:
            raise ValueError("No data loaded")

        if column not in self.data.columns:
            raise ValueError(f"Column '{column}' not found in data")

        freq_values = self.data[column].copy()

        if log_transform:
            # Epsilon guards against log10(0).
            freq_values = np.log10(freq_values + 1e-10)
            title_suffix = f" (Log₁₀ {column})"
            x_label = f"Log₁₀ {column}"
        else:
            title_suffix = f" ({column})"
            x_label = column

        counts, bin_edges = np.histogram(freq_values, bins=bins)
        bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
        bin_widths = bin_edges[1:] - bin_edges[:-1]

        return {
            'counts': counts,
            'bin_edges': bin_edges,
            'bin_centers': bin_centers,
            'bin_widths': bin_widths,
            'freq_values': freq_values,
            'original_column': column,
            'log_transform': log_transform,
            'title_suffix': title_suffix,
            'x_label': x_label,
            'total_words': len(self.data)
        }

    def sample_words_per_bin(self, histogram_data: Dict, samples_per_bin: int = 5) -> Dict[int, List[Dict]]:
        """Legacy word sampling: draw up to N representative words per histogram bin."""
        if self.data is None:
            raise ValueError("No data loaded")

        bin_edges = histogram_data['bin_edges']
        freq_values = histogram_data['freq_values']
        original_column = histogram_data['original_column']

        sampled_words = {}

        for i in range(len(bin_edges) - 1):
            bin_start = bin_edges[i]
            bin_end = bin_edges[i + 1]

            # The last bin is closed on the right so max values are included.
            if i == len(bin_edges) - 2:
                mask = (freq_values >= bin_start) & (freq_values <= bin_end)
            else:
                mask = (freq_values >= bin_start) & (freq_values < bin_end)

            bin_words = self.data[mask]

            if len(bin_words) > 0:
                # Deterministic sampling (fixed seed) for reproducible UI output.
                n_samples = min(samples_per_bin, len(bin_words))
                sampled = bin_words.sample(n=n_samples, random_state=42)

                word_list = []
                for _, word_row in sampled.iterrows():
                    word_list.append({
                        'word': word_row['Type'],
                        'freq': word_row[original_column],
                        'rank': word_row.get('Rank', 'N/A'),
                        # Raw frequency regardless of which column was binned
                        # (the original branched here, but both arms were identical).
                        'original_freq': word_row['Freq']
                    })

                sampled_words[i] = word_list
            else:
                sampled_words[i] = []

        return sampled_words

    def create_rank_based_visualization(self, column: str = 'Freq', bin_size: int = 500, log_transform: bool = False) -> Dict:
        """Legacy rank-based visualization method for backward compatibility."""
        return self.create_rank_based_visualization_flexible(column, bin_size, log_transform)

    def calculate_statistics(self, column: str = 'Freq') -> Dict:
        """Calculate descriptive statistics for the frequency data."""
        if self.data is None:
            raise ValueError("No data loaded")

        if column not in self.data.columns:
            raise ValueError(f"Column '{column}' not found in data")

        freq_values = self.data[column]

        stats = {
            'count': len(freq_values),
            'mean': float(freq_values.mean()),
            'median': float(freq_values.median()),
            'std': float(freq_values.std()),
            'min': float(freq_values.min()),
            'max': float(freq_values.max()),
            'q25': float(freq_values.quantile(0.25)),
            'q75': float(freq_values.quantile(0.75)),
            'skewness': float(freq_values.skew()),
            'column_name': column
        }

        # Derived dispersion metrics.
        stats['range'] = stats['max'] - stats['min']
        stats['iqr'] = stats['q75'] - stats['q25']
        stats['cv'] = stats['std'] / stats['mean'] if stats['mean'] != 0 else 0

        return stats

    def get_top_words(self, column: str = 'Freq', n: int = 10) -> List[Dict]:
        """Get the top N words by the given frequency column."""
        if self.data is None:
            raise ValueError("No data loaded")

        if column not in self.data.columns:
            raise ValueError(f"Column '{column}' not found in data")

        top_words = self.data.nlargest(n, column)

        result = []
        for _, row in top_words.iterrows():
            result.append({
                'word': row['Type'],
                'freq': row[column],
                'rank': row.get('Rank', 'N/A'),
                'original_freq': row['Freq']
            })

        return result
text_analyzer/lexical_sophistication.py CHANGED
@@ -13,50 +13,30 @@ import logging
13
  from collections import defaultdict
14
  import re
15
 
 
 
 
16
  # Configure logging
17
  logging.basicConfig(level=logging.INFO)
18
  logger = logging.getLogger(__name__)
19
 
20
 
21
- class LexicalSophisticationAnalyzer:
22
  """
23
  Main class for lexical sophistication analysis.
24
  Handles tokenization, n-gram generation, and score calculation.
25
  """
26
 
27
- def __init__(self, language: str = "en", model_size: str = "trf"):
28
  """
29
  Initialize analyzer with specified language and model.
30
 
31
  Args:
32
  language (str): Language code ('en' for English, 'ja' for Japanese)
33
- model_size (str): SpaCy model size ('trf' or 'lg')
34
  """
35
- self.language = language
36
- self.model_size = model_size
37
- self.nlp = None
38
  self.reference_lists = {}
39
- self._load_spacy_model()
40
-
41
- def _load_spacy_model(self):
42
- """Load appropriate SpaCy model based on language and size."""
43
- model_map = {
44
- ("en", "md"): "en_core_web_md",
45
- ("en", "trf"): "en_core_web_trf",
46
- ("ja", "md"): "ja_core_news_md",
47
- ("ja", "trf"): "ja_core_news_trf"
48
- }
49
-
50
- model_name = model_map.get((self.language, self.model_size))
51
- if not model_name:
52
- raise ValueError(f"Unsupported language/model combination: {self.language}/{self.model_size}")
53
-
54
- try:
55
- self.nlp = spacy.load(model_name)
56
- logger.info(f"Loaded SpaCy model: {model_name}")
57
- except OSError:
58
- logger.error(f"SpaCy model {model_name} not found. Please install it first.")
59
- raise
60
 
61
  def load_reference_lists(self, reference_files: Dict[str, Dict[str, Union[str, dict]]]):
62
  """
@@ -235,26 +215,6 @@ class LexicalSophisticationAnalyzer:
235
  logger.error(f"Error parsing custom config: {e}")
236
  return {}
237
 
238
- def _classify_pos(self, token) -> str:
239
- """
240
- Classify token as content word (CW) or function word (FW).
241
-
242
- Args:
243
- token: SpaCy token object
244
-
245
- Returns:
246
- str: 'CW' for content words, 'FW' for function words
247
- """
248
- content_pos = {'NOUN', 'VERB', 'ADJ', 'ADV'}
249
- function_pos = {'DET', 'PRON', 'ADP', 'CONJ', 'CCONJ', 'SCONJ'}
250
-
251
- if token.pos_ in content_pos:
252
- return 'CW'
253
- elif token.pos_ in function_pos:
254
- return 'FW'
255
- else:
256
- # Default classification for ambiguous cases
257
- return 'CW' if token.pos_ not in {'PUNCT', 'SPACE', 'X'} else 'FW'
258
 
259
  def _generate_ngrams(self, tokens: List, n: int, sep: str = " ") -> List[str]:
260
  """
@@ -296,7 +256,7 @@ class LexicalSophisticationAnalyzer:
296
  measure_col: Optional[str] = None) -> Optional[float]:
297
  """
298
  Look up score for a word in reference lists.
299
-
300
  Args:
301
  word: Word to look up
302
  index_name: Name of the reference index
@@ -314,6 +274,12 @@ class LexicalSophisticationAnalyzer:
314
  return None
315
 
316
  if file_type in ['token', 'lemma']:
 
 
 
 
 
 
317
  # Simple dictionary lookup for unigrams
318
  return ref_data.get(word.lower())
319
  else:
@@ -344,6 +310,169 @@ class LexicalSophisticationAnalyzer:
344
  except (ValueError, TypeError):
345
  return None
346
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
 
348
  def analyze_text(self, text: str, selected_indices: List[str],
349
  apply_log: bool = False, word_type_filter: Optional[str] = None) -> Dict:
@@ -359,12 +488,9 @@ class LexicalSophisticationAnalyzer:
359
  Returns:
360
  Dictionary containing analysis results
361
  """
362
- if not self.nlp:
363
- raise ValueError("SpaCy model not loaded")
364
-
365
- # Process text
366
- doc = self.nlp(text)
367
- tokens = [token for token in doc if not token.is_punct and not token.is_space]
368
 
369
  # Generate n-grams
370
  bigrams = self._generate_ngrams(tokens, 2)
@@ -382,7 +508,9 @@ class LexicalSophisticationAnalyzer:
382
  'content_words': len([t for t in tokens if self._classify_pos(t) == 'CW']),
383
  'function_words': len([t for t in tokens if self._classify_pos(t) == 'FW'])
384
  },
385
- 'raw_scores': {} # Add raw_scores for plotting
 
 
386
  }
387
 
388
  # Initialize score collections
@@ -396,23 +524,78 @@ class LexicalSophisticationAnalyzer:
396
  if word_type_filter and word_type != word_type_filter:
397
  continue
398
 
 
399
  token_detail = {
400
  'id': i + 1,
401
  'token': token.text,
402
  'lemma': token.lemma_,
403
  'pos': token.pos_,
 
 
 
 
404
  'word_type': word_type
405
  }
406
 
407
  # Look up scores for each selected index
408
  for index_name in selected_indices:
409
- # Token-based lookup
410
- token_score = self._lookup_score(token.text, index_name, 'token')
411
- lemma_score = self._lookup_score(token.lemma_, index_name, 'lemma')
 
 
 
 
 
412
 
413
- # Store scores
414
- token_detail[f"{index_name}_token"] = token_score if token_score is not None else "NA"
415
- token_detail[f"{index_name}_lemma"] = lemma_score if lemma_score is not None else "NA"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
416
 
417
  # Collect for summary statistics
418
  if token_score is not None:
@@ -477,7 +660,7 @@ class LexicalSophisticationAnalyzer:
477
  score_val = np.log10(score) if apply_log and score > 0 else score
478
  ngram_detail[f"{index_name}_{measure}"] = score_val
479
  else:
480
- ngram_detail[f"{index_name}_{measure}"] = "NA"
481
 
482
  results[ngram_details_key].append(ngram_detail)
483
 
 
13
  from collections import defaultdict
14
  import re
15
 
16
+ from .base_analyzer import BaseAnalyzer
17
+ from .app_config import AppConfig
18
+
19
  # Configure logging
20
  logging.basicConfig(level=logging.INFO)
21
  logger = logging.getLogger(__name__)
22
 
23
 
24
+ class LexicalSophisticationAnalyzer(BaseAnalyzer):
25
  """
26
  Main class for lexical sophistication analysis.
27
  Handles tokenization, n-gram generation, and score calculation.
28
  """
29
 
30
+ def __init__(self, language: str = None, model_size: str = None):
31
  """
32
  Initialize analyzer with specified language and model.
33
 
34
  Args:
35
  language (str): Language code ('en' for English, 'ja' for Japanese)
36
+ model_size (str): SpaCy model size ('md' or 'trf')
37
  """
38
+ super().__init__(language, model_size)
 
 
39
  self.reference_lists = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
  def load_reference_lists(self, reference_files: Dict[str, Dict[str, Union[str, dict]]]):
42
  """
 
215
  logger.error(f"Error parsing custom config: {e}")
216
  return {}
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
  def _generate_ngrams(self, tokens: List, n: int, sep: str = " ") -> List[str]:
220
  """
 
256
  measure_col: Optional[str] = None) -> Optional[float]:
257
  """
258
  Look up score for a word in reference lists.
259
+
260
  Args:
261
  word: Word to look up
262
  index_name: Name of the reference index
 
274
  return None
275
 
276
  if file_type in ['token', 'lemma']:
277
+ # Check if this is Japanese corpus data
278
+ if isinstance(ref_data, dict) and ref_data.get('is_japanese_corpus', False):
279
+ # This should not be called directly for Japanese data
280
+ # Use _lookup_japanese_score instead
281
+ return None
282
+
283
  # Simple dictionary lookup for unigrams
284
  return ref_data.get(word.lower())
285
  else:
 
310
  except (ValueError, TypeError):
311
  return None
312
  return None
313
+
314
+ def _lookup_with_unidic_fallback(self, token, index_name: str, file_type: str) -> Dict:
315
+ """
316
+ Enhanced Japanese lookup with UniDic 3-level fallback using corpus-compatible keys.
317
+
318
+ Args:
319
+ token: SpaCy token object with UniDic extensions
320
+ index_name: Name of the reference index
321
+ file_type: Type of reference file ('token', 'lemma')
322
+
323
+ Returns:
324
+ Dictionary with score, method, key, and diagnostic information
325
+ """
326
+ # Initialize diagnostic tracking
327
+ attempted_keys = []
328
+ diagnostic_info = {
329
+ 'attempted_keys': attempted_keys,
330
+ 'unidic_features': {},
331
+ 'alignment_confidence': getattr(token._, 'alignment_confidence', 0.0),
332
+ 'spacy_fallback_used': False,
333
+ 'no_match': False
334
+ }
335
+
336
+ # Get UniDic features from token extensions
337
+ unidic_features = {
338
+ 'lemma': getattr(token._, 'unidic_lemma', '') or '',
339
+ 'lForm': getattr(token._, 'unidic_lform', '') or '',
340
+ 'pos1': getattr(token._, 'unidic_pos1', '') or '',
341
+ 'pos2': getattr(token._, 'unidic_pos2', '') or '',
342
+ 'pos3': getattr(token._, 'unidic_pos3', '') or '',
343
+ 'goshu': getattr(token._, 'unidic_goshu', '') or ''
344
+ }
345
+ diagnostic_info['unidic_features'] = unidic_features
346
+
347
+ # Only proceed with UniDic matching if we have good alignment and features
348
+ if diagnostic_info['alignment_confidence'] > 0.5 and any(unidic_features.values()):
349
+
350
+ # Try corpus-compatible keys using the hierarchical lookup dictionaries
351
+ # Level 1: {lemma}_{lForm}_{pos1}_{pos2}_{pos3} (when pos3 exists)
352
+ if all([unidic_features['lemma'], unidic_features['lForm'],
353
+ unidic_features['pos1'], unidic_features['pos2'], unidic_features['pos3']]):
354
+ level1_key = f"{unidic_features['lemma']}_{unidic_features['lForm']}_{unidic_features['pos1']}_{unidic_features['pos2']}_{unidic_features['pos3']}"
355
+ attempted_keys.append(level1_key)
356
+ score = self._lookup_japanese_corpus_level(level1_key, index_name, file_type, 'level1_dict')
357
+ if score is not None:
358
+ return {
359
+ 'score': score,
360
+ 'match_method': 'unidic_corpus_level_1',
361
+ 'match_key': level1_key,
362
+ 'diagnostic_info': diagnostic_info
363
+ }
364
+
365
+ # Level 2: {lemma}_{lForm}_{pos1}_{pos2}
366
+ if all([unidic_features['lemma'], unidic_features['lForm'],
367
+ unidic_features['pos1'], unidic_features['pos2']]):
368
+ level2_key = f"{unidic_features['lemma']}_{unidic_features['lForm']}_{unidic_features['pos1']}_{unidic_features['pos2']}"
369
+ attempted_keys.append(level2_key)
370
+ score = self._lookup_japanese_corpus_level(level2_key, index_name, file_type, 'level2_dict')
371
+ if score is not None:
372
+ return {
373
+ 'score': score,
374
+ 'match_method': 'unidic_corpus_level_2',
375
+ 'match_key': level2_key,
376
+ 'diagnostic_info': diagnostic_info
377
+ }
378
+
379
+ # Level 3: {lemma}_{lForm}_{pos1}
380
+ if all([unidic_features['lemma'], unidic_features['lForm'], unidic_features['pos1']]):
381
+ level3_key = f"{unidic_features['lemma']}_{unidic_features['lForm']}_{unidic_features['pos1']}"
382
+ attempted_keys.append(level3_key)
383
+ score = self._lookup_japanese_corpus_level(level3_key, index_name, file_type, 'level3_dict')
384
+ if score is not None:
385
+ return {
386
+ 'score': score,
387
+ 'match_method': 'unidic_corpus_level_3',
388
+ 'match_key': level3_key,
389
+ 'diagnostic_info': diagnostic_info
390
+ }
391
+
392
+ # Fallback to legacy spaCy-based matching
393
+ diagnostic_info['spacy_fallback_used'] = True
394
+ legacy_score = self._lookup_japanese_score(token, index_name, file_type, fallback=True)
395
+ if legacy_score is not None:
396
+ legacy_key = f"{token.lemma_}_{token.tag_}"
397
+ attempted_keys.append(f"legacy: {legacy_key}")
398
+ return {
399
+ 'score': legacy_score,
400
+ 'match_method': 'legacy_spacy',
401
+ 'match_key': legacy_key,
402
+ 'diagnostic_info': diagnostic_info
403
+ }
404
+
405
+ # No match found
406
+ diagnostic_info['no_match'] = True
407
+ return {
408
+ 'score': None,
409
+ 'match_method': 'none',
410
+ 'match_key': None,
411
+ 'diagnostic_info': diagnostic_info
412
+ }
413
+
414
+ def _lookup_japanese_corpus_level(self, key: str, index_name: str, file_type: str, level_dict_name: str) -> Optional[float]:
415
+ """
416
+ Look up score in a specific level dictionary of Japanese corpus data.
417
+
418
+ Args:
419
+ key: Composite key to look up
420
+ index_name: Name of the reference index
421
+ file_type: Type of reference file ('token', 'lemma')
422
+ level_dict_name: Name of the level dictionary ('level1_dict', 'level2_dict', 'level3_dict')
423
+
424
+ Returns:
425
+ Score if found, None otherwise
426
+ """
427
+ if index_name not in self.reference_lists:
428
+ return None
429
+
430
+ ref_data = self.reference_lists[index_name].get(file_type)
431
+ if ref_data is None or not isinstance(ref_data, dict):
432
+ return None
433
+
434
+ if not ref_data.get('is_japanese_corpus', False):
435
+ return None
436
+
437
+ level_dict = ref_data.get(level_dict_name, {})
438
+ return level_dict.get(key)
439
+
440
+ def _lookup_japanese_score(self, token, index_name: str, file_type: str, fallback: bool = False) -> Optional[float]:
441
+ """
442
+ Look up score for a Japanese word using composite key approach.
443
+
444
+ Args:
445
+ token: SpaCy token object
446
+ index_name: Name of the reference index
447
+ file_type: Type of reference file ('token', 'lemma')
448
+ fallback: Whether to use fallback search strategies
449
+
450
+ Returns:
451
+ Score if found, None otherwise
452
+ """
453
+ if index_name not in self.reference_lists:
454
+ return None
455
+
456
+ ref_data = self.reference_lists[index_name].get(file_type)
457
+ if ref_data is None or not isinstance(ref_data, dict):
458
+ return None
459
+
460
+ if not ref_data.get('is_japanese_corpus', False):
461
+ return None
462
+
463
+ # Try composite key first (lemma_pos)
464
+ composite_key = f"{token.lemma_}_{token.tag_}"
465
+ score = ref_data.get('composite_dict', {}).get(composite_key)
466
+
467
+ if score is None and fallback:
468
+ # Fallback to lemma only
469
+ score = ref_data.get('lemma_dict', {}).get(token.lemma_.lower())
470
+
471
+ if score is None and fallback:
472
+ # Final fallback to surface form
473
+ score = ref_data.get('surface_dict', {}).get(token.text.lower())
474
+
475
+ return score
476
 
477
  def analyze_text(self, text: str, selected_indices: List[str],
478
  apply_log: bool = False, word_type_filter: Optional[str] = None) -> Dict:
 
488
  Returns:
489
  Dictionary containing analysis results
490
  """
491
+ # Process text using base class
492
+ doc = self.process_document(text)
493
+ tokens = self.filter_tokens(doc, exclude_punct=True, exclude_space=True)
 
 
 
494
 
495
  # Generate n-grams
496
  bigrams = self._generate_ngrams(tokens, 2)
 
508
  'content_words': len([t for t in tokens if self._classify_pos(t) == 'CW']),
509
  'function_words': len([t for t in tokens if self._classify_pos(t) == 'FW'])
510
  },
511
+ 'raw_scores': {}, # Raw scores for plotting
512
+ 'tokens': tokens, # Raw spaCy tokens for advanced analysis
513
+ 'doc': doc # Full spaCy doc for complex operations
514
  }
515
 
516
  # Initialize score collections
 
524
  if word_type_filter and word_type != word_type_filter:
525
  continue
526
 
527
+ # Work directly with spaCy token - include syntactic information
528
  token_detail = {
529
  'id': i + 1,
530
  'token': token.text,
531
  'lemma': token.lemma_,
532
  'pos': token.pos_,
533
+ 'tag': token.tag_,
534
+ 'dep_': token.dep_, # Add dependency relation
535
+ 'head_text': token.head.text, # Add head word
536
+ 'head_pos': token.head.pos_, # Add head POS
537
  'word_type': word_type
538
  }
539
 
540
  # Look up scores for each selected index
541
  for index_name in selected_indices:
542
+ # Check if this is a Japanese corpus reference list
543
+ ref_data = self.reference_lists.get(index_name, {})
544
+ is_japanese_corpus = False
545
+ for file_type in ['token', 'lemma']:
546
+ data = ref_data.get(file_type, {})
547
+ if isinstance(data, dict) and data.get('is_japanese_corpus', False):
548
+ is_japanese_corpus = True
549
+ break
550
 
551
+ if is_japanese_corpus and self.language == 'ja':
552
+ # Use enhanced UniDic lookup with 3-level fallback and diagnostics
553
+ token_result = self._lookup_with_unidic_fallback(token, index_name, 'token')
554
+ lemma_result = self._lookup_with_unidic_fallback(token, index_name, 'lemma')
555
+
556
+ # Extract scores and diagnostic information
557
+ token_score = token_result['score']
558
+ lemma_score = lemma_result['score']
559
+
560
+ # Store enhanced details with diagnostic information
561
+ token_detail[f"{index_name}_token"] = token_score if token_score is not None else None
562
+ token_detail[f"{index_name}_lemma"] = lemma_score if lemma_score is not None else None
563
+
564
+ # Add diagnostic information for debugging
565
+ token_detail[f"{index_name}_token_match_method"] = token_result['match_method']
566
+ token_detail[f"{index_name}_lemma_match_method"] = lemma_result['match_method']
567
+ token_detail[f"{index_name}_token_match_key"] = token_result['match_key'] or None
568
+ token_detail[f"{index_name}_lemma_match_key"] = lemma_result['match_key'] or None
569
+
570
+ # Store UniDic features for display
571
+ if hasattr(token, '_') and hasattr(token._, 'unidic_lemma'):
572
+ token_detail['unidic_features'] = {
573
+ 'lemma': getattr(token._, 'unidic_lemma', ''),
574
+ 'lForm': getattr(token._, 'unidic_lform', ''),
575
+ 'pos1': getattr(token._, 'unidic_pos1', ''),
576
+ 'pos2': getattr(token._, 'unidic_pos2', ''),
577
+ 'goshu': getattr(token._, 'unidic_goshu', ''),
578
+ 'alignment_confidence': getattr(token._, 'alignment_confidence', 0.0)
579
+ }
580
+
581
+ elif is_japanese_corpus:
582
+ # Fallback to legacy Japanese lookup if UniDic not available
583
+ token_score = self._lookup_japanese_score(token, index_name, 'token', fallback=True)
584
+ lemma_score = self._lookup_japanese_score(token, index_name, 'lemma', fallback=True)
585
+
586
+ # Store scores
587
+ token_detail[f"{index_name}_token"] = token_score if token_score is not None else None
588
+ token_detail[f"{index_name}_lemma"] = lemma_score if lemma_score is not None else None
589
+ token_detail[f"{index_name}_token_match_method"] = "legacy_spacy"
590
+ token_detail[f"{index_name}_lemma_match_method"] = "legacy_spacy"
591
+ else:
592
+ # Standard lookup for non-Japanese data
593
+ token_score = self._lookup_score(token.text, index_name, 'token')
594
+ lemma_score = self._lookup_score(token.lemma_, index_name, 'lemma')
595
+
596
+ # Store scores
597
+ token_detail[f"{index_name}_token"] = token_score if token_score is not None else None
598
+ token_detail[f"{index_name}_lemma"] = lemma_score if lemma_score is not None else None
599
 
600
  # Collect for summary statistics
601
  if token_score is not None:
 
660
  score_val = np.log10(score) if apply_log and score > 0 else score
661
  ngram_detail[f"{index_name}_{measure}"] = score_val
662
  else:
663
+ ngram_detail[f"{index_name}_{measure}"] = None
664
 
665
  results[ngram_details_key].append(ngram_detail)
666
 
text_analyzer/pos_parser.py CHANGED
@@ -13,15 +13,18 @@ import base64
13
  from io import BytesIO
14
  import zipfile
15
 
 
 
16
  # Configure logging
17
  logging.basicConfig(level=logging.INFO)
18
  logger = logging.getLogger(__name__)
19
 
20
 
21
- class POSParser:
22
  """
23
  Main class for POS tagging and dependency parsing.
24
  Handles multilingual analysis and visualization.
 
25
  """
26
 
27
  def __init__(self, language: str = "en", model_size: str = "trf"):
@@ -30,32 +33,9 @@ class POSParser:
30
 
31
  Args:
32
  language (str): Language code ('en' for English, 'ja' for Japanese)
33
- model_size (str): SpaCy model size ('trf' or 'lg')
34
  """
35
- self.language = language
36
- self.model_size = model_size
37
- self.nlp = None
38
- self._load_spacy_model()
39
-
40
- def _load_spacy_model(self):
41
- """Load appropriate SpaCy model based on language and size."""
42
- model_map = {
43
- ("en", "md"): "en_core_web_md",
44
- ("en", "trf"): "en_core_web_trf",
45
- ("ja", "md"): "ja_core_news_md",
46
- ("ja", "trf"): "ja_core_news_trf"
47
- }
48
-
49
- model_name = model_map.get((self.language, self.model_size))
50
- if not model_name:
51
- raise ValueError(f"Unsupported language/model combination: {self.language}/{self.model_size}")
52
-
53
- try:
54
- self.nlp = spacy.load(model_name)
55
- logger.info(f"Loaded SpaCy model: {model_name}")
56
- except OSError:
57
- logger.error(f"SpaCy model {model_name} not found. Please install it first.")
58
- raise
59
 
60
  def analyze_text(self, text: str) -> Dict:
61
  """
@@ -67,11 +47,8 @@ class POSParser:
67
  Returns:
68
  Dictionary containing analysis results
69
  """
70
- if not self.nlp:
71
- raise ValueError("SpaCy model not loaded")
72
-
73
- # Process text
74
- doc = self.nlp(text)
75
 
76
  # Extract token information
77
  token_data = []
@@ -131,10 +108,8 @@ class POSParser:
131
  Returns:
132
  List of HTML strings, one per sentence
133
  """
134
- if not self.nlp:
135
- raise ValueError("SpaCy model not loaded")
136
-
137
- doc = self.nlp(text)
138
  html_outputs = []
139
 
140
  for sent in doc.sents:
@@ -235,4 +210,4 @@ class POSParser:
235
  zip_file.write(file_path, file_path.name)
236
 
237
  zip_buffer.seek(0)
238
- return zip_buffer.getvalue()
 
13
  from io import BytesIO
14
  import zipfile
15
 
16
+ from .base_analyzer import BaseAnalyzer
17
+
18
  # Configure logging
19
  logging.basicConfig(level=logging.INFO)
20
  logger = logging.getLogger(__name__)
21
 
22
 
23
+ class POSParser(BaseAnalyzer):
24
  """
25
  Main class for POS tagging and dependency parsing.
26
  Handles multilingual analysis and visualization.
27
+ Inherits from BaseAnalyzer for consistent SpaCy model management.
28
  """
29
 
30
  def __init__(self, language: str = "en", model_size: str = "trf"):
 
33
 
34
  Args:
35
  language (str): Language code ('en' for English, 'ja' for Japanese)
36
+ model_size (str): SpaCy model size ('trf' or 'md')
37
  """
38
+ super().__init__(language, model_size)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  def analyze_text(self, text: str) -> Dict:
41
  """
 
47
  Returns:
48
  Dictionary containing analysis results
49
  """
50
+ # Process text using base class method
51
+ doc = self.process_document(text)
 
 
 
52
 
53
  # Extract token information
54
  token_data = []
 
108
  Returns:
109
  List of HTML strings, one per sentence
110
  """
111
+ # Process text using base class method
112
+ doc = self.process_document(text)
 
 
113
  html_outputs = []
114
 
115
  for sent in doc.sents:
 
210
  zip_file.write(file_path, file_path.name)
211
 
212
  zip_buffer.seek(0)
213
+ return zip_buffer.getvalue()
text_analyzer/text_utility.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text processing utilities module.
3
+ Contains reusable functions for file handling, encoding detection, and text cleaning.
4
+ """
5
+
6
+ import os
7
+ import tempfile
8
+ import chardet
9
+ from pathlib import Path
10
+ from typing import Union, Tuple, List, Dict, Any, Optional
11
+ import logging
12
+ import re
13
+ from .app_config import AppConfig
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class TextUtility:
    """Collection of text processing and file handling utilities.

    All members are stateless ``@staticmethod``s, so the class serves purely
    as a namespace; no instantiation is needed.
    """

    @staticmethod
    def detect_encoding(content: bytes) -> str:
        """
        Detect encoding of byte content.

        Strategy: ask chardet for a guess, accept it only if it is one of
        ``AppConfig.SUPPORTED_ENCODINGS``, otherwise brute-force the
        supported encodings, and finally fall back to UTF-8.

        Args:
            content: Byte content to analyze

        Returns:
            Detected encoding string
        """
        try:
            # Try chardet for automatic detection
            result = chardet.detect(content)
            # chardet may report {'encoding': None} for undecidable input
            encoding = result.get('encoding', 'utf-8')

            # Validate detected encoding against supported list
            if encoding and encoding.lower() in [enc.lower() for enc in AppConfig.SUPPORTED_ENCODINGS]:
                return encoding

            # Fall back to trying supported encodings in configured order
            for enc in AppConfig.SUPPORTED_ENCODINGS:
                try:
                    content.decode(enc)
                    return enc
                except UnicodeDecodeError:
                    continue

            # Final fallback
            return 'utf-8'

        except Exception as e:
            logger.warning(f"Error detecting encoding: {e}, defaulting to utf-8")
            return 'utf-8'

    @staticmethod
    def detect_delimiter(text: str) -> str:
        """
        Detect the most likely column delimiter in text content.

        Args:
            text: Text content to analyze

        Returns:
            The supported delimiter occurring most often; tab when no
            supported delimiters are configured.
        """
        # Count occurrences of each supported delimiter
        delimiter_counts = {}
        for delimiter in AppConfig.SUPPORTED_DELIMITERS:
            delimiter_counts[delimiter] = text.count(delimiter)

        # Return the most frequent delimiter, or tab as default
        if delimiter_counts:
            return max(delimiter_counts, key=delimiter_counts.get)
        return '\t'

    @staticmethod
    def clean_text_input(text: str) -> str:
        """
        Clean text input by normalizing whitespace and removing problematic characters.

        Args:
            text: Raw text input

        Returns:
            Cleaned text
        """
        if not text:
            return ""

        # Normalize whitespace (collapses runs of spaces/tabs, trims lines)
        text = TextUtility.normalize_whitespace(text)

        # Remove null bytes, which break many downstream C libraries
        text = text.replace('\x00', '')

        # Drop characters that cannot round-trip through UTF-8
        # (e.g. unpaired surrogates left over from a bad decode)
        text = text.encode('utf-8', errors='ignore').decode('utf-8')

        return text.strip()

    @staticmethod
    def normalize_whitespace(text: str) -> str:
        """
        Normalize whitespace in text while preserving line structure.

        Collapses runs of horizontal whitespace to a single space, strips
        each line, and removes empty lines at the beginning and end.

        Args:
            text: Text to normalize

        Returns:
            Text with normalized whitespace
        """
        if not text:
            return ""

        # Collapse runs of whitespace EXCEPT newlines to a single space.
        # (A plain r'\s+' also consumed newlines, which made the per-line
        # cleanup below dead code.)
        text = re.sub(r'[^\S\n]+', ' ', text)

        # Remove leading/trailing whitespace from each line
        lines = [line.strip() for line in text.split('\n')]

        # Remove empty lines at beginning and end
        while lines and not lines[0]:
            lines.pop(0)
        while lines and not lines[-1]:
            lines.pop()

        return '\n'.join(lines)

    @staticmethod
    def validate_text_length(text: str, max_length: int = None) -> bool:
        """
        Validate text length against limits.

        Args:
            text: Text to validate
            max_length: Maximum allowed length (optional; no upper bound when
                None or 0)

        Returns:
            True if text is non-empty and within the limit
        """
        if not text:
            return False

        if max_length and len(text) > max_length:
            return False

        return True

    @staticmethod
    def extract_text_from_file(file_path: str) -> str:
        """
        Extract text content from a file with encoding detection.

        Args:
            file_path: Path to the file

        Returns:
            Extracted and cleaned text content

        Raises:
            ValueError: if the file cannot be read or decoded
        """
        try:
            # Read as bytes first for encoding detection
            with open(file_path, 'rb') as f:
                content = f.read()

            # Detect encoding, then decode with it
            encoding = TextUtility.detect_encoding(content)
            text = content.decode(encoding)

            # Clean the text
            return TextUtility.clean_text_input(text)

        except Exception as e:
            logger.error(f"Error extracting text from {file_path}: {e}")
            raise ValueError(f"Failed to extract text from file: {e}")

    @staticmethod
    def prepare_batch_files(file_paths: List[str]) -> List[Tuple[str, str]]:
        """
        Prepare batch files for processing by extracting text content.

        Files that fail to load are kept in the result with an
        ``"ERROR: ..."`` payload instead of aborting the whole batch.

        Args:
            file_paths: List of file paths

        Returns:
            List of tuples (file_path, text_content)
        """
        prepared_files = []

        for file_path in file_paths:
            try:
                text_content = TextUtility.extract_text_from_file(file_path)
                prepared_files.append((file_path, text_content))
            except Exception as e:
                logger.error(f"Error preparing file {file_path}: {e}")
                # Add error entry so callers can report per-file failures
                prepared_files.append((file_path, f"ERROR: {e}"))

        return prepared_files

    @staticmethod
    def sanitize_filename(filename: str) -> str:
        """
        Sanitize filename by removing problematic characters.

        Args:
            filename: Original filename

        Returns:
            Sanitized filename ('unnamed_file' when nothing survives)
        """
        # Replace characters that are illegal on common filesystems
        filename = re.sub(r'[<>:"/\\|?*]', '_', filename)

        # Remove control characters
        filename = ''.join(char for char in filename if ord(char) >= 32)

        # Limit length while keeping the extension intact
        if len(filename) > 255:
            name, ext = os.path.splitext(filename)
            filename = name[:255 - len(ext)] + ext

        return filename or "unnamed_file"

    @staticmethod
    def create_safe_temp_file(content: str, suffix: str = '.txt') -> str:
        """
        Create a temporary file with given content safely.

        The file is NOT auto-deleted; pass the returned path to
        cleanup_temp_files() when it is no longer needed.

        Args:
            content: Content to write to file
            suffix: File suffix

        Returns:
            Path to created temporary file

        Raises:
            ValueError: if the file cannot be created
        """
        try:
            with tempfile.NamedTemporaryFile(mode='w', suffix=suffix, delete=False, encoding='utf-8') as f:
                f.write(content)
                return f.name
        except Exception as e:
            logger.error(f"Error creating temporary file: {e}")
            raise ValueError(f"Failed to create temporary file: {e}")

    @staticmethod
    def load_corpus_config(corpus_name: str) -> Dict[str, Any]:
        """
        Load specific corpus configuration from reference_lists.yaml

        Args:
            corpus_name: Name of the corpus

        Returns:
            Corpus configuration dictionary
        """
        return AppConfig.get_corpus_configuration(corpus_name)

    @staticmethod
    def get_column_mapping(config: Dict, corpus_type: str = 'columns') -> Dict[str, int]:
        """
        Extract column mappings from corpus configuration

        Args:
            config: Corpus configuration dictionary
            corpus_type: Type of mapping to extract

        Returns:
            Dictionary mapping column names to indices
        """
        return config.get(corpus_type, {})

    @staticmethod
    def cleanup_temp_files(file_paths: List[str]) -> None:
        """
        Clean up temporary files safely.

        Args:
            file_paths: List of temporary file paths to clean up
        """
        for file_path in file_paths:
            try:
                if os.path.exists(file_path):
                    os.unlink(file_path)
            except Exception as e:
                logger.warning(f"Error cleaning up temporary file {file_path}: {e}")
text_analyzer/unidic_enricher.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ UniDic morphological enricher for Japanese text analysis.
3
+ Provides fugashi/UniDic integration with character offset alignment.
4
+ """
5
+
6
+ import fugashi
7
+ from unidic import DICDIR
8
+ from typing import List, Dict, Optional, Tuple
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class UniDicEnricher:
    """
    Enriches spaCy tokens with UniDic morphological features using fugashi.
    Handles character offset alignment and provides comprehensive feature extraction.

    Typical usage: construct once, then call enrich_spacy_doc(doc, text) to
    populate the ``token._.unidic_*`` extensions registered in
    unidic_extensions.py.
    """

    def __init__(self, unidic_path: Optional[str] = None):
        """
        Initialize with full UniDic dictionary.

        Args:
            unidic_path: Path to UniDic dictionary. Uses the installed
                ``unidic`` package's DICDIR if None.

        Raises:
            Exception: whatever fugashi raises when the tagger cannot be
                built (e.g. missing dictionary files); re-raised after logging.
        """
        if unidic_path is None:
            unidic_path = DICDIR

        try:
            # Initialize tagger with full UniDic.
            # NOTE(review): '-Owakati' selects wakati (space-separated) output
            # mode, while the code below reads rich features via node.feature —
            # confirm this combination behaves with the installed fugashi version.
            self.tagger = fugashi.Tagger(f'-Owakati -d {unidic_path}')
            logger.info(f"UniDicEnricher initialized with dictionary: {unidic_path}")
        except Exception as e:
            logger.error(f"Failed to initialize UniDic tagger: {e}")
            raise

    def extract_full_features(self, word_node) -> Dict[str, str]:
        """
        Extract complete UniDic features using proper fugashi API.

        Args:
            word_node: Fugashi word node object

        Returns:
            Dictionary of UniDic morphological features; None values are
            normalized to empty strings. On extraction failure an
            empty-feature dict (with the surface form) is returned instead.
        """
        try:
            features = {
                'surface': word_node.surface,
                'pos1': word_node.feature.pos1,  # POS, major class (品詞大分類)
                'pos2': word_node.feature.pos2,  # POS, middle class (品詞中分類)
                'pos3': word_node.feature.pos3,  # POS, small class (品詞小分類)
                'pos4': word_node.feature.pos4,  # POS, fine class (品詞細分類)
                'cType': word_node.feature.cType,  # conjugation type (活用型)
                'cForm': word_node.feature.cForm,  # conjugation form (活用形)
                'lemma': word_node.feature.lemma,  # lemma / base form (基本形)
                'lForm': word_node.feature.lForm,  # lexeme reading (読み)
                'orth': word_node.feature.orth,  # orthographic form (表記)
                'orthBase': word_node.feature.orthBase,  # orthographic base form (表記基本形)
                'goshu': word_node.feature.goshu,  # word origin: native/Sino/foreign/mixed (語種)
            }

            # Handle None values by converting to empty strings
            for key, value in features.items():
                if value is None:
                    features[key] = ""

            return features

        except Exception as e:
            logger.warning(f"Error extracting features from word node: {e}")
            return self._get_empty_features(word_node.surface if hasattr(word_node, 'surface') else "")

    def _get_empty_features(self, surface: str) -> Dict[str, str]:
        """Return empty feature dictionary with surface form.

        Note: 'lemma' falls back to the surface form so downstream
        composite keys never end up empty on that slot.
        """
        return {
            'surface': surface,
            'pos1': '', 'pos2': '', 'pos3': '', 'pos4': '',
            'cType': '', 'cForm': '', 'lemma': surface, 'lForm': '',
            'orth': '', 'orthBase': '', 'goshu': ''
        }

    def parse_text(self, text: str) -> List[Dict[str, object]]:
        """
        Parse text with fugashi and extract character positions.

        Args:
            text: Input text to parse

        Returns:
            List of dicts with keys 'surface', 'start', 'end', 'features';
            an empty list if fugashi fails mid-parse.
        """
        result = []
        char_pos = 0  # search cursor into `text`; only moves forward

        try:
            for word_node in self.tagger(text):
                surface = word_node.surface
                features = self.extract_full_features(word_node)

                # Find character position in original text. Searching from
                # char_pos keeps repeated surfaces anchored to the right spot.
                start_pos = text.find(surface, char_pos)
                if start_pos == -1:
                    # Fallback: assume consecutive positioning
                    start_pos = char_pos

                end_pos = start_pos + len(surface)

                result.append({
                    'surface': surface,
                    'start': start_pos,
                    'end': end_pos,
                    'features': features
                })

                char_pos = end_pos

        except Exception as e:
            logger.error(f"Error parsing text with fugashi: {e}")

        return result

    def align_with_spacy_tokens(self, text: str, spacy_tokens) -> List[Dict]:
        """
        Align fugashi tokens with spaCy tokens using character offsets.

        For each spaCy token the fugashi token with the highest character
        overlap is selected (an O(n*m) scan over both token lists).

        Args:
            text: Original input text
            spacy_tokens: Iterable of spaCy token objects

        Returns:
            List of alignment dicts with keys 'spacy_token', 'fugashi_token'
            (None when nothing overlaps), 'confidence' (0.0-1.0), and
            'aligned' (confidence > 0.5).
        """
        fugashi_tokens = self.parse_text(text)
        alignments = []

        for spacy_token in spacy_tokens:
            spacy_start = spacy_token.idx
            spacy_end = spacy_token.idx + len(spacy_token.text)

            best_match = None
            best_confidence = 0.0

            # Find best overlapping fugashi token
            for fugashi_token in fugashi_tokens:
                overlap = self._calculate_overlap(
                    spacy_start, spacy_end,
                    fugashi_token['start'], fugashi_token['end']
                )

                if overlap > best_confidence:
                    best_confidence = overlap
                    best_match = fugashi_token

            alignment = {
                'spacy_token': spacy_token,
                'fugashi_token': best_match,
                'confidence': best_confidence,
                'aligned': best_confidence > 0.5  # Threshold for successful alignment
            }

            alignments.append(alignment)

        return alignments

    def _calculate_overlap(self, start1: int, end1: int, start2: int, end2: int) -> float:
        """
        Calculate overlap ratio between two character ranges.

        The overlap length is divided by the LONGER of the two ranges, so the
        ratio reaches 1.0 only when the ranges coincide exactly.

        Args:
            start1, end1: First range (half-open)
            start2, end2: Second range (half-open)

        Returns:
            Overlap ratio (0.0 to 1.0)
        """
        if end1 <= start2 or end2 <= start1:
            return 0.0  # No overlap

        overlap_start = max(start1, start2)
        overlap_end = min(end1, end2)
        overlap_length = overlap_end - overlap_start

        total_length = max(end1 - start1, end2 - start2)

        return overlap_length / total_length if total_length > 0 else 0.0

    def enrich_spacy_doc(self, doc, text: str):
        """
        Add UniDic features to spaCy tokens via the ``token._.unidic_*``
        extensions.

        On any top-level error, ALL tokens receive empty features rather
        than being left with unset extensions.

        Args:
            doc: spaCy document object
            text: Original input text
        """
        try:
            # Get token alignments
            alignments = self.align_with_spacy_tokens(text, doc)

            # Apply UniDic features to spaCy tokens
            for alignment in alignments:
                token = alignment['spacy_token']
                fugashi_token = alignment['fugashi_token']
                confidence = alignment['confidence']

                # Set alignment confidence
                token._.alignment_confidence = confidence

                if fugashi_token and alignment['aligned']:
                    features = fugashi_token['features']

                    # Set UniDic features on token extensions
                    token._.unidic_surface = features.get('surface', '')
                    token._.unidic_lemma = features.get('lemma', '')
                    token._.unidic_lform = features.get('lForm', '')
                    token._.unidic_pos1 = features.get('pos1', '')
                    token._.unidic_pos2 = features.get('pos2', '')
                    token._.unidic_pos3 = features.get('pos3', '')
                    token._.unidic_pos4 = features.get('pos4', '')
                    token._.unidic_goshu = features.get('goshu', '')
                    token._.unidic_orth = features.get('orth', '')
                    token._.unidic_ctype = features.get('cType', '')
                    token._.unidic_cform = features.get('cForm', '')
                    token._.unidic_orthbase = features.get('orthBase', '')

                    # Store full entry for debugging
                    token._.unidic_entries = [features]
                else:
                    # No alignment found - set empty values
                    self._set_empty_unidic_features(token)

            logger.debug(f"Enriched {len(alignments)} tokens with UniDic features")

        except Exception as e:
            logger.error(f"Error enriching spaCy doc: {e}")
            # Set empty features for all tokens on error
            for token in doc:
                self._set_empty_unidic_features(token)

    def _set_empty_unidic_features(self, token):
        """Set empty UniDic features on a token (alignment confidence forced to 0.0)."""
        token._.unidic_surface = ""
        token._.unidic_lemma = ""
        token._.unidic_lform = ""
        token._.unidic_pos1 = ""
        token._.unidic_pos2 = ""
        token._.unidic_pos3 = ""
        token._.unidic_pos4 = ""
        token._.unidic_goshu = ""
        token._.unidic_orth = ""
        token._.unidic_ctype = ""
        token._.unidic_cform = ""
        token._.unidic_orthbase = ""
        token._.unidic_entries = []
        token._.alignment_confidence = 0.0
text_analyzer/unidic_extensions.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ spaCy token extensions for UniDic morphological features.
3
+ This module defines custom token extensions to store UniDic analysis results.
4
+ """
5
+
6
+ from spacy.tokens import Token
7
+
8
+ # Comprehensive UniDic feature extensions
9
+ Token.set_extension("unidic_entries", default=[], force=True)
10
+ Token.set_extension("unidic_lemma", default=None, force=True)
11
+ Token.set_extension("unidic_lform", default=None, force=True)
12
+ Token.set_extension("unidic_pos1", default=None, force=True)
13
+ Token.set_extension("unidic_pos2", default=None, force=True)
14
+ Token.set_extension("unidic_pos3", default=None, force=True)
15
+ Token.set_extension("unidic_sublemma", default=None, force=True)
16
+ Token.set_extension("unidic_goshu", default=None, force=True)
17
+ Token.set_extension("unidic_orth", default=None, force=True)
18
+ Token.set_extension("alignment_confidence", default=1.0, force=True)
19
+
20
+ # Additional extensions for diagnostic tracking
21
+ Token.set_extension("unidic_surface", default=None, force=True)
22
+ Token.set_extension("unidic_pos4", default=None, force=True)
23
+ Token.set_extension("unidic_ctype", default=None, force=True)
24
+ Token.set_extension("unidic_cform", default=None, force=True)
25
+ Token.set_extension("unidic_orthbase", default=None, force=True)
uv.lock CHANGED
@@ -27,6 +27,24 @@ wheels = [
27
  { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
28
  ]
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  [[package]]
31
  name = "attrs"
32
  version = "25.3.0"
@@ -88,6 +106,48 @@ wheels = [
88
  { url = "https://files.pythonhosted.org/packages/4f/52/34c6cf5bb9285074dc3531c437b3919e825d976fde097a7a73f79e726d03/certifi-2025.7.14-py3-none-any.whl", hash = "sha256:6b31f564a415d79ee77df69d757bb49a5bb53bd9f756cbbe24394ffd6fc1f4b2", size = 162722, upload-time = "2025-07-14T03:29:26.863Z" },
89
  ]
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  [[package]]
92
  name = "charset-normalizer"
93
  version = "3.4.2"
@@ -153,6 +213,18 @@ wheels = [
153
  { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
154
  ]
155
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  [[package]]
157
  name = "confection"
158
  version = "0.1.5"
@@ -216,6 +288,32 @@ wheels = [
216
  { url = "https://files.pythonhosted.org/packages/74/65/c162fbac63e867a055240b6600b92ef96c0eb7a1895312ac53c4be93d056/cymem-2.0.11-cp313-cp313-win_amd64.whl", hash = "sha256:25da111adf425c29af0cfd9fecfec1c71c8d82e2244a85166830a0817a66ada7", size = 39090, upload-time = "2025-01-16T21:50:24.239Z" },
217
  ]
218
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  [[package]]
220
  name = "en-core-web-md"
221
  version = "3.7.0"
@@ -248,6 +346,15 @@ requires-dist = [
248
  { name = "spacy-curated-transformers", specifier = ">=0.2.0,<0.3.0" },
249
  ]
250
 
 
 
 
 
 
 
 
 
 
251
  [[package]]
252
  name = "filelock"
253
  version = "3.18.0"
@@ -266,6 +373,26 @@ wheels = [
266
  { url = "https://files.pythonhosted.org/packages/2f/e0/014d5d9d7a4564cf1c40b5039bc882db69fd881111e03ab3657ac0b218e2/fsspec-2025.7.0-py3-none-any.whl", hash = "sha256:8b012e39f63c7d5f10474de957f3ab793b47b45ae7d39f2fb735f8bbe25c0e21", size = 199597, upload-time = "2025-07-15T16:05:19.529Z" },
267
  ]
268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  [[package]]
270
  name = "gitdb"
271
  version = "4.0.12"
@@ -351,6 +478,63 @@ wheels = [
351
  { url = "https://files.pythonhosted.org/packages/ce/ff/3b59672c47c6284e8005b42e84ceba13864aa0f39f067c973d1af02f5d91/InquirerPy-0.3.4-py3-none-any.whl", hash = "sha256:c65fdfbac1fa00e3ee4fb10679f4d3ed7a012abf4833910e63c295827fe2a7d4", size = 67677, upload-time = "2022-06-27T23:11:17.723Z" },
352
  ]
353
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
  [[package]]
355
  name = "ja-core-news-md"
356
  version = "3.7.0"
@@ -393,6 +577,18 @@ requires-dist = [
393
  { name = "sudachipy", specifier = ">=0.5.2,!=0.6.1" },
394
  ]
395
 
 
 
 
 
 
 
 
 
 
 
 
 
396
  [[package]]
397
  name = "jinja2"
398
  version = "3.1.6"
@@ -432,6 +628,36 @@ wheels = [
432
  { url = "https://files.pythonhosted.org/packages/01/0e/b27cdbaccf30b890c40ed1da9fd4a3593a5cf94dae54fb34f8a4b74fcd3f/jsonschema_specifications-2025.4.1-py3-none-any.whl", hash = "sha256:4653bffbd6584f7de83a67e0d620ef16900b390ddc7939d56684d6c81e33f1af", size = 18437, upload-time = "2025-04-23T12:34:05.422Z" },
433
  ]
434
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
435
  [[package]]
436
  name = "langcodes"
437
  version = "3.5.0"
@@ -539,6 +765,18 @@ wheels = [
539
  { url = "https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739, upload-time = "2024-10-18T15:21:42.784Z" },
540
  ]
541
 
 
 
 
 
 
 
 
 
 
 
 
 
542
  [[package]]
543
  name = "mdurl"
544
  version = "0.1.2"
@@ -588,6 +826,15 @@ wheels = [
588
  { url = "https://files.pythonhosted.org/packages/c0/15/278693412221859a0159719878e51a79812a189edceef2fe325160a8e661/narwhals-1.47.1-py3-none-any.whl", hash = "sha256:b9f2b2557aba054231361a00f6fcabc5017e338575e810e82155eb34e38ace93", size = 375506, upload-time = "2025-07-17T18:23:02.492Z" },
589
  ]
590
 
 
 
 
 
 
 
 
 
 
591
  [[package]]
592
  name = "networkx"
593
  version = "3.5"
@@ -789,6 +1036,27 @@ wheels = [
789
  { url = "https://files.pythonhosted.org/packages/d5/f9/07086f5b0f2a19872554abeea7658200824f5835c58a106fa8f2ae96a46c/pandas-2.3.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5db9637dbc24b631ff3707269ae4559bce4b7fd75c1c4d7e13f40edc42df4444", size = 13189044, upload-time = "2025-07-07T19:19:39.999Z" },
790
  ]
791
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
792
  [[package]]
793
  name = "pfzy"
794
  version = "0.3.4"
@@ -864,6 +1132,24 @@ wheels = [
864
  { url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835, upload-time = "2025-07-01T09:15:50.399Z" },
865
  ]
866
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
867
  [[package]]
868
  name = "plotly"
869
  version = "6.2.0"
@@ -929,6 +1215,39 @@ wheels = [
929
  { url = "https://files.pythonhosted.org/packages/f7/af/ab3c51ab7507a7325e98ffe691d9495ee3d3aa5f589afad65ec920d39821/protobuf-6.31.1-py3-none-any.whl", hash = "sha256:720a6c7e6b77288b85063569baae8536671b39f15cc22037ec7045658d80489e", size = 168724, upload-time = "2025-05-28T19:25:53.926Z" },
930
  ]
931
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
932
  [[package]]
933
  name = "pyarrow"
934
  version = "21.0.0"
@@ -958,6 +1277,15 @@ wheels = [
958
  { url = "https://files.pythonhosted.org/packages/e5/4e/519c1bc1876625fe6b71e9a28287c43ec2f20f73c658b9ae1d485c0c206e/pyarrow-21.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:222c39e2c70113543982c6b34f3077962b44fca38c0bd9e68bb6781534425c10", size = 26371006, upload-time = "2025-07-18T00:56:56.379Z" },
959
  ]
960
 
 
 
 
 
 
 
 
 
 
961
  [[package]]
962
  name = "pydantic"
963
  version = "2.11.7"
@@ -1058,6 +1386,22 @@ wheels = [
1058
  { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" },
1059
  ]
1060
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1061
  [[package]]
1062
  name = "pyyaml"
1063
  version = "6.0.2"
@@ -1084,6 +1428,36 @@ wheels = [
1084
  { url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446, upload-time = "2024-08-06T20:33:04.33Z" },
1085
  ]
1086
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1087
  [[package]]
1088
  name = "referencing"
1089
  version = "0.36.2"
@@ -1301,9 +1675,12 @@ name = "simple-text-analyzer"
1301
  version = "0.1.0"
1302
  source = { virtual = "." }
1303
  dependencies = [
 
1304
  { name = "en-core-web-md" },
1305
  { name = "en-core-web-trf" },
 
1306
  { name = "huggingface-hub", extra = ["cli"] },
 
1307
  { name = "ja-core-news-md" },
1308
  { name = "ja-core-news-trf" },
1309
  { name = "numpy" },
@@ -1314,13 +1691,17 @@ dependencies = [
1314
  { name = "spacy" },
1315
  { name = "spacy-curated-transformers" },
1316
  { name = "streamlit" },
 
1317
  ]
1318
 
1319
  [package.metadata]
1320
  requires-dist = [
 
1321
  { name = "en-core-web-md", url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.0/en_core_web_md-3.7.0-py3-none-any.whl" },
1322
  { name = "en-core-web-trf", url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.2/en_core_web_trf-3.7.2-py3-none-any.whl" },
 
1323
  { name = "huggingface-hub", extras = ["cli"], specifier = ">=0.33.4" },
 
1324
  { name = "ja-core-news-md", url = "https://github.com/explosion/spacy-models/releases/download/ja_core_news_md-3.7.0/ja_core_news_md-3.7.0-py3-none-any.whl" },
1325
  { name = "ja-core-news-trf", url = "https://github.com/explosion/spacy-models/releases/download/ja_core_news_trf-3.7.2/ja_core_news_trf-3.7.2-py3-none-any.whl" },
1326
  { name = "numpy", specifier = ">=1.24.0,<2.0" },
@@ -1331,6 +1712,7 @@ requires-dist = [
1331
  { name = "spacy", specifier = ">=3.7.0" },
1332
  { name = "spacy-curated-transformers", specifier = ">=0.1.0,<0.3.0" },
1333
  { name = "streamlit", specifier = ">=1.28.0" },
 
1334
  ]
1335
 
1336
  [[package]]
@@ -1455,6 +1837,20 @@ wheels = [
1455
  { url = "https://files.pythonhosted.org/packages/3a/e2/745aeba88a8513017fbac2fd2f9f07b8a36065e51695f818541eb795ec0c/srsly-2.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:e73712be1634b5e1de6f81c273a7d47fe091ad3c79dc779c03d3416a5c117cee", size = 630634, upload-time = "2025-01-17T09:26:10.018Z" },
1456
  ]
1457
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1458
  [[package]]
1459
  name = "streamlit"
1460
  version = "1.47.0"
@@ -1643,6 +2039,15 @@ wheels = [
1643
  { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
1644
  ]
1645
 
 
 
 
 
 
 
 
 
 
1646
  [[package]]
1647
  name = "triton"
1648
  version = "3.3.1"
@@ -1701,6 +2106,18 @@ wheels = [
1701
  { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
1702
  ]
1703
 
 
 
 
 
 
 
 
 
 
 
 
 
1704
  [[package]]
1705
  name = "urllib3"
1706
  version = "2.5.0"
@@ -1712,14 +2129,11 @@ wheels = [
1712
 
1713
  [[package]]
1714
  name = "wasabi"
1715
- version = "1.1.3"
1716
  source = { registry = "https://pypi.org/simple" }
1717
- dependencies = [
1718
- { name = "colorama", marker = "sys_platform == 'win32'" },
1719
- ]
1720
- sdist = { url = "https://files.pythonhosted.org/packages/ac/f9/054e6e2f1071e963b5e746b48d1e3727470b2a490834d18ad92364929db3/wasabi-1.1.3.tar.gz", hash = "sha256:4bb3008f003809db0c3e28b4daf20906ea871a2bb43f9914197d540f4f2e0878", size = 30391, upload-time = "2024-05-31T16:56:18.99Z" }
1721
  wheels = [
1722
- { url = "https://files.pythonhosted.org/packages/06/7c/34330a89da55610daa5f245ddce5aab81244321101614751e7537f125133/wasabi-1.1.3-py3-none-any.whl", hash = "sha256:f76e16e8f7e79f8c4c8be49b4024ac725713ab10cd7f19350ad18a8e3f71728c", size = 27880, upload-time = "2024-05-31T16:56:16.699Z" },
1723
  ]
1724
 
1725
  [[package]]
 
27
  { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
28
  ]
29
 
30
+ [[package]]
31
+ name = "appnope"
32
+ version = "0.1.4"
33
+ source = { registry = "https://pypi.org/simple" }
34
+ sdist = { url = "https://files.pythonhosted.org/packages/35/5d/752690df9ef5b76e169e68d6a129fa6d08a7100ca7f754c89495db3c6019/appnope-0.1.4.tar.gz", hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee", size = 4170, upload-time = "2024-02-06T09:43:11.258Z" }
35
+ wheels = [
36
+ { url = "https://files.pythonhosted.org/packages/81/29/5ecc3a15d5a33e31b26c11426c45c501e439cb865d0bff96315d86443b78/appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c", size = 4321, upload-time = "2024-02-06T09:43:09.663Z" },
37
+ ]
38
+
39
+ [[package]]
40
+ name = "asttokens"
41
+ version = "3.0.0"
42
+ source = { registry = "https://pypi.org/simple" }
43
+ sdist = { url = "https://files.pythonhosted.org/packages/4a/e7/82da0a03e7ba5141f05cce0d302e6eed121ae055e0456ca228bf693984bc/asttokens-3.0.0.tar.gz", hash = "sha256:0dcd8baa8d62b0c1d118b399b2ddba3c4aff271d0d7a9e0d4c1681c79035bbc7", size = 61978, upload-time = "2024-11-30T04:30:14.439Z" }
44
+ wheels = [
45
+ { url = "https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2", size = 26918, upload-time = "2024-11-30T04:30:10.946Z" },
46
+ ]
47
+
48
  [[package]]
49
  name = "attrs"
50
  version = "25.3.0"
 
106
  { url = "https://files.pythonhosted.org/packages/4f/52/34c6cf5bb9285074dc3531c437b3919e825d976fde097a7a73f79e726d03/certifi-2025.7.14-py3-none-any.whl", hash = "sha256:6b31f564a415d79ee77df69d757bb49a5bb53bd9f756cbbe24394ffd6fc1f4b2", size = 162722, upload-time = "2025-07-14T03:29:26.863Z" },
107
  ]
108
 
109
+ [[package]]
110
+ name = "cffi"
111
+ version = "1.17.1"
112
+ source = { registry = "https://pypi.org/simple" }
113
+ dependencies = [
114
+ { name = "pycparser" },
115
+ ]
116
+ sdist = { url = "https://files.pythonhosted.org/packages/fc/97/c783634659c2920c3fc70419e3af40972dbaf758daa229a7d6ea6135c90d/cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824", size = 516621, upload-time = "2024-09-04T20:45:21.852Z" }
117
+ wheels = [
118
+ { url = "https://files.pythonhosted.org/packages/5a/84/e94227139ee5fb4d600a7a4927f322e1d4aea6fdc50bd3fca8493caba23f/cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4", size = 183178, upload-time = "2024-09-04T20:44:12.232Z" },
119
+ { url = "https://files.pythonhosted.org/packages/da/ee/fb72c2b48656111c4ef27f0f91da355e130a923473bf5ee75c5643d00cca/cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c", size = 178840, upload-time = "2024-09-04T20:44:13.739Z" },
120
+ { url = "https://files.pythonhosted.org/packages/cc/b6/db007700f67d151abadf508cbfd6a1884f57eab90b1bb985c4c8c02b0f28/cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36", size = 454803, upload-time = "2024-09-04T20:44:15.231Z" },
121
+ { url = "https://files.pythonhosted.org/packages/1a/df/f8d151540d8c200eb1c6fba8cd0dfd40904f1b0682ea705c36e6c2e97ab3/cffi-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5", size = 478850, upload-time = "2024-09-04T20:44:17.188Z" },
122
+ { url = "https://files.pythonhosted.org/packages/28/c0/b31116332a547fd2677ae5b78a2ef662dfc8023d67f41b2a83f7c2aa78b1/cffi-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff", size = 485729, upload-time = "2024-09-04T20:44:18.688Z" },
123
+ { url = "https://files.pythonhosted.org/packages/91/2b/9a1ddfa5c7f13cab007a2c9cc295b70fbbda7cb10a286aa6810338e60ea1/cffi-1.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99", size = 471256, upload-time = "2024-09-04T20:44:20.248Z" },
124
+ { url = "https://files.pythonhosted.org/packages/b2/d5/da47df7004cb17e4955df6a43d14b3b4ae77737dff8bf7f8f333196717bf/cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93", size = 479424, upload-time = "2024-09-04T20:44:21.673Z" },
125
+ { url = "https://files.pythonhosted.org/packages/0b/ac/2a28bcf513e93a219c8a4e8e125534f4f6db03e3179ba1c45e949b76212c/cffi-1.17.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3", size = 484568, upload-time = "2024-09-04T20:44:23.245Z" },
126
+ { url = "https://files.pythonhosted.org/packages/d4/38/ca8a4f639065f14ae0f1d9751e70447a261f1a30fa7547a828ae08142465/cffi-1.17.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8", size = 488736, upload-time = "2024-09-04T20:44:24.757Z" },
127
+ { url = "https://files.pythonhosted.org/packages/86/c5/28b2d6f799ec0bdecf44dced2ec5ed43e0eb63097b0f58c293583b406582/cffi-1.17.1-cp312-cp312-win32.whl", hash = "sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65", size = 172448, upload-time = "2024-09-04T20:44:26.208Z" },
128
+ { url = "https://files.pythonhosted.org/packages/50/b9/db34c4755a7bd1cb2d1603ac3863f22bcecbd1ba29e5ee841a4bc510b294/cffi-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903", size = 181976, upload-time = "2024-09-04T20:44:27.578Z" },
129
+ { url = "https://files.pythonhosted.org/packages/8d/f8/dd6c246b148639254dad4d6803eb6a54e8c85c6e11ec9df2cffa87571dbe/cffi-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e", size = 182989, upload-time = "2024-09-04T20:44:28.956Z" },
130
+ { url = "https://files.pythonhosted.org/packages/8b/f1/672d303ddf17c24fc83afd712316fda78dc6fce1cd53011b839483e1ecc8/cffi-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2", size = 178802, upload-time = "2024-09-04T20:44:30.289Z" },
131
+ { url = "https://files.pythonhosted.org/packages/0e/2d/eab2e858a91fdff70533cab61dcff4a1f55ec60425832ddfdc9cd36bc8af/cffi-1.17.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3", size = 454792, upload-time = "2024-09-04T20:44:32.01Z" },
132
+ { url = "https://files.pythonhosted.org/packages/75/b2/fbaec7c4455c604e29388d55599b99ebcc250a60050610fadde58932b7ee/cffi-1.17.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683", size = 478893, upload-time = "2024-09-04T20:44:33.606Z" },
133
+ { url = "https://files.pythonhosted.org/packages/4f/b7/6e4a2162178bf1935c336d4da8a9352cccab4d3a5d7914065490f08c0690/cffi-1.17.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5", size = 485810, upload-time = "2024-09-04T20:44:35.191Z" },
134
+ { url = "https://files.pythonhosted.org/packages/c7/8a/1d0e4a9c26e54746dc08c2c6c037889124d4f59dffd853a659fa545f1b40/cffi-1.17.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4", size = 471200, upload-time = "2024-09-04T20:44:36.743Z" },
135
+ { url = "https://files.pythonhosted.org/packages/26/9f/1aab65a6c0db35f43c4d1b4f580e8df53914310afc10ae0397d29d697af4/cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd", size = 479447, upload-time = "2024-09-04T20:44:38.492Z" },
136
+ { url = "https://files.pythonhosted.org/packages/5f/e4/fb8b3dd8dc0e98edf1135ff067ae070bb32ef9d509d6cb0f538cd6f7483f/cffi-1.17.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed", size = 484358, upload-time = "2024-09-04T20:44:40.046Z" },
137
+ { url = "https://files.pythonhosted.org/packages/f1/47/d7145bf2dc04684935d57d67dff9d6d795b2ba2796806bb109864be3a151/cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9", size = 488469, upload-time = "2024-09-04T20:44:41.616Z" },
138
+ { url = "https://files.pythonhosted.org/packages/bf/ee/f94057fa6426481d663b88637a9a10e859e492c73d0384514a17d78ee205/cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d", size = 172475, upload-time = "2024-09-04T20:44:43.733Z" },
139
+ { url = "https://files.pythonhosted.org/packages/7c/fc/6a8cb64e5f0324877d503c854da15d76c1e50eb722e320b15345c4d0c6de/cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a", size = 182009, upload-time = "2024-09-04T20:44:45.309Z" },
140
+ ]
141
+
142
+ [[package]]
143
+ name = "chardet"
144
+ version = "5.2.0"
145
+ source = { registry = "https://pypi.org/simple" }
146
+ sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/f7b6ab21ec75897ed80c17d79b15951a719226b9fababf1e40ea74d69079/chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7", size = 2069618, upload-time = "2023-08-01T19:23:02.662Z" }
147
+ wheels = [
148
+ { url = "https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970", size = 199385, upload-time = "2023-08-01T19:23:00.661Z" },
149
+ ]
150
+
151
  [[package]]
152
  name = "charset-normalizer"
153
  version = "3.4.2"
 
213
  { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
214
  ]
215
 
216
+ [[package]]
217
+ name = "comm"
218
+ version = "0.2.2"
219
+ source = { registry = "https://pypi.org/simple" }
220
+ dependencies = [
221
+ { name = "traitlets" },
222
+ ]
223
+ sdist = { url = "https://files.pythonhosted.org/packages/e9/a8/fb783cb0abe2b5fded9f55e5703015cdf1c9c85b3669087c538dd15a6a86/comm-0.2.2.tar.gz", hash = "sha256:3fd7a84065306e07bea1773df6eb8282de51ba82f77c72f9c85716ab11fe980e", size = 6210, upload-time = "2024-03-12T16:53:41.133Z" }
224
+ wheels = [
225
+ { url = "https://files.pythonhosted.org/packages/e6/75/49e5bfe642f71f272236b5b2d2691cf915a7283cc0ceda56357b61daa538/comm-0.2.2-py3-none-any.whl", hash = "sha256:e6fb86cb70ff661ee8c9c14e7d36d6de3b4066f1441be4063df9c5009f0a64d3", size = 7180, upload-time = "2024-03-12T16:53:39.226Z" },
226
+ ]
227
+
228
  [[package]]
229
  name = "confection"
230
  version = "0.1.5"
 
288
  { url = "https://files.pythonhosted.org/packages/74/65/c162fbac63e867a055240b6600b92ef96c0eb7a1895312ac53c4be93d056/cymem-2.0.11-cp313-cp313-win_amd64.whl", hash = "sha256:25da111adf425c29af0cfd9fecfec1c71c8d82e2244a85166830a0817a66ada7", size = 39090, upload-time = "2025-01-16T21:50:24.239Z" },
289
  ]
290
 
291
+ [[package]]
292
+ name = "debugpy"
293
+ version = "1.8.15"
294
+ source = { registry = "https://pypi.org/simple" }
295
+ sdist = { url = "https://files.pythonhosted.org/packages/8c/8b/3a9a28ddb750a76eaec445c7f4d3147ea2c579a97dbd9e25d39001b92b21/debugpy-1.8.15.tar.gz", hash = "sha256:58d7a20b7773ab5ee6bdfb2e6cf622fdf1e40c9d5aef2857d85391526719ac00", size = 1643279, upload-time = "2025-07-15T16:43:29.135Z" }
296
+ wheels = [
297
+ { url = "https://files.pythonhosted.org/packages/ab/4a/4508d256e52897f5cdfee6a6d7580974811e911c6d01321df3264508a5ac/debugpy-1.8.15-cp312-cp312-macosx_14_0_universal2.whl", hash = "sha256:3dcc7225cb317469721ab5136cda9ff9c8b6e6fb43e87c9e15d5b108b99d01ba", size = 2511197, upload-time = "2025-07-15T16:43:42.343Z" },
298
+ { url = "https://files.pythonhosted.org/packages/99/8d/7f6ef1097e7fecf26b4ef72338d08e41644a41b7ee958a19f494ffcffc29/debugpy-1.8.15-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:047a493ca93c85ccede1dbbaf4e66816794bdc214213dde41a9a61e42d27f8fc", size = 4229517, upload-time = "2025-07-15T16:43:44.14Z" },
299
+ { url = "https://files.pythonhosted.org/packages/3f/e8/e8c6a9aa33a9c9c6dacbf31747384f6ed2adde4de2e9693c766bdf323aa3/debugpy-1.8.15-cp312-cp312-win32.whl", hash = "sha256:b08e9b0bc260cf324c890626961dad4ffd973f7568fbf57feb3c3a65ab6b6327", size = 5276132, upload-time = "2025-07-15T16:43:45.529Z" },
300
+ { url = "https://files.pythonhosted.org/packages/e9/ad/231050c6177b3476b85fcea01e565dac83607b5233d003ff067e2ee44d8f/debugpy-1.8.15-cp312-cp312-win_amd64.whl", hash = "sha256:e2a4fe357c92334272eb2845fcfcdbec3ef9f22c16cf613c388ac0887aed15fa", size = 5317645, upload-time = "2025-07-15T16:43:46.968Z" },
301
+ { url = "https://files.pythonhosted.org/packages/28/70/2928aad2310726d5920b18ed9f54b9f06df5aa4c10cf9b45fa18ff0ab7e8/debugpy-1.8.15-cp313-cp313-macosx_14_0_universal2.whl", hash = "sha256:f5e01291ad7d6649aed5773256c5bba7a1a556196300232de1474c3c372592bf", size = 2495538, upload-time = "2025-07-15T16:43:48.927Z" },
302
+ { url = "https://files.pythonhosted.org/packages/9e/c6/9b8ffb4ca91fac8b2877eef63c9cc0e87dd2570b1120054c272815ec4cd0/debugpy-1.8.15-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94dc0f0d00e528d915e0ce1c78e771475b2335b376c49afcc7382ee0b146bab6", size = 4221874, upload-time = "2025-07-15T16:43:50.282Z" },
303
+ { url = "https://files.pythonhosted.org/packages/55/8a/9b8d59674b4bf489318c7c46a1aab58e606e583651438084b7e029bf3c43/debugpy-1.8.15-cp313-cp313-win32.whl", hash = "sha256:fcf0748d4f6e25f89dc5e013d1129ca6f26ad4da405e0723a4f704583896a709", size = 5275949, upload-time = "2025-07-15T16:43:52.079Z" },
304
+ { url = "https://files.pythonhosted.org/packages/72/83/9e58e6fdfa8710a5e6ec06c2401241b9ad48b71c0a7eb99570a1f1edb1d3/debugpy-1.8.15-cp313-cp313-win_amd64.whl", hash = "sha256:73c943776cb83e36baf95e8f7f8da765896fd94b05991e7bc162456d25500683", size = 5317720, upload-time = "2025-07-15T16:43:53.703Z" },
305
+ { url = "https://files.pythonhosted.org/packages/07/d5/98748d9860e767a1248b5e31ffa7ce8cb7006e97bf8abbf3d891d0a8ba4e/debugpy-1.8.15-py2.py3-none-any.whl", hash = "sha256:bce2e6c5ff4f2e00b98d45e7e01a49c7b489ff6df5f12d881c67d2f1ac635f3d", size = 5282697, upload-time = "2025-07-15T16:44:07.996Z" },
306
+ ]
307
+
308
+ [[package]]
309
+ name = "decorator"
310
+ version = "5.2.1"
311
+ source = { registry = "https://pypi.org/simple" }
312
+ sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711, upload-time = "2025-02-24T04:41:34.073Z" }
313
+ wheels = [
314
+ { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190, upload-time = "2025-02-24T04:41:32.565Z" },
315
+ ]
316
+
317
  [[package]]
318
  name = "en-core-web-md"
319
  version = "3.7.0"
 
346
  { name = "spacy-curated-transformers", specifier = ">=0.2.0,<0.3.0" },
347
  ]
348
 
349
+ [[package]]
350
+ name = "executing"
351
+ version = "2.2.0"
352
+ source = { registry = "https://pypi.org/simple" }
353
+ sdist = { url = "https://files.pythonhosted.org/packages/91/50/a9d80c47ff289c611ff12e63f7c5d13942c65d68125160cefd768c73e6e4/executing-2.2.0.tar.gz", hash = "sha256:5d108c028108fe2551d1a7b2e8b713341e2cb4fc0aa7dcf966fa4327a5226755", size = 978693, upload-time = "2025-01-22T15:41:29.403Z" }
354
+ wheels = [
355
+ { url = "https://files.pythonhosted.org/packages/7b/8f/c4d9bafc34ad7ad5d8dc16dd1347ee0e507a52c3adb6bfa8887e1c6a26ba/executing-2.2.0-py2.py3-none-any.whl", hash = "sha256:11387150cad388d62750327a53d3339fad4888b39a6fe233c3afbb54ecffd3aa", size = 26702, upload-time = "2025-01-22T15:41:25.929Z" },
356
+ ]
357
+
358
  [[package]]
359
  name = "filelock"
360
  version = "3.18.0"
 
373
  { url = "https://files.pythonhosted.org/packages/2f/e0/014d5d9d7a4564cf1c40b5039bc882db69fd881111e03ab3657ac0b218e2/fsspec-2025.7.0-py3-none-any.whl", hash = "sha256:8b012e39f63c7d5f10474de957f3ab793b47b45ae7d39f2fb735f8bbe25c0e21", size = 199597, upload-time = "2025-07-15T16:05:19.529Z" },
374
  ]
375
 
376
+ [[package]]
377
+ name = "fugashi"
378
+ version = "1.5.1"
379
+ source = { registry = "https://pypi.org/simple" }
380
+ sdist = { url = "https://files.pythonhosted.org/packages/5e/09/e41bb13152e591f3dd5984be112a97927f6a1ae73ab0301f3cbd1c38db20/fugashi-1.5.1.tar.gz", hash = "sha256:3ff9b4d0e40e04d56d7ced906ae8fba6c6fa41aac46f5210de1b56d6626e7a1f", size = 339745, upload-time = "2025-06-05T10:29:49.158Z" }
381
+ wheels = [
382
+ { url = "https://files.pythonhosted.org/packages/0f/03/cb79fcc4ec503e39e4aec9878aa4ee2038f56794f418de7e5dccc127b6c3/fugashi-1.5.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9d6e6398a1dd8d704dbd26790195455166f6f93d0fdbebf5d1913a69d15adb22", size = 562515, upload-time = "2025-06-05T10:35:16.458Z" },
383
+ { url = "https://files.pythonhosted.org/packages/17/6d/cf637e80350e2127d682593ba51916c19dbea9eb7abc5f69b58c5cbbd0d6/fugashi-1.5.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a02a8e4ab7758c29d8b217c8d7b019079220846fdeb04b7e1ddd4dfdb2570b7e", size = 507454, upload-time = "2025-06-05T10:35:17.982Z" },
384
+ { url = "https://files.pythonhosted.org/packages/51/a1/41eeea4f5e71615b60f0ad39037dbbd787b9376e383219a2cc48e94b3733/fugashi-1.5.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a01c97af19a378545d7600bcb10552bebb4fe70b54a66032cc78cee1be328d66", size = 503416, upload-time = "2025-06-05T10:35:19.041Z" },
385
+ { url = "https://files.pythonhosted.org/packages/a6/c1/02fa1c2bcdbb661cc618d11ef23aef5ed243a8f2e680cbf7398ae913961e/fugashi-1.5.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:97906d1c7c56907b87c3fcf587a4990504784d7beecb67673c78c8dd608644c1", size = 675822, upload-time = "2025-06-05T10:54:33.357Z" },
386
+ { url = "https://files.pythonhosted.org/packages/ee/be/e5723a9c3a6866c14207e7dbb6d06bc49d55ea97e1784bf1096c86f0d954/fugashi-1.5.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:823e6db97d57079da4c3fcc26f04943b894974af5a22f4762e6f6ba2ed63f212", size = 697875, upload-time = "2025-06-05T10:30:50.634Z" },
387
+ { url = "https://files.pythonhosted.org/packages/f4/bc/a65acd05eca1e5583f34f215df866635a232e6345a80d965ed23d1af0718/fugashi-1.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:5afa5a2bf11d039a8e45eac0ba5c2bff54ed9ef9379cb9ad7f67c987a7f6dfc0", size = 513282, upload-time = "2025-06-05T10:29:38.667Z" },
388
+ { url = "https://files.pythonhosted.org/packages/0d/2c/684cd6bb8d0a988f1d4b7e41c8eebe0385417113b2a18006c3d032df7139/fugashi-1.5.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:e6f69766af17299635fa5c2ee9fe99476482003126ee1769f565a661ebd4cfb1", size = 560845, upload-time = "2025-06-05T10:35:20.042Z" },
389
+ { url = "https://files.pythonhosted.org/packages/96/c8/e8ce5efa5a7a80a5ad75770f1944c4b22694408e956b7d8a5780cda879dd/fugashi-1.5.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2055a0e0993261906f3010522ccc94b8bb9278b35a726ed58b314a5b539b9511", size = 506664, upload-time = "2025-06-05T10:35:21.015Z" },
390
+ { url = "https://files.pythonhosted.org/packages/1a/5d/46a06d2ed06cccf8a553ba0c6d723bb9863b0a02ba81463a425e30eab082/fugashi-1.5.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f0f3e269bfd9ba92c64086d9e6963a0bd81a3dffb9b6eeb981f33902738b7956", size = 502687, upload-time = "2025-06-05T10:35:22.298Z" },
391
+ { url = "https://files.pythonhosted.org/packages/14/89/7f90847fd65ea1ef50a070b0cb63a8fad12b18f54d95627cf4ac57af3a41/fugashi-1.5.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:53ff43069ed46bd0d5dec4140115f7883bf4a590d70f3c90a422c61260be342b", size = 672332, upload-time = "2025-06-05T10:54:34.757Z" },
392
+ { url = "https://files.pythonhosted.org/packages/72/6e/b92fec651f430e258c9fd0a82b924be2fcc23d0defd74e76ad6a5bbd97f6/fugashi-1.5.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:53ce31df44b4e95904793280eda0e9895646828859801d457314efc1d535cb4f", size = 693962, upload-time = "2025-06-05T10:30:52.246Z" },
393
+ { url = "https://files.pythonhosted.org/packages/84/a9/72a7c8261ddceb0fbaee8fe075d4acd9023504c8fa8cbea2cf6140892040/fugashi-1.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:adf1646103151af5c0b78f11fd01e145c506774609243935c0978606e4a96ad3", size = 513083, upload-time = "2025-06-05T10:29:30.189Z" },
394
+ ]
395
+
396
  [[package]]
397
  name = "gitdb"
398
  version = "4.0.12"
 
478
  { url = "https://files.pythonhosted.org/packages/ce/ff/3b59672c47c6284e8005b42e84ceba13864aa0f39f067c973d1af02f5d91/InquirerPy-0.3.4-py3-none-any.whl", hash = "sha256:c65fdfbac1fa00e3ee4fb10679f4d3ed7a012abf4833910e63c295827fe2a7d4", size = 67677, upload-time = "2022-06-27T23:11:17.723Z" },
479
  ]
480
 
481
+ [[package]]
482
+ name = "ipykernel"
483
+ version = "6.29.5"
484
+ source = { registry = "https://pypi.org/simple" }
485
+ dependencies = [
486
+ { name = "appnope", marker = "sys_platform == 'darwin'" },
487
+ { name = "comm" },
488
+ { name = "debugpy" },
489
+ { name = "ipython" },
490
+ { name = "jupyter-client" },
491
+ { name = "jupyter-core" },
492
+ { name = "matplotlib-inline" },
493
+ { name = "nest-asyncio" },
494
+ { name = "packaging" },
495
+ { name = "psutil" },
496
+ { name = "pyzmq" },
497
+ { name = "tornado" },
498
+ { name = "traitlets" },
499
+ ]
500
+ sdist = { url = "https://files.pythonhosted.org/packages/e9/5c/67594cb0c7055dc50814b21731c22a601101ea3b1b50a9a1b090e11f5d0f/ipykernel-6.29.5.tar.gz", hash = "sha256:f093a22c4a40f8828f8e330a9c297cb93dcab13bd9678ded6de8e5cf81c56215", size = 163367, upload-time = "2024-07-01T14:07:22.543Z" }
501
+ wheels = [
502
+ { url = "https://files.pythonhosted.org/packages/94/5c/368ae6c01c7628438358e6d337c19b05425727fbb221d2a3c4303c372f42/ipykernel-6.29.5-py3-none-any.whl", hash = "sha256:afdb66ba5aa354b09b91379bac28ae4afebbb30e8b39510c9690afb7a10421b5", size = 117173, upload-time = "2024-07-01T14:07:19.603Z" },
503
+ ]
504
+
505
+ [[package]]
506
+ name = "ipython"
507
+ version = "9.4.0"
508
+ source = { registry = "https://pypi.org/simple" }
509
+ dependencies = [
510
+ { name = "colorama", marker = "sys_platform == 'win32'" },
511
+ { name = "decorator" },
512
+ { name = "ipython-pygments-lexers" },
513
+ { name = "jedi" },
514
+ { name = "matplotlib-inline" },
515
+ { name = "pexpect", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
516
+ { name = "prompt-toolkit" },
517
+ { name = "pygments" },
518
+ { name = "stack-data" },
519
+ { name = "traitlets" },
520
+ ]
521
+ sdist = { url = "https://files.pythonhosted.org/packages/54/80/406f9e3bde1c1fd9bf5a0be9d090f8ae623e401b7670d8f6fdf2ab679891/ipython-9.4.0.tar.gz", hash = "sha256:c033c6d4e7914c3d9768aabe76bbe87ba1dc66a92a05db6bfa1125d81f2ee270", size = 4385338, upload-time = "2025-07-01T11:11:30.606Z" }
522
+ wheels = [
523
+ { url = "https://files.pythonhosted.org/packages/63/f8/0031ee2b906a15a33d6bfc12dd09c3dfa966b3cb5b284ecfb7549e6ac3c4/ipython-9.4.0-py3-none-any.whl", hash = "sha256:25850f025a446d9b359e8d296ba175a36aedd32e83ca9b5060430fe16801f066", size = 611021, upload-time = "2025-07-01T11:11:27.85Z" },
524
+ ]
525
+
526
+ [[package]]
527
+ name = "ipython-pygments-lexers"
528
+ version = "1.1.1"
529
+ source = { registry = "https://pypi.org/simple" }
530
+ dependencies = [
531
+ { name = "pygments" },
532
+ ]
533
+ sdist = { url = "https://files.pythonhosted.org/packages/ef/4c/5dd1d8af08107f88c7f741ead7a40854b8ac24ddf9ae850afbcf698aa552/ipython_pygments_lexers-1.1.1.tar.gz", hash = "sha256:09c0138009e56b6854f9535736f4171d855c8c08a563a0dcd8022f78355c7e81", size = 8393, upload-time = "2025-01-17T11:24:34.505Z" }
534
+ wheels = [
535
+ { url = "https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c", size = 8074, upload-time = "2025-01-17T11:24:33.271Z" },
536
+ ]
537
+
538
  [[package]]
539
  name = "ja-core-news-md"
540
  version = "3.7.0"
 
577
  { name = "sudachipy", specifier = ">=0.5.2,!=0.6.1" },
578
  ]
579
 
580
+ [[package]]
581
+ name = "jedi"
582
+ version = "0.19.2"
583
+ source = { registry = "https://pypi.org/simple" }
584
+ dependencies = [
585
+ { name = "parso" },
586
+ ]
587
+ sdist = { url = "https://files.pythonhosted.org/packages/72/3a/79a912fbd4d8dd6fbb02bf69afd3bb72cf0c729bb3063c6f4498603db17a/jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0", size = 1231287, upload-time = "2024-11-11T01:41:42.873Z" }
588
+ wheels = [
589
+ { url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278, upload-time = "2024-11-11T01:41:40.175Z" },
590
+ ]
591
+
592
  [[package]]
593
  name = "jinja2"
594
  version = "3.1.6"
 
628
  { url = "https://files.pythonhosted.org/packages/01/0e/b27cdbaccf30b890c40ed1da9fd4a3593a5cf94dae54fb34f8a4b74fcd3f/jsonschema_specifications-2025.4.1-py3-none-any.whl", hash = "sha256:4653bffbd6584f7de83a67e0d620ef16900b390ddc7939d56684d6c81e33f1af", size = 18437, upload-time = "2025-04-23T12:34:05.422Z" },
629
  ]
630
 
631
+ [[package]]
632
+ name = "jupyter-client"
633
+ version = "8.6.3"
634
+ source = { registry = "https://pypi.org/simple" }
635
+ dependencies = [
636
+ { name = "jupyter-core" },
637
+ { name = "python-dateutil" },
638
+ { name = "pyzmq" },
639
+ { name = "tornado" },
640
+ { name = "traitlets" },
641
+ ]
642
+ sdist = { url = "https://files.pythonhosted.org/packages/71/22/bf9f12fdaeae18019a468b68952a60fe6dbab5d67cd2a103cac7659b41ca/jupyter_client-8.6.3.tar.gz", hash = "sha256:35b3a0947c4a6e9d589eb97d7d4cd5e90f910ee73101611f01283732bd6d9419", size = 342019, upload-time = "2024-09-17T10:44:17.613Z" }
643
+ wheels = [
644
+ { url = "https://files.pythonhosted.org/packages/11/85/b0394e0b6fcccd2c1eeefc230978a6f8cb0c5df1e4cd3e7625735a0d7d1e/jupyter_client-8.6.3-py3-none-any.whl", hash = "sha256:e8a19cc986cc45905ac3362915f410f3af85424b4c0905e94fa5f2cb08e8f23f", size = 106105, upload-time = "2024-09-17T10:44:15.218Z" },
645
+ ]
646
+
647
+ [[package]]
648
+ name = "jupyter-core"
649
+ version = "5.8.1"
650
+ source = { registry = "https://pypi.org/simple" }
651
+ dependencies = [
652
+ { name = "platformdirs" },
653
+ { name = "pywin32", marker = "platform_python_implementation != 'PyPy' and sys_platform == 'win32'" },
654
+ { name = "traitlets" },
655
+ ]
656
+ sdist = { url = "https://files.pythonhosted.org/packages/99/1b/72906d554acfeb588332eaaa6f61577705e9ec752ddb486f302dafa292d9/jupyter_core-5.8.1.tar.gz", hash = "sha256:0a5f9706f70e64786b75acba995988915ebd4601c8a52e534a40b51c95f59941", size = 88923, upload-time = "2025-05-27T07:38:16.655Z" }
657
+ wheels = [
658
+ { url = "https://files.pythonhosted.org/packages/2f/57/6bffd4b20b88da3800c5d691e0337761576ee688eb01299eae865689d2df/jupyter_core-5.8.1-py3-none-any.whl", hash = "sha256:c28d268fc90fb53f1338ded2eb410704c5449a358406e8a948b75706e24863d0", size = 28880, upload-time = "2025-05-27T07:38:15.137Z" },
659
+ ]
660
+
661
  [[package]]
662
  name = "langcodes"
663
  version = "3.5.0"
 
765
  { url = "https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739, upload-time = "2024-10-18T15:21:42.784Z" },
766
  ]
767
 
768
+ [[package]]
769
+ name = "matplotlib-inline"
770
+ version = "0.1.7"
771
+ source = { registry = "https://pypi.org/simple" }
772
+ dependencies = [
773
+ { name = "traitlets" },
774
+ ]
775
+ sdist = { url = "https://files.pythonhosted.org/packages/99/5b/a36a337438a14116b16480db471ad061c36c3694df7c2084a0da7ba538b7/matplotlib_inline-0.1.7.tar.gz", hash = "sha256:8423b23ec666be3d16e16b60bdd8ac4e86e840ebd1dd11a30b9f117f2fa0ab90", size = 8159, upload-time = "2024-04-15T13:44:44.803Z" }
776
+ wheels = [
777
+ { url = "https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca", size = 9899, upload-time = "2024-04-15T13:44:43.265Z" },
778
+ ]
779
+
780
  [[package]]
781
  name = "mdurl"
782
  version = "0.1.2"
 
826
  { url = "https://files.pythonhosted.org/packages/c0/15/278693412221859a0159719878e51a79812a189edceef2fe325160a8e661/narwhals-1.47.1-py3-none-any.whl", hash = "sha256:b9f2b2557aba054231361a00f6fcabc5017e338575e810e82155eb34e38ace93", size = 375506, upload-time = "2025-07-17T18:23:02.492Z" },
827
  ]
828
 
829
+ [[package]]
830
+ name = "nest-asyncio"
831
+ version = "1.6.0"
832
+ source = { registry = "https://pypi.org/simple" }
833
+ sdist = { url = "https://files.pythonhosted.org/packages/83/f8/51569ac65d696c8ecbee95938f89d4abf00f47d58d48f6fbabfe8f0baefe/nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe", size = 7418, upload-time = "2024-01-21T14:25:19.227Z" }
834
+ wheels = [
835
+ { url = "https://files.pythonhosted.org/packages/a0/c4/c2971a3ba4c6103a3d10c4b0f24f461ddc027f0f09763220cf35ca1401b3/nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c", size = 5195, upload-time = "2024-01-21T14:25:17.223Z" },
836
+ ]
837
+
838
  [[package]]
839
  name = "networkx"
840
  version = "3.5"
 
1036
  { url = "https://files.pythonhosted.org/packages/d5/f9/07086f5b0f2a19872554abeea7658200824f5835c58a106fa8f2ae96a46c/pandas-2.3.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5db9637dbc24b631ff3707269ae4559bce4b7fd75c1c4d7e13f40edc42df4444", size = 13189044, upload-time = "2025-07-07T19:19:39.999Z" },
1037
  ]
1038
 
1039
+ [[package]]
1040
+ name = "parso"
1041
+ version = "0.8.4"
1042
+ source = { registry = "https://pypi.org/simple" }
1043
+ sdist = { url = "https://files.pythonhosted.org/packages/66/94/68e2e17afaa9169cf6412ab0f28623903be73d1b32e208d9e8e541bb086d/parso-0.8.4.tar.gz", hash = "sha256:eb3a7b58240fb99099a345571deecc0f9540ea5f4dd2fe14c2a99d6b281ab92d", size = 400609, upload-time = "2024-04-05T09:43:55.897Z" }
1044
+ wheels = [
1045
+ { url = "https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18", size = 103650, upload-time = "2024-04-05T09:43:53.299Z" },
1046
+ ]
1047
+
1048
+ [[package]]
1049
+ name = "pexpect"
1050
+ version = "4.9.0"
1051
+ source = { registry = "https://pypi.org/simple" }
1052
+ dependencies = [
1053
+ { name = "ptyprocess" },
1054
+ ]
1055
+ sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450, upload-time = "2023-11-25T09:07:26.339Z" }
1056
+ wheels = [
1057
+ { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772, upload-time = "2023-11-25T06:56:14.81Z" },
1058
+ ]
1059
+
1060
  [[package]]
1061
  name = "pfzy"
1062
  version = "0.3.4"
 
1132
  { url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835, upload-time = "2025-07-01T09:15:50.399Z" },
1133
  ]
1134
 
1135
+ [[package]]
1136
+ name = "plac"
1137
+ version = "1.4.5"
1138
+ source = { registry = "https://pypi.org/simple" }
1139
+ sdist = { url = "https://files.pythonhosted.org/packages/23/09/26ef2d614cabdcc52a7f383d0dc7967bf46be3c9700898c594e37b710c3d/plac-1.4.5.tar.gz", hash = "sha256:5f05bf85235c017fcd76c73c8101d4ff8e96beb3dc58b9a37de49cac7de82d14", size = 38988, upload-time = "2025-04-04T14:03:25.651Z" }
1140
+ wheels = [
1141
+ { url = "https://files.pythonhosted.org/packages/15/36/38676114a0dbee137ec366daa86603d667a07e9a52667d5ebf5c580100ba/plac-1.4.5-py2.py3-none-any.whl", hash = "sha256:87187786b4e446688b1cf5112e18fed8a23ab3b316c25fe91266a10bd1736b16", size = 22468, upload-time = "2025-04-04T14:03:24.761Z" },
1142
+ ]
1143
+
1144
+ [[package]]
1145
+ name = "platformdirs"
1146
+ version = "4.3.8"
1147
+ source = { registry = "https://pypi.org/simple" }
1148
+ sdist = { url = "https://files.pythonhosted.org/packages/fe/8b/3c73abc9c759ecd3f1f7ceff6685840859e8070c4d947c93fae71f6a0bf2/platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc", size = 21362, upload-time = "2025-05-07T22:47:42.121Z" }
1149
+ wheels = [
1150
+ { url = "https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4", size = 18567, upload-time = "2025-05-07T22:47:40.376Z" },
1151
+ ]
1152
+
1153
  [[package]]
1154
  name = "plotly"
1155
  version = "6.2.0"
 
1215
  { url = "https://files.pythonhosted.org/packages/f7/af/ab3c51ab7507a7325e98ffe691d9495ee3d3aa5f589afad65ec920d39821/protobuf-6.31.1-py3-none-any.whl", hash = "sha256:720a6c7e6b77288b85063569baae8536671b39f15cc22037ec7045658d80489e", size = 168724, upload-time = "2025-05-28T19:25:53.926Z" },
1216
  ]
1217
 
1218
+ [[package]]
1219
+ name = "psutil"
1220
+ version = "7.0.0"
1221
+ source = { registry = "https://pypi.org/simple" }
1222
+ sdist = { url = "https://files.pythonhosted.org/packages/2a/80/336820c1ad9286a4ded7e845b2eccfcb27851ab8ac6abece774a6ff4d3de/psutil-7.0.0.tar.gz", hash = "sha256:7be9c3eba38beccb6495ea33afd982a44074b78f28c434a1f51cc07fd315c456", size = 497003, upload-time = "2025-02-13T21:54:07.946Z" }
1223
+ wheels = [
1224
+ { url = "https://files.pythonhosted.org/packages/ed/e6/2d26234410f8b8abdbf891c9da62bee396583f713fb9f3325a4760875d22/psutil-7.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:101d71dc322e3cffd7cea0650b09b3d08b8e7c4109dd6809fe452dfd00e58b25", size = 238051, upload-time = "2025-02-13T21:54:12.36Z" },
1225
+ { url = "https://files.pythonhosted.org/packages/04/8b/30f930733afe425e3cbfc0e1468a30a18942350c1a8816acfade80c005c4/psutil-7.0.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:39db632f6bb862eeccf56660871433e111b6ea58f2caea825571951d4b6aa3da", size = 239535, upload-time = "2025-02-13T21:54:16.07Z" },
1226
+ { url = "https://files.pythonhosted.org/packages/2a/ed/d362e84620dd22876b55389248e522338ed1bf134a5edd3b8231d7207f6d/psutil-7.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fcee592b4c6f146991ca55919ea3d1f8926497a713ed7faaf8225e174581e91", size = 275004, upload-time = "2025-02-13T21:54:18.662Z" },
1227
+ { url = "https://files.pythonhosted.org/packages/bf/b9/b0eb3f3cbcb734d930fdf839431606844a825b23eaf9a6ab371edac8162c/psutil-7.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b1388a4f6875d7e2aff5c4ca1cc16c545ed41dd8bb596cefea80111db353a34", size = 277986, upload-time = "2025-02-13T21:54:21.811Z" },
1228
+ { url = "https://files.pythonhosted.org/packages/eb/a2/709e0fe2f093556c17fbafda93ac032257242cabcc7ff3369e2cb76a97aa/psutil-7.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5f098451abc2828f7dc6b58d44b532b22f2088f4999a937557b603ce72b1993", size = 279544, upload-time = "2025-02-13T21:54:24.68Z" },
1229
+ { url = "https://files.pythonhosted.org/packages/50/e6/eecf58810b9d12e6427369784efe814a1eec0f492084ce8eb8f4d89d6d61/psutil-7.0.0-cp37-abi3-win32.whl", hash = "sha256:ba3fcef7523064a6c9da440fc4d6bd07da93ac726b5733c29027d7dc95b39d99", size = 241053, upload-time = "2025-02-13T21:54:34.31Z" },
1230
+ { url = "https://files.pythonhosted.org/packages/50/1b/6921afe68c74868b4c9fa424dad3be35b095e16687989ebbb50ce4fceb7c/psutil-7.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:4cf3d4eb1aa9b348dec30105c55cd9b7d4629285735a102beb4441e38db90553", size = 244885, upload-time = "2025-02-13T21:54:37.486Z" },
1231
+ ]
1232
+
1233
+ [[package]]
1234
+ name = "ptyprocess"
1235
+ version = "0.7.0"
1236
+ source = { registry = "https://pypi.org/simple" }
1237
+ sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762, upload-time = "2020-12-28T15:15:30.155Z" }
1238
+ wheels = [
1239
+ { url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993, upload-time = "2020-12-28T15:15:28.35Z" },
1240
+ ]
1241
+
1242
+ [[package]]
1243
+ name = "pure-eval"
1244
+ version = "0.2.3"
1245
+ source = { registry = "https://pypi.org/simple" }
1246
+ sdist = { url = "https://files.pythonhosted.org/packages/cd/05/0a34433a064256a578f1783a10da6df098ceaa4a57bbeaa96a6c0352786b/pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42", size = 19752, upload-time = "2024-07-21T12:58:21.801Z" }
1247
+ wheels = [
1248
+ { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" },
1249
+ ]
1250
+
1251
  [[package]]
1252
  name = "pyarrow"
1253
  version = "21.0.0"
 
1277
  { url = "https://files.pythonhosted.org/packages/e5/4e/519c1bc1876625fe6b71e9a28287c43ec2f20f73c658b9ae1d485c0c206e/pyarrow-21.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:222c39e2c70113543982c6b34f3077962b44fca38c0bd9e68bb6781534425c10", size = 26371006, upload-time = "2025-07-18T00:56:56.379Z" },
1278
  ]
1279
 
1280
+ [[package]]
1281
+ name = "pycparser"
1282
+ version = "2.22"
1283
+ source = { registry = "https://pypi.org/simple" }
1284
+ sdist = { url = "https://files.pythonhosted.org/packages/1d/b2/31537cf4b1ca988837256c910a668b553fceb8f069bedc4b1c826024b52c/pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6", size = 172736, upload-time = "2024-03-30T13:22:22.564Z" }
1285
+ wheels = [
1286
+ { url = "https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc", size = 117552, upload-time = "2024-03-30T13:22:20.476Z" },
1287
+ ]
1288
+
1289
  [[package]]
1290
  name = "pydantic"
1291
  version = "2.11.7"
 
1386
  { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" },
1387
  ]
1388
 
1389
+ [[package]]
1390
+ name = "pywin32"
1391
+ version = "311"
1392
+ source = { registry = "https://pypi.org/simple" }
1393
+ wheels = [
1394
+ { url = "https://files.pythonhosted.org/packages/e7/ab/01ea1943d4eba0f850c3c61e78e8dd59757ff815ff3ccd0a84de5f541f42/pywin32-311-cp312-cp312-win32.whl", hash = "sha256:750ec6e621af2b948540032557b10a2d43b0cee2ae9758c54154d711cc852d31", size = 8706543, upload-time = "2025-07-14T20:13:20.765Z" },
1395
+ { url = "https://files.pythonhosted.org/packages/d1/a8/a0e8d07d4d051ec7502cd58b291ec98dcc0c3fff027caad0470b72cfcc2f/pywin32-311-cp312-cp312-win_amd64.whl", hash = "sha256:b8c095edad5c211ff31c05223658e71bf7116daa0ecf3ad85f3201ea3190d067", size = 9495040, upload-time = "2025-07-14T20:13:22.543Z" },
1396
+ { url = "https://files.pythonhosted.org/packages/ba/3a/2ae996277b4b50f17d61f0603efd8253cb2d79cc7ae159468007b586396d/pywin32-311-cp312-cp312-win_arm64.whl", hash = "sha256:e286f46a9a39c4a18b319c28f59b61de793654af2f395c102b4f819e584b5852", size = 8710102, upload-time = "2025-07-14T20:13:24.682Z" },
1397
+ { url = "https://files.pythonhosted.org/packages/a5/be/3fd5de0979fcb3994bfee0d65ed8ca9506a8a1260651b86174f6a86f52b3/pywin32-311-cp313-cp313-win32.whl", hash = "sha256:f95ba5a847cba10dd8c4d8fefa9f2a6cf283b8b88ed6178fa8a6c1ab16054d0d", size = 8705700, upload-time = "2025-07-14T20:13:26.471Z" },
1398
+ { url = "https://files.pythonhosted.org/packages/e3/28/e0a1909523c6890208295a29e05c2adb2126364e289826c0a8bc7297bd5c/pywin32-311-cp313-cp313-win_amd64.whl", hash = "sha256:718a38f7e5b058e76aee1c56ddd06908116d35147e133427e59a3983f703a20d", size = 9494700, upload-time = "2025-07-14T20:13:28.243Z" },
1399
+ { url = "https://files.pythonhosted.org/packages/04/bf/90339ac0f55726dce7d794e6d79a18a91265bdf3aa70b6b9ca52f35e022a/pywin32-311-cp313-cp313-win_arm64.whl", hash = "sha256:7b4075d959648406202d92a2310cb990fea19b535c7f4a78d3f5e10b926eeb8a", size = 8709318, upload-time = "2025-07-14T20:13:30.348Z" },
1400
+ { url = "https://files.pythonhosted.org/packages/c9/31/097f2e132c4f16d99a22bfb777e0fd88bd8e1c634304e102f313af69ace5/pywin32-311-cp314-cp314-win32.whl", hash = "sha256:b7a2c10b93f8986666d0c803ee19b5990885872a7de910fc460f9b0c2fbf92ee", size = 8840714, upload-time = "2025-07-14T20:13:32.449Z" },
1401
+ { url = "https://files.pythonhosted.org/packages/90/4b/07c77d8ba0e01349358082713400435347df8426208171ce297da32c313d/pywin32-311-cp314-cp314-win_amd64.whl", hash = "sha256:3aca44c046bd2ed8c90de9cb8427f581c479e594e99b5c0bb19b29c10fd6cb87", size = 9656800, upload-time = "2025-07-14T20:13:34.312Z" },
1402
+ { url = "https://files.pythonhosted.org/packages/c0/d2/21af5c535501a7233e734b8af901574572da66fcc254cb35d0609c9080dd/pywin32-311-cp314-cp314-win_arm64.whl", hash = "sha256:a508e2d9025764a8270f93111a970e1d0fbfc33f4153b388bb649b7eec4f9b42", size = 8932540, upload-time = "2025-07-14T20:13:36.379Z" },
1403
+ ]
1404
+
1405
  [[package]]
1406
  name = "pyyaml"
1407
  version = "6.0.2"
 
1428
  { url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446, upload-time = "2024-08-06T20:33:04.33Z" },
1429
  ]
1430
 
1431
+ [[package]]
1432
+ name = "pyzmq"
1433
+ version = "27.0.0"
1434
+ source = { registry = "https://pypi.org/simple" }
1435
+ dependencies = [
1436
+ { name = "cffi", marker = "implementation_name == 'pypy'" },
1437
+ ]
1438
+ sdist = { url = "https://files.pythonhosted.org/packages/f1/06/50a4e9648b3e8b992bef8eb632e457307553a89d294103213cfd47b3da69/pyzmq-27.0.0.tar.gz", hash = "sha256:b1f08eeb9ce1510e6939b6e5dcd46a17765e2333daae78ecf4606808442e52cf", size = 280478, upload-time = "2025-06-13T14:09:07.087Z" }
1439
+ wheels = [
1440
+ { url = "https://files.pythonhosted.org/packages/93/a7/9ad68f55b8834ede477842214feba6a4c786d936c022a67625497aacf61d/pyzmq-27.0.0-cp312-abi3-macosx_10_15_universal2.whl", hash = "sha256:cbabc59dcfaac66655c040dfcb8118f133fb5dde185e5fc152628354c1598e52", size = 1305438, upload-time = "2025-06-13T14:07:31.676Z" },
1441
+ { url = "https://files.pythonhosted.org/packages/ba/ee/26aa0f98665a22bc90ebe12dced1de5f3eaca05363b717f6fb229b3421b3/pyzmq-27.0.0-cp312-abi3-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:cb0ac5179cba4b2f94f1aa208fbb77b62c4c9bf24dd446278b8b602cf85fcda3", size = 895095, upload-time = "2025-06-13T14:07:33.104Z" },
1442
+ { url = "https://files.pythonhosted.org/packages/cf/85/c57e7ab216ecd8aa4cc7e3b83b06cc4e9cf45c87b0afc095f10cd5ce87c1/pyzmq-27.0.0-cp312-abi3-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:53a48f0228eab6cbf69fde3aa3c03cbe04e50e623ef92ae395fce47ef8a76152", size = 651826, upload-time = "2025-06-13T14:07:34.831Z" },
1443
+ { url = "https://files.pythonhosted.org/packages/69/9a/9ea7e230feda9400fb0ae0d61d7d6ddda635e718d941c44eeab22a179d34/pyzmq-27.0.0-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:111db5f395e09f7e775f759d598f43cb815fc58e0147623c4816486e1a39dc22", size = 839750, upload-time = "2025-06-13T14:07:36.553Z" },
1444
+ { url = "https://files.pythonhosted.org/packages/08/66/4cebfbe71f3dfbd417011daca267539f62ed0fbc68105357b68bbb1a25b7/pyzmq-27.0.0-cp312-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:c8878011653dcdc27cc2c57e04ff96f0471e797f5c19ac3d7813a245bcb24371", size = 1641357, upload-time = "2025-06-13T14:07:38.21Z" },
1445
+ { url = "https://files.pythonhosted.org/packages/ac/f6/b0f62578c08d2471c791287149cb8c2aaea414ae98c6e995c7dbe008adfb/pyzmq-27.0.0-cp312-abi3-musllinux_1_2_i686.whl", hash = "sha256:c0ed2c1f335ba55b5fdc964622254917d6b782311c50e138863eda409fbb3b6d", size = 2020281, upload-time = "2025-06-13T14:07:39.599Z" },
1446
+ { url = "https://files.pythonhosted.org/packages/37/b9/4f670b15c7498495da9159edc374ec09c88a86d9cd5a47d892f69df23450/pyzmq-27.0.0-cp312-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:e918d70862d4cfd4b1c187310015646a14e1f5917922ab45b29f28f345eeb6be", size = 1877110, upload-time = "2025-06-13T14:07:41.027Z" },
1447
+ { url = "https://files.pythonhosted.org/packages/66/31/9dee25c226295b740609f0d46db2fe972b23b6f5cf786360980524a3ba92/pyzmq-27.0.0-cp312-abi3-win32.whl", hash = "sha256:88b4e43cab04c3c0f0d55df3b1eef62df2b629a1a369b5289a58f6fa8b07c4f4", size = 559297, upload-time = "2025-06-13T14:07:42.533Z" },
1448
+ { url = "https://files.pythonhosted.org/packages/9b/12/52da5509800f7ff2d287b2f2b4e636e7ea0f001181cba6964ff6c1537778/pyzmq-27.0.0-cp312-abi3-win_amd64.whl", hash = "sha256:dce4199bf5f648a902ce37e7b3afa286f305cd2ef7a8b6ec907470ccb6c8b371", size = 619203, upload-time = "2025-06-13T14:07:43.843Z" },
1449
+ { url = "https://files.pythonhosted.org/packages/93/6d/7f2e53b19d1edb1eb4f09ec7c3a1f945ca0aac272099eab757d15699202b/pyzmq-27.0.0-cp312-abi3-win_arm64.whl", hash = "sha256:56e46bbb85d52c1072b3f809cc1ce77251d560bc036d3a312b96db1afe76db2e", size = 551927, upload-time = "2025-06-13T14:07:45.51Z" },
1450
+ { url = "https://files.pythonhosted.org/packages/19/62/876b27c4ff777db4ceba1c69ea90d3c825bb4f8d5e7cd987ce5802e33c55/pyzmq-27.0.0-cp313-cp313t-macosx_10_15_universal2.whl", hash = "sha256:c36ad534c0c29b4afa088dc53543c525b23c0797e01b69fef59b1a9c0e38b688", size = 1340826, upload-time = "2025-06-13T14:07:46.881Z" },
1451
+ { url = "https://files.pythonhosted.org/packages/43/69/58ef8f4f59d3bcd505260c73bee87b008850f45edca40ddaba54273c35f4/pyzmq-27.0.0-cp313-cp313t-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:67855c14173aec36395d7777aaba3cc527b393821f30143fd20b98e1ff31fd38", size = 897283, upload-time = "2025-06-13T14:07:49.562Z" },
1452
+ { url = "https://files.pythonhosted.org/packages/43/15/93a0d0396700a60475ad3c5d42c5f1c308d3570bc94626b86c71ef9953e0/pyzmq-27.0.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8617c7d43cd8ccdb62aebe984bfed77ca8f036e6c3e46dd3dddda64b10f0ab7a", size = 660567, upload-time = "2025-06-13T14:07:51.364Z" },
1453
+ { url = "https://files.pythonhosted.org/packages/0e/b3/fe055513e498ca32f64509abae19b9c9eb4d7c829e02bd8997dd51b029eb/pyzmq-27.0.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:67bfbcbd0a04c575e8103a6061d03e393d9f80ffdb9beb3189261e9e9bc5d5e9", size = 847681, upload-time = "2025-06-13T14:07:52.77Z" },
1454
+ { url = "https://files.pythonhosted.org/packages/b6/4f/ff15300b00b5b602191f3df06bbc8dd4164e805fdd65bb77ffbb9c5facdc/pyzmq-27.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:5cd11d46d7b7e5958121b3eaf4cd8638eff3a720ec527692132f05a57f14341d", size = 1650148, upload-time = "2025-06-13T14:07:54.178Z" },
1455
+ { url = "https://files.pythonhosted.org/packages/c4/6f/84bdfff2a224a6f26a24249a342e5906993c50b0761e311e81b39aef52a7/pyzmq-27.0.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:b801c2e40c5aa6072c2f4876de8dccd100af6d9918d4d0d7aa54a1d982fd4f44", size = 2023768, upload-time = "2025-06-13T14:07:55.714Z" },
1456
+ { url = "https://files.pythonhosted.org/packages/64/39/dc2db178c26a42228c5ac94a9cc595030458aa64c8d796a7727947afbf55/pyzmq-27.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:20d5cb29e8c5f76a127c75b6e7a77e846bc4b655c373baa098c26a61b7ecd0ef", size = 1885199, upload-time = "2025-06-13T14:07:57.166Z" },
1457
+ { url = "https://files.pythonhosted.org/packages/c7/21/dae7b06a1f8cdee5d8e7a63d99c5d129c401acc40410bef2cbf42025e26f/pyzmq-27.0.0-cp313-cp313t-win32.whl", hash = "sha256:a20528da85c7ac7a19b7384e8c3f8fa707841fd85afc4ed56eda59d93e3d98ad", size = 575439, upload-time = "2025-06-13T14:07:58.959Z" },
1458
+ { url = "https://files.pythonhosted.org/packages/eb/bc/1709dc55f0970cf4cb8259e435e6773f9946f41a045c2cb90e870b7072da/pyzmq-27.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:d8229f2efece6a660ee211d74d91dbc2a76b95544d46c74c615e491900dc107f", size = 639933, upload-time = "2025-06-13T14:08:00.777Z" },
1459
+ ]
1460
+
1461
  [[package]]
1462
  name = "referencing"
1463
  version = "0.36.2"
 
1675
  version = "0.1.0"
1676
  source = { virtual = "." }
1677
  dependencies = [
1678
+ { name = "chardet" },
1679
  { name = "en-core-web-md" },
1680
  { name = "en-core-web-trf" },
1681
+ { name = "fugashi" },
1682
  { name = "huggingface-hub", extra = ["cli"] },
1683
+ { name = "ipykernel" },
1684
  { name = "ja-core-news-md" },
1685
  { name = "ja-core-news-trf" },
1686
  { name = "numpy" },
 
1691
  { name = "spacy" },
1692
  { name = "spacy-curated-transformers" },
1693
  { name = "streamlit" },
1694
+ { name = "unidic" },
1695
  ]
1696
 
1697
  [package.metadata]
1698
  requires-dist = [
1699
+ { name = "chardet", specifier = ">=5.2.0" },
1700
  { name = "en-core-web-md", url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.0/en_core_web_md-3.7.0-py3-none-any.whl" },
1701
  { name = "en-core-web-trf", url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.2/en_core_web_trf-3.7.2-py3-none-any.whl" },
1702
+ { name = "fugashi", specifier = ">=1.3.0" },
1703
  { name = "huggingface-hub", extras = ["cli"], specifier = ">=0.33.4" },
1704
+ { name = "ipykernel", specifier = ">=6.29.5" },
1705
  { name = "ja-core-news-md", url = "https://github.com/explosion/spacy-models/releases/download/ja_core_news_md-3.7.0/ja_core_news_md-3.7.0-py3-none-any.whl" },
1706
  { name = "ja-core-news-trf", url = "https://github.com/explosion/spacy-models/releases/download/ja_core_news_trf-3.7.2/ja_core_news_trf-3.7.2-py3-none-any.whl" },
1707
  { name = "numpy", specifier = ">=1.24.0,<2.0" },
 
1712
  { name = "spacy", specifier = ">=3.7.0" },
1713
  { name = "spacy-curated-transformers", specifier = ">=0.1.0,<0.3.0" },
1714
  { name = "streamlit", specifier = ">=1.28.0" },
1715
+ { name = "unidic", specifier = ">=1.1.0" },
1716
  ]
1717
 
1718
  [[package]]
 
1837
  { url = "https://files.pythonhosted.org/packages/3a/e2/745aeba88a8513017fbac2fd2f9f07b8a36065e51695f818541eb795ec0c/srsly-2.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:e73712be1634b5e1de6f81c273a7d47fe091ad3c79dc779c03d3416a5c117cee", size = 630634, upload-time = "2025-01-17T09:26:10.018Z" },
1838
  ]
1839
 
1840
+ [[package]]
1841
+ name = "stack-data"
1842
+ version = "0.6.3"
1843
+ source = { registry = "https://pypi.org/simple" }
1844
+ dependencies = [
1845
+ { name = "asttokens" },
1846
+ { name = "executing" },
1847
+ { name = "pure-eval" },
1848
+ ]
1849
+ sdist = { url = "https://files.pythonhosted.org/packages/28/e3/55dcc2cfbc3ca9c29519eb6884dd1415ecb53b0e934862d3559ddcb7e20b/stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", size = 44707, upload-time = "2023-09-30T13:58:05.479Z" }
1850
+ wheels = [
1851
+ { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521, upload-time = "2023-09-30T13:58:03.53Z" },
1852
+ ]
1853
+
1854
  [[package]]
1855
  name = "streamlit"
1856
  version = "1.47.0"
 
2039
  { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
2040
  ]
2041
 
2042
+ [[package]]
2043
+ name = "traitlets"
2044
+ version = "5.14.3"
2045
+ source = { registry = "https://pypi.org/simple" }
2046
+ sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", size = 161621, upload-time = "2024-04-19T11:11:49.746Z" }
2047
+ wheels = [
2048
+ { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" },
2049
+ ]
2050
+
2051
  [[package]]
2052
  name = "triton"
2053
  version = "3.3.1"
 
2106
  { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
2107
  ]
2108
 
2109
+ [[package]]
2110
+ name = "unidic"
2111
+ version = "1.1.0"
2112
+ source = { registry = "https://pypi.org/simple" }
2113
+ dependencies = [
2114
+ { name = "plac" },
2115
+ { name = "requests" },
2116
+ { name = "tqdm" },
2117
+ { name = "wasabi" },
2118
+ ]
2119
+ sdist = { url = "https://files.pythonhosted.org/packages/5a/09/271dfbf8d5b56adddc70e30fa94249f5d3ab35f615bf278d65258045564a/unidic-1.1.0.tar.gz", hash = "sha256:0ab91c05de342c84d2a6314901fd3afb9061ecd7534dd4a0431dccbb87d921b7", size = 7688, upload-time = "2021-10-10T08:56:44.301Z" }
2120
+
2121
  [[package]]
2122
  name = "urllib3"
2123
  version = "2.5.0"
 
2129
 
2130
  [[package]]
2131
  name = "wasabi"
2132
+ version = "0.10.1"
2133
  source = { registry = "https://pypi.org/simple" }
2134
+ sdist = { url = "https://files.pythonhosted.org/packages/69/41/0c31737ee1a29c8b829690ebb4ab988b1f489aa2c3efa115a732a9dd7997/wasabi-0.10.1.tar.gz", hash = "sha256:c8e372781be19272942382b14d99314d175518d7822057cb7a97010c4259d249", size = 28380, upload-time = "2022-07-28T08:17:54.968Z" }
 
 
 
2135
  wheels = [
2136
+ { url = "https://files.pythonhosted.org/packages/34/74/bd566f876c2de097e75d525c2696fb9829009987a0d93a4fb3576778a0a8/wasabi-0.10.1-py3-none-any.whl", hash = "sha256:fe862cc24034fbc9f04717cd312ab884f71f51a8ecabebc3449b751c2a649d83", size = 26075, upload-time = "2022-07-28T08:17:53.504Z" },
2137
  ]
2138
 
2139
  [[package]]
web_app/__pycache__/analysis_handlers.cpython-312.pyc DELETED
Binary file (17.9 kB)
 
web_app/__pycache__/app.cpython-312.pyc DELETED
Binary file (4.4 kB)
 
web_app/__pycache__/comparison_functions.cpython-312.pyc DELETED
Binary file (13.2 kB)
 
web_app/__pycache__/config_manager.cpython-312.pyc DELETED
Binary file (9.89 kB)
 
web_app/__pycache__/pos_handlers.cpython-312.pyc DELETED
Binary file (7.49 kB)
 
web_app/__pycache__/reference_manager.cpython-312.pyc DELETED
Binary file (10.6 kB)
 
web_app/__pycache__/session_manager.cpython-312.pyc DELETED
Binary file (6.81 kB)
 
web_app/__pycache__/ui_components.cpython-312.pyc DELETED
Binary file (11.8 kB)
 
web_app/app.py CHANGED
@@ -19,6 +19,7 @@ from web_app.components.ui_components import UIComponents
19
  from web_app.handlers.analysis_handlers import AnalysisHandlers
20
  from web_app.reference_manager import ReferenceManager
21
  from web_app.handlers.pos_handlers import POSHandlers
 
22
 
23
  # Configure Streamlit page
24
  st.set_page_config(
@@ -32,7 +33,7 @@ st.set_page_config(
32
  def main():
33
  """Main application entry point."""
34
  st.title("📊 Linguistic Data Analysis I - Text Analysis Tools")
35
- st.markdown("*Educational tools for lexical sophistication analysis and POS/dependency parsing*")
36
 
37
  # Initialize session state
38
  SessionManager.initialize_session_state()
@@ -46,8 +47,10 @@ def main():
46
  # Route to appropriate interface
47
  if tool_choice == 'Lexical Sophistication':
48
  render_lexical_sophistication_interface()
49
- else:
50
  render_pos_parser_interface()
 
 
51
 
52
 
53
  def render_sidebar():
@@ -117,5 +120,14 @@ def render_pos_parser_interface():
117
  POSHandlers.handle_batch_pos_analysis(parser)
118
 
119
 
 
 
 
 
 
 
 
 
 
120
  if __name__ == "__main__":
121
- main()
 
19
  from web_app.handlers.analysis_handlers import AnalysisHandlers
20
  from web_app.reference_manager import ReferenceManager
21
  from web_app.handlers.pos_handlers import POSHandlers
22
+ from web_app.handlers.frequency_handlers import FrequencyHandlers
23
 
24
  # Configure Streamlit page
25
  st.set_page_config(
 
33
  def main():
34
  """Main application entry point."""
35
  st.title("📊 Linguistic Data Analysis I - Text Analysis Tools")
36
+ st.markdown("*Educational tools for lexical sophistication analysis, POS/dependency parsing, and word frequency visualization*")
37
 
38
  # Initialize session state
39
  SessionManager.initialize_session_state()
 
47
  # Route to appropriate interface
48
  if tool_choice == 'Lexical Sophistication':
49
  render_lexical_sophistication_interface()
50
+ elif tool_choice == 'POS Parser':
51
  render_pos_parser_interface()
52
+ else: # Frequency Analysis
53
+ render_frequency_analysis_interface()
54
 
55
 
56
  def render_sidebar():
 
120
  POSHandlers.handle_batch_pos_analysis(parser)
121
 
122
 
123
+ def render_frequency_analysis_interface():
124
+ """Render frequency analysis interface."""
125
+ st.header("📊 Word Frequency Analysis")
126
+ st.markdown("Analyze and visualize word frequency distributions from TSV data files.")
127
+
128
+ # Handle frequency analysis
129
+ FrequencyHandlers.handle_frequency_analysis()
130
+
131
+
132
  if __name__ == "__main__":
133
+ main()
web_app/components/__pycache__/__init__.cpython-312.pyc DELETED
Binary file (244 Bytes)
 
web_app/components/__pycache__/comparison_functions.cpython-312.pyc DELETED
Binary file (13.2 kB)
 
web_app/components/__pycache__/ui_components.cpython-312.pyc DELETED
Binary file (11.9 kB)
 
web_app/components/comparison_functions.py CHANGED
@@ -260,12 +260,13 @@ def display_token_comparison(results_a, results_b):
260
  'Token': token.get('token', ''),
261
  'Lemma': token.get('lemma', ''),
262
  'POS': token.get('pos', ''),
 
263
  'Type': token.get('word_type', '')
264
  }
265
 
266
  # Add scores for each measure (skip basic fields)
267
  for key, value in token.items():
268
- if key not in ['id', 'token', 'lemma', 'pos', 'word_type']:
269
  row[key] = value if value != 'NA' else 'N/A'
270
 
271
  token_data.append(row)
 
260
  'Token': token.get('token', ''),
261
  'Lemma': token.get('lemma', ''),
262
  'POS': token.get('pos', ''),
263
+ "TAG": token.get('tag', ''),
264
  'Type': token.get('word_type', '')
265
  }
266
 
267
  # Add scores for each measure (skip basic fields)
268
  for key, value in token.items():
269
+ if key not in ['id', 'token', 'lemma', 'pos', 'tag', 'word_type']:
270
  row[key] = value if value != 'NA' else 'N/A'
271
 
272
  token_data.append(row)
web_app/components/ui_components.py CHANGED
@@ -121,7 +121,7 @@ class UIComponents:
121
  st.subheader("Analysis Tools")
122
  return st.radio(
123
  "Select Tool",
124
- options=['Lexical Sophistication', 'POS Parser'],
125
  key='tool_choice'
126
  )
127
 
@@ -229,4 +229,4 @@ class UIComponents:
229
  st.write(f"- {error}")
230
 
231
  if success_count == 0:
232
- st.error("No valid configurations found")
 
121
  st.subheader("Analysis Tools")
122
  return st.radio(
123
  "Select Tool",
124
+ options=['Lexical Sophistication', 'POS Parser', 'Frequency Analysis'],
125
  key='tool_choice'
126
  )
127
 
 
229
  st.write(f"- {error}")
230
 
231
  if success_count == 0:
232
+ st.error("No valid configurations found")
web_app/config_manager.py CHANGED
@@ -147,6 +147,9 @@ class ConfigManager:
147
  """Load actual data for a reference list based on its configuration."""
148
  data = {}
149
 
 
 
 
150
  # Check if this is a bigram or trigram configuration
151
  columns = list_config.get('columns', {})
152
  is_bigram = 'bigram' in columns
@@ -173,8 +176,12 @@ class ConfigManager:
173
  # Get column mapping
174
  columns = list_config.get('columns', {})
175
 
176
- if file_type in ['token', 'lemma'] and not is_bigram and not is_trigram:
177
- # For unigrams only
 
 
 
 
178
  word_col = columns.get('word', 0)
179
  score_col = columns.get('frequency', 1)
180
 
@@ -208,9 +215,109 @@ class ConfigManager:
208
 
209
  return data
210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  @staticmethod
212
  def clean_default_reference_lists():
213
  """Clean up default reference lists that are no longer selected."""
214
  # This would be called by the UI when managing default reference lists
215
  # Implementation depends on how default lists are managed
216
- pass
 
147
  """Load actual data for a reference list based on its configuration."""
148
  data = {}
149
 
150
+ # Check if this is a Japanese corpus
151
+ is_japanese_corpus = list_config.get('japanese_corpus', False)
152
+
153
  # Check if this is a bigram or trigram configuration
154
  columns = list_config.get('columns', {})
155
  is_bigram = 'bigram' in columns
 
176
  # Get column mapping
177
  columns = list_config.get('columns', {})
178
 
179
+ if is_japanese_corpus and file_type in ['token', 'lemma']:
180
+ # Handle Japanese corpus format with composite keys
181
+ processed_data = ConfigManager._parse_japanese_corpus_data(df, columns)
182
+ data[file_type] = processed_data
183
+ elif file_type in ['token', 'lemma'] and not is_bigram and not is_trigram:
184
+ # For standard unigrams
185
  word_col = columns.get('word', 0)
186
  score_col = columns.get('frequency', 1)
187
 
 
215
 
216
  return data
217
 
218
+ @staticmethod
219
+ def _parse_japanese_corpus_data(df: pd.DataFrame, columns: Dict[str, int]) -> Dict[str, Any]:
220
+ """Parse Japanese corpus data and create multiple lookup dictionaries with hierarchical POS splitting."""
221
+ try:
222
+ # Get column indices
223
+ surface_col_idx = columns.get('surface_form', 1)
224
+ lemma_col_idx = columns.get('lemma', 2)
225
+ pos_col_idx = columns.get('pos', 3)
226
+ freq_col_idx = columns.get('frequency', 6)
227
+
228
+ # Get actual column names
229
+ df_columns = list(df.columns)
230
+ surface_col = df_columns[surface_col_idx] if surface_col_idx < len(df_columns) else None
231
+ lemma_col = df_columns[lemma_col_idx] if lemma_col_idx < len(df_columns) else None
232
+ pos_col = df_columns[pos_col_idx] if pos_col_idx < len(df_columns) else None
233
+ freq_col = df_columns[freq_col_idx] if freq_col_idx < len(df_columns) else None
234
+
235
+ if not all([surface_col, lemma_col, pos_col, freq_col]):
236
+ raise ValueError("Missing required columns for Japanese corpus")
237
+
238
+ # Clean the data
239
+ df_clean = df.copy()
240
+
241
+ # Clean text columns
242
+ for col in [surface_col, lemma_col, pos_col]:
243
+ df_clean[col] = df_clean[col].astype(str).str.strip()
244
+ df_clean = df_clean[df_clean[col] != '']
245
+ df_clean = df_clean[df_clean[col] != 'nan']
246
+
247
+ # Clean and convert frequency column
248
+ df_clean[freq_col] = pd.to_numeric(df_clean[freq_col], errors='coerce')
249
+ df_clean = df_clean.dropna(subset=[freq_col])
250
+ df_clean = df_clean[df_clean[freq_col] > 0] # Only positive frequencies
251
+
252
+ # Split POS column by hyphen to extract pos1, pos2, pos3
253
+ def split_pos(pos_str):
254
+ parts = str(pos_str).split('-')
255
+ return {
256
+ 'pos1': parts[0] if len(parts) > 0 else '',
257
+ 'pos2': parts[1] if len(parts) > 1 else '',
258
+ 'pos3': parts[2] if len(parts) > 2 else ''
259
+ }
260
+
261
+ pos_split = df_clean[pos_col].apply(split_pos)
262
+ df_clean['pos1'] = [p['pos1'] for p in pos_split]
263
+ df_clean['pos2'] = [p['pos2'] for p in pos_split]
264
+ df_clean['pos3'] = [p['pos3'] for p in pos_split]
265
+
266
+ # Create multiple levels of composite keys to match UniDic lookup hierarchy
267
+ # Level 1: lemma_lForm_pos1_pos2_pos3 (when pos3 exists)
268
+ df_clean['level1_key'] = df_clean.apply(
269
+ lambda row: f"{row[lemma_col]}_{row[surface_col]}_{row['pos1']}_{row['pos2']}_{row['pos3']}"
270
+ if row['pos3'] else None, axis=1
271
+ )
272
+
273
+ # Level 2: lemma_lForm_pos1_pos2
274
+ df_clean['level2_key'] = df_clean.apply(
275
+ lambda row: f"{row[lemma_col]}_{row[surface_col]}_{row['pos1']}_{row['pos2']}"
276
+ if row['pos2'] else None, axis=1
277
+ )
278
+
279
+ # Level 3: lemma_lForm_pos1
280
+ df_clean['level3_key'] = df_clean.apply(
281
+ lambda row: f"{row[lemma_col]}_{row[surface_col]}_{row['pos1']}"
282
+ if row['pos1'] else None, axis=1
283
+ )
284
+
285
+ # Legacy composite key for backward compatibility
286
+ df_clean['legacy_key'] = df_clean[lemma_col] + '_' + df_clean[pos_col]
287
+
288
+ # Create lookup dictionaries for each level
289
+ level1_dict = {}
290
+ level2_dict = {}
291
+ level3_dict = {}
292
+
293
+ for _, row in df_clean.iterrows():
294
+ freq = row[freq_col]
295
+
296
+ if row['level1_key']:
297
+ level1_dict[row['level1_key']] = freq
298
+ if row['level2_key']:
299
+ level2_dict[row['level2_key']] = freq
300
+ if row['level3_key']:
301
+ level3_dict[row['level3_key']] = freq
302
+
303
+ # Return enhanced Japanese corpus data structure
304
+ return {
305
+ 'level1_dict': level1_dict, # Most specific UniDic-compatible keys
306
+ 'level2_dict': level2_dict,
307
+ 'level3_dict': level3_dict,
308
+ 'composite_dict': dict(zip(df_clean['legacy_key'], df_clean[freq_col])), # Legacy format
309
+ 'lemma_dict': dict(zip(df_clean[lemma_col].str.lower(), df_clean[freq_col])),
310
+ 'surface_dict': dict(zip(df_clean[surface_col].str.lower(), df_clean[freq_col])),
311
+ 'is_japanese_corpus': True
312
+ }
313
+
314
+ except Exception as e:
315
+ st.error(f"Error parsing Japanese corpus data: {e}")
316
+ return {}
317
+
318
  @staticmethod
319
  def clean_default_reference_lists():
320
  """Clean up default reference lists that are no longer selected."""
321
  # This would be called by the UI when managing default reference lists
322
  # Implementation depends on how default lists are managed
323
+ pass
web_app/handlers/__pycache__/__init__.cpython-312.pyc DELETED
Binary file (245 Bytes)
 
web_app/handlers/__pycache__/analysis_handlers.cpython-312.pyc DELETED
Binary file (17.9 kB)
 
web_app/handlers/__pycache__/pos_handlers.cpython-312.pyc DELETED
Binary file (7.52 kB)
 
web_app/handlers/frequency_handlers.py ADDED
@@ -0,0 +1,635 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Frequency Analysis Handlers for Streamlit Interface
3
+
4
+ This module provides Streamlit interface handlers for word frequency visualization,
5
+ including file upload, visualization controls, and results display.
6
+ Supports flexible column mapping for diverse frequency data formats.
7
+ """
8
+
9
+ import streamlit as st
10
+ import pandas as pd
11
+ import plotly.graph_objects as go
12
+ import plotly.express as px
13
+ import numpy as np
14
+ from typing import Dict, List, Optional
15
+ import sys
16
+ import os
17
+ from pathlib import Path
18
+ from io import StringIO
19
+
20
+ # Add parent directory to path for imports
21
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
22
+
23
+ from text_analyzer.frequency_analyzer import FrequencyAnalyzer
24
+
25
+
26
+ class FrequencyHandlers:
27
+ """
28
+ Streamlit interface handlers for frequency analysis functionality.
29
+ """
30
+
31
    @staticmethod
    def handle_frequency_analysis():
        """
        Top-level Streamlit page for the Frequency Analysis tool.

        Orchestrates the full workflow: file upload, format detection, column
        detection, persistent column mapping, and rank-based visualization.
        Intermediate results are cached in ``st.session_state`` so that widget
        interactions (each of which reruns the script) do not re-parse the file.
        """
        st.markdown("Upload a frequency data file (TSV/CSV) with flexible column mapping support. "
                    "The system will automatically detect columns and let you choose which ones to use for analysis.")

        # Initialize session state variables on first run.
        # NOTE(review): these keys ('analyzer', 'column_config', ...) are fairly
        # generic and could collide with other tools sharing the same Streamlit
        # session — confirm against the rest of the app.
        if 'uploaded_file_name' not in st.session_state:
            st.session_state.uploaded_file_name = None
        if 'column_config' not in st.session_state:
            st.session_state.column_config = None
        if 'analyzer' not in st.session_state:
            st.session_state.analyzer = None
        if 'format_info' not in st.session_state:
            st.session_state.format_info = None
        if 'detected_cols' not in st.session_state:
            st.session_state.detected_cols = None
        if 'uploaded_file_content' not in st.session_state:
            st.session_state.uploaded_file_content = None

        # File upload section.
        uploaded_file = FrequencyHandlers.render_file_upload()

        if uploaded_file is not None:
            current_file_name = uploaded_file.name

            # A different file name means a new upload: reset all cached state
            # and stash the raw bytes for later reloads.
            if st.session_state.uploaded_file_name != current_file_name:
                st.session_state.uploaded_file_name = current_file_name
                st.session_state.column_config = None
                st.session_state.analyzer = None
                st.session_state.format_info = None
                st.session_state.detected_cols = None
                st.session_state.uploaded_file_content = uploaded_file.getvalue()

            try:
                # Initialize analyzer and detect format only once per file.
                if st.session_state.analyzer is None or st.session_state.format_info is None:
                    st.session_state.analyzer = FrequencyAnalyzer(file_size_limit_mb=300)
                    st.session_state.format_info = st.session_state.analyzer.detect_file_format(uploaded_file.getvalue())

                # Show format detection results.
                st.success(f"✅ File format detected: {st.session_state.format_info['separator']}-separated, "
                           f"{'with' if st.session_state.format_info['has_header'] else 'without'} header, "
                           f"~{st.session_state.format_info['estimated_columns']} columns")

                # Decode raw bytes for column detection.
                content = uploaded_file.getvalue()
                if isinstance(content, bytes):
                    content = content.decode('utf-8')

                # Read only the first 100 rows — enough for preview/detection
                # without paying for the whole (possibly very large) file.
                df_preview = pd.read_csv(StringIO(content),
                                         sep=st.session_state.format_info['separator'],
                                         header=0 if st.session_state.format_info['has_header'] else None,
                                         nrows=100)

                # Detect available columns.
                st.session_state.detected_cols = st.session_state.analyzer.detect_columns(df_preview)

                # Show data preview.
                FrequencyHandlers.render_data_preview(df_preview, st.session_state.detected_cols)

                # ALWAYS show column selection if we have detected columns
                # (persistent interface — survives reruns).
                if st.session_state.detected_cols is not None:
                    with st.expander("🎯 Column Selection", expanded=True):
                        column_config = FrequencyHandlers.render_persistent_column_selection(
                            st.session_state.detected_cols,
                            st.session_state.format_info,
                            st.session_state.column_config
                        )

                        # On any change, persist the new mapping, reload the
                        # full data with it, and rerun so the page reflects it.
                        if column_config != st.session_state.column_config:
                            st.session_state.column_config = column_config
                            df = st.session_state.analyzer.load_frequency_data(st.session_state.uploaded_file_content, column_config)
                            st.session_state.loaded_data = df
                            st.rerun()

                # ALWAYS show visualization controls once a mapping exists.
                if st.session_state.column_config is not None:
                    viz_config = FrequencyHandlers.render_enhanced_visualization_controls(st.session_state.analyzer, st.session_state.column_config)

                    if viz_config:
                        # Generate analysis.
                        FrequencyHandlers.render_enhanced_rank_based_analysis(st.session_state.analyzer, viz_config)

            except Exception as e:
                st.error(f"Error processing file: {str(e)}")
                with st.expander("Error Details"):
                    st.code(str(e))
                st.info("Please ensure your file is a valid TSV/CSV with appropriate columns.")

        elif st.session_state.column_config is not None and st.session_state.uploaded_file_content is not None:
            # No file currently selected in the uploader, but we have cached
            # bytes and a mapping from earlier: keep the interface alive.
            with st.expander("🎯 Column Selection", expanded=False):
                column_config = FrequencyHandlers.render_persistent_column_selection(
                    st.session_state.detected_cols,
                    st.session_state.format_info,
                    st.session_state.column_config
                )

                # Same change-detection/reload cycle as the upload branch.
                if column_config != st.session_state.column_config:
                    st.session_state.column_config = column_config
                    df = st.session_state.analyzer.load_frequency_data(st.session_state.uploaded_file_content, column_config)
                    st.session_state.loaded_data = df
                    st.rerun()

            viz_config = FrequencyHandlers.render_enhanced_visualization_controls(st.session_state.analyzer, st.session_state.column_config)

            if viz_config:
                # Generate analysis.
                FrequencyHandlers.render_enhanced_rank_based_analysis(st.session_state.analyzer, viz_config)
150
+
151
+ @staticmethod
152
+ def render_file_upload():
153
+ """
154
+ Render enhanced file upload interface with flexible format support.
155
+
156
+ Returns:
157
+ Uploaded file object or None
158
+ """
159
+ st.subheader("📄 Upload Frequency Data")
160
+
161
+ uploaded_file = st.file_uploader(
162
+ "Choose a frequency data file",
163
+ type=['tsv', 'csv', 'txt'],
164
+ help="Upload a TSV or CSV file with frequency data. Supports flexible column mapping.",
165
+ accept_multiple_files=False
166
+ )
167
+
168
+ if uploaded_file is None:
169
+ # Show example formats
170
+ st.info("**Supported formats:**")
171
+ col1, col2 = st.columns(2)
172
+
173
+ with col1:
174
+ st.write("**Traditional format:**")
175
+ example_traditional = """Type\tFreq\tRank
176
+ the\t69868\t1
177
+ of\t36426\t2
178
+ and\t28891\t3"""
179
+ st.code(example_traditional, language="text")
180
+
181
+ with col2:
182
+ st.write("**Rich corpus format:**")
183
+ example_rich = """rank\tlForm\tlemma\tpos\tfrequency\tpmw
184
+ 1\tノ\tの\t助詞\t5061558\t48383.9
185
+ 2\tニ\tに\t助詞\t3576558\t34188.7
186
+ 3\tテ\tて\t助詞\t3493117\t33391.0"""
187
+ st.code(example_rich, language="text")
188
+
189
+ st.write("**File size limit:** 300MB")
190
+
191
+ return uploaded_file
192
+
193
    @staticmethod
    def render_data_preview(df: pd.DataFrame, detected_cols: Dict[str, List[str]]):
        """
        Render a preview of the uploaded data plus column-detection results.

        Args:
            df: Preview DataFrame (first ~100 rows of the upload).
            detected_cols: Column categorization from ``detect_columns`` with
                keys 'word_columns', 'frequency_columns', 'pos_columns',
                'other_columns'.
        """
        st.subheader("📊 Data Preview")

        # Basic metrics.
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Total Rows", len(df))
        with col2:
            st.metric("Total Columns", len(df.columns))
        with col3:
            # NOTE: word_cols/freq_cols hold *counts* here; they are re-bound
            # to the column-name lists further below.
            word_cols = len(detected_cols.get('word_columns', []))
            freq_cols = len(detected_cols.get('frequency_columns', []))
            st.metric("Detected", f"{word_cols} word, {freq_cols} freq")

        # Show sample data.
        st.write("**First 5 rows:**")
        st.dataframe(df.head(), use_container_width=True)

        # Show detected column categories.
        with st.expander("🔍 Column Detection Results", expanded=True):
            col1, col2 = st.columns(2)

            with col1:
                st.write("**Word Columns (text data):**")
                word_cols = detected_cols.get('word_columns', [])
                if word_cols:
                    for col in word_cols:
                        st.write(f"- `{col}` ({df[col].dtype})")
                else:
                    st.write("None detected")

                st.write("**POS Columns:**")
                pos_cols = detected_cols.get('pos_columns', [])
                if pos_cols:
                    for col in pos_cols:
                        st.write(f"- `{col}` ({df[col].dtype})")
                else:
                    st.write("None detected")

            with col2:
                st.write("**Frequency Columns (numeric data):**")
                freq_cols = detected_cols.get('frequency_columns', [])
                if freq_cols:
                    for col in freq_cols:
                        # Show up to three non-null example values per column.
                        sample_vals = df[col].dropna().head(3).tolist()
                        st.write(f"- `{col}` ({df[col].dtype}) - e.g., {sample_vals}")
                else:
                    st.write("None detected")

                st.write("**Other Columns:**")
                other_cols = detected_cols.get('other_columns', [])
                if other_cols:
                    for col in other_cols[:5]:  # Show max 5
                        st.write(f"- `{col}` ({df[col].dtype})")
                    if len(other_cols) > 5:
                        st.write(f"... and {len(other_cols) - 5} more")
                else:
                    st.write("None")
259
+
260
    @staticmethod
    def render_column_selection_simplified(detected_cols: Dict[str, List[str]], format_info: Dict) -> Optional[Dict[str, str]]:
        """
        Render simplified column selection interface without multi-frequency
        complexity. Unlike the persistent variant, this returns a config only
        after the user clicks the "Start Analysis" button.

        Args:
            detected_cols: Detected column categorization.
            format_info: File format information ('separator', 'has_header').

        Returns:
            Column configuration dict (word/frequency/separator/has_header and
            optionally pos_column) once confirmed, otherwise None.
        """
        st.subheader("🎯 Column Mapping")
        st.write("Select which columns to use for your frequency analysis:")

        word_cols = detected_cols.get('word_columns', [])
        freq_cols = detected_cols.get('frequency_columns', [])
        pos_cols = detected_cols.get('pos_columns', [])

        # Both a word column and a frequency column are mandatory.
        if not word_cols or not freq_cols:
            st.error("❌ Required columns not detected. Please ensure your file has:")
            st.write("- At least one text column (for words)")
            st.write("- At least one numeric column (for frequencies)")
            return None

        col1, col2 = st.columns(2)

        with col1:
            # Word column selection.
            word_column = st.selectbox(
                "Word Column",
                options=word_cols,
                index=0,
                help="Column containing word forms or lemmas"
            )

            # POS column selection (optional, opt-in via checkbox).
            pos_column = None
            if pos_cols:
                use_pos = st.checkbox("Include POS column", value=False)
                if use_pos:
                    pos_column = st.selectbox(
                        "POS Column",
                        options=pos_cols,
                        index=0,
                        help="Column containing part-of-speech tags (optional)"
                    )

        with col2:
            # Frequency column selection.
            frequency_column = st.selectbox(
                "Frequency Column",
                options=freq_cols,
                index=0,
                help="Column containing frequency values for analysis"
            )

        # Confirm button — only then is a configuration produced.
        if st.button("🚀 Start Analysis", type="primary"):
            config = {
                'word_column': word_column,
                'frequency_column': frequency_column,
                'separator': format_info['separator'],
                'has_header': format_info['has_header']
            }

            if pos_column:
                config['pos_column'] = pos_column

            return config

        return None
332
+
333
+ @staticmethod
334
+ def render_visualization_controls_simplified(analyzer: FrequencyAnalyzer, column_config: Dict) -> Optional[Dict]:
335
+ """
336
+ Legacy method - redirects to enhanced controls for backward compatibility.
337
+ """
338
+ return FrequencyHandlers.render_enhanced_visualization_controls(analyzer, column_config)
339
+
340
+ @staticmethod
341
+ def render_rank_based_analysis_simplified(analyzer: FrequencyAnalyzer, viz_config: Dict):
342
+ """
343
+ Legacy method - redirects to enhanced analysis for backward compatibility.
344
+ """
345
+ return FrequencyHandlers.render_enhanced_rank_based_analysis(analyzer, viz_config)
346
+
347
    @staticmethod
    def render_persistent_column_selection(detected_cols: Dict[str, List[str]],
                                           format_info: Dict,
                                           current_config: Optional[Dict] = None) -> Dict[str, str]:
        """
        Render persistent column selection interface that doesn't disappear.

        Unlike the simplified variant, this always returns a configuration on
        every rerun (no confirm button); the caller compares it against the
        previously stored config to detect changes.

        Args:
            detected_cols: Detected column categorization.
            format_info: File format information ('separator', 'has_header').
            current_config: Current column configuration, used to preserve the
                user's previous selections across reruns.

        Returns:
            Column configuration dict (word/frequency/separator/has_header and
            optionally pos_column).
        """
        st.write("Select which columns to use for your frequency analysis:")

        # NOTE(review): assumes word_cols/freq_cols are non-empty here — an
        # empty options list makes st.selectbox return None, which would leak
        # None into the returned config. Verify the caller guarantees this.
        word_cols = detected_cols.get('word_columns', [])
        freq_cols = detected_cols.get('frequency_columns', [])
        pos_cols = detected_cols.get('pos_columns', [])

        # Determine default widget indices, preserving prior selections.
        default_word_idx = 0
        default_freq_idx = 0
        default_use_pos = False
        default_pos_idx = 0

        if current_config:
            # Preserve current selections (only if still among the options).
            if current_config['word_column'] in word_cols:
                default_word_idx = word_cols.index(current_config['word_column'])
            if current_config['frequency_column'] in freq_cols:
                default_freq_idx = freq_cols.index(current_config['frequency_column'])
            if 'pos_column' in current_config and current_config['pos_column'] in pos_cols:
                default_use_pos = True
                default_pos_idx = pos_cols.index(current_config['pos_column'])

        col1, col2 = st.columns(2)

        with col1:
            word_column = st.selectbox(
                "Word Column",
                options=word_cols,
                index=default_word_idx,
                help="Column containing word forms or lemmas",
                key="persistent_word_col"
            )

            # POS column selection (optional, opt-in via checkbox).
            pos_column = None
            if pos_cols:
                use_pos = st.checkbox("Include POS column", value=default_use_pos, key="persistent_use_pos")
                if use_pos:
                    pos_column = st.selectbox(
                        "POS Column",
                        options=pos_cols,
                        index=default_pos_idx,
                        help="Column containing part-of-speech tags (optional)",
                        key="persistent_pos_col"
                    )

        with col2:
            frequency_column = st.selectbox(
                "Frequency Column",
                options=freq_cols,
                index=default_freq_idx,
                help="Column containing frequency values for analysis",
                key="persistent_freq_col"
            )

        # Show quick info about selected columns.
        st.write("**Selected Configuration:**")
        st.write(f"• Words: `{word_column}`")
        st.write(f"• Frequencies: `{frequency_column}`")
        if pos_column:
            st.write(f"• POS: `{pos_column}`")

        # Always return the configuration (no confirm button needed).
        config = {
            'word_column': word_column,
            'frequency_column': frequency_column,
            'separator': format_info['separator'],
            'has_header': format_info['has_header']
        }

        if pos_column:
            config['pos_column'] = pos_column

        return config
436
+
437
    @staticmethod
    def render_enhanced_visualization_controls(analyzer: FrequencyAnalyzer, column_config: Dict) -> Optional[Dict]:
        """
        Render enhanced visualization controls with a max-words limit.

        Args:
            analyzer: FrequencyAnalyzer instance with loaded data.
            column_config: Column configuration from user selection; only
                'frequency_column' is read here.

        Returns:
            Dict with visualization configuration ('frequency_column',
            'bin_size', 'log_transform', 'max_words_to_retain') when the
            Generate button was clicked this rerun, otherwise None.
        """
        st.subheader("🎛️ Enhanced Visualization Controls")

        # Get the frequency column chosen during column mapping.
        frequency_column = column_config['frequency_column']

        col1, col2, col3 = st.columns(3)

        with col1:
            # Bin size controls.
            bin_size = st.slider(
                "Bin Size (words per group)",
                min_value=100,
                max_value=2000,
                value=500,
                step=100,
                help="Number of words to group together for rank-based analysis"
            )

        with col2:
            # Log transformation option.
            log_transform = st.checkbox(
                "Apply log₁₀ transformation",
                value=False,
                help="Transform frequency values using log₁₀ for better visualization"
            )

        with col3:
            # Max words control; value=None leaves the input empty (no limit).
            max_words = st.number_input(
                "Max words to analyze",
                min_value=1000,
                max_value=200000,
                value=None,
                step=1000,
                help="Limit analysis to top N most frequent words (leave empty for no limit)",
                key="max_words_input"
            )

        # Quick preset buttons. Each click stores the preset in session state.
        st.write("**Quick Presets:**")
        preset_cols = st.columns(4)
        if preset_cols[0].button("10K", key="preset_10k"):
            st.session_state.max_words_preset = 10000
        if preset_cols[1].button("25K", key="preset_25k"):
            st.session_state.max_words_preset = 25000
        if preset_cols[2].button("50K", key="preset_50k"):
            st.session_state.max_words_preset = 50000
        if preset_cols[3].button("All", key="preset_all"):
            st.session_state.max_words_preset = None

        # Use the preset value if one was just set, then consume it.
        # NOTE(review): because the preset overrides max_words *after* the
        # number_input was rendered, the widget's displayed value will not
        # reflect the preset, and the preset only takes effect if Generate is
        # clicked within the same rerun cycle — confirm this is intended.
        if 'max_words_preset' in st.session_state:
            max_words = st.session_state.max_words_preset
            del st.session_state.max_words_preset

        # Generate visualization button — the only path that returns a config.
        if st.button("📊 Generate Enhanced Visualization", type="primary", key="generate_viz"):
            return {
                'frequency_column': frequency_column,
                'bin_size': bin_size,
                'log_transform': log_transform,
                'max_words_to_retain': max_words
            }

        return None
514
+
515
    @staticmethod
    def render_enhanced_rank_based_analysis(analyzer: FrequencyAnalyzer, viz_config: Dict):
        """
        Render enhanced rank-based analysis with improved sample words display.

        Shows summary statistics, a bar chart of average frequency per rank
        group, sample words for the top rank groups, and detailed per-group
        statistics. Any failure is reported via ``st.error`` rather than
        propagated.

        Args:
            analyzer: FrequencyAnalyzer instance with loaded data.
            viz_config: Visualization configuration produced by
                ``render_enhanced_visualization_controls``.
        """
        st.subheader("📊 Enhanced Rank-Based Frequency Analysis")

        frequency_column = viz_config['frequency_column']
        bin_size = viz_config['bin_size']
        log_transform = viz_config['log_transform']
        max_words_to_retain = viz_config.get('max_words_to_retain')  # may be None (no limit)

        try:
            # Calculate overall statistics for the chosen frequency column.
            stats = analyzer.calculate_statistics(frequency_column)

            # Display basic statistics, honoring the word limit when shown.
            col1, col2, col3, col4 = st.columns(4)
            with col1:
                words_analyzed = max_words_to_retain if max_words_to_retain and max_words_to_retain < stats['count'] else stats['count']
                st.metric("Words Analyzed", f"{words_analyzed:,}")
            with col2:
                st.metric("Mean Frequency", f"{stats['mean']:.2f}")
            with col3:
                st.metric("Median Frequency", f"{stats['median']:.2f}")
            with col4:
                st.metric("Std Deviation", f"{stats['std']:.2f}")

            # Show word limit info if a limit was applied.
            if max_words_to_retain and max_words_to_retain < stats['count']:
                st.info(f"📊 Analysis limited to top {max_words_to_retain:,} most frequent words (out of {stats['count']:,} total)")

            # Create rank-based visualization with enhanced parameters.
            result = analyzer.create_rank_based_visualization_flexible(
                column=frequency_column,
                bin_size=bin_size,
                log_transform=log_transform,
                max_words_to_retain=max_words_to_retain
            )

            # Create the main bar chart of average frequency per rank group.
            fig = go.Figure()

            fig.add_trace(go.Bar(
                x=result['group_centers'],
                y=result['avg_frequencies'],
                name=f"Avg {frequency_column}",
                marker_color='steelblue',
                # Doubled braces keep %{x}/%{y} literal for Plotly's
                # hovertemplate inside this f-string.
                hovertemplate=(
                    f"<b>Group %{{x}}</b><br>"
                    f"Avg {'Log₁₀ ' if log_transform else ''}{frequency_column}: %{{y:.3f}}<br>"
                    "<extra></extra>"
                )
            ))

            fig.update_layout(
                title=result.get('title_suffix', f"Enhanced Rank-Based Analysis - {frequency_column}"),
                xaxis_title=result.get('x_label', f"Rank Groups (bin size: {bin_size})"),
                yaxis_title=result.get('y_label', f"{'Log₁₀ ' if log_transform else ''}Average {frequency_column}"),
                showlegend=False,
                height=500
            )

            st.plotly_chart(fig, use_container_width=True)

            # Enhanced sample words display (up to 20 bins, 5 samples each).
            st.write("### 🎯 Sample Words by Rank Group (5 Random Samples)")

            # sample_words maps group index -> list of {'word': ...} dicts
            # (per the access pattern below) — produced by the analyzer.
            sample_words = result.get('sample_words', {})
            if sample_words:
                # Display up to 20 groups in a more organized layout.
                num_groups = min(20, len(sample_words))

                if num_groups > 0:
                    st.write(f"Showing sample words from top {num_groups} rank groups:")

                    # Display in rows of 4 groups each.
                    for row_start in range(0, num_groups, 4):
                        cols = st.columns(4)
                        for col_idx in range(4):
                            group_idx = row_start + col_idx
                            if group_idx < num_groups and group_idx in sample_words:
                                with cols[col_idx]:
                                    group_label = result['group_labels'][group_idx]
                                    words = sample_words[group_idx]

                                    st.write(f"**Group {group_label}:**")
                                    word_list = [w['word'] for w in words]
                                    # Display as bullet points for readability.
                                    for word in word_list:
                                        st.write(f"• {word}")

                                    # Add spacing between groups.
                                    st.write("")
            else:
                st.write("No sample words available")

            # Show enhanced group statistics.
            with st.expander("📈 Detailed Group Statistics"):
                group_stats = result.get('group_stats')
                if group_stats is not None and not group_stats.empty:
                    display_stats = group_stats.copy()

                    # Round numeric columns for display, except count-like ones.
                    numeric_cols = display_stats.select_dtypes(include=[np.number]).columns
                    for col in numeric_cols:
                        if 'count' not in col.lower():
                            display_stats[col] = display_stats[col].round(2)

                    st.dataframe(display_stats, use_container_width=True)
                else:
                    st.write("No detailed statistics available")

        except Exception as e:
            st.error(f"Error in enhanced rank-based analysis: {str(e)}")
            with st.expander("Error Details"):
                st.code(str(e))