Japanese language support
- .gitignore +388 -0
- README.md +238 -5
- config/reference_lists.yaml +97 -1
- japanese-nlp-test.ipynb +819 -0
- pyproject.toml +4 -0
- resources/reference_lists/ja/BCCWJ_frequencylist_luw2_ver1_1.tsv +3 -0
- resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1 copy.tsv +3 -0
- resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv +3 -0
- resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv +3 -0
- test/test_app.py +8 -6
- test/test_functionality.py +6 -4
- test/test_multi_index.py +2 -3
- test/test_yaml_config.py +2 -3
- test_frequency_flexible.py +1 -0
- test_fugashi_diagnostic.py +134 -0
- test_japanese_integration.py +135 -0
- test_unidic_diagnostic.py +201 -0
- text_analyzer/__pycache__/__init__.cpython-312.pyc +0 -0
- text_analyzer/__pycache__/lexical_sophistication.cpython-312.pyc +0 -0
- text_analyzer/__pycache__/pos_parser.cpython-312.pyc +0 -0
- text_analyzer/app_config.py +183 -0
- text_analyzer/base_analyzer.py +308 -0
- text_analyzer/frequency_analyzer.py +653 -0
- text_analyzer/lexical_sophistication.py +245 -62
- text_analyzer/pos_parser.py +11 -36
- text_analyzer/text_utility.py +289 -0
- text_analyzer/unidic_enricher.py +256 -0
- text_analyzer/unidic_extensions.py +25 -0
- uv.lock +420 -6
- web_app/__pycache__/analysis_handlers.cpython-312.pyc +0 -0
- web_app/__pycache__/app.cpython-312.pyc +0 -0
- web_app/__pycache__/comparison_functions.cpython-312.pyc +0 -0
- web_app/__pycache__/config_manager.cpython-312.pyc +0 -0
- web_app/__pycache__/pos_handlers.cpython-312.pyc +0 -0
- web_app/__pycache__/reference_manager.cpython-312.pyc +0 -0
- web_app/__pycache__/session_manager.cpython-312.pyc +0 -0
- web_app/__pycache__/ui_components.cpython-312.pyc +0 -0
- web_app/app.py +15 -3
- web_app/components/__pycache__/__init__.cpython-312.pyc +0 -0
- web_app/components/__pycache__/comparison_functions.cpython-312.pyc +0 -0
- web_app/components/__pycache__/ui_components.cpython-312.pyc +0 -0
- web_app/components/comparison_functions.py +2 -1
- web_app/components/ui_components.py +2 -2
- web_app/config_manager.py +110 -3
- web_app/handlers/__pycache__/__init__.cpython-312.pyc +0 -0
- web_app/handlers/__pycache__/analysis_handlers.cpython-312.pyc +0 -0
- web_app/handlers/__pycache__/pos_handlers.cpython-312.pyc +0 -0
- web_app/handlers/frequency_handlers.py +635 -0
.gitignore
ADDED
@@ -0,0 +1,388 @@
```text
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
Pipfile.lock

# poetry
poetry.lock

# pdm
.pdm.toml
.pdm-python

# PEP 582
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
.idea/

# VS Code
.vscode/
*.code-workspace

# Local History for Visual Studio Code
.history/

# macOS
.DS_Store
.AppleDouble
.LSOverride

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

# Windows
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db

# Dump file
*.stackdump

# Folder config file
[Dd]esktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp

# Windows shortcuts
*.lnk

# Linux
*~

# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*

# KDE directory preferences
.directory

# Linux trash folder which might appear on any partition or disk
.Trash-*

# .nfs files are created when an open file is removed but is still being accessed
.nfs*

# Editor backups
*.bak
*.swp
*.swo
*~

# Logs
logs/
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# Dependency directories
node_modules/
jspm_packages/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Optional stylelint cache
.stylelintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local

# parcel-bundler cache
.cache
.parcel-cache

# Next.js build output
.next
out

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
.cache/
public

# vuepress build output
.vuepress/dist

# vuepress v2.x temp and cache directory
.temp
.cache

# Docusaurus cache and generated files
.docusaurus

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# TernJS port file
.tern-port

# Stores VSCode versions used for testing VSCode extensions
.vscode-test

# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*

# Temporary files
tmp/
temp/
*.tmp
*.temp

# Database files
*.db
*.sqlite
*.sqlite3

# Secret files
secrets/
*.key
*.pem
*.cert
*.crt

# Config files with sensitive data
config.local.js
config.local.json
settings.local.json

# Build artifacts
bin/
obj/

# Package files
*.jar
*.war
*.nar
*.ear
*.zip
*.tar.gz
*.rar

# Virtual machine crash logs
hs_err_pid*

# Core dumps
core.*

# Compiled source
*.com
*.class
*.dll
*.exe
*.o
*.out

# Ignore all dotfiles except .gitignore
.*
!.gitignore
!.gitkeep
!.github/
!.gitlab-ci.yml
!.travis.yml
!.editorconfig
!.prettierrc
!.eslintrc*
!.stylelintrc*
!.babelrc*
```
README.md
CHANGED
@@ -7,14 +7,247 @@ sdk: docker
 app_port: 8501
 tags:
 - streamlit
+- nlp
+- linguistics
+- japanese
+- corpus-linguistics
 pinned: false
-short_description:
+short_description: Advanced lexical sophistication analyzer for English and Japanese texts
 license: cc-by-nc-4.0
 ---

The previously near-empty README body (a bare "#" heading and blank lines) is replaced with the following content:
# Simple Text Analyzer

A comprehensive web-based application for lexical sophistication analysis supporting both English and Japanese languages. This tool provides detailed linguistic analysis using corpus-based frequency data and advanced NLP techniques.

## 🌟 Features

### Multi-Language Support
- **English**: COCA corpus frequency analysis with unigrams, bigrams, and trigrams
- **Japanese**: BCCWJ (written) and CSJ (spoken) corpus integration with POS-aware frequency matching

### Analysis Capabilities
- **Lexical Sophistication**: Frequency-based lexical complexity analysis
- **Part-of-Speech Analysis**: Detailed POS tagging and classification
- **N-gram Analysis**: Bigram and trigram frequency analysis
- **Content vs Function Words**: Automatic classification and separate analysis
- **Batch Processing**: Multiple file analysis with comparative results

### Japanese Language Features ✨ **NEW**
- **BCCWJ Integration**: Balanced Corpus of Contemporary Written Japanese
  - Raw frequency counts
  - Normalized frequency (per million words)
  - Frequency rankings
- **CSJ Integration**: Corpus of Spontaneous Japanese (spoken data)
  - Academic and conversational speech patterns
  - Multiple speech style analysis
- **POS-Aware Matching**: Composite key lookup using `lemma + POS` for accurate frequency matching
- **Robust Fallback System**: Three-tier lookup strategy (sketched after this list):
  1. Primary: `lemma_pos` composite key (e.g., "行く_動詞-自立")
  2. Fallback 1: `lemma` only lookup
  3. Fallback 2: `surface_form` lookup

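A minimal sketch of this three-tier strategy (an illustration only, not the analyzer's actual implementation; the `frequency_lookup` function and dictionary layout are hypothetical):

```python
# Sketch of the three-tier fallback lookup described above.
# `lookup` maps lemma_pos composite keys, bare lemmas, and surface forms to
# frequency values; the real analyzer's data structures may differ.
from typing import Dict, Optional


def frequency_lookup(lookup: Dict[str, float], lemma: str, pos: str,
                     surface: str) -> Optional[float]:
    """Try lemma_pos, then lemma, then surface form; None if uncovered."""
    for key in (f"{lemma}_{pos}", lemma, surface):
        value = lookup.get(key)
        if value is not None:
            return value
    return None


# Example with a toy frequency value: "行きます" analyzed as lemma 行く, POS 動詞
print(frequency_lookup({"行く_動詞": 123.0}, "行く", "動詞", "行き"))  # 123.0
```
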
## 🚀 Quick Start

### Prerequisites
- Python 3.8+
- uv (recommended) or pip for package management

### Installation

```bash
# Clone the repository
git clone https://github.com/your-repo/simple-text-analyzer.git
cd simple-text-analyzer

# Install dependencies using uv
uv sync

# Or using pip
pip install -r requirements.txt

# Install required SpaCy models
python -m spacy download en_core_web_trf
python -m spacy download ja_core_news_md  # For Japanese support
```

### Running the Application

```bash
# Using uv
uv run streamlit run web_app/app.py

# Or directly
streamlit run web_app/app.py
```

## 📊 Supported Corpora

### English
- **COCA Spoken**: Corpus of Contemporary American English (spoken subcorpus)
- **COCA Magazine**: Magazine text frequency data
- **Bigram/Trigram Analysis**: Multi-word expression frequency and association measures

### Japanese
- **BCCWJ (Balanced Corpus of Contemporary Written Japanese)**
  - 182,604 unique word forms with POS tags
  - Multiple text registers (books, newspapers, magazines, etc.)
  - Comprehensive written language coverage

- **CSJ (Corpus of Spontaneous Japanese)**
  - 41,892 unique word forms from spoken data
  - Academic presentations and casual conversations
  - Natural speech pattern analysis

## 🔧 Architecture

### Core Components
- **LexicalSophisticationAnalyzer**: Main analysis engine with multi-language support
- **ConfigManager**: Flexible configuration system for corpus integration
- **ReferenceManager**: Dynamic reference list management
- **SessionManager**: State management for web interface

### Japanese Integration Features
- **Composite Key Matching**: Precision matching using lemma and POS combinations
- **Extensible Design**: Easy addition of new subcorpora via YAML configuration
- **Fallback Mechanisms**: Robust lookup strategies for maximum coverage
- **Performance Optimized**: Pre-computed lookup dictionaries for fast analysis (see the sketch below)

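To illustrate what such pre-computed dictionaries could look like, here is a small sketch that builds composite-key, lemma-only, and surface-form tables from parsed reference rows; the row format, function name, and toy values are placeholders, not the project's actual loader:

```python
# Sketch only: pre-computing lookup dictionaries so each token lookup is O(1).
# Each row is assumed to be (surface_form, lemma, pos, frequency).
def build_lookups(rows):
    by_lemma_pos, by_lemma, by_surface = {}, {}, {}
    for surface, lemma, pos, freq in rows:
        by_lemma_pos.setdefault(f"{lemma}_{pos}", freq)  # primary composite key
        by_lemma.setdefault(lemma, freq)                 # fallback 1
        by_surface.setdefault(surface, freq)             # fallback 2
    return by_lemma_pos, by_lemma, by_surface


lemma_pos, lemma_only, surface = build_lookups([
    ("イッ", "行く", "動詞", 123.0),     # toy frequency values
    ("ガッコー", "学校", "名詞", 456.0),
])
print(lemma_pos["行く_動詞"])  # 123.0
```
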
## 📁 File Structure

```
simple-text-analyzer/
├── web_app/                       # Streamlit web application
│   ├── app.py                     # Main application entry
│   ├── config_manager.py          # Configuration management
│   ├── reference_manager.py       # Reference list handling
│   └── components/                # UI components
├── text_analyzer/                 # Core analysis modules
│   ├── lexical_sophistication.py  # Main analyzer
│   ├── frequency_analyzer.py      # Frequency analysis
│   └── pos_parser.py              # POS tagging utilities
├── config/                        # Configuration files
│   └── reference_lists.yaml       # Corpus configurations
├── resources/                     # Corpus data files
│   └── reference_lists/
│       ├── en/                    # English corpus files
│       └── ja/                    # Japanese corpus files
└── test/                          # Test modules
```

## 🧪 Testing

Test the Japanese integration:

```bash
uv run python test_japanese_integration.py
```

Expected output:
- ✅ SpaCy model loading
- ✅ Reference data loading (182K+ BCCWJ entries, 41K+ CSJ entries)
- ✅ Composite key lookup functionality
- ✅ Fallback mechanism verification
- ✅ Complete text analysis pipeline

## 📈 Usage Examples

### Japanese Text Analysis
```python
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer

# Initialize Japanese analyzer
analyzer = LexicalSophisticationAnalyzer(language="ja", model_size="md")

# Load Japanese corpus references
selected_indices = ["BCCWJ_frequency", "CSJ_frequency"]

# Analyze Japanese text
results = analyzer.analyze_text(
    "私は毎日学校に行きます。",
    selected_indices
)

# Access frequency scores
for token in results['token_details']:
    print(f"{token['token']}: BCCWJ={token.get('BCCWJ_frequency_lemma', 'NA')}")
```

### English Text Analysis
```python
# Initialize English analyzer
analyzer = LexicalSophisticationAnalyzer(language="en", model_size="trf")

# Analyze with COCA frequency data
results = analyzer.analyze_text(
    "The students studied linguistics carefully.",
    ["COCA_spoken_frequency"]
)
```

## 🔧 Configuration

### Adding New Japanese Subcorpora

The system is designed for easy expansion. To add a new subcorpus (e.g., BCCWJ Books):

```yaml
# config/reference_lists.yaml
japanese:
  unigrams:
    BCCWJ_books_frequency:
      display_name: "BCCWJ Books - Frequency"
      description: "BCCWJ books subcorpus frequency data"
      files:
        token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
        lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
      format: "tsv"
      has_header: true
      enabled: true
      japanese_corpus: true
      columns:
        surface_form: 1   # lForm column
        lemma: 2          # lemma column
        pos: 3            # pos column
        frequency: 10     # PB_frequency column (books subcorpus)
```

No code changes required - the system automatically detects and integrates new configurations!

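One way such automatic detection could work is sketched below; this is a hypothetical reading of `config/reference_lists.yaml` and not necessarily how `ConfigManager` implements it:

```python
# Hypothetical sketch: discover enabled Japanese subcorpora from the YAML config.
import yaml

with open("config/reference_lists.yaml", encoding="utf-8") as fh:
    config = yaml.safe_load(fh)

# Any entry under japanese -> unigrams with enabled: true is picked up.
enabled = {
    name: entry
    for name, entry in config.get("japanese", {}).get("unigrams", {}).items()
    if entry.get("enabled")
}
print(sorted(enabled))
```
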
## 📚 Research Applications

This tool is ideal for:
- **Language Learning Research**: Analyzing text complexity for Japanese learners
- **Corpus Linguistics**: Cross-linguistic frequency analysis
- **Computational Linguistics**: Lexical sophistication measurement
- **Educational Assessment**: Text difficulty evaluation
- **Translation Studies**: Comparative lexical analysis

## 🤝 Contributing

1. Fork the repository
2. Create a feature branch (`git checkout -b feature/amazing-feature`)
3. Commit your changes (`git commit -m 'Add amazing feature'`)
4. Push to the branch (`git push origin feature/amazing-feature`)
5. Open a Pull Request

## 📄 License

This project is licensed under the CC BY-NC 4.0 License - see the [LICENSE](LICENSE) file for details.

## 🙏 Acknowledgments

- **BCCWJ**: National Institute for Japanese Language and Linguistics
- **CSJ**: National Institute for Japanese Language and Linguistics
- **COCA**: Mark Davies, Brigham Young University
- **SpaCy**: Explosion AI for robust NLP models

## 📞 Support

For questions, issues, or contributions:
- Open an issue on GitHub
- Contact: [Your contact information]

---

**Happy analyzing!** 🚀📊
config/reference_lists.yaml
CHANGED
@@ -137,6 +137,102 @@ english:

 japanese:
   unigrams:
+    BCCWJ_frequency:
+      display_name: "BCCWJ Written - Frequency"
+      description: "BCCWJ raw frequency counts for written Japanese"
+      files:
+        token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
+        lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
+      format: "tsv"
+      has_header: true
+      enabled: true
+      japanese_corpus: true
+      columns:
+        surface_form: 1   # lForm
+        lemma: 2          # lemma
+        pos: 3            # pos
+        frequency: 6      # primary measure column
+
+    BCCWJ_pmw:
+      display_name: "BCCWJ Written - Per Million Words"
+      description: "BCCWJ normalized frequency for written Japanese"
+      files:
+        token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
+        lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
+      format: "tsv"
+      has_header: true
+      enabled: true
+      japanese_corpus: true
+      columns:
+        surface_form: 1
+        lemma: 2
+        pos: 3
+        frequency: 7      # pmw column
+
+    BCCWJ_rank:
+      display_name: "BCCWJ Written - Frequency Rank"
+      description: "BCCWJ frequency ranking for written Japanese"
+      files:
+        token: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
+        lemma: "resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv"
+      format: "tsv"
+      has_header: true
+      enabled: true
+      japanese_corpus: true
+      columns:
+        surface_form: 1
+        lemma: 2
+        pos: 3
+        frequency: 0      # rank column
+
+    CSJ_frequency:
+      display_name: "CSJ Spoken - Frequency"
+      description: "CSJ raw frequency counts for spoken Japanese"
+      files:
+        token: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
+        lemma: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
+      format: "tsv"
+      has_header: true
+      enabled: true
+      japanese_corpus: true
+      columns:
+        surface_form: 1
+        lemma: 2
+        pos: 3
+        frequency: 6
+
+    CSJ_pmw:
+      display_name: "CSJ Spoken - Per Million Words"
+      description: "CSJ normalized frequency for spoken Japanese"
+      files:
+        token: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
+        lemma: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
+      format: "tsv"
+      has_header: true
+      enabled: true
+      japanese_corpus: true
+      columns:
+        surface_form: 1
+        lemma: 2
+        pos: 3
+        frequency: 7
+
+    CSJ_rank:
+      display_name: "CSJ Spoken - Frequency Rank"
+      description: "CSJ frequency ranking for spoken Japanese"
+      files:
+        token: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
+        lemma: "resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv"
+      format: "tsv"
+      has_header: true
+      enabled: true
+      japanese_corpus: true
+      columns:
+        surface_form: 1
+        lemma: 2
+        pos: 3
+        frequency: 0
+
     jp_frequency:
       display_name: "Japanese Frequency List"
       description: "Frequency data for Japanese words"

@@ -151,4 +247,4 @@ japanese:
       enabled: false  # Disabled until files exist

   # bigrams: {}
-  # trigrams: {}
+  # trigrams: {}
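A small sketch of how one of these column mappings might be applied when loading a reference TSV (indices taken from the config above, which appear to be 0-based; the actual loader in `text_analyzer/frequency_analyzer.py` may differ):

```python
# Sketch: read a frequency TSV using a "columns" mapping like the config above
# and build a composite-key table for the analyzer's lookups.
import csv

columns = {"surface_form": 1, "lemma": 2, "pos": 3, "frequency": 6}

def read_reference(path, columns, has_header=True):
    entries = {}
    with open(path, encoding="utf-8") as fh:
        reader = csv.reader(fh, delimiter="\t")
        if has_header:
            next(reader)
        for row in reader:
            try:
                freq = float(row[columns["frequency"]])
            except (ValueError, IndexError):
                continue  # skip malformed rows
            lemma, pos = row[columns["lemma"]], row[columns["pos"]]
            entries[f"{lemma}_{pos}"] = freq
    return entries
```
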
japanese-nlp-test.ipynb
ADDED
@@ -0,0 +1,819 @@
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# Japanese NLP Analysis: Comparative Study of UniDic-based Approaches\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"This notebook implements and compares two approaches for Japanese morphological analysis with BCCWJ frequency matching:\n",
|
| 10 |
+
"\n",
|
| 11 |
+
"- **Plan A**: MeCab (fugashi) + UniDic direct pipeline\n",
|
| 12 |
+
"- **Plan B**: GiNZA (Sudachi) + UniDic alignment pipeline\n",
|
| 13 |
+
"\n",
|
| 14 |
+
"Each approach is designed for reproducible setup, implementation, validation, and operational use."
|
| 15 |
+
]
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "markdown",
|
| 19 |
+
"metadata": {},
|
| 20 |
+
"source": [
|
| 21 |
+
"## 1. Environment Setup & Verification\n",
|
| 22 |
+
"\n",
|
| 23 |
+
"First, let's verify and set up our environment with all required packages."
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"cell_type": "code",
|
| 28 |
+
"execution_count": 10,
|
| 29 |
+
"metadata": {},
|
| 30 |
+
"outputs": [
|
| 31 |
+
{
|
| 32 |
+
"name": "stdout",
|
| 33 |
+
"output_type": "stream",
|
| 34 |
+
"text": [
|
| 35 |
+
"Python version: 3.12.2 (main, Feb 25 2024, 03:55:42) [Clang 17.0.6 ]\n",
|
| 36 |
+
"Working directory: /Users/eguchi/Dropbox/teaching/Tohoku-2025/linguistic-data-analysis-I/2025/notebooks\n",
|
| 37 |
+
"\n",
|
| 38 |
+
"Checking package availability:\n",
|
| 39 |
+
"✓ fugashi\n",
|
| 40 |
+
"✓ unidic\n",
|
| 41 |
+
"✗ unidic-lite - NOT FOUND\n",
|
| 42 |
+
"✓ spacy\n",
|
| 43 |
+
"✓ ginza\n",
|
| 44 |
+
"✗ ja-ginza - NOT FOUND\n",
|
| 45 |
+
"✓ sudachipy\n",
|
| 46 |
+
"✓ pandas\n",
|
| 47 |
+
"✓ numpy\n",
|
| 48 |
+
"✓ matplotlib\n",
|
| 49 |
+
"✓ collections (built-in)\n"
|
| 50 |
+
]
|
| 51 |
+
}
|
| 52 |
+
],
|
| 53 |
+
"source": [
|
| 54 |
+
"# Environment verification and setup\n",
|
| 55 |
+
"import sys\n",
|
| 56 |
+
"import subprocess\n",
|
| 57 |
+
"from pathlib import Path\n",
|
| 58 |
+
"\n",
|
| 59 |
+
"print(f\"Python version: {sys.version}\")\n",
|
| 60 |
+
"print(f\"Working directory: {Path.cwd()}\")\n",
|
| 61 |
+
"\n",
|
| 62 |
+
"# Required packages\n",
|
| 63 |
+
"required_packages = [\n",
|
| 64 |
+
" 'fugashi', 'unidic', 'unidic-lite', 'spacy', 'ginza', \n",
|
| 65 |
+
" 'ja-ginza', 'sudachipy', 'pandas', 'numpy', 'matplotlib', 'collections'\n",
|
| 66 |
+
"]\n",
|
| 67 |
+
"\n",
|
| 68 |
+
"print(\"\\nChecking package availability:\")\n",
|
| 69 |
+
"for package in required_packages:\n",
|
| 70 |
+
" try:\n",
|
| 71 |
+
" if package == 'collections':\n",
|
| 72 |
+
" import collections\n",
|
| 73 |
+
" print(f\"✓ {package} (built-in)\")\n",
|
| 74 |
+
" else:\n",
|
| 75 |
+
" __import__(package)\n",
|
| 76 |
+
" print(f\"✓ {package}\")\n",
|
| 77 |
+
" except ImportError:\n",
|
| 78 |
+
" print(f\"✗ {package} - NOT FOUND\")"
|
| 79 |
+
]
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"cell_type": "code",
|
| 83 |
+
"execution_count": 11,
|
| 84 |
+
"metadata": {},
|
| 85 |
+
"outputs": [
|
| 86 |
+
{
|
| 87 |
+
"name": "stdout",
|
| 88 |
+
"output_type": "stream",
|
| 89 |
+
"text": [
|
| 90 |
+
"scipy not available - will use numpy for correlation\n",
|
| 91 |
+
"All imports successful!\n"
|
| 92 |
+
]
|
| 93 |
+
}
|
| 94 |
+
],
|
| 95 |
+
"source": [
|
| 96 |
+
"# Import all necessary libraries\n",
|
| 97 |
+
"import pandas as pd\n",
|
| 98 |
+
"import numpy as np\n",
|
| 99 |
+
"import matplotlib.pyplot as plt\n",
|
| 100 |
+
"from collections import Counter, defaultdict\n",
|
| 101 |
+
"import time\n",
|
| 102 |
+
"import warnings\n",
|
| 103 |
+
"from typing import List, Tuple, Dict, Optional\n",
|
| 104 |
+
"\n",
|
| 105 |
+
"# Japanese NLP libraries\n",
|
| 106 |
+
"import fugashi\n",
|
| 107 |
+
"import unidic\n",
|
| 108 |
+
"import spacy\n",
|
| 109 |
+
"from spacy.tokens import Token, Doc\n",
|
| 110 |
+
"\n",
|
| 111 |
+
"# Statistical analysis\n",
|
| 112 |
+
"try:\n",
|
| 113 |
+
" from scipy.stats import spearmanr\n",
|
| 114 |
+
" scipy_available = True\n",
|
| 115 |
+
"except ImportError:\n",
|
| 116 |
+
" print(\"scipy not available - will use numpy for correlation\")\n",
|
| 117 |
+
" scipy_available = False\n",
|
| 118 |
+
"\n",
|
| 119 |
+
"print(\"All imports successful!\")\n",
|
| 120 |
+
"warnings.filterwarnings('ignore')"
|
| 121 |
+
]
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"cell_type": "code",
|
| 125 |
+
"execution_count": 12,
|
| 126 |
+
"metadata": {},
|
| 127 |
+
"outputs": [
|
| 128 |
+
{
|
| 129 |
+
"name": "stdout",
|
| 130 |
+
"output_type": "stream",
|
| 131 |
+
"text": [
|
| 132 |
+
"UniDic directory: /Users/eguchi/Dropbox/teaching/Tohoku-2025/linguistic-data-analysis-I/.venv/lib/python3.12/site-packages/unidic/dicdir\n",
|
| 133 |
+
"UniDic is properly installed\n",
|
| 134 |
+
"Fugashi + UniDic test successful: テスト\n"
|
| 135 |
+
]
|
| 136 |
+
}
|
| 137 |
+
],
|
| 138 |
+
"source": [
|
| 139 |
+
"# Check UniDic installation and download if needed\n",
|
| 140 |
+
"try:\n",
|
| 141 |
+
" print(f\"UniDic directory: {unidic.DICDIR}\")\n",
|
| 142 |
+
" print(\"UniDic is properly installed\")\n",
|
| 143 |
+
"except Exception as e:\n",
|
| 144 |
+
" print(f\"UniDic issue: {e}\")\n",
|
| 145 |
+
" print(\"You may need to run: python -m unidic download\")\n",
|
| 146 |
+
"\n",
|
| 147 |
+
"# Test basic fugashi functionality\n",
|
| 148 |
+
"try:\n",
|
| 149 |
+
" tagger = fugashi.Tagger(f'-d \"{unidic.DICDIR}\"')\n",
|
| 150 |
+
" test_result = list(tagger(\"テスト\"))\n",
|
| 151 |
+
" print(f\"Fugashi + UniDic test successful: {test_result[0].surface}\")\n",
|
| 152 |
+
"except Exception as e:\n",
|
| 153 |
+
" print(f\"Fugashi test failed: {e}\")"
|
| 154 |
+
]
|
| 155 |
+
},
|
| 156 |
+
{
|
| 157 |
+
"cell_type": "markdown",
|
| 158 |
+
"metadata": {},
|
| 159 |
+
"source": [
|
| 160 |
+
"## 2. Sample Data Preparation\n",
|
| 161 |
+
"\n",
|
| 162 |
+
"Let's create realistic Japanese text samples for testing our pipelines."
|
| 163 |
+
]
|
| 164 |
+
},
|
| 165 |
+
{
|
| 166 |
+
"cell_type": "code",
|
| 167 |
+
"execution_count": 13,
|
| 168 |
+
"metadata": {},
|
| 169 |
+
"outputs": [
|
| 170 |
+
{
|
| 171 |
+
"name": "stdout",
|
| 172 |
+
"output_type": "stream",
|
| 173 |
+
"text": [
|
| 174 |
+
"Sample texts prepared:\n",
|
| 175 |
+
" 1. 彼は日ごろから本を読むのが好きです。\n",
|
| 176 |
+
" 2. ひごろの勉強が大切だと思います。\n",
|
| 177 |
+
" 3. 日頃の努力が実を結ぶでしょう。\n",
|
| 178 |
+
" 4. 彼女は書きあらわすことが得意です。\n",
|
| 179 |
+
" 5. その問題を書き表すのは難しい。\n",
|
| 180 |
+
" 6. 今日は東京オリンピックについて話しましょう。\n",
|
| 181 |
+
" 7. コーヒーを飲んで、呑み込んで、また飲んでしまった。\n",
|
| 182 |
+
" 8. 国際的な協力が必要不可欠です。\n",
|
| 183 |
+
" 9. 機械学習の技術が進歩している。\n",
|
| 184 |
+
"10. 自然言語処理は興味深い分野だ。\n",
|
| 185 |
+
"\n",
|
| 186 |
+
"Extended corpus: 30 texts\n"
|
| 187 |
+
]
|
| 188 |
+
}
|
| 189 |
+
],
|
| 190 |
+
"source": [
|
| 191 |
+
"# Sample Japanese texts for testing\n",
|
| 192 |
+
"sample_texts = [\n",
|
| 193 |
+
" \"彼は日ごろから本を読むのが好きです。\",\n",
|
| 194 |
+
" \"ひごろの勉強が大切だと思います。\",\n",
|
| 195 |
+
" \"日頃の努力が実を結ぶでしょう。\",\n",
|
| 196 |
+
" \"彼女は書きあらわすことが得意です。\",\n",
|
| 197 |
+
" \"その問題を書き表すのは難しい。\",\n",
|
| 198 |
+
" \"今日は東京オリンピックについて話しましょう。\",\n",
|
| 199 |
+
" \"コーヒーを飲んで、呑み込んで、また飲んでしまった。\",\n",
|
| 200 |
+
" \"国際的な協力が必要不可欠です。\",\n",
|
| 201 |
+
" \"機械学習の技術が進歩している。\",\n",
|
| 202 |
+
" \"自然言語処理は興味深い分野だ。\"\n",
|
| 203 |
+
"]\n",
|
| 204 |
+
"\n",
|
| 205 |
+
"print(\"Sample texts prepared:\")\n",
|
| 206 |
+
"for i, text in enumerate(sample_texts, 1):\n",
|
| 207 |
+
" print(f\"{i:2d}. {text}\")\n",
|
| 208 |
+
"\n",
|
| 209 |
+
"# Create a larger corpus by repeating and slightly modifying texts\n",
|
| 210 |
+
"extended_corpus = sample_texts * 3 # Simulate frequency variations\n",
|
| 211 |
+
"print(f\"\\nExtended corpus: {len(extended_corpus)} texts\")"
|
| 212 |
+
]
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"cell_type": "code",
|
| 216 |
+
"execution_count": 14,
|
| 217 |
+
"metadata": {},
|
| 218 |
+
"outputs": [
|
| 219 |
+
{
|
| 220 |
+
"name": "stdout",
|
| 221 |
+
"output_type": "stream",
|
| 222 |
+
"text": [
|
| 223 |
+
"Mock BCCWJ frequency data:\n",
|
| 224 |
+
" lemma reading pos freq_bccwj key\n",
|
| 225 |
+
"0 日頃 ヒゴロ 名詞 1250 (日頃, ヒゴロ, 名詞)\n",
|
| 226 |
+
"1 本 ホン 名詞 8500 (本, ホン, 名詞)\n",
|
| 227 |
+
"2 読む ヨム 動詞 3200 (読む, ヨム, 動詞)\n",
|
| 228 |
+
"3 好き スキ 形容動詞 2100 (好き, スキ, 形容動詞)\n",
|
| 229 |
+
"4 勉強 ベンキョウ 名詞 4200 (勉強, ベンキョウ, 名詞)\n",
|
| 230 |
+
"5 大切 タイセツ 形容動詞 1800 (大切, タイセツ, 形容動詞)\n",
|
| 231 |
+
"6 思う オモウ 動詞 9500 (思う, オモウ, 動詞)\n",
|
| 232 |
+
"7 努力 ドリョク 名詞 2200 (努力, ドリョク, 名詞)\n",
|
| 233 |
+
"8 実 ミ 名詞 1100 (実, ミ, 名詞)\n",
|
| 234 |
+
"9 結ぶ ムスブ 動詞 800 (結ぶ, ムスブ, 動詞)\n",
|
| 235 |
+
"\n",
|
| 236 |
+
"Total entries: 25\n"
|
| 237 |
+
]
|
| 238 |
+
}
|
| 239 |
+
],
|
| 240 |
+
"source": [
|
| 241 |
+
"# Create mock BCCWJ frequency data for testing\n",
|
| 242 |
+
"# In real usage, this would be loaded from an actual BCCWJ frequency file\n",
|
| 243 |
+
"\n",
|
| 244 |
+
"mock_bccwj_data = [\n",
|
| 245 |
+
" ('日頃', 'ヒゴロ', '名詞', 1250),\n",
|
| 246 |
+
" ('本', 'ホン', '名詞', 8500),\n",
|
| 247 |
+
" ('読む', 'ヨム', '動詞', 3200),\n",
|
| 248 |
+
" ('好き', 'スキ', '形容動詞', 2100),\n",
|
| 249 |
+
" ('勉強', 'ベンキョウ', '名詞', 4200),\n",
|
| 250 |
+
" ('大切', 'タイセツ', '形容動詞', 1800),\n",
|
| 251 |
+
" ('思う', 'オモウ', '動詞', 9500),\n",
|
| 252 |
+
" ('努力', 'ドリョク', '名詞', 2200),\n",
|
| 253 |
+
" ('実', 'ミ', '名詞', 1100),\n",
|
| 254 |
+
" ('結ぶ', 'ムスブ', '動詞', 800),\n",
|
| 255 |
+
" ('書く', 'カク', '動詞', 4100),\n",
|
| 256 |
+
" ('表す', 'アラワス', '動詞', 1500),\n",
|
| 257 |
+
" ('得意', 'トクイ', '形容動詞', 1300),\n",
|
| 258 |
+
" ('問題', 'モンダイ', '名詞', 6200),\n",
|
| 259 |
+
" ('難しい', 'ムズカシイ', '形容詞', 3800),\n",
|
| 260 |
+
" ('今日', 'キョウ', '名詞', 5500),\n",
|
| 261 |
+
" ('東京', 'トウキョウ', '名詞', 4800),\n",
|
| 262 |
+
" ('話す', 'ハナス', '動詞', 3600),\n",
|
| 263 |
+
" ('飲む', 'ノム', '動詞', 2400),\n",
|
| 264 |
+
" ('呑む', 'ノム', '動詞', 150),\n",
|
| 265 |
+
" ('国際', 'コクサイ', '名詞', 2800),\n",
|
| 266 |
+
" ('協力', 'キョウリョク', '名詞', 1900),\n",
|
| 267 |
+
" ('必要', 'ヒツヨウ', '形容動詞', 4500),\n",
|
| 268 |
+
" ('技術', 'ギジュツ', '名詞', 3900),\n",
|
| 269 |
+
" ('進歩', 'シンポ', '名詞', 1100)\n",
|
| 270 |
+
"]\n",
|
| 271 |
+
"\n",
|
| 272 |
+
"# Create DataFrame\n",
|
| 273 |
+
"df_bccwj = pd.DataFrame(mock_bccwj_data, columns=['lemma', 'reading', 'pos', 'freq_bccwj'])\n",
|
| 274 |
+
"df_bccwj['key'] = list(zip(df_bccwj.lemma, df_bccwj.reading, df_bccwj.pos))\n",
|
| 275 |
+
"\n",
|
| 276 |
+
"print(\"Mock BCCWJ frequency data:\")\n",
|
| 277 |
+
"print(df_bccwj.head(10))\n",
|
| 278 |
+
"print(f\"\\nTotal entries: {len(df_bccwj)}\")"
|
| 279 |
+
]
|
| 280 |
+
},
|
| 281 |
+
{
|
| 282 |
+
"cell_type": "markdown",
|
| 283 |
+
"metadata": {},
|
| 284 |
+
"source": [
|
| 285 |
+
"## 3. Plan A: MeCab (fugashi) + UniDic Direct Pipeline\n",
|
| 286 |
+
"\n",
|
| 287 |
+
"### A-1 to A-3: Setup and Configuration\n",
|
| 288 |
+
"\n",
|
| 289 |
+
"UniDic provides the morphological analysis system used in BCCWJ, making it ideal for frequency matching."
|
| 290 |
+
]
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"cell_type": "code",
|
| 294 |
+
"execution_count": 14,
|
| 295 |
+
"metadata": {},
|
| 296 |
+
"outputs": [
|
| 297 |
+
{
|
| 298 |
+
"name": "stdout",
|
| 299 |
+
"output_type": "stream",
|
| 300 |
+
"text": [
|
| 301 |
+
"Initializing Plan A: fugashi + UniDic pipeline\n",
|
| 302 |
+
"Tagger initialized with UniDic dictionary: /Users/eguchi/Dropbox/teaching/Tohoku-2025/linguistic-data-analysis-I/.venv/lib/python3.12/site-packages/unidic/dicdir\n",
|
| 303 |
+
"\n",
|
| 304 |
+
"Test analysis of '日ごろから勉強している。':\n",
|
| 305 |
+
" 日ごろ -> 日頃 [名,詞,,,普,通,名,詞,,,副,詞,可,能,,,*]\n",
|
| 306 |
+
" から -> から [助,詞,,,格,助,詞,,,*,,,*]\n",
|
| 307 |
+
" 勉強 -> 勉強 [名,詞,,,普,通,名,詞,,,サ,変,可,能,,,*]\n",
|
| 308 |
+
" し -> 為る [動,詞,,,非,自,立,可,能,,,*,,,*]\n",
|
| 309 |
+
" て -> て [助,詞,,,接,続,助,詞,,,*,,,*]\n",
|
| 310 |
+
" いる -> 居る [動,詞,,,非,自,立,可,能,,,*,,,*]\n",
|
| 311 |
+
" 。 -> 。 [補,助,記,号,,,句,点,,,*,,,*]\n"
|
| 312 |
+
]
|
| 313 |
+
}
|
| 314 |
+
],
|
| 315 |
+
"source": [
|
| 316 |
+
"# A-3: Initialize fugashi with UniDic\n",
|
| 317 |
+
"print(\"Initializing Plan A: fugashi + UniDic pipeline\")\n",
|
| 318 |
+
"\n",
|
| 319 |
+
"# Initialize tagger with explicit UniDic path\n",
|
| 320 |
+
"tagger_a = fugashi.Tagger(f'-d \"{unidic.DICDIR}\"')\n",
|
| 321 |
+
"print(f\"Tagger initialized with UniDic dictionary: {unidic.DICDIR}\")\n",
|
| 322 |
+
"\n",
|
| 323 |
+
"# Test the tagger\n",
|
| 324 |
+
"test_text = \"日ごろから勉強している。\"\n",
|
| 325 |
+
"tokens = list(tagger_a(test_text))\n",
|
| 326 |
+
"print(f\"\\nTest analysis of '{test_text}':\")\n",
|
| 327 |
+
"for token in tokens:\n",
|
| 328 |
+
" print(f\" {token.surface} -> {token.feature.lemma} [{','.join(token.pos)}]\")"
|
| 329 |
+
]
|
| 330 |
+
},
|
| 331 |
+
{
|
| 332 |
+
"cell_type": "code",
|
| 333 |
+
"execution_count": 18,
|
| 334 |
+
"metadata": {},
|
| 335 |
+
"outputs": [
|
| 336 |
+
{
|
| 337 |
+
"name": "stdout",
|
| 338 |
+
"output_type": "stream",
|
| 339 |
+
"text": [
|
| 340 |
+
"Extracted keys from '日ごろから勉強している。':\n",
|
| 341 |
+
" (日ごろ, ヒゴロ, 名)\n",
|
| 342 |
+
" (から, カラ, 助)\n",
|
| 343 |
+
" (勉強, ベンキョー, 名)\n",
|
| 344 |
+
" (する, スル, 動)\n",
|
| 345 |
+
" (て, テ, 助)\n",
|
| 346 |
+
" (いる, イル, 動)\n",
|
| 347 |
+
" (。, *, 補)\n"
|
| 348 |
+
]
|
| 349 |
+
}
|
| 350 |
+
],
|
| 351 |
+
"source": [
|
| 352 |
+
"# A-4: Morphological field extraction function\n",
|
| 353 |
+
"def iter_lemma_keys_plan_a(text: str, tagger) -> List[Tuple[str, str, str]]:\n",
|
| 354 |
+
" \"\"\"\n",
|
| 355 |
+
" Extract (lemma, reading, pos_major) tuples from text using UniDic.\n",
|
| 356 |
+
" \n",
|
| 357 |
+
" Args:\n",
|
| 358 |
+
" text: Input Japanese text\n",
|
| 359 |
+
" tagger: fugashi Tagger instance\n",
|
| 360 |
+
" \n",
|
| 361 |
+
" Returns:\n",
|
| 362 |
+
" List of (dictionary_form, reading, pos_major) tuples\n",
|
| 363 |
+
" \"\"\"\n",
|
| 364 |
+
" keys = []\n",
|
| 365 |
+
" for m in tagger(text):\n",
|
| 366 |
+
" if m.surface.strip(): # Skip empty tokens\n",
|
| 367 |
+
" # UniDic POS is hierarchical; use major category (pos[0])\n",
|
| 368 |
+
" pos_major = m.pos[0] if m.pos else 'UNKNOWN'\n",
|
| 369 |
+
" lemma = m.feature[10] if m.feature[10] else m.surface\n",
|
| 370 |
+
" reading = m.feature[11] if m.feature[11] else ''\n",
|
| 371 |
+
" keys.append((lemma, reading, pos_major))\n",
|
| 372 |
+
" return keys\n",
|
| 373 |
+
"\n",
|
| 374 |
+
"# Test the extraction function\n",
|
| 375 |
+
"test_keys = iter_lemma_keys_plan_a(test_text, tagger_a)\n",
|
| 376 |
+
"print(f\"Extracted keys from '{test_text}':\")\n",
|
| 377 |
+
"for lemma, reading, pos in test_keys:\n",
|
| 378 |
+
" print(f\" ({lemma}, {reading}, {pos})\")"
|
| 379 |
+
]
|
| 380 |
+
},
|
| 381 |
+
{
|
| 382 |
+
"cell_type": "code",
|
| 383 |
+
"execution_count": 19,
|
| 384 |
+
"metadata": {},
|
| 385 |
+
"outputs": [
|
| 386 |
+
{
|
| 387 |
+
"name": "stdout",
|
| 388 |
+
"output_type": "stream",
|
| 389 |
+
"text": [
|
| 390 |
+
"Extracted keys from '日ごろから勉強している。' (fixed version):\n",
|
| 391 |
+
" (日ごろ, ヒゴロ, 名)\n",
|
| 392 |
+
" (から, カラ, 助)\n",
|
| 393 |
+
" (勉強, ベンキョー, 名)\n",
|
| 394 |
+
" (する, シ, 動)\n",
|
| 395 |
+
" (て, テ, 助)\n",
|
| 396 |
+
" (いる, イル, 動)\n",
|
| 397 |
+
" (。, *, 補)\n"
|
| 398 |
+
]
|
| 399 |
+
}
|
| 400 |
+
],
|
| 401 |
+
"source": [
|
| 402 |
+
"# Fixed version with proper fugashi/UniDic attribute handling\n",
|
| 403 |
+
"def iter_lemma_keys_fixed(text: str, tagger) -> List[Tuple[str, str, str]]:\n",
|
| 404 |
+
" \"\"\"\n",
|
| 405 |
+
" Extract (lemma, reading, pos_major) tuples from text using UniDic.\n",
|
| 406 |
+
" Fixed version that handles fugashi attribute variations.\n",
|
| 407 |
+
" \"\"\"\n",
|
| 408 |
+
" keys = []\n",
|
| 409 |
+
" for m in tagger(text):\n",
|
| 410 |
+
" if m.surface.strip(): # Skip empty tokens\n",
|
| 411 |
+
" # UniDic POS is hierarchical; use major category (pos[0])\n",
|
| 412 |
+
" pos_major = m.pos[0] if m.pos else 'UNKNOWN'\n",
|
| 413 |
+
" \n",
|
| 414 |
+
" # Handle different attribute names for lemma\n",
|
| 415 |
+
" try:\n",
|
| 416 |
+
" lemma = m.lemma if hasattr(m, 'lemma') else m.feature[10]\n",
|
| 417 |
+
" except:\n",
|
| 418 |
+
" lemma = m.surface # fallback\n",
|
| 419 |
+
" \n",
|
| 420 |
+
" # Handle different attribute names for reading\n",
|
| 421 |
+
" try:\n",
|
| 422 |
+
" reading = m.feature[9] if len(m.feature) > 9 else ''\n",
|
| 423 |
+
" except:\n",
|
| 424 |
+
" reading = '' # fallback\n",
|
| 425 |
+
" \n",
|
| 426 |
+
" keys.append((lemma, reading, pos_major))\n",
|
| 427 |
+
" return keys\n",
|
| 428 |
+
"\n",
|
| 429 |
+
"# Use the fixed function\n",
|
| 430 |
+
"iter_lemma_keys_plan_a = iter_lemma_keys_fixed\n",
|
| 431 |
+
"\n",
|
| 432 |
+
"# Test the fixed function\n",
|
| 433 |
+
"test_keys = iter_lemma_keys_plan_a(test_text, tagger_a)\n",
|
| 434 |
+
"print(f\"Extracted keys from '{test_text}' (fixed version):\")\n",
|
| 435 |
+
"for lemma, reading, pos in test_keys:\n",
|
| 436 |
+
" print(f\" ({lemma}, {reading}, {pos})\")"
|
| 437 |
+
]
|
| 438 |
+
},
|
| 439 |
+
{
|
| 440 |
+
"cell_type": "code",
|
| 441 |
+
"execution_count": 20,
|
| 442 |
+
"metadata": {},
|
| 443 |
+
"outputs": [
|
| 444 |
+
{
|
| 445 |
+
"name": "stdout",
|
| 446 |
+
"output_type": "stream",
|
| 447 |
+
"text": [
|
| 448 |
+
"Analyzing 30 texts with Plan A...\n",
|
| 449 |
+
"\n",
|
| 450 |
+
"Plan A Results (top 15):\n",
|
| 451 |
+
" lemma reading pos freq_local freq_bccwj\n",
|
| 452 |
+
"11 。 * 補 30 NaN\n",
|
| 453 |
+
"8 が ガ 助 18 NaN\n",
|
| 454 |
+
"1 は ワ 助 15 NaN\n",
|
| 455 |
+
"7 の ノ 助 15 NaN\n",
|
| 456 |
+
"5 を オ 助 12 NaN\n",
|
| 457 |
+
"10 です デス 助 9 NaN\n",
|
| 458 |
+
"42 で デ 助 9 NaN\n",
|
| 459 |
+
"15 だ ダ 助 6 NaN\n",
|
| 460 |
+
"37 て テ 助 6 NaN\n",
|
| 461 |
+
"41 飲む ノン 動 6 NaN\n",
|
| 462 |
+
"43 、 * 補 6 NaN\n",
|
| 463 |
+
"48 国際 コクサイ 名 3 NaN\n",
|
| 464 |
+
"47 た タ 助 3 NaN\n",
|
| 465 |
+
"46 しまう シマッ 動 3 NaN\n",
|
| 466 |
+
"0 彼 カレ 代 3 NaN\n"
|
| 467 |
+
]
|
| 468 |
+
}
|
| 469 |
+
],
|
| 470 |
+
"source": [
|
| 471 |
+
"# A-5: Frequency analysis with BCCWJ matching\n",
|
| 472 |
+
"def analyze_corpus_plan_a(corpus: List[str], tagger, bccwj_df: pd.DataFrame) -> pd.DataFrame:\n",
|
| 473 |
+
" \"\"\"Analyze corpus using Plan A and match with BCCWJ frequencies.\"\"\"\n",
|
| 474 |
+
" freq = Counter()\n",
|
| 475 |
+
" \n",
|
| 476 |
+
" print(f\"Analyzing {len(corpus)} texts with Plan A...\")\n",
|
| 477 |
+
" for text in corpus:\n",
|
| 478 |
+
" for key in iter_lemma_keys_plan_a(text, tagger):\n",
|
| 479 |
+
" freq[key] += 1\n",
|
| 480 |
+
" \n",
|
| 481 |
+
" # Convert to DataFrame\n",
|
| 482 |
+
" rows = []\n",
|
| 483 |
+
" for (lemma, reading, pos), count in freq.items():\n",
|
| 484 |
+
" rows.append((lemma, reading, pos, count))\n",
|
| 485 |
+
" \n",
|
| 486 |
+
" df_local = pd.DataFrame(rows, columns=['lemma', 'reading', 'pos', 'freq_local'])\n",
|
| 487 |
+
" df_local['key'] = list(zip(df_local.lemma, df_local.reading, df_local.pos))\n",
|
| 488 |
+
" \n",
|
| 489 |
+
" # Merge with BCCWJ data\n",
|
| 490 |
+
" merged = df_local.merge(bccwj_df[['key', 'freq_bccwj']], on='key', how='left')\n",
|
| 491 |
+
" \n",
|
| 492 |
+
" return merged.sort_values('freq_local', ascending=False)\n",
|
| 493 |
+
"\n",
|
| 494 |
+
"# Run Plan A analysis\n",
|
| 495 |
+
"results_a = analyze_corpus_plan_a(extended_corpus, tagger_a, df_bccwj)\n",
|
| 496 |
+
"print(f\"\\nPlan A Results (top 15):\")\n",
|
| 497 |
+
"print(results_a.head(15)[['lemma', 'reading', 'pos', 'freq_local', 'freq_bccwj']])"
|
| 498 |
+
]
|
| 499 |
+
},
|
| 500 |
+
{
|
| 501 |
+
"cell_type": "code",
|
| 502 |
+
"execution_count": 21,
|
| 503 |
+
"metadata": {},
|
| 504 |
+
"outputs": [
|
| 505 |
+
{
|
| 506 |
+
"name": "stdout",
|
| 507 |
+
"output_type": "stream",
|
| 508 |
+
"text": [
|
| 509 |
+
"Plan A Evaluation Metrics:\n",
|
| 510 |
+
" type_coverage: 0.000\n",
|
| 511 |
+
" token_coverage: 0.000\n",
|
| 512 |
+
" correlation: None\n",
|
| 513 |
+
" p_value: None\n",
|
| 514 |
+
" total_types: 66\n",
|
| 515 |
+
" matched_types: 0\n",
|
| 516 |
+
" total_tokens: 297\n",
|
| 517 |
+
" matched_tokens: 0\n"
|
| 518 |
+
]
|
| 519 |
+
}
|
| 520 |
+
],
|
| 521 |
+
"source": [
|
| 522 |
+
"# A-6: Evaluation metrics for Plan A\n",
|
| 523 |
+
"def calculate_metrics(df: pd.DataFrame) -> Dict[str, float]:\n",
|
| 524 |
+
" \"\"\"Calculate coverage and correlation metrics.\"\"\"\n",
|
| 525 |
+
" # Coverage: percentage of local tokens found in BCCWJ\n",
|
| 526 |
+
" matched = df.dropna(subset=['freq_bccwj'])\n",
|
| 527 |
+
" coverage = len(matched) / len(df) * 100\n",
|
| 528 |
+
" \n",
|
| 529 |
+
" # Token coverage (by frequency)\n",
|
| 530 |
+
" total_tokens = df['freq_local'].sum()\n",
|
| 531 |
+
" matched_tokens = matched['freq_local'].sum()\n",
|
| 532 |
+
" token_coverage = matched_tokens / total_tokens * 100\n",
|
| 533 |
+
" \n",
|
| 534 |
+
" # Spearman correlation for matched items\n",
|
| 535 |
+
" if len(matched) > 1:\n",
|
| 536 |
+
" if scipy_available:\n",
|
| 537 |
+
" correlation, p_value = spearmanr(matched['freq_local'], matched['freq_bccwj'])\n",
|
| 538 |
+
" else:\n",
|
| 539 |
+
" correlation = np.corrcoef(matched['freq_local'].rank(), matched['freq_bccwj'].rank())[0,1]\n",
|
| 540 |
+
" p_value = None\n",
|
| 541 |
+
" else:\n",
|
| 542 |
+
" correlation, p_value = None, None\n",
|
| 543 |
+
" \n",
|
| 544 |
+
" return {\n",
|
| 545 |
+
" 'type_coverage': coverage,\n",
|
| 546 |
+
" 'token_coverage': token_coverage,\n",
|
| 547 |
+
" 'correlation': correlation,\n",
|
| 548 |
+
" 'p_value': p_value,\n",
|
| 549 |
+
" 'total_types': len(df),\n",
|
| 550 |
+
" 'matched_types': len(matched),\n",
|
| 551 |
+
" 'total_tokens': total_tokens,\n",
|
| 552 |
+
" 'matched_tokens': matched_tokens\n",
|
| 553 |
+
" }\n",
|
| 554 |
+
"\n",
|
| 555 |
+
"metrics_a = calculate_metrics(results_a)\n",
|
| 556 |
+
"print(\"Plan A Evaluation Metrics:\")\n",
|
| 557 |
+
"for key, value in metrics_a.items():\n",
|
| 558 |
+
" if isinstance(value, float) and value is not None:\n",
|
| 559 |
+
" print(f\" {key}: {value:.3f}\")\n",
|
| 560 |
+
" else:\n",
|
| 561 |
+
" print(f\" {key}: {value}\")"
|
| 562 |
+
]
|
| 563 |
+
},
|
| 564 |
+
{
|
| 565 |
+
"cell_type": "markdown",
|
| 566 |
+
"metadata": {},
|
| 567 |
+
"source": [
|
| 568 |
+
"# Using Fugashi"
|
| 569 |
+
]
|
| 570 |
+
},
|
| 571 |
+
{
|
| 572 |
+
"cell_type": "code",
|
| 573 |
+
"execution_count": 17,
|
| 574 |
+
"metadata": {},
|
| 575 |
+
"outputs": [
|
| 576 |
+
{
|
| 577 |
+
"name": "stdout",
|
| 578 |
+
"output_type": "stream",
|
| 579 |
+
"text": [
|
| 580 |
+
"彼 [('彼', '代名詞', '代')]\n",
|
| 581 |
+
"は [('は', '助詞', '助')]\n",
|
| 582 |
+
"日ごろ [('日頃', '名詞', '名')]\n",
|
| 583 |
+
"本 [('本', '名詞', '名')]\n",
|
| 584 |
+
"を [('を', '助詞', '助')]\n",
|
| 585 |
+
"読む [('読む', '動詞', '動')]\n",
|
| 586 |
+
"。 [('。', '補助記号', '補')]\n"
|
| 587 |
+
]
|
| 588 |
+
}
|
| 589 |
+
],
|
| 590 |
+
"source": [
|
| 591 |
+
"import fugashi, unidic\n",
|
| 592 |
+
"from spacy.tokens import Token\n",
|
| 593 |
+
"tagger = fugashi.Tagger()\n",
|
| 594 |
+
"tagger = fugashi.Tagger(f'-d \"{unidic.DICDIR}\"')\n",
|
| 595 |
+
"\n",
|
| 596 |
+
"if not Token.has_extension(\"unidic_lemmas\"):\n",
|
| 597 |
+
" Token.set_extension(\"unidic_lemmas\", default=None)\n",
|
| 598 |
+
"\n",
|
| 599 |
+
"def enrich_with_unidic(doc):\n",
|
| 600 |
+
" text = doc.text\n",
|
| 601 |
+
" # GiNZA token start index -> token\n",
|
| 602 |
+
" start_map = {tok.idx: tok for tok in doc}\n",
|
| 603 |
+
" cursor = 0\n",
|
| 604 |
+
" for m in tagger(text):\n",
|
| 605 |
+
" surf = m.surface\n",
|
| 606 |
+
" start = text.find(surf, cursor)\n",
|
| 607 |
+
" if start < 0:\n",
|
| 608 |
+
" continue\n",
|
| 609 |
+
" cursor = start + len(surf)\n",
|
| 610 |
+
" tok = start_map.get(start)\n",
|
| 611 |
+
" if tok:\n",
|
| 612 |
+
" if tok._.unidic_lemmas is None:\n",
|
| 613 |
+
" tok._.unidic_lemmas = []\n",
|
| 614 |
+
" tok._.unidic_lemmas.append(\n",
|
| 615 |
+
" (m.feature.lemma, m.feature.pos1, m.pos[0])\n",
|
| 616 |
+
" )\n",
|
| 617 |
+
" return doc\n",
|
| 618 |
+
"\n",
|
| 619 |
+
"doc = enrich_with_unidic(doc)\n",
|
| 620 |
+
"for t in doc:\n",
|
| 621 |
+
" print(t.text, t._.unidic_lemmas)"
|
| 622 |
+
]
|
| 623 |
+
},
|
| 624 |
+
{
|
| 625 |
+
"cell_type": "code",
|
| 626 |
+
"execution_count": 5,
|
| 627 |
+
"metadata": {},
|
| 628 |
+
"outputs": [],
|
| 629 |
+
"source": [
|
| 630 |
+
"text = \"日頃からの日ごろをてっていする。\""
|
| 631 |
+
]
|
| 632 |
+
},
|
| 633 |
+
{
|
| 634 |
+
"cell_type": "code",
|
| 635 |
+
"execution_count": 6,
|
| 636 |
+
"metadata": {},
|
| 637 |
+
"outputs": [],
|
| 638 |
+
"source": [
|
| 639 |
+
"import spacy\n",
|
| 640 |
+
"from fugashi import Tagger\n",
|
| 641 |
+
"import unidic # or unidic_lite\n",
|
| 642 |
+
"\n",
|
| 643 |
+
"nlp = spacy.load(\"ja_ginza\")\n",
|
| 644 |
+
"tagger = Tagger(f'-d \"{unidic.DICDIR}\"') # フル UniDic\n",
|
| 645 |
+
"doc = nlp(text)\n",
|
| 646 |
+
"mecab_tokens = list(tagger(text))\n",
|
| 647 |
+
"# → 文字オフセットでアライメントして doc の token に UniDic 情報を付与"
|
| 648 |
+
]
|
| 649 |
+
},
|
| 650 |
+
{
|
| 651 |
+
"cell_type": "code",
|
| 652 |
+
"execution_count": 7,
|
| 653 |
+
"metadata": {},
|
| 654 |
+
"outputs": [
|
| 655 |
+
{
|
| 656 |
+
"data": {
|
| 657 |
+
"text/plain": [
|
| 658 |
+
"[日頃, から, の, 日ごろ, を, てってい, する, 。]"
|
| 659 |
+
]
|
| 660 |
+
},
|
| 661 |
+
"execution_count": 7,
|
| 662 |
+
"metadata": {},
|
| 663 |
+
"output_type": "execute_result"
|
| 664 |
+
}
|
| 665 |
+
],
|
| 666 |
+
"source": [
|
| 667 |
+
"mecab_tokens"
|
| 668 |
+
]
|
| 669 |
+
},
|
| 670 |
+
{
|
| 671 |
+
"cell_type": "code",
|
| 672 |
+
"execution_count": 8,
|
| 673 |
+
"metadata": {},
|
| 674 |
+
"outputs": [
|
| 675 |
+
{
|
| 676 |
+
"name": "stdout",
|
| 677 |
+
"output_type": "stream",
|
| 678 |
+
"text": [
|
| 679 |
+
"<fugashi.fugashi.Tagger object at 0x1183bad80>\n"
|
| 680 |
+
]
|
| 681 |
+
}
|
| 682 |
+
],
|
| 683 |
+
"source": [
|
| 684 |
+
"print(tagger)"
|
| 685 |
+
]
|
| 686 |
+
},
|
| 687 |
+
{
|
| 688 |
+
"cell_type": "code",
|
| 689 |
+
"execution_count": 9,
|
| 690 |
+
"metadata": {},
|
| 691 |
+
"outputs": [
|
| 692 |
+
{
|
| 693 |
+
"name": "stdout",
|
| 694 |
+
"output_type": "stream",
|
| 695 |
+
"text": [
|
| 696 |
+
"Using unidic at: /Users/eguchi/Dropbox/teaching/Tohoku-2025/linguistic-data-analysis-I/.venv/lib/python3.12/site-packages/unidic/dicdir\n"
|
| 697 |
+
]
|
| 698 |
+
}
|
| 699 |
+
],
|
| 700 |
+
"source": [
|
| 701 |
+
"import unidic\n",
|
| 702 |
+
"print(\"Using unidic at:\", unidic.DICDIR)"
|
| 703 |
+
]
|
| 704 |
+
},
|
| 705 |
+
{
|
| 706 |
+
"cell_type": "code",
|
| 707 |
+
"execution_count": 10,
|
| 708 |
+
"metadata": {},
|
| 709 |
+
"outputs": [
|
| 710 |
+
{
|
| 711 |
+
"name": "stdout",
|
| 712 |
+
"output_type": "stream",
|
| 713 |
+
"text": [
|
| 714 |
+
"feature_len: 29\n"
|
| 715 |
+
]
|
| 716 |
+
}
|
| 717 |
+
],
|
| 718 |
+
"source": [
|
| 719 |
+
"sample = next(iter(tagger(\"テスト\")))\n",
|
| 720 |
+
"print(\"feature_len:\", len(sample.feature))\n",
|
| 721 |
+
"# 17 = unidic-lite (2.1.2), 29前後 = フル UniDic 3.x"
|
| 722 |
+
]
|
| 723 |
+
},
|
| 724 |
+
{
|
| 725 |
+
"cell_type": "code",
|
| 726 |
+
"execution_count": null,
|
| 727 |
+
"metadata": {},
|
| 728 |
+
"outputs": [
|
| 729 |
+
{
|
| 730 |
+
"name": "stdout",
|
| 731 |
+
"output_type": "stream",
|
| 732 |
+
"text": [
|
| 733 |
+
"['dictionary_info']\n"
|
| 734 |
+
]
|
| 735 |
+
}
|
| 736 |
+
],
|
| 737 |
+
"source": [
|
| 738 |
+
"print([a for a in dir(tagger) if 'dic' in a.lower()])"
|
| 739 |
+
]
|
| 740 |
+
},
|
| 741 |
+
{
|
| 742 |
+
"cell_type": "code",
|
| 743 |
+
"execution_count": 12,
|
| 744 |
+
"metadata": {},
|
| 745 |
+
"outputs": [
|
| 746 |
+
{
|
| 747 |
+
"name": "stdout",
|
| 748 |
+
"output_type": "stream",
|
| 749 |
+
"text": [
|
| 750 |
+
"Available attrs: ['char_type', 'feature', 'feature_raw', 'is_unk', 'length', 'pos', 'posid', 'rlength', 'stat', 'surface', 'white_space']\n"
|
| 751 |
+
]
|
| 752 |
+
}
|
| 753 |
+
],
|
| 754 |
+
"source": [
|
| 755 |
+
"import fugashi\n",
|
| 756 |
+
"from fugashi import Tagger\n",
|
| 757 |
+
"\n",
|
| 758 |
+
"tagger = Tagger() # まずオプションなし\n",
|
| 759 |
+
"m = next(iter(tagger(\"日ごろ\")))\n",
|
| 760 |
+
"print(\"Available attrs:\", [a for a in dir(m) if not a.startswith('_')][:25])"
|
| 761 |
+
]
|
| 762 |
+
},
|
| 763 |
+
{
|
| 764 |
+
"cell_type": "code",
|
| 765 |
+
"execution_count": 13,
|
| 766 |
+
"metadata": {},
|
| 767 |
+
"outputs": [
|
| 768 |
+
{
|
| 769 |
+
"name": "stdout",
|
| 770 |
+
"output_type": "stream",
|
| 771 |
+
"text": [
|
| 772 |
+
"Tagger repr: <fugashi.fugashi.Tagger object at 0x13f33b5c0>\n",
|
| 773 |
+
"surface: 日ごろ\n",
|
| 774 |
+
"feature_len: 29\n",
|
| 775 |
+
"raw feature: UnidicFeatures29(pos1='名詞', pos2='普通名詞', pos3='副詞可能', pos4='*', cType='*', cForm='*', lForm='ヒゴロ', lemma='日頃', orth='日ごろ', pron='ヒゴロ', orthBase='日ごろ', pronBase='ヒゴロ', goshu='和', iType='*', iForm='*', fType='*', fForm='*', iConType='*', fConType='*', type='体', kana='ヒゴロ', kanaBase='ヒゴロ', form='ヒゴロ', formBase='ヒゴロ', aType='0', aConType='C2', aModType='*', lid='8605061500510720', lemma_id='31305')\n"
|
| 776 |
+
]
|
| 777 |
+
}
|
| 778 |
+
],
|
| 779 |
+
"source": [
|
| 780 |
+
"import fugashi\n",
|
| 781 |
+
"t = fugashi.Tagger()\n",
|
| 782 |
+
"print(\"Tagger repr:\", t) # ここに 'ipa' や 'unidic' などヒントが出ることが多い\n",
|
| 783 |
+
"\n",
|
| 784 |
+
"w = next(iter(t(\"日ごろ\")))\n",
|
| 785 |
+
"print(\"surface:\", w.surface)\n",
|
| 786 |
+
"print(\"feature_len:\", len(w.feature))\n",
|
| 787 |
+
"print(\"raw feature:\", w.feature) # まず 1語分"
|
| 788 |
+
]
|
| 789 |
+
},
|
| 790 |
+
{
|
| 791 |
+
"cell_type": "code",
|
| 792 |
+
"execution_count": null,
|
| 793 |
+
"metadata": {},
|
| 794 |
+
"outputs": [],
|
| 795 |
+
"source": []
|
| 796 |
+
}
|
| 797 |
+
],
|
| 798 |
+
"metadata": {
|
| 799 |
+
"kernelspec": {
|
| 800 |
+
"display_name": "Python 3",
|
| 801 |
+
"language": "python",
|
| 802 |
+
"name": "python3"
|
| 803 |
+
},
|
| 804 |
+
"language_info": {
|
| 805 |
+
"codemirror_mode": {
|
| 806 |
+
"name": "ipython",
|
| 807 |
+
"version": 3
|
| 808 |
+
},
|
| 809 |
+
"file_extension": ".py",
|
| 810 |
+
"mimetype": "text/x-python",
|
| 811 |
+
"name": "python",
|
| 812 |
+
"nbconvert_exporter": "python",
|
| 813 |
+
"pygments_lexer": "ipython3",
|
| 814 |
+
"version": "3.12.0"
|
| 815 |
+
}
|
| 816 |
+
},
|
| 817 |
+
"nbformat": 4,
|
| 818 |
+
"nbformat_minor": 4
|
| 819 |
+
}
|
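A note on the Plan A results above: matching against BCCWJ produced zero type and token coverage (0 of 66 types, 0 of 297 tokens), which means the local (lemma, reading, pos) keys never lined up with the reference keys; for example, the single-character POS used above ('動', '代') will not match a full UniDic POS such as '動詞'. The following is a minimal, hypothetical sketch of building the reference side of the lookup with an explicit key shape; the column names (lemma, lForm, pos, frequency) are assumed from the sample BCCWJ rows used elsewhere in this commit.

import pandas as pd

def load_bccwj_lookup(path: str) -> dict:
    # Read only the columns needed for the composite key (assumed header names).
    df = pd.read_csv(path, sep="\t", usecols=["lemma", "lForm", "pos", "frequency"])
    # Reduce the hierarchical POS to its first level, e.g. "助詞-格助詞" -> "助詞".
    df["pos1"] = df["pos"].str.split("-").str[0]
    # The key shape must match whatever the local analysis produces: (lemma, reading, pos1).
    return {(r.lemma, r.lForm, r.pos1): r.frequency for r in df.itertuples(index=False)}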
pyproject.toml
CHANGED
|
@@ -18,4 +18,8 @@ dependencies = [
|
|
| 18 |
"ja-core-news-md @ https://github.com/explosion/spacy-models/releases/download/ja_core_news_md-3.7.0/ja_core_news_md-3.7.0-py3-none-any.whl",
|
| 19 |
"ja-core-news-trf @ https://github.com/explosion/spacy-models/releases/download/ja_core_news_trf-3.7.2/ja_core_news_trf-3.7.2-py3-none-any.whl",
|
| 20 |
"huggingface-hub[cli]>=0.33.4",
|
| 21 |
]
|
|
|
|
| 18 |
"ja-core-news-md @ https://github.com/explosion/spacy-models/releases/download/ja_core_news_md-3.7.0/ja_core_news_md-3.7.0-py3-none-any.whl",
|
| 19 |
"ja-core-news-trf @ https://github.com/explosion/spacy-models/releases/download/ja_core_news_trf-3.7.2/ja_core_news_trf-3.7.2-py3-none-any.whl",
|
| 20 |
"huggingface-hub[cli]>=0.33.4",
|
| 21 |
+
"chardet>=5.2.0",
|
| 22 |
+
"fugashi>=1.3.0",
|
| 23 |
+
"unidic>=1.1.0",
|
| 24 |
+
"ipykernel>=6.29.5",
|
| 25 |
]
|
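The new dependencies pull in the fugashi wrapper and the unidic package, but the full UniDic dictionary itself is fetched in a separate step (python -m unidic download). A small sanity check, mirroring the DICDIR probe in the notebook above:

import os
import unidic

# unidic.DICDIR points at the dictionary directory; it stays empty until
# the dictionary has been fetched with: python -m unidic download
if os.path.isdir(unidic.DICDIR) and os.listdir(unidic.DICDIR):
    print("Using UniDic at:", unidic.DICDIR)
else:
    print("UniDic dictionary missing; run: python -m unidic download")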
resources/reference_lists/ja/BCCWJ_frequencylist_luw2_ver1_1.tsv
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:abdfe3f5c6383be148809f615834a8f8890d6acab1415428ca350cff08438908
|
| 3 |
+
size 355289031
|
resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1 copy.tsv
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:51c38228ac27858cf3fa35c71cddd54f2290b86f9ca5e705e360b2f849350179
|
| 3 |
+
size 5123687
|
resources/reference_lists/ja/BCCWJ_frequencylist_suw_ver1_1.tsv
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:59cc5d3e0961f130b073a17736e8ff4c5f0f63bd759e27e3c7cd0d96e79f4443
|
| 3 |
+
size 76573321
|
resources/reference_lists/ja/CSJ_frequencylist_suw_ver201803.tsv
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:84fa50dd87a9094f85006c81d78d14afab54bfad55e4a7137c1beab89b7200a4
|
| 3 |
+
size 17713132
|
test/test_app.py
CHANGED
|
@@ -4,13 +4,15 @@ Basic test script to validate the application components.
|
|
| 4 |
|
| 5 |
import sys
|
| 6 |
import os
|
| 7 |
-
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def test_imports():
|
| 10 |
"""Test that all required modules can be imported."""
|
| 11 |
try:
|
| 12 |
-
from lexical_sophistication import LexicalSophisticationAnalyzer
|
| 13 |
-
from pos_parser import POSParser
|
| 14 |
print("✓ Backend modules imported successfully")
|
| 15 |
return True
|
| 16 |
except ImportError as e:
|
|
@@ -20,8 +22,8 @@ def test_imports():
|
|
| 20 |
def test_basic_functionality():
|
| 21 |
"""Test basic functionality with SpaCy models."""
|
| 22 |
try:
|
| 23 |
-
from lexical_sophistication import LexicalSophisticationAnalyzer
|
| 24 |
-
from pos_parser import POSParser
|
| 25 |
|
| 26 |
print("Testing basic class instantiation...")
|
| 27 |
print("Note: This will fail without SpaCy models installed")
|
|
@@ -64,4 +66,4 @@ def main():
|
|
| 64 |
return True
|
| 65 |
|
| 66 |
if __name__ == "__main__":
|
| 67 |
-
main()
|
|
|
|
| 4 |
|
| 5 |
import sys
|
| 6 |
import os
|
| 7 |
+
|
| 8 |
+
# Add the parent directory to the Python path for imports
|
| 9 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 10 |
|
| 11 |
def test_imports():
|
| 12 |
"""Test that all required modules can be imported."""
|
| 13 |
try:
|
| 14 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 15 |
+
from text_analyzer.pos_parser import POSParser
|
| 16 |
print("✓ Backend modules imported successfully")
|
| 17 |
return True
|
| 18 |
except ImportError as e:
|
|
|
|
| 22 |
def test_basic_functionality():
|
| 23 |
"""Test basic functionality with SpaCy models."""
|
| 24 |
try:
|
| 25 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 26 |
+
from text_analyzer.pos_parser import POSParser
|
| 27 |
|
| 28 |
print("Testing basic class instantiation...")
|
| 29 |
print("Note: This will fail without SpaCy models installed")
|
|
|
|
| 66 |
return True
|
| 67 |
|
| 68 |
if __name__ == "__main__":
|
| 69 |
+
main()
|
test/test_functionality.py
CHANGED
|
@@ -6,10 +6,12 @@ Extended test script to validate application functionality.
|
|
| 6 |
import sys
|
| 7 |
import os
|
| 8 |
import tempfile
|
| 9 |
-
sys.path.append(os.path.join(os.path.dirname(__file__), 'backend'))
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
| 13 |
import pandas as pd
|
| 14 |
|
| 15 |
def test_lexical_sophistication():
|
|
@@ -122,4 +124,4 @@ def main():
|
|
| 122 |
|
| 123 |
if __name__ == "__main__":
|
| 124 |
success = main()
|
| 125 |
-
sys.exit(0 if success else 1)
|
|
|
|
| 6 |
import sys
|
| 7 |
import os
|
| 8 |
import tempfile
|
|
|
|
| 9 |
|
| 10 |
+
# Add the parent directory to the Python path for imports
|
| 11 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 12 |
+
|
| 13 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 14 |
+
from text_analyzer.pos_parser import POSParser
|
| 15 |
import pandas as pd
|
| 16 |
|
| 17 |
def test_lexical_sophistication():
|
|
|
|
| 124 |
|
| 125 |
if __name__ == "__main__":
|
| 126 |
success = main()
|
| 127 |
+
sys.exit(0 if success else 1)
|
test/test_multi_index.py
CHANGED
|
@@ -3,9 +3,8 @@
|
|
| 3 |
import sys
|
| 4 |
import os
|
| 5 |
import tempfile
|
| 6 |
-
sys.path.append(os.path.join(os.path.dirname(__file__), 'backend'))
|
| 7 |
|
| 8 |
-
from lexical_sophistication import LexicalSophisticationAnalyzer
|
| 9 |
|
| 10 |
def test_multi_index_functionality():
|
| 11 |
print("Testing multi-index functionality...")
|
|
@@ -130,4 +129,4 @@ that,,,7,12279,500,12063.320,1.000"""
|
|
| 130 |
traceback.print_exc()
|
| 131 |
|
| 132 |
if __name__ == "__main__":
|
| 133 |
-
test_multi_index_functionality()
|
|
|
|
| 3 |
import sys
|
| 4 |
import os
|
| 5 |
import tempfile
|
|
|
|
| 6 |
|
| 7 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 8 |
|
| 9 |
def test_multi_index_functionality():
|
| 10 |
print("Testing multi-index functionality...")
|
|
|
|
| 129 |
traceback.print_exc()
|
| 130 |
|
| 131 |
if __name__ == "__main__":
|
| 132 |
+
test_multi_index_functionality()
|
test/test_yaml_config.py
CHANGED
|
@@ -5,9 +5,8 @@ Test script to validate YAML configuration system.
|
|
| 5 |
|
| 6 |
import sys
|
| 7 |
import os
|
| 8 |
-
sys.path.append(os.path.join(os.path.dirname(__file__), 'backend'))
|
| 9 |
|
| 10 |
-
from lexical_sophistication import LexicalSophisticationAnalyzer
|
| 11 |
import yaml
|
| 12 |
from pathlib import Path
|
| 13 |
|
|
@@ -153,4 +152,4 @@ def main():
|
|
| 153 |
|
| 154 |
if __name__ == "__main__":
|
| 155 |
success = main()
|
| 156 |
-
sys.exit(0 if success else 1)
|
|
|
|
| 5 |
|
| 6 |
import sys
|
| 7 |
import os
|
|
|
|
| 8 |
|
| 9 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 10 |
import yaml
|
| 11 |
from pathlib import Path
|
| 12 |
|
|
|
|
| 152 |
|
| 153 |
if __name__ == "__main__":
|
| 154 |
success = main()
|
| 155 |
+
sys.exit(0 if success else 1)
|
test_frequency_flexible.py
ADDED
|
@@ -0,0 +1 @@
|
| 1 |
+
\n#!/usr/bin/env python3\n\"\"\"\nTest script for the enhanced FrequencyAnalyzer with flexible column mapping.\nThis demonstrates the new functionality with sample data.\n\"\"\"\n\nimport pandas as pd\nimport numpy as np\nfrom io import StringIO\nimport sys\nimport os\n\n# Add the text_analyzer to path\nsys.path.append('text_analyzer')\n\nfrom frequency_analyzer import FrequencyAnalyzer\n\ndef create_sample_data():\n \"\"\"Create sample frequency data in the new format.\"\"\"\n sample_data = \"\"\"rank\tlForm\tlemma\tpos\tsubLemma\twType\tfrequency\tpmw\tPB_frequency\tPB_pmw\tPM_frequency\tPM_pmw\tcore_frequency\tcore_pmw\n1\tノ\tの\t助詞-格助詞\t\t和\t5061558\t48383.9\t1473494\t51791.5\t208748\t47179.3\t1398950\t51737.2\n2\tニ\tに\t助詞-格助詞\t\t和\t3576558\t34188.7\t1036653\t36437.1\t140178\t31681.7\t985766\t36456.5\n3\tテ\tて\t助詞-接続助詞\t\t和\t3493117\t33391.0\t948430\t33336.1\t124241\t28079.8\t902379\t33372.6\n4\tハ\tは\t助詞-係助詞\t\t和\t3289932\t31448.8\t945084\t33218.5\t129378\t29240.8\t899776\t33276.3\n5\tガ\tが\t助詞-格助詞\t\t和\t2518164\t24070.6\t743621\t26131.8\t103456\t23390.2\t707331\t26139.9\"\"\"\n return sample_data\n\ndef test_file_format_detection():\n \"\"\"Test file format detection functionality.\"\"\"\n print(\"=== Testing File Format Detection ===\")\n \n analyzer = FrequencyAnalyzer(file_size_limit_mb=300)\n sample_data = create_sample_data()\n \n format_info = analyzer.detect_file_format(sample_data)\n print(f\"Detected separator: '{format_info['separator']}'\")\n print(f\"Has header: {format_info['has_header']}\")\n print(f\"Estimated columns: {format_info['estimated_columns']}\")\n print(f\"Sample lines: {format_info['sample_lines'][:2]}\")\n print()\n\ndef test_column_detection():\n \"\"\"Test column detection and categorization.\"\"\"\n print(\"=== Testing Column Detection ===\")\n \n analyzer = FrequencyAnalyzer()\n sample_data = create_sample_data()\n \n # Read sample data for column detection\n df = pd.read_csv(StringIO(sample_data), sep='\\t')\n detected_cols = analyzer.detect_columns(df)\n \n print(\"Detected columns:\")\n for category, columns in detected_cols.items():\n print(f\" {category}: {columns}\")\n print()\n\ndef test_flexible_loading():\n \"\"\"Test flexible data loading with column configuration.\"\"\"\n print(\"=== Testing Flexible Data Loading ===\")\n \n analyzer = FrequencyAnalyzer()\n sample_data = create_sample_data()\n \n # Test with different column configurations\n configs = [\n {\n 'word_column': 'lForm',\n 'frequency_column': 'frequency',\n 'pos_column': 'pos'\n },\n {\n 'word_column': 'lemma',\n 'frequency_column': 'pmw',\n 'pos_column': 'pos'\n },\n {\n 'word_column': 'lForm',\n 'frequency_column': 'PB_frequency'\n }\n ]\n \n for i, config in enumerate(configs, 1):\n print(f\"Configuration {i}: {config}\")\n try:\n df = analyzer.load_frequency_data(sample_data, config)\n print(f\" ✓ Successfully loaded {len(df)} entries\")\n print(f\" ✓ Available frequency columns: {analyzer.get_available_frequency_columns()}\")\n print(f\" ✓ Available word columns: {analyzer.get_available_word_columns()}\")\n except Exception as e:\n print(f\" ✗ Error: {e}\")\n print()\n\ndef test_multi_frequency_analysis():\n \"\"\"Test multi-frequency analysis functionality.\"\"\"\n print(\"=== Testing Multi-Frequency Analysis ===\")\n \n analyzer = FrequencyAnalyzer()\n sample_data = create_sample_data()\n \n config = {\n 'word_column': 'lForm',\n 'frequency_column': 'frequency',\n 'pos_column': 'pos'\n }\n \n df = analyzer.load_frequency_data(sample_data, config)\n \n # Test analysis with 
multiple frequency columns\n freq_columns = ['frequency', 'pmw', 'PB_frequency']\n \n try:\n results = analyzer.create_multi_frequency_analysis(freq_columns, bin_size=2)\n \n print(f\"Multi-frequency analysis results:\")\n for col, result in results.items():\n print(f\" {col}: {len(result['group_labels'])} groups\")\n print(f\" Sample frequencies: {result['avg_frequencies'][:3]}\")\n \n except Exception as e:\n print(f\"Error in multi-frequency analysis: {e}\")\n print()\n\ndef test_rank_based_visualization():\n \"\"\"Test rank-based visualization with flexible columns.\"\"\"\n print(\"=== Testing Rank-Based Visualization ===\")\n \n analyzer = FrequencyAnalyzer()\n sample_data = create_sample_data()\n \n config = {\n 'word_column': 'lForm',\n 'frequency_column': 'frequency'\n }\n \n df = analyzer.load_frequency_data(sample_data, config)\n \n try:\n # Test with different frequency columns\n for col in ['frequency', 'pmw', 'PB_frequency']:\n result = analyzer.create_rank_based_visualization_flexible(\n column=col, \n bin_size=2, \n log_transform=False\n )\n \n print(f\"Analysis for column '{col}':\")\n print(f\" Groups: {len(result['group_labels'])}\")\n print(f\" Sample words: {[w['word'] for w in result['sample_words'].get(0, [])]}\")\n print(f\" Avg frequencies: {result['avg_frequencies']}\")\n \n except Exception as e:\n print(f\"Error in rank-based visualization: {e}\")\n print()\n\ndef test_backward_compatibility():\n \"\"\"Test backward compatibility with legacy interface.\"\"\"\n print(\"=== Testing Backward Compatibility ===\")\n \n analyzer = FrequencyAnalyzer()\n sample_data = create_sample_data()\n \n # Test with flexible loading first\n config = {\n 'word_column': 'lForm',\n 'frequency_column': 'frequency'\n }\n \n df = analyzer.load_frequency_data(sample_data, config)\n \n # Then test legacy methods\n try:\n legacy_cols = analyzer.get_available_columns()\n print(f\"Legacy available columns: {legacy_cols}\")\n \n if legacy_cols:\n stats = analyzer.calculate_statistics(legacy_cols[0])\n print(f\"Statistics for {legacy_cols[0]}: mean={stats['mean']:.1f}, count={stats['count']}\")\n \n top_words = analyzer.get_top_words(legacy_cols[0], n=3)\n print(f\"Top 3 words: {[w['word'] for w in top_words]}\")\n \n except Exception as e:\n print(f\"Error in backward compatibility test: {e}\")\n print()\n\nif __name__ == \"__main__\":\n print(\"Testing Enhanced FrequencyAnalyzer with Flexible Column Mapping\")\n print(\"=\" * 60)\n \n test_file_format_detection()\n test_column_detection()\n test_flexible_loading()\n test_multi_frequency_analysis()\n test_rank_based_visualization()\n test_backward_compatibility()\n \n print(\"All tests completed!\")\n
|
test_fugashi_diagnostic.py
ADDED
|
@@ -0,0 +1,134 @@
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Diagnostic test to check if fugashi is working and what matching methods are being used.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import sys
|
| 7 |
+
sys.path.append('.')
|
| 8 |
+
|
| 9 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 10 |
+
from web_app.config_manager import ConfigManager
|
| 11 |
+
|
| 12 |
+
def test_fugashi_diagnostic():
|
| 13 |
+
"""Test what matching methods are actually being used."""
|
| 14 |
+
|
| 15 |
+
print("=== Fugashi Diagnostic Test ===\n")
|
| 16 |
+
|
| 17 |
+
# Initialize Japanese analyzer
|
| 18 |
+
print("1. Initializing Japanese analyzer...")
|
| 19 |
+
try:
|
| 20 |
+
analyzer = LexicalSophisticationAnalyzer(language="ja", model_size="md")
|
| 21 |
+
print("✓ Japanese SpaCy model loaded successfully")
|
| 22 |
+
|
| 23 |
+
# Check if UniDic enricher is available
|
| 24 |
+
if hasattr(analyzer, 'unidic_enricher') and analyzer.unidic_enricher:
|
| 25 |
+
print("✓ UniDic enricher initialized successfully")
|
| 26 |
+
else:
|
| 27 |
+
print("⚠ UniDic enricher not available - using legacy mode")
|
| 28 |
+
except Exception as e:
|
| 29 |
+
print(f"✗ Failed to load Japanese model: {e}")
|
| 30 |
+
return False
|
| 31 |
+
|
| 32 |
+
# Load reference configuration
|
| 33 |
+
print("\n2. Loading BCCWJ frequency data only...")
|
| 34 |
+
config = ConfigManager.load_reference_config()
|
| 35 |
+
japanese_config = config.get('japanese', {}).get('unigrams', {})
|
| 36 |
+
|
| 37 |
+
# Load just BCCWJ frequency for testing
|
| 38 |
+
bccwj_config = japanese_config.get('BCCWJ_frequency')
|
| 39 |
+
if not bccwj_config:
|
| 40 |
+
print("✗ BCCWJ configuration not found")
|
| 41 |
+
return False
|
| 42 |
+
|
| 43 |
+
print("✓ BCCWJ configuration found")
|
| 44 |
+
|
| 45 |
+
# Load the data
|
| 46 |
+
bccwj_data = ConfigManager.load_reference_list_data(bccwj_config)
|
| 47 |
+
if not bccwj_data:
|
| 48 |
+
print("✗ Failed to load BCCWJ data")
|
| 49 |
+
return False
|
| 50 |
+
|
| 51 |
+
print(f"✓ BCCWJ data loaded successfully")
|
| 52 |
+
|
| 53 |
+
# Load into analyzer
|
| 54 |
+
reference_data = {"unigrams_BCCWJ_frequency": bccwj_data}
|
| 55 |
+
analyzer.load_reference_lists(reference_data)
|
| 56 |
+
print("✓ Reference data loaded into analyzer")
|
| 57 |
+
|
| 58 |
+
# Test with a simple Japanese sentence
|
| 59 |
+
print("\n3. Testing token matching methods...")
|
| 60 |
+
test_text = "私は学校に行きます。"
|
| 61 |
+
|
| 62 |
+
try:
|
| 63 |
+
results = analyzer.analyze_text(test_text, ["unigrams_BCCWJ_frequency"])
|
| 64 |
+
|
| 65 |
+
print(f"\nAnalysis completed for: '{test_text}'")
|
| 66 |
+
print(f"Total tokens analyzed: {len(results['token_details'])}")
|
| 67 |
+
|
| 68 |
+
print("\nDetailed token matching results:")
|
| 69 |
+
for i, token in enumerate(results['token_details']):
|
| 70 |
+
print(f"\nToken {i+1}: '{token['token']}' (lemma: '{token['lemma']}')")
|
| 71 |
+
print(f" POS: {token['pos']}, Tag: {token['tag']}")
|
| 72 |
+
|
| 73 |
+
# Check matching methods
|
| 74 |
+
token_method = token.get('unigrams_BCCWJ_frequency_token_match_method', 'unknown')
|
| 75 |
+
lemma_method = token.get('unigrams_BCCWJ_frequency_lemma_match_method', 'unknown')
|
| 76 |
+
|
| 77 |
+
token_score = token.get('unigrams_BCCWJ_frequency_token')
|
| 78 |
+
lemma_score = token.get('unigrams_BCCWJ_frequency_lemma')
|
| 79 |
+
|
| 80 |
+
print(f" Token matching method: {token_method}")
|
| 81 |
+
print(f" Lemma matching method: {lemma_method}")
|
| 82 |
+
print(f" Token score: {token_score}")
|
| 83 |
+
print(f" Lemma score: {lemma_score}")
|
| 84 |
+
|
| 85 |
+
# Show UniDic features if available
|
| 86 |
+
if 'unidic_features' in token:
|
| 87 |
+
unidic = token['unidic_features']
|
| 88 |
+
print(f" UniDic features available:")
|
| 89 |
+
print(f" lemma: '{unidic.get('lemma', '')}'")
|
| 90 |
+
print(f" lForm: '{unidic.get('lForm', '')}'")
|
| 91 |
+
print(f" pos1: '{unidic.get('pos1', '')}'")
|
| 92 |
+
print(f" pos2: '{unidic.get('pos2', '')}'")
|
| 93 |
+
print(f" alignment_confidence: {unidic.get('alignment_confidence', 0.0)}")
|
| 94 |
+
else:
|
| 95 |
+
print(" No UniDic features available")
|
| 96 |
+
|
| 97 |
+
# Summary
|
| 98 |
+
print("\n4. Summary:")
|
| 99 |
+
methods_used = {}
|
| 100 |
+
for token in results['token_details']:
|
| 101 |
+
token_method = token.get('unigrams_BCCWJ_frequency_token_match_method', 'unknown')
|
| 102 |
+
lemma_method = token.get('unigrams_BCCWJ_frequency_lemma_match_method', 'unknown')
|
| 103 |
+
methods_used[token_method] = methods_used.get(token_method, 0) + 1
|
| 104 |
+
if token_method != lemma_method:
|
| 105 |
+
methods_used[lemma_method] = methods_used.get(lemma_method, 0) + 1
|
| 106 |
+
|
| 107 |
+
print("Matching methods used:")
|
| 108 |
+
for method, count in methods_used.items():
|
| 109 |
+
print(f" {method}: {count} matches")
|
| 110 |
+
|
| 111 |
+
if 'legacy_spacy' in methods_used and len(methods_used) == 1:
|
| 112 |
+
print("\n❌ ALL tokens are using legacy_spacy - fugashi is NOT being used!")
|
| 113 |
+
return False
|
| 114 |
+
elif any('unidic' in method for method in methods_used):
|
| 115 |
+
print("\n✅ Some tokens are using UniDic-based matching - fugashi is working!")
|
| 116 |
+
return True
|
| 117 |
+
else:
|
| 118 |
+
print("\n⚠ Mixed or unexpected matching methods")
|
| 119 |
+
return False
|
| 120 |
+
|
| 121 |
+
except Exception as e:
|
| 122 |
+
print(f"✗ Error during analysis: {e}")
|
| 123 |
+
import traceback
|
| 124 |
+
traceback.print_exc()
|
| 125 |
+
return False
|
| 126 |
+
|
| 127 |
+
if __name__ == "__main__":
|
| 128 |
+
success = test_fugashi_diagnostic()
|
| 129 |
+
if success:
|
| 130 |
+
print("\n🎉 Fugashi diagnostic test indicates fugashi is working!")
|
| 131 |
+
else:
|
| 132 |
+
print("\n❌ Fugashi diagnostic test indicates fugashi is NOT working!")
|
| 133 |
+
|
| 134 |
+
sys.exit(0 if success else 1)
|
test_japanese_integration.py
ADDED
|
@@ -0,0 +1,135 @@
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script for Japanese lexical sophistication integration.
|
| 4 |
+
Tests the BCCWJ and CSJ frequency analysis with composite key lookup.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
sys.path.append('.')
|
| 10 |
+
|
| 11 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 12 |
+
from web_app.config_manager import ConfigManager
|
| 13 |
+
|
| 14 |
+
def test_japanese_integration():
|
| 15 |
+
"""Test Japanese corpus integration with sample text."""
|
| 16 |
+
|
| 17 |
+
print("=== Japanese Lexical Sophistication Integration Test ===\n")
|
| 18 |
+
|
| 19 |
+
# Initialize Japanese analyzer
|
| 20 |
+
print("1. Initializing Japanese analyzer...")
|
| 21 |
+
try:
|
| 22 |
+
analyzer = LexicalSophisticationAnalyzer(language="ja", model_size="md")
|
| 23 |
+
print("✓ Japanese SpaCy model loaded successfully")
|
| 24 |
+
|
| 25 |
+
# Check if UniDic enricher is available
|
| 26 |
+
if hasattr(analyzer, 'unidic_enricher') and analyzer.unidic_enricher:
|
| 27 |
+
print("✓ UniDic enricher initialized successfully")
|
| 28 |
+
else:
|
| 29 |
+
print("⚠ UniDic enricher not available - using legacy mode")
|
| 30 |
+
except Exception as e:
|
| 31 |
+
print(f"✗ Failed to load Japanese model: {e}")
|
| 32 |
+
print("Please install: python -m spacy download ja_core_news_md")
|
| 33 |
+
return False
|
| 34 |
+
|
| 35 |
+
# Load reference configuration
|
| 36 |
+
print("\n2. Loading reference configuration...")
|
| 37 |
+
config = ConfigManager.load_reference_config()
|
| 38 |
+
japanese_config = config.get('japanese', {}).get('unigrams', {})
|
| 39 |
+
|
| 40 |
+
if not japanese_config:
|
| 41 |
+
print("✗ No Japanese configuration found")
|
| 42 |
+
return False
|
| 43 |
+
|
| 44 |
+
print(f"✓ Found {len(japanese_config)} Japanese reference lists")
|
| 45 |
+
|
| 46 |
+
# Test data loading for available files
|
| 47 |
+
print("\n3. Testing data loading...")
|
| 48 |
+
reference_data = {}
|
| 49 |
+
|
| 50 |
+
for list_name, list_config in japanese_config.items():
|
| 51 |
+
if not list_config.get('enabled', False):
|
| 52 |
+
continue
|
| 53 |
+
|
| 54 |
+
file_path = list_config.get('files', {}).get('token', '')
|
| 55 |
+
if not os.path.exists(file_path):
|
| 56 |
+
print(f"⚠ File not found: {file_path}")
|
| 57 |
+
continue
|
| 58 |
+
|
| 59 |
+
print(f" Loading {list_name}...")
|
| 60 |
+
try:
|
| 61 |
+
data = ConfigManager.load_reference_list_data(list_config)
|
| 62 |
+
if data:
|
| 63 |
+
reference_data[f"unigrams_{list_name}"] = data
|
| 64 |
+
|
| 65 |
+
# Check if Japanese corpus data was created correctly
|
| 66 |
+
for file_type, file_data in data.items():
|
| 67 |
+
if isinstance(file_data, dict) and file_data.get('is_japanese_corpus'):
|
| 68 |
+
composite_count = len(file_data.get('composite_dict', {}))
|
| 69 |
+
lemma_count = len(file_data.get('lemma_dict', {}))
|
| 70 |
+
surface_count = len(file_data.get('surface_dict', {}))
|
| 71 |
+
print(f" ✓ {list_name}: {composite_count} composite keys, {lemma_count} lemmas, {surface_count} surface forms")
|
| 72 |
+
|
| 73 |
+
except Exception as e:
|
| 74 |
+
print(f" ✗ Error loading {list_name}: {e}")
|
| 75 |
+
|
| 76 |
+
if not reference_data:
|
| 77 |
+
print("✗ No reference data loaded successfully")
|
| 78 |
+
return False
|
| 79 |
+
|
| 80 |
+
# Load reference data into analyzer
|
| 81 |
+
print("\n4. Loading reference data into analyzer...")
|
| 82 |
+
analyzer.load_reference_lists(reference_data)
|
| 83 |
+
print(f"✓ Loaded {len(reference_data)} reference lists")
|
| 84 |
+
|
| 85 |
+
# Test with Japanese text
|
| 86 |
+
print("\n5. Testing Japanese text analysis...")
|
| 87 |
+
japanese_text = """
|
| 88 |
+
私は毎日学校に行きます。
|
| 89 |
+
友達と一緒に勉強して、とても楽しいです。
|
| 90 |
+
日本語の文法は少し難しいですが、頑張って覚えています。
|
| 91 |
+
"""
|
| 92 |
+
|
| 93 |
+
selected_indices = list(reference_data.keys())
|
| 94 |
+
print(f" Using indices: {', '.join(selected_indices)}")
|
| 95 |
+
|
| 96 |
+
try:
|
| 97 |
+
results = analyzer.analyze_text(japanese_text, selected_indices)
|
| 98 |
+
|
| 99 |
+
# Display results
|
| 100 |
+
print(f"\n6. Analysis Results:")
|
| 101 |
+
print(f" Total tokens: {results['text_stats']['total_tokens']}")
|
| 102 |
+
print(f" Content words: {results['text_stats']['content_words']}")
|
| 103 |
+
print(f" Function words: {results['text_stats']['function_words']}")
|
| 104 |
+
|
| 105 |
+
# Show some token details
|
| 106 |
+
print(f"\n Sample token analysis:")
|
| 107 |
+
for i, token in enumerate(results['token_details'][:5]): # First 5 tokens
|
| 108 |
+
print(f" {i+1}. {token['token']} (lemma: {token['lemma']}, pos: {token['pos']})")
|
| 109 |
+
for key, value in token.items():
|
| 110 |
+
if key.endswith('_token') or key.endswith('_lemma'):
|
| 111 |
+
if value != 'NA':
|
| 112 |
+
print(f" {key}: {value}")
|
| 113 |
+
|
| 114 |
+
# Show summary statistics
|
| 115 |
+
print(f"\n Summary statistics:")
|
| 116 |
+
for key, stats in results['summary'].items():
|
| 117 |
+
print(f" {key}: mean={stats['mean']:.2f}, count={stats['count']}")
|
| 118 |
+
|
| 119 |
+
print(f"\n✓ Japanese text analysis completed successfully!")
|
| 120 |
+
return True
|
| 121 |
+
|
| 122 |
+
except Exception as e:
|
| 123 |
+
print(f"✗ Error during analysis: {e}")
|
| 124 |
+
import traceback
|
| 125 |
+
traceback.print_exc()
|
| 126 |
+
return False
|
| 127 |
+
|
| 128 |
+
if __name__ == "__main__":
|
| 129 |
+
success = test_japanese_integration()
|
| 130 |
+
if success:
|
| 131 |
+
print("\n🎉 Japanese integration test PASSED!")
|
| 132 |
+
else:
|
| 133 |
+
print("\n❌ Japanese integration test FAILED!")
|
| 134 |
+
|
| 135 |
+
sys.exit(0 if success else 1)
|
test_unidic_diagnostic.py
ADDED
|
@@ -0,0 +1,201 @@
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Diagnostic test for UniDic integration functionality.
|
| 4 |
+
Tests both the fallback mechanism and enhanced features.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
sys.path.append('.')
|
| 10 |
+
|
| 11 |
+
from text_analyzer.lexical_sophistication import LexicalSophisticationAnalyzer
|
| 12 |
+
from web_app.config_manager import ConfigManager
|
| 13 |
+
|
| 14 |
+
def test_unidic_diagnostic():
|
| 15 |
+
"""Test UniDic integration with diagnostic information."""
|
| 16 |
+
|
| 17 |
+
print("=== UniDic Integration Diagnostic Test ===\n")
|
| 18 |
+
|
| 19 |
+
# Initialize Japanese analyzer
|
| 20 |
+
print("1. Initializing Japanese analyzer...")
|
| 21 |
+
try:
|
| 22 |
+
analyzer = LexicalSophisticationAnalyzer(language="ja", model_size="md")
|
| 23 |
+
print("✓ Japanese SpaCy model loaded successfully")
|
| 24 |
+
|
| 25 |
+
# Check UniDic availability
|
| 26 |
+
unidic_available = hasattr(analyzer, 'unidic_enricher') and analyzer.unidic_enricher
|
| 27 |
+
if unidic_available:
|
| 28 |
+
print("✓ UniDic enricher initialized - enhanced mode available")
|
| 29 |
+
else:
|
| 30 |
+
print("⚠ UniDic enricher not available - using legacy fallback mode")
|
| 31 |
+
except Exception as e:
|
| 32 |
+
print(f"✗ Failed to initialize analyzer: {e}")
|
| 33 |
+
return False
|
| 34 |
+
|
| 35 |
+
# Load reference data
|
| 36 |
+
print("\n2. Loading reference configuration...")
|
| 37 |
+
config = ConfigManager.load_reference_config()
|
| 38 |
+
japanese_config = config.get('japanese', {}).get('unigrams', {})
|
| 39 |
+
|
| 40 |
+
# Get first available Japanese corpus
|
| 41 |
+
reference_data = {}
|
| 42 |
+
for list_name, list_config in japanese_config.items():
|
| 43 |
+
if list_config.get('enabled', False):
|
| 44 |
+
file_path = list_config.get('files', {}).get('token', '')
|
| 45 |
+
if os.path.exists(file_path):
|
| 46 |
+
data = ConfigManager.load_reference_list_data(list_config)
|
| 47 |
+
if data:
|
| 48 |
+
reference_data[f"unigrams_{list_name}"] = data
|
| 49 |
+
print(f"✓ Loaded {list_name} for testing")
|
| 50 |
+
break
|
| 51 |
+
|
| 52 |
+
if not reference_data:
|
| 53 |
+
print("✗ No reference data available")
|
| 54 |
+
return False
|
| 55 |
+
|
| 56 |
+
# Load into analyzer
|
| 57 |
+
analyzer.load_reference_lists(reference_data)
|
| 58 |
+
|
| 59 |
+
# Test with sample Japanese text
|
| 60 |
+
print("\n3. Testing Japanese text analysis...")
|
| 61 |
+
test_text = "私は学校に行く。"
|
| 62 |
+
selected_indices = list(reference_data.keys())
|
| 63 |
+
|
| 64 |
+
try:
|
| 65 |
+
results = analyzer.analyze_text(test_text, selected_indices)
|
| 66 |
+
|
| 67 |
+
print(f"\n4. Analysis Results:")
|
| 68 |
+
print(f" Total tokens: {results['text_stats']['total_tokens']}")
|
| 69 |
+
|
| 70 |
+
# Show detailed token analysis with diagnostic information
|
| 71 |
+
print(f"\n Token Details with Diagnostics:")
|
| 72 |
+
for i, token_detail in enumerate(results['token_details'][:4]): # First 4 tokens
|
| 73 |
+
print(f"\n Token {i+1}: '{token_detail['token']}'")
|
| 74 |
+
print(f" SpaCy: lemma='{token_detail['lemma']}', pos='{token_detail['pos']}', tag='{token_detail['tag']}'")
|
| 75 |
+
|
| 76 |
+
# Look for UniDic features
|
| 77 |
+
if 'unidic_features' in token_detail:
|
| 78 |
+
unidic_feat = token_detail['unidic_features']
|
| 79 |
+
print(f" UniDic: lemma='{unidic_feat.get('lemma', '')}', lForm='{unidic_feat.get('lForm', '')}', pos1='{unidic_feat.get('pos1', '')}', goshu='{unidic_feat.get('goshu', '')}'")
|
| 80 |
+
print(f" Alignment confidence: {unidic_feat.get('alignment_confidence', 0.0):.2f}")
|
| 81 |
+
|
| 82 |
+
# Show matching methods for each index
|
| 83 |
+
for idx_name in selected_indices:
|
| 84 |
+
token_method = token_detail.get(f"{idx_name}_token_match_method", "N/A")
|
| 85 |
+
lemma_method = token_detail.get(f"{idx_name}_lemma_match_method", "N/A")
|
| 86 |
+
token_score = token_detail.get(f"{idx_name}_token", "N/A")
|
| 87 |
+
lemma_score = token_detail.get(f"{idx_name}_lemma", "N/A")
|
| 88 |
+
|
| 89 |
+
print(f" {idx_name}:")
|
| 90 |
+
print(f" Token: score={token_score}, method={token_method}")
|
| 91 |
+
print(f" Lemma: score={lemma_score}, method={lemma_method}")
|
| 92 |
+
|
| 93 |
+
# Show summary
|
| 94 |
+
print(f"\n Summary Statistics:")
|
| 95 |
+
matching_methods = {}
|
| 96 |
+
for token_detail in results['token_details']:
|
| 97 |
+
for key, value in token_detail.items():
|
| 98 |
+
if key.endswith('_match_method'):
|
| 99 |
+
method = value
|
| 100 |
+
matching_methods[method] = matching_methods.get(method, 0) + 1
|
| 101 |
+
|
| 102 |
+
print(f" Matching method distribution:")
|
| 103 |
+
for method, count in matching_methods.items():
|
| 104 |
+
print(f" {method}: {count} matches")
|
| 105 |
+
|
| 106 |
+
return True
|
| 107 |
+
|
| 108 |
+
except Exception as e:
|
| 109 |
+
print(f"✗ Error during analysis: {e}")
|
| 110 |
+
import traceback
|
| 111 |
+
traceback.print_exc()
|
| 112 |
+
return False
|
| 113 |
+
|
| 114 |
+
def test_unidic_fallback_levels():
|
| 115 |
+
"""Test the 3-level UniDic fallback strategy simulation."""
|
| 116 |
+
print("\n=== UniDic Fallback Strategy Test ===\n")
|
| 117 |
+
|
| 118 |
+
# Simulate UniDic features for different fallback levels
|
| 119 |
+
test_cases = [
|
| 120 |
+
{
|
| 121 |
+
'name': 'Complete UniDic features (Level 1)',
|
| 122 |
+
'features': {
|
| 123 |
+
'lemma': '行く', 'lForm': 'イク', 'pos1': '動詞', 'pos2': '一般', 'goshu': '和'
|
| 124 |
+
},
|
| 125 |
+
'expected_keys': [
|
| 126 |
+
'行く_イク_動詞_一般_和', # Level 1
|
| 127 |
+
'行く_イク_動詞_一般', # Level 2
|
| 128 |
+
'行く_イク_動詞' # Level 3
|
| 129 |
+
]
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
'name': 'Partial features (Level 2)',
|
| 133 |
+
'features': {
|
| 134 |
+
'lemma': '学校', 'lForm': 'ガッコウ', 'pos1': '名詞', 'pos2': '一般', 'goshu': ''
|
| 135 |
+
},
|
| 136 |
+
'expected_keys': [
|
| 137 |
+
'学校_ガッコウ_名詞_一般', # Level 2
|
| 138 |
+
'学校_ガッコウ_名詞' # Level 3
|
| 139 |
+
]
|
| 140 |
+
},
|
| 141 |
+
{
|
| 142 |
+
'name': 'Minimal features (Level 3)',
|
| 143 |
+
'features': {
|
| 144 |
+
'lemma': '私', 'lForm': 'ワタシ', 'pos1': '代名詞', 'pos2': '', 'goshu': ''
|
| 145 |
+
},
|
| 146 |
+
'expected_keys': [
|
| 147 |
+
'私_ワタシ_代名詞' # Level 3 only
|
| 148 |
+
]
|
| 149 |
+
}
|
| 150 |
+
]
|
| 151 |
+
|
| 152 |
+
for case in test_cases:
|
| 153 |
+
print(f"Testing: {case['name']}")
|
| 154 |
+
features = case['features']
|
| 155 |
+
expected = case['expected_keys']
|
| 156 |
+
|
| 157 |
+
# Generate actual keys that would be attempted
|
| 158 |
+
actual_keys = []
|
| 159 |
+
|
| 160 |
+
# Level 1: {lemma}_{lForm}_{pos1}_{pos2}_{goshu}
|
| 161 |
+
if all([features['lemma'], features['lForm'], features['pos1'], features['pos2'], features['goshu']]):
|
| 162 |
+
level1_key = f"{features['lemma']}_{features['lForm']}_{features['pos1']}_{features['pos2']}_{features['goshu']}"
|
| 163 |
+
actual_keys.append(level1_key)
|
| 164 |
+
|
| 165 |
+
# Level 2: {lemma}_{lForm}_{pos1}_{pos2}
|
| 166 |
+
if all([features['lemma'], features['lForm'], features['pos1'], features['pos2']]):
|
| 167 |
+
level2_key = f"{features['lemma']}_{features['lForm']}_{features['pos1']}_{features['pos2']}"
|
| 168 |
+
actual_keys.append(level2_key)
|
| 169 |
+
|
| 170 |
+
# Level 3: {lemma}_{lForm}_{pos1}
|
| 171 |
+
if all([features['lemma'], features['lForm'], features['pos1']]):
|
| 172 |
+
level3_key = f"{features['lemma']}_{features['lForm']}_{features['pos1']}"
|
| 173 |
+
actual_keys.append(level3_key)
|
| 174 |
+
|
| 175 |
+
# Check if matches expected
|
| 176 |
+
match = actual_keys == expected
|
| 177 |
+
status = "✓" if match else "✗"
|
| 178 |
+
print(f" {status} Generated keys: {actual_keys}")
|
| 179 |
+
if not match:
|
| 180 |
+
print(f" Expected: {expected}")
|
| 181 |
+
print()
|
| 182 |
+
|
| 183 |
+
return True
|
| 184 |
+
|
| 185 |
+
if __name__ == "__main__":
|
| 186 |
+
print("Running UniDic integration diagnostics...\n")
|
| 187 |
+
|
| 188 |
+
success1 = test_unidic_diagnostic()
|
| 189 |
+
success2 = test_unidic_fallback_levels()
|
| 190 |
+
|
| 191 |
+
if success1 and success2:
|
| 192 |
+
print("\n🎉 All UniDic diagnostic tests PASSED!")
|
| 193 |
+
print("\nSystem Status:")
|
| 194 |
+
print("- Legacy Japanese analysis: ✓ Working")
|
| 195 |
+
print("- Fallback strategy: ✓ Implemented")
|
| 196 |
+
print("- Diagnostic tracking: ✓ Available")
|
| 197 |
+
print("- UniDic integration: ⚠ Ready (requires MeCab setup)")
|
| 198 |
+
else:
|
| 199 |
+
print("\n❌ Some diagnostic tests FAILED!")
|
| 200 |
+
|
| 201 |
+
sys.exit(0 if success1 and success2 else 1)
|
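The fallback-strategy test above enumerates the three composite-key levels by hand; an equivalent helper (illustrative only, not part of the committed modules) makes the most-specific-first order explicit:

def fallback_keys(lemma: str, lform: str, pos1: str, pos2: str = "", goshu: str = "") -> list:
    """Return candidate lookup keys from most to least specific."""
    keys = []
    if all([lemma, lform, pos1, pos2, goshu]):
        keys.append(f"{lemma}_{lform}_{pos1}_{pos2}_{goshu}")  # Level 1
    if all([lemma, lform, pos1, pos2]):
        keys.append(f"{lemma}_{lform}_{pos1}_{pos2}")          # Level 2
    if all([lemma, lform, pos1]):
        keys.append(f"{lemma}_{lform}_{pos1}")                 # Level 3
    return keys

# fallback_keys("行く", "イク", "動詞", "一般", "和")
# -> ["行く_イク_動詞_一般_和", "行く_イク_動詞_一般", "行く_イク_動詞"]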
text_analyzer/__pycache__/__init__.cpython-312.pyc
DELETED
|
Binary file (297 Bytes)
|
|
|
text_analyzer/__pycache__/lexical_sophistication.cpython-312.pyc
DELETED
|
Binary file (24.3 kB)
|
|
|
text_analyzer/__pycache__/pos_parser.cpython-312.pyc
DELETED
|
Binary file (9.72 kB)
|
|
|
text_analyzer/app_config.py
ADDED
|
@@ -0,0 +1,183 @@
|
| 1 |
+
"""
|
| 2 |
+
Centralized configuration module for the text analysis application.
|
| 3 |
+
Contains all constants, settings, and configuration loading utilities.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import yaml
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Dict, Any, Optional
|
| 9 |
+
import logging
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class AppConfig:
|
| 15 |
+
"""Centralized configuration management for the text analysis application."""
|
| 16 |
+
|
| 17 |
+
# SpaCy Model Mappings
|
| 18 |
+
SPACY_MODELS = {
|
| 19 |
+
("en", "md"): "en_core_web_md",
|
| 20 |
+
("en", "trf"): "en_core_web_trf",
|
| 21 |
+
("ja", "md"): "ja_core_news_md",
|
| 22 |
+
("ja", "trf"): "ja_core_news_trf"
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
# Default Settings
|
| 26 |
+
DEFAULT_LANGUAGE = "en"
|
| 27 |
+
DEFAULT_MODEL_SIZE = "md" # Changed from "trf" to be more accessible
|
| 28 |
+
|
| 29 |
+
# Analysis Limits (shared constants)
|
| 30 |
+
MAX_TOKENS_FOR_VISUALIZATION = 30
|
| 31 |
+
DEFAULT_HISTOGRAM_BINS = 25
|
| 32 |
+
DEFAULT_RANK_BIN_SIZE = 500
|
| 33 |
+
MAX_NGRAM_SENTENCE_LENGTH = 100
|
| 34 |
+
|
| 35 |
+
# File Processing (generic utilities)
|
| 36 |
+
SUPPORTED_ENCODINGS = ['utf-8', 'utf-16', 'latin-1']
|
| 37 |
+
SUPPORTED_DELIMITERS = [',', '\t', ';']
|
| 38 |
+
|
| 39 |
+
# Configuration Paths
|
| 40 |
+
REFERENCE_LISTS_CONFIG = "config/reference_lists.yaml"
|
| 41 |
+
|
| 42 |
+
@classmethod
|
| 43 |
+
def get_spacy_model_name(cls, language: str, model_size: str) -> Optional[str]:
|
| 44 |
+
"""
|
| 45 |
+
Get the SpaCy model name for given language and size.
|
| 46 |
+
|
| 47 |
+
Args:
|
| 48 |
+
language: Language code ('en' or 'ja')
|
| 49 |
+
model_size: Model size ('md' or 'trf')
|
| 50 |
+
|
| 51 |
+
Returns:
|
| 52 |
+
SpaCy model name or None if not found
|
| 53 |
+
"""
|
| 54 |
+
return cls.SPACY_MODELS.get((language, model_size))
|
| 55 |
+
|
| 56 |
+
@classmethod
|
| 57 |
+
def get_supported_languages(cls) -> list[str]:
|
| 58 |
+
"""Get list of supported languages."""
|
| 59 |
+
return list(set(lang for lang, _ in cls.SPACY_MODELS.keys()))
|
| 60 |
+
|
| 61 |
+
@classmethod
|
| 62 |
+
def get_supported_model_sizes(cls) -> list[str]:
|
| 63 |
+
"""Get list of supported model sizes."""
|
| 64 |
+
return list(set(size for _, size in cls.SPACY_MODELS.keys()))
|
| 65 |
+
|
| 66 |
+
@classmethod
|
| 67 |
+
def load_reference_config(cls) -> Dict[str, Any]:
|
| 68 |
+
"""
|
| 69 |
+
Load reference lists configuration from YAML file.
|
| 70 |
+
|
| 71 |
+
Returns:
|
| 72 |
+
Configuration dictionary loaded from YAML
|
| 73 |
+
"""
|
| 74 |
+
config_path = Path(cls.REFERENCE_LISTS_CONFIG)
|
| 75 |
+
|
| 76 |
+
if not config_path.exists():
|
| 77 |
+
logger.warning(f"Reference config file not found: {config_path}")
|
| 78 |
+
return cls._get_default_config()
|
| 79 |
+
|
| 80 |
+
try:
|
| 81 |
+
with open(config_path, 'r', encoding='utf-8') as f:
|
| 82 |
+
config = yaml.safe_load(f)
|
| 83 |
+
if config is None:
|
| 84 |
+
logger.warning("Empty YAML configuration, using defaults")
|
| 85 |
+
return cls._get_default_config()
|
| 86 |
+
return config
|
| 87 |
+
except Exception as e:
|
| 88 |
+
logger.error(f"Error loading reference configuration: {e}")
|
| 89 |
+
return cls._get_default_config()
|
| 90 |
+
|
| 91 |
+
@classmethod
|
| 92 |
+
def get_corpus_configuration(cls, corpus_name: str) -> Dict[str, Any]:
|
| 93 |
+
"""
|
| 94 |
+
Get configuration for a specific corpus from YAML.
|
| 95 |
+
|
| 96 |
+
Args:
|
| 97 |
+
corpus_name: Name of the corpus to find
|
| 98 |
+
|
| 99 |
+
Returns:
|
| 100 |
+
Corpus configuration dictionary
|
| 101 |
+
"""
|
| 102 |
+
config = cls.load_reference_config()
|
| 103 |
+
|
| 104 |
+
# Search through all languages and ngram types
|
| 105 |
+
for lang_config in config.values():
|
| 106 |
+
if not isinstance(lang_config, dict):
|
| 107 |
+
continue
|
| 108 |
+
for ngram_type_config in lang_config.values():
|
| 109 |
+
if not isinstance(ngram_type_config, dict):
|
| 110 |
+
continue
|
| 111 |
+
if corpus_name in ngram_type_config:
|
| 112 |
+
return ngram_type_config[corpus_name]
|
| 113 |
+
|
| 114 |
+
logger.warning(f"Corpus configuration not found: {corpus_name}")
|
| 115 |
+
return {}
|
| 116 |
+
|
| 117 |
+
@classmethod
|
| 118 |
+
def get_corpus_columns(cls, corpus_name: str) -> Dict[str, int]:
|
| 119 |
+
"""
|
| 120 |
+
Get column mappings for a specific corpus.
|
| 121 |
+
|
| 122 |
+
Args:
|
| 123 |
+
corpus_name: Name of the corpus
|
| 124 |
+
|
| 125 |
+
Returns:
|
| 126 |
+
Dictionary mapping column names to indices
|
| 127 |
+
"""
|
| 128 |
+
corpus_config = cls.get_corpus_configuration(corpus_name)
|
| 129 |
+
return corpus_config.get('columns', {})
|
| 130 |
+
|
| 131 |
+
@classmethod
|
| 132 |
+
def is_japanese_corpus(cls, corpus_name: str) -> bool:
|
| 133 |
+
"""
|
| 134 |
+
Check if a corpus is marked as Japanese corpus.
|
| 135 |
+
|
| 136 |
+
Args:
|
| 137 |
+
corpus_name: Name of the corpus
|
| 138 |
+
|
| 139 |
+
Returns:
|
| 140 |
+
True if it's a Japanese corpus
|
| 141 |
+
"""
|
| 142 |
+
corpus_config = cls.get_corpus_configuration(corpus_name)
|
| 143 |
+
return corpus_config.get('japanese_corpus', False)
|
| 144 |
+
|
| 145 |
+
@classmethod
|
| 146 |
+
def _get_default_config(cls) -> Dict[str, Any]:
|
| 147 |
+
"""Get default configuration structure if YAML fails to load."""
|
| 148 |
+
return {
|
| 149 |
+
"english": {
|
| 150 |
+
"unigrams": {},
|
| 151 |
+
"bigrams": {},
|
| 152 |
+
"trigrams": {}
|
| 153 |
+
},
|
| 154 |
+
"japanese": {
|
| 155 |
+
"unigrams": {},
|
| 156 |
+
"bigrams": {},
|
| 157 |
+
"trigrams": {}
|
| 158 |
+
}
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
@classmethod
|
| 162 |
+
def validate_language_model_combination(cls, language: str, model_size: str) -> bool:
|
| 163 |
+
"""
|
| 164 |
+
Validate that a language/model combination is supported.
|
| 165 |
+
|
| 166 |
+
Args:
|
| 167 |
+
language: Language code
|
| 168 |
+
model_size: Model size
|
| 169 |
+
|
| 170 |
+
Returns:
|
| 171 |
+
True if combination is supported
|
| 172 |
+
"""
|
| 173 |
+
return (language, model_size) in cls.SPACY_MODELS
|
| 174 |
+
|
| 175 |
+
@classmethod
|
| 176 |
+
def get_processing_limits(cls) -> Dict[str, int]:
|
| 177 |
+
"""Get all processing limits as a dictionary."""
|
| 178 |
+
return {
|
| 179 |
+
'max_tokens_visualization': cls.MAX_TOKENS_FOR_VISUALIZATION,
|
| 180 |
+
'default_histogram_bins': cls.DEFAULT_HISTOGRAM_BINS,
|
| 181 |
+
'default_rank_bin_size': cls.DEFAULT_RANK_BIN_SIZE,
|
| 182 |
+
'max_ngram_sentence_length': cls.MAX_NGRAM_SENTENCE_LENGTH
|
| 183 |
+
}
|
text_analyzer/base_analyzer.py
ADDED
|
@@ -0,0 +1,308 @@
|
| 1 |
+
"""
|
| 2 |
+
Base analyzer module providing shared SpaCy infrastructure.
|
| 3 |
+
Eliminates code duplication and provides common functionality for all SpaCy-based analyzers.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import spacy
|
| 7 |
+
from typing import Dict, List, Any, Optional, Iterator, Tuple, TYPE_CHECKING
|
| 8 |
+
import logging
|
| 9 |
+
import tempfile
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from .app_config import AppConfig
|
| 12 |
+
from .text_utility import TextUtility
|
| 13 |
+
|
| 14 |
+
# Import UniDic extensions and enricher
|
| 15 |
+
try:
|
| 16 |
+
from . import unidic_extensions # This registers the token extensions
|
| 17 |
+
from .unidic_enricher import UniDicEnricher
|
| 18 |
+
UNIDIC_AVAILABLE = True
|
| 19 |
+
except ImportError as e:
|
| 20 |
+
logger.warning(f"UniDic integration not available: {e}")
|
| 21 |
+
UNIDIC_AVAILABLE = False
|
| 22 |
+
UniDicEnricher = None
|
| 23 |
+
|
| 24 |
+
if TYPE_CHECKING:
|
| 25 |
+
import spacy
|
| 26 |
+
|
| 27 |
+
logger = logging.getLogger(__name__)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class BaseAnalyzer:
|
| 31 |
+
"""
|
| 32 |
+
Base class for all SpaCy-based text analyzers.
|
| 33 |
+
Provides shared model loading, document processing, and utility functions.
|
| 34 |
+
"""
|
| 35 |
+
|
| 36 |
+
def __init__(self, language: str = None, model_size: str = None):
|
| 37 |
+
"""
|
| 38 |
+
Initialize the base analyzer.
|
| 39 |
+
|
| 40 |
+
Args:
|
| 41 |
+
language: Language code ('en' or 'ja')
|
| 42 |
+
model_size: Model size ('md' or 'trf')
|
| 43 |
+
"""
|
| 44 |
+
self.language = language or AppConfig.DEFAULT_LANGUAGE
|
| 45 |
+
self.model_size = model_size or AppConfig.DEFAULT_MODEL_SIZE
|
| 46 |
+
self.nlp = None
|
| 47 |
+
self._model_info = {}
|
| 48 |
+
self.unidic_enricher = None
|
| 49 |
+
|
| 50 |
+
self._load_spacy_model()
|
| 51 |
+
|
| 52 |
+
# Initialize UniDic enricher for Japanese
|
| 53 |
+
if self.language == 'ja' and UNIDIC_AVAILABLE:
|
| 54 |
+
try:
|
| 55 |
+
self.unidic_enricher = UniDicEnricher()
|
| 56 |
+
logger.info("UniDic enricher initialized for Japanese analysis")
|
| 57 |
+
except Exception as e:
|
| 58 |
+
logger.warning(f"Failed to initialize UniDic enricher: {e}")
|
| 59 |
+
self.unidic_enricher = None
|
| 60 |
+
|
| 61 |
+
def _load_spacy_model(self) -> None:
|
| 62 |
+
"""Load appropriate SpaCy model based on language and size."""
|
| 63 |
+
# Validate combination
|
| 64 |
+
if not AppConfig.validate_language_model_combination(self.language, self.model_size):
|
| 65 |
+
raise ValueError(f"Unsupported language/model combination: {self.language}/{self.model_size}")
|
| 66 |
+
|
| 67 |
+
model_name = AppConfig.get_spacy_model_name(self.language, self.model_size)
|
| 68 |
+
if not model_name:
|
| 69 |
+
raise ValueError(f"No model found for language '{self.language}' and size '{self.model_size}'")
|
| 70 |
+
|
| 71 |
+
try:
|
| 72 |
+
self.nlp = spacy.load(model_name)
|
| 73 |
+
self._model_info = {
|
| 74 |
+
'name': model_name,
|
| 75 |
+
'language': self.language,
|
| 76 |
+
'model_size': self.model_size,
|
| 77 |
+
'version': spacy.__version__
|
| 78 |
+
}
|
| 79 |
+
logger.info(f"Loaded SpaCy model: {model_name}")
|
| 80 |
+
except OSError as e:
|
| 81 |
+
error_msg = f"SpaCy model {model_name} not found. Please install it first."
|
| 82 |
+
logger.error(error_msg)
|
| 83 |
+
raise OSError(error_msg) from e
|
| 84 |
+
|
| 85 |
+
def get_model_info(self) -> Dict[str, str]:
|
| 86 |
+
"""
|
| 87 |
+
Get information about the loaded model.
|
| 88 |
+
|
| 89 |
+
Returns:
|
| 90 |
+
Dictionary with model information
|
| 91 |
+
"""
|
| 92 |
+
return self._model_info.copy()
|
| 93 |
+
|
| 94 |
+
def process_document(self, text: str) -> "spacy.Doc":
|
| 95 |
+
"""
|
| 96 |
+
Process text into a SpaCy document.
|
| 97 |
+
|
| 98 |
+
Args:
|
| 99 |
+
text: Input text to process
|
| 100 |
+
|
| 101 |
+
Returns:
|
| 102 |
+
Processed SpaCy document
|
| 103 |
+
|
| 104 |
+
Raises:
|
| 105 |
+
ValueError: If model not loaded or text processing fails
|
| 106 |
+
"""
|
| 107 |
+
if not self.nlp:
|
| 108 |
+
raise ValueError("SpaCy model not loaded")
|
| 109 |
+
|
| 110 |
+
if not text or not text.strip():
|
| 111 |
+
raise ValueError("Empty text provided")
|
| 112 |
+
|
| 113 |
+
try:
|
| 114 |
+
# Clean text before processing
|
| 115 |
+
cleaned_text = TextUtility.clean_text_input(text)
|
| 116 |
+
|
| 117 |
+
# Process with SpaCy
|
| 118 |
+
doc = self.nlp(cleaned_text)
|
| 119 |
+
|
| 120 |
+
# Add UniDic enrichment for Japanese
|
| 121 |
+
if self.unidic_enricher and self.language == 'ja':
|
| 122 |
+
try:
|
| 123 |
+
self.unidic_enricher.enrich_spacy_doc(doc, cleaned_text)
|
| 124 |
+
logger.debug("UniDic enrichment completed")
|
| 125 |
+
except Exception as e:
|
| 126 |
+
logger.warning(f"UniDic enrichment failed: {e}")
|
| 127 |
+
|
| 128 |
+
return doc
|
| 129 |
+
|
| 130 |
+
except Exception as e:
|
| 131 |
+
self.handle_processing_error(e, f"processing text of length {len(text)}")
|
| 132 |
+
raise
|
| 133 |
+
|
| 134 |
+
def handle_processing_error(self, error: Exception, context: str) -> None:
|
| 135 |
+
"""
|
| 136 |
+
Handle processing errors with appropriate logging.
|
| 137 |
+
|
| 138 |
+
Args:
|
| 139 |
+
error: The exception that occurred
|
| 140 |
+
context: Context description for the error
|
| 141 |
+
"""
|
| 142 |
+
error_msg = f"Error {context}: {error}"
|
| 143 |
+
logger.error(error_msg)
|
| 144 |
+
|
| 145 |
+
def filter_tokens(self,
|
| 146 |
+
doc: "spacy.Doc",
|
| 147 |
+
exclude_punct: bool = True,
|
| 148 |
+
exclude_space: bool = True,
|
| 149 |
+
word_type_filter: Optional[str] = None) -> List["spacy.Token"]:
|
| 150 |
+
"""
|
| 151 |
+
Filter tokens based on various criteria.
|
| 152 |
+
|
| 153 |
+
Args:
|
| 154 |
+
doc: SpaCy document
|
| 155 |
+
exclude_punct: Whether to exclude punctuation
|
| 156 |
+
exclude_space: Whether to exclude spaces
|
| 157 |
+
word_type_filter: Filter by word type ('CW', 'FW', or None)
|
| 158 |
+
|
| 159 |
+
Returns:
|
| 160 |
+
List of filtered tokens
|
| 161 |
+
"""
|
| 162 |
+
filtered_tokens = []
|
| 163 |
+
|
| 164 |
+
for token in doc:
|
| 165 |
+
# Basic filtering
|
| 166 |
+
if exclude_space and token.is_space:
|
| 167 |
+
continue
|
| 168 |
+
if exclude_punct and token.is_punct:
|
| 169 |
+
continue
|
| 170 |
+
|
| 171 |
+
# Word type filtering
|
| 172 |
+
if word_type_filter:
|
| 173 |
+
word_type = self._classify_pos(token)
|
| 174 |
+
if word_type != word_type_filter:
|
| 175 |
+
continue
|
| 176 |
+
|
| 177 |
+
filtered_tokens.append(token)
|
| 178 |
+
|
| 179 |
+
return filtered_tokens
|
| 180 |
+
|
| 181 |
+
def _classify_pos(self, token: "spacy.Token") -> str:
|
| 182 |
+
"""
|
| 183 |
+
Classify token as content word (CW) or function word (FW).
|
| 184 |
+
|
| 185 |
+
Args:
|
| 186 |
+
token: SpaCy token object
|
| 187 |
+
|
| 188 |
+
Returns:
|
| 189 |
+
'CW' for content words, 'FW' for function words
|
| 190 |
+
"""
|
| 191 |
+
content_pos = {'NOUN', 'VERB', 'ADJ', 'ADV'}
|
| 192 |
+
function_pos = {'DET', 'PRON', 'ADP', 'CONJ', 'CCONJ', 'SCONJ'}
|
| 193 |
+
|
| 194 |
+
if token.pos_ in content_pos:
|
| 195 |
+
return 'CW'
|
| 196 |
+
elif token.pos_ in function_pos:
|
| 197 |
+
return 'FW'
|
| 198 |
+
else:
|
| 199 |
+
# Default classification for ambiguous cases
|
| 200 |
+
return 'CW' if token.pos_ not in {'PUNCT', 'SPACE', 'X'} else 'FW'
|
| 201 |
+
|
| 202 |
+
def format_token_for_display(self, token: "spacy.Token", include_syntax: bool = True) -> Dict[str, Any]:
|
| 203 |
+
"""
|
| 204 |
+
Format token for UI display - only call when needed for output.
|
| 205 |
+
|
| 206 |
+
Args:
|
| 207 |
+
token: SpaCy token
|
| 208 |
+
include_syntax: Whether to include syntactic information (dep_, head, etc.)
|
| 209 |
+
|
| 210 |
+
Returns:
|
| 211 |
+
Formatted token data dictionary for display
|
| 212 |
+
"""
|
| 213 |
+
result = {
|
| 214 |
+
'token': token.text,
|
| 215 |
+
'lemma': token.lemma_,
|
| 216 |
+
'pos': token.pos_,
|
| 217 |
+
'tag': token.tag_,
|
| 218 |
+
'word_type': self._classify_pos(token)
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
if include_syntax:
|
| 222 |
+
result.update({
|
| 223 |
+
'dep_': token.dep_,
|
| 224 |
+
'head_text': token.head.text,
|
| 225 |
+
'head_pos': token.head.pos_,
|
| 226 |
+
})
|
| 227 |
+
|
| 228 |
+
return result
|
| 229 |
+
|
| 230 |
+
def get_syntactic_context(self, token: "spacy.Token") -> Dict[str, Any]:
|
| 231 |
+
"""
|
| 232 |
+
Get comprehensive syntactic relationships for a token.
|
| 233 |
+
|
| 234 |
+
Args:
|
| 235 |
+
token: SpaCy token
|
| 236 |
+
|
| 237 |
+
Returns:
|
| 238 |
+
Dictionary with syntactic context information
|
| 239 |
+
"""
|
| 240 |
+
return {
|
| 241 |
+
'dep_': token.dep_,
|
| 242 |
+
'head': token.head,
|
| 243 |
+
'children': list(token.children),
|
| 244 |
+
'ancestors': list(token.ancestors),
|
| 245 |
+
'subtree_span': token.subtree,
|
| 246 |
+
'left_edge': token.left_edge,
|
| 247 |
+
'right_edge': token.right_edge
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
def process_sentences(self,
|
| 251 |
+
doc: "spacy.Doc",
|
| 252 |
+
max_tokens: Optional[int] = None) -> List["spacy.Span"]:
|
| 253 |
+
"""
|
| 254 |
+
Process sentences with optional token limits.
|
| 255 |
+
|
| 256 |
+
Args:
|
| 257 |
+
doc: SpaCy document
|
| 258 |
+
max_tokens: Maximum tokens per sentence (uses config default if None)
|
| 259 |
+
|
| 260 |
+
Returns:
|
| 261 |
+
List of sentence spans
|
| 262 |
+
"""
|
| 263 |
+
max_tokens = max_tokens or AppConfig.MAX_TOKENS_FOR_VISUALIZATION
|
| 264 |
+
|
| 265 |
+
processed_sentences = []
|
| 266 |
+
for sent in doc.sents:
|
| 267 |
+
# Filter tokens (exclude spaces for counting)
|
| 268 |
+
sent_tokens = [token for token in sent if not token.is_space]
|
| 269 |
+
|
| 270 |
+
if len(sent_tokens) > max_tokens:
|
| 271 |
+
# Truncate sentence
|
| 272 |
+
truncated_tokens = sent_tokens[:max_tokens]
|
| 273 |
+
# Create new span with truncated tokens
|
| 274 |
+
start_idx = truncated_tokens[0].i
|
| 275 |
+
end_idx = truncated_tokens[-1].i + 1
|
| 276 |
+
truncated_span = doc[start_idx:end_idx]
|
| 277 |
+
processed_sentences.append(truncated_span)
|
| 278 |
+
else:
|
| 279 |
+
processed_sentences.append(sent)
|
| 280 |
+
|
| 281 |
+
return processed_sentences
|
| 282 |
+
|
| 283 |
+
def setup_batch_processing(self, file_paths: List[str]) -> Iterator[Tuple[str, str]]:
|
| 284 |
+
"""
|
| 285 |
+
Set up batch processing for multiple files.
|
| 286 |
+
|
| 287 |
+
Args:
|
| 288 |
+
file_paths: List of file paths to process
|
| 289 |
+
|
| 290 |
+
Yields:
|
| 291 |
+
Tuples of (file_path, text_content)
|
| 292 |
+
"""
|
| 293 |
+
for file_path in file_paths:
|
| 294 |
+
try:
|
| 295 |
+
text_content = TextUtility.extract_text_from_file(file_path)
|
| 296 |
+
yield file_path, text_content
|
| 297 |
+
except Exception as e:
|
| 298 |
+
logger.error(f"Error processing file {file_path}: {e}")
|
| 299 |
+
yield file_path, f"ERROR: {e}"
|
| 300 |
+
|
| 301 |
+
def cleanup_batch_processing(self, temp_files: List[str]) -> None:
|
| 302 |
+
"""
|
| 303 |
+
Clean up temporary files from batch processing.
|
| 304 |
+
|
| 305 |
+
Args:
|
| 306 |
+
temp_files: List of temporary file paths
|
| 307 |
+
"""
|
| 308 |
+
TextUtility.cleanup_temp_files(temp_files)
|
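A short usage sketch for the BaseAnalyzer added above (assumptions: the text_analyzer package is importable, the configured spaCy model such as ja_core_news_md is installed, the text_utility helpers behave as their names suggest, and the sample sentence is illustrative):

    from text_analyzer.base_analyzer import BaseAnalyzer

    analyzer = BaseAnalyzer(language='ja', model_size='md')
    doc = analyzer.process_document('猫が静かに歩いた。')
    content_words = analyzer.filter_tokens(doc, word_type_filter='CW')
    print(analyzer.get_model_info())
    print([t.text for t in content_words])

Subclasses such as the lexical sophistication analyzer changed later in this commit reuse this model loading, document processing, and token filtering instead of re-implementing it.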
text_analyzer/frequency_analyzer.py
ADDED
|
@@ -0,0 +1,653 @@
| 1 |
+
"""
|
| 2 |
+
Frequency Analysis Module for Word Frequency Visualization
|
| 3 |
+
|
| 4 |
+
This module provides functionality to analyze word frequency data from various file formats,
|
| 5 |
+
create histogram data, and sample representative words for each frequency bin.
|
| 6 |
+
Supports flexible column mapping for diverse frequency data formats.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import numpy as np
|
| 11 |
+
from typing import Any, Dict, List, Tuple, Optional, Union
|
| 12 |
+
import logging
|
| 13 |
+
import random
|
| 14 |
+
from io import StringIO
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class FrequencyAnalyzer:
|
| 20 |
+
"""
|
| 21 |
+
Analyzes word frequency data and provides visualization-ready outputs.
|
| 22 |
+
|
| 23 |
+
Supports flexible column mapping for various frequency data formats.
|
| 24 |
+
Can handle both traditional 'Type'/'Freq' format and modern multi-column formats.
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
# Default column names to try for auto-detection
|
| 28 |
+
DEFAULT_WORD_COLUMNS = ['lForm', 'lemma', 'word', 'Type', 'surface_form']
|
| 29 |
+
DEFAULT_FREQUENCY_COLUMNS = ['frequency', 'freq', 'Freq', 'pmw', 'NormFreq']
|
| 30 |
+
DEFAULT_POS_COLUMNS = ['pos', 'POS', 'tag']
|
| 31 |
+
|
| 32 |
+
def __init__(self, file_size_limit_mb: int = 300):
|
| 33 |
+
"""
|
| 34 |
+
Initialize the frequency analyzer.
|
| 35 |
+
|
| 36 |
+
Args:
|
| 37 |
+
file_size_limit_mb: Maximum file size limit in MB for uploads
|
| 38 |
+
"""
|
| 39 |
+
self.data = None
|
| 40 |
+
self.original_data = None
|
| 41 |
+
self.column_config = None
|
| 42 |
+
self.file_size_limit = file_size_limit_mb * 1024 * 1024
|
| 43 |
+
self.detected_columns = None
|
| 44 |
+
|
| 45 |
+
def detect_file_format(self, content: Union[str, bytes]) -> Dict[str, Any]:
|
| 46 |
+
"""
|
| 47 |
+
Detect file format and separator.
|
| 48 |
+
|
| 49 |
+
Args:
|
| 50 |
+
content: File content as string or bytes
|
| 51 |
+
|
| 52 |
+
Returns:
|
| 53 |
+
Dict with format information
|
| 54 |
+
"""
|
| 55 |
+
if isinstance(content, bytes):
|
| 56 |
+
content = content.decode('utf-8')
|
| 57 |
+
|
| 58 |
+
# Check file size
|
| 59 |
+
if len(content.encode('utf-8')) > self.file_size_limit:
|
| 60 |
+
raise ValueError(f"File too large. Maximum size is {self.file_size_limit // (1024*1024)}MB")
|
| 61 |
+
|
| 62 |
+
# Detect separator by checking first few lines
|
| 63 |
+
lines = content.strip().split('\n')[:5]
|
| 64 |
+
separators = ['\t', ',', ';', '|']
|
| 65 |
+
best_sep = '\t'
|
| 66 |
+
max_columns = 0
|
| 67 |
+
|
| 68 |
+
for sep in separators:
|
| 69 |
+
avg_cols = np.mean([len(line.split(sep)) for line in lines])
|
| 70 |
+
if avg_cols > max_columns:
|
| 71 |
+
max_columns = avg_cols
|
| 72 |
+
best_sep = sep
|
| 73 |
+
|
| 74 |
+
# Detect if first row is header
|
| 75 |
+
first_line = lines[0].split(best_sep)
|
| 76 |
+
second_line = lines[1].split(best_sep) if len(lines) > 1 else []
|
| 77 |
+
|
| 78 |
+
# Simple heuristic: if first row contains mostly strings and second row has numbers
|
| 79 |
+
has_header = True
|
| 80 |
+
if len(second_line) > 0:
|
| 81 |
+
try:
|
| 82 |
+
# Try to convert second row elements to numbers
|
| 83 |
+
numeric_count = sum(1 for x in second_line if self._is_numeric(x.strip()))
|
| 84 |
+
if numeric_count > len(second_line) * 0.3: # If >30% are numeric
|
| 85 |
+
has_header = True
|
| 86 |
+
except Exception:
|
| 87 |
+
has_header = False
|
| 88 |
+
|
| 89 |
+
return {
|
| 90 |
+
'separator': best_sep,
|
| 91 |
+
'has_header': has_header,
|
| 92 |
+
'estimated_columns': int(max_columns),
|
| 93 |
+
'sample_lines': lines[:3]
|
| 94 |
+
}
|
| 95 |
+
|
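    # Illustrative note (hypothetical sample, not part of the original file): for a
    # small tab-separated file whose second row is mostly numeric, detect_file_format()
    # returns a dict shaped like
    #   {'separator': '\t', 'has_header': True, 'estimated_columns': 3, 'sample_lines': [...]}
    # which load_frequency_data() falls back to when no 'separator' is supplied in
    # the column configuration.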
| 96 |
+
def _is_numeric(self, value: str) -> bool:
|
| 97 |
+
"""Check if a string value is numeric."""
|
| 98 |
+
try:
|
| 99 |
+
float(value)
|
| 100 |
+
return True
|
| 101 |
+
except (ValueError, TypeError):
|
| 102 |
+
return False
|
| 103 |
+
|
| 104 |
+
def detect_columns(self, df: pd.DataFrame) -> Dict[str, List[str]]:
|
| 105 |
+
"""
|
| 106 |
+
Detect and categorize columns by data type and content.
|
| 107 |
+
|
| 108 |
+
Args:
|
| 109 |
+
df: DataFrame to analyze
|
| 110 |
+
|
| 111 |
+
Returns:
|
| 112 |
+
Dict with categorized column lists
|
| 113 |
+
"""
|
| 114 |
+
word_candidates = []
|
| 115 |
+
frequency_candidates = []
|
| 116 |
+
pos_candidates = []
|
| 117 |
+
other_columns = []
|
| 118 |
+
|
| 119 |
+
for col in df.columns:
|
| 120 |
+
col_str = str(col).lower()
|
| 121 |
+
|
| 122 |
+
# Check if column contains string data (potential word column)
|
| 123 |
+
if df[col].dtype == 'object':
|
| 124 |
+
# Check if it looks like words (not mostly numbers)
|
| 125 |
+
sample_values = df[col].dropna().head(100)
|
| 126 |
+
if len(sample_values) > 0:
|
| 127 |
+
non_numeric_ratio = sum(1 for x in sample_values if not self._is_numeric(str(x))) / len(sample_values)
|
| 128 |
+
if non_numeric_ratio > 0.8: # >80% non-numeric
|
| 129 |
+
if any(word in col_str for word in ['form', 'lemma', 'word', 'type']):
|
| 130 |
+
word_candidates.append(col)
|
| 131 |
+
elif any(pos in col_str for pos in ['pos', 'tag', 'part']):
|
| 132 |
+
pos_candidates.append(col)
|
| 133 |
+
else:
|
| 134 |
+
word_candidates.append(col) # Default string columns to word candidates
|
| 135 |
+
|
| 136 |
+
# Check if column contains numeric data (potential frequency column)
|
| 137 |
+
elif pd.api.types.is_numeric_dtype(df[col]):
|
| 138 |
+
# Skip rank columns (usually sequential integers starting from 1)
|
| 139 |
+
if col_str in ['rank', 'index'] or (df[col].equals(pd.Series(range(1, len(df) + 1)))):
|
| 140 |
+
other_columns.append(col)
|
| 141 |
+
else:
|
| 142 |
+
frequency_candidates.append(col)
|
| 143 |
+
|
| 144 |
+
else:
|
| 145 |
+
other_columns.append(col)
|
| 146 |
+
|
| 147 |
+
# Sort candidates by preference based on common naming patterns
|
| 148 |
+
word_candidates = self._sort_by_preference(word_candidates, self.DEFAULT_WORD_COLUMNS)
|
| 149 |
+
frequency_candidates = self._sort_by_preference(frequency_candidates, self.DEFAULT_FREQUENCY_COLUMNS)
|
| 150 |
+
pos_candidates = self._sort_by_preference(pos_candidates, self.DEFAULT_POS_COLUMNS)
|
| 151 |
+
|
| 152 |
+
return {
|
| 153 |
+
'word_columns': word_candidates,
|
| 154 |
+
'frequency_columns': frequency_candidates,
|
| 155 |
+
'pos_columns': pos_candidates,
|
| 156 |
+
'other_columns': other_columns
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
def _sort_by_preference(self, columns: List[str], preferred_order: List[str]) -> List[str]:
|
| 160 |
+
"""Sort columns by preference order."""
|
| 161 |
+
sorted_cols = []
|
| 162 |
+
remaining_cols = columns.copy()
|
| 163 |
+
|
| 164 |
+
# Add preferred columns first
|
| 165 |
+
for pref in preferred_order:
|
| 166 |
+
for col in columns:
|
| 167 |
+
if pref.lower() in str(col).lower() and col in remaining_cols:
|
| 168 |
+
sorted_cols.append(col)
|
| 169 |
+
remaining_cols.remove(col)
|
| 170 |
+
break
|
| 171 |
+
|
| 172 |
+
# Add remaining columns
|
| 173 |
+
sorted_cols.extend(remaining_cols)
|
| 174 |
+
return sorted_cols
|
| 175 |
+
|
| 176 |
+
def load_frequency_data(self, content: Union[str, bytes], column_config: Dict[str, str]) -> pd.DataFrame:
|
| 177 |
+
"""
|
| 178 |
+
Load and validate frequency data with flexible column mapping.
|
| 179 |
+
|
| 180 |
+
Args:
|
| 181 |
+
content: File content as string or bytes
|
| 182 |
+
column_config: Column mapping configuration
|
| 183 |
+
{
|
| 184 |
+
'word_column': 'lForm',
|
| 185 |
+
'frequency_column': 'frequency',
|
| 186 |
+
'pos_column': 'pos', # optional
|
| 187 |
+
'separator': '\t' # optional, will auto-detect if not provided
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
Returns:
|
| 191 |
+
pd.DataFrame: Loaded and validated frequency data
|
| 192 |
+
|
| 193 |
+
Raises:
|
| 194 |
+
ValueError: If data format is invalid or columns not found
|
| 195 |
+
"""
|
| 196 |
+
try:
|
| 197 |
+
# Handle both string and bytes input
|
| 198 |
+
if isinstance(content, bytes):
|
| 199 |
+
content = content.decode('utf-8')
|
| 200 |
+
|
| 201 |
+
# Auto-detect format if separator not provided
|
| 202 |
+
if 'separator' not in column_config:
|
| 203 |
+
format_info = self.detect_file_format(content)
|
| 204 |
+
separator = format_info['separator']
|
| 205 |
+
has_header = format_info['has_header']
|
| 206 |
+
else:
|
| 207 |
+
separator = column_config['separator']
|
| 208 |
+
has_header = column_config.get('has_header', True)
|
| 209 |
+
|
| 210 |
+
# Read data
|
| 211 |
+
df = pd.read_csv(StringIO(content), sep=separator, header=0 if has_header else None)
|
| 212 |
+
|
| 213 |
+
# Store column configuration
|
| 214 |
+
self.column_config = column_config.copy()
|
| 215 |
+
self.column_config['separator'] = separator
|
| 216 |
+
self.column_config['has_header'] = has_header
|
| 217 |
+
|
| 218 |
+
# Detect available columns
|
| 219 |
+
self.detected_columns = self.detect_columns(df)
|
| 220 |
+
|
| 221 |
+
# Validate column configuration
|
| 222 |
+
if not self.validate_column_config(df, column_config):
|
| 223 |
+
raise ValueError("Invalid column configuration")
|
| 224 |
+
|
| 225 |
+
# Clean and prepare data with flexible column mapping
|
| 226 |
+
df = self._clean_data_flexible(df, column_config)
|
| 227 |
+
|
| 228 |
+
# Store data
|
| 229 |
+
self.original_data = df.copy()
|
| 230 |
+
self.data = df
|
| 231 |
+
|
| 232 |
+
logger.info(f"Loaded {len(df)} frequency entries with columns: {list(df.columns)}")
|
| 233 |
+
return df
|
| 234 |
+
|
| 235 |
+
except Exception as e:
|
| 236 |
+
logger.error(f"Error loading frequency data: {str(e)}")
|
| 237 |
+
raise ValueError(f"Failed to load frequency data: {str(e)}")
|
| 238 |
+
|
| 239 |
+
def validate_column_config(self, df: pd.DataFrame, column_config: Dict[str, str]) -> bool:
|
| 240 |
+
"""
|
| 241 |
+
Validate that the specified columns exist and contain appropriate data.
|
| 242 |
+
|
| 243 |
+
Args:
|
| 244 |
+
df: DataFrame to validate
|
| 245 |
+
column_config: Column configuration
|
| 246 |
+
|
| 247 |
+
Returns:
|
| 248 |
+
bool: True if configuration is valid
|
| 249 |
+
"""
|
| 250 |
+
# Check required columns exist
|
| 251 |
+
word_col = column_config.get('word_column')
|
| 252 |
+
freq_col = column_config.get('frequency_column')
|
| 253 |
+
|
| 254 |
+
if not word_col or word_col not in df.columns:
|
| 255 |
+
logger.error(f"Word column '{word_col}' not found in data")
|
| 256 |
+
return False
|
| 257 |
+
|
| 258 |
+
if not freq_col or freq_col not in df.columns:
|
| 259 |
+
logger.error(f"Frequency column '{freq_col}' not found in data")
|
| 260 |
+
return False
|
| 261 |
+
|
| 262 |
+
# Check that word column contains string data
|
| 263 |
+
if df[word_col].dtype != 'object':
|
| 264 |
+
logger.error(f"Word column '{word_col}' must contain text data")
|
| 265 |
+
return False
|
| 266 |
+
|
| 267 |
+
# Check that frequency column contains numeric data
|
| 268 |
+
if not pd.api.types.is_numeric_dtype(df[freq_col]):
|
| 269 |
+
logger.error(f"Frequency column '{freq_col}' must contain numeric data")
|
| 270 |
+
return False
|
| 271 |
+
|
| 272 |
+
# Check optional POS column if specified
|
| 273 |
+
pos_col = column_config.get('pos_column')
|
| 274 |
+
if pos_col and pos_col not in df.columns:
|
| 275 |
+
logger.warning(f"POS column '{pos_col}' not found in data, skipping")
|
| 276 |
+
|
| 277 |
+
return True
|
| 278 |
+
|
| 279 |
+
def _clean_data_flexible(self, df: pd.DataFrame, column_config: Dict[str, str]) -> pd.DataFrame:
|
| 280 |
+
"""
|
| 281 |
+
Clean and prepare the frequency data with flexible column mapping.
|
| 282 |
+
|
| 283 |
+
Args:
|
| 284 |
+
df: Raw DataFrame
|
| 285 |
+
column_config: Column configuration
|
| 286 |
+
|
| 287 |
+
Returns:
|
| 288 |
+
pd.DataFrame: Cleaned DataFrame with standardized column names
|
| 289 |
+
"""
|
| 290 |
+
word_col = column_config['word_column']
|
| 291 |
+
freq_col = column_config['frequency_column']
|
| 292 |
+
pos_col = column_config.get('pos_column')
|
| 293 |
+
|
| 294 |
+
# Create a copy and rename columns to standard names for compatibility
|
| 295 |
+
df_clean = df.copy()
|
| 296 |
+
|
| 297 |
+
# Remove rows with missing word or frequency data
|
| 298 |
+
df_clean = df_clean.dropna(subset=[word_col, freq_col])
|
| 299 |
+
|
| 300 |
+
# Ensure frequency is numeric
|
| 301 |
+
df_clean[freq_col] = pd.to_numeric(df_clean[freq_col], errors='coerce')
|
| 302 |
+
df_clean = df_clean.dropna(subset=[freq_col])
|
| 303 |
+
|
| 304 |
+
# Remove zero or negative frequencies
|
| 305 |
+
df_clean = df_clean[df_clean[freq_col] > 0]
|
| 306 |
+
|
| 307 |
+
# Clean word column (remove extra whitespace)
|
| 308 |
+
df_clean[word_col] = df_clean[word_col].astype(str).str.strip()
|
| 309 |
+
|
| 310 |
+
# Add standardized column names for backward compatibility
|
| 311 |
+
df_clean['Type'] = df_clean[word_col]
|
| 312 |
+
df_clean['Freq'] = df_clean[freq_col]
|
| 313 |
+
|
| 314 |
+
# Add POS column if available
|
| 315 |
+
if pos_col and pos_col in df_clean.columns:
|
| 316 |
+
df_clean['POS'] = df_clean[pos_col]
|
| 317 |
+
|
| 318 |
+
# Sort by frequency (descending) for better analysis
|
| 319 |
+
df_clean = df_clean.sort_values(freq_col, ascending=False).reset_index(drop=True)
|
| 320 |
+
|
| 321 |
+
return df_clean
|
| 322 |
+
|
| 323 |
+
def get_available_frequency_columns(self) -> List[str]:
|
| 324 |
+
"""
|
| 325 |
+
Get list of available frequency columns for analysis.
|
| 326 |
+
|
| 327 |
+
Returns:
|
| 328 |
+
List[str]: Available frequency columns from the detected columns
|
| 329 |
+
"""
|
| 330 |
+
if self.detected_columns is None:
|
| 331 |
+
return []
|
| 332 |
+
|
| 333 |
+
return self.detected_columns.get('frequency_columns', [])
|
| 334 |
+
|
| 335 |
+
def get_available_word_columns(self) -> List[str]:
|
| 336 |
+
"""
|
| 337 |
+
Get list of available word columns.
|
| 338 |
+
|
| 339 |
+
Returns:
|
| 340 |
+
List[str]: Available word columns from the detected columns
|
| 341 |
+
"""
|
| 342 |
+
if self.detected_columns is None:
|
| 343 |
+
return []
|
| 344 |
+
|
| 345 |
+
return self.detected_columns.get('word_columns', [])
|
| 346 |
+
|
| 347 |
+
def create_multi_frequency_analysis(self, frequency_columns: List[str], bin_size: int = 500, log_transform: bool = False) -> Dict[str, Dict]:
|
| 348 |
+
"""
|
| 349 |
+
Create rank-based analysis for multiple frequency columns.
|
| 350 |
+
|
| 351 |
+
Args:
|
| 352 |
+
frequency_columns: List of frequency column names to analyze
|
| 353 |
+
bin_size: Number of words per rank group
|
| 354 |
+
log_transform: Whether to apply log10 transformation
|
| 355 |
+
|
| 356 |
+
Returns:
|
| 357 |
+
Dict mapping column names to their analysis results
|
| 358 |
+
"""
|
| 359 |
+
if self.original_data is None:
|
| 360 |
+
raise ValueError("No data loaded")
|
| 361 |
+
|
| 362 |
+
results = {}
|
| 363 |
+
|
| 364 |
+
for freq_col in frequency_columns:
|
| 365 |
+
if freq_col not in self.original_data.columns:
|
| 366 |
+
logger.warning(f"Frequency column '{freq_col}' not found, skipping")
|
| 367 |
+
continue
|
| 368 |
+
|
| 369 |
+
try:
|
| 370 |
+
# Create analysis for this frequency column
|
| 371 |
+
analysis = self.create_rank_based_visualization_flexible(
|
| 372 |
+
column=freq_col,
|
| 373 |
+
bin_size=bin_size,
|
| 374 |
+
log_transform=log_transform
|
| 375 |
+
)
|
| 376 |
+
results[freq_col] = analysis
|
| 377 |
+
except Exception as e:
|
| 378 |
+
logger.error(f"Error analyzing column '{freq_col}': {e}")
|
| 379 |
+
continue
|
| 380 |
+
|
| 381 |
+
return results
|
| 382 |
+
|
| 383 |
+
def create_rank_based_visualization_flexible(self, column: str, bin_size: int = 500, log_transform: bool = False, max_words_to_retain: Optional[int] = None) -> Dict:
|
| 384 |
+
"""
|
| 385 |
+
Create rank-based visualization with flexible column support.
|
| 386 |
+
|
| 387 |
+
Args:
|
| 388 |
+
column: Column name to analyze (can be any numeric column)
|
| 389 |
+
bin_size: Number of words per rank group
|
| 390 |
+
log_transform: Whether to apply log10 transformation
|
| 391 |
+
max_words_to_retain: Maximum number of top frequent words to retain for analysis
|
| 392 |
+
|
| 393 |
+
Returns:
|
| 394 |
+
Dict: Rank-based visualization data
|
| 395 |
+
"""
|
| 396 |
+
if self.original_data is None:
|
| 397 |
+
raise ValueError("No data loaded")
|
| 398 |
+
|
| 399 |
+
if column not in self.original_data.columns:
|
| 400 |
+
raise ValueError(f"Column '{column}' not found in data")
|
| 401 |
+
|
| 402 |
+
# Get word column from config or use default
|
| 403 |
+
word_col = self.column_config.get('word_column', 'Type') if self.column_config else 'Type'
|
| 404 |
+
if word_col not in self.original_data.columns:
|
| 405 |
+
word_col = 'Type' # Fallback to standardized column
|
| 406 |
+
|
| 407 |
+
# Sort by the specified frequency column (descending)
|
| 408 |
+
sorted_data = self.original_data.sort_values(column, ascending=False).reset_index(drop=True)
|
| 409 |
+
|
| 410 |
+
# Apply word limit if specified
|
| 411 |
+
if max_words_to_retain and max_words_to_retain < len(sorted_data):
|
| 412 |
+
sorted_data = sorted_data.head(max_words_to_retain)
|
| 413 |
+
logger.info(f"Limited analysis to top {max_words_to_retain} words")
|
| 414 |
+
|
| 415 |
+
# Create bins by slicing exactly bin_size words
|
| 416 |
+
group_labels = []
|
| 417 |
+
group_centers = []
|
| 418 |
+
avg_frequencies = []
|
| 419 |
+
sample_words = {}
|
| 420 |
+
group_stats_list = []
|
| 421 |
+
|
| 422 |
+
# Limit to top 20 bins for better UI performance
|
| 423 |
+
max_display_bins = 20
|
| 424 |
+
|
| 425 |
+
for i in range(0, len(sorted_data), bin_size):
|
| 426 |
+
if len(group_labels) >= max_display_bins:
|
| 427 |
+
break
|
| 428 |
+
|
| 429 |
+
end_idx = min(i + bin_size, len(sorted_data))
|
| 430 |
+
bin_data = sorted_data[i:end_idx]
|
| 431 |
+
|
| 432 |
+
# Calculate group boundaries
|
| 433 |
+
start_rank = i + 1
|
| 434 |
+
end_rank = end_idx
|
| 435 |
+
group_label = f"{start_rank}-{end_rank}"
|
| 436 |
+
group_labels.append(group_label)
|
| 437 |
+
group_centers.append((start_rank + end_rank) / 2)
|
| 438 |
+
|
| 439 |
+
# Calculate average frequency
|
| 440 |
+
avg_freq = bin_data[column].mean()
|
| 441 |
+
if log_transform:
|
| 442 |
+
avg_freq = np.log10(avg_freq + 1e-10)
|
| 443 |
+
avg_frequencies.append(avg_freq)
|
| 444 |
+
|
| 445 |
+
# Get sample words (5 randomly sampled from this bin)
|
| 446 |
+
n_samples = min(5, len(bin_data))
|
| 447 |
+
if n_samples > 0:
|
| 448 |
+
if n_samples == len(bin_data):
|
| 449 |
+
# If fewer than 5 words, take all
|
| 450 |
+
sample_word_list = bin_data[word_col].tolist()
|
| 451 |
+
else:
|
| 452 |
+
# Randomly sample 5 words
|
| 453 |
+
sample_indices = random.sample(range(len(bin_data)), n_samples)
|
| 454 |
+
sample_word_list = [bin_data.iloc[idx][word_col] for idx in sample_indices]
|
| 455 |
+
else:
|
| 456 |
+
sample_word_list = []
|
| 457 |
+
|
| 458 |
+
group_idx = len(group_labels) - 1
|
| 459 |
+
sample_words[group_idx] = [{'word': word, 'group': group_label} for word in sample_word_list]
|
| 460 |
+
|
| 461 |
+
# Store group statistics
|
| 462 |
+
group_stats_list.append({
|
| 463 |
+
'group_idx': group_idx,
|
| 464 |
+
f'{column}_mean': bin_data[column].mean(),
|
| 465 |
+
f'{column}_count': len(bin_data),
|
| 466 |
+
f'{column}_min': bin_data[column].min(),
|
| 467 |
+
f'{column}_max': bin_data[column].max(),
|
| 468 |
+
'start_rank': start_rank,
|
| 469 |
+
'end_rank': end_rank
|
| 470 |
+
})
|
| 471 |
+
|
| 472 |
+
# Create a DataFrame for group stats
|
| 473 |
+
group_stats = pd.DataFrame(group_stats_list)
|
| 474 |
+
|
| 475 |
+
# Create title suffix with word limit info
|
| 476 |
+
title_parts = [f"Bin Size: {bin_size}"]
|
| 477 |
+
if max_words_to_retain:
|
| 478 |
+
title_parts.append(f"Top {max_words_to_retain:,} words")
|
| 479 |
+
title_parts.append(f"{'Log₁₀ ' if log_transform else ''}{column}")
|
| 480 |
+
title_suffix = " (" + ", ".join(title_parts) + ")"
|
| 481 |
+
|
| 482 |
+
return {
|
| 483 |
+
'group_labels': group_labels,
|
| 484 |
+
'group_centers': group_centers,
|
| 485 |
+
'avg_frequencies': avg_frequencies,
|
| 486 |
+
'group_stats': group_stats,
|
| 487 |
+
'sample_words': sample_words,
|
| 488 |
+
'bin_size': bin_size,
|
| 489 |
+
'column': column,
|
| 490 |
+
'log_transform': log_transform,
|
| 491 |
+
'max_words_to_retain': max_words_to_retain,
|
| 492 |
+
'total_groups': len(group_labels),
|
| 493 |
+
'title_suffix': title_suffix,
|
| 494 |
+
'x_label': f"Rank Groups (bin size: {bin_size})",
|
| 495 |
+
'y_label': f"{'Log₁₀ ' if log_transform else ''}Average {column}"
|
| 496 |
+
}
|
| 497 |
+
|
| 498 |
+
# Legacy methods for backward compatibility
|
| 499 |
+
def validate_format(self, df: pd.DataFrame) -> bool:
|
| 500 |
+
"""Legacy method for backward compatibility."""
|
| 501 |
+
return 'Type' in df.columns and 'Freq' in df.columns
|
| 502 |
+
|
| 503 |
+
def get_available_columns(self) -> List[str]:
|
| 504 |
+
"""Legacy method for backward compatibility."""
|
| 505 |
+
if self.data is None:
|
| 506 |
+
return []
|
| 507 |
+
|
| 508 |
+
freq_columns = []
|
| 509 |
+
if 'Freq' in self.data.columns:
|
| 510 |
+
freq_columns.append('Freq')
|
| 511 |
+
if 'NormFreq' in self.data.columns:
|
| 512 |
+
freq_columns.append('NormFreq')
|
| 513 |
+
|
| 514 |
+
return freq_columns
|
| 515 |
+
|
| 516 |
+
def create_histogram_data(self, column: str = 'Freq', bins: int = 25, log_transform: bool = False) -> Dict:
|
| 517 |
+
"""Legacy histogram method for backward compatibility."""
|
| 518 |
+
if self.data is None:
|
| 519 |
+
raise ValueError("No data loaded")
|
| 520 |
+
|
| 521 |
+
if column not in self.data.columns:
|
| 522 |
+
raise ValueError(f"Column '{column}' not found in data")
|
| 523 |
+
|
| 524 |
+
# Get frequency values
|
| 525 |
+
freq_values = self.data[column].copy()
|
| 526 |
+
|
| 527 |
+
# Apply log transformation if requested
|
| 528 |
+
if log_transform:
|
| 529 |
+
freq_values = np.log10(freq_values + 1e-10)
|
| 530 |
+
title_suffix = f" (Log₁₀ {column})"
|
| 531 |
+
x_label = f"Log₁₀ {column}"
|
| 532 |
+
else:
|
| 533 |
+
title_suffix = f" ({column})"
|
| 534 |
+
x_label = column
|
| 535 |
+
|
| 536 |
+
# Create histogram
|
| 537 |
+
counts, bin_edges = np.histogram(freq_values, bins=bins)
|
| 538 |
+
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
|
| 539 |
+
bin_widths = bin_edges[1:] - bin_edges[:-1]
|
| 540 |
+
|
| 541 |
+
return {
|
| 542 |
+
'counts': counts,
|
| 543 |
+
'bin_edges': bin_edges,
|
| 544 |
+
'bin_centers': bin_centers,
|
| 545 |
+
'bin_widths': bin_widths,
|
| 546 |
+
'freq_values': freq_values,
|
| 547 |
+
'original_column': column,
|
| 548 |
+
'log_transform': log_transform,
|
| 549 |
+
'title_suffix': title_suffix,
|
| 550 |
+
'x_label': x_label,
|
| 551 |
+
'total_words': len(self.data)
|
| 552 |
+
}
|
| 553 |
+
|
| 554 |
+
def sample_words_per_bin(self, histogram_data: Dict, samples_per_bin: int = 5) -> Dict[int, List[Dict]]:
|
| 555 |
+
"""Legacy word sampling method for backward compatibility."""
|
| 556 |
+
if self.data is None:
|
| 557 |
+
raise ValueError("No data loaded")
|
| 558 |
+
|
| 559 |
+
bin_edges = histogram_data['bin_edges']
|
| 560 |
+
freq_values = histogram_data['freq_values']
|
| 561 |
+
original_column = histogram_data['original_column']
|
| 562 |
+
|
| 563 |
+
sampled_words = {}
|
| 564 |
+
|
| 565 |
+
for i in range(len(bin_edges) - 1):
|
| 566 |
+
bin_start = bin_edges[i]
|
| 567 |
+
bin_end = bin_edges[i + 1]
|
| 568 |
+
|
| 569 |
+
# Find words in this bin
|
| 570 |
+
if i == len(bin_edges) - 2: # Last bin, include right edge
|
| 571 |
+
mask = (freq_values >= bin_start) & (freq_values <= bin_end)
|
| 572 |
+
else:
|
| 573 |
+
mask = (freq_values >= bin_start) & (freq_values < bin_end)
|
| 574 |
+
|
| 575 |
+
bin_words = self.data[mask]
|
| 576 |
+
|
| 577 |
+
if len(bin_words) > 0:
|
| 578 |
+
# Sample words (up to samples_per_bin)
|
| 579 |
+
n_samples = min(samples_per_bin, len(bin_words))
|
| 580 |
+
sampled = bin_words.sample(n=n_samples, random_state=42)
|
| 581 |
+
|
| 582 |
+
# Create word info list
|
| 583 |
+
word_list = []
|
| 584 |
+
for _, word_row in sampled.iterrows():
|
| 585 |
+
word_info = {
|
| 586 |
+
'word': word_row['Type'],
|
| 587 |
+
'freq': word_row[original_column],
|
| 588 |
+
'rank': word_row.get('Rank', 'N/A'),
|
| 589 |
+
'original_freq': word_row['Freq']
|
| 590 |
+
}
|
| 591 |
+
word_list.append(word_info)
|
| 592 |
+
|
| 593 |
+
sampled_words[i] = word_list
|
| 594 |
+
else:
|
| 595 |
+
sampled_words[i] = []
|
| 596 |
+
|
| 597 |
+
return sampled_words
|
| 598 |
+
|
| 599 |
+
def create_rank_based_visualization(self, column: str = 'Freq', bin_size: int = 500, log_transform: bool = False) -> Dict:
|
| 600 |
+
"""Legacy rank-based visualization method for backward compatibility."""
|
| 601 |
+
return self.create_rank_based_visualization_flexible(column, bin_size, log_transform)
|
| 602 |
+
|
| 603 |
+
def calculate_statistics(self, column: str = 'Freq') -> Dict:
|
| 604 |
+
"""Calculate descriptive statistics for the frequency data."""
|
| 605 |
+
if self.data is None:
|
| 606 |
+
raise ValueError("No data loaded")
|
| 607 |
+
|
| 608 |
+
if column not in self.data.columns:
|
| 609 |
+
raise ValueError(f"Column '{column}' not found in data")
|
| 610 |
+
|
| 611 |
+
freq_values = self.data[column]
|
| 612 |
+
|
| 613 |
+
stats = {
|
| 614 |
+
'count': len(freq_values),
|
| 615 |
+
'mean': float(freq_values.mean()),
|
| 616 |
+
'median': float(freq_values.median()),
|
| 617 |
+
'std': float(freq_values.std()),
|
| 618 |
+
'min': float(freq_values.min()),
|
| 619 |
+
'max': float(freq_values.max()),
|
| 620 |
+
'q25': float(freq_values.quantile(0.25)),
|
| 621 |
+
'q75': float(freq_values.quantile(0.75)),
|
| 622 |
+
'skewness': float(freq_values.skew()),
|
| 623 |
+
'column_name': column
|
| 624 |
+
}
|
| 625 |
+
|
| 626 |
+
# Add some additional insights
|
| 627 |
+
stats['range'] = stats['max'] - stats['min']
|
| 628 |
+
stats['iqr'] = stats['q75'] - stats['q25']
|
| 629 |
+
stats['cv'] = stats['std'] / stats['mean'] if stats['mean'] != 0 else 0
|
| 630 |
+
|
| 631 |
+
return stats
|
| 632 |
+
|
| 633 |
+
def get_top_words(self, column: str = 'Freq', n: int = 10) -> List[Dict]:
|
| 634 |
+
"""Get the top N words by frequency."""
|
| 635 |
+
if self.data is None:
|
| 636 |
+
raise ValueError("No data loaded")
|
| 637 |
+
|
| 638 |
+
if column not in self.data.columns:
|
| 639 |
+
raise ValueError(f"Column '{column}' not found in data")
|
| 640 |
+
|
| 641 |
+
top_words = self.data.nlargest(n, column)
|
| 642 |
+
|
| 643 |
+
result = []
|
| 644 |
+
for _, row in top_words.iterrows():
|
| 645 |
+
word_info = {
|
| 646 |
+
'word': row['Type'],
|
| 647 |
+
'freq': row[column],
|
| 648 |
+
'rank': row.get('Rank', 'N/A'),
|
| 649 |
+
'original_freq': row['Freq']
|
| 650 |
+
}
|
| 651 |
+
result.append(word_info)
|
| 652 |
+
|
| 653 |
+
return result
|
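A minimal end-to-end sketch of the flexible column-mapping workflow defined above (the TSV content, words, and frequencies are made up for illustration; only methods shown in this file are used):

    from text_analyzer.frequency_analyzer import FrequencyAnalyzer

    tsv = "lForm\tfrequency\tpos\n猫\t1200\t名詞\n走る\t800\t動詞\n歩く\t650\t動詞\n"
    fa = FrequencyAnalyzer()
    fa.load_frequency_data(tsv, {'word_column': 'lForm',
                                 'frequency_column': 'frequency',
                                 'pos_column': 'pos'})
    viz = fa.create_rank_based_visualization_flexible(column='frequency', bin_size=2)
    print(viz['group_labels'])                                   # e.g. ['1-2', '3-3']
    print(fa.calculate_statistics(column='frequency')['mean'])   # mean of the loaded frequencies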
text_analyzer/lexical_sophistication.py
CHANGED
|
@@ -13,50 +13,30 @@ import logging
|
|
| 13 |
from collections import defaultdict
|
| 14 |
import re
|
| 15 |
|
|
|
|
|
|
|
|
|
|
| 16 |
# Configure logging
|
| 17 |
logging.basicConfig(level=logging.INFO)
|
| 18 |
logger = logging.getLogger(__name__)
|
| 19 |
|
| 20 |
|
| 21 |
-
class LexicalSophisticationAnalyzer:
|
| 22 |
"""
|
| 23 |
Main class for lexical sophistication analysis.
|
| 24 |
Handles tokenization, n-gram generation, and score calculation.
|
| 25 |
"""
|
| 26 |
|
| 27 |
-
def __init__(self, language: str =
|
| 28 |
"""
|
| 29 |
Initialize analyzer with specified language and model.
|
| 30 |
|
| 31 |
Args:
|
| 32 |
language (str): Language code ('en' for English, 'ja' for Japanese)
|
| 33 |
-
model_size (str): SpaCy model size ('
|
| 34 |
"""
|
| 35 |
-
|
| 36 |
-
self.model_size = model_size
|
| 37 |
-
self.nlp = None
|
| 38 |
self.reference_lists = {}
|
| 39 |
-
self._load_spacy_model()
|
| 40 |
-
|
| 41 |
-
def _load_spacy_model(self):
|
| 42 |
-
"""Load appropriate SpaCy model based on language and size."""
|
| 43 |
-
model_map = {
|
| 44 |
-
("en", "md"): "en_core_web_md",
|
| 45 |
-
("en", "trf"): "en_core_web_trf",
|
| 46 |
-
("ja", "md"): "ja_core_news_md",
|
| 47 |
-
("ja", "trf"): "ja_core_news_trf"
|
| 48 |
-
}
|
| 49 |
-
|
| 50 |
-
model_name = model_map.get((self.language, self.model_size))
|
| 51 |
-
if not model_name:
|
| 52 |
-
raise ValueError(f"Unsupported language/model combination: {self.language}/{self.model_size}")
|
| 53 |
-
|
| 54 |
-
try:
|
| 55 |
-
self.nlp = spacy.load(model_name)
|
| 56 |
-
logger.info(f"Loaded SpaCy model: {model_name}")
|
| 57 |
-
except OSError:
|
| 58 |
-
logger.error(f"SpaCy model {model_name} not found. Please install it first.")
|
| 59 |
-
raise
|
| 60 |
|
| 61 |
def load_reference_lists(self, reference_files: Dict[str, Dict[str, Union[str, dict]]]):
|
| 62 |
"""
|
|
@@ -235,26 +215,6 @@ class LexicalSophisticationAnalyzer:
|
|
| 235 |
logger.error(f"Error parsing custom config: {e}")
|
| 236 |
return {}
|
| 237 |
|
| 238 |
-
def _classify_pos(self, token) -> str:
|
| 239 |
-
"""
|
| 240 |
-
Classify token as content word (CW) or function word (FW).
|
| 241 |
-
|
| 242 |
-
Args:
|
| 243 |
-
token: SpaCy token object
|
| 244 |
-
|
| 245 |
-
Returns:
|
| 246 |
-
str: 'CW' for content words, 'FW' for function words
|
| 247 |
-
"""
|
| 248 |
-
content_pos = {'NOUN', 'VERB', 'ADJ', 'ADV'}
|
| 249 |
-
function_pos = {'DET', 'PRON', 'ADP', 'CONJ', 'CCONJ', 'SCONJ'}
|
| 250 |
-
|
| 251 |
-
if token.pos_ in content_pos:
|
| 252 |
-
return 'CW'
|
| 253 |
-
elif token.pos_ in function_pos:
|
| 254 |
-
return 'FW'
|
| 255 |
-
else:
|
| 256 |
-
# Default classification for ambiguous cases
|
| 257 |
-
return 'CW' if token.pos_ not in {'PUNCT', 'SPACE', 'X'} else 'FW'
|
| 258 |
|
| 259 |
def _generate_ngrams(self, tokens: List, n: int, sep: str = " ") -> List[str]:
|
| 260 |
"""
|
|
@@ -296,7 +256,7 @@ class LexicalSophisticationAnalyzer:
|
|
| 296 |
measure_col: Optional[str] = None) -> Optional[float]:
|
| 297 |
"""
|
| 298 |
Look up score for a word in reference lists.
|
| 299 |
-
|
| 300 |
Args:
|
| 301 |
word: Word to look up
|
| 302 |
index_name: Name of the reference index
|
|
@@ -314,6 +274,12 @@ class LexicalSophisticationAnalyzer:
|
|
| 314 |
return None
|
| 315 |
|
| 316 |
if file_type in ['token', 'lemma']:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
# Simple dictionary lookup for unigrams
|
| 318 |
return ref_data.get(word.lower())
|
| 319 |
else:
|
|
@@ -344,6 +310,169 @@ class LexicalSophisticationAnalyzer:
|
|
| 344 |
except (ValueError, TypeError):
|
| 345 |
return None
|
| 346 |
return None
|
| 347 |
|
| 348 |
def analyze_text(self, text: str, selected_indices: List[str],
|
| 349 |
apply_log: bool = False, word_type_filter: Optional[str] = None) -> Dict:
|
|
@@ -359,12 +488,9 @@ class LexicalSophisticationAnalyzer:
|
|
| 359 |
Returns:
|
| 360 |
Dictionary containing analysis results
|
| 361 |
"""
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
# Process text
|
| 366 |
-
doc = self.nlp(text)
|
| 367 |
-
tokens = [token for token in doc if not token.is_punct and not token.is_space]
|
| 368 |
|
| 369 |
# Generate n-grams
|
| 370 |
bigrams = self._generate_ngrams(tokens, 2)
|
|
@@ -382,7 +508,9 @@ class LexicalSophisticationAnalyzer:
|
|
| 382 |
'content_words': len([t for t in tokens if self._classify_pos(t) == 'CW']),
|
| 383 |
'function_words': len([t for t in tokens if self._classify_pos(t) == 'FW'])
|
| 384 |
},
|
| 385 |
-
'raw_scores': {} #
|
|
|
|
|
|
|
| 386 |
}
|
| 387 |
|
| 388 |
# Initialize score collections
|
|
@@ -396,23 +524,78 @@ class LexicalSophisticationAnalyzer:
|
|
| 396 |
if word_type_filter and word_type != word_type_filter:
|
| 397 |
continue
|
| 398 |
|
|
|
|
| 399 |
token_detail = {
|
| 400 |
'id': i + 1,
|
| 401 |
'token': token.text,
|
| 402 |
'lemma': token.lemma_,
|
| 403 |
'pos': token.pos_,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
'word_type': word_type
|
| 405 |
}
|
| 406 |
|
| 407 |
# Look up scores for each selected index
|
| 408 |
for index_name in selected_indices:
|
| 409 |
-
#
|
| 410 |
-
|
| 411 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
|
| 417 |
# Collect for summary statistics
|
| 418 |
if token_score is not None:
|
|
@@ -477,7 +660,7 @@ class LexicalSophisticationAnalyzer:
|
|
| 477 |
score_val = np.log10(score) if apply_log and score > 0 else score
|
| 478 |
ngram_detail[f"{index_name}_{measure}"] = score_val
|
| 479 |
else:
|
| 480 |
-
ngram_detail[f"{index_name}_{measure}"] =
|
| 481 |
|
| 482 |
results[ngram_details_key].append(ngram_detail)
|
| 483 |
|
|
|
|
| 13 |
from collections import defaultdict
|
| 14 |
import re
|
| 15 |
|
| 16 |
+
from .base_analyzer import BaseAnalyzer
|
| 17 |
+
from .app_config import AppConfig
|
| 18 |
+
|
| 19 |
# Configure logging
|
| 20 |
logging.basicConfig(level=logging.INFO)
|
| 21 |
logger = logging.getLogger(__name__)
|
| 22 |
|
| 23 |
|
| 24 |
+
class LexicalSophisticationAnalyzer(BaseAnalyzer):
|
| 25 |
"""
|
| 26 |
Main class for lexical sophistication analysis.
|
| 27 |
Handles tokenization, n-gram generation, and score calculation.
|
| 28 |
"""
|
| 29 |
|
| 30 |
+
def __init__(self, language: str = None, model_size: str = None):
|
| 31 |
"""
|
| 32 |
Initialize analyzer with specified language and model.
|
| 33 |
|
| 34 |
Args:
|
| 35 |
language (str): Language code ('en' for English, 'ja' for Japanese)
|
| 36 |
+
model_size (str): SpaCy model size ('md' or 'trf')
|
| 37 |
"""
|
| 38 |
+
super().__init__(language, model_size)
|
|
|
|
|
|
|
| 39 |
self.reference_lists = {}
|
| 40 |
|
| 41 |
def load_reference_lists(self, reference_files: Dict[str, Dict[str, Union[str, dict]]]):
|
| 42 |
"""
|
|
|
|
| 215 |
logger.error(f"Error parsing custom config: {e}")
|
| 216 |
return {}
|
| 217 |
|
| 218 |
|
| 219 |
def _generate_ngrams(self, tokens: List, n: int, sep: str = " ") -> List[str]:
|
| 220 |
"""
|
|
|
|
| 256 |
measure_col: Optional[str] = None) -> Optional[float]:
|
| 257 |
"""
|
| 258 |
Look up score for a word in reference lists.
|
| 259 |
+
|
| 260 |
Args:
|
| 261 |
word: Word to look up
|
| 262 |
index_name: Name of the reference index
|
|
|
|
| 274 |
return None
|
| 275 |
|
| 276 |
if file_type in ['token', 'lemma']:
|
| 277 |
+
# Check if this is Japanese corpus data
|
| 278 |
+
if isinstance(ref_data, dict) and ref_data.get('is_japanese_corpus', False):
|
| 279 |
+
# This should not be called directly for Japanese data
|
| 280 |
+
# Use _lookup_japanese_score instead
|
| 281 |
+
return None
|
| 282 |
+
|
| 283 |
# Simple dictionary lookup for unigrams
|
| 284 |
return ref_data.get(word.lower())
|
| 285 |
else:
|
|
|
|
| 310 |
except (ValueError, TypeError):
|
| 311 |
return None
|
| 312 |
return None
|
| 313 |
+
|
| 314 |
+
def _lookup_with_unidic_fallback(self, token, index_name: str, file_type: str) -> Dict:
|
| 315 |
+
"""
|
| 316 |
+
Enhanced Japanese lookup with UniDic 3-level fallback using corpus-compatible keys.
|
| 317 |
+
|
| 318 |
+
Args:
|
| 319 |
+
token: SpaCy token object with UniDic extensions
|
| 320 |
+
index_name: Name of the reference index
|
| 321 |
+
file_type: Type of reference file ('token', 'lemma')
|
| 322 |
+
|
| 323 |
+
Returns:
|
| 324 |
+
Dictionary with score, method, key, and diagnostic information
|
| 325 |
+
"""
|
| 326 |
+
# Initialize diagnostic tracking
|
| 327 |
+
attempted_keys = []
|
| 328 |
+
diagnostic_info = {
|
| 329 |
+
'attempted_keys': attempted_keys,
|
| 330 |
+
'unidic_features': {},
|
| 331 |
+
'alignment_confidence': getattr(token._, 'alignment_confidence', 0.0),
|
| 332 |
+
'spacy_fallback_used': False,
|
| 333 |
+
'no_match': False
|
| 334 |
+
}
|
| 335 |
+
|
| 336 |
+
# Get UniDic features from token extensions
|
| 337 |
+
unidic_features = {
|
| 338 |
+
'lemma': getattr(token._, 'unidic_lemma', '') or '',
|
| 339 |
+
'lForm': getattr(token._, 'unidic_lform', '') or '',
|
| 340 |
+
'pos1': getattr(token._, 'unidic_pos1', '') or '',
|
| 341 |
+
'pos2': getattr(token._, 'unidic_pos2', '') or '',
|
| 342 |
+
'pos3': getattr(token._, 'unidic_pos3', '') or '',
|
| 343 |
+
'goshu': getattr(token._, 'unidic_goshu', '') or ''
|
| 344 |
+
}
|
| 345 |
+
diagnostic_info['unidic_features'] = unidic_features
|
| 346 |
+
|
| 347 |
+
# Only proceed with UniDic matching if we have good alignment and features
|
| 348 |
+
if diagnostic_info['alignment_confidence'] > 0.5 and any(unidic_features.values()):
|
| 349 |
+
|
| 350 |
+
# Try corpus-compatible keys using the hierarchical lookup dictionaries
|
| 351 |
+
# Level 1: {lemma}_{lForm}_{pos1}_{pos2}_{pos3} (when pos3 exists)
|
| 352 |
+
if all([unidic_features['lemma'], unidic_features['lForm'],
|
| 353 |
+
unidic_features['pos1'], unidic_features['pos2'], unidic_features['pos3']]):
|
| 354 |
+
level1_key = f"{unidic_features['lemma']}_{unidic_features['lForm']}_{unidic_features['pos1']}_{unidic_features['pos2']}_{unidic_features['pos3']}"
|
| 355 |
+
attempted_keys.append(level1_key)
|
| 356 |
+
score = self._lookup_japanese_corpus_level(level1_key, index_name, file_type, 'level1_dict')
|
| 357 |
+
if score is not None:
|
| 358 |
+
return {
|
| 359 |
+
'score': score,
|
| 360 |
+
'match_method': 'unidic_corpus_level_1',
|
| 361 |
+
'match_key': level1_key,
|
| 362 |
+
'diagnostic_info': diagnostic_info
|
| 363 |
+
}
|
| 364 |
+
|
| 365 |
+
# Level 2: {lemma}_{lForm}_{pos1}_{pos2}
|
| 366 |
+
if all([unidic_features['lemma'], unidic_features['lForm'],
|
| 367 |
+
unidic_features['pos1'], unidic_features['pos2']]):
|
| 368 |
+
level2_key = f"{unidic_features['lemma']}_{unidic_features['lForm']}_{unidic_features['pos1']}_{unidic_features['pos2']}"
|
| 369 |
+
attempted_keys.append(level2_key)
|
| 370 |
+
score = self._lookup_japanese_corpus_level(level2_key, index_name, file_type, 'level2_dict')
|
| 371 |
+
if score is not None:
|
| 372 |
+
return {
|
| 373 |
+
'score': score,
|
| 374 |
+
'match_method': 'unidic_corpus_level_2',
|
| 375 |
+
+                    'match_key': level2_key,
+                    'diagnostic_info': diagnostic_info
+                }
+
+        # Level 3: {lemma}_{lForm}_{pos1}
+        if all([unidic_features['lemma'], unidic_features['lForm'], unidic_features['pos1']]):
+            level3_key = f"{unidic_features['lemma']}_{unidic_features['lForm']}_{unidic_features['pos1']}"
+            attempted_keys.append(level3_key)
+            score = self._lookup_japanese_corpus_level(level3_key, index_name, file_type, 'level3_dict')
+            if score is not None:
+                return {
+                    'score': score,
+                    'match_method': 'unidic_corpus_level_3',
+                    'match_key': level3_key,
+                    'diagnostic_info': diagnostic_info
+                }
+
+        # Fallback to legacy spaCy-based matching
+        diagnostic_info['spacy_fallback_used'] = True
+        legacy_score = self._lookup_japanese_score(token, index_name, file_type, fallback=True)
+        if legacy_score is not None:
+            legacy_key = f"{token.lemma_}_{token.tag_}"
+            attempted_keys.append(f"legacy: {legacy_key}")
+            return {
+                'score': legacy_score,
+                'match_method': 'legacy_spacy',
+                'match_key': legacy_key,
+                'diagnostic_info': diagnostic_info
+            }
+
+        # No match found
+        diagnostic_info['no_match'] = True
+        return {
+            'score': None,
+            'match_method': 'none',
+            'match_key': None,
+            'diagnostic_info': diagnostic_info
+        }
+
+    def _lookup_japanese_corpus_level(self, key: str, index_name: str, file_type: str, level_dict_name: str) -> Optional[float]:
+        """
+        Look up score in a specific level dictionary of Japanese corpus data.
+
+        Args:
+            key: Composite key to look up
+            index_name: Name of the reference index
+            file_type: Type of reference file ('token', 'lemma')
+            level_dict_name: Name of the level dictionary ('level1_dict', 'level2_dict', 'level3_dict')
+
+        Returns:
+            Score if found, None otherwise
+        """
+        if index_name not in self.reference_lists:
+            return None
+
+        ref_data = self.reference_lists[index_name].get(file_type)
+        if ref_data is None or not isinstance(ref_data, dict):
+            return None
+
+        if not ref_data.get('is_japanese_corpus', False):
+            return None
+
+        level_dict = ref_data.get(level_dict_name, {})
+        return level_dict.get(key)
+
+    def _lookup_japanese_score(self, token, index_name: str, file_type: str, fallback: bool = False) -> Optional[float]:
+        """
+        Look up score for a Japanese word using composite key approach.
+
+        Args:
+            token: SpaCy token object
+            index_name: Name of the reference index
+            file_type: Type of reference file ('token', 'lemma')
+            fallback: Whether to use fallback search strategies
+
+        Returns:
+            Score if found, None otherwise
+        """
+        if index_name not in self.reference_lists:
+            return None
+
+        ref_data = self.reference_lists[index_name].get(file_type)
+        if ref_data is None or not isinstance(ref_data, dict):
+            return None
+
+        if not ref_data.get('is_japanese_corpus', False):
+            return None
+
+        # Try composite key first (lemma_pos)
+        composite_key = f"{token.lemma_}_{token.tag_}"
+        score = ref_data.get('composite_dict', {}).get(composite_key)
+
+        if score is None and fallback:
+            # Fallback to lemma only
+            score = ref_data.get('lemma_dict', {}).get(token.lemma_.lower())
+
+        if score is None and fallback:
+            # Final fallback to surface form
+            score = ref_data.get('surface_dict', {}).get(token.text.lower())
+
+        return score
 
     def analyze_text(self, text: str, selected_indices: List[str],
                      apply_log: bool = False, word_type_filter: Optional[str] = None) -> Dict:
 
         Returns:
             Dictionary containing analysis results
         """
+        # Process text using base class
+        doc = self.process_document(text)
+        tokens = self.filter_tokens(doc, exclude_punct=True, exclude_space=True)
 
         # Generate n-grams
         bigrams = self._generate_ngrams(tokens, 2)
 
                 'content_words': len([t for t in tokens if self._classify_pos(t) == 'CW']),
                 'function_words': len([t for t in tokens if self._classify_pos(t) == 'FW'])
             },
+            'raw_scores': {},  # Raw scores for plotting
+            'tokens': tokens,  # Raw spaCy tokens for advanced analysis
+            'doc': doc  # Full spaCy doc for complex operations
         }
 
         # Initialize score collections
 
             if word_type_filter and word_type != word_type_filter:
                 continue
 
+            # Work directly with spaCy token - include syntactic information
             token_detail = {
                 'id': i + 1,
                 'token': token.text,
                 'lemma': token.lemma_,
                 'pos': token.pos_,
+                'tag': token.tag_,
+                'dep_': token.dep_,  # Add dependency relation
+                'head_text': token.head.text,  # Add head word
+                'head_pos': token.head.pos_,  # Add head POS
                 'word_type': word_type
             }
 
             # Look up scores for each selected index
             for index_name in selected_indices:
+                # Check if this is a Japanese corpus reference list
+                ref_data = self.reference_lists.get(index_name, {})
+                is_japanese_corpus = False
+                for file_type in ['token', 'lemma']:
+                    data = ref_data.get(file_type, {})
+                    if isinstance(data, dict) and data.get('is_japanese_corpus', False):
+                        is_japanese_corpus = True
+                        break
 
+                if is_japanese_corpus and self.language == 'ja':
+                    # Use enhanced UniDic lookup with 3-level fallback and diagnostics
+                    token_result = self._lookup_with_unidic_fallback(token, index_name, 'token')
+                    lemma_result = self._lookup_with_unidic_fallback(token, index_name, 'lemma')
+
+                    # Extract scores and diagnostic information
+                    token_score = token_result['score']
+                    lemma_score = lemma_result['score']
+
+                    # Store enhanced details with diagnostic information
+                    token_detail[f"{index_name}_token"] = token_score if token_score is not None else None
+                    token_detail[f"{index_name}_lemma"] = lemma_score if lemma_score is not None else None
+
+                    # Add diagnostic information for debugging
+                    token_detail[f"{index_name}_token_match_method"] = token_result['match_method']
+                    token_detail[f"{index_name}_lemma_match_method"] = lemma_result['match_method']
+                    token_detail[f"{index_name}_token_match_key"] = token_result['match_key'] or None
+                    token_detail[f"{index_name}_lemma_match_key"] = lemma_result['match_key'] or None
+
+                    # Store UniDic features for display
+                    if hasattr(token, '_') and hasattr(token._, 'unidic_lemma'):
+                        token_detail['unidic_features'] = {
+                            'lemma': getattr(token._, 'unidic_lemma', ''),
+                            'lForm': getattr(token._, 'unidic_lform', ''),
+                            'pos1': getattr(token._, 'unidic_pos1', ''),
+                            'pos2': getattr(token._, 'unidic_pos2', ''),
+                            'goshu': getattr(token._, 'unidic_goshu', ''),
+                            'alignment_confidence': getattr(token._, 'alignment_confidence', 0.0)
+                        }
+
+                elif is_japanese_corpus:
+                    # Fallback to legacy Japanese lookup if UniDic not available
+                    token_score = self._lookup_japanese_score(token, index_name, 'token', fallback=True)
+                    lemma_score = self._lookup_japanese_score(token, index_name, 'lemma', fallback=True)
+
+                    # Store scores
+                    token_detail[f"{index_name}_token"] = token_score if token_score is not None else None
+                    token_detail[f"{index_name}_lemma"] = lemma_score if lemma_score is not None else None
+                    token_detail[f"{index_name}_token_match_method"] = "legacy_spacy"
+                    token_detail[f"{index_name}_lemma_match_method"] = "legacy_spacy"
+                else:
+                    # Standard lookup for non-Japanese data
+                    token_score = self._lookup_score(token.text, index_name, 'token')
+                    lemma_score = self._lookup_score(token.lemma_, index_name, 'lemma')
+
+                    # Store scores
+                    token_detail[f"{index_name}_token"] = token_score if token_score is not None else None
+                    token_detail[f"{index_name}_lemma"] = lemma_score if lemma_score is not None else None
 
                 # Collect for summary statistics
                 if token_score is not None:
 
                         score_val = np.log10(score) if apply_log and score > 0 else score
                         ngram_detail[f"{index_name}_{measure}"] = score_val
                     else:
+                        ngram_detail[f"{index_name}_{measure}"] = None
 
             results[ngram_details_key].append(ngram_detail)
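For orientation, the lookup chain above tries progressively coarser UniDic composite keys and only then falls back to the spaCy lemma/tag key. The sketch below is illustrative and not part of this commit: the level-3 key format ({lemma}_{lForm}_{pos1}) is taken from the code above, while the helper name and the sample dictionary contents are invented.

from typing import Dict, List, Optional, Tuple

def first_hit(candidates: List[Tuple[str, str]], corpus: Dict[str, Dict[str, float]]) -> Optional[Dict]:
    """Return the first (level, key, score) match, mirroring the cascade above."""
    for level_name, key in candidates:
        score = corpus.get(level_name, {}).get(key)
        if score is not None:
            return {'score': score, 'match_method': level_name, 'match_key': key}
    return None

# Hypothetical corpus data, keyed the way level3_dict is keyed above: lemma_lForm_pos1.
corpus = {'level3_dict': {'食べる_タベル_動詞': 3.72}}
print(first_hit([('level3_dict', '食べる_タベル_動詞')], corpus))
# -> {'score': 3.72, 'match_method': 'level3_dict', 'match_key': '食べる_タベル_動詞'}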
text_analyzer/pos_parser.py
CHANGED
@@ -13,15 +13,18 @@ import base64
 from io import BytesIO
 import zipfile
 
+from .base_analyzer import BaseAnalyzer
+
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
-class POSParser:
+class POSParser(BaseAnalyzer):
     """
     Main class for POS tagging and dependency parsing.
     Handles multilingual analysis and visualization.
+    Inherits from BaseAnalyzer for consistent SpaCy model management.
     """
 
     def __init__(self, language: str = "en", model_size: str = "trf"):
@@ -30,32 +33,9 @@ class POSParser:
 
         Args:
             language (str): Language code ('en' for English, 'ja' for Japanese)
-            model_size (str): SpaCy model size ('trf' or '
+            model_size (str): SpaCy model size ('trf' or 'md')
         """
-
-        self.model_size = model_size
-        self.nlp = None
-        self._load_spacy_model()
-
-    def _load_spacy_model(self):
-        """Load appropriate SpaCy model based on language and size."""
-        model_map = {
-            ("en", "md"): "en_core_web_md",
-            ("en", "trf"): "en_core_web_trf",
-            ("ja", "md"): "ja_core_news_md",
-            ("ja", "trf"): "ja_core_news_trf"
-        }
-
-        model_name = model_map.get((self.language, self.model_size))
-        if not model_name:
-            raise ValueError(f"Unsupported language/model combination: {self.language}/{self.model_size}")
-
-        try:
-            self.nlp = spacy.load(model_name)
-            logger.info(f"Loaded SpaCy model: {model_name}")
-        except OSError:
-            logger.error(f"SpaCy model {model_name} not found. Please install it first.")
-            raise
+        super().__init__(language, model_size)
 
     def analyze_text(self, text: str) -> Dict:
         """
@@ -67,11 +47,8 @@ class POSParser:
         Returns:
             Dictionary containing analysis results
         """
-
-
-
-        # Process text
-        doc = self.nlp(text)
+        # Process text using base class method
+        doc = self.process_document(text)
 
         # Extract token information
         token_data = []
@@ -131,10 +108,8 @@ class POSParser:
         Returns:
             List of HTML strings, one per sentence
         """
-
-
-
-        doc = self.nlp(text)
+        # Process text using base class method
+        doc = self.process_document(text)
         html_outputs = []
 
         for sent in doc.sents:
@@ -235,4 +210,4 @@ class POSParser:
                 zip_file.write(file_path, file_path.name)
 
         zip_buffer.seek(0)
-        return zip_buffer.getvalue()
+        return zip_buffer.getvalue()
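Usage stays the same from the caller's side after this refactor; model selection and loading simply move into BaseAnalyzer. A minimal sketch (not part of the diff), assuming the pinned ja_core_news_md model is installed:

from text_analyzer.pos_parser import POSParser

# BaseAnalyzer now resolves and loads the spaCy model from (language, model_size).
parser = POSParser(language="ja", model_size="md")
results = parser.analyze_text("猫が魚を食べた。")  # returns the analysis Dict described in the docstring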
text_analyzer/text_utility.py
ADDED
@@ -0,0 +1,289 @@
"""
Text processing utilities module.
Contains reusable functions for file handling, encoding detection, and text cleaning.
"""

import os
import tempfile
import chardet
from pathlib import Path
from typing import Union, Tuple, List, Dict, Any, Optional
import logging
import re
from .app_config import AppConfig

logger = logging.getLogger(__name__)


class TextUtility:
    """Collection of text processing and file handling utilities."""

    @staticmethod
    def detect_encoding(content: bytes) -> str:
        """
        Detect encoding of byte content.

        Args:
            content: Byte content to analyze

        Returns:
            Detected encoding string
        """
        try:
            # Try chardet for automatic detection
            result = chardet.detect(content)
            encoding = result.get('encoding', 'utf-8')

            # Validate detected encoding against supported list
            if encoding and encoding.lower() in [enc.lower() for enc in AppConfig.SUPPORTED_ENCODINGS]:
                return encoding

            # Fall back to trying supported encodings
            for enc in AppConfig.SUPPORTED_ENCODINGS:
                try:
                    content.decode(enc)
                    return enc
                except UnicodeDecodeError:
                    continue

            # Final fallback
            return 'utf-8'

        except Exception as e:
            logger.warning(f"Error detecting encoding: {e}, defaulting to utf-8")
            return 'utf-8'

    @staticmethod
    def detect_delimiter(text: str) -> str:
        """
        Detect delimiter in text content.

        Args:
            text: Text content to analyze

        Returns:
            Detected delimiter
        """
        # Count occurrences of each supported delimiter
        delimiter_counts = {}
        for delimiter in AppConfig.SUPPORTED_DELIMITERS:
            delimiter_counts[delimiter] = text.count(delimiter)

        # Return the most frequent delimiter, or tab as default
        if delimiter_counts:
            return max(delimiter_counts, key=delimiter_counts.get)
        return '\t'

    @staticmethod
    def clean_text_input(text: str) -> str:
        """
        Clean text input by normalizing whitespace and removing problematic characters.

        Args:
            text: Raw text input

        Returns:
            Cleaned text
        """
        if not text:
            return ""

        # Normalize whitespace
        text = TextUtility.normalize_whitespace(text)

        # Remove or replace problematic characters
        # Remove null bytes
        text = text.replace('\x00', '')

        # Normalize unicode
        text = text.encode('utf-8', errors='ignore').decode('utf-8')

        return text.strip()

    @staticmethod
    def normalize_whitespace(text: str) -> str:
        """
        Normalize whitespace in text.

        Args:
            text: Text to normalize

        Returns:
            Text with normalized whitespace
        """
        if not text:
            return ""

        # Replace multiple whitespace with single space
        text = re.sub(r'\s+', ' ', text)

        # Remove leading/trailing whitespace from each line
        lines = text.split('\n')
        lines = [line.strip() for line in lines]

        # Remove empty lines at beginning and end
        while lines and not lines[0]:
            lines.pop(0)
        while lines and not lines[-1]:
            lines.pop()

        return '\n'.join(lines)

    @staticmethod
    def validate_text_length(text: str, max_length: int = None) -> bool:
        """
        Validate text length against limits.

        Args:
            text: Text to validate
            max_length: Maximum allowed length (optional)

        Returns:
            True if text length is valid
        """
        if not text:
            return False

        if max_length and len(text) > max_length:
            return False

        return True

    @staticmethod
    def extract_text_from_file(file_path: str) -> str:
        """
        Extract text content from a file with encoding detection.

        Args:
            file_path: Path to the file

        Returns:
            Extracted text content
        """
        try:
            # Read as bytes first for encoding detection
            with open(file_path, 'rb') as f:
                content = f.read()

            # Detect encoding
            encoding = TextUtility.detect_encoding(content)

            # Decode with detected encoding
            text = content.decode(encoding)

            # Clean the text
            return TextUtility.clean_text_input(text)

        except Exception as e:
            logger.error(f"Error extracting text from {file_path}: {e}")
            raise ValueError(f"Failed to extract text from file: {e}")

    @staticmethod
    def prepare_batch_files(file_paths: List[str]) -> List[Tuple[str, str]]:
        """
        Prepare batch files for processing by extracting text content.

        Args:
            file_paths: List of file paths

        Returns:
            List of tuples (file_path, text_content)
        """
        prepared_files = []

        for file_path in file_paths:
            try:
                text_content = TextUtility.extract_text_from_file(file_path)
                prepared_files.append((file_path, text_content))
            except Exception as e:
                logger.error(f"Error preparing file {file_path}: {e}")
                # Add error entry
                prepared_files.append((file_path, f"ERROR: {e}"))

        return prepared_files

    @staticmethod
    def sanitize_filename(filename: str) -> str:
        """
        Sanitize filename by removing problematic characters.

        Args:
            filename: Original filename

        Returns:
            Sanitized filename
        """
        # Remove or replace problematic characters
        filename = re.sub(r'[<>:"/\\|?*]', '_', filename)

        # Remove control characters
        filename = ''.join(char for char in filename if ord(char) >= 32)

        # Limit length
        if len(filename) > 255:
            name, ext = os.path.splitext(filename)
            filename = name[:255-len(ext)] + ext

        return filename or "unnamed_file"

    @staticmethod
    def create_safe_temp_file(content: str, suffix: str = '.txt') -> str:
        """
        Create a temporary file with given content safely.

        Args:
            content: Content to write to file
            suffix: File suffix

        Returns:
            Path to created temporary file
        """
        try:
            with tempfile.NamedTemporaryFile(mode='w', suffix=suffix, delete=False, encoding='utf-8') as f:
                f.write(content)
                return f.name
        except Exception as e:
            logger.error(f"Error creating temporary file: {e}")
            raise ValueError(f"Failed to create temporary file: {e}")

    @staticmethod
    def load_corpus_config(corpus_name: str) -> Dict[str, Any]:
        """
        Load specific corpus configuration from reference_lists.yaml

        Args:
            corpus_name: Name of the corpus

        Returns:
            Corpus configuration dictionary
        """
        return AppConfig.get_corpus_configuration(corpus_name)

    @staticmethod
    def get_column_mapping(config: Dict, corpus_type: str = 'columns') -> Dict[str, int]:
        """
        Extract column mappings from corpus configuration

        Args:
            config: Corpus configuration dictionary
            corpus_type: Type of mapping to extract

        Returns:
            Dictionary mapping column names to indices
        """
        return config.get(corpus_type, {})

    @staticmethod
    def cleanup_temp_files(file_paths: List[str]) -> None:
        """
        Clean up temporary files safely.

        Args:
            file_paths: List of temporary file paths to clean up
        """
        for file_path in file_paths:
            try:
                if os.path.exists(file_path):
                    os.unlink(file_path)
            except Exception as e:
                logger.warning(f"Error cleaning up temporary file {file_path}: {e}")
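A short usage sketch for TextUtility (not part of the diff); it assumes AppConfig.SUPPORTED_ENCODINGS includes 'utf-8', and the sample bytes are invented:

from text_analyzer.text_utility import TextUtility

raw = "彼は 本を   読んだ。".encode("utf-8")
enc = TextUtility.detect_encoding(raw)                # expected to report 'utf-8' here
text = TextUtility.clean_text_input(raw.decode(enc))  # runs of whitespace collapsed to single spaces
print(enc, text)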
text_analyzer/unidic_enricher.py
ADDED
@@ -0,0 +1,256 @@
"""
UniDic morphological enricher for Japanese text analysis.
Provides fugashi/UniDic integration with character offset alignment.
"""

import fugashi
from unidic import DICDIR
from typing import List, Dict, Optional, Tuple
import logging

logger = logging.getLogger(__name__)


class UniDicEnricher:
    """
    Enriches spaCy tokens with UniDic morphological features using fugashi.
    Handles character offset alignment and provides comprehensive feature extraction.
    """

    def __init__(self, unidic_path: Optional[str] = None):
        """
        Initialize with full UniDic dictionary.

        Args:
            unidic_path: Path to UniDic dictionary. Uses default if None.
        """
        if unidic_path is None:
            unidic_path = DICDIR

        try:
            # Initialize tagger with full UniDic
            self.tagger = fugashi.Tagger(f'-Owakati -d {unidic_path}')
            logger.info(f"UniDicEnricher initialized with dictionary: {unidic_path}")
        except Exception as e:
            logger.error(f"Failed to initialize UniDic tagger: {e}")
            raise

    def extract_full_features(self, word_node) -> Dict[str, str]:
        """
        Extract complete UniDic features using proper fugashi API.

        Args:
            word_node: Fugashi word node object

        Returns:
            Dictionary of UniDic morphological features
        """
        try:
            features = {
                'surface': word_node.surface,
                'pos1': word_node.feature.pos1,          # 品詞大分類
                'pos2': word_node.feature.pos2,          # 品詞中分類
                'pos3': word_node.feature.pos3,          # 品詞小分類
                'pos4': word_node.feature.pos4,          # 品詞細分類
                'cType': word_node.feature.cType,        # 活用型
                'cForm': word_node.feature.cForm,        # 活用形
                'lemma': word_node.feature.lemma,        # 基本形
                'lForm': word_node.feature.lForm,        # 読み
                'orth': word_node.feature.orth,          # 表記
                'orthBase': word_node.feature.orthBase,  # 表記基本形
                'goshu': word_node.feature.goshu,        # 語種 (和/漢/外/混)
            }

            # Handle None values by converting to empty strings
            for key, value in features.items():
                if value is None:
                    features[key] = ""

            return features

        except Exception as e:
            logger.warning(f"Error extracting features from word node: {e}")
            return self._get_empty_features(word_node.surface if hasattr(word_node, 'surface') else "")

    def _get_empty_features(self, surface: str) -> Dict[str, str]:
        """Return empty feature dictionary with surface form."""
        return {
            'surface': surface,
            'pos1': '', 'pos2': '', 'pos3': '', 'pos4': '',
            'cType': '', 'cForm': '', 'lemma': surface, 'lForm': '',
            'orth': '', 'orthBase': '', 'goshu': ''
        }

    def parse_text(self, text: str) -> List[Dict[str, any]]:
        """
        Parse text with fugashi and extract character positions.

        Args:
            text: Input text to parse

        Returns:
            List of dictionaries containing features and character positions
        """
        result = []
        char_pos = 0

        try:
            for word_node in self.tagger(text):
                surface = word_node.surface
                features = self.extract_full_features(word_node)

                # Find character position in original text
                start_pos = text.find(surface, char_pos)
                if start_pos == -1:
                    # Fallback: assume consecutive positioning
                    start_pos = char_pos

                end_pos = start_pos + len(surface)

                result.append({
                    'surface': surface,
                    'start': start_pos,
                    'end': end_pos,
                    'features': features
                })

                char_pos = end_pos

        except Exception as e:
            logger.error(f"Error parsing text with fugashi: {e}")

        return result

    def align_with_spacy_tokens(self, text: str, spacy_tokens) -> List[Dict]:
        """
        Align fugashi tokens with spaCy tokens using character offsets.

        Args:
            text: Original input text
            spacy_tokens: List of spaCy token objects

        Returns:
            List of alignment results with confidence scores
        """
        fugashi_tokens = self.parse_text(text)
        alignments = []

        for spacy_token in spacy_tokens:
            spacy_start = spacy_token.idx
            spacy_end = spacy_token.idx + len(spacy_token.text)

            best_match = None
            best_confidence = 0.0

            # Find best overlapping fugashi token
            for fugashi_token in fugashi_tokens:
                overlap = self._calculate_overlap(
                    spacy_start, spacy_end,
                    fugashi_token['start'], fugashi_token['end']
                )

                if overlap > best_confidence:
                    best_confidence = overlap
                    best_match = fugashi_token

            alignment = {
                'spacy_token': spacy_token,
                'fugashi_token': best_match,
                'confidence': best_confidence,
                'aligned': best_confidence > 0.5  # Threshold for successful alignment
            }

            alignments.append(alignment)

        return alignments

    def _calculate_overlap(self, start1: int, end1: int, start2: int, end2: int) -> float:
        """
        Calculate overlap ratio between two character ranges.

        Args:
            start1, end1: First range
            start2, end2: Second range

        Returns:
            Overlap ratio (0.0 to 1.0)
        """
        if end1 <= start2 or end2 <= start1:
            return 0.0  # No overlap

        overlap_start = max(start1, start2)
        overlap_end = min(end1, end2)
        overlap_length = overlap_end - overlap_start

        total_length = max(end1 - start1, end2 - start2)

        return overlap_length / total_length if total_length > 0 else 0.0

    def enrich_spacy_doc(self, doc, text: str):
        """
        Add UniDic features to spaCy tokens via extensions.

        Args:
            doc: spaCy document object
            text: Original input text
        """
        try:
            # Get token alignments
            alignments = self.align_with_spacy_tokens(text, doc)

            # Apply UniDic features to spaCy tokens
            for alignment in alignments:
                token = alignment['spacy_token']
                fugashi_token = alignment['fugashi_token']
                confidence = alignment['confidence']

                # Set alignment confidence
                token._.alignment_confidence = confidence

                if fugashi_token and alignment['aligned']:
                    features = fugashi_token['features']

                    # Set UniDic features on token extensions
                    token._.unidic_surface = features.get('surface', '')
                    token._.unidic_lemma = features.get('lemma', '')
                    token._.unidic_lform = features.get('lForm', '')
                    token._.unidic_pos1 = features.get('pos1', '')
                    token._.unidic_pos2 = features.get('pos2', '')
                    token._.unidic_pos3 = features.get('pos3', '')
                    token._.unidic_pos4 = features.get('pos4', '')
                    token._.unidic_goshu = features.get('goshu', '')
                    token._.unidic_orth = features.get('orth', '')
                    token._.unidic_ctype = features.get('cType', '')
                    token._.unidic_cform = features.get('cForm', '')
                    token._.unidic_orthbase = features.get('orthBase', '')

                    # Store full entry for debugging
                    token._.unidic_entries = [features]
                else:
                    # No alignment found - set empty values
                    self._set_empty_unidic_features(token)

            logger.debug(f"Enriched {len(alignments)} tokens with UniDic features")

        except Exception as e:
            logger.error(f"Error enriching spaCy doc: {e}")
            # Set empty features for all tokens on error
            for token in doc:
                self._set_empty_unidic_features(token)

    def _set_empty_unidic_features(self, token):
        """Set empty UniDic features on a token."""
        token._.unidic_surface = ""
        token._.unidic_lemma = ""
        token._.unidic_lform = ""
        token._.unidic_pos1 = ""
        token._.unidic_pos2 = ""
        token._.unidic_pos3 = ""
        token._.unidic_pos4 = ""
        token._.unidic_goshu = ""
        token._.unidic_orth = ""
        token._.unidic_ctype = ""
        token._.unidic_cform = ""
        token._.unidic_orthbase = ""
        token._.unidic_entries = []
        token._.alignment_confidence = 0.0
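The alignment confidence above is simply the character-range overlap divided by the longer of the two spans. A standalone check of that arithmetic (not part of the diff; values invented, helper name hypothetical):

def overlap_ratio(start1: int, end1: int, start2: int, end2: int) -> float:
    # Same rule as UniDicEnricher._calculate_overlap: shared characters / longer span.
    if end1 <= start2 or end2 <= start1:
        return 0.0
    shared = min(end1, end2) - max(start1, start2)
    longest = max(end1 - start1, end2 - start2)
    return shared / longest if longest > 0 else 0.0

print(overlap_ratio(0, 3, 0, 3))  # 1.0  -> identical spans, counted as aligned (> 0.5)
print(overlap_ratio(0, 3, 2, 6))  # 0.25 -> weak overlap, treated as unaligned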
text_analyzer/unidic_extensions.py
ADDED
@@ -0,0 +1,25 @@
"""
spaCy token extensions for UniDic morphological features.
This module defines custom token extensions to store UniDic analysis results.
"""

from spacy.tokens import Token

# Comprehensive UniDic feature extensions
Token.set_extension("unidic_entries", default=[], force=True)
Token.set_extension("unidic_lemma", default=None, force=True)
Token.set_extension("unidic_lform", default=None, force=True)
Token.set_extension("unidic_pos1", default=None, force=True)
Token.set_extension("unidic_pos2", default=None, force=True)
Token.set_extension("unidic_pos3", default=None, force=True)
Token.set_extension("unidic_sublemma", default=None, force=True)
Token.set_extension("unidic_goshu", default=None, force=True)
Token.set_extension("unidic_orth", default=None, force=True)
Token.set_extension("alignment_confidence", default=1.0, force=True)

# Additional extensions for diagnostic tracking
Token.set_extension("unidic_surface", default=None, force=True)
Token.set_extension("unidic_pos4", default=None, force=True)
Token.set_extension("unidic_ctype", default=None, force=True)
Token.set_extension("unidic_cform", default=None, force=True)
Token.set_extension("unidic_orthbase", default=None, force=True)
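Once this module is imported, the registered attributes are readable on any spaCy token through the underscore namespace, returning the declared defaults until UniDicEnricher fills them in. A small sketch (not part of the diff), assuming a blank Japanese pipeline can be built in the current environment:

import spacy
import text_analyzer.unidic_extensions  # noqa: F401  (importing registers the extensions)

nlp = spacy.blank("ja")
doc = nlp("犬が走る")
print(doc[0]._.unidic_lemma)           # None until UniDicEnricher.enrich_spacy_doc runs
print(doc[0]._.alignment_confidence)   # 1.0, the declared default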
uv.lock
CHANGED
|
@@ -27,6 +27,24 @@ wheels = [
|
|
| 27 |
{ url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
|
| 28 |
]
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
[[package]]
|
| 31 |
name = "attrs"
|
| 32 |
version = "25.3.0"
|
|
@@ -88,6 +106,48 @@ wheels = [
|
|
| 88 |
{ url = "https://files.pythonhosted.org/packages/4f/52/34c6cf5bb9285074dc3531c437b3919e825d976fde097a7a73f79e726d03/certifi-2025.7.14-py3-none-any.whl", hash = "sha256:6b31f564a415d79ee77df69d757bb49a5bb53bd9f756cbbe24394ffd6fc1f4b2", size = 162722, upload-time = "2025-07-14T03:29:26.863Z" },
|
| 89 |
]
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
[[package]]
|
| 92 |
name = "charset-normalizer"
|
| 93 |
version = "3.4.2"
|
|
@@ -153,6 +213,18 @@ wheels = [
|
|
| 153 |
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
|
| 154 |
]
|
| 155 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
[[package]]
|
| 157 |
name = "confection"
|
| 158 |
version = "0.1.5"
|
|
@@ -216,6 +288,32 @@ wheels = [
|
|
| 216 |
{ url = "https://files.pythonhosted.org/packages/74/65/c162fbac63e867a055240b6600b92ef96c0eb7a1895312ac53c4be93d056/cymem-2.0.11-cp313-cp313-win_amd64.whl", hash = "sha256:25da111adf425c29af0cfd9fecfec1c71c8d82e2244a85166830a0817a66ada7", size = 39090, upload-time = "2025-01-16T21:50:24.239Z" },
|
| 217 |
]
|
| 218 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
[[package]]
|
| 220 |
name = "en-core-web-md"
|
| 221 |
version = "3.7.0"
|
|
@@ -248,6 +346,15 @@ requires-dist = [
|
|
| 248 |
{ name = "spacy-curated-transformers", specifier = ">=0.2.0,<0.3.0" },
|
| 249 |
]
|
| 250 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
[[package]]
|
| 252 |
name = "filelock"
|
| 253 |
version = "3.18.0"
|
|
@@ -266,6 +373,26 @@ wheels = [
|
|
| 266 |
{ url = "https://files.pythonhosted.org/packages/2f/e0/014d5d9d7a4564cf1c40b5039bc882db69fd881111e03ab3657ac0b218e2/fsspec-2025.7.0-py3-none-any.whl", hash = "sha256:8b012e39f63c7d5f10474de957f3ab793b47b45ae7d39f2fb735f8bbe25c0e21", size = 199597, upload-time = "2025-07-15T16:05:19.529Z" },
|
| 267 |
]
|
| 268 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
[[package]]
|
| 270 |
name = "gitdb"
|
| 271 |
version = "4.0.12"
|
|
@@ -351,6 +478,63 @@ wheels = [
|
|
| 351 |
{ url = "https://files.pythonhosted.org/packages/ce/ff/3b59672c47c6284e8005b42e84ceba13864aa0f39f067c973d1af02f5d91/InquirerPy-0.3.4-py3-none-any.whl", hash = "sha256:c65fdfbac1fa00e3ee4fb10679f4d3ed7a012abf4833910e63c295827fe2a7d4", size = 67677, upload-time = "2022-06-27T23:11:17.723Z" },
|
| 352 |
]
|
| 353 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 354 |
[[package]]
|
| 355 |
name = "ja-core-news-md"
|
| 356 |
version = "3.7.0"
|
|
@@ -393,6 +577,18 @@ requires-dist = [
|
|
| 393 |
{ name = "sudachipy", specifier = ">=0.5.2,!=0.6.1" },
|
| 394 |
]
|
| 395 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 396 |
[[package]]
|
| 397 |
name = "jinja2"
|
| 398 |
version = "3.1.6"
|
|
@@ -432,6 +628,36 @@ wheels = [
|
|
| 432 |
{ url = "https://files.pythonhosted.org/packages/01/0e/b27cdbaccf30b890c40ed1da9fd4a3593a5cf94dae54fb34f8a4b74fcd3f/jsonschema_specifications-2025.4.1-py3-none-any.whl", hash = "sha256:4653bffbd6584f7de83a67e0d620ef16900b390ddc7939d56684d6c81e33f1af", size = 18437, upload-time = "2025-04-23T12:34:05.422Z" },
|
| 433 |
]
|
| 434 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 435 |
[[package]]
|
| 436 |
name = "langcodes"
|
| 437 |
version = "3.5.0"
|
|
@@ -539,6 +765,18 @@ wheels = [
|
|
| 539 |
{ url = "https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739, upload-time = "2024-10-18T15:21:42.784Z" },
|
| 540 |
]
|
| 541 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 542 |
[[package]]
|
| 543 |
name = "mdurl"
|
| 544 |
version = "0.1.2"
|
|
@@ -588,6 +826,15 @@ wheels = [
|
|
| 588 |
{ url = "https://files.pythonhosted.org/packages/c0/15/278693412221859a0159719878e51a79812a189edceef2fe325160a8e661/narwhals-1.47.1-py3-none-any.whl", hash = "sha256:b9f2b2557aba054231361a00f6fcabc5017e338575e810e82155eb34e38ace93", size = 375506, upload-time = "2025-07-17T18:23:02.492Z" },
|
| 589 |
]
|
| 590 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 591 |
[[package]]
|
| 592 |
name = "networkx"
|
| 593 |
version = "3.5"
|
|
@@ -789,6 +1036,27 @@ wheels = [
|
|
| 789 |
{ url = "https://files.pythonhosted.org/packages/d5/f9/07086f5b0f2a19872554abeea7658200824f5835c58a106fa8f2ae96a46c/pandas-2.3.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5db9637dbc24b631ff3707269ae4559bce4b7fd75c1c4d7e13f40edc42df4444", size = 13189044, upload-time = "2025-07-07T19:19:39.999Z" },
|
| 790 |
]
|
| 791 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 792 |
[[package]]
|
| 793 |
name = "pfzy"
|
| 794 |
version = "0.3.4"
|
|
@@ -864,6 +1132,24 @@ wheels = [
|
|
| 864 |
{ url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835, upload-time = "2025-07-01T09:15:50.399Z" },
|
| 865 |
]
|
| 866 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 867 |
[[package]]
|
| 868 |
name = "plotly"
|
| 869 |
version = "6.2.0"
|
|
@@ -929,6 +1215,39 @@ wheels = [
|
|
| 929 |
{ url = "https://files.pythonhosted.org/packages/f7/af/ab3c51ab7507a7325e98ffe691d9495ee3d3aa5f589afad65ec920d39821/protobuf-6.31.1-py3-none-any.whl", hash = "sha256:720a6c7e6b77288b85063569baae8536671b39f15cc22037ec7045658d80489e", size = 168724, upload-time = "2025-05-28T19:25:53.926Z" },
|
| 930 |
]
|
| 931 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 932 |
[[package]]
|
| 933 |
name = "pyarrow"
|
| 934 |
version = "21.0.0"
|
|
@@ -958,6 +1277,15 @@ wheels = [
|
|
| 958 |
{ url = "https://files.pythonhosted.org/packages/e5/4e/519c1bc1876625fe6b71e9a28287c43ec2f20f73c658b9ae1d485c0c206e/pyarrow-21.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:222c39e2c70113543982c6b34f3077962b44fca38c0bd9e68bb6781534425c10", size = 26371006, upload-time = "2025-07-18T00:56:56.379Z" },
|
| 959 |
]
|
| 960 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 961 |
[[package]]
|
| 962 |
name = "pydantic"
|
| 963 |
version = "2.11.7"
|
|
@@ -1058,6 +1386,22 @@ wheels = [
|
|
| 1058 |
{ url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" },
|
| 1059 |
]
|
| 1060 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1061 |
[[package]]
|
| 1062 |
name = "pyyaml"
|
| 1063 |
version = "6.0.2"
|
|
@@ -1084,6 +1428,36 @@ wheels = [
|
|
| 1084 |
{ url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446, upload-time = "2024-08-06T20:33:04.33Z" },
|
| 1085 |
]
|
| 1086 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1087 |
[[package]]
|
| 1088 |
name = "referencing"
|
| 1089 |
version = "0.36.2"
|
|
@@ -1301,9 +1675,12 @@ name = "simple-text-analyzer"
|
|
| 1301 |
version = "0.1.0"
|
| 1302 |
source = { virtual = "." }
|
| 1303 |
dependencies = [
|
|
|
|
| 1304 |
{ name = "en-core-web-md" },
|
| 1305 |
{ name = "en-core-web-trf" },
|
|
|
|
| 1306 |
{ name = "huggingface-hub", extra = ["cli"] },
|
|
|
|
| 1307 |
{ name = "ja-core-news-md" },
|
| 1308 |
{ name = "ja-core-news-trf" },
|
| 1309 |
{ name = "numpy" },
|
|
@@ -1314,13 +1691,17 @@ dependencies = [
|
|
| 1314 |
{ name = "spacy" },
|
| 1315 |
{ name = "spacy-curated-transformers" },
|
| 1316 |
{ name = "streamlit" },
|
|
|
|
| 1317 |
]
|
| 1318 |
|
| 1319 |
[package.metadata]
|
| 1320 |
requires-dist = [
|
|
|
|
| 1321 |
{ name = "en-core-web-md", url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.0/en_core_web_md-3.7.0-py3-none-any.whl" },
|
| 1322 |
{ name = "en-core-web-trf", url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.2/en_core_web_trf-3.7.2-py3-none-any.whl" },
|
|
|
|
| 1323 |
{ name = "huggingface-hub", extras = ["cli"], specifier = ">=0.33.4" },
|
|
|
|
| 1324 |
{ name = "ja-core-news-md", url = "https://github.com/explosion/spacy-models/releases/download/ja_core_news_md-3.7.0/ja_core_news_md-3.7.0-py3-none-any.whl" },
|
| 1325 |
{ name = "ja-core-news-trf", url = "https://github.com/explosion/spacy-models/releases/download/ja_core_news_trf-3.7.2/ja_core_news_trf-3.7.2-py3-none-any.whl" },
|
| 1326 |
{ name = "numpy", specifier = ">=1.24.0,<2.0" },
|
|
@@ -1331,6 +1712,7 @@ requires-dist = [
|
|
| 1331 |
{ name = "spacy", specifier = ">=3.7.0" },
|
| 1332 |
{ name = "spacy-curated-transformers", specifier = ">=0.1.0,<0.3.0" },
|
| 1333 |
{ name = "streamlit", specifier = ">=1.28.0" },
|
|
|
|
| 1334 |
]
|
| 1335 |
|
| 1336 |
[[package]]
|
|
@@ -1455,6 +1837,20 @@ wheels = [
|
|
| 1455 |
{ url = "https://files.pythonhosted.org/packages/3a/e2/745aeba88a8513017fbac2fd2f9f07b8a36065e51695f818541eb795ec0c/srsly-2.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:e73712be1634b5e1de6f81c273a7d47fe091ad3c79dc779c03d3416a5c117cee", size = 630634, upload-time = "2025-01-17T09:26:10.018Z" },
|
| 1456 |
]
|
| 1457 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1458 |
[[package]]
|
| 1459 |
name = "streamlit"
|
| 1460 |
version = "1.47.0"
|
|
@@ -1643,6 +2039,15 @@ wheels = [
|
|
| 1643 |
{ url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
|
| 1644 |
]
|
| 1645 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1646 |
[[package]]
|
| 1647 |
name = "triton"
|
| 1648 |
version = "3.3.1"
|
|
@@ -1701,6 +2106,18 @@ wheels = [
|
|
| 1701 |
{ url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
|
| 1702 |
]
|
| 1703 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1704 |
[[package]]
|
| 1705 |
name = "urllib3"
|
| 1706 |
version = "2.5.0"
|
|
@@ -1712,14 +2129,11 @@ wheels = [
|
|
| 1712 |
|
| 1713 |
[[package]]
|
| 1714 |
name = "wasabi"
|
| 1715 |
-
version = "
|
| 1716 |
source = { registry = "https://pypi.org/simple" }
|
| 1717 |
-
|
| 1718 |
-
{ name = "colorama", marker = "sys_platform == 'win32'" },
|
| 1719 |
-
]
|
| 1720 |
-
sdist = { url = "https://files.pythonhosted.org/packages/ac/f9/054e6e2f1071e963b5e746b48d1e3727470b2a490834d18ad92364929db3/wasabi-1.1.3.tar.gz", hash = "sha256:4bb3008f003809db0c3e28b4daf20906ea871a2bb43f9914197d540f4f2e0878", size = 30391, upload-time = "2024-05-31T16:56:18.99Z" }
|
| 1721 |
wheels = [
|
| 1722 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 1723 |
]
|
| 1724 |
|
| 1725 |
[[package]]
|
|
|
|
| 27 |
{ url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
|
| 28 |
]
|
| 29 |
|
| 30 |
+
[[package]]
|
| 31 |
+
name = "appnope"
|
| 32 |
+
version = "0.1.4"
|
| 33 |
+
source = { registry = "https://pypi.org/simple" }
|
| 34 |
+
sdist = { url = "https://files.pythonhosted.org/packages/35/5d/752690df9ef5b76e169e68d6a129fa6d08a7100ca7f754c89495db3c6019/appnope-0.1.4.tar.gz", hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee", size = 4170, upload-time = "2024-02-06T09:43:11.258Z" }
|
| 35 |
+
wheels = [
|
| 36 |
+
{ url = "https://files.pythonhosted.org/packages/81/29/5ecc3a15d5a33e31b26c11426c45c501e439cb865d0bff96315d86443b78/appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c", size = 4321, upload-time = "2024-02-06T09:43:09.663Z" },
|
| 37 |
+
]
|
| 38 |
+
|
| 39 |
+
[[package]]
|
| 40 |
+
name = "asttokens"
|
| 41 |
+
version = "3.0.0"
|
| 42 |
+
source = { registry = "https://pypi.org/simple" }
|
| 43 |
+
sdist = { url = "https://files.pythonhosted.org/packages/4a/e7/82da0a03e7ba5141f05cce0d302e6eed121ae055e0456ca228bf693984bc/asttokens-3.0.0.tar.gz", hash = "sha256:0dcd8baa8d62b0c1d118b399b2ddba3c4aff271d0d7a9e0d4c1681c79035bbc7", size = 61978, upload-time = "2024-11-30T04:30:14.439Z" }
|
| 44 |
+
wheels = [
|
| 45 |
+
{ url = "https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2", size = 26918, upload-time = "2024-11-30T04:30:10.946Z" },
|
| 46 |
+
]
|
| 47 |
+
|
| 48 |
[[package]]
|
| 49 |
name = "attrs"
|
| 50 |
version = "25.3.0"
|
|
|
|
| 106 |
{ url = "https://files.pythonhosted.org/packages/4f/52/34c6cf5bb9285074dc3531c437b3919e825d976fde097a7a73f79e726d03/certifi-2025.7.14-py3-none-any.whl", hash = "sha256:6b31f564a415d79ee77df69d757bb49a5bb53bd9f756cbbe24394ffd6fc1f4b2", size = 162722, upload-time = "2025-07-14T03:29:26.863Z" },
|
| 107 |
]
|
| 108 |
|
| 109 |
+
[[package]]
|
| 110 |
+
name = "cffi"
|
| 111 |
+
version = "1.17.1"
|
| 112 |
+
source = { registry = "https://pypi.org/simple" }
|
| 113 |
+
dependencies = [
|
| 114 |
+
{ name = "pycparser" },
|
| 115 |
+
]
|
| 116 |
+
sdist = { url = "https://files.pythonhosted.org/packages/fc/97/c783634659c2920c3fc70419e3af40972dbaf758daa229a7d6ea6135c90d/cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824", size = 516621, upload-time = "2024-09-04T20:45:21.852Z" }
|
| 117 |
+
wheels = [
|
| 118 |
+
{ url = "https://files.pythonhosted.org/packages/5a/84/e94227139ee5fb4d600a7a4927f322e1d4aea6fdc50bd3fca8493caba23f/cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4", size = 183178, upload-time = "2024-09-04T20:44:12.232Z" },
|
| 119 |
+
{ url = "https://files.pythonhosted.org/packages/da/ee/fb72c2b48656111c4ef27f0f91da355e130a923473bf5ee75c5643d00cca/cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c", size = 178840, upload-time = "2024-09-04T20:44:13.739Z" },
|
| 120 |
+
{ url = "https://files.pythonhosted.org/packages/cc/b6/db007700f67d151abadf508cbfd6a1884f57eab90b1bb985c4c8c02b0f28/cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36", size = 454803, upload-time = "2024-09-04T20:44:15.231Z" },
|
| 121 |
+
{ url = "https://files.pythonhosted.org/packages/1a/df/f8d151540d8c200eb1c6fba8cd0dfd40904f1b0682ea705c36e6c2e97ab3/cffi-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5", size = 478850, upload-time = "2024-09-04T20:44:17.188Z" },
|
| 122 |
+
{ url = "https://files.pythonhosted.org/packages/28/c0/b31116332a547fd2677ae5b78a2ef662dfc8023d67f41b2a83f7c2aa78b1/cffi-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff", size = 485729, upload-time = "2024-09-04T20:44:18.688Z" },
|
| 123 |
+
{ url = "https://files.pythonhosted.org/packages/91/2b/9a1ddfa5c7f13cab007a2c9cc295b70fbbda7cb10a286aa6810338e60ea1/cffi-1.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99", size = 471256, upload-time = "2024-09-04T20:44:20.248Z" },
|
| 124 |
+
{ url = "https://files.pythonhosted.org/packages/b2/d5/da47df7004cb17e4955df6a43d14b3b4ae77737dff8bf7f8f333196717bf/cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93", size = 479424, upload-time = "2024-09-04T20:44:21.673Z" },
|
| 125 |
+
{ url = "https://files.pythonhosted.org/packages/0b/ac/2a28bcf513e93a219c8a4e8e125534f4f6db03e3179ba1c45e949b76212c/cffi-1.17.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3", size = 484568, upload-time = "2024-09-04T20:44:23.245Z" },
|
| 126 |
+
{ url = "https://files.pythonhosted.org/packages/d4/38/ca8a4f639065f14ae0f1d9751e70447a261f1a30fa7547a828ae08142465/cffi-1.17.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8", size = 488736, upload-time = "2024-09-04T20:44:24.757Z" },
|
| 127 |
+
{ url = "https://files.pythonhosted.org/packages/86/c5/28b2d6f799ec0bdecf44dced2ec5ed43e0eb63097b0f58c293583b406582/cffi-1.17.1-cp312-cp312-win32.whl", hash = "sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65", size = 172448, upload-time = "2024-09-04T20:44:26.208Z" },
|
| 128 |
+
{ url = "https://files.pythonhosted.org/packages/50/b9/db34c4755a7bd1cb2d1603ac3863f22bcecbd1ba29e5ee841a4bc510b294/cffi-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903", size = 181976, upload-time = "2024-09-04T20:44:27.578Z" },
|
| 129 |
+
{ url = "https://files.pythonhosted.org/packages/8d/f8/dd6c246b148639254dad4d6803eb6a54e8c85c6e11ec9df2cffa87571dbe/cffi-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e", size = 182989, upload-time = "2024-09-04T20:44:28.956Z" },
|
| 130 |
+
{ url = "https://files.pythonhosted.org/packages/8b/f1/672d303ddf17c24fc83afd712316fda78dc6fce1cd53011b839483e1ecc8/cffi-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2", size = 178802, upload-time = "2024-09-04T20:44:30.289Z" },
|
| 131 |
+
{ url = "https://files.pythonhosted.org/packages/0e/2d/eab2e858a91fdff70533cab61dcff4a1f55ec60425832ddfdc9cd36bc8af/cffi-1.17.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3", size = 454792, upload-time = "2024-09-04T20:44:32.01Z" },
|
| 132 |
+
{ url = "https://files.pythonhosted.org/packages/75/b2/fbaec7c4455c604e29388d55599b99ebcc250a60050610fadde58932b7ee/cffi-1.17.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683", size = 478893, upload-time = "2024-09-04T20:44:33.606Z" },
|
| 133 |
+
{ url = "https://files.pythonhosted.org/packages/4f/b7/6e4a2162178bf1935c336d4da8a9352cccab4d3a5d7914065490f08c0690/cffi-1.17.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5", size = 485810, upload-time = "2024-09-04T20:44:35.191Z" },
|
| 134 |
+
{ url = "https://files.pythonhosted.org/packages/c7/8a/1d0e4a9c26e54746dc08c2c6c037889124d4f59dffd853a659fa545f1b40/cffi-1.17.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4", size = 471200, upload-time = "2024-09-04T20:44:36.743Z" },
|
| 135 |
+
{ url = "https://files.pythonhosted.org/packages/26/9f/1aab65a6c0db35f43c4d1b4f580e8df53914310afc10ae0397d29d697af4/cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd", size = 479447, upload-time = "2024-09-04T20:44:38.492Z" },
|
| 136 |
+
{ url = "https://files.pythonhosted.org/packages/5f/e4/fb8b3dd8dc0e98edf1135ff067ae070bb32ef9d509d6cb0f538cd6f7483f/cffi-1.17.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed", size = 484358, upload-time = "2024-09-04T20:44:40.046Z" },
|
| 137 |
+
{ url = "https://files.pythonhosted.org/packages/f1/47/d7145bf2dc04684935d57d67dff9d6d795b2ba2796806bb109864be3a151/cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9", size = 488469, upload-time = "2024-09-04T20:44:41.616Z" },
|
| 138 |
+
{ url = "https://files.pythonhosted.org/packages/bf/ee/f94057fa6426481d663b88637a9a10e859e492c73d0384514a17d78ee205/cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d", size = 172475, upload-time = "2024-09-04T20:44:43.733Z" },
|
| 139 |
+
{ url = "https://files.pythonhosted.org/packages/7c/fc/6a8cb64e5f0324877d503c854da15d76c1e50eb722e320b15345c4d0c6de/cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a", size = 182009, upload-time = "2024-09-04T20:44:45.309Z" },
|
| 140 |
+
]
|
| 141 |
+
|
| 142 |
+
[[package]]
|
| 143 |
+
name = "chardet"
|
| 144 |
+
version = "5.2.0"
|
| 145 |
+
source = { registry = "https://pypi.org/simple" }
|
| 146 |
+
sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/f7b6ab21ec75897ed80c17d79b15951a719226b9fababf1e40ea74d69079/chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7", size = 2069618, upload-time = "2023-08-01T19:23:02.662Z" }
|
| 147 |
+
wheels = [
|
| 148 |
+
{ url = "https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970", size = 199385, upload-time = "2023-08-01T19:23:00.661Z" },
|
| 149 |
+
]
|
| 150 |
+
|
| 151 |
[[package]]
|
| 152 |
name = "charset-normalizer"
|
| 153 |
version = "3.4.2"
|
|
|
|
| 213 |
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
|
| 214 |
]
|
| 215 |
|
| 216 |
+
[[package]]
|
| 217 |
+
name = "comm"
|
| 218 |
+
version = "0.2.2"
|
| 219 |
+
source = { registry = "https://pypi.org/simple" }
|
| 220 |
+
dependencies = [
|
| 221 |
+
{ name = "traitlets" },
|
| 222 |
+
]
|
| 223 |
+
sdist = { url = "https://files.pythonhosted.org/packages/e9/a8/fb783cb0abe2b5fded9f55e5703015cdf1c9c85b3669087c538dd15a6a86/comm-0.2.2.tar.gz", hash = "sha256:3fd7a84065306e07bea1773df6eb8282de51ba82f77c72f9c85716ab11fe980e", size = 6210, upload-time = "2024-03-12T16:53:41.133Z" }
|
| 224 |
+
wheels = [
|
| 225 |
+
{ url = "https://files.pythonhosted.org/packages/e6/75/49e5bfe642f71f272236b5b2d2691cf915a7283cc0ceda56357b61daa538/comm-0.2.2-py3-none-any.whl", hash = "sha256:e6fb86cb70ff661ee8c9c14e7d36d6de3b4066f1441be4063df9c5009f0a64d3", size = 7180, upload-time = "2024-03-12T16:53:39.226Z" },
|
| 226 |
+
]
|
| 227 |
+
|
| 228 |
[[package]]
|
| 229 |
name = "confection"
|
| 230 |
version = "0.1.5"
|
|
|
|
| 288 |
{ url = "https://files.pythonhosted.org/packages/74/65/c162fbac63e867a055240b6600b92ef96c0eb7a1895312ac53c4be93d056/cymem-2.0.11-cp313-cp313-win_amd64.whl", hash = "sha256:25da111adf425c29af0cfd9fecfec1c71c8d82e2244a85166830a0817a66ada7", size = 39090, upload-time = "2025-01-16T21:50:24.239Z" },
|
| 289 |
]
|
| 290 |
|
| 291 |
+
[[package]]
|
| 292 |
+
name = "debugpy"
|
| 293 |
+
version = "1.8.15"
|
| 294 |
+
source = { registry = "https://pypi.org/simple" }
|
| 295 |
+
sdist = { url = "https://files.pythonhosted.org/packages/8c/8b/3a9a28ddb750a76eaec445c7f4d3147ea2c579a97dbd9e25d39001b92b21/debugpy-1.8.15.tar.gz", hash = "sha256:58d7a20b7773ab5ee6bdfb2e6cf622fdf1e40c9d5aef2857d85391526719ac00", size = 1643279, upload-time = "2025-07-15T16:43:29.135Z" }
|
| 296 |
+
wheels = [
|
| 297 |
+
{ url = "https://files.pythonhosted.org/packages/ab/4a/4508d256e52897f5cdfee6a6d7580974811e911c6d01321df3264508a5ac/debugpy-1.8.15-cp312-cp312-macosx_14_0_universal2.whl", hash = "sha256:3dcc7225cb317469721ab5136cda9ff9c8b6e6fb43e87c9e15d5b108b99d01ba", size = 2511197, upload-time = "2025-07-15T16:43:42.343Z" },
|
| 298 |
+
{ url = "https://files.pythonhosted.org/packages/99/8d/7f6ef1097e7fecf26b4ef72338d08e41644a41b7ee958a19f494ffcffc29/debugpy-1.8.15-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:047a493ca93c85ccede1dbbaf4e66816794bdc214213dde41a9a61e42d27f8fc", size = 4229517, upload-time = "2025-07-15T16:43:44.14Z" },
|
| 299 |
+
{ url = "https://files.pythonhosted.org/packages/3f/e8/e8c6a9aa33a9c9c6dacbf31747384f6ed2adde4de2e9693c766bdf323aa3/debugpy-1.8.15-cp312-cp312-win32.whl", hash = "sha256:b08e9b0bc260cf324c890626961dad4ffd973f7568fbf57feb3c3a65ab6b6327", size = 5276132, upload-time = "2025-07-15T16:43:45.529Z" },
|
| 300 |
+
{ url = "https://files.pythonhosted.org/packages/e9/ad/231050c6177b3476b85fcea01e565dac83607b5233d003ff067e2ee44d8f/debugpy-1.8.15-cp312-cp312-win_amd64.whl", hash = "sha256:e2a4fe357c92334272eb2845fcfcdbec3ef9f22c16cf613c388ac0887aed15fa", size = 5317645, upload-time = "2025-07-15T16:43:46.968Z" },
|
| 301 |
+
{ url = "https://files.pythonhosted.org/packages/28/70/2928aad2310726d5920b18ed9f54b9f06df5aa4c10cf9b45fa18ff0ab7e8/debugpy-1.8.15-cp313-cp313-macosx_14_0_universal2.whl", hash = "sha256:f5e01291ad7d6649aed5773256c5bba7a1a556196300232de1474c3c372592bf", size = 2495538, upload-time = "2025-07-15T16:43:48.927Z" },
|
| 302 |
+
{ url = "https://files.pythonhosted.org/packages/9e/c6/9b8ffb4ca91fac8b2877eef63c9cc0e87dd2570b1120054c272815ec4cd0/debugpy-1.8.15-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94dc0f0d00e528d915e0ce1c78e771475b2335b376c49afcc7382ee0b146bab6", size = 4221874, upload-time = "2025-07-15T16:43:50.282Z" },
|
| 303 |
+
{ url = "https://files.pythonhosted.org/packages/55/8a/9b8d59674b4bf489318c7c46a1aab58e606e583651438084b7e029bf3c43/debugpy-1.8.15-cp313-cp313-win32.whl", hash = "sha256:fcf0748d4f6e25f89dc5e013d1129ca6f26ad4da405e0723a4f704583896a709", size = 5275949, upload-time = "2025-07-15T16:43:52.079Z" },
|
| 304 |
+
{ url = "https://files.pythonhosted.org/packages/72/83/9e58e6fdfa8710a5e6ec06c2401241b9ad48b71c0a7eb99570a1f1edb1d3/debugpy-1.8.15-cp313-cp313-win_amd64.whl", hash = "sha256:73c943776cb83e36baf95e8f7f8da765896fd94b05991e7bc162456d25500683", size = 5317720, upload-time = "2025-07-15T16:43:53.703Z" },
|
| 305 |
+
{ url = "https://files.pythonhosted.org/packages/07/d5/98748d9860e767a1248b5e31ffa7ce8cb7006e97bf8abbf3d891d0a8ba4e/debugpy-1.8.15-py2.py3-none-any.whl", hash = "sha256:bce2e6c5ff4f2e00b98d45e7e01a49c7b489ff6df5f12d881c67d2f1ac635f3d", size = 5282697, upload-time = "2025-07-15T16:44:07.996Z" },
|
| 306 |
+
]
|
| 307 |
+
|
| 308 |
+
[[package]]
|
| 309 |
+
name = "decorator"
|
| 310 |
+
version = "5.2.1"
|
| 311 |
+
source = { registry = "https://pypi.org/simple" }
|
| 312 |
+
sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711, upload-time = "2025-02-24T04:41:34.073Z" }
|
| 313 |
+
wheels = [
|
| 314 |
+
{ url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190, upload-time = "2025-02-24T04:41:32.565Z" },
|
| 315 |
+
]
|
| 316 |
+
|
| 317 |
[[package]]
|
| 318 |
name = "en-core-web-md"
|
| 319 |
version = "3.7.0"
|
|
|
|
| 346 |
{ name = "spacy-curated-transformers", specifier = ">=0.2.0,<0.3.0" },
|
| 347 |
]
|
| 348 |
|
| 349 |
+
[[package]]
|
| 350 |
+
name = "executing"
|
| 351 |
+
version = "2.2.0"
|
| 352 |
+
source = { registry = "https://pypi.org/simple" }
|
| 353 |
+
sdist = { url = "https://files.pythonhosted.org/packages/91/50/a9d80c47ff289c611ff12e63f7c5d13942c65d68125160cefd768c73e6e4/executing-2.2.0.tar.gz", hash = "sha256:5d108c028108fe2551d1a7b2e8b713341e2cb4fc0aa7dcf966fa4327a5226755", size = 978693, upload-time = "2025-01-22T15:41:29.403Z" }
|
| 354 |
+
wheels = [
|
| 355 |
+
{ url = "https://files.pythonhosted.org/packages/7b/8f/c4d9bafc34ad7ad5d8dc16dd1347ee0e507a52c3adb6bfa8887e1c6a26ba/executing-2.2.0-py2.py3-none-any.whl", hash = "sha256:11387150cad388d62750327a53d3339fad4888b39a6fe233c3afbb54ecffd3aa", size = 26702, upload-time = "2025-01-22T15:41:25.929Z" },
|
| 356 |
+
]
|
| 357 |
+
|
| 358 |
[[package]]
|
| 359 |
name = "filelock"
|
| 360 |
version = "3.18.0"
|
|
|
|
| 373 |
{ url = "https://files.pythonhosted.org/packages/2f/e0/014d5d9d7a4564cf1c40b5039bc882db69fd881111e03ab3657ac0b218e2/fsspec-2025.7.0-py3-none-any.whl", hash = "sha256:8b012e39f63c7d5f10474de957f3ab793b47b45ae7d39f2fb735f8bbe25c0e21", size = 199597, upload-time = "2025-07-15T16:05:19.529Z" },
|
| 374 |
]
|
| 375 |
|
| 376 |
+
[[package]]
|
| 377 |
+
name = "fugashi"
|
| 378 |
+
version = "1.5.1"
|
| 379 |
+
source = { registry = "https://pypi.org/simple" }
|
| 380 |
+
sdist = { url = "https://files.pythonhosted.org/packages/5e/09/e41bb13152e591f3dd5984be112a97927f6a1ae73ab0301f3cbd1c38db20/fugashi-1.5.1.tar.gz", hash = "sha256:3ff9b4d0e40e04d56d7ced906ae8fba6c6fa41aac46f5210de1b56d6626e7a1f", size = 339745, upload-time = "2025-06-05T10:29:49.158Z" }
|
| 381 |
+
wheels = [
|
| 382 |
+
{ url = "https://files.pythonhosted.org/packages/0f/03/cb79fcc4ec503e39e4aec9878aa4ee2038f56794f418de7e5dccc127b6c3/fugashi-1.5.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9d6e6398a1dd8d704dbd26790195455166f6f93d0fdbebf5d1913a69d15adb22", size = 562515, upload-time = "2025-06-05T10:35:16.458Z" },
|
| 383 |
+
{ url = "https://files.pythonhosted.org/packages/17/6d/cf637e80350e2127d682593ba51916c19dbea9eb7abc5f69b58c5cbbd0d6/fugashi-1.5.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a02a8e4ab7758c29d8b217c8d7b019079220846fdeb04b7e1ddd4dfdb2570b7e", size = 507454, upload-time = "2025-06-05T10:35:17.982Z" },
|
| 384 |
+
{ url = "https://files.pythonhosted.org/packages/51/a1/41eeea4f5e71615b60f0ad39037dbbd787b9376e383219a2cc48e94b3733/fugashi-1.5.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a01c97af19a378545d7600bcb10552bebb4fe70b54a66032cc78cee1be328d66", size = 503416, upload-time = "2025-06-05T10:35:19.041Z" },
|
| 385 |
+
{ url = "https://files.pythonhosted.org/packages/a6/c1/02fa1c2bcdbb661cc618d11ef23aef5ed243a8f2e680cbf7398ae913961e/fugashi-1.5.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:97906d1c7c56907b87c3fcf587a4990504784d7beecb67673c78c8dd608644c1", size = 675822, upload-time = "2025-06-05T10:54:33.357Z" },
|
| 386 |
+
{ url = "https://files.pythonhosted.org/packages/ee/be/e5723a9c3a6866c14207e7dbb6d06bc49d55ea97e1784bf1096c86f0d954/fugashi-1.5.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:823e6db97d57079da4c3fcc26f04943b894974af5a22f4762e6f6ba2ed63f212", size = 697875, upload-time = "2025-06-05T10:30:50.634Z" },
|
| 387 |
+
{ url = "https://files.pythonhosted.org/packages/f4/bc/a65acd05eca1e5583f34f215df866635a232e6345a80d965ed23d1af0718/fugashi-1.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:5afa5a2bf11d039a8e45eac0ba5c2bff54ed9ef9379cb9ad7f67c987a7f6dfc0", size = 513282, upload-time = "2025-06-05T10:29:38.667Z" },
|
| 388 |
+
{ url = "https://files.pythonhosted.org/packages/0d/2c/684cd6bb8d0a988f1d4b7e41c8eebe0385417113b2a18006c3d032df7139/fugashi-1.5.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:e6f69766af17299635fa5c2ee9fe99476482003126ee1769f565a661ebd4cfb1", size = 560845, upload-time = "2025-06-05T10:35:20.042Z" },
|
| 389 |
+
{ url = "https://files.pythonhosted.org/packages/96/c8/e8ce5efa5a7a80a5ad75770f1944c4b22694408e956b7d8a5780cda879dd/fugashi-1.5.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2055a0e0993261906f3010522ccc94b8bb9278b35a726ed58b314a5b539b9511", size = 506664, upload-time = "2025-06-05T10:35:21.015Z" },
|
| 390 |
+
{ url = "https://files.pythonhosted.org/packages/1a/5d/46a06d2ed06cccf8a553ba0c6d723bb9863b0a02ba81463a425e30eab082/fugashi-1.5.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f0f3e269bfd9ba92c64086d9e6963a0bd81a3dffb9b6eeb981f33902738b7956", size = 502687, upload-time = "2025-06-05T10:35:22.298Z" },
|
| 391 |
+
{ url = "https://files.pythonhosted.org/packages/14/89/7f90847fd65ea1ef50a070b0cb63a8fad12b18f54d95627cf4ac57af3a41/fugashi-1.5.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:53ff43069ed46bd0d5dec4140115f7883bf4a590d70f3c90a422c61260be342b", size = 672332, upload-time = "2025-06-05T10:54:34.757Z" },
|
| 392 |
+
{ url = "https://files.pythonhosted.org/packages/72/6e/b92fec651f430e258c9fd0a82b924be2fcc23d0defd74e76ad6a5bbd97f6/fugashi-1.5.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:53ce31df44b4e95904793280eda0e9895646828859801d457314efc1d535cb4f", size = 693962, upload-time = "2025-06-05T10:30:52.246Z" },
|
| 393 |
+
{ url = "https://files.pythonhosted.org/packages/84/a9/72a7c8261ddceb0fbaee8fe075d4acd9023504c8fa8cbea2cf6140892040/fugashi-1.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:adf1646103151af5c0b78f11fd01e145c506774609243935c0978606e4a96ad3", size = 513083, upload-time = "2025-06-05T10:29:30.189Z" },
|
| 394 |
+
]
|
| 395 |
+
|
| 396 |
[[package]]
|
| 397 |
name = "gitdb"
|
| 398 |
version = "4.0.12"
|
|
|
|
| 478 |
{ url = "https://files.pythonhosted.org/packages/ce/ff/3b59672c47c6284e8005b42e84ceba13864aa0f39f067c973d1af02f5d91/InquirerPy-0.3.4-py3-none-any.whl", hash = "sha256:c65fdfbac1fa00e3ee4fb10679f4d3ed7a012abf4833910e63c295827fe2a7d4", size = 67677, upload-time = "2022-06-27T23:11:17.723Z" },
|
| 479 |
]
|
| 480 |
|
| 481 |
+
[[package]]
|
| 482 |
+
name = "ipykernel"
|
| 483 |
+
version = "6.29.5"
|
| 484 |
+
source = { registry = "https://pypi.org/simple" }
|
| 485 |
+
dependencies = [
|
| 486 |
+
{ name = "appnope", marker = "sys_platform == 'darwin'" },
|
| 487 |
+
{ name = "comm" },
|
| 488 |
+
{ name = "debugpy" },
|
| 489 |
+
{ name = "ipython" },
|
| 490 |
+
{ name = "jupyter-client" },
|
| 491 |
+
{ name = "jupyter-core" },
|
| 492 |
+
{ name = "matplotlib-inline" },
|
| 493 |
+
{ name = "nest-asyncio" },
|
| 494 |
+
{ name = "packaging" },
|
| 495 |
+
{ name = "psutil" },
|
| 496 |
+
{ name = "pyzmq" },
|
| 497 |
+
{ name = "tornado" },
|
| 498 |
+
{ name = "traitlets" },
|
| 499 |
+
]
|
| 500 |
+
sdist = { url = "https://files.pythonhosted.org/packages/e9/5c/67594cb0c7055dc50814b21731c22a601101ea3b1b50a9a1b090e11f5d0f/ipykernel-6.29.5.tar.gz", hash = "sha256:f093a22c4a40f8828f8e330a9c297cb93dcab13bd9678ded6de8e5cf81c56215", size = 163367, upload-time = "2024-07-01T14:07:22.543Z" }
|
| 501 |
+
wheels = [
|
| 502 |
+
{ url = "https://files.pythonhosted.org/packages/94/5c/368ae6c01c7628438358e6d337c19b05425727fbb221d2a3c4303c372f42/ipykernel-6.29.5-py3-none-any.whl", hash = "sha256:afdb66ba5aa354b09b91379bac28ae4afebbb30e8b39510c9690afb7a10421b5", size = 117173, upload-time = "2024-07-01T14:07:19.603Z" },
|
| 503 |
+
]
|
| 504 |
+
|
| 505 |
+
[[package]]
|
| 506 |
+
name = "ipython"
|
| 507 |
+
version = "9.4.0"
|
| 508 |
+
source = { registry = "https://pypi.org/simple" }
|
| 509 |
+
dependencies = [
|
| 510 |
+
{ name = "colorama", marker = "sys_platform == 'win32'" },
|
| 511 |
+
{ name = "decorator" },
|
| 512 |
+
{ name = "ipython-pygments-lexers" },
|
| 513 |
+
{ name = "jedi" },
|
| 514 |
+
{ name = "matplotlib-inline" },
|
| 515 |
+
{ name = "pexpect", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
|
| 516 |
+
{ name = "prompt-toolkit" },
|
| 517 |
+
{ name = "pygments" },
|
| 518 |
+
{ name = "stack-data" },
|
| 519 |
+
{ name = "traitlets" },
|
| 520 |
+
]
|
| 521 |
+
sdist = { url = "https://files.pythonhosted.org/packages/54/80/406f9e3bde1c1fd9bf5a0be9d090f8ae623e401b7670d8f6fdf2ab679891/ipython-9.4.0.tar.gz", hash = "sha256:c033c6d4e7914c3d9768aabe76bbe87ba1dc66a92a05db6bfa1125d81f2ee270", size = 4385338, upload-time = "2025-07-01T11:11:30.606Z" }
|
| 522 |
+
wheels = [
|
| 523 |
+
{ url = "https://files.pythonhosted.org/packages/63/f8/0031ee2b906a15a33d6bfc12dd09c3dfa966b3cb5b284ecfb7549e6ac3c4/ipython-9.4.0-py3-none-any.whl", hash = "sha256:25850f025a446d9b359e8d296ba175a36aedd32e83ca9b5060430fe16801f066", size = 611021, upload-time = "2025-07-01T11:11:27.85Z" },
|
| 524 |
+
]
|
| 525 |
+
|
| 526 |
+
[[package]]
|
| 527 |
+
name = "ipython-pygments-lexers"
|
| 528 |
+
version = "1.1.1"
|
| 529 |
+
source = { registry = "https://pypi.org/simple" }
|
| 530 |
+
dependencies = [
|
| 531 |
+
{ name = "pygments" },
|
| 532 |
+
]
|
| 533 |
+
sdist = { url = "https://files.pythonhosted.org/packages/ef/4c/5dd1d8af08107f88c7f741ead7a40854b8ac24ddf9ae850afbcf698aa552/ipython_pygments_lexers-1.1.1.tar.gz", hash = "sha256:09c0138009e56b6854f9535736f4171d855c8c08a563a0dcd8022f78355c7e81", size = 8393, upload-time = "2025-01-17T11:24:34.505Z" }
|
| 534 |
+
wheels = [
|
| 535 |
+
{ url = "https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c", size = 8074, upload-time = "2025-01-17T11:24:33.271Z" },
|
| 536 |
+
]
|
| 537 |
+
|
| 538 |
[[package]]
|
| 539 |
name = "ja-core-news-md"
|
| 540 |
version = "3.7.0"
|
|
|
|
| 577 |
{ name = "sudachipy", specifier = ">=0.5.2,!=0.6.1" },
|
| 578 |
]
|
| 579 |
|
| 580 |
+
[[package]]
|
| 581 |
+
name = "jedi"
|
| 582 |
+
version = "0.19.2"
|
| 583 |
+
source = { registry = "https://pypi.org/simple" }
|
| 584 |
+
dependencies = [
|
| 585 |
+
{ name = "parso" },
|
| 586 |
+
]
|
| 587 |
+
sdist = { url = "https://files.pythonhosted.org/packages/72/3a/79a912fbd4d8dd6fbb02bf69afd3bb72cf0c729bb3063c6f4498603db17a/jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0", size = 1231287, upload-time = "2024-11-11T01:41:42.873Z" }
|
| 588 |
+
wheels = [
|
| 589 |
+
{ url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278, upload-time = "2024-11-11T01:41:40.175Z" },
|
| 590 |
+
]
|
| 591 |
+
|
| 592 |
[[package]]
|
| 593 |
name = "jinja2"
|
| 594 |
version = "3.1.6"
|
|
|
|
| 628 |
{ url = "https://files.pythonhosted.org/packages/01/0e/b27cdbaccf30b890c40ed1da9fd4a3593a5cf94dae54fb34f8a4b74fcd3f/jsonschema_specifications-2025.4.1-py3-none-any.whl", hash = "sha256:4653bffbd6584f7de83a67e0d620ef16900b390ddc7939d56684d6c81e33f1af", size = 18437, upload-time = "2025-04-23T12:34:05.422Z" },
|
| 629 |
]
|
| 630 |
|
| 631 |
+
[[package]]
|
| 632 |
+
name = "jupyter-client"
|
| 633 |
+
version = "8.6.3"
|
| 634 |
+
source = { registry = "https://pypi.org/simple" }
|
| 635 |
+
dependencies = [
|
| 636 |
+
{ name = "jupyter-core" },
|
| 637 |
+
{ name = "python-dateutil" },
|
| 638 |
+
{ name = "pyzmq" },
|
| 639 |
+
{ name = "tornado" },
|
| 640 |
+
{ name = "traitlets" },
|
| 641 |
+
]
|
| 642 |
+
sdist = { url = "https://files.pythonhosted.org/packages/71/22/bf9f12fdaeae18019a468b68952a60fe6dbab5d67cd2a103cac7659b41ca/jupyter_client-8.6.3.tar.gz", hash = "sha256:35b3a0947c4a6e9d589eb97d7d4cd5e90f910ee73101611f01283732bd6d9419", size = 342019, upload-time = "2024-09-17T10:44:17.613Z" }
|
| 643 |
+
wheels = [
|
| 644 |
+
{ url = "https://files.pythonhosted.org/packages/11/85/b0394e0b6fcccd2c1eeefc230978a6f8cb0c5df1e4cd3e7625735a0d7d1e/jupyter_client-8.6.3-py3-none-any.whl", hash = "sha256:e8a19cc986cc45905ac3362915f410f3af85424b4c0905e94fa5f2cb08e8f23f", size = 106105, upload-time = "2024-09-17T10:44:15.218Z" },
|
| 645 |
+
]
|
| 646 |
+
|
| 647 |
+
[[package]]
|
| 648 |
+
name = "jupyter-core"
|
| 649 |
+
version = "5.8.1"
|
| 650 |
+
source = { registry = "https://pypi.org/simple" }
|
| 651 |
+
dependencies = [
|
| 652 |
+
{ name = "platformdirs" },
|
| 653 |
+
{ name = "pywin32", marker = "platform_python_implementation != 'PyPy' and sys_platform == 'win32'" },
|
| 654 |
+
{ name = "traitlets" },
|
| 655 |
+
]
|
| 656 |
+
sdist = { url = "https://files.pythonhosted.org/packages/99/1b/72906d554acfeb588332eaaa6f61577705e9ec752ddb486f302dafa292d9/jupyter_core-5.8.1.tar.gz", hash = "sha256:0a5f9706f70e64786b75acba995988915ebd4601c8a52e534a40b51c95f59941", size = 88923, upload-time = "2025-05-27T07:38:16.655Z" }
|
| 657 |
+
wheels = [
|
| 658 |
+
{ url = "https://files.pythonhosted.org/packages/2f/57/6bffd4b20b88da3800c5d691e0337761576ee688eb01299eae865689d2df/jupyter_core-5.8.1-py3-none-any.whl", hash = "sha256:c28d268fc90fb53f1338ded2eb410704c5449a358406e8a948b75706e24863d0", size = 28880, upload-time = "2025-05-27T07:38:15.137Z" },
|
| 659 |
+
]
|
| 660 |
+
|
| 661 |
[[package]]
|
| 662 |
name = "langcodes"
|
| 663 |
version = "3.5.0"
|
|
|
|
| 765 |
{ url = "https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739, upload-time = "2024-10-18T15:21:42.784Z" },
|
| 766 |
]
|
| 767 |
|
| 768 |
+
[[package]]
|
| 769 |
+
name = "matplotlib-inline"
|
| 770 |
+
version = "0.1.7"
|
| 771 |
+
source = { registry = "https://pypi.org/simple" }
|
| 772 |
+
dependencies = [
|
| 773 |
+
{ name = "traitlets" },
|
| 774 |
+
]
|
| 775 |
+
sdist = { url = "https://files.pythonhosted.org/packages/99/5b/a36a337438a14116b16480db471ad061c36c3694df7c2084a0da7ba538b7/matplotlib_inline-0.1.7.tar.gz", hash = "sha256:8423b23ec666be3d16e16b60bdd8ac4e86e840ebd1dd11a30b9f117f2fa0ab90", size = 8159, upload-time = "2024-04-15T13:44:44.803Z" }
|
| 776 |
+
wheels = [
|
| 777 |
+
{ url = "https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca", size = 9899, upload-time = "2024-04-15T13:44:43.265Z" },
|
| 778 |
+
]
|
| 779 |
+
|
| 780 |
[[package]]
|
| 781 |
name = "mdurl"
|
| 782 |
version = "0.1.2"
|
|
|
|
| 826 |
{ url = "https://files.pythonhosted.org/packages/c0/15/278693412221859a0159719878e51a79812a189edceef2fe325160a8e661/narwhals-1.47.1-py3-none-any.whl", hash = "sha256:b9f2b2557aba054231361a00f6fcabc5017e338575e810e82155eb34e38ace93", size = 375506, upload-time = "2025-07-17T18:23:02.492Z" },
|
| 827 |
]
|
| 828 |
|
| 829 |
+
[[package]]
|
| 830 |
+
name = "nest-asyncio"
|
| 831 |
+
version = "1.6.0"
|
| 832 |
+
source = { registry = "https://pypi.org/simple" }
|
| 833 |
+
sdist = { url = "https://files.pythonhosted.org/packages/83/f8/51569ac65d696c8ecbee95938f89d4abf00f47d58d48f6fbabfe8f0baefe/nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe", size = 7418, upload-time = "2024-01-21T14:25:19.227Z" }
|
| 834 |
+
wheels = [
|
| 835 |
+
{ url = "https://files.pythonhosted.org/packages/a0/c4/c2971a3ba4c6103a3d10c4b0f24f461ddc027f0f09763220cf35ca1401b3/nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c", size = 5195, upload-time = "2024-01-21T14:25:17.223Z" },
|
| 836 |
+
]
|
| 837 |
+
|
| 838 |
[[package]]
|
| 839 |
name = "networkx"
|
| 840 |
version = "3.5"
|
|
|
|
| 1036 |
{ url = "https://files.pythonhosted.org/packages/d5/f9/07086f5b0f2a19872554abeea7658200824f5835c58a106fa8f2ae96a46c/pandas-2.3.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5db9637dbc24b631ff3707269ae4559bce4b7fd75c1c4d7e13f40edc42df4444", size = 13189044, upload-time = "2025-07-07T19:19:39.999Z" },
|
| 1037 |
]
|
| 1038 |
|
| 1039 |
+
[[package]]
|
| 1040 |
+
name = "parso"
|
| 1041 |
+
version = "0.8.4"
|
| 1042 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1043 |
+
sdist = { url = "https://files.pythonhosted.org/packages/66/94/68e2e17afaa9169cf6412ab0f28623903be73d1b32e208d9e8e541bb086d/parso-0.8.4.tar.gz", hash = "sha256:eb3a7b58240fb99099a345571deecc0f9540ea5f4dd2fe14c2a99d6b281ab92d", size = 400609, upload-time = "2024-04-05T09:43:55.897Z" }
|
| 1044 |
+
wheels = [
|
| 1045 |
+
{ url = "https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18", size = 103650, upload-time = "2024-04-05T09:43:53.299Z" },
|
| 1046 |
+
]
|
| 1047 |
+
|
| 1048 |
+
[[package]]
|
| 1049 |
+
name = "pexpect"
|
| 1050 |
+
version = "4.9.0"
|
| 1051 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1052 |
+
dependencies = [
|
| 1053 |
+
{ name = "ptyprocess" },
|
| 1054 |
+
]
|
| 1055 |
+
sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450, upload-time = "2023-11-25T09:07:26.339Z" }
|
| 1056 |
+
wheels = [
|
| 1057 |
+
{ url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772, upload-time = "2023-11-25T06:56:14.81Z" },
|
| 1058 |
+
]
|
| 1059 |
+
|
| 1060 |
[[package]]
|
| 1061 |
name = "pfzy"
|
| 1062 |
version = "0.3.4"
|
|
|
|
| 1132 |
{ url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835, upload-time = "2025-07-01T09:15:50.399Z" },
|
| 1133 |
]
|
| 1134 |
|
| 1135 |
+
[[package]]
|
| 1136 |
+
name = "plac"
|
| 1137 |
+
version = "1.4.5"
|
| 1138 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1139 |
+
sdist = { url = "https://files.pythonhosted.org/packages/23/09/26ef2d614cabdcc52a7f383d0dc7967bf46be3c9700898c594e37b710c3d/plac-1.4.5.tar.gz", hash = "sha256:5f05bf85235c017fcd76c73c8101d4ff8e96beb3dc58b9a37de49cac7de82d14", size = 38988, upload-time = "2025-04-04T14:03:25.651Z" }
|
| 1140 |
+
wheels = [
|
| 1141 |
+
{ url = "https://files.pythonhosted.org/packages/15/36/38676114a0dbee137ec366daa86603d667a07e9a52667d5ebf5c580100ba/plac-1.4.5-py2.py3-none-any.whl", hash = "sha256:87187786b4e446688b1cf5112e18fed8a23ab3b316c25fe91266a10bd1736b16", size = 22468, upload-time = "2025-04-04T14:03:24.761Z" },
|
| 1142 |
+
]
|
| 1143 |
+
|
| 1144 |
+
[[package]]
|
| 1145 |
+
name = "platformdirs"
|
| 1146 |
+
version = "4.3.8"
|
| 1147 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1148 |
+
sdist = { url = "https://files.pythonhosted.org/packages/fe/8b/3c73abc9c759ecd3f1f7ceff6685840859e8070c4d947c93fae71f6a0bf2/platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc", size = 21362, upload-time = "2025-05-07T22:47:42.121Z" }
|
| 1149 |
+
wheels = [
|
| 1150 |
+
{ url = "https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4", size = 18567, upload-time = "2025-05-07T22:47:40.376Z" },
|
| 1151 |
+
]
|
| 1152 |
+
|
| 1153 |
[[package]]
|
| 1154 |
name = "plotly"
|
| 1155 |
version = "6.2.0"
|
|
|
|
| 1215 |
{ url = "https://files.pythonhosted.org/packages/f7/af/ab3c51ab7507a7325e98ffe691d9495ee3d3aa5f589afad65ec920d39821/protobuf-6.31.1-py3-none-any.whl", hash = "sha256:720a6c7e6b77288b85063569baae8536671b39f15cc22037ec7045658d80489e", size = 168724, upload-time = "2025-05-28T19:25:53.926Z" },
|
| 1216 |
]
|
| 1217 |
|
| 1218 |
+
[[package]]
|
| 1219 |
+
name = "psutil"
|
| 1220 |
+
version = "7.0.0"
|
| 1221 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1222 |
+
sdist = { url = "https://files.pythonhosted.org/packages/2a/80/336820c1ad9286a4ded7e845b2eccfcb27851ab8ac6abece774a6ff4d3de/psutil-7.0.0.tar.gz", hash = "sha256:7be9c3eba38beccb6495ea33afd982a44074b78f28c434a1f51cc07fd315c456", size = 497003, upload-time = "2025-02-13T21:54:07.946Z" }
|
| 1223 |
+
wheels = [
|
| 1224 |
+
{ url = "https://files.pythonhosted.org/packages/ed/e6/2d26234410f8b8abdbf891c9da62bee396583f713fb9f3325a4760875d22/psutil-7.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:101d71dc322e3cffd7cea0650b09b3d08b8e7c4109dd6809fe452dfd00e58b25", size = 238051, upload-time = "2025-02-13T21:54:12.36Z" },
|
| 1225 |
+
{ url = "https://files.pythonhosted.org/packages/04/8b/30f930733afe425e3cbfc0e1468a30a18942350c1a8816acfade80c005c4/psutil-7.0.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:39db632f6bb862eeccf56660871433e111b6ea58f2caea825571951d4b6aa3da", size = 239535, upload-time = "2025-02-13T21:54:16.07Z" },
|
| 1226 |
+
{ url = "https://files.pythonhosted.org/packages/2a/ed/d362e84620dd22876b55389248e522338ed1bf134a5edd3b8231d7207f6d/psutil-7.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fcee592b4c6f146991ca55919ea3d1f8926497a713ed7faaf8225e174581e91", size = 275004, upload-time = "2025-02-13T21:54:18.662Z" },
|
| 1227 |
+
{ url = "https://files.pythonhosted.org/packages/bf/b9/b0eb3f3cbcb734d930fdf839431606844a825b23eaf9a6ab371edac8162c/psutil-7.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b1388a4f6875d7e2aff5c4ca1cc16c545ed41dd8bb596cefea80111db353a34", size = 277986, upload-time = "2025-02-13T21:54:21.811Z" },
|
| 1228 |
+
{ url = "https://files.pythonhosted.org/packages/eb/a2/709e0fe2f093556c17fbafda93ac032257242cabcc7ff3369e2cb76a97aa/psutil-7.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5f098451abc2828f7dc6b58d44b532b22f2088f4999a937557b603ce72b1993", size = 279544, upload-time = "2025-02-13T21:54:24.68Z" },
|
| 1229 |
+
{ url = "https://files.pythonhosted.org/packages/50/e6/eecf58810b9d12e6427369784efe814a1eec0f492084ce8eb8f4d89d6d61/psutil-7.0.0-cp37-abi3-win32.whl", hash = "sha256:ba3fcef7523064a6c9da440fc4d6bd07da93ac726b5733c29027d7dc95b39d99", size = 241053, upload-time = "2025-02-13T21:54:34.31Z" },
|
| 1230 |
+
{ url = "https://files.pythonhosted.org/packages/50/1b/6921afe68c74868b4c9fa424dad3be35b095e16687989ebbb50ce4fceb7c/psutil-7.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:4cf3d4eb1aa9b348dec30105c55cd9b7d4629285735a102beb4441e38db90553", size = 244885, upload-time = "2025-02-13T21:54:37.486Z" },
|
| 1231 |
+
]
|
| 1232 |
+
|
| 1233 |
+
[[package]]
|
| 1234 |
+
name = "ptyprocess"
|
| 1235 |
+
version = "0.7.0"
|
| 1236 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1237 |
+
sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762, upload-time = "2020-12-28T15:15:30.155Z" }
|
| 1238 |
+
wheels = [
|
| 1239 |
+
{ url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993, upload-time = "2020-12-28T15:15:28.35Z" },
|
| 1240 |
+
]
|
| 1241 |
+
|
| 1242 |
+
[[package]]
|
| 1243 |
+
name = "pure-eval"
|
| 1244 |
+
version = "0.2.3"
|
| 1245 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1246 |
+
sdist = { url = "https://files.pythonhosted.org/packages/cd/05/0a34433a064256a578f1783a10da6df098ceaa4a57bbeaa96a6c0352786b/pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42", size = 19752, upload-time = "2024-07-21T12:58:21.801Z" }
|
| 1247 |
+
wheels = [
|
| 1248 |
+
{ url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" },
|
| 1249 |
+
]
|
| 1250 |
+
|
| 1251 |
[[package]]
|
| 1252 |
name = "pyarrow"
|
| 1253 |
version = "21.0.0"
|
|
|
|
| 1277 |
{ url = "https://files.pythonhosted.org/packages/e5/4e/519c1bc1876625fe6b71e9a28287c43ec2f20f73c658b9ae1d485c0c206e/pyarrow-21.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:222c39e2c70113543982c6b34f3077962b44fca38c0bd9e68bb6781534425c10", size = 26371006, upload-time = "2025-07-18T00:56:56.379Z" },
|
| 1278 |
]
|
| 1279 |
|
| 1280 |
+
[[package]]
|
| 1281 |
+
name = "pycparser"
|
| 1282 |
+
version = "2.22"
|
| 1283 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1284 |
+
sdist = { url = "https://files.pythonhosted.org/packages/1d/b2/31537cf4b1ca988837256c910a668b553fceb8f069bedc4b1c826024b52c/pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6", size = 172736, upload-time = "2024-03-30T13:22:22.564Z" }
|
| 1285 |
+
wheels = [
|
| 1286 |
+
{ url = "https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc", size = 117552, upload-time = "2024-03-30T13:22:20.476Z" },
|
| 1287 |
+
]
|
| 1288 |
+
|
| 1289 |
[[package]]
|
| 1290 |
name = "pydantic"
|
| 1291 |
version = "2.11.7"
|
|
|
|
| 1386 |
{ url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" },
|
| 1387 |
]
|
| 1388 |
|
| 1389 |
+
[[package]]
|
| 1390 |
+
name = "pywin32"
|
| 1391 |
+
version = "311"
|
| 1392 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1393 |
+
wheels = [
|
| 1394 |
+
{ url = "https://files.pythonhosted.org/packages/e7/ab/01ea1943d4eba0f850c3c61e78e8dd59757ff815ff3ccd0a84de5f541f42/pywin32-311-cp312-cp312-win32.whl", hash = "sha256:750ec6e621af2b948540032557b10a2d43b0cee2ae9758c54154d711cc852d31", size = 8706543, upload-time = "2025-07-14T20:13:20.765Z" },
|
| 1395 |
+
{ url = "https://files.pythonhosted.org/packages/d1/a8/a0e8d07d4d051ec7502cd58b291ec98dcc0c3fff027caad0470b72cfcc2f/pywin32-311-cp312-cp312-win_amd64.whl", hash = "sha256:b8c095edad5c211ff31c05223658e71bf7116daa0ecf3ad85f3201ea3190d067", size = 9495040, upload-time = "2025-07-14T20:13:22.543Z" },
|
| 1396 |
+
{ url = "https://files.pythonhosted.org/packages/ba/3a/2ae996277b4b50f17d61f0603efd8253cb2d79cc7ae159468007b586396d/pywin32-311-cp312-cp312-win_arm64.whl", hash = "sha256:e286f46a9a39c4a18b319c28f59b61de793654af2f395c102b4f819e584b5852", size = 8710102, upload-time = "2025-07-14T20:13:24.682Z" },
|
| 1397 |
+
{ url = "https://files.pythonhosted.org/packages/a5/be/3fd5de0979fcb3994bfee0d65ed8ca9506a8a1260651b86174f6a86f52b3/pywin32-311-cp313-cp313-win32.whl", hash = "sha256:f95ba5a847cba10dd8c4d8fefa9f2a6cf283b8b88ed6178fa8a6c1ab16054d0d", size = 8705700, upload-time = "2025-07-14T20:13:26.471Z" },
|
| 1398 |
+
{ url = "https://files.pythonhosted.org/packages/e3/28/e0a1909523c6890208295a29e05c2adb2126364e289826c0a8bc7297bd5c/pywin32-311-cp313-cp313-win_amd64.whl", hash = "sha256:718a38f7e5b058e76aee1c56ddd06908116d35147e133427e59a3983f703a20d", size = 9494700, upload-time = "2025-07-14T20:13:28.243Z" },
|
| 1399 |
+
{ url = "https://files.pythonhosted.org/packages/04/bf/90339ac0f55726dce7d794e6d79a18a91265bdf3aa70b6b9ca52f35e022a/pywin32-311-cp313-cp313-win_arm64.whl", hash = "sha256:7b4075d959648406202d92a2310cb990fea19b535c7f4a78d3f5e10b926eeb8a", size = 8709318, upload-time = "2025-07-14T20:13:30.348Z" },
|
| 1400 |
+
{ url = "https://files.pythonhosted.org/packages/c9/31/097f2e132c4f16d99a22bfb777e0fd88bd8e1c634304e102f313af69ace5/pywin32-311-cp314-cp314-win32.whl", hash = "sha256:b7a2c10b93f8986666d0c803ee19b5990885872a7de910fc460f9b0c2fbf92ee", size = 8840714, upload-time = "2025-07-14T20:13:32.449Z" },
|
| 1401 |
+
{ url = "https://files.pythonhosted.org/packages/90/4b/07c77d8ba0e01349358082713400435347df8426208171ce297da32c313d/pywin32-311-cp314-cp314-win_amd64.whl", hash = "sha256:3aca44c046bd2ed8c90de9cb8427f581c479e594e99b5c0bb19b29c10fd6cb87", size = 9656800, upload-time = "2025-07-14T20:13:34.312Z" },
|
| 1402 |
+
{ url = "https://files.pythonhosted.org/packages/c0/d2/21af5c535501a7233e734b8af901574572da66fcc254cb35d0609c9080dd/pywin32-311-cp314-cp314-win_arm64.whl", hash = "sha256:a508e2d9025764a8270f93111a970e1d0fbfc33f4153b388bb649b7eec4f9b42", size = 8932540, upload-time = "2025-07-14T20:13:36.379Z" },
|
| 1403 |
+
]
|
| 1404 |
+
|
| 1405 |
[[package]]
|
| 1406 |
name = "pyyaml"
|
| 1407 |
version = "6.0.2"
|
|
|
|
| 1428 |
{ url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446, upload-time = "2024-08-06T20:33:04.33Z" },
|
| 1429 |
]
|
| 1430 |
|
| 1431 |
+
[[package]]
|
| 1432 |
+
name = "pyzmq"
|
| 1433 |
+
version = "27.0.0"
|
| 1434 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1435 |
+
dependencies = [
|
| 1436 |
+
{ name = "cffi", marker = "implementation_name == 'pypy'" },
|
| 1437 |
+
]
|
| 1438 |
+
sdist = { url = "https://files.pythonhosted.org/packages/f1/06/50a4e9648b3e8b992bef8eb632e457307553a89d294103213cfd47b3da69/pyzmq-27.0.0.tar.gz", hash = "sha256:b1f08eeb9ce1510e6939b6e5dcd46a17765e2333daae78ecf4606808442e52cf", size = 280478, upload-time = "2025-06-13T14:09:07.087Z" }
|
| 1439 |
+
wheels = [
|
| 1440 |
+
{ url = "https://files.pythonhosted.org/packages/93/a7/9ad68f55b8834ede477842214feba6a4c786d936c022a67625497aacf61d/pyzmq-27.0.0-cp312-abi3-macosx_10_15_universal2.whl", hash = "sha256:cbabc59dcfaac66655c040dfcb8118f133fb5dde185e5fc152628354c1598e52", size = 1305438, upload-time = "2025-06-13T14:07:31.676Z" },
|
| 1441 |
+
{ url = "https://files.pythonhosted.org/packages/ba/ee/26aa0f98665a22bc90ebe12dced1de5f3eaca05363b717f6fb229b3421b3/pyzmq-27.0.0-cp312-abi3-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:cb0ac5179cba4b2f94f1aa208fbb77b62c4c9bf24dd446278b8b602cf85fcda3", size = 895095, upload-time = "2025-06-13T14:07:33.104Z" },
|
| 1442 |
+
{ url = "https://files.pythonhosted.org/packages/cf/85/c57e7ab216ecd8aa4cc7e3b83b06cc4e9cf45c87b0afc095f10cd5ce87c1/pyzmq-27.0.0-cp312-abi3-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:53a48f0228eab6cbf69fde3aa3c03cbe04e50e623ef92ae395fce47ef8a76152", size = 651826, upload-time = "2025-06-13T14:07:34.831Z" },
|
| 1443 |
+
{ url = "https://files.pythonhosted.org/packages/69/9a/9ea7e230feda9400fb0ae0d61d7d6ddda635e718d941c44eeab22a179d34/pyzmq-27.0.0-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:111db5f395e09f7e775f759d598f43cb815fc58e0147623c4816486e1a39dc22", size = 839750, upload-time = "2025-06-13T14:07:36.553Z" },
|
| 1444 |
+
{ url = "https://files.pythonhosted.org/packages/08/66/4cebfbe71f3dfbd417011daca267539f62ed0fbc68105357b68bbb1a25b7/pyzmq-27.0.0-cp312-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:c8878011653dcdc27cc2c57e04ff96f0471e797f5c19ac3d7813a245bcb24371", size = 1641357, upload-time = "2025-06-13T14:07:38.21Z" },
|
| 1445 |
+
{ url = "https://files.pythonhosted.org/packages/ac/f6/b0f62578c08d2471c791287149cb8c2aaea414ae98c6e995c7dbe008adfb/pyzmq-27.0.0-cp312-abi3-musllinux_1_2_i686.whl", hash = "sha256:c0ed2c1f335ba55b5fdc964622254917d6b782311c50e138863eda409fbb3b6d", size = 2020281, upload-time = "2025-06-13T14:07:39.599Z" },
|
| 1446 |
+
{ url = "https://files.pythonhosted.org/packages/37/b9/4f670b15c7498495da9159edc374ec09c88a86d9cd5a47d892f69df23450/pyzmq-27.0.0-cp312-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:e918d70862d4cfd4b1c187310015646a14e1f5917922ab45b29f28f345eeb6be", size = 1877110, upload-time = "2025-06-13T14:07:41.027Z" },
|
| 1447 |
+
{ url = "https://files.pythonhosted.org/packages/66/31/9dee25c226295b740609f0d46db2fe972b23b6f5cf786360980524a3ba92/pyzmq-27.0.0-cp312-abi3-win32.whl", hash = "sha256:88b4e43cab04c3c0f0d55df3b1eef62df2b629a1a369b5289a58f6fa8b07c4f4", size = 559297, upload-time = "2025-06-13T14:07:42.533Z" },
|
| 1448 |
+
{ url = "https://files.pythonhosted.org/packages/9b/12/52da5509800f7ff2d287b2f2b4e636e7ea0f001181cba6964ff6c1537778/pyzmq-27.0.0-cp312-abi3-win_amd64.whl", hash = "sha256:dce4199bf5f648a902ce37e7b3afa286f305cd2ef7a8b6ec907470ccb6c8b371", size = 619203, upload-time = "2025-06-13T14:07:43.843Z" },
|
| 1449 |
+
{ url = "https://files.pythonhosted.org/packages/93/6d/7f2e53b19d1edb1eb4f09ec7c3a1f945ca0aac272099eab757d15699202b/pyzmq-27.0.0-cp312-abi3-win_arm64.whl", hash = "sha256:56e46bbb85d52c1072b3f809cc1ce77251d560bc036d3a312b96db1afe76db2e", size = 551927, upload-time = "2025-06-13T14:07:45.51Z" },
|
| 1450 |
+
{ url = "https://files.pythonhosted.org/packages/19/62/876b27c4ff777db4ceba1c69ea90d3c825bb4f8d5e7cd987ce5802e33c55/pyzmq-27.0.0-cp313-cp313t-macosx_10_15_universal2.whl", hash = "sha256:c36ad534c0c29b4afa088dc53543c525b23c0797e01b69fef59b1a9c0e38b688", size = 1340826, upload-time = "2025-06-13T14:07:46.881Z" },
|
| 1451 |
+
{ url = "https://files.pythonhosted.org/packages/43/69/58ef8f4f59d3bcd505260c73bee87b008850f45edca40ddaba54273c35f4/pyzmq-27.0.0-cp313-cp313t-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:67855c14173aec36395d7777aaba3cc527b393821f30143fd20b98e1ff31fd38", size = 897283, upload-time = "2025-06-13T14:07:49.562Z" },
|
| 1452 |
+
{ url = "https://files.pythonhosted.org/packages/43/15/93a0d0396700a60475ad3c5d42c5f1c308d3570bc94626b86c71ef9953e0/pyzmq-27.0.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8617c7d43cd8ccdb62aebe984bfed77ca8f036e6c3e46dd3dddda64b10f0ab7a", size = 660567, upload-time = "2025-06-13T14:07:51.364Z" },
|
| 1453 |
+
{ url = "https://files.pythonhosted.org/packages/0e/b3/fe055513e498ca32f64509abae19b9c9eb4d7c829e02bd8997dd51b029eb/pyzmq-27.0.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:67bfbcbd0a04c575e8103a6061d03e393d9f80ffdb9beb3189261e9e9bc5d5e9", size = 847681, upload-time = "2025-06-13T14:07:52.77Z" },
|
| 1454 |
+
{ url = "https://files.pythonhosted.org/packages/b6/4f/ff15300b00b5b602191f3df06bbc8dd4164e805fdd65bb77ffbb9c5facdc/pyzmq-27.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:5cd11d46d7b7e5958121b3eaf4cd8638eff3a720ec527692132f05a57f14341d", size = 1650148, upload-time = "2025-06-13T14:07:54.178Z" },
|
| 1455 |
+
{ url = "https://files.pythonhosted.org/packages/c4/6f/84bdfff2a224a6f26a24249a342e5906993c50b0761e311e81b39aef52a7/pyzmq-27.0.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:b801c2e40c5aa6072c2f4876de8dccd100af6d9918d4d0d7aa54a1d982fd4f44", size = 2023768, upload-time = "2025-06-13T14:07:55.714Z" },
|
| 1456 |
+
{ url = "https://files.pythonhosted.org/packages/64/39/dc2db178c26a42228c5ac94a9cc595030458aa64c8d796a7727947afbf55/pyzmq-27.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:20d5cb29e8c5f76a127c75b6e7a77e846bc4b655c373baa098c26a61b7ecd0ef", size = 1885199, upload-time = "2025-06-13T14:07:57.166Z" },
|
| 1457 |
+
{ url = "https://files.pythonhosted.org/packages/c7/21/dae7b06a1f8cdee5d8e7a63d99c5d129c401acc40410bef2cbf42025e26f/pyzmq-27.0.0-cp313-cp313t-win32.whl", hash = "sha256:a20528da85c7ac7a19b7384e8c3f8fa707841fd85afc4ed56eda59d93e3d98ad", size = 575439, upload-time = "2025-06-13T14:07:58.959Z" },
|
| 1458 |
+
{ url = "https://files.pythonhosted.org/packages/eb/bc/1709dc55f0970cf4cb8259e435e6773f9946f41a045c2cb90e870b7072da/pyzmq-27.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:d8229f2efece6a660ee211d74d91dbc2a76b95544d46c74c615e491900dc107f", size = 639933, upload-time = "2025-06-13T14:08:00.777Z" },
|
| 1459 |
+
]
|
| 1460 |
+
|
| 1461 |
[[package]]
|
| 1462 |
name = "referencing"
|
| 1463 |
version = "0.36.2"
|
|
|
|
| 1675 |
version = "0.1.0"
|
| 1676 |
source = { virtual = "." }
|
| 1677 |
dependencies = [
|
| 1678 |
+
{ name = "chardet" },
|
| 1679 |
{ name = "en-core-web-md" },
|
| 1680 |
{ name = "en-core-web-trf" },
|
| 1681 |
+
{ name = "fugashi" },
|
| 1682 |
{ name = "huggingface-hub", extra = ["cli"] },
|
| 1683 |
+
{ name = "ipykernel" },
|
| 1684 |
{ name = "ja-core-news-md" },
|
| 1685 |
{ name = "ja-core-news-trf" },
|
| 1686 |
{ name = "numpy" },
|
|
|
|
| 1691 |
{ name = "spacy" },
|
| 1692 |
{ name = "spacy-curated-transformers" },
|
| 1693 |
{ name = "streamlit" },
|
| 1694 |
+
{ name = "unidic" },
|
| 1695 |
]
|
| 1696 |
|
| 1697 |
[package.metadata]
|
| 1698 |
requires-dist = [
|
| 1699 |
+
{ name = "chardet", specifier = ">=5.2.0" },
|
| 1700 |
{ name = "en-core-web-md", url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.0/en_core_web_md-3.7.0-py3-none-any.whl" },
|
| 1701 |
{ name = "en-core-web-trf", url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.2/en_core_web_trf-3.7.2-py3-none-any.whl" },
|
| 1702 |
+
{ name = "fugashi", specifier = ">=1.3.0" },
|
| 1703 |
{ name = "huggingface-hub", extras = ["cli"], specifier = ">=0.33.4" },
|
| 1704 |
+
{ name = "ipykernel", specifier = ">=6.29.5" },
|
| 1705 |
{ name = "ja-core-news-md", url = "https://github.com/explosion/spacy-models/releases/download/ja_core_news_md-3.7.0/ja_core_news_md-3.7.0-py3-none-any.whl" },
|
| 1706 |
{ name = "ja-core-news-trf", url = "https://github.com/explosion/spacy-models/releases/download/ja_core_news_trf-3.7.2/ja_core_news_trf-3.7.2-py3-none-any.whl" },
|
| 1707 |
{ name = "numpy", specifier = ">=1.24.0,<2.0" },
|
|
|
|
| 1712 |
{ name = "spacy", specifier = ">=3.7.0" },
|
| 1713 |
{ name = "spacy-curated-transformers", specifier = ">=0.1.0,<0.3.0" },
|
| 1714 |
{ name = "streamlit", specifier = ">=1.28.0" },
|
| 1715 |
+
{ name = "unidic", specifier = ">=1.1.0" },
|
| 1716 |
]
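
fugashi (a MeCab wrapper) and unidic (which fetches the full UniDic dictionary) are the new Japanese tokenization dependencies, added alongside chardet for encoding detection and ipykernel for notebook support. A minimal usage sketch, not part of this commit, showing how the two packages are typically combined (the dictionary must first be downloaded once with "python -m unidic download"):

import fugashi
import unidic

# Point the MeCab-based tagger at the downloaded UniDic dictionary
tagger = fugashi.Tagger(f'-d "{unidic.DICDIR}"')

for word in tagger("国立国語研究所のコーパスを分析する"):
    # surface form, lemma, and coarse part of speech from the UniDic features
    print(word.surface, word.feature.lemma, word.feature.pos1)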
|
| 1717 |
|
| 1718 |
[[package]]
|
|
|
|
| 1837 |
{ url = "https://files.pythonhosted.org/packages/3a/e2/745aeba88a8513017fbac2fd2f9f07b8a36065e51695f818541eb795ec0c/srsly-2.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:e73712be1634b5e1de6f81c273a7d47fe091ad3c79dc779c03d3416a5c117cee", size = 630634, upload-time = "2025-01-17T09:26:10.018Z" },
|
| 1838 |
]
|
| 1839 |
|
| 1840 |
+
[[package]]
|
| 1841 |
+
name = "stack-data"
|
| 1842 |
+
version = "0.6.3"
|
| 1843 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1844 |
+
dependencies = [
|
| 1845 |
+
{ name = "asttokens" },
|
| 1846 |
+
{ name = "executing" },
|
| 1847 |
+
{ name = "pure-eval" },
|
| 1848 |
+
]
|
| 1849 |
+
sdist = { url = "https://files.pythonhosted.org/packages/28/e3/55dcc2cfbc3ca9c29519eb6884dd1415ecb53b0e934862d3559ddcb7e20b/stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", size = 44707, upload-time = "2023-09-30T13:58:05.479Z" }
|
| 1850 |
+
wheels = [
|
| 1851 |
+
{ url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521, upload-time = "2023-09-30T13:58:03.53Z" },
|
| 1852 |
+
]
|
| 1853 |
+
|
| 1854 |
[[package]]
|
| 1855 |
name = "streamlit"
|
| 1856 |
version = "1.47.0"
|
|
|
|
| 2039 |
{ url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
|
| 2040 |
]
|
| 2041 |
|
| 2042 |
+
[[package]]
|
| 2043 |
+
name = "traitlets"
|
| 2044 |
+
version = "5.14.3"
|
| 2045 |
+
source = { registry = "https://pypi.org/simple" }
|
| 2046 |
+
sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", size = 161621, upload-time = "2024-04-19T11:11:49.746Z" }
|
| 2047 |
+
wheels = [
|
| 2048 |
+
{ url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" },
|
| 2049 |
+
]
|
| 2050 |
+
|
| 2051 |
[[package]]
|
| 2052 |
name = "triton"
|
| 2053 |
version = "3.3.1"
|
|
|
|
| 2106 |
{ url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
|
| 2107 |
]
|
| 2108 |
|
| 2109 |
+
[[package]]
|
| 2110 |
+
name = "unidic"
|
| 2111 |
+
version = "1.1.0"
|
| 2112 |
+
source = { registry = "https://pypi.org/simple" }
|
| 2113 |
+
dependencies = [
|
| 2114 |
+
{ name = "plac" },
|
| 2115 |
+
{ name = "requests" },
|
| 2116 |
+
{ name = "tqdm" },
|
| 2117 |
+
{ name = "wasabi" },
|
| 2118 |
+
]
|
| 2119 |
+
sdist = { url = "https://files.pythonhosted.org/packages/5a/09/271dfbf8d5b56adddc70e30fa94249f5d3ab35f615bf278d65258045564a/unidic-1.1.0.tar.gz", hash = "sha256:0ab91c05de342c84d2a6314901fd3afb9061ecd7534dd4a0431dccbb87d921b7", size = 7688, upload-time = "2021-10-10T08:56:44.301Z" }
|
| 2120 |
+
|
| 2121 |
[[package]]
|
| 2122 |
name = "urllib3"
|
| 2123 |
version = "2.5.0"
|
|
|
|
| 2129 |
|
| 2130 |
[[package]]
|
| 2131 |
name = "wasabi"
|
| 2132 |
+
version = "0.10.1"
|
| 2133 |
source = { registry = "https://pypi.org/simple" }
|
| 2134 |
+
sdist = { url = "https://files.pythonhosted.org/packages/69/41/0c31737ee1a29c8b829690ebb4ab988b1f489aa2c3efa115a732a9dd7997/wasabi-0.10.1.tar.gz", hash = "sha256:c8e372781be19272942382b14d99314d175518d7822057cb7a97010c4259d249", size = 28380, upload-time = "2022-07-28T08:17:54.968Z" }
|
| 2135 |
wheels = [
|
| 2136 |
+
{ url = "https://files.pythonhosted.org/packages/34/74/bd566f876c2de097e75d525c2696fb9829009987a0d93a4fb3576778a0a8/wasabi-0.10.1-py3-none-any.whl", hash = "sha256:fe862cc24034fbc9f04717cd312ab884f71f51a8ecabebc3449b751c2a649d83", size = 26075, upload-time = "2022-07-28T08:17:53.504Z" },
|
| 2137 |
]
|
| 2138 |
|
| 2139 |
[[package]]
|
web_app/__pycache__/analysis_handlers.cpython-312.pyc DELETED (binary file, 17.9 kB)
web_app/__pycache__/app.cpython-312.pyc DELETED (binary file, 4.4 kB)
web_app/__pycache__/comparison_functions.cpython-312.pyc DELETED (binary file, 13.2 kB)
web_app/__pycache__/config_manager.cpython-312.pyc DELETED (binary file, 9.89 kB)
web_app/__pycache__/pos_handlers.cpython-312.pyc DELETED (binary file, 7.49 kB)
web_app/__pycache__/reference_manager.cpython-312.pyc DELETED (binary file, 10.6 kB)
web_app/__pycache__/session_manager.cpython-312.pyc DELETED (binary file, 6.81 kB)
web_app/__pycache__/ui_components.cpython-312.pyc DELETED (binary file, 11.8 kB)

web_app/app.py CHANGED

@@ -19,6 +19,7 @@ from web_app.components.ui_components import UIComponents
from web_app.handlers.analysis_handlers import AnalysisHandlers
from web_app.reference_manager import ReferenceManager
from web_app.handlers.pos_handlers import POSHandlers
+from web_app.handlers.frequency_handlers import FrequencyHandlers

# Configure Streamlit page
st.set_page_config(
@@ -32,7 +33,7 @@ st.set_page_config(
def main():
    """Main application entry point."""
    st.title("📊 Linguistic Data Analysis I - Text Analysis Tools")
-    st.markdown("*Educational tools for lexical sophistication analysis
+    st.markdown("*Educational tools for lexical sophistication analysis, POS/dependency parsing, and word frequency visualization*")

    # Initialize session state
    SessionManager.initialize_session_state()
@@ -46,8 +47,10 @@ def main():
    # Route to appropriate interface
    if tool_choice == 'Lexical Sophistication':
        render_lexical_sophistication_interface()
-
+    elif tool_choice == 'POS Parser':
        render_pos_parser_interface()
+    else:  # Frequency Analysis
+        render_frequency_analysis_interface()


def render_sidebar():
@@ -117,5 +120,14 @@ def render_pos_parser_interface():
    POSHandlers.handle_batch_pos_analysis(parser)


+def render_frequency_analysis_interface():
+    """Render frequency analysis interface."""
+    st.header("📊 Word Frequency Analysis")
+    st.markdown("Analyze and visualize word frequency distributions from TSV data files.")
+
+    # Handle frequency analysis
+    FrequencyHandlers.handle_frequency_analysis()
+
+
if __name__ == "__main__":
-    main()
+    main()
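
The new Frequency Analysis option is handed off to FrequencyHandlers.handle_frequency_analysis() in web_app/handlers/frequency_handlers.py, which is not reproduced in this view. As a rough, hypothetical sketch of the kind of flow such a handler implements (all names, widgets, and column assumptions below are illustrative, not the actual module):

import pandas as pd
import streamlit as st


class FrequencyHandlers:
    @staticmethod
    def handle_frequency_analysis():
        """Hypothetical: upload a TSV frequency list and chart the top entries."""
        uploaded = st.file_uploader("Upload a frequency TSV", type=["tsv", "txt"])
        if uploaded is None:
            return
        df = pd.read_csv(uploaded, sep="\t")
        # Assumption: first column holds the word, second its raw frequency
        word_col, freq_col = df.columns[0], df.columns[1]
        top_n = st.slider("Words to display", 10, 100, 25)
        top = df.nlargest(top_n, freq_col).set_index(word_col)
        st.bar_chart(top[freq_col])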
web_app/components/__pycache__/__init__.cpython-312.pyc DELETED (binary file, 244 Bytes)
web_app/components/__pycache__/comparison_functions.cpython-312.pyc DELETED (binary file, 13.2 kB)
web_app/components/__pycache__/ui_components.cpython-312.pyc DELETED (binary file, 11.9 kB)

web_app/components/comparison_functions.py CHANGED

@@ -260,12 +260,13 @@ def display_token_comparison(results_a, results_b):
            'Token': token.get('token', ''),
            'Lemma': token.get('lemma', ''),
            'POS': token.get('pos', ''),
+            "TAG": token.get('tag', ''),
            'Type': token.get('word_type', '')
        }

        # Add scores for each measure (skip basic fields)
        for key, value in token.items():
-            if key not in ['id', 'token', 'lemma', 'pos', 'word_type']:
+            if key not in ['id', 'token', 'lemma', 'pos', 'tag', 'word_type']:
                row[key] = value if value != 'NA' else 'N/A'

        token_data.append(row)
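
The new TAG column presumably surfaces spaCy's fine-grained token.tag_ next to the coarse Universal Dependencies token.pos_; for the bundled ja_core_news_* models the fine-grained tag carries the UniDic-style part-of-speech string, which is what makes the extra column useful for Japanese. A small illustrative sketch (assumed, not taken from this repository) of where those two values come from when the token dictionaries are built:

import spacy

nlp = spacy.load("ja_core_news_md")
doc = nlp("言語データを分析します")
tokens = [
    {
        "token": t.text,
        "lemma": t.lemma_,
        "pos": t.pos_,   # coarse UD tag, e.g. NOUN
        "tag": t.tag_,   # fine-grained tag; UniDic-style POS string for Japanese models
        # word_type heuristic is part of this assumed sketch only
        "word_type": "content" if t.pos_ in {"NOUN", "VERB", "ADJ", "ADV"} else "function",
    }
    for t in doc
]
print(tokens)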
web_app/components/ui_components.py
CHANGED

@@ -121,7 +121,7 @@ class UIComponents:
         st.subheader("Analysis Tools")
         return st.radio(
             "Select Tool",
-            options=['Lexical Sophistication', 'POS Parser'],
+            options=['Lexical Sophistication', 'POS Parser', 'Frequency Analysis'],
             key='tool_choice'
         )

@@ -229,4 +229,4 @@ class UIComponents:
             st.write(f"- {error}")

         if success_count == 0:
-            st.error("No valid configurations found")
+            st.error("No valid configurations found")
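Together with the `web_app/app.py` changes above, this new radio option is what drives the routing: `st.radio` returns the chosen tool and, because `key='tool_choice'` is set, mirrors the selection into `st.session_state`, which `main()` then dispatches on. A minimal sketch of the same pattern, using stand-alone placeholder render functions rather than the app's real interfaces:

```python
import streamlit as st

def render_frequency_analysis_interface():
    # Placeholder; the real app delegates to FrequencyHandlers.handle_frequency_analysis().
    st.header("📊 Word Frequency Analysis")

# key='tool_choice' also mirrors the selection into st.session_state['tool_choice'].
tool_choice = st.radio(
    "Select Tool",
    options=['Lexical Sophistication', 'POS Parser', 'Frequency Analysis'],
    key='tool_choice',
)

if tool_choice == 'Frequency Analysis':
    render_frequency_analysis_interface()
else:
    st.write(f"{tool_choice} interface would render here.")
```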
web_app/config_manager.py
CHANGED

@@ -147,6 +147,9 @@ class ConfigManager:
         """Load actual data for a reference list based on its configuration."""
         data = {}

+        # Check if this is a Japanese corpus
+        is_japanese_corpus = list_config.get('japanese_corpus', False)
+
         # Check if this is a bigram or trigram configuration
         columns = list_config.get('columns', {})
         is_bigram = 'bigram' in columns

@@ -173,8 +176,12 @@
         # Get column mapping
         columns = list_config.get('columns', {})

-        if file_type in ['token', 'lemma'] and not is_bigram and not is_trigram:
-            # For standard unigrams
+        if is_japanese_corpus and file_type in ['token', 'lemma']:
+            # Handle Japanese corpus format with composite keys
+            processed_data = ConfigManager._parse_japanese_corpus_data(df, columns)
+            data[file_type] = processed_data
+        elif file_type in ['token', 'lemma'] and not is_bigram and not is_trigram:
+            # For standard unigrams
             word_col = columns.get('word', 0)
             score_col = columns.get('frequency', 1)

@@ -208,9 +215,109 @@

         return data

+    @staticmethod
+    def _parse_japanese_corpus_data(df: pd.DataFrame, columns: Dict[str, int]) -> Dict[str, Any]:
+        """Parse Japanese corpus data and create multiple lookup dictionaries with hierarchical POS splitting."""
+        try:
+            # Get column indices
+            surface_col_idx = columns.get('surface_form', 1)
+            lemma_col_idx = columns.get('lemma', 2)
+            pos_col_idx = columns.get('pos', 3)
+            freq_col_idx = columns.get('frequency', 6)
+
+            # Get actual column names
+            df_columns = list(df.columns)
+            surface_col = df_columns[surface_col_idx] if surface_col_idx < len(df_columns) else None
+            lemma_col = df_columns[lemma_col_idx] if lemma_col_idx < len(df_columns) else None
+            pos_col = df_columns[pos_col_idx] if pos_col_idx < len(df_columns) else None
+            freq_col = df_columns[freq_col_idx] if freq_col_idx < len(df_columns) else None
+
+            if not all([surface_col, lemma_col, pos_col, freq_col]):
+                raise ValueError("Missing required columns for Japanese corpus")
+
+            # Clean the data
+            df_clean = df.copy()
+
+            # Clean text columns
+            for col in [surface_col, lemma_col, pos_col]:
+                df_clean[col] = df_clean[col].astype(str).str.strip()
+                df_clean = df_clean[df_clean[col] != '']
+                df_clean = df_clean[df_clean[col] != 'nan']
+
+            # Clean and convert frequency column
+            df_clean[freq_col] = pd.to_numeric(df_clean[freq_col], errors='coerce')
+            df_clean = df_clean.dropna(subset=[freq_col])
+            df_clean = df_clean[df_clean[freq_col] > 0]  # Only positive frequencies
+
+            # Split POS column by hyphen to extract pos1, pos2, pos3
+            def split_pos(pos_str):
+                parts = str(pos_str).split('-')
+                return {
+                    'pos1': parts[0] if len(parts) > 0 else '',
+                    'pos2': parts[1] if len(parts) > 1 else '',
+                    'pos3': parts[2] if len(parts) > 2 else ''
+                }
+
+            pos_split = df_clean[pos_col].apply(split_pos)
+            df_clean['pos1'] = [p['pos1'] for p in pos_split]
+            df_clean['pos2'] = [p['pos2'] for p in pos_split]
+            df_clean['pos3'] = [p['pos3'] for p in pos_split]
+
+            # Create multiple levels of composite keys to match UniDic lookup hierarchy
+            # Level 1: lemma_lForm_pos1_pos2_pos3 (when pos3 exists)
+            df_clean['level1_key'] = df_clean.apply(
+                lambda row: f"{row[lemma_col]}_{row[surface_col]}_{row['pos1']}_{row['pos2']}_{row['pos3']}"
+                if row['pos3'] else None, axis=1
+            )
+
+            # Level 2: lemma_lForm_pos1_pos2
+            df_clean['level2_key'] = df_clean.apply(
+                lambda row: f"{row[lemma_col]}_{row[surface_col]}_{row['pos1']}_{row['pos2']}"
+                if row['pos2'] else None, axis=1
+            )
+
+            # Level 3: lemma_lForm_pos1
+            df_clean['level3_key'] = df_clean.apply(
+                lambda row: f"{row[lemma_col]}_{row[surface_col]}_{row['pos1']}"
+                if row['pos1'] else None, axis=1
+            )
+
+            # Legacy composite key for backward compatibility
+            df_clean['legacy_key'] = df_clean[lemma_col] + '_' + df_clean[pos_col]
+
+            # Create lookup dictionaries for each level
+            level1_dict = {}
+            level2_dict = {}
+            level3_dict = {}
+
+            for _, row in df_clean.iterrows():
+                freq = row[freq_col]
+
+                if row['level1_key']:
+                    level1_dict[row['level1_key']] = freq
+                if row['level2_key']:
+                    level2_dict[row['level2_key']] = freq
+                if row['level3_key']:
+                    level3_dict[row['level3_key']] = freq
+
+            # Return enhanced Japanese corpus data structure
+            return {
+                'level1_dict': level1_dict, # Most specific UniDic-compatible keys
+                'level2_dict': level2_dict,
+                'level3_dict': level3_dict,
+                'composite_dict': dict(zip(df_clean['legacy_key'], df_clean[freq_col])), # Legacy format
+                'lemma_dict': dict(zip(df_clean[lemma_col].str.lower(), df_clean[freq_col])),
+                'surface_dict': dict(zip(df_clean[surface_col].str.lower(), df_clean[freq_col])),
+                'is_japanese_corpus': True
+            }
+
+        except Exception as e:
+            st.error(f"Error parsing Japanese corpus data: {e}")
+            return {}
+
     @staticmethod
     def clean_default_reference_lists():
         """Clean up default reference lists that are no longer selected."""
         # This would be called by the UI when managing default reference lists
         # Implementation depends on how default lists are managed
-        pass
+        pass
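The `_parse_japanese_corpus_data` helper added above returns several lookup dictionaries keyed at decreasing levels of specificity (lemma + surface form + POS levels, then a legacy lemma_POS key, then plain lemma and surface maps), so a consumer is expected to probe them from most to least specific. The sketch below illustrates that fallback order; the `lookup_japanese_frequency` helper and the sample entry are assumptions for illustration, not code from this commit:

```python
from typing import Any, Dict, Optional

def lookup_japanese_frequency(data: Dict[str, Any], lemma: str, surface: str,
                              pos1: str, pos2: str = '', pos3: str = '') -> Optional[float]:
    """Probe the composite-key dictionaries from most to least specific."""
    candidates = [
        ('level1_dict', f"{lemma}_{surface}_{pos1}_{pos2}_{pos3}" if pos3 else None),
        ('level2_dict', f"{lemma}_{surface}_{pos1}_{pos2}" if pos2 else None),
        ('level3_dict', f"{lemma}_{surface}_{pos1}" if pos1 else None),
        ('lemma_dict', lemma.lower()),
        ('surface_dict', surface.lower()),
    ]
    for dict_name, key in candidates:
        bucket = data.get(dict_name, {})
        if key is not None and key in bucket:
            return bucket[key]
    return None

# Illustrative entry shaped like the BCCWJ-style rows shown elsewhere in this commit.
corpus_data = {
    'level1_dict': {},
    'level2_dict': {'の_ノ_助詞_格助詞': 5061558.0},
    'level3_dict': {'の_ノ_助詞': 5061558.0},
    'lemma_dict': {'の': 5061558.0},
    'surface_dict': {'ノ': 5061558.0},
}
print(lookup_japanese_frequency(corpus_data, 'の', 'ノ', '助詞', '格助詞'))  # -> 5061558.0
```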
web_app/handlers/__pycache__/__init__.cpython-312.pyc
DELETED
Binary file (245 Bytes)

web_app/handlers/__pycache__/analysis_handlers.cpython-312.pyc
DELETED
Binary file (17.9 kB)

web_app/handlers/__pycache__/pos_handlers.cpython-312.pyc
DELETED
Binary file (7.52 kB)
web_app/handlers/frequency_handlers.py
ADDED

@@ -0,0 +1,635 @@
+"""
+Frequency Analysis Handlers for Streamlit Interface
+
+This module provides Streamlit interface handlers for word frequency visualization,
+including file upload, visualization controls, and results display.
+Supports flexible column mapping for diverse frequency data formats.
+"""
+
+import streamlit as st
+import pandas as pd
+import plotly.graph_objects as go
+import plotly.express as px
+import numpy as np
+from typing import Dict, List, Optional
+import sys
+import os
+from pathlib import Path
+from io import StringIO
+
+# Add parent directory to path for imports
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
+
+from text_analyzer.frequency_analyzer import FrequencyAnalyzer
+
+
+class FrequencyHandlers:
+    """
+    Streamlit interface handlers for frequency analysis functionality.
+    """
+
+    @staticmethod
+    def handle_frequency_analysis():
+        """
+        Enhanced frequency analysis interface handler with persistent column selection.
+        """
+        st.markdown("Upload a frequency data file (TSV/CSV) with flexible column mapping support. "
+                    "The system will automatically detect columns and let you choose which ones to use for analysis.")
+
+        # Initialize session state variables
+        if 'uploaded_file_name' not in st.session_state:
+            st.session_state.uploaded_file_name = None
+        if 'column_config' not in st.session_state:
+            st.session_state.column_config = None
+        if 'analyzer' not in st.session_state:
+            st.session_state.analyzer = None
+        if 'format_info' not in st.session_state:
+            st.session_state.format_info = None
+        if 'detected_cols' not in st.session_state:
+            st.session_state.detected_cols = None
+        if 'uploaded_file_content' not in st.session_state:
+            st.session_state.uploaded_file_content = None
+
+        # File upload section
+        uploaded_file = FrequencyHandlers.render_file_upload()
+
+        # Check if a new file was uploaded
+        if uploaded_file is not None:
+            current_file_name = uploaded_file.name
+
+            # Reset state if new file is uploaded
+            if st.session_state.uploaded_file_name != current_file_name:
+                st.session_state.uploaded_file_name = current_file_name
+                st.session_state.column_config = None
+                st.session_state.analyzer = None
+                st.session_state.format_info = None
+                st.session_state.detected_cols = None
+                st.session_state.uploaded_file_content = uploaded_file.getvalue()
+
+            try:
+                # Initialize analyzer and process file (only if needed)
+                if st.session_state.analyzer is None or st.session_state.format_info is None:
+                    st.session_state.analyzer = FrequencyAnalyzer(file_size_limit_mb=300)
+                    st.session_state.format_info = st.session_state.analyzer.detect_file_format(uploaded_file.getvalue())
+
+                # Show format detection results
+                st.success(f"✅ File format detected: {st.session_state.format_info['separator']}-separated, "
+                           f"{'with' if st.session_state.format_info['has_header'] else 'without'} header, "
+                           f"~{st.session_state.format_info['estimated_columns']} columns")
+
+                # Prepare data for column detection
+                content = uploaded_file.getvalue()
+                if isinstance(content, bytes):
+                    content = content.decode('utf-8')
+
+                # Read data for preview and column detection
+                df_preview = pd.read_csv(StringIO(content),
+                                         sep=st.session_state.format_info['separator'],
+                                         header=0 if st.session_state.format_info['has_header'] else None,
+                                         nrows=100)
+
+                # Detect available columns
+                st.session_state.detected_cols = st.session_state.analyzer.detect_columns(df_preview)
+
+                # Show data preview
+                FrequencyHandlers.render_data_preview(df_preview, st.session_state.detected_cols)
+
+                # ALWAYS show column selection if we have detected columns (persistent interface)
+                if st.session_state.detected_cols is not None:
+                    with st.expander("🎯 Column Selection", expanded=True):
+                        column_config = FrequencyHandlers.render_persistent_column_selection(
+                            st.session_state.detected_cols,
+                            st.session_state.format_info,
+                            st.session_state.column_config
+                        )
+
+                        # Check if column configuration changed
+                        if column_config != st.session_state.column_config:
+                            st.session_state.column_config = column_config
+                            # Reload data with new configuration
+                            df = st.session_state.analyzer.load_frequency_data(st.session_state.uploaded_file_content, column_config)
+                            st.session_state.loaded_data = df
+                            st.rerun()
+
+                # ALWAYS show visualization controls if we have a column config
+                if st.session_state.column_config is not None:
+                    viz_config = FrequencyHandlers.render_enhanced_visualization_controls(st.session_state.analyzer, st.session_state.column_config)
+
+                    if viz_config:
+                        # Generate analysis
+                        FrequencyHandlers.render_enhanced_rank_based_analysis(st.session_state.analyzer, viz_config)
+
+            except Exception as e:
+                st.error(f"Error processing file: {str(e)}")
+                with st.expander("Error Details"):
+                    st.code(str(e))
+                st.info("Please ensure your file is a valid TSV/CSV with appropriate columns.")
+
+        elif st.session_state.column_config is not None and st.session_state.uploaded_file_content is not None:
+            # Show persistent interface even when no file is currently selected (using cached data)
+            with st.expander("🎯 Column Selection", expanded=False):
+                column_config = FrequencyHandlers.render_persistent_column_selection(
+                    st.session_state.detected_cols,
+                    st.session_state.format_info,
+                    st.session_state.column_config
+                )
+
+                # Check if column configuration changed
+                if column_config != st.session_state.column_config:
+                    st.session_state.column_config = column_config
+                    # Reload data with new configuration
+                    df = st.session_state.analyzer.load_frequency_data(st.session_state.uploaded_file_content, column_config)
+                    st.session_state.loaded_data = df
+                    st.rerun()
+
+            viz_config = FrequencyHandlers.render_enhanced_visualization_controls(st.session_state.analyzer, st.session_state.column_config)
+
+            if viz_config:
+                # Generate analysis
+                FrequencyHandlers.render_enhanced_rank_based_analysis(st.session_state.analyzer, viz_config)
+
+    @staticmethod
+    def render_file_upload():
+        """
+        Render enhanced file upload interface with flexible format support.
+
+        Returns:
+            Uploaded file object or None
+        """
+        st.subheader("📄 Upload Frequency Data")
+
+        uploaded_file = st.file_uploader(
+            "Choose a frequency data file",
+            type=['tsv', 'csv', 'txt'],
+            help="Upload a TSV or CSV file with frequency data. Supports flexible column mapping.",
+            accept_multiple_files=False
+        )
+
+        if uploaded_file is None:
+            # Show example formats
+            st.info("**Supported formats:**")
+            col1, col2 = st.columns(2)
+
+            with col1:
+                st.write("**Traditional format:**")
+                example_traditional = """Type\tFreq\tRank
+the\t69868\t1
+of\t36426\t2
+and\t28891\t3"""
+                st.code(example_traditional, language="text")
+
+            with col2:
+                st.write("**Rich corpus format:**")
+                example_rich = """rank\tlForm\tlemma\tpos\tfrequency\tpmw
+1\tノ\tの\t助詞\t5061558\t48383.9
+2\tニ\tに\t助詞\t3576558\t34188.7
+3\tテ\tて\t助詞\t3493117\t33391.0"""
+                st.code(example_rich, language="text")
+
+            st.write("**File size limit:** 300MB")
+
+        return uploaded_file
+
+    @staticmethod
+    def render_data_preview(df: pd.DataFrame, detected_cols: Dict[str, List[str]]):
+        """
+        Render enhanced data preview section with column detection results.
+
+        Args:
+            df: Preview DataFrame
+            detected_cols: Detected column categorization
+        """
+        st.subheader("📊 Data Preview")
+
+        # Basic metrics
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            st.metric("Total Rows", len(df))
+        with col2:
+            st.metric("Total Columns", len(df.columns))
+        with col3:
+            word_cols = len(detected_cols.get('word_columns', []))
+            freq_cols = len(detected_cols.get('frequency_columns', []))
+            st.metric("Detected", f"{word_cols} word, {freq_cols} freq")
+
+        # Show sample data
+        st.write("**First 5 rows:**")
+        st.dataframe(df.head(), use_container_width=True)
+
+        # Show detected column categories
+        with st.expander("🔍 Column Detection Results", expanded=True):
+            col1, col2 = st.columns(2)
+
+            with col1:
+                st.write("**Word Columns (text data):**")
+                word_cols = detected_cols.get('word_columns', [])
+                if word_cols:
+                    for col in word_cols:
+                        st.write(f"- `{col}` ({df[col].dtype})")
+                else:
+                    st.write("None detected")
+
+                st.write("**POS Columns:**")
+                pos_cols = detected_cols.get('pos_columns', [])
+                if pos_cols:
+                    for col in pos_cols:
+                        st.write(f"- `{col}` ({df[col].dtype})")
+                else:
+                    st.write("None detected")
+
+            with col2:
+                st.write("**Frequency Columns (numeric data):**")
+                freq_cols = detected_cols.get('frequency_columns', [])
+                if freq_cols:
+                    for col in freq_cols:
+                        sample_vals = df[col].dropna().head(3).tolist()
+                        st.write(f"- `{col}` ({df[col].dtype}) - e.g., {sample_vals}")
+                else:
+                    st.write("None detected")
+
+                st.write("**Other Columns:**")
+                other_cols = detected_cols.get('other_columns', [])
+                if other_cols:
+                    for col in other_cols[:5]: # Show max 5
+                        st.write(f"- `{col}` ({df[col].dtype})")
+                    if len(other_cols) > 5:
+                        st.write(f"... and {len(other_cols) - 5} more")
+                else:
+                    st.write("None")
+
+    @staticmethod
+    def render_column_selection_simplified(detected_cols: Dict[str, List[str]], format_info: Dict) -> Optional[Dict[str, str]]:
+        """
+        Render simplified column selection interface without multi-frequency complexity.
+
+        Args:
+            detected_cols: Detected column categorization
+            format_info: File format information
+
+        Returns:
+            Column configuration dict or None
+        """
+        st.subheader("🎯 Column Mapping")
+        st.write("Select which columns to use for your frequency analysis:")
+
+        word_cols = detected_cols.get('word_columns', [])
+        freq_cols = detected_cols.get('frequency_columns', [])
+        pos_cols = detected_cols.get('pos_columns', [])
+
+        if not word_cols or not freq_cols:
+            st.error("❌ Required columns not detected. Please ensure your file has:")
+            st.write("- At least one text column (for words)")
+            st.write("- At least one numeric column (for frequencies)")
+            return None
+
+        col1, col2 = st.columns(2)
+
+        with col1:
+            # Word column selection
+            word_column = st.selectbox(
+                "Word Column",
+                options=word_cols,
+                index=0,
+                help="Column containing word forms or lemmas"
+            )
+
+            # POS column selection (optional)
+            pos_column = None
+            if pos_cols:
+                use_pos = st.checkbox("Include POS column", value=False)
+                if use_pos:
+                    pos_column = st.selectbox(
+                        "POS Column",
+                        options=pos_cols,
+                        index=0,
+                        help="Column containing part-of-speech tags (optional)"
+                    )
+
+        with col2:
+            # Frequency column selection
+            frequency_column = st.selectbox(
+                "Frequency Column",
+                options=freq_cols,
+                index=0,
+                help="Column containing frequency values for analysis"
+            )
+
+        # Confirm button
+        if st.button("🚀 Start Analysis", type="primary"):
+            config = {
+                'word_column': word_column,
+                'frequency_column': frequency_column,
+                'separator': format_info['separator'],
+                'has_header': format_info['has_header']
+            }
+
+            if pos_column:
+                config['pos_column'] = pos_column
+
+            return config
+
+        return None
+
+    @staticmethod
+    def render_visualization_controls_simplified(analyzer: FrequencyAnalyzer, column_config: Dict) -> Optional[Dict]:
+        """
+        Legacy method - redirects to enhanced controls for backward compatibility.
+        """
+        return FrequencyHandlers.render_enhanced_visualization_controls(analyzer, column_config)
+
+    @staticmethod
+    def render_rank_based_analysis_simplified(analyzer: FrequencyAnalyzer, viz_config: Dict):
+        """
+        Legacy method - redirects to enhanced analysis for backward compatibility.
+        """
+        return FrequencyHandlers.render_enhanced_rank_based_analysis(analyzer, viz_config)
+
+    @staticmethod
+    def render_persistent_column_selection(detected_cols: Dict[str, List[str]],
+                                           format_info: Dict,
+                                           current_config: Optional[Dict] = None) -> Dict[str, str]:
+        """
+        Render persistent column selection interface that doesn't disappear.
+
+        Args:
+            detected_cols: Detected column categorization
+            format_info: File format information
+            current_config: Current column configuration (for preserving selections)
+
+        Returns:
+            Column configuration dict
+        """
+        st.write("Select which columns to use for your frequency analysis:")
+
+        word_cols = detected_cols.get('word_columns', [])
+        freq_cols = detected_cols.get('frequency_columns', [])
+        pos_cols = detected_cols.get('pos_columns', [])
+
+        # Determine default selections
+        default_word_idx = 0
+        default_freq_idx = 0
+        default_use_pos = False
+        default_pos_idx = 0
+
+        if current_config:
+            # Preserve current selections
+            if current_config['word_column'] in word_cols:
+                default_word_idx = word_cols.index(current_config['word_column'])
+            if current_config['frequency_column'] in freq_cols:
+                default_freq_idx = freq_cols.index(current_config['frequency_column'])
+            if 'pos_column' in current_config and current_config['pos_column'] in pos_cols:
+                default_use_pos = True
+                default_pos_idx = pos_cols.index(current_config['pos_column'])
+
+        col1, col2 = st.columns(2)
+
+        with col1:
+            word_column = st.selectbox(
+                "Word Column",
+                options=word_cols,
+                index=default_word_idx,
+                help="Column containing word forms or lemmas",
+                key="persistent_word_col"
+            )
+
+            # POS column selection (optional)
+            pos_column = None
+            if pos_cols:
+                use_pos = st.checkbox("Include POS column", value=default_use_pos, key="persistent_use_pos")
+                if use_pos:
+                    pos_column = st.selectbox(
+                        "POS Column",
+                        options=pos_cols,
+                        index=default_pos_idx,
+                        help="Column containing part-of-speech tags (optional)",
+                        key="persistent_pos_col"
+                    )
+
+        with col2:
+            frequency_column = st.selectbox(
+                "Frequency Column",
+                options=freq_cols,
+                index=default_freq_idx,
+                help="Column containing frequency values for analysis",
+                key="persistent_freq_col"
+            )
+
+        # Show quick info about selected columns
+        st.write("**Selected Configuration:**")
+        st.write(f"• Words: `{word_column}`")
+        st.write(f"• Frequencies: `{frequency_column}`")
+        if pos_column:
+            st.write(f"• POS: `{pos_column}`")
+
+        # Always return configuration (no button needed)
+        config = {
+            'word_column': word_column,
+            'frequency_column': frequency_column,
+            'separator': format_info['separator'],
+            'has_header': format_info['has_header']
+        }
+
+        if pos_column:
+            config['pos_column'] = pos_column
+
+        return config
+
+    @staticmethod
+    def render_enhanced_visualization_controls(analyzer: FrequencyAnalyzer, column_config: Dict) -> Optional[Dict]:
+        """
+        Render enhanced visualization controls with max words limit.
+
+        Args:
+            analyzer: FrequencyAnalyzer instance with loaded data
+            column_config: Column configuration from user selection
+
+        Returns:
+            Dict with visualization configuration or None
+        """
+        st.subheader("🎛️ Enhanced Visualization Controls")
+
+        # Get the frequency column
+        frequency_column = column_config['frequency_column']
+
+        col1, col2, col3 = st.columns(3)
+
+        with col1:
+            # Bin size controls
+            bin_size = st.slider(
+                "Bin Size (words per group)",
+                min_value=100,
+                max_value=2000,
+                value=500,
+                step=100,
+                help="Number of words to group together for rank-based analysis"
+            )
+
+        with col2:
+            # Log transformation option
+            log_transform = st.checkbox(
+                "Apply log₁₀ transformation",
+                value=False,
+                help="Transform frequency values using log₁₀ for better visualization"
+            )
+
+        with col3:
+            # Max words control
+            max_words = st.number_input(
+                "Max words to analyze",
+                min_value=1000,
+                max_value=200000,
+                value=None,
+                step=1000,
+                help="Limit analysis to top N most frequent words (leave empty for no limit)",
+                key="max_words_input"
+            )
+
+        # Quick preset buttons
+        st.write("**Quick Presets:**")
+        preset_cols = st.columns(4)
+        if preset_cols[0].button("10K", key="preset_10k"):
+            st.session_state.max_words_preset = 10000
+        if preset_cols[1].button("25K", key="preset_25k"):
+            st.session_state.max_words_preset = 25000
+        if preset_cols[2].button("50K", key="preset_50k"):
+            st.session_state.max_words_preset = 50000
+        if preset_cols[3].button("All", key="preset_all"):
+            st.session_state.max_words_preset = None
+
+        # Use preset value if set
+        if 'max_words_preset' in st.session_state:
+            max_words = st.session_state.max_words_preset
+            del st.session_state.max_words_preset
+
+        # Generate visualization button
+        if st.button("📊 Generate Enhanced Visualization", type="primary", key="generate_viz"):
+            return {
+                'frequency_column': frequency_column,
+                'bin_size': bin_size,
+                'log_transform': log_transform,
+                'max_words_to_retain': max_words
+            }
+
+        return None
+
+    @staticmethod
+    def render_enhanced_rank_based_analysis(analyzer: FrequencyAnalyzer, viz_config: Dict):
+        """
+        Render enhanced rank-based analysis with improved sample words display.
+
+        Args:
+            analyzer: FrequencyAnalyzer instance with loaded data
+            viz_config: Visualization configuration
+        """
+        st.subheader("📊 Enhanced Rank-Based Frequency Analysis")
+
+        frequency_column = viz_config['frequency_column']
+        bin_size = viz_config['bin_size']
+        log_transform = viz_config['log_transform']
+        max_words_to_retain = viz_config.get('max_words_to_retain')
+
+        try:
+            # Calculate statistics
+            stats = analyzer.calculate_statistics(frequency_column)
+
+            # Display basic statistics with word limit info
+            col1, col2, col3, col4 = st.columns(4)
+            with col1:
+                words_analyzed = max_words_to_retain if max_words_to_retain and max_words_to_retain < stats['count'] else stats['count']
+                st.metric("Words Analyzed", f"{words_analyzed:,}")
+            with col2:
+                st.metric("Mean Frequency", f"{stats['mean']:.2f}")
+            with col3:
+                st.metric("Median Frequency", f"{stats['median']:.2f}")
+            with col4:
+                st.metric("Std Deviation", f"{stats['std']:.2f}")
+
+            # Show word limit info if applied
+            if max_words_to_retain and max_words_to_retain < stats['count']:
+                st.info(f"📊 Analysis limited to top {max_words_to_retain:,} most frequent words (out of {stats['count']:,} total)")
+
+            # Create rank-based visualization with enhanced parameters
+            result = analyzer.create_rank_based_visualization_flexible(
+                column=frequency_column,
+                bin_size=bin_size,
+                log_transform=log_transform,
+                max_words_to_retain=max_words_to_retain
+            )
+
+            # Create the main visualization
+            fig = go.Figure()
+
+            fig.add_trace(go.Bar(
+                x=result['group_centers'],
+                y=result['avg_frequencies'],
+                name=f"Avg {frequency_column}",
+                marker_color='steelblue',
+                hovertemplate=(
+                    f"<b>Group %{{x}}</b><br>"
+                    f"Avg {'Log₁₀ ' if log_transform else ''}{frequency_column}: %{{y:.3f}}<br>"
+                    "<extra></extra>"
+                )
+            ))
+
+            fig.update_layout(
+                title=result.get('title_suffix', f"Enhanced Rank-Based Analysis - {frequency_column}"),
+                xaxis_title=result.get('x_label', f"Rank Groups (bin size: {bin_size})"),
+                yaxis_title=result.get('y_label', f"{'Log₁₀ ' if log_transform else ''}Average {frequency_column}"),
+                showlegend=False,
+                height=500
+            )
+
+            st.plotly_chart(fig, use_container_width=True)
+
+            # Enhanced sample words display (up to 20 bins with 5 random samples each)
+            st.write("### 🎯 Sample Words by Rank Group (5 Random Samples)")
+
+            sample_words = result.get('sample_words', {})
+            if sample_words:
+                # Display up to 20 groups in a more organized layout
+                num_groups = min(20, len(sample_words))
+
+                if num_groups > 0:
+                    st.write(f"Showing sample words from top {num_groups} rank groups:")
+
+                    # Display in rows of 4 groups each
+                    for row_start in range(0, num_groups, 4):
+                        cols = st.columns(4)
+                        for col_idx in range(4):
+                            group_idx = row_start + col_idx
+                            if group_idx < num_groups and group_idx in sample_words:
+                                with cols[col_idx]:
+                                    group_label = result['group_labels'][group_idx]
+                                    words = sample_words[group_idx]
+
+                                    st.write(f"**Group {group_label}:**")
+                                    word_list = [w['word'] for w in words]
+                                    # Display as bullet points for better readability
+                                    for word in word_list:
+                                        st.write(f"• {word}")
+
+                                    # Add spacing between groups
+                                    st.write("")
+            else:
+                st.write("No sample words available")
+
+            # Show enhanced group statistics
+            with st.expander("📈 Detailed Group Statistics"):
+                group_stats = result.get('group_stats')
+                if group_stats is not None and not group_stats.empty:
+                    display_stats = group_stats.copy()
+
+                    # Format numeric columns
+                    numeric_cols = display_stats.select_dtypes(include=[np.number]).columns
+                    for col in numeric_cols:
+                        if 'count' not in col.lower():
+                            display_stats[col] = display_stats[col].round(2)
+
+                    st.dataframe(display_stats, use_container_width=True)
+                else:
+                    st.write("No detailed statistics available")
+
+        except Exception as e:
+            st.error(f"Error in enhanced rank-based analysis: {str(e)}")
+            with st.expander("Error Details"):
+                st.code(str(e))
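The handler above leaves the actual binning to `FrequencyAnalyzer.create_rank_based_visualization_flexible`, which is not part of this diff; the UI only consumes the returned `group_centers`, `avg_frequencies`, `group_labels`, `sample_words`, and `group_stats`. As a rough sketch of what such rank-based grouping involves (sort by frequency, optionally keep only the top N words, cut the ranks into fixed-size bins, average each bin, optionally on a log₁₀ scale); the `rank_based_bins` function below is an assumption for illustration, not the analyzer's actual implementation:

```python
from typing import Optional

import numpy as np
import pandas as pd

def rank_based_bins(df: pd.DataFrame, word_col: str, freq_col: str,
                    bin_size: int = 500, log_transform: bool = False,
                    max_words_to_retain: Optional[int] = None) -> pd.DataFrame:
    """Group words into fixed-size rank bins and average their (optionally log10) frequencies."""
    ranked = df.sort_values(freq_col, ascending=False).reset_index(drop=True)
    if max_words_to_retain:
        ranked = ranked.head(max_words_to_retain)

    values = ranked[freq_col].astype(float)
    if log_transform:
        values = np.log10(values)  # assumes strictly positive frequencies

    ranked = ranked.assign(value=values, group=ranked.index // bin_size)
    return (ranked.groupby('group')
                  .agg(avg_frequency=('value', 'mean'),
                       n_words=(word_col, 'size'),
                       sample_words=(word_col, lambda s: s.sample(min(5, len(s)), random_state=0).tolist()))
                  .reset_index())

# Toy usage: 2,000 artificial words with strictly decreasing frequencies.
toy = pd.DataFrame({'word': [f"w{i}" for i in range(2000)],
                    'freq': np.arange(2000, 0, -1)})
print(rank_based_bins(toy, 'word', 'freq', bin_size=500, log_transform=True))
```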