Upload 129 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitignore +48 -0
- README.md +90 -11
- app.py +353 -0
- nltk_data/tokenizers/punkt_tab.zip +3 -0
- nltk_data/tokenizers/punkt_tab/README +98 -0
- nltk_data/tokenizers/punkt_tab/czech/abbrev_types.txt +118 -0
- nltk_data/tokenizers/punkt_tab/czech/collocations.tab +96 -0
- nltk_data/tokenizers/punkt_tab/czech/ortho_context.tab +0 -0
- nltk_data/tokenizers/punkt_tab/czech/sent_starters.txt +54 -0
- nltk_data/tokenizers/punkt_tab/danish/abbrev_types.txt +211 -0
- nltk_data/tokenizers/punkt_tab/danish/collocations.tab +101 -0
- nltk_data/tokenizers/punkt_tab/danish/ortho_context.tab +0 -0
- nltk_data/tokenizers/punkt_tab/danish/sent_starters.txt +64 -0
- nltk_data/tokenizers/punkt_tab/dutch/abbrev_types.txt +99 -0
- nltk_data/tokenizers/punkt_tab/dutch/collocations.tab +37 -0
- nltk_data/tokenizers/punkt_tab/dutch/ortho_context.tab +0 -0
- nltk_data/tokenizers/punkt_tab/dutch/sent_starters.txt +54 -0
- nltk_data/tokenizers/punkt_tab/english/abbrev_types.txt +156 -0
- nltk_data/tokenizers/punkt_tab/english/collocations.tab +37 -0
- nltk_data/tokenizers/punkt_tab/english/ortho_context.tab +0 -0
- nltk_data/tokenizers/punkt_tab/english/sent_starters.txt +39 -0
- nltk_data/tokenizers/punkt_tab/estonian/abbrev_types.txt +48 -0
- nltk_data/tokenizers/punkt_tab/estonian/collocations.tab +100 -0
- nltk_data/tokenizers/punkt_tab/estonian/ortho_context.tab +0 -0
- nltk_data/tokenizers/punkt_tab/estonian/sent_starters.txt +89 -0
- nltk_data/tokenizers/punkt_tab/finnish/abbrev_types.txt +81 -0
- nltk_data/tokenizers/punkt_tab/finnish/collocations.tab +167 -0
- nltk_data/tokenizers/punkt_tab/finnish/ortho_context.tab +0 -0
- nltk_data/tokenizers/punkt_tab/finnish/sent_starters.txt +86 -0
- nltk_data/tokenizers/punkt_tab/french/abbrev_types.txt +61 -0
- nltk_data/tokenizers/punkt_tab/french/collocations.tab +18 -0
- nltk_data/tokenizers/punkt_tab/french/ortho_context.tab +0 -0
- nltk_data/tokenizers/punkt_tab/french/sent_starters.txt +48 -0
- nltk_data/tokenizers/punkt_tab/german/abbrev_types.txt +71 -0
- nltk_data/tokenizers/punkt_tab/german/collocations.tab +28 -0
- nltk_data/tokenizers/punkt_tab/german/ortho_context.tab +0 -0
- nltk_data/tokenizers/punkt_tab/german/sent_starters.txt +107 -0
- nltk_data/tokenizers/punkt_tab/greek/abbrev_types.txt +100 -0
- nltk_data/tokenizers/punkt_tab/greek/collocations.tab +7 -0
- nltk_data/tokenizers/punkt_tab/greek/ortho_context.tab +0 -0
- nltk_data/tokenizers/punkt_tab/greek/sent_starters.txt +54 -0
- nltk_data/tokenizers/punkt_tab/italian/abbrev_types.txt +125 -0
- nltk_data/tokenizers/punkt_tab/italian/collocations.tab +6 -0
- nltk_data/tokenizers/punkt_tab/italian/ortho_context.tab +0 -0
- nltk_data/tokenizers/punkt_tab/italian/sent_starters.txt +40 -0
- nltk_data/tokenizers/punkt_tab/malayalam/abbrev_types.txt +285 -0
- nltk_data/tokenizers/punkt_tab/malayalam/collocations.tab +153 -0
- nltk_data/tokenizers/punkt_tab/malayalam/ortho_context.tab +0 -0
- nltk_data/tokenizers/punkt_tab/malayalam/sent_starters.txt +14 -0
- nltk_data/tokenizers/punkt_tab/norwegian/abbrev_types.txt +106 -0
.gitignore
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.vscode
|
| 2 |
+
|
| 3 |
+
# Pylance
|
| 4 |
+
pyrightconfig.json
|
| 5 |
+
|
| 6 |
+
# Byte-compiled / optimized / DLL files
|
| 7 |
+
__pycache__/
|
| 8 |
+
*.py[cod]
|
| 9 |
+
*$py.class
|
| 10 |
+
|
| 11 |
+
# C extensions
|
| 12 |
+
*.so
|
| 13 |
+
|
| 14 |
+
# Distribution / packaging
|
| 15 |
+
.Python
|
| 16 |
+
build/
|
| 17 |
+
develop-eggs/
|
| 18 |
+
dist/
|
| 19 |
+
downloads/
|
| 20 |
+
eggs/
|
| 21 |
+
.eggs/
|
| 22 |
+
lib/
|
| 23 |
+
lib64/
|
| 24 |
+
parts/
|
| 25 |
+
sdist/
|
| 26 |
+
var/
|
| 27 |
+
wheels/
|
| 28 |
+
*.egg-info/
|
| 29 |
+
.installed.cfg
|
| 30 |
+
*.egg
|
| 31 |
+
MANIFEST
|
| 32 |
+
|
| 33 |
+
# PyInstaller
|
| 34 |
+
# Usually these files are written by a python script from a template
|
| 35 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 36 |
+
*.manifest
|
| 37 |
+
*.spec
|
| 38 |
+
|
| 39 |
+
# Installer logs
|
| 40 |
+
pip-log.txt
|
| 41 |
+
pip-delete-this-directory.txt
|
| 42 |
+
|
| 43 |
+
syn_out/
|
| 44 |
+
checkpoints/
|
| 45 |
+
.gradio
|
| 46 |
+
|
| 47 |
+
# Ignore generated sample .wav files
|
| 48 |
+
**/*.wav
|
README.md
CHANGED
|
@@ -1,13 +1,92 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
| 1 |
+
# CloneTTS - Sao Chép Giọng Đọc Đa Ngôn Ngữ 🎙️🧠
|
| 2 |
+
|
| 3 |
+
**Tác giả:** Lý Trần
|
| 4 |
+
|
| 5 |
+
CloneTTS Giọng Đa Ngôn Ngữ là một ứng dụng web sử dụng Gradio, cung cấp giao diện thân thiện để tạo giọng nói, chuyển đổi giọng nói, và quản lý quy trình xử lý âm thanh nâng cao dựa trên mô hình Chatterbox của Resemble AI.
|
| 6 |
+
|
| 7 |
+
## Tính năng nổi bật
|
| 8 |
+
|
| 9 |
+
- **Quản lý dự án tập trung:**
|
| 10 |
+
Tạo, chọn và quản lý workspace riêng biệt. Mọi file đầu vào, file xử lý và kết quả sẽ được sắp xếp tự động vào đúng thư mục trong dự án.
|
| 11 |
+
|
| 12 |
+
- **Sinh giọng nói (Single Generation):**
|
| 13 |
+
- **Text-to-Speech (TTS):** Sinh giọng nói chất lượng cao từ văn bản, có thể dùng file tham chiếu để clone giọng.
|
| 14 |
+
- **Voice Conversion (VC):** Chuyển đổi đặc trưng giọng nói của file nguồn sang tham chiếu.
|
| 15 |
+
- **Quét tham số (Parameter Sweep):** Sinh nhiều phiên bản cùng lúc với các giá trị tham số khác nhau (ví dụ: Temperature, Pace...).
|
| 16 |
+
|
| 17 |
+
- **Xử lý hàng loạt (Batch Processing):**
|
| 18 |
+
- Xử lý cả thư mục văn bản hoặc âm thanh chỉ với một lần bấm.
|
| 19 |
+
- Có thể ghép tất cả file âm thanh sinh ra thành một file duy nhất.
|
| 20 |
+
|
| 21 |
+
- **Chuẩn bị dữ liệu:**
|
| 22 |
+
- **Tách văn bản:** Tự động chia nhỏ file văn bản dài thành nhiều đoạn phù hợp với mô hình.
|
| 23 |
+
- **Tách file âm thanh:** Chia nhỏ file âm thanh thành các đoạn ngắn hơn, ưu tiên tách ở đoạn im lặng.
|
| 24 |
+
|
| 25 |
+
- **Chỉnh sửa & hoàn thiện quy trình:**
|
| 26 |
+
- **Regenerate Audio:** Xem lại từng file audio, chỉnh sửa & thay thế nhanh chóng.
|
| 27 |
+
- **Trình soạn thảo văn bản trực tiếp:** Sửa văn bản nguồn ngay trên giao diện, lưu lại dễ dàng.
|
| 28 |
+
|
| 29 |
+
## Yêu cầu cài đặt
|
| 30 |
+
|
| 31 |
+
- **Python:** >=3.8 (Khuyến nghị 3.11)
|
| 32 |
+
- **Git**
|
| 33 |
+
- **FFmpeg**
|
| 34 |
+
- **GPU CUDA** (khuyến nghị, chạy CPU sẽ rất chậm)
|
| 35 |
+
|
| 36 |
+
## Hướng dẫn cài đặt nhanh
|
| 37 |
+
|
| 38 |
+
### 1. Clone dự án
|
| 39 |
+
|
| 40 |
+
```bash
|
| 41 |
+
git clone https://github.com/ltteamvn/CloneTTS
|
| 42 |
+
cd CloneTTS
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
### 2. Tạo môi trường ảo Python
|
| 46 |
+
|
| 47 |
+
```bash
|
| 48 |
+
python3.11 -m venv toolkit
|
| 49 |
+
source toolkit/bin/activate # Trên Linux/macOS
|
| 50 |
+
# .\toolkit\Scripts\activate # Trên Windows
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
### 3. Cài đặt thư viện
|
| 54 |
+
|
| 55 |
+
```bash
|
| 56 |
+
pip install -r requirements.txt
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
Lưu ý: Nếu bạn dùng GPU Nvidia 10 series hoặc AMD, cần tự cài torch phù hợp.
|
| 60 |
+
|
| 61 |
+
### 4. Chạy ứng dụng
|
| 62 |
+
|
| 63 |
+
```bash
|
| 64 |
+
python app.py
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
Truy cập địa chỉ xuất hiện trên terminal (thường là http://127.0.0.1:7860) để sử dụng giao diện web.
|
| 68 |
+
|
| 69 |
+
## Quy trình sử dụng điển hình
|
| 70 |
+
|
| 71 |
+
1. **Tạo project** ở tab Projects.
|
| 72 |
+
2. **Chuẩn bị dữ liệu:**
|
| 73 |
+
- Upload văn bản/audio vào thư mục dự án tương ứng.
|
| 74 |
+
- Sử dụng tab Data Preparation để tách nhỏ file nếu cần.
|
| 75 |
+
3. **Sinh audio:**
|
| 76 |
+
- Vào tab Batch Generation hoặc Single Generation để sinh file âm thanh mong muốn.
|
| 77 |
+
4. **Chỉnh sửa & hoàn thiện:**
|
| 78 |
+
- Vào Edit Project Data để chỉnh sửa file text hoặc thay thế từng file audio.
|
| 79 |
+
|
| 80 |
+
## Một số lưu ý
|
| 81 |
+
|
| 82 |
+
- Thư mục dự án sẽ tự động lưu trữ toàn bộ file đầu vào, file xử lý và kết quả theo cấu trúc rõ ràng.
|
| 83 |
+
- Khi chuyển giọng, file tham chiếu (reference voice) nên ngắn hơn hoặc bằng 40 giây.
|
| 84 |
+
- Source Audio có thể dài hơn 40s, chương trình sẽ tự động chia nhỏ và ghép lại kết quả.
|
| 85 |
+
|
| 86 |
+
## Đóng góp & liên hệ
|
| 87 |
+
|
| 88 |
+
Nếu bạn gặp lỗi hoặc muốn đóng góp ý kiến, hãy tạo issue hoặc liên hệ trực tiếp với tác giả.
|
| 89 |
+
|
| 90 |
---
|
| 91 |
|
| 92 |
+
Chúc bạn sử dụng hiệu quả công cụ này!
|
app.py
ADDED
|
@@ -0,0 +1,353 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import importlib
|
| 4 |
+
import json
|
| 5 |
+
import asyncio
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
import gradio as gr
|
| 10 |
+
import pydub
|
| 11 |
+
import edge_tts
|
| 12 |
+
import srt
|
| 13 |
+
|
| 14 |
+
# --- 1) Đảm bảo src/ có trong Python path để import ChatterboxVC ---
|
| 15 |
+
script_dir = os.path.dirname(os.path.abspath(__file__))
|
| 16 |
+
src_path = os.path.join(script_dir, "src")
|
| 17 |
+
if src_path not in sys.path:
|
| 18 |
+
sys.path.insert(0, src_path)
|
| 19 |
+
|
| 20 |
+
import chatterbox.vc
|
| 21 |
+
importlib.reload(chatterbox.vc)
|
| 22 |
+
from chatterbox.vc import ChatterboxVC
|
| 23 |
+
|
| 24 |
+
# --- 2) Khởi tạo model VC ---
|
| 25 |
+
# Run on the GPU when torch reports CUDA support, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Module-level cache so the model is loaded at most once per process.
_vc_model = None


def get_vc_model():
    """Return the shared ChatterboxVC model, loading it on the first call.

    The loaded instance is cached in the module-level ``_vc_model`` so
    later calls return the same object without reloading.
    """
    global _vc_model
    if _vc_model is not None:
        return _vc_model

    print(f"[VC] Đang tải model trên {DEVICE}…")
    _vc_model = ChatterboxVC.from_pretrained(DEVICE)
    print("[VC] Model sẵn sàng.")
    return _vc_model
|
| 34 |
+
|
| 35 |
+
# --- 3) Helper cập nhật log, audio và file-download ---
|
| 36 |
+
# Accumulated, timestamped log lines shown in the UI log box.
global_log_messages_vc = []


def yield_vc_updates(log_msg=None, audio_data=None, file_list=None, log_append=True):
    """Yield one ``(log, audio, files)`` tuple of Gradio updates.

    Args:
        log_msg: Text to add to the log; ``None`` leaves the log unchanged.
        audio_data: Value for the audio output; the component is hidden
            when this is ``None``.
        file_list: Value for the download list; the component is hidden
            (and given an empty list) when this is ``None``.
        log_append: When true the message is appended to the existing log,
            otherwise the log is reset to contain only this message.
    """
    global global_log_messages_vc

    if log_msg is not None:
        stamp = datetime.now().strftime("[%H:%M:%S]")
        line = f"{stamp} {log_msg}"
        if not log_append:
            global_log_messages_vc = [line]
        else:
            global_log_messages_vc.append(line)

    has_audio = audio_data is not None
    has_files = file_list is not None

    log_update = gr.update(value="\n".join(global_log_messages_vc))
    audio_update = gr.update(visible=has_audio, value=audio_data)
    files_update = gr.update(visible=has_files,
                             value=file_list if has_files else [])

    yield log_update, audio_update, files_update
|
| 56 |
+
|
| 57 |
+
# --- 4) Load voices Edge TTS từ voices.json ---
|
| 58 |
+
def load_edge_tts_voices(json_path="voices.json"):
    """Read the voice catalogue and build the dropdown data.

    The JSON file maps language -> gender -> list of voice entries, each
    entry carrying ``display_name`` and ``voice_code``.

    Returns:
        A ``(display_list, code_map)`` pair: the human-readable labels in
        file order, and a dict from each label to its voice code.
    """
    with open(json_path, "r", encoding="utf-8") as handle:
        catalogue = json.load(handle)

    labelled = [
        (
            f"{lang} - {gender} - {entry['display_name']} ({entry['voice_code']})",
            entry["voice_code"],
        )
        for lang, by_gender in catalogue.items()
        for gender, entries in by_gender.items()
        for entry in entries
    ]

    display_list = [label for label, _ in labelled]
    # dict() keeps the last code for a repeated label, like repeated assignment.
    code_map = dict(labelled)
    return display_list, code_map
|
| 69 |
+
|
| 70 |
+
# Loaded once at import time: dropdown labels and the label -> voice-code lookup.
edge_choices, edge_code_map = load_edge_tts_voices()
|
| 71 |
+
|
| 72 |
+
# --- 5) TTS Edge với rate & volume ---
|
| 73 |
+
async def _edge_tts_async(text, disp, rate_pct, vol_pct):
    """Synthesize ``text`` with Edge TTS and return the output file path.

    Args:
        text: Text to speak.
        disp: Dropdown label; it is looked up in ``edge_code_map`` to get
            the Edge TTS voice code.
        rate_pct: Speaking-rate offset in percent (e.g. ``-10`` or ``25``).
        vol_pct: Volume offset in percent.

    Returns:
        The path of the written audio file.

    Raises:
        gr.Error: If ``disp`` is not a known voice label.
    """
    code = edge_code_map.get(disp)
    if code is None:
        # Fail with a readable message instead of handing voice=None to edge-tts.
        raise gr.Error("Vui lòng chọn một giọng Edge TTS hợp lệ.")

    # The "+d" format spec rejects floats, so coerce slider values to int first.
    rate_str = f"{int(round(rate_pct)):+d}%"
    vol_str = f"{int(round(vol_pct)):+d}%"

    # NOTE(review): fixed file name in the working directory, so concurrent
    # requests overwrite each other. edge-tts presumably writes MP3 data
    # despite the .wav suffix -- confirm that downstream readers sniff the format.
    out = "temp_edge_tts.wav"
    await edge_tts.Communicate(text, voice=code, rate=rate_str, volume=vol_str).save(out)
    return out
|
| 80 |
+
|
| 81 |
+
def run_edge_tts(text, disp, rate_pct, vol_pct):
    """Blocking wrapper around ``_edge_tts_async``.

    Returns the generated file path twice, because the UI binds this
    function to two audio outputs (the Edge preview and the source audio).
    """
    synthesis = _edge_tts_async(text, disp, rate_pct, vol_pct)
    audio_path = asyncio.run(synthesis)
    return audio_path, audio_path
|
| 84 |
+
|
| 85 |
+
# --- 6) Sinh audio từ SRT (có rate & vol) ---
|
| 86 |
+
def synthesize_srt_audio(srt_path: str, disp_voice: str, work_dir: str,
                         rate_pct: int, vol_pct: int) -> str:
    """Render an SRT file to one WAV whose speech follows the cue timing.

    Each cue is synthesized with Edge TTS, then cropped or padded with
    silence to fill the cue's time slot; gaps between cues become silence.

    Args:
        srt_path: Path of the UTF-8 encoded ``.srt`` file.
        disp_voice: Voice dropdown label passed through to ``run_edge_tts``.
        work_dir: Directory that receives ``srt_source.wav``.
        rate_pct: Speaking-rate offset in percent.
        vol_pct: Volume offset in percent.

    Returns:
        Path of the exported ``srt_source.wav``.
    """
    with open(srt_path, "r", encoding="utf-8") as f:
        # Process cues chronologically even if the file lists them out of order.
        subs = sorted(srt.parse(f.read()), key=lambda cue: cue.start)

    combined = pydub.AudioSegment.empty()
    current_ms = 0  # always equals the length of `combined` so far

    for sub in subs:
        end_ms = int(sub.end.total_seconds() * 1000)
        # A cue overlapping the previous one can only start where the timeline
        # already ends; otherwise every later cue would drift.
        start_ms = max(int(sub.start.total_seconds() * 1000), current_ms)
        dur_ms = end_ms - start_ms

        # Skip cues with no remaining slot or no text. The next cue's gap
        # padding fills the hole with silence.
        # NOTE(review): assumes Edge TTS cannot synthesize blank text -- confirm.
        if dur_ms <= 0 or not sub.content.strip():
            continue

        # Silence up to the start of this cue.
        if start_ms > current_ms:
            combined += pydub.AudioSegment.silent(duration=start_ms - current_ms)

        tmp_wav, _ = run_edge_tts(sub.content, disp_voice, rate_pct, vol_pct)
        tts_audio = pydub.AudioSegment.from_file(tmp_wav)

        # Crop or pad so the clip fills exactly the cue's slot.
        if len(tts_audio) > dur_ms:
            tts_audio = tts_audio[:dur_ms]
        else:
            tts_audio += pydub.AudioSegment.silent(duration=dur_ms - len(tts_audio))

        combined += tts_audio
        current_ms = end_ms

    out_path = os.path.join(work_dir, "srt_source.wav")
    combined.export(out_path, format="wav")
    return out_path
|
| 117 |
+
|
| 118 |
+
# --- 7) Voice Conversion chính ---
|
| 119 |
+
def generate_vc(
    source_audio_path,
    target_voice_path,
    cfg_rate: float,
    sigma_min: float,
    batch_mode: bool,
    batch_parameter: str,
    batch_values: str
):
    """Run voice conversion and stream ``(log, audio, files)`` UI updates.

    Three modes:
      * batch sweep: one conversion per value in ``batch_values``, varying
        either the CFG rate or sigma-min;
      * long source (> 40 s): the source is cut into 40 s chunks, each is
        converted, and the results are concatenated into ``combined.wav``;
      * otherwise a single conversion.

    Args:
        source_audio_path: Path of the audio to convert.
        target_voice_path: Path of the reference (target) voice.
        cfg_rate: Inference CFG rate passed to the model.
        sigma_min: Sigma-min passed to the model.
        batch_mode: Enable the parameter sweep.
        batch_parameter: ``"Inference CFG Rate"`` sweeps the CFG rate; any
            other value sweeps sigma-min.
        batch_values: Comma-separated numbers for the sweep.

    Yields:
        Tuples produced by ``yield_vc_updates``; the last one carries the
        first output file and the full list of output files.

    Raises:
        gr.Error: If ``batch_values`` is empty or not numeric in batch mode.
        Exception: Any conversion error is logged and then re-raised.
    """
    model = get_vc_model()
    yield from yield_vc_updates("Khởi tạo chuyển giọng…", log_append=False)

    # Output directory: one folder per calendar day.
    date_folder = datetime.now().strftime("%Y%m%d")
    work_dir = os.path.join("outputs/vc", date_folder)
    os.makedirs(work_dir, exist_ok=True)

    def run_once(src, tgt, rate, sigma):
        return model.generate(src, target_voice_path=tgt, inference_cfg_rate=rate, sigma_min=sigma)

    outputs = []
    try:
        if batch_mode:
            try:
                vals = [float(v.strip()) for v in batch_values.split(",") if v.strip()]
            except (AttributeError, ValueError):
                # AttributeError covers a missing (None) textbox value.
                raise gr.Error("Batch values phải là số, phân cách bởi dấu phẩy.") from None
            if not vals:
                # An empty sweep would otherwise finish with no output and no hint.
                raise gr.Error("Batch values phải là số, phân cách bởi dấu phẩy.")
            yield from yield_vc_updates(f"Chạy batch '{batch_parameter}': {vals}")
            for idx, v in enumerate(vals, 1):
                r, s = cfg_rate, sigma_min
                if batch_parameter == "Inference CFG Rate":
                    r, tag = v, f"cfg_{v}"
                else:
                    s, tag = v, f"sigma_{v}"
                yield from yield_vc_updates(f" • Mục {idx}/{len(vals)}: {batch_parameter}={v}")
                wav = run_once(source_audio_path, target_voice_path, r, s)
                fn = f"{tag}_{idx}.wav"
                path = os.path.join(work_dir, fn)
                model.save_wav(wav, path)
                outputs.append(path)
                yield from yield_vc_updates(f"Đã lưu: {path}")
        else:
            audio = pydub.AudioSegment.from_file(source_audio_path)
            if len(audio) > 40_000:
                yield from yield_vc_updates("Audio dài >40s: tách thành đoạn 40s…")
                chunks = [audio[i:i+40_000] for i in range(0, len(audio), 40_000)]
                temp_paths = []
                for i, chunk in enumerate(chunks):
                    tmp = f"{source_audio_path}_chunk{i}.wav"
                    try:
                        chunk.export(tmp, format="wav")
                        wav = run_once(tmp, target_voice_path, cfg_rate, sigma_min)
                        outp = os.path.join(work_dir, f"part{i}.wav")
                        model.save_wav(wav, outp)
                    finally:
                        # Remove the temporary chunk even when conversion fails.
                        if os.path.exists(tmp):
                            os.remove(tmp)
                    temp_paths.append(outp)
                    yield from yield_vc_updates(f"Xử lý đoạn {i+1}/{len(chunks)}")
                # Stitch the converted parts back together.
                combined = pydub.AudioSegment.empty()
                for p in temp_paths:
                    combined += pydub.AudioSegment.from_file(p)
                final = os.path.join(work_dir, "combined.wav")
                combined.export(final, format="wav")
                outputs.append(final)
                yield from yield_vc_updates("Chuyển xong.")
            else:
                yield from yield_vc_updates("Đang chuyển giọng…")
                wav = run_once(source_audio_path, target_voice_path, cfg_rate, sigma_min)
                outp = os.path.join(work_dir, f"output_{datetime.now().strftime('%H%M%S')}.wav")
                model.save_wav(wav, outp)
                outputs.append(outp)
                yield from yield_vc_updates("Hoàn thành.")
    except Exception as e:
        # Surface the failure in the UI log, then let Gradio report it.
        yield from yield_vc_updates(f"Lỗi: {e}")
        raise

    # Finally: always return the first audio plus the full file list for download.
    first = outputs[0] if outputs else None
    yield from yield_vc_updates(log_msg=None, audio_data=first, file_list=outputs)
|
| 198 |
+
|
| 199 |
+
# --- 8) Wrapper tổng hợp ---
|
| 200 |
+
def run_vc_from_srt_or_file(
    use_srt: bool,
    srt_file, srt_voice, srt_rate, srt_vol,
    edge_text, edge_voice, edge_rate, edge_vol,
    src_audio, tgt_audio,
    cfg_rate, sigma_min,
    batch_mode, batch_parameter, batch_values
):
    """Pick the conversion source, then delegate to ``generate_vc``.

    Source priority:
      1. ``use_srt``: synthesize the uploaded SRT file with Edge TTS;
      2. Edge TTS text and voice both provided: synthesize that text;
      3. otherwise the uploaded or recorded ``src_audio``.

    Yields:
        The ``(log, audio, files)`` update tuples for the UI.

    Raises:
        gr.Error: If SRT mode is selected but no file was uploaded.
    """
    yield from yield_vc_updates("Bắt đầu…", log_append=False)

    date_folder = datetime.now().strftime("%Y%m%d")
    work_dir = os.path.join("outputs/vc", date_folder)
    os.makedirs(work_dir, exist_ok=True)

    if use_srt:
        if srt_file is None:
            # Without this guard the failure is an opaque AttributeError on None.
            raise gr.Error("Vui lòng tải lên file .srt.")
        # Accept both file-like upload objects (with .name) and plain path strings.
        srt_path = getattr(srt_file, "name", srt_file)
        yield from yield_vc_updates("Sinh audio từ SRT…")
        source = synthesize_srt_audio(
            srt_path, srt_voice, work_dir,
            rate_pct=srt_rate, vol_pct=srt_vol
        )
    elif edge_text and edge_voice:
        yield from yield_vc_updates("Sinh audio từ Edge TTS…")
        source, _ = run_edge_tts(edge_text, edge_voice, edge_rate, edge_vol)
    else:
        source = src_audio

    yield from generate_vc(
        source, tgt_audio,
        cfg_rate, sigma_min,
        batch_mode, batch_parameter, batch_values
    )
|
| 232 |
+
|
| 233 |
+
# --- 9) Build Gradio UI ---
|
| 234 |
+
# Gradio UI: inputs in the left column, log and results in the right column.
with gr.Blocks(title="Chuyển Giọng Nói AI") as demo:
    gr.Markdown("## 📣 Chuyển Giọng Nói AI")
    gr.Markdown("> Tác giả: **Lý Trần**")

    with gr.Row():
        with gr.Column():
            # Source option 1: SRT subtitles rendered with Edge TTS.
            use_srt = gr.Checkbox(label="Sử dụng file SRT làm nguồn?", value=False)
            srt_file = gr.File(file_types=[".srt"], label="Tải lên file .srt", visible=False)
            srt_voice = gr.Dropdown(choices=edge_choices, label="Giọng Edge TTS (SRT)", visible=False)
            srt_rate = gr.Slider(-100, 100, value=0, step=1, label="Tốc độ SRT (% chuẩn)", visible=False)
            srt_vol = gr.Slider(-100, 100, value=0, step=1, label="Âm lượng SRT (% chuẩn)", visible=False)

            # Source option 2: free text rendered with Edge TTS.
            use_edge = gr.Checkbox(label="Tạo nguồn qua Edge TTS?", value=False)
            edge_text = gr.Textbox(label="Văn bản cho Edge TTS", visible=False)
            edge_voice = gr.Dropdown(choices=edge_choices, label="Giọng Edge TTS", visible=False)
            edge_rate = gr.Slider(-100, 100, value=0, step=1, label="Tốc độ Edge (% chuẩn)", visible=False)
            edge_vol = gr.Slider(-100, 100, value=0, step=1, label="Âm lượng Edge (% chuẩn)", visible=False)
            gen_edge_btn = gr.Button("🗣️ Tạo Edge TTS", visible=False)
            edge_audio = gr.Audio(label="Nguồn Edge TTS", type="filepath", visible=False)

            # Source option 3: manual upload or recording.
            src_audio = gr.Audio(sources=["upload","microphone"], type="filepath",
                                 label="Tải lên / Ghi âm nguồn")

            # Reference (target) voice.
            gr.Markdown("### Giọng tham chiếu (mục tiêu)")
            tgt_audio = gr.Audio(sources=["upload","microphone"], type="filepath",
                                 label="Tải lên / Ghi âm giọng mục tiêu")

            # Voice-conversion parameters.
            gr.Markdown("### Tham số chuyển giọng")
            cfg_slider = gr.Slider(0.0, 30.0, value=0.5, step=0.1, label="CFG Rate")
            sigma_input = gr.Number(1e-6, label="Sigma Min",
                                    minimum=1e-7, maximum=1e-5, step=1e-7)

            # Optional parameter sweep.
            with gr.Accordion("Tùy chọn Batch Sweep", open=False):
                batch_chk = gr.Checkbox(label="Kích hoạt Batch Sweep", value=False)
                batch_param = gr.Dropdown(choices=["Inference CFG Rate","Sigma Min"],
                                          label="Tham số thay đổi")
                batch_vals = gr.Textbox(placeholder="ví dụ: 0.5,1.0,2.0",
                                        label="Giá trị phân cách dấu phẩy")

            run_btn = gr.Button("🚀 Chuyển giọng")

        with gr.Column():
            gr.Markdown("### Nhật ký")
            log_box = gr.Textbox(interactive=False, lines=12)
            gr.Markdown("### Kết quả")
            out_audio = gr.Audio(label="Âm thanh kết quả", type="filepath", visible=False)
            out_files = gr.Files(label="Tải xuống file đầu ra", visible=False)

    def toggle_srt(v, edge_on=False):
        """Show the SRT widgets when SRT mode is on, else restore the others.

        When SRT mode is switched off, the Edge TTS widgets are shown again
        only if the Edge checkbox is ticked, and the manual source only if
        it is not. This matches what ``toggle_edge`` does.
        """
        show_edge = (not v) and bool(edge_on)
        show_manual = (not v) and not edge_on
        flags = (
            [v] * 4            # srt_file, srt_voice, srt_rate, srt_vol
            + [not v]          # use_edge
            + [show_edge] * 6  # edge_text, edge_voice, edge_rate, edge_vol, gen_edge_btn, edge_audio
            + [show_manual]    # src_audio
        )
        return tuple(gr.update(visible=flag) for flag in flags)

    use_srt.change(
        fn=toggle_srt,
        inputs=[use_srt, use_edge],
        outputs=[
            srt_file, srt_voice, srt_rate, srt_vol,
            use_edge, edge_text, edge_voice, edge_rate, edge_vol,
            gen_edge_btn, edge_audio, src_audio
        ]
    )

    def toggle_edge(v):
        """Show the Edge TTS widgets and hide the manual source, or the reverse."""
        return (
            gr.update(visible=v),      # edge_text
            gr.update(visible=v),      # edge_voice
            gr.update(visible=v),      # edge_rate
            gr.update(visible=v),      # edge_vol
            gr.update(visible=v),      # gen_edge_btn
            gr.update(visible=v),      # edge_audio
            gr.update(visible=not v)   # src_audio
        )

    use_edge.change(
        fn=toggle_edge,
        inputs=[use_edge],
        outputs=[edge_text, edge_voice, edge_rate, edge_vol, gen_edge_btn, edge_audio, src_audio]
    )

    # Generate the Edge TTS clip; it feeds both the preview and the source slot.
    gen_edge_btn.click(
        fn=run_edge_tts,
        inputs=[edge_text, edge_voice, edge_rate, edge_vol],
        outputs=[edge_audio, src_audio]
    )

    # Run the voice conversion.
    run_btn.click(
        fn=run_vc_from_srt_or_file,
        inputs=[
            use_srt, srt_file, srt_voice, srt_rate, srt_vol,
            edge_text, edge_voice, edge_rate, edge_vol,
            src_audio, tgt_audio,
            cfg_slider, sigma_input,
            batch_chk, batch_param, batch_vals
        ],
        outputs=[log_box, out_audio, out_files],
        show_progress="minimal"
    )

if __name__ == "__main__":
    demo.launch(share=True)
|
nltk_data/tokenizers/punkt_tab.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e57f64187974277726a3417ca6f181ec5403676c717672eef6a748a7b20e0106
|
| 3 |
+
size 4319076
|
nltk_data/tokenizers/punkt_tab/README
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
|
| 2 |
+
|
| 3 |
+
Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
|
| 4 |
+
been contributed by various people using NLTK for sentence boundary detection.
|
| 5 |
+
|
| 6 |
+
For information about how to use these models, please refer to the tokenization HOWTO:
|
| 7 |
+
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
|
| 8 |
+
and chapter 3.8 of the NLTK book:
|
| 9 |
+
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
|
| 10 |
+
|
| 11 |
+
There are pretrained tokenizers for the following languages:
|
| 12 |
+
|
| 13 |
+
File Language Source Contents Size of training corpus(in tokens) Model contributed by
|
| 14 |
+
=======================================================================================================================================================================
|
| 15 |
+
czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
|
| 16 |
+
Literarni Noviny
|
| 17 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 18 |
+
danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
|
| 19 |
+
(Berlingske Avisdata, Copenhagen) Weekend Avisen
|
| 20 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 21 |
+
dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
|
| 22 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 23 |
+
english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
|
| 24 |
+
(American)
|
| 25 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 26 |
+
estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
|
| 27 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 28 |
+
finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
|
| 29 |
+
Text Bank (Suomen Kielen newspapers
|
| 30 |
+
Tekstipankki)
|
| 31 |
+
Finnish Center for IT Science
|
| 32 |
+
(CSC)
|
| 33 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 34 |
+
french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
|
| 35 |
+
(European)
|
| 36 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 37 |
+
german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
|
| 38 |
+
(Switzerland) CD-ROM
|
| 39 |
+
(Uses "ss"
|
| 40 |
+
instead of "ß")
|
| 41 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 42 |
+
greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
|
| 43 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 44 |
+
italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
|
| 45 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 46 |
+
norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
|
| 47 |
+
(Bokmål and Information Technologies,
|
| 48 |
+
Nynorsk) Bergen
|
| 49 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 50 |
+
polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
|
| 51 |
+
(http://www.nkjp.pl/)
|
| 52 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 53 |
+
portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
|
| 54 |
+
(Brazilian) (Linguateca)
|
| 55 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 56 |
+
slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
|
| 57 |
+
Slovene Academy for Arts
|
| 58 |
+
and Sciences
|
| 59 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 60 |
+
spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
|
| 61 |
+
(European)
|
| 62 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 63 |
+
swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
|
| 64 |
+
(and some other texts)
|
| 65 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 66 |
+
turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
|
| 67 |
+
(Türkçe Derlem Projesi)
|
| 68 |
+
University of Ankara
|
| 69 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 70 |
+
|
| 71 |
+
The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
|
| 72 |
+
Unicode using the codecs module.
|
| 73 |
+
|
| 74 |
+
Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
|
| 75 |
+
Computational Linguistics 32: 485-525.
|
| 76 |
+
|
| 77 |
+
---- Training Code ----
|
| 78 |
+
|
| 79 |
+
# import punkt
|
| 80 |
+
import nltk.tokenize.punkt
|
| 81 |
+
|
| 82 |
+
# Make a new Tokenizer
|
| 83 |
+
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
|
| 84 |
+
|
| 85 |
+
# Read in training corpus (one example: Slovene)
|
| 86 |
+
import codecs
|
| 87 |
+
text = codecs.open("slovene.plain","r","iso-8859-2").read()
|
| 88 |
+
|
| 89 |
+
# Train tokenizer
|
| 90 |
+
tokenizer.train(text)
|
| 91 |
+
|
| 92 |
+
# Dump pickled tokenizer
|
| 93 |
+
import pickle
|
| 94 |
+
out = open("slovene.pickle","wb")
|
| 95 |
+
pickle.dump(tokenizer, out)
|
| 96 |
+
out.close()
|
| 97 |
+
|
| 98 |
+
---------
|
nltk_data/tokenizers/punkt_tab/czech/abbrev_types.txt
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
t
|
| 2 |
+
množ
|
| 3 |
+
např
|
| 4 |
+
j.h
|
| 5 |
+
man
|
| 6 |
+
ú
|
| 7 |
+
jug
|
| 8 |
+
dr
|
| 9 |
+
bl
|
| 10 |
+
ml
|
| 11 |
+
okr
|
| 12 |
+
st
|
| 13 |
+
uh
|
| 14 |
+
šp
|
| 15 |
+
judr
|
| 16 |
+
u.s.a
|
| 17 |
+
p
|
| 18 |
+
arg
|
| 19 |
+
žitě
|
| 20 |
+
st.celsia
|
| 21 |
+
etc
|
| 22 |
+
p.s
|
| 23 |
+
t.r
|
| 24 |
+
lok
|
| 25 |
+
mil
|
| 26 |
+
ict
|
| 27 |
+
n
|
| 28 |
+
tl
|
| 29 |
+
min
|
| 30 |
+
č
|
| 31 |
+
d
|
| 32 |
+
al
|
| 33 |
+
ravenně
|
| 34 |
+
mj
|
| 35 |
+
nar
|
| 36 |
+
plk
|
| 37 |
+
s.p
|
| 38 |
+
a.g
|
| 39 |
+
roč
|
| 40 |
+
b
|
| 41 |
+
zdi
|
| 42 |
+
r.s.c
|
| 43 |
+
přek
|
| 44 |
+
m
|
| 45 |
+
gen
|
| 46 |
+
csc
|
| 47 |
+
mudr
|
| 48 |
+
vic
|
| 49 |
+
š
|
| 50 |
+
sb
|
| 51 |
+
resp
|
| 52 |
+
tzn
|
| 53 |
+
iv
|
| 54 |
+
s.r.o
|
| 55 |
+
mar
|
| 56 |
+
w
|
| 57 |
+
čs
|
| 58 |
+
vi
|
| 59 |
+
tzv
|
| 60 |
+
ul
|
| 61 |
+
pen
|
| 62 |
+
zv
|
| 63 |
+
str
|
| 64 |
+
čp
|
| 65 |
+
org
|
| 66 |
+
rak
|
| 67 |
+
sv
|
| 68 |
+
pplk
|
| 69 |
+
u.s
|
| 70 |
+
prof
|
| 71 |
+
c.k
|
| 72 |
+
op
|
| 73 |
+
g
|
| 74 |
+
vii
|
| 75 |
+
kr
|
| 76 |
+
ing
|
| 77 |
+
j.o
|
| 78 |
+
drsc
|
| 79 |
+
m3
|
| 80 |
+
l
|
| 81 |
+
tr
|
| 82 |
+
ceo
|
| 83 |
+
ch
|
| 84 |
+
fuk
|
| 85 |
+
vl
|
| 86 |
+
viii
|
| 87 |
+
líp
|
| 88 |
+
hl.m
|
| 89 |
+
t.zv
|
| 90 |
+
phdr
|
| 91 |
+
o.k
|
| 92 |
+
tis
|
| 93 |
+
doc
|
| 94 |
+
kl
|
| 95 |
+
ard
|
| 96 |
+
čkd
|
| 97 |
+
pok
|
| 98 |
+
apod
|
| 99 |
+
r
|
| 100 |
+
př
|
| 101 |
+
a.s
|
| 102 |
+
j
|
| 103 |
+
jr
|
| 104 |
+
i.m
|
| 105 |
+
e
|
| 106 |
+
kupř
|
| 107 |
+
f
|
| 108 |
+
tř
|
| 109 |
+
xvi
|
| 110 |
+
mir
|
| 111 |
+
atď
|
| 112 |
+
vr
|
| 113 |
+
r.i.v
|
| 114 |
+
hl
|
| 115 |
+
kv
|
| 116 |
+
t.j
|
| 117 |
+
y
|
| 118 |
+
q.p.r
|
nltk_data/tokenizers/punkt_tab/czech/collocations.tab
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
i dejmala
|
| 2 |
+
##number## prosince
|
| 3 |
+
h steina
|
| 4 |
+
##number## listopadu
|
| 5 |
+
a dvořák
|
| 6 |
+
v klaus
|
| 7 |
+
i čnhl
|
| 8 |
+
##number## wladyslawowo
|
| 9 |
+
##number## letech
|
| 10 |
+
a jiráska
|
| 11 |
+
a dubček
|
| 12 |
+
##number## štrasburk
|
| 13 |
+
##number## juniorské
|
| 14 |
+
##number## století
|
| 15 |
+
##number## kola
|
| 16 |
+
##number## pád
|
| 17 |
+
##number## května
|
| 18 |
+
##number## týdne
|
| 19 |
+
v dlouhý
|
| 20 |
+
k design
|
| 21 |
+
##number## červenec
|
| 22 |
+
i ligy
|
| 23 |
+
##number## kolo
|
| 24 |
+
z svěrák
|
| 25 |
+
##number## mája
|
| 26 |
+
##number## šimková
|
| 27 |
+
a bělého
|
| 28 |
+
a bradáč
|
| 29 |
+
##number## ročníku
|
| 30 |
+
##number## dubna
|
| 31 |
+
a vivaldiho
|
| 32 |
+
v mečiara
|
| 33 |
+
c carrićre
|
| 34 |
+
##number## sjezd
|
| 35 |
+
##number## výroční
|
| 36 |
+
##number## kole
|
| 37 |
+
##number## narozenin
|
| 38 |
+
k maleevová
|
| 39 |
+
i čnfl
|
| 40 |
+
##number## pádě
|
| 41 |
+
##number## září
|
| 42 |
+
##number## výročí
|
| 43 |
+
a dvořáka
|
| 44 |
+
h g.
|
| 45 |
+
##number## ledna
|
| 46 |
+
a dvorský
|
| 47 |
+
h měsíc
|
| 48 |
+
##number## srpna
|
| 49 |
+
##number## tř.
|
| 50 |
+
a mozarta
|
| 51 |
+
##number## sudetoněmeckých
|
| 52 |
+
o sokolov
|
| 53 |
+
k škrach
|
| 54 |
+
v benda
|
| 55 |
+
##number## symfonie
|
| 56 |
+
##number## července
|
| 57 |
+
x šalda
|
| 58 |
+
c abrahama
|
| 59 |
+
a tichý
|
| 60 |
+
##number## místo
|
| 61 |
+
k bielecki
|
| 62 |
+
v havel
|
| 63 |
+
##number## etapu
|
| 64 |
+
a dubčeka
|
| 65 |
+
i liga
|
| 66 |
+
##number## světový
|
| 67 |
+
v klausem
|
| 68 |
+
##number## ženy
|
| 69 |
+
##number## létech
|
| 70 |
+
##number## minutě
|
| 71 |
+
##number## listopadem
|
| 72 |
+
##number## místě
|
| 73 |
+
o vlček
|
| 74 |
+
k peteraje
|
| 75 |
+
i sponzor
|
| 76 |
+
##number## června
|
| 77 |
+
##number## min.
|
| 78 |
+
##number## oprávněnou
|
| 79 |
+
##number## květnu
|
| 80 |
+
##number## aktu
|
| 81 |
+
##number## květnem
|
| 82 |
+
##number## října
|
| 83 |
+
i rynda
|
| 84 |
+
##number## února
|
| 85 |
+
i snfl
|
| 86 |
+
a mozart
|
| 87 |
+
z košler
|
| 88 |
+
a dvorskému
|
| 89 |
+
v marhoul
|
| 90 |
+
v mečiar
|
| 91 |
+
##number## ročník
|
| 92 |
+
##number## máje
|
| 93 |
+
v havla
|
| 94 |
+
k gott
|
| 95 |
+
s bacha
|
| 96 |
+
##number## ad
|
nltk_data/tokenizers/punkt_tab/czech/ortho_context.tab
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nltk_data/tokenizers/punkt_tab/czech/sent_starters.txt
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
já
|
| 2 |
+
milena
|
| 3 |
+
tomáš
|
| 4 |
+
oznámila
|
| 5 |
+
podle
|
| 6 |
+
my
|
| 7 |
+
vyplývá
|
| 8 |
+
hlavní
|
| 9 |
+
jelikož
|
| 10 |
+
musíme
|
| 11 |
+
kdyby
|
| 12 |
+
foto
|
| 13 |
+
rozptylové
|
| 14 |
+
snad
|
| 15 |
+
zároveň
|
| 16 |
+
jaroslav
|
| 17 |
+
po
|
| 18 |
+
v
|
| 19 |
+
kromě
|
| 20 |
+
pokud
|
| 21 |
+
toto
|
| 22 |
+
jenže
|
| 23 |
+
oba
|
| 24 |
+
jak
|
| 25 |
+
zatímco
|
| 26 |
+
ten
|
| 27 |
+
myslím
|
| 28 |
+
navíc
|
| 29 |
+
dušan
|
| 30 |
+
zdá
|
| 31 |
+
dnes
|
| 32 |
+
přesto
|
| 33 |
+
tato
|
| 34 |
+
ti
|
| 35 |
+
bratislava
|
| 36 |
+
ale
|
| 37 |
+
když
|
| 38 |
+
nicméně
|
| 39 |
+
tento
|
| 40 |
+
mirka
|
| 41 |
+
přitom
|
| 42 |
+
dokud
|
| 43 |
+
jan
|
| 44 |
+
bohužel
|
| 45 |
+
ta
|
| 46 |
+
díky
|
| 47 |
+
prohlásil
|
| 48 |
+
praha
|
| 49 |
+
jestliže
|
| 50 |
+
jde
|
| 51 |
+
vždyť
|
| 52 |
+
moskva
|
| 53 |
+
proto
|
| 54 |
+
to
|
nltk_data/tokenizers/punkt_tab/danish/abbrev_types.txt
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
t
|
| 2 |
+
tlf
|
| 3 |
+
b.p
|
| 4 |
+
evt
|
| 5 |
+
j.h
|
| 6 |
+
lenz
|
| 7 |
+
mht
|
| 8 |
+
gl
|
| 9 |
+
bl
|
| 10 |
+
stud.polit
|
| 11 |
+
e.j
|
| 12 |
+
st
|
| 13 |
+
o
|
| 14 |
+
dec
|
| 15 |
+
mag
|
| 16 |
+
h.b
|
| 17 |
+
p
|
| 18 |
+
adm
|
| 19 |
+
el.lign
|
| 20 |
+
e.s
|
| 21 |
+
saalba
|
| 22 |
+
styrt
|
| 23 |
+
nr
|
| 24 |
+
m.a.s.h
|
| 25 |
+
etc
|
| 26 |
+
pharm
|
| 27 |
+
hg
|
| 28 |
+
j.j
|
| 29 |
+
dj
|
| 30 |
+
mountainb
|
| 31 |
+
f.kr
|
| 32 |
+
h.r
|
| 33 |
+
cand.jur
|
| 34 |
+
sp
|
| 35 |
+
osv
|
| 36 |
+
s.g
|
| 37 |
+
ndr
|
| 38 |
+
inc
|
| 39 |
+
b.i.g
|
| 40 |
+
dk-sver
|
| 41 |
+
sl
|
| 42 |
+
v.s.o.d
|
| 43 |
+
cand.mag
|
| 44 |
+
d.v.s
|
| 45 |
+
v.i
|
| 46 |
+
bøddel
|
| 47 |
+
fr
|
| 48 |
+
ø«
|
| 49 |
+
dr.phil
|
| 50 |
+
chr
|
| 51 |
+
p.d
|
| 52 |
+
bj
|
| 53 |
+
fhv
|
| 54 |
+
tilskudsforhold
|
| 55 |
+
m.a
|
| 56 |
+
sek
|
| 57 |
+
p.g.a
|
| 58 |
+
int
|
| 59 |
+
pokalf
|
| 60 |
+
ik
|
| 61 |
+
dir
|
| 62 |
+
em-lodtrækn
|
| 63 |
+
a.h
|
| 64 |
+
o.lign
|
| 65 |
+
p.t
|
| 66 |
+
m.v
|
| 67 |
+
n.j
|
| 68 |
+
m.h.t
|
| 69 |
+
m.m
|
| 70 |
+
a.p
|
| 71 |
+
pers
|
| 72 |
+
4-bakketurn
|
| 73 |
+
dr.med
|
| 74 |
+
w.ø
|
| 75 |
+
polit
|
| 76 |
+
fremsættes
|
| 77 |
+
techn
|
| 78 |
+
tidl
|
| 79 |
+
o.g
|
| 80 |
+
i.c.i
|
| 81 |
+
mill
|
| 82 |
+
skt
|
| 83 |
+
m.fl
|
| 84 |
+
cand.merc
|
| 85 |
+
kbh
|
| 86 |
+
indiv
|
| 87 |
+
stk
|
| 88 |
+
dk-maked
|
| 89 |
+
memorandum
|
| 90 |
+
mestersk
|
| 91 |
+
mag.art
|
| 92 |
+
kitzb
|
| 93 |
+
h
|
| 94 |
+
lic
|
| 95 |
+
fig
|
| 96 |
+
dressurst
|
| 97 |
+
sportsg
|
| 98 |
+
r.e.m
|
| 99 |
+
d.u.m
|
| 100 |
+
sct
|
| 101 |
+
kld
|
| 102 |
+
bl.a
|
| 103 |
+
hf
|
| 104 |
+
g.a
|
| 105 |
+
corp
|
| 106 |
+
w
|
| 107 |
+
konk
|
| 108 |
+
zoeterm
|
| 109 |
+
b.t
|
| 110 |
+
a.d
|
| 111 |
+
l.b
|
| 112 |
+
jf
|
| 113 |
+
s.b
|
| 114 |
+
kgl
|
| 115 |
+
ill
|
| 116 |
+
beck
|
| 117 |
+
tosset
|
| 118 |
+
afd
|
| 119 |
+
johs
|
| 120 |
+
pct
|
| 121 |
+
k.b
|
| 122 |
+
sv
|
| 123 |
+
verbalt
|
| 124 |
+
kgs
|
| 125 |
+
l.m.k
|
| 126 |
+
j.l
|
| 127 |
+
aus
|
| 128 |
+
superl
|
| 129 |
+
t.v
|
| 130 |
+
mia
|
| 131 |
+
kr
|
| 132 |
+
pr
|
| 133 |
+
præmien
|
| 134 |
+
j.b.s
|
| 135 |
+
j.o
|
| 136 |
+
o.s.v
|
| 137 |
+
edb-oplysninger
|
| 138 |
+
o.m.a
|
| 139 |
+
ca
|
| 140 |
+
1b
|
| 141 |
+
f.eks
|
| 142 |
+
rens
|
| 143 |
+
ch
|
| 144 |
+
mr
|
| 145 |
+
schw
|
| 146 |
+
d.c
|
| 147 |
+
utraditionelt
|
| 148 |
+
idrætsgym
|
| 149 |
+
hhv
|
| 150 |
+
e.l
|
| 151 |
+
s.s
|
| 152 |
+
eks
|
| 153 |
+
f.o.m
|
| 154 |
+
dk-storbrit
|
| 155 |
+
dk-jugo
|
| 156 |
+
n.z
|
| 157 |
+
derivater
|
| 158 |
+
c
|
| 159 |
+
pt
|
| 160 |
+
vm-kval
|
| 161 |
+
kl
|
| 162 |
+
hr
|
| 163 |
+
cand
|
| 164 |
+
jur
|
| 165 |
+
sav
|
| 166 |
+
h.c
|
| 167 |
+
arab.-danm
|
| 168 |
+
d.a.d
|
| 169 |
+
fl
|
| 170 |
+
o.a
|
| 171 |
+
a.s
|
| 172 |
+
cand.polit
|
| 173 |
+
grundejerform
|
| 174 |
+
j
|
| 175 |
+
faglærte
|
| 176 |
+
cr
|
| 177 |
+
a.a
|
| 178 |
+
mou
|
| 179 |
+
f.r.i
|
| 180 |
+
årh
|
| 181 |
+
o.m.m
|
| 182 |
+
sve
|
| 183 |
+
c.a
|
| 184 |
+
engl
|
| 185 |
+
sikkerhedssystemerne
|
| 186 |
+
m.f
|
| 187 |
+
j.k
|
| 188 |
+
phil
|
| 189 |
+
f
|
| 190 |
+
vet
|
| 191 |
+
mio
|
| 192 |
+
k.e
|
| 193 |
+
m.k
|
| 194 |
+
atla
|
| 195 |
+
idrætsg
|
| 196 |
+
n.n
|
| 197 |
+
4-bakketur
|
| 198 |
+
dvs
|
| 199 |
+
sdr
|
| 200 |
+
s.j
|
| 201 |
+
hol
|
| 202 |
+
s.h
|
| 203 |
+
pei
|
| 204 |
+
kbhvn
|
| 205 |
+
aa
|
| 206 |
+
m.g.i
|
| 207 |
+
fvt
|
| 208 |
+
i«
|
| 209 |
+
b.c
|
| 210 |
+
th
|
| 211 |
+
lrs
|
nltk_data/tokenizers/punkt_tab/danish/collocations.tab
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
##number## skak
|
| 2 |
+
##number## speedway
|
| 3 |
+
##number## rally
|
| 4 |
+
##number## april
|
| 5 |
+
##number## dm-fin
|
| 6 |
+
##number## viceformand
|
| 7 |
+
m jensen
|
| 8 |
+
##number## kano/kajak
|
| 9 |
+
##number## bowling
|
| 10 |
+
##number## dm-finale
|
| 11 |
+
##number## årh.
|
| 12 |
+
##number## januar
|
| 13 |
+
##number## august
|
| 14 |
+
##number## marathon
|
| 15 |
+
##number## kamp
|
| 16 |
+
##number## skihop
|
| 17 |
+
##number## etage
|
| 18 |
+
##number## tennis
|
| 19 |
+
##number## cykling
|
| 20 |
+
e andersen
|
| 21 |
+
##number## december
|
| 22 |
+
g h.
|
| 23 |
+
##number## neb
|
| 24 |
+
##number## sektion
|
| 25 |
+
##number## afd.
|
| 26 |
+
##number## klasse
|
| 27 |
+
##number## trampolin
|
| 28 |
+
##number## bordtennis
|
| 29 |
+
##number## formel
|
| 30 |
+
##number## århundredes
|
| 31 |
+
##number## dm-semifin
|
| 32 |
+
##number## heks
|
| 33 |
+
##number## taekwondo
|
| 34 |
+
##number## galop
|
| 35 |
+
##number## basketball
|
| 36 |
+
##number## dm
|
| 37 |
+
m skræl
|
| 38 |
+
##number## trav
|
| 39 |
+
##number## provins
|
| 40 |
+
##number## triathlon
|
| 41 |
+
k axel
|
| 42 |
+
##number## rugby
|
| 43 |
+
s h.
|
| 44 |
+
##number## klaverkoncert
|
| 45 |
+
a p.
|
| 46 |
+
e løgstrup
|
| 47 |
+
k telefax
|
| 48 |
+
##number## gyldendal
|
| 49 |
+
##number## fodbold
|
| 50 |
+
e rosenfeldt
|
| 51 |
+
##number## oktober
|
| 52 |
+
k o.
|
| 53 |
+
##number## september
|
| 54 |
+
##number## dec.
|
| 55 |
+
##number## juledag
|
| 56 |
+
##number## badminton
|
| 57 |
+
##number## sejlsport
|
| 58 |
+
##number## håndbold
|
| 59 |
+
r førsund
|
| 60 |
+
e jørgensen
|
| 61 |
+
d ##number##
|
| 62 |
+
k e
|
| 63 |
+
##number## alp.ski
|
| 64 |
+
##number## judo
|
| 65 |
+
##number## roning
|
| 66 |
+
##number## november
|
| 67 |
+
##number## atletik
|
| 68 |
+
##number## århundrede
|
| 69 |
+
##number## ridning
|
| 70 |
+
##number## marts
|
| 71 |
+
m andersen
|
| 72 |
+
d roosevelt
|
| 73 |
+
##number## brydning
|
| 74 |
+
s kr.
|
| 75 |
+
##number## runde
|
| 76 |
+
##number## division
|
| 77 |
+
##number## sal
|
| 78 |
+
##number## boksning
|
| 79 |
+
##number## minut
|
| 80 |
+
##number## golf
|
| 81 |
+
##number## juni
|
| 82 |
+
##number## symfoni
|
| 83 |
+
##number## hurtigløb
|
| 84 |
+
k jørgensen
|
| 85 |
+
##number## jörgen
|
| 86 |
+
##number## klasses
|
| 87 |
+
e jacobsen
|
| 88 |
+
k jensen
|
| 89 |
+
##number## februar
|
| 90 |
+
k nielsen
|
| 91 |
+
##number## volleyball
|
| 92 |
+
##number## maj
|
| 93 |
+
##number## verdenskrig
|
| 94 |
+
##number## juli
|
| 95 |
+
##number## ishockey
|
| 96 |
+
##number## kunstskøjteløb
|
| 97 |
+
b jørgensen
|
| 98 |
+
##number## gymnastik
|
| 99 |
+
##number## svømning
|
| 100 |
+
##number## tw
|
| 101 |
+
i pedersens
|
nltk_data/tokenizers/punkt_tab/danish/ortho_context.tab
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nltk_data/tokenizers/punkt_tab/danish/sent_starters.txt
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
kronik
|
| 2 |
+
alligevel
|
| 3 |
+
de
|
| 4 |
+
først
|
| 5 |
+
derfor
|
| 6 |
+
vi
|
| 7 |
+
selv
|
| 8 |
+
hertil
|
| 9 |
+
sådan
|
| 10 |
+
dette
|
| 11 |
+
sport
|
| 12 |
+
man
|
| 13 |
+
foto
|
| 14 |
+
begge
|
| 15 |
+
tag
|
| 16 |
+
dertil
|
| 17 |
+
reuter
|
| 18 |
+
efter
|
| 19 |
+
endelig
|
| 20 |
+
ifølge
|
| 21 |
+
lad
|
| 22 |
+
når
|
| 23 |
+
det
|
| 24 |
+
desuden
|
| 25 |
+
nu
|
| 26 |
+
reuters
|
| 27 |
+
årsagen
|
| 28 |
+
tænk
|
| 29 |
+
samtidig
|
| 30 |
+
udover
|
| 31 |
+
men
|
| 32 |
+
endvidere
|
| 33 |
+
rør
|
| 34 |
+
rb
|
| 35 |
+
udstillingen
|
| 36 |
+
faktabox
|
| 37 |
+
reception
|
| 38 |
+
blandt
|
| 39 |
+
hvad
|
| 40 |
+
skær
|
| 41 |
+
lilot
|
| 42 |
+
derudover
|
| 43 |
+
da
|
| 44 |
+
tilsæt
|
| 45 |
+
denne
|
| 46 |
+
afp
|
| 47 |
+
her
|
| 48 |
+
hvis
|
| 49 |
+
hæld
|
| 50 |
+
problemet
|
| 51 |
+
dermed
|
| 52 |
+
jeg
|
| 53 |
+
grafik
|
| 54 |
+
anmeldelse
|
| 55 |
+
den
|
| 56 |
+
ebbe
|
| 57 |
+
resultatet
|
| 58 |
+
tværtimod
|
| 59 |
+
hans
|
| 60 |
+
måske
|
| 61 |
+
feature
|
| 62 |
+
tillæg
|
| 63 |
+
hun
|
| 64 |
+
han
|
nltk_data/tokenizers/punkt_tab/dutch/abbrev_types.txt
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
m.j
|
| 2 |
+
t
|
| 3 |
+
ph
|
| 4 |
+
j.h
|
| 5 |
+
p.a.m
|
| 6 |
+
j.m
|
| 7 |
+
dr
|
| 8 |
+
st
|
| 9 |
+
j.b.m
|
| 10 |
+
p
|
| 11 |
+
nr
|
| 12 |
+
h.s
|
| 13 |
+
e.d
|
| 14 |
+
t.e
|
| 15 |
+
a.v
|
| 16 |
+
esb
|
| 17 |
+
s.z
|
| 18 |
+
drs
|
| 19 |
+
b.b
|
| 20 |
+
m.o
|
| 21 |
+
inc
|
| 22 |
+
n
|
| 23 |
+
pensioenfonds
|
| 24 |
+
s.v.p
|
| 25 |
+
bod
|
| 26 |
+
fr
|
| 27 |
+
pk
|
| 28 |
+
r.p
|
| 29 |
+
c.p.j
|
| 30 |
+
v.l.n.r
|
| 31 |
+
chr
|
| 32 |
+
m.v.d
|
| 33 |
+
int
|
| 34 |
+
o.m
|
| 35 |
+
j.v.d
|
| 36 |
+
u.o.m
|
| 37 |
+
f.c
|
| 38 |
+
k
|
| 39 |
+
bijgebracht
|
| 40 |
+
ontwaakte
|
| 41 |
+
m
|
| 42 |
+
j.w
|
| 43 |
+
a.l
|
| 44 |
+
a.v.d
|
| 45 |
+
s.v
|
| 46 |
+
s
|
| 47 |
+
j.d
|
| 48 |
+
binnengekomen
|
| 49 |
+
ds
|
| 50 |
+
schouwburg
|
| 51 |
+
b.v
|
| 52 |
+
h
|
| 53 |
+
a
|
| 54 |
+
j.a
|
| 55 |
+
aanvielen
|
| 56 |
+
h.g
|
| 57 |
+
p.f
|
| 58 |
+
j.l
|
| 59 |
+
mgr
|
| 60 |
+
c.j
|
| 61 |
+
blz
|
| 62 |
+
l.e.h
|
| 63 |
+
w.k
|
| 64 |
+
g
|
| 65 |
+
m.g
|
| 66 |
+
r.v.d
|
| 67 |
+
ing
|
| 68 |
+
v.d
|
| 69 |
+
c.q
|
| 70 |
+
l
|
| 71 |
+
h.p
|
| 72 |
+
mr
|
| 73 |
+
gesch
|
| 74 |
+
e.l
|
| 75 |
+
p.j
|
| 76 |
+
mm
|
| 77 |
+
j.g
|
| 78 |
+
j.f
|
| 79 |
+
c
|
| 80 |
+
f.m
|
| 81 |
+
jl
|
| 82 |
+
r
|
| 83 |
+
o.a
|
| 84 |
+
a.s
|
| 85 |
+
ir
|
| 86 |
+
v
|
| 87 |
+
j
|
| 88 |
+
jr
|
| 89 |
+
e
|
| 90 |
+
m.i.v
|
| 91 |
+
l.a
|
| 92 |
+
f.v.d
|
| 93 |
+
aansluit
|
| 94 |
+
c.c
|
| 95 |
+
a.m
|
| 96 |
+
f.o.j
|
| 97 |
+
m.b
|
| 98 |
+
y
|
| 99 |
+
th
|
nltk_data/tokenizers/punkt_tab/dutch/collocations.tab
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
##number## sotelo
|
| 2 |
+
##number## clas
|
| 3 |
+
##number## buckler
|
| 4 |
+
##number## carrera
|
| 5 |
+
##number## rmo
|
| 6 |
+
##number## orioli
|
| 7 |
+
w baron
|
| 8 |
+
##number## morales
|
| 9 |
+
##number## snotselelaank
|
| 10 |
+
##number## arcarons
|
| 11 |
+
##number## cavandoli
|
| 12 |
+
##number## pdm
|
| 13 |
+
##number## helvetia
|
| 14 |
+
##number## panasonic
|
| 15 |
+
##number## motorola
|
| 16 |
+
w bruinsma
|
| 17 |
+
##number## heer
|
| 18 |
+
##number## lotus
|
| 19 |
+
##number## banesto
|
| 20 |
+
##number## magnaldi
|
| 21 |
+
w jense
|
| 22 |
+
w heuvelmans
|
| 23 |
+
w spatje
|
| 24 |
+
##number## telekom
|
| 25 |
+
f kennedy
|
| 26 |
+
##number## gatorade
|
| 27 |
+
##number## mg-gb
|
| 28 |
+
##number## once
|
| 29 |
+
##number## peterhansel
|
| 30 |
+
##number## ariostea
|
| 31 |
+
##number## tvm
|
| 32 |
+
##number## höl
|
| 33 |
+
##number## castorama
|
| 34 |
+
##number## tulip
|
| 35 |
+
b situatie
|
| 36 |
+
##number## mas
|
| 37 |
+
##number## lotto
|
nltk_data/tokenizers/punkt_tab/dutch/ortho_context.tab
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nltk_data/tokenizers/punkt_tab/dutch/sent_starters.txt
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
het
|
| 2 |
+
daardoor
|
| 3 |
+
de
|
| 4 |
+
er
|
| 5 |
+
hoewel
|
| 6 |
+
wat
|
| 7 |
+
urlings
|
| 8 |
+
na
|
| 9 |
+
ze
|
| 10 |
+
alleen
|
| 11 |
+
dat
|
| 12 |
+
ik
|
| 13 |
+
pijls
|
| 14 |
+
wie
|
| 15 |
+
daarna
|
| 16 |
+
foto
|
| 17 |
+
als
|
| 18 |
+
boer
|
| 19 |
+
hammes
|
| 20 |
+
verder
|
| 21 |
+
ook
|
| 22 |
+
evers
|
| 23 |
+
vandaar
|
| 24 |
+
toen
|
| 25 |
+
we
|
| 26 |
+
langenberg
|
| 27 |
+
naast
|
| 28 |
+
want
|
| 29 |
+
in
|
| 30 |
+
wij
|
| 31 |
+
zo
|
| 32 |
+
hendrikx
|
| 33 |
+
daar
|
| 34 |
+
crouzen
|
| 35 |
+
dit
|
| 36 |
+
daarnaast
|
| 37 |
+
anp
|
| 38 |
+
zij
|
| 39 |
+
behalve
|
| 40 |
+
waarom
|
| 41 |
+
daarom
|
| 42 |
+
bovendien
|
| 43 |
+
hij
|
| 44 |
+
daarbij
|
| 45 |
+
nee
|
| 46 |
+
volgens
|
| 47 |
+
daarmee
|
| 48 |
+
bukkems
|
| 49 |
+
dvnl
|
| 50 |
+
eén
|
| 51 |
+
pas
|
| 52 |
+
tijdens
|
| 53 |
+
vooral
|
| 54 |
+
maar
|
nltk_data/tokenizers/punkt_tab/english/abbrev_types.txt
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ct
|
| 2 |
+
m.j
|
| 3 |
+
t
|
| 4 |
+
a.c
|
| 5 |
+
n.h
|
| 6 |
+
ms
|
| 7 |
+
p.a.m
|
| 8 |
+
dr
|
| 9 |
+
pa
|
| 10 |
+
p.m
|
| 11 |
+
u.k
|
| 12 |
+
st
|
| 13 |
+
dec
|
| 14 |
+
u.s.a
|
| 15 |
+
lt
|
| 16 |
+
g.k
|
| 17 |
+
adm
|
| 18 |
+
p
|
| 19 |
+
h.m
|
| 20 |
+
ga
|
| 21 |
+
tenn
|
| 22 |
+
yr
|
| 23 |
+
sen
|
| 24 |
+
n.c
|
| 25 |
+
j.j
|
| 26 |
+
d.h
|
| 27 |
+
s.g
|
| 28 |
+
inc
|
| 29 |
+
vs
|
| 30 |
+
s.p.a
|
| 31 |
+
a.t
|
| 32 |
+
n
|
| 33 |
+
feb
|
| 34 |
+
sr
|
| 35 |
+
jan
|
| 36 |
+
s.a.y
|
| 37 |
+
n.y
|
| 38 |
+
col
|
| 39 |
+
g.f
|
| 40 |
+
c.o.m.b
|
| 41 |
+
d
|
| 42 |
+
ft
|
| 43 |
+
va
|
| 44 |
+
r.k
|
| 45 |
+
e.f
|
| 46 |
+
chg
|
| 47 |
+
r.i
|
| 48 |
+
a.g
|
| 49 |
+
minn
|
| 50 |
+
a.h
|
| 51 |
+
k
|
| 52 |
+
n.j
|
| 53 |
+
m
|
| 54 |
+
l.f
|
| 55 |
+
f.j
|
| 56 |
+
gen
|
| 57 |
+
i.m.s
|
| 58 |
+
s.a
|
| 59 |
+
aug
|
| 60 |
+
j.p
|
| 61 |
+
okla
|
| 62 |
+
m.d.c
|
| 63 |
+
ltd
|
| 64 |
+
oct
|
| 65 |
+
s
|
| 66 |
+
vt
|
| 67 |
+
r.a
|
| 68 |
+
j.c
|
| 69 |
+
ariz
|
| 70 |
+
w.w
|
| 71 |
+
b.v
|
| 72 |
+
ore
|
| 73 |
+
h
|
| 74 |
+
w.r
|
| 75 |
+
e.h
|
| 76 |
+
mrs
|
| 77 |
+
cie
|
| 78 |
+
corp
|
| 79 |
+
w
|
| 80 |
+
n.v
|
| 81 |
+
a.d
|
| 82 |
+
r.j
|
| 83 |
+
ok
|
| 84 |
+
. .
|
| 85 |
+
e.m
|
| 86 |
+
w.c
|
| 87 |
+
ill
|
| 88 |
+
nov
|
| 89 |
+
u.s
|
| 90 |
+
prof
|
| 91 |
+
conn
|
| 92 |
+
u.s.s.r
|
| 93 |
+
mg
|
| 94 |
+
f.g
|
| 95 |
+
ph.d
|
| 96 |
+
g
|
| 97 |
+
calif
|
| 98 |
+
messrs
|
| 99 |
+
h.f
|
| 100 |
+
wash
|
| 101 |
+
tues
|
| 102 |
+
sw
|
| 103 |
+
bros
|
| 104 |
+
u.n
|
| 105 |
+
l
|
| 106 |
+
wis
|
| 107 |
+
mr
|
| 108 |
+
sep
|
| 109 |
+
d.c
|
| 110 |
+
ave
|
| 111 |
+
e.l
|
| 112 |
+
co
|
| 113 |
+
s.s
|
| 114 |
+
reps
|
| 115 |
+
c
|
| 116 |
+
r.t
|
| 117 |
+
h.c
|
| 118 |
+
r
|
| 119 |
+
wed
|
| 120 |
+
a.s
|
| 121 |
+
v
|
| 122 |
+
fla
|
| 123 |
+
jr
|
| 124 |
+
r.h
|
| 125 |
+
c.v
|
| 126 |
+
m.b.a
|
| 127 |
+
rep
|
| 128 |
+
a.a
|
| 129 |
+
e
|
| 130 |
+
c.i.t
|
| 131 |
+
l.a
|
| 132 |
+
b.f
|
| 133 |
+
j.b
|
| 134 |
+
d.w
|
| 135 |
+
j.k
|
| 136 |
+
ala
|
| 137 |
+
f
|
| 138 |
+
w.va
|
| 139 |
+
sept
|
| 140 |
+
mich
|
| 141 |
+
n.m
|
| 142 |
+
j.r
|
| 143 |
+
l.p
|
| 144 |
+
s.c
|
| 145 |
+
colo
|
| 146 |
+
fri
|
| 147 |
+
a.m
|
| 148 |
+
g.d
|
| 149 |
+
kan
|
| 150 |
+
maj
|
| 151 |
+
ky
|
| 152 |
+
a.m.e
|
| 153 |
+
n.d
|
| 154 |
+
t.j
|
| 155 |
+
cos
|
| 156 |
+
nev
|
nltk_data/tokenizers/punkt_tab/english/collocations.tab
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
##number## international
|
| 2 |
+
##number## rj
|
| 3 |
+
##number## commodities
|
| 4 |
+
##number## cooper
|
| 5 |
+
b stewart
|
| 6 |
+
##number## genentech
|
| 7 |
+
##number## wedgestone
|
| 8 |
+
i toussie
|
| 9 |
+
##number## pepper
|
| 10 |
+
j fialka
|
| 11 |
+
o ludcke
|
| 12 |
+
##number## insider
|
| 13 |
+
##number## aes
|
| 14 |
+
i magnin
|
| 15 |
+
##number## credit
|
| 16 |
+
##number## corrections
|
| 17 |
+
##number## financing
|
| 18 |
+
##number## henley
|
| 19 |
+
##number## business
|
| 20 |
+
##number## pay-fone
|
| 21 |
+
b wigton
|
| 22 |
+
b edelman
|
| 23 |
+
b levine
|
| 24 |
+
##number## leisure
|
| 25 |
+
b smith
|
| 26 |
+
j walter
|
| 27 |
+
##number## pegasus
|
| 28 |
+
##number## dividend
|
| 29 |
+
j aron
|
| 30 |
+
##number## review
|
| 31 |
+
##number## abreast
|
| 32 |
+
##number## who
|
| 33 |
+
##number## letters
|
| 34 |
+
##number## colgate
|
| 35 |
+
##number## cbot
|
| 36 |
+
##number## notable
|
| 37 |
+
##number## zimmer
|
nltk_data/tokenizers/punkt_tab/english/ortho_context.tab
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nltk_data/tokenizers/punkt_tab/english/sent_starters.txt
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
most
|
| 2 |
+
he
|
| 3 |
+
since
|
| 4 |
+
so
|
| 5 |
+
both
|
| 6 |
+
these
|
| 7 |
+
it
|
| 8 |
+
nevertheless
|
| 9 |
+
this
|
| 10 |
+
indeed
|
| 11 |
+
however
|
| 12 |
+
instead
|
| 13 |
+
under
|
| 14 |
+
similarly
|
| 15 |
+
some
|
| 16 |
+
though
|
| 17 |
+
while
|
| 18 |
+
when
|
| 19 |
+
in
|
| 20 |
+
despite
|
| 21 |
+
although
|
| 22 |
+
nonetheless
|
| 23 |
+
thus
|
| 24 |
+
there
|
| 25 |
+
if
|
| 26 |
+
the
|
| 27 |
+
nor
|
| 28 |
+
separately
|
| 29 |
+
moreover
|
| 30 |
+
but
|
| 31 |
+
they
|
| 32 |
+
yet
|
| 33 |
+
many
|
| 34 |
+
according
|
| 35 |
+
sales
|
| 36 |
+
among
|
| 37 |
+
meanwhile
|
| 38 |
+
even
|
| 39 |
+
i
|
nltk_data/tokenizers/punkt_tab/estonian/abbrev_types.txt
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
eos
|
| 2 |
+
c
|
| 3 |
+
a.d
|
| 4 |
+
t.a.s.s
|
| 5 |
+
e.t
|
| 6 |
+
päevapiltnikud
|
| 7 |
+
c.h
|
| 8 |
+
b.p
|
| 9 |
+
amm
|
| 10 |
+
ameerika-mees
|
| 11 |
+
n.-ö
|
| 12 |
+
cm
|
| 13 |
+
b
|
| 14 |
+
mhm
|
| 15 |
+
a.s
|
| 16 |
+
m.e
|
| 17 |
+
j.l
|
| 18 |
+
j
|
| 19 |
+
u.t
|
| 20 |
+
vm
|
| 21 |
+
g.u.n
|
| 22 |
+
hajutada
|
| 23 |
+
p.s
|
| 24 |
+
a.b
|
| 25 |
+
c.h.-r
|
| 26 |
+
i.q
|
| 27 |
+
gr
|
| 28 |
+
fido
|
| 29 |
+
pankurit
|
| 30 |
+
s.v
|
| 31 |
+
l.l
|
| 32 |
+
c.-h
|
| 33 |
+
m.h
|
| 34 |
+
h.l
|
| 35 |
+
m.k
|
| 36 |
+
j.r
|
| 37 |
+
t.k
|
| 38 |
+
k.h
|
| 39 |
+
89/90
|
| 40 |
+
h
|
| 41 |
+
a
|
| 42 |
+
dost
|
| 43 |
+
v.k
|
| 44 |
+
e.q
|
| 45 |
+
t.j
|
| 46 |
+
m.b
|
| 47 |
+
d
|
| 48 |
+
p.k
|
nltk_data/tokenizers/punkt_tab/estonian/collocations.tab
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
##number## juuni
|
| 2 |
+
##number## novembril
|
| 3 |
+
##number## juulilt
|
| 4 |
+
r järve-vomm
|
| 5 |
+
##number## mida
|
| 6 |
+
n liidu
|
| 7 |
+
##number## milliseid
|
| 8 |
+
##number## oktoobri
|
| 9 |
+
##number## iidol
|
| 10 |
+
m e
|
| 11 |
+
##number## klassist
|
| 12 |
+
##number## millest
|
| 13 |
+
##number## august
|
| 14 |
+
##number## pariis
|
| 15 |
+
##number## septembrist
|
| 16 |
+
##number## oktoober
|
| 17 |
+
##number## märtsini
|
| 18 |
+
##number## kust
|
| 19 |
+
k mägi
|
| 20 |
+
##number## detsembrist
|
| 21 |
+
##number## jaanuari
|
| 22 |
+
##number## epee
|
| 23 |
+
##number## nimetage
|
| 24 |
+
##number## novembrini
|
| 25 |
+
##number## eluaasta
|
| 26 |
+
s mill
|
| 27 |
+
##number## helsingi
|
| 28 |
+
##number## jaanuarini
|
| 29 |
+
##number## aastail
|
| 30 |
+
##number## augustil
|
| 31 |
+
##number## millise
|
| 32 |
+
##number## juulist
|
| 33 |
+
##number## mai
|
| 34 |
+
##number## novembri
|
| 35 |
+
##number## oktoobrist
|
| 36 |
+
##number## juunini
|
| 37 |
+
##number## septembriks
|
| 38 |
+
##number## detsembril
|
| 39 |
+
p s
|
| 40 |
+
##number## jaanuar
|
| 41 |
+
##number## aastate
|
| 42 |
+
##number## milline
|
| 43 |
+
##number## kelle
|
| 44 |
+
##number## jaanuaril
|
| 45 |
+
s stadnikov
|
| 46 |
+
##number## aastaks
|
| 47 |
+
##number## stockholm
|
| 48 |
+
##number## suurim
|
| 49 |
+
##number## aasta
|
| 50 |
+
##number## sajandi
|
| 51 |
+
##number## millega
|
| 52 |
+
##number## aastast
|
| 53 |
+
##number## aastal
|
| 54 |
+
##number## kumb
|
| 55 |
+
##number## septembril
|
| 56 |
+
##number## korruselt
|
| 57 |
+
##number## septembri
|
| 58 |
+
##number## veebruarini
|
| 59 |
+
##number## london
|
| 60 |
+
##number## aastatel
|
| 61 |
+
##number## september
|
| 62 |
+
##number## veebruari
|
| 63 |
+
##number## oktoobrini
|
| 64 |
+
##number## mail
|
| 65 |
+
m kassovitz
|
| 66 |
+
##number## action-film
|
| 67 |
+
##number## mis
|
| 68 |
+
k herkül
|
| 69 |
+
n n
|
| 70 |
+
##number## detsembrini
|
| 71 |
+
##number## imre
|
| 72 |
+
t jõgeda
|
| 73 |
+
##number## casino
|
| 74 |
+
##number## septembrit
|
| 75 |
+
##number## augustini
|
| 76 |
+
##number## juulil
|
| 77 |
+
##number## november
|
| 78 |
+
##number## kuupäeval
|
| 79 |
+
##number## taevas
|
| 80 |
+
##number## septembrini
|
| 81 |
+
##number## detsember
|
| 82 |
+
##number## detsembri
|
| 83 |
+
##number## juunil
|
| 84 |
+
##number## augustist
|
| 85 |
+
n jurist
|
| 86 |
+
##number## missugust
|
| 87 |
+
##number## aastatesse
|
| 88 |
+
##number## aprillil
|
| 89 |
+
##number## augusti
|
| 90 |
+
##number## oktoobril
|
| 91 |
+
##number## märtsil
|
| 92 |
+
##number## a
|
| 93 |
+
##number## the
|
| 94 |
+
##number## sajandil
|
| 95 |
+
##number## aastani
|
| 96 |
+
##number## juuli
|
| 97 |
+
##number## septembrile
|
| 98 |
+
##number## millist
|
| 99 |
+
##number## millised
|
| 100 |
+
##number## veebruaril
|
nltk_data/tokenizers/punkt_tab/estonian/ortho_context.tab
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nltk_data/tokenizers/punkt_tab/estonian/sent_starters.txt
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
kalad
|
| 2 |
+
õnneks
|
| 3 |
+
selle
|
| 4 |
+
maimu
|
| 5 |
+
teisipäeval
|
| 6 |
+
ma
|
| 7 |
+
skorpion
|
| 8 |
+
aga
|
| 9 |
+
lisaks
|
| 10 |
+
selleks
|
| 11 |
+
maitse
|
| 12 |
+
esiteks
|
| 13 |
+
erinevalt
|
| 14 |
+
pealegi
|
| 15 |
+
praegu
|
| 16 |
+
kas
|
| 17 |
+
tegelikult
|
| 18 |
+
neitsi
|
| 19 |
+
nädalavahetus
|
| 20 |
+
tema
|
| 21 |
+
kui
|
| 22 |
+
seega
|
| 23 |
+
täna
|
| 24 |
+
lugupidamisega
|
| 25 |
+
miks
|
| 26 |
+
teiseks
|
| 27 |
+
väldi
|
| 28 |
+
pohlak
|
| 29 |
+
osades
|
| 30 |
+
sõnn
|
| 31 |
+
samas
|
| 32 |
+
nimelt
|
| 33 |
+
juhtkiri
|
| 34 |
+
krimi
|
| 35 |
+
nädalavahetusel
|
| 36 |
+
näiteks
|
| 37 |
+
kuidas
|
| 38 |
+
ambur
|
| 39 |
+
telgmaa
|
| 40 |
+
laupäeval
|
| 41 |
+
seetõttu
|
| 42 |
+
rezhissöör
|
| 43 |
+
kahjuks
|
| 44 |
+
ent
|
| 45 |
+
samuti
|
| 46 |
+
ehkki
|
| 47 |
+
veevalaja
|
| 48 |
+
seepärast
|
| 49 |
+
muidugi
|
| 50 |
+
kuna
|
| 51 |
+
tänaseks
|
| 52 |
+
mina
|
| 53 |
+
loomulikult
|
| 54 |
+
ometi
|
| 55 |
+
arvamus
|
| 56 |
+
lõvi
|
| 57 |
+
ee
|
| 58 |
+
niisiis
|
| 59 |
+
mul
|
| 60 |
+
kaksikud
|
| 61 |
+
tõsi
|
| 62 |
+
hinnete
|
| 63 |
+
sestap
|
| 64 |
+
tõenäoliselt
|
| 65 |
+
samal
|
| 66 |
+
see
|
| 67 |
+
paraku
|
| 68 |
+
jäär
|
| 69 |
+
kokkuvõttes
|
| 70 |
+
küllap
|
| 71 |
+
muide
|
| 72 |
+
nüüd
|
| 73 |
+
kolmapäeval
|
| 74 |
+
võibolla
|
| 75 |
+
kuid
|
| 76 |
+
nädalavahetuse
|
| 77 |
+
kuigi
|
| 78 |
+
võid
|
| 79 |
+
lõpuks
|
| 80 |
+
kaalud
|
| 81 |
+
areen
|
| 82 |
+
kirjad
|
| 83 |
+
vähk
|
| 84 |
+
esmaspäeval
|
| 85 |
+
nii
|
| 86 |
+
need
|
| 87 |
+
uue
|
| 88 |
+
ta
|
| 89 |
+
minu
|
nltk_data/tokenizers/punkt_tab/finnish/abbrev_types.txt
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
t
|
| 2 |
+
suom
|
| 3 |
+
dr
|
| 4 |
+
st
|
| 5 |
+
970125090.jtun
|
| 6 |
+
p
|
| 7 |
+
sis
|
| 8 |
+
t.h
|
| 9 |
+
961221327.jtun
|
| 10 |
+
a.i
|
| 11 |
+
milj
|
| 12 |
+
ski
|
| 13 |
+
kp
|
| 14 |
+
970131067.jtun
|
| 15 |
+
970124030.jtun
|
| 16 |
+
nk
|
| 17 |
+
va
|
| 18 |
+
pan
|
| 19 |
+
yhteystiedot
|
| 20 |
+
ruots
|
| 21 |
+
jne
|
| 22 |
+
t.a
|
| 23 |
+
l.-g
|
| 24 |
+
k
|
| 25 |
+
j.w
|
| 26 |
+
p2
|
| 27 |
+
oik
|
| 28 |
+
970102248.jtun
|
| 29 |
+
hj
|
| 30 |
+
s
|
| 31 |
+
vt
|
| 32 |
+
muistelmia
|
| 33 |
+
o.s
|
| 34 |
+
elo
|
| 35 |
+
h
|
| 36 |
+
ortod
|
| 37 |
+
o.l
|
| 38 |
+
w
|
| 39 |
+
tms
|
| 40 |
+
970120219.jtun
|
| 41 |
+
pj
|
| 42 |
+
ok
|
| 43 |
+
toissapäiväinen
|
| 44 |
+
28.t1
|
| 45 |
+
pelintekijä
|
| 46 |
+
970111011.jtun
|
| 47 |
+
op
|
| 48 |
+
os
|
| 49 |
+
ns
|
| 50 |
+
m.g
|
| 51 |
+
o.-i
|
| 52 |
+
m3
|
| 53 |
+
pros
|
| 54 |
+
mr
|
| 55 |
+
970102171.jtun
|
| 56 |
+
waller
|
| 57 |
+
hels
|
| 58 |
+
rotary-järjestössä
|
| 59 |
+
ins
|
| 60 |
+
esim
|
| 61 |
+
apul
|
| 62 |
+
fil
|
| 63 |
+
id
|
| 64 |
+
ym
|
| 65 |
+
j
|
| 66 |
+
rf
|
| 67 |
+
v.o
|
| 68 |
+
lis
|
| 69 |
+
c.a
|
| 70 |
+
em
|
| 71 |
+
kand
|
| 72 |
+
r.y
|
| 73 |
+
valt
|
| 74 |
+
dipl
|
| 75 |
+
ö
|
| 76 |
+
970111092.jtun
|
| 77 |
+
ponteva
|
| 78 |
+
y
|
| 79 |
+
kapakoista
|
| 80 |
+
970130160.jtun
|
| 81 |
+
th
|
nltk_data/tokenizers/punkt_tab/finnish/collocations.tab
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
##number## sm
|
| 2 |
+
##number## ohjelmassa
|
| 3 |
+
##number## a3
|
| 4 |
+
##number## rc3
|
| 5 |
+
##number## rxd4
|
| 6 |
+
##number## hxg4
|
| 7 |
+
o stenberg
|
| 8 |
+
##number## lg5
|
| 9 |
+
##number## tallitontun
|
| 10 |
+
##number## lähetysohjeet
|
| 11 |
+
##number## uimakoulu
|
| 12 |
+
##number## jaana
|
| 13 |
+
##number## alustuksen
|
| 14 |
+
##number## uppo-nallen
|
| 15 |
+
##number## anne
|
| 16 |
+
##number## rxf3
|
| 17 |
+
a sjögren
|
| 18 |
+
##number## kamarikuoro
|
| 19 |
+
##number## vetäjänä
|
| 20 |
+
##number## pääsymaksu
|
| 21 |
+
##number## kerros
|
| 22 |
+
##number## kurssi
|
| 23 |
+
##number## kuori
|
| 24 |
+
##number## g4
|
| 25 |
+
##number## h3
|
| 26 |
+
##number## tiede-teatterissa
|
| 27 |
+
##number## kh2
|
| 28 |
+
##number## kausimaksu
|
| 29 |
+
##number## tia
|
| 30 |
+
##number## gxf5
|
| 31 |
+
##number## täky-galleria
|
| 32 |
+
##number## le2
|
| 33 |
+
##number## te8+
|
| 34 |
+
##number## la4
|
| 35 |
+
##number## keitä
|
| 36 |
+
##number## huhtikuuta
|
| 37 |
+
##number## menotiedoissa
|
| 38 |
+
##number## valmista
|
| 39 |
+
##number## txb5
|
| 40 |
+
##number## maskeerauskurssin
|
| 41 |
+
##number## rd2
|
| 42 |
+
##number## re2
|
| 43 |
+
##number## solisteina
|
| 44 |
+
##number## esitelmä
|
| 45 |
+
##number## puupiirrossarja
|
| 46 |
+
##number## ta1
|
| 47 |
+
##number## vaahdota
|
| 48 |
+
##number## h4
|
| 49 |
+
##number## kesäkuuta
|
| 50 |
+
##number## liikkeitä
|
| 51 |
+
##number## tuolloin
|
| 52 |
+
##number## viikko
|
| 53 |
+
##number## mittaa
|
| 54 |
+
a sjögrenin
|
| 55 |
+
##number## exf6
|
| 56 |
+
##number## rc6+
|
| 57 |
+
##number## viimeistele
|
| 58 |
+
##number## ld1
|
| 59 |
+
##number## elokuuta
|
| 60 |
+
##number## dh5+
|
| 61 |
+
##number## syyskuuta
|
| 62 |
+
##number## opettajina
|
| 63 |
+
##number## b3
|
| 64 |
+
##number## rauhankatu
|
| 65 |
+
c clarke
|
| 66 |
+
##number## saakka
|
| 67 |
+
##number## elokuvat
|
| 68 |
+
b huggins
|
| 69 |
+
g gahmberg
|
| 70 |
+
##number## luento
|
| 71 |
+
##number## lf3
|
| 72 |
+
##number## tammikuuta
|
| 73 |
+
##number## ryömä
|
| 74 |
+
##number## meller
|
| 75 |
+
##number## jäsenkortti
|
| 76 |
+
##number## esiintyjinä
|
| 77 |
+
##number## maria
|
| 78 |
+
##number## lf4
|
| 79 |
+
##number## siirto
|
| 80 |
+
##number## aurinko
|
| 81 |
+
##number## lxg6
|
| 82 |
+
##number## marraskuuta
|
| 83 |
+
##number## harjoituksissa
|
| 84 |
+
##number## romantika-yhtye
|
| 85 |
+
##number## g3
|
| 86 |
+
##number## heinäkuuta
|
| 87 |
+
##number## rxd5
|
| 88 |
+
##number## kuumenna
|
| 89 |
+
e hämäläisen
|
| 90 |
+
##number## bxc4
|
| 91 |
+
##number## te1
|
| 92 |
+
##number## kg2
|
| 93 |
+
##number## osallistumismaksu
|
| 94 |
+
##number## re5
|
| 95 |
+
##number## ohjelma
|
| 96 |
+
##number## varapuheenjohtajaksi
|
| 97 |
+
##number## raisa
|
| 98 |
+
##number## päivään
|
| 99 |
+
##number## luokan
|
| 100 |
+
##number## sulata
|
| 101 |
+
##number## levitä
|
| 102 |
+
##number## kaustinen
|
| 103 |
+
##number## kuoroa
|
| 104 |
+
##number## df3
|
| 105 |
+
v helsingistä
|
| 106 |
+
##number## mieskuoro
|
| 107 |
+
##number## lokakuuta
|
| 108 |
+
##number## kerho
|
| 109 |
+
##number## helmikuuta
|
| 110 |
+
##number## kokkola
|
| 111 |
+
##number## suuruusluokan
|
| 112 |
+
v kaupungista
|
| 113 |
+
##number## krs
|
| 114 |
+
##number## tekstit
|
| 115 |
+
##number## menyy
|
| 116 |
+
##number## rf3
|
| 117 |
+
##number## ulkoasiainministeriön
|
| 118 |
+
##number## kaada
|
| 119 |
+
##number## cxd5
|
| 120 |
+
##number## ilmailumuseo
|
| 121 |
+
e waris
|
| 122 |
+
##number## kierros
|
| 123 |
+
##number## tunnille
|
| 124 |
+
##number## kh3
|
| 125 |
+
##number## ohjaus
|
| 126 |
+
a t.
|
| 127 |
+
##number## postimaksu
|
| 128 |
+
##number## pane
|
| 129 |
+
##number## th3
|
| 130 |
+
##number## joulukuuta
|
| 131 |
+
##number## vatkaa
|
| 132 |
+
##number## kokeessa
|
| 133 |
+
l j.
|
| 134 |
+
##number## asti
|
| 135 |
+
##number## opastajana
|
| 136 |
+
##number## kirsi
|
| 137 |
+
##number## lc2
|
| 138 |
+
##number## lh2
|
| 139 |
+
##number## e4
|
| 140 |
+
##number## sairaankuljetukset
|
| 141 |
+
##number## sekoita
|
| 142 |
+
##number## mervi
|
| 143 |
+
##number## de2
|
| 144 |
+
a pietilän
|
| 145 |
+
##number## kf1
|
| 146 |
+
##number## toukokuuta
|
| 147 |
+
##number## maaliskuuta
|
| 148 |
+
##number## leikkaa
|
| 149 |
+
##number## ryhmänäytökset
|
| 150 |
+
v maaseudulta
|
| 151 |
+
##number## de3-e1
|
| 152 |
+
##number## c4
|
| 153 |
+
##number## ta1-b1
|
| 154 |
+
##number## d5
|
| 155 |
+
##number## pia
|
| 156 |
+
##number## lxd6
|
| 157 |
+
##number## d4
|
| 158 |
+
##number## f3-f4
|
| 159 |
+
##number## dxg6+
|
| 160 |
+
##number## sari
|
| 161 |
+
##number## pelkkään
|
| 162 |
+
##number## ld3
|
| 163 |
+
##number## perkaa
|
| 164 |
+
##number## lg3
|
| 165 |
+
##number## kg3
|
| 166 |
+
##number## kvm
|
| 167 |
+
##number## tb1xb6
|
nltk_data/tokenizers/punkt_tab/finnish/ortho_context.tab
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nltk_data/tokenizers/punkt_tab/finnish/sent_starters.txt
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
siinä
|
| 2 |
+
lämpötila
|
| 3 |
+
viiden
|
| 4 |
+
he
|
| 5 |
+
vapaa
|
| 6 |
+
viime
|
| 7 |
+
useimmat
|
| 8 |
+
kansallisooppera
|
| 9 |
+
rooleissa
|
| 10 |
+
näin
|
| 11 |
+
odotettavissa
|
| 12 |
+
tiedustelut
|
| 13 |
+
kansallisteatterin
|
| 14 |
+
sen
|
| 15 |
+
musiikki
|
| 16 |
+
monet
|
| 17 |
+
uusi
|
| 18 |
+
avoinna
|
| 19 |
+
pakkasta
|
| 20 |
+
freeze
|
| 21 |
+
tämä
|
| 22 |
+
lämpö
|
| 23 |
+
lautakunta
|
| 24 |
+
vastaväittäjänä
|
| 25 |
+
päivällä
|
| 26 |
+
tällä
|
| 27 |
+
esimerkiksi
|
| 28 |
+
varoituksia
|
| 29 |
+
merenkurkku
|
| 30 |
+
meriennuste
|
| 31 |
+
näyttelyssä
|
| 32 |
+
kun
|
| 33 |
+
pilvistä
|
| 34 |
+
silloin
|
| 35 |
+
selkämeren
|
| 36 |
+
suurin
|
| 37 |
+
se
|
| 38 |
+
jos
|
| 39 |
+
vaihtelevaa
|
| 40 |
+
vastaväittäjinä
|
| 41 |
+
sivu
|
| 42 |
+
kaupunginteatterin
|
| 43 |
+
pilvisyys
|
| 44 |
+
siellä
|
| 45 |
+
siksi
|
| 46 |
+
kurssimaksu
|
| 47 |
+
tämän
|
| 48 |
+
kotimaa
|
| 49 |
+
näiden
|
| 50 |
+
teatteri
|
| 51 |
+
kaikki
|
| 52 |
+
puolipilvistä
|
| 53 |
+
niiden
|
| 54 |
+
maksimilämpötila
|
| 55 |
+
lisäksi
|
| 56 |
+
kaupunginhallitus
|
| 57 |
+
helsingin
|
| 58 |
+
nyt
|
| 59 |
+
samalla
|
| 60 |
+
hänen
|
| 61 |
+
olen
|
| 62 |
+
kaupunkikierros
|
| 63 |
+
vastaväittäjä
|
| 64 |
+
ne
|
| 65 |
+
tästä
|
| 66 |
+
enimmäkseen
|
| 67 |
+
poika
|
| 68 |
+
niinpä
|
| 69 |
+
viirus
|
| 70 |
+
me
|
| 71 |
+
poliisi
|
| 72 |
+
liput
|
| 73 |
+
ilmoittautuminen
|
| 74 |
+
tarjoa
|
| 75 |
+
hän
|
| 76 |
+
molemmat
|
| 77 |
+
ulkomaat
|
| 78 |
+
rock
|
| 79 |
+
lääketieteen
|
| 80 |
+
tanssi
|
| 81 |
+
sainks
|
| 82 |
+
näyttely
|
| 83 |
+
lisätietoja
|
| 84 |
+
ulkomaiden
|
| 85 |
+
näyttelyn
|
| 86 |
+
palo
|
nltk_data/tokenizers/punkt_tab/french/abbrev_types.txt
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
p.o.l
|
| 2 |
+
pds
|
| 3 |
+
3o
|
| 4 |
+
inscr
|
| 5 |
+
suè
|
| 6 |
+
z
|
| 7 |
+
abst
|
| 8 |
+
g.-b
|
| 9 |
+
tél
|
| 10 |
+
r
|
| 11 |
+
ed
|
| 12 |
+
o
|
| 13 |
+
b
|
| 14 |
+
esp
|
| 15 |
+
j.l
|
| 16 |
+
v
|
| 17 |
+
k
|
| 18 |
+
e.p
|
| 19 |
+
aus
|
| 20 |
+
jap
|
| 21 |
+
r.e
|
| 22 |
+
gb-bel
|
| 23 |
+
p
|
| 24 |
+
aut
|
| 25 |
+
usx
|
| 26 |
+
arg
|
| 27 |
+
g
|
| 28 |
+
e
|
| 29 |
+
etc
|
| 30 |
+
fra
|
| 31 |
+
p.s
|
| 32 |
+
j.-l
|
| 33 |
+
blu
|
| 34 |
+
e.-u
|
| 35 |
+
f.b
|
| 36 |
+
msf
|
| 37 |
+
e.d
|
| 38 |
+
shi
|
| 39 |
+
can
|
| 40 |
+
j.b
|
| 41 |
+
s.a
|
| 42 |
+
f.o
|
| 43 |
+
you
|
| 44 |
+
mir
|
| 45 |
+
inc
|
| 46 |
+
ital
|
| 47 |
+
expr
|
| 48 |
+
tch
|
| 49 |
+
g-b-bel
|
| 50 |
+
cid
|
| 51 |
+
c.u
|
| 52 |
+
ctk
|
| 53 |
+
j.-m.g
|
| 54 |
+
bta
|
| 55 |
+
p.-b
|
| 56 |
+
cie
|
| 57 |
+
ita
|
| 58 |
+
equ
|
| 59 |
+
corp
|
| 60 |
+
vot
|
| 61 |
+
w
|
nltk_data/tokenizers/punkt_tab/french/collocations.tab
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
##number## shinozuka-magne
|
| 2 |
+
##number## ambrosino-baumgartner
|
| 3 |
+
c tanvier
|
| 4 |
+
f b.
|
| 5 |
+
##number## waldegaard-fenouil
|
| 6 |
+
##number## fermé
|
| 7 |
+
a dechaume
|
| 8 |
+
i demongeot
|
| 9 |
+
s motos
|
| 10 |
+
##number## rahier
|
| 11 |
+
##number## magnaldi
|
| 12 |
+
##number## orioli
|
| 13 |
+
f tél.
|
| 14 |
+
##number## cowan-delferrier
|
| 15 |
+
##number## vatanen-berglund
|
| 16 |
+
##number## picco
|
| 17 |
+
##number## masuoka-oligo
|
| 18 |
+
##number## medardo
|
nltk_data/tokenizers/punkt_tab/french/ortho_context.tab
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nltk_data/tokenizers/punkt_tab/french/sent_starters.txt
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
c
|
| 2 |
+
depuis
|
| 3 |
+
la
|
| 4 |
+
enfin
|
| 5 |
+
certains
|
| 6 |
+
selon
|
| 7 |
+
cet
|
| 8 |
+
car
|
| 9 |
+
ces
|
| 10 |
+
il
|
| 11 |
+
cependant
|
| 12 |
+
pour
|
| 13 |
+
j
|
| 14 |
+
alors
|
| 15 |
+
un
|
| 16 |
+
certes
|
| 17 |
+
les
|
| 18 |
+
nous
|
| 19 |
+
dans
|
| 20 |
+
le
|
| 21 |
+
une
|
| 22 |
+
si
|
| 23 |
+
mais
|
| 24 |
+
en
|
| 25 |
+
dès
|
| 26 |
+
or
|
| 27 |
+
tout
|
| 28 |
+
ils
|
| 29 |
+
l
|
| 30 |
+
mr
|
| 31 |
+
malgré
|
| 32 |
+
elles
|
| 33 |
+
né
|
| 34 |
+
je
|
| 35 |
+
on
|
| 36 |
+
quand
|
| 37 |
+
pourtant
|
| 38 |
+
cela
|
| 39 |
+
a
|
| 40 |
+
après
|
| 41 |
+
puis
|
| 42 |
+
ce
|
| 43 |
+
elle
|
| 44 |
+
voilà
|
| 45 |
+
cette
|
| 46 |
+
comment
|
| 47 |
+
quant
|
| 48 |
+
ainsi
|
nltk_data/tokenizers/punkt_tab/german/abbrev_types.txt
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
rfr
|
| 2 |
+
t
|
| 3 |
+
c
|
| 4 |
+
a.d
|
| 5 |
+
dk
|
| 6 |
+
he
|
| 7 |
+
mjm
|
| 8 |
+
inkl
|
| 9 |
+
bt
|
| 10 |
+
69f
|
| 11 |
+
crz
|
| 12 |
+
dr
|
| 13 |
+
st
|
| 14 |
+
ib
|
| 15 |
+
liv
|
| 16 |
+
mrd
|
| 17 |
+
n.r
|
| 18 |
+
rg
|
| 19 |
+
v
|
| 20 |
+
vgl
|
| 21 |
+
mgr
|
| 22 |
+
cs
|
| 23 |
+
prof
|
| 24 |
+
j
|
| 25 |
+
kfr
|
| 26 |
+
bd
|
| 27 |
+
fre
|
| 28 |
+
gfh
|
| 29 |
+
fon
|
| 30 |
+
m
|
| 31 |
+
rp
|
| 32 |
+
nr
|
| 33 |
+
chr
|
| 34 |
+
etc
|
| 35 |
+
hg
|
| 36 |
+
sx
|
| 37 |
+
rz
|
| 38 |
+
48f
|
| 39 |
+
kmu
|
| 40 |
+
abs
|
| 41 |
+
nkm
|
| 42 |
+
z.b
|
| 43 |
+
usw
|
| 44 |
+
f
|
| 45 |
+
d.h
|
| 46 |
+
lz
|
| 47 |
+
sc
|
| 48 |
+
usf
|
| 49 |
+
gir
|
| 50 |
+
hag
|
| 51 |
+
ff
|
| 52 |
+
mio
|
| 53 |
+
zr
|
| 54 |
+
k
|
| 55 |
+
h
|
| 56 |
+
mey
|
| 57 |
+
bst
|
| 58 |
+
ne
|
| 59 |
+
u.a
|
| 60 |
+
fem
|
| 61 |
+
bzw
|
| 62 |
+
bü
|
| 63 |
+
med
|
| 64 |
+
u
|
| 65 |
+
lts
|
| 66 |
+
fr
|
| 67 |
+
s.o.s
|
| 68 |
+
w
|
| 69 |
+
lib
|
| 70 |
+
k.a
|
| 71 |
+
th
|
nltk_data/tokenizers/punkt_tab/german/collocations.tab
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
##number## oktober
|
| 2 |
+
##number## jahrhunderts
|
| 3 |
+
##number## geburtstag
|
| 4 |
+
##number## juni
|
| 5 |
+
s ##number##
|
| 6 |
+
##number## september
|
| 7 |
+
##number## mai
|
| 8 |
+
##number## dezember
|
| 9 |
+
##number## april
|
| 10 |
+
##number## ahv-revision
|
| 11 |
+
##number## revision
|
| 12 |
+
##number## jahrhundert
|
| 13 |
+
##number## landwirtschaftsbericht
|
| 14 |
+
##number## altersjahr
|
| 15 |
+
##number## februar
|
| 16 |
+
a schumpeter
|
| 17 |
+
##number## freiheit
|
| 18 |
+
##number## august
|
| 19 |
+
##number## januar
|
| 20 |
+
##number## märz
|
| 21 |
+
a meyers
|
| 22 |
+
##number## november
|
| 23 |
+
##number## bauetappe
|
| 24 |
+
##number## ahv-
|
| 25 |
+
##number## eu-richtlinie
|
| 26 |
+
##number## juli
|
| 27 |
+
a meyer
|
| 28 |
+
##number## säule
|
nltk_data/tokenizers/punkt_tab/german/ortho_context.tab
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nltk_data/tokenizers/punkt_tab/german/sent_starters.txt
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
das
|
| 2 |
+
man
|
| 3 |
+
es
|
| 4 |
+
wir
|
| 5 |
+
dabei
|
| 6 |
+
ferner
|
| 7 |
+
ähnliches
|
| 8 |
+
während
|
| 9 |
+
entscheidend
|
| 10 |
+
ausserdem
|
| 11 |
+
ein
|
| 12 |
+
in
|
| 13 |
+
der
|
| 14 |
+
daraus
|
| 15 |
+
obschon
|
| 16 |
+
beide
|
| 17 |
+
hier
|
| 18 |
+
all
|
| 19 |
+
neben
|
| 20 |
+
solche
|
| 21 |
+
hingegen
|
| 22 |
+
selbstverständlich
|
| 23 |
+
daneben
|
| 24 |
+
hinzu
|
| 25 |
+
vielmehr
|
| 26 |
+
sie
|
| 27 |
+
natürlich
|
| 28 |
+
obwohl
|
| 29 |
+
nun
|
| 30 |
+
doch
|
| 31 |
+
ob
|
| 32 |
+
abgesehen
|
| 33 |
+
überdies
|
| 34 |
+
im
|
| 35 |
+
zweitens
|
| 36 |
+
darin
|
| 37 |
+
erstens
|
| 38 |
+
dieses
|
| 39 |
+
nach
|
| 40 |
+
wer
|
| 41 |
+
da
|
| 42 |
+
interessant
|
| 43 |
+
seit
|
| 44 |
+
zudem
|
| 45 |
+
darüber
|
| 46 |
+
umgekehrt
|
| 47 |
+
ähnlich
|
| 48 |
+
aber
|
| 49 |
+
was
|
| 50 |
+
nachdem
|
| 51 |
+
insbesondere
|
| 52 |
+
statt
|
| 53 |
+
angesichts
|
| 54 |
+
gefragt
|
| 55 |
+
gleiches
|
| 56 |
+
solange
|
| 57 |
+
wenn
|
| 58 |
+
dies
|
| 59 |
+
dass
|
| 60 |
+
wie
|
| 61 |
+
damit
|
| 62 |
+
allerdings
|
| 63 |
+
denn
|
| 64 |
+
letztere
|
| 65 |
+
eine
|
| 66 |
+
selbst
|
| 67 |
+
gleichzeitig
|
| 68 |
+
wo
|
| 69 |
+
weder
|
| 70 |
+
gerade
|
| 71 |
+
unter
|
| 72 |
+
problematischer
|
| 73 |
+
wieso
|
| 74 |
+
dennoch
|
| 75 |
+
bei
|
| 76 |
+
deshalb
|
| 77 |
+
davon
|
| 78 |
+
andernfalls
|
| 79 |
+
er
|
| 80 |
+
die
|
| 81 |
+
anders
|
| 82 |
+
auch
|
| 83 |
+
ebenso
|
| 84 |
+
so
|
| 85 |
+
inzwischen
|
| 86 |
+
sonst
|
| 87 |
+
immerhin
|
| 88 |
+
entsprechend
|
| 89 |
+
danach
|
| 90 |
+
am
|
| 91 |
+
trotz
|
| 92 |
+
trotzdem
|
| 93 |
+
worum
|
| 94 |
+
damals
|
| 95 |
+
dafür
|
| 96 |
+
schliesslich
|
| 97 |
+
gemäss
|
| 98 |
+
demgegenüber
|
| 99 |
+
warum
|
| 100 |
+
letzteres
|
| 101 |
+
mit
|
| 102 |
+
dazu
|
| 103 |
+
anderseits
|
| 104 |
+
ganz
|
| 105 |
+
zwar
|
| 106 |
+
dieser
|
| 107 |
+
diese
|
nltk_data/tokenizers/punkt_tab/greek/abbrev_types.txt
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
κλ
|
| 2 |
+
δημ
|
| 3 |
+
χλμ
|
| 4 |
+
σ.τ.ε
|
| 5 |
+
ό.π
|
| 6 |
+
δρχ
|
| 7 |
+
κων
|
| 8 |
+
χρ
|
| 9 |
+
π.α
|
| 10 |
+
ριχ
|
| 11 |
+
π.χρ
|
| 12 |
+
υγ
|
| 13 |
+
tel
|
| 14 |
+
ζ
|
| 15 |
+
ο.π
|
| 16 |
+
βασ
|
| 17 |
+
γλ
|
| 18 |
+
n.c
|
| 19 |
+
d.j
|
| 20 |
+
σωκ
|
| 21 |
+
π
|
| 22 |
+
ιω
|
| 23 |
+
αχ
|
| 24 |
+
βα
|
| 25 |
+
γερ
|
| 26 |
+
εκδ
|
| 27 |
+
κλπ
|
| 28 |
+
φ
|
| 29 |
+
ελ
|
| 30 |
+
οσ
|
| 31 |
+
α
|
| 32 |
+
σελ
|
| 33 |
+
ευ
|
| 34 |
+
ε.έ
|
| 35 |
+
ρ
|
| 36 |
+
ε.τ.α
|
| 37 |
+
λ
|
| 38 |
+
εβ
|
| 39 |
+
θρ
|
| 40 |
+
ν
|
| 41 |
+
βλ
|
| 42 |
+
ηλ
|
| 43 |
+
γ
|
| 44 |
+
αρ
|
| 45 |
+
π.χ
|
| 46 |
+
ε.μ
|
| 47 |
+
κ.μ
|
| 48 |
+
α.ε
|
| 49 |
+
μιχ
|
| 50 |
+
δισ
|
| 51 |
+
ολ
|
| 52 |
+
μ
|
| 53 |
+
κ.ά
|
| 54 |
+
κ
|
| 55 |
+
δηλ
|
| 56 |
+
ε.α.χ
|
| 57 |
+
πρ
|
| 58 |
+
αγ
|
| 59 |
+
μac
|
| 60 |
+
κ.ο.κ
|
| 61 |
+
λ.χ
|
| 62 |
+
θ
|
| 63 |
+
αδσ
|
| 64 |
+
εκατ
|
| 65 |
+
δρη
|
| 66 |
+
εμμ
|
| 67 |
+
δ
|
| 68 |
+
δεκ
|
| 69 |
+
σ.σ
|
| 70 |
+
55ο
|
| 71 |
+
κκ
|
| 72 |
+
αδ
|
| 73 |
+
τ.μ
|
| 74 |
+
ε.ε
|
| 75 |
+
μ.χ
|
| 76 |
+
ν.μ
|
| 77 |
+
κτλ
|
| 78 |
+
δολ
|
| 79 |
+
κ.ά.π
|
| 80 |
+
αγγ
|
| 81 |
+
μ.κ
|
| 82 |
+
δ.σ
|
| 83 |
+
μπ
|
| 84 |
+
έκδ
|
| 85 |
+
ι
|
| 86 |
+
v
|
| 87 |
+
χαρ
|
| 88 |
+
γρ
|
| 89 |
+
μ.μ.ε
|
| 90 |
+
σχ
|
| 91 |
+
λεκ
|
| 92 |
+
σπ
|
| 93 |
+
πλι
|
| 94 |
+
αθ
|
| 95 |
+
χ
|
| 96 |
+
τζ
|
| 97 |
+
τρισ
|
| 98 |
+
στ
|
| 99 |
+
ευθ
|
| 100 |
+
μ.μ
|
nltk_data/tokenizers/punkt_tab/greek/collocations.tab
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
β δερτιλής
|
| 2 |
+
##number## φιλάρετος
|
| 3 |
+
ο gehry
|
| 4 |
+
η αβεε
|
| 5 |
+
##number## βλ.
|
| 6 |
+
β παπανδρέου
|
| 7 |
+
σ μ.
|
nltk_data/tokenizers/punkt_tab/greek/ortho_context.tab
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nltk_data/tokenizers/punkt_tab/greek/sent_starters.txt
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
οπως
|
| 2 |
+
πάντως
|
| 3 |
+
δεύτερον
|
| 4 |
+
παράλληλα
|
| 5 |
+
οχι
|
| 6 |
+
ειδικότερα
|
| 7 |
+
τι
|
| 8 |
+
επίσης
|
| 9 |
+
ωστόσο
|
| 10 |
+
ενας
|
| 11 |
+
ηδη
|
| 12 |
+
σύμφωνα
|
| 13 |
+
συγκεκριμένα
|
| 14 |
+
ηταν
|
| 15 |
+
εκεί
|
| 16 |
+
αλλωστε
|
| 17 |
+
πολλοί
|
| 18 |
+
διότι
|
| 19 |
+
οποιος
|
| 20 |
+
τρίτον
|
| 21 |
+
πώς
|
| 22 |
+
ολοι
|
| 23 |
+
ισως
|
| 24 |
+
ο
|
| 25 |
+
ολες
|
| 26 |
+
οι
|
| 27 |
+
γιατί
|
| 28 |
+
αλλοι
|
| 29 |
+
οσοι
|
| 30 |
+
αυτό
|
| 31 |
+
τα
|
| 32 |
+
ολα
|
| 33 |
+
ετσι
|
| 34 |
+
ενα
|
| 35 |
+
πράγματι
|
| 36 |
+
αλλά
|
| 37 |
+
επιπλέον
|
| 38 |
+
δεν
|
| 39 |
+
εχει
|
| 40 |
+
αντίθετα
|
| 41 |
+
οσον
|
| 42 |
+
γι
|
| 43 |
+
αντιθέτως
|
| 44 |
+
ας
|
| 45 |
+
η
|
| 46 |
+
πρόκειται
|
| 47 |
+
αρα
|
| 48 |
+
οσο
|
| 49 |
+
αν
|
| 50 |
+
μετά
|
| 51 |
+
εξάλλου
|
| 52 |
+
το
|
| 53 |
+
οταν
|
| 54 |
+
πέρα
|
nltk_data/tokenizers/punkt_tab/italian/abbrev_types.txt
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
t
|
| 2 |
+
b.p
|
| 3 |
+
cer
|
| 4 |
+
sik
|
| 5 |
+
'ing
|
| 6 |
+
dr
|
| 7 |
+
p.m
|
| 8 |
+
st
|
| 9 |
+
t.t.c
|
| 10 |
+
a.r
|
| 11 |
+
p
|
| 12 |
+
ecc
|
| 13 |
+
t.b
|
| 14 |
+
tel
|
| 15 |
+
etc
|
| 16 |
+
'on
|
| 17 |
+
mb
|
| 18 |
+
g.dol
|
| 19 |
+
g.d.g
|
| 20 |
+
sè»
|
| 21 |
+
m.p
|
| 22 |
+
b.b
|
| 23 |
+
vs
|
| 24 |
+
s.p.a
|
| 25 |
+
g.b
|
| 26 |
+
v6
|
| 27 |
+
ipp
|
| 28 |
+
s.r
|
| 29 |
+
r.c
|
| 30 |
+
moz
|
| 31 |
+
n.f
|
| 32 |
+
s.mr
|
| 33 |
+
c.s
|
| 34 |
+
g.i.p
|
| 35 |
+
r.i
|
| 36 |
+
a.g
|
| 37 |
+
rc
|
| 38 |
+
'è»
|
| 39 |
+
1-o
|
| 40 |
+
e.p
|
| 41 |
+
m.g.b
|
| 42 |
+
gen
|
| 43 |
+
i.e
|
| 44 |
+
s.a
|
| 45 |
+
vic
|
| 46 |
+
g.gi
|
| 47 |
+
c»
|
| 48 |
+
m.cas
|
| 49 |
+
re.po
|
| 50 |
+
giri/min
|
| 51 |
+
e.i
|
| 52 |
+
mrs
|
| 53 |
+
w
|
| 54 |
+
n.d.r
|
| 55 |
+
l4ª
|
| 56 |
+
bad
|
| 57 |
+
p.l.f
|
| 58 |
+
dur
|
| 59 |
+
s.l
|
| 60 |
+
t.s
|
| 61 |
+
wwf
|
| 62 |
+
u.q
|
| 63 |
+
lod
|
| 64 |
+
b.col
|
| 65 |
+
prof
|
| 66 |
+
n.s
|
| 67 |
+
ii.dd
|
| 68 |
+
a.f
|
| 69 |
+
c.i
|
| 70 |
+
op
|
| 71 |
+
end
|
| 72 |
+
g
|
| 73 |
+
'u.s
|
| 74 |
+
o.b
|
| 75 |
+
t.t
|
| 76 |
+
s.m
|
| 77 |
+
ing
|
| 78 |
+
shi
|
| 79 |
+
oren
|
| 80 |
+
m.l
|
| 81 |
+
f.l.l
|
| 82 |
+
mr
|
| 83 |
+
jvp
|
| 84 |
+
fia
|
| 85 |
+
pag
|
| 86 |
+
e.c
|
| 87 |
+
g.p
|
| 88 |
+
pp
|
| 89 |
+
u
|
| 90 |
+
p.d.v
|
| 91 |
+
c.cer
|
| 92 |
+
cod
|
| 93 |
+
d.p.r
|
| 94 |
+
e.t
|
| 95 |
+
e.st
|
| 96 |
+
h.c
|
| 97 |
+
z
|
| 98 |
+
r
|
| 99 |
+
c.n.r
|
| 100 |
+
o.r
|
| 101 |
+
mons
|
| 102 |
+
j
|
| 103 |
+
jr
|
| 104 |
+
kin
|
| 105 |
+
v6»
|
| 106 |
+
g.p.s
|
| 107 |
+
l.z
|
| 108 |
+
c.a
|
| 109 |
+
m.f
|
| 110 |
+
sig
|
| 111 |
+
s.r.l
|
| 112 |
+
riz
|
| 113 |
+
f
|
| 114 |
+
m.s
|
| 115 |
+
c.c
|
| 116 |
+
l.p
|
| 117 |
+
f.ama
|
| 118 |
+
pi
|
| 119 |
+
s.c
|
| 120 |
+
p.d.p
|
| 121 |
+
ta
|
| 122 |
+
di»
|
| 123 |
+
r.e.s
|
| 124 |
+
n.d
|
| 125 |
+
p2»
|
nltk_data/tokenizers/punkt_tab/italian/collocations.tab
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
n ##number##
|
| 2 |
+
s pietro
|
| 3 |
+
s francisco
|
| 4 |
+
c wolf
|
| 5 |
+
s maria
|
| 6 |
+
a r.
|
nltk_data/tokenizers/punkt_tab/italian/ortho_context.tab
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nltk_data/tokenizers/punkt_tab/italian/sent_starters.txt
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
poi
|
| 2 |
+
c
|
| 3 |
+
la
|
| 4 |
+
temperatura
|
| 5 |
+
oggi
|
| 6 |
+
ebbene
|
| 7 |
+
infine
|
| 8 |
+
ieri
|
| 9 |
+
ecco
|
| 10 |
+
il
|
| 11 |
+
questo
|
| 12 |
+
quando
|
| 13 |
+
se
|
| 14 |
+
anche
|
| 15 |
+
e
|
| 16 |
+
ma
|
| 17 |
+
perciò
|
| 18 |
+
in
|
| 19 |
+
visibilità
|
| 20 |
+
gli
|
| 21 |
+
insomma
|
| 22 |
+
le
|
| 23 |
+
nel
|
| 24 |
+
lo
|
| 25 |
+
ora
|
| 26 |
+
adesso
|
| 27 |
+
venti
|
| 28 |
+
l
|
| 29 |
+
ci
|
| 30 |
+
per
|
| 31 |
+
inoltre
|
| 32 |
+
ho
|
| 33 |
+
secondo
|
| 34 |
+
non
|
| 35 |
+
così
|
| 36 |
+
intanto
|
| 37 |
+
certo
|
| 38 |
+
dopo
|
| 39 |
+
i
|
| 40 |
+
eppure
|
nltk_data/tokenizers/punkt_tab/malayalam/abbrev_types.txt
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
പി.ജെ
|
| 2 |
+
var
|
| 3 |
+
ഡി.വൈ
|
| 4 |
+
സിഹാനൂക്കാണ്
|
| 5 |
+
എന്നാണറിയപ്പെട്ടത്
|
| 6 |
+
മു
|
| 7 |
+
ബി.ആർ
|
| 8 |
+
സി.ടി
|
| 9 |
+
വി.പി
|
| 10 |
+
u.s
|
| 11 |
+
എം.എം
|
| 12 |
+
ഏ.ഓ
|
| 13 |
+
എ.എം
|
| 14 |
+
ജെ.ബി
|
| 15 |
+
കെ.എൽ
|
| 16 |
+
ഉയർത്തപ്പെട്ടു
|
| 17 |
+
o8
|
| 18 |
+
ജെ.ഡി
|
| 19 |
+
പാരീസിലേക്കയച്ചു
|
| 20 |
+
വിവാഹംകഴിച്ചു
|
| 21 |
+
ഒ.എ
|
| 22 |
+
മുസ്ലിങ്ങളാണ്
|
| 23 |
+
ടി.പി
|
| 24 |
+
ഒ.സി
|
| 25 |
+
ആർ.എം
|
| 26 |
+
കൃ
|
| 27 |
+
മറിച്ചാണ്
|
| 28 |
+
മാറ്റിയിരിക്കണം
|
| 29 |
+
ജി.വി
|
| 30 |
+
കെ.ജെ
|
| 31 |
+
കെ.ബി
|
| 32 |
+
സി.വി
|
| 33 |
+
ഒ.ആർ
|
| 34 |
+
വിഭജിക്കപ്പെട്ടു
|
| 35 |
+
ജെ.ആർ.എം
|
| 36 |
+
അഭിപ്രായപ്പെട്ടിരുന്നു
|
| 37 |
+
എം.ഒ
|
| 38 |
+
ശ
|
| 39 |
+
എൽ.കെ
|
| 40 |
+
കെ.ഇ
|
| 41 |
+
ബി.എ.സി
|
| 42 |
+
ടി.ഡി
|
| 43 |
+
അക്രമോത്സുകരാക്കി
|
| 44 |
+
എസ്.എൽ
|
| 45 |
+
തെ
|
| 46 |
+
എ.പി.ഐ
|
| 47 |
+
ചലച്ചിത്രം,പുസ്തകം,സി.ഡി,ഡി.വി.ഡി
|
| 48 |
+
ഡെ.റി
|
| 49 |
+
എൽ.എ
|
| 50 |
+
ഇ.എൻ
|
| 51 |
+
k.n
|
| 52 |
+
ഒ.എൻ
|
| 53 |
+
വി.ഖു
|
| 54 |
+
ഫെ
|
| 55 |
+
eur
|
| 56 |
+
vol
|
| 57 |
+
d.c
|
| 58 |
+
വി.എം.എസ്
|
| 59 |
+
ജി.കെ
|
| 60 |
+
സി.സി.ഡി
|
| 61 |
+
എം.ജി.ആർ
|
| 62 |
+
ഡോ.ടി
|
| 63 |
+
മേൻമ
|
| 64 |
+
എളുപ്പവഴിയാണിത്
|
| 65 |
+
j.s
|
| 66 |
+
കെ.ഡി
|
| 67 |
+
ഇ.ഡി
|
| 68 |
+
കൊ
|
| 69 |
+
ഇ.വി
|
| 70 |
+
a.k.a
|
| 71 |
+
ആർ.സി
|
| 72 |
+
ഐ.എസ്.പി
|
| 73 |
+
ഇ.എ
|
| 74 |
+
ബി.എൻ
|
| 75 |
+
എ.ഇ
|
| 76 |
+
ഡോ.എ
|
| 77 |
+
വി.എം
|
| 78 |
+
കലാശിച്ചുളളു
|
| 79 |
+
എം.എൻ
|
| 80 |
+
എ.ഒ
|
| 81 |
+
എ.പി.ജെ
|
| 82 |
+
പി.എ
|
| 83 |
+
ഓ.എം
|
| 84 |
+
ടി.എൻ
|
| 85 |
+
ഘ.മീ
|
| 86 |
+
ജൂല
|
| 87 |
+
പി.ജി
|
| 88 |
+
ഒക്കാറ
|
| 89 |
+
സി.ഒ
|
| 90 |
+
ജി.ഇ
|
| 91 |
+
കെ.ആർ
|
| 92 |
+
ഉ.സാ.ഘ
|
| 93 |
+
ആഗ
|
| 94 |
+
ഒ.ടി.ജി
|
| 95 |
+
ആ.സാ
|
| 96 |
+
സിംഹാസനമേറി
|
| 97 |
+
എ.യു
|
| 98 |
+
സി.എൽ
|
| 99 |
+
സു
|
| 100 |
+
ഐ.വി
|
| 101 |
+
ജി.ഐ
|
| 102 |
+
അവതരിപ്പിക്കാറുള്ള
|
| 103 |
+
ഒ.ടി
|
| 104 |
+
trin
|
| 105 |
+
എൻ.വി
|
| 106 |
+
രചനകളിലാണ്
|
| 107 |
+
ഏ.കെ
|
| 108 |
+
എ.ജെ
|
| 109 |
+
ല.സാ.ഗു
|
| 110 |
+
എൻ.ഡി.ആർ.എഫ്
|
| 111 |
+
k.m
|
| 112 |
+
എഫ്.ആർ
|
| 113 |
+
തെ.കി
|
| 114 |
+
എം.എൽ
|
| 115 |
+
എൻ.ആർ
|
| 116 |
+
ഐ.ഡി
|
| 117 |
+
ജെ.ആർ
|
| 118 |
+
അനുവാദമുണ്ടായിരുന്നു
|
| 119 |
+
സി.എസ്.എസ്
|
| 120 |
+
ആണ
|
| 121 |
+
ടി.ടി
|
| 122 |
+
etc
|
| 123 |
+
കെ.എ
|
| 124 |
+
എറിഞ്ഞുകളയപ്പെടുന്നു
|
| 125 |
+
ട്
|
| 126 |
+
നടപ്പായി
|
| 127 |
+
നിരോധിച്ചിരിക്കുകയാണ്
|
| 128 |
+
എൽ.എസ്.ഡി
|
| 129 |
+
ഇ.ജെ
|
| 130 |
+
ആരാഞ്ഞു
|
| 131 |
+
എൽ.എൻ
|
| 132 |
+
ജെ.കെ
|
| 133 |
+
ജനു
|
| 134 |
+
യു.ബി
|
| 135 |
+
പുറ
|
| 136 |
+
എസ്.വി.ജി
|
| 137 |
+
പി.ഒ
|
| 138 |
+
എഫ്.ടി.പി
|
| 139 |
+
ഐ.സി
|
| 140 |
+
വ.പ
|
| 141 |
+
കെ.എൻ
|
| 142 |
+
ജൂ
|
| 143 |
+
ഖു
|
| 144 |
+
എ.ആർ.എം
|
| 145 |
+
യു.ആർ
|
| 146 |
+
ഐ.എൻ.എ
|
| 147 |
+
കലനം
|
| 148 |
+
ഐ.ആർ.ഡി.പി
|
| 149 |
+
ആവശ്യപ്പെടു
|
| 150 |
+
ജെ.ആർ.ഡി
|
| 151 |
+
ഒ.വി
|
| 152 |
+
പി.എം
|
| 153 |
+
എ.ബി
|
| 154 |
+
ജി.എ
|
| 155 |
+
എസ്.എഫ്.ടി.പി
|
| 156 |
+
ജെ.ആർ.കെ
|
| 157 |
+
ബി.ഇ.എം.എൽ
|
| 158 |
+
ചേരൂ
|
| 159 |
+
സി.ഇ
|
| 160 |
+
ഡി.വി
|
| 161 |
+
ജി.എം.പി
|
| 162 |
+
7ൽ
|
| 163 |
+
യു.സി
|
| 164 |
+
എൽ.ജെ
|
| 165 |
+
വി.ടി
|
| 166 |
+
ഐ.കെ
|
| 167 |
+
എം.ഐ
|
| 168 |
+
പി.ഐ
|
| 169 |
+
ടി.എസ്
|
| 170 |
+
ഐ.ടി.ബി.പി
|
| 171 |
+
മി.മീ
|
| 172 |
+
കെ.എസ്
|
| 173 |
+
എൽ.ടി
|
| 174 |
+
ഇ.ബി
|
| 175 |
+
യു.എ
|
| 176 |
+
ഇ.ടി
|
| 177 |
+
i.e
|
| 178 |
+
ഇ.എ
|
| 179 |
+
വ്യക്തമാക്കുന്നത്
|
| 180 |
+
എം.ഈ
|
| 181 |
+
a.h
|
| 182 |
+
ഇ.സി
|
| 183 |
+
4k.m
|
| 184 |
+
'എ.ആർ
|
| 185 |
+
എ.ജി
|
| 186 |
+
തി.ക
|
| 187 |
+
ഇ.എം
|
| 188 |
+
പി.എൻ
|
| 189 |
+
പുസ്തകങ്ങളുണ്ട്
|
| 190 |
+
എ.എൻ
|
| 191 |
+
ഒത്തുതീർപ്പിലെത്തി
|
| 192 |
+
ശ.ശ
|
| 193 |
+
r.i
|
| 194 |
+
mz+
|
| 195 |
+
അക്ഷാ
|
| 196 |
+
t.n
|
| 197 |
+
രേഖാംശത്തിലാണ്
|
| 198 |
+
m.b.b.s
|
| 199 |
+
എസ്.എം.പി.എസ്
|
| 200 |
+
ഈ.മ.യൌ
|
| 201 |
+
ഡി.വി.ഡി
|
| 202 |
+
സി.എൻ.ആർ
|
| 203 |
+
ഇ.ഒ
|
| 204 |
+
ലഫ്
|
| 205 |
+
ഐ.ജി
|
| 206 |
+
എ.ഐ.കെ.എസ്
|
| 207 |
+
ബി.എഡ്
|
| 208 |
+
റുദ്
|
| 209 |
+
കെ.എസ്.യു
|
| 210 |
+
apk
|
| 211 |
+
എ.വി.എം
|
| 212 |
+
ഏ.സി
|
| 213 |
+
fol
|
| 214 |
+
നല്ലതാണ്
|
| 215 |
+
mt
|
| 216 |
+
ഒ.കെ
|
| 217 |
+
ബി.ഇ.എൽ
|
| 218 |
+
എം.വി
|
| 219 |
+
04മ.34.4മി
|
| 220 |
+
പി.ഡി
|
| 221 |
+
വി.ഐ
|
| 222 |
+
എസ്.എസ്.ബി
|
| 223 |
+
കോജ
|
| 224 |
+
ഡി.എ.വി
|
| 225 |
+
വ്യക്തമല്ല
|
| 226 |
+
എ.എൽ
|
| 227 |
+
ഏ.എൻ
|
| 228 |
+
സൃഷ്ടിച്ചിരുന്നു
|
| 229 |
+
ഡി.സി
|
| 230 |
+
എ
|
| 231 |
+
ഗ
|
| 232 |
+
ഐ.എം
|
| 233 |
+
ജെ.ഇ
|
| 234 |
+
ഐ.എസ്.ഒ
|
| 235 |
+
w.h
|
| 236 |
+
oh
|
| 237 |
+
നടുക
|
| 238 |
+
ഏറ
|
| 239 |
+
എൻ.സി.സി
|
| 240 |
+
എ.ആർ
|
| 241 |
+
സി.കെ
|
| 242 |
+
സ്ഥാപിച്ചെടുത്തു
|
| 243 |
+
എ.എ
|
| 244 |
+
ഖ
|
| 245 |
+
മി.ലി
|
| 246 |
+
തെ.പ
|
| 247 |
+
†y
|
| 248 |
+
ടേൽ
|
| 249 |
+
പി.ആർ
|
| 250 |
+
ബി.ജെ
|
| 251 |
+
ed
|
| 252 |
+
ഒ.ഇ.എം
|
| 253 |
+
15എ
|
| 254 |
+
ഐ.ആർ
|
| 255 |
+
എം.യു
|
| 256 |
+
co
|
| 257 |
+
എം.ആർ
|
| 258 |
+
വി.എ
|
| 259 |
+
ഡി.കെ
|
| 260 |
+
എ
|
| 261 |
+
ശ.മാ
|
| 262 |
+
നല
|
| 263 |
+
വളരും
|
| 264 |
+
ഐ.ആർ.ഡി
|
| 265 |
+
mr
|
| 266 |
+
ബി.സി.ഇ
|
| 267 |
+
എച്ച്.എ.എൽ.,എൻ.എ.എൽ
|
| 268 |
+
ടി.എ
|
| 269 |
+
ജെ.ജെ
|
| 270 |
+
ഓ.എസ്
|
| 271 |
+
ആർ.ഒ
|
| 272 |
+
ഇ.പി
|
| 273 |
+
ഉപയോഗിച്ചുപോന്നിട്ടുണ്ടു്
|
| 274 |
+
ഏ.ആർ
|
| 275 |
+
അഡ്വ
|
| 276 |
+
എസ്.പി.സി.കെ
|
| 277 |
+
m.f
|
| 278 |
+
ജി.സി
|
| 279 |
+
ജി.പി
|
| 280 |
+
എ.ഐ.എ.ഡി.എം.കെ
|
| 281 |
+
ഒ.പി
|
| 282 |
+
ബി.എച്ച്.ഇ.എൽ
|
| 283 |
+
വി.ആർ
|
| 284 |
+
6k.m
|
| 285 |
+
a.a.k
|
nltk_data/tokenizers/punkt_tab/malayalam/collocations.tab
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ഐ എ
|
| 2 |
+
ഇ സുലൈമാൻ
|
| 3 |
+
ഇ ആർ
|
| 4 |
+
##number## ശേഖരിച്ചത്
|
| 5 |
+
ഐ ഐ
|
| 6 |
+
##number## രഹസ്യാത്മകത
|
| 7 |
+
ന 20-ന്
|
| 8 |
+
പ പാർനസസ്
|
| 9 |
+
ഇ സന്തോഷ്
|
| 10 |
+
h asquith
|
| 11 |
+
ഏ 29-ന്
|
| 12 |
+
##number## ക്ലയന്റ്
|
| 13 |
+
എ ഇബ്രാഹിം
|
| 14 |
+
ഒ മാധവൻ
|
| 15 |
+
എ ഡി
|
| 16 |
+
##number## ആർകിടെക്ചർ
|
| 17 |
+
ഇ കോളിയിൽ
|
| 18 |
+
ഐ ആസ്കാർ
|
| 19 |
+
എ താണുപിള്ളയുടെ
|
| 20 |
+
c camelus
|
| 21 |
+
ഇ എം
|
| 22 |
+
##number## മുതുമല
|
| 23 |
+
ഇ കോളി
|
| 24 |
+
ഒ അബ്ദുറഹ്മാൻ
|
| 25 |
+
എ രാമചന്ദ്രൻ
|
| 26 |
+
ഐ എം
|
| 27 |
+
c massaicus
|
| 28 |
+
എ രാമചന്ദ്രൻനായർ
|
| 29 |
+
ഇ വെബ്ബിന്റെ
|
| 30 |
+
##number## ml
|
| 31 |
+
##number## ഉമ്മാമ
|
| 32 |
+
ഇ സി
|
| 33 |
+
##number## ഏകീകരിക്കപ്പെട്ട
|
| 34 |
+
ഐ രണ്ടായി
|
| 35 |
+
##number## ജ്ഞാനകർമവിഭാഗയോഗം
|
| 36 |
+
എ എസ്
|
| 37 |
+
ഐ സി
|
| 38 |
+
എ എൻ
|
| 39 |
+
##number## ഖ്വള
|
| 40 |
+
ഒ അബ്ദുറഹ്മാന്
|
| 41 |
+
##number## പക്ഷിമനുഷ്യൻ
|
| 42 |
+
e see
|
| 43 |
+
c molybdophanes
|
| 44 |
+
ഐ ഷണ്മുഖദാസ്
|
| 45 |
+
വ കി.-തെ.
|
| 46 |
+
c syriacus
|
| 47 |
+
ഏ 26-ന്
|
| 48 |
+
പ നിന്നു
|
| 49 |
+
b english
|
| 50 |
+
ഒ ഹെൻറിയുടെ
|
| 51 |
+
വ പ്രദേശങ്ങളിൽ
|
| 52 |
+
എ കോളേജ്
|
| 53 |
+
വ കിഴക്കൻ
|
| 54 |
+
ഇ ഒ,യും
|
| 55 |
+
എ ആഡംസാണ്
|
| 56 |
+
ഐ ബി.എം
|
| 57 |
+
എ ഐ
|
| 58 |
+
എ ആർ
|
| 59 |
+
ഒ അബ്ദുള്ളയുടെ
|
| 60 |
+
##number## അർജ്ജുനവിഷാദയോഗം
|
| 61 |
+
n sircar
|
| 62 |
+
ഇ പുന്നൻ
|
| 63 |
+
ന 24-ന്
|
| 64 |
+
വ നിന്നു
|
| 65 |
+
c ronaldo
|
| 66 |
+
c whether
|
| 67 |
+
v prasad
|
| 68 |
+
ഐ ടി
|
| 69 |
+
ച കി.മീ
|
| 70 |
+
എ എൽ
|
| 71 |
+
അ ഉ
|
| 72 |
+
ഇ കെ
|
| 73 |
+
വ ഭാഗത്തുള്ള
|
| 74 |
+
##number## ഓട്ടോമൊബൈൽ
|
| 75 |
+
എ കണാരൻ
|
| 76 |
+
ഒ എൻ
|
| 77 |
+
##number## യാലല്ല
|
| 78 |
+
വ അമേരിക്കയിൽനിന്നും
|
| 79 |
+
b r
|
| 80 |
+
f salsa
|
| 81 |
+
c australus
|
| 82 |
+
എ വിൻസെന്റ്
|
| 83 |
+
എ കുഞ്ഞുകൃഷ്ണൻ
|
| 84 |
+
എ എം
|
| 85 |
+
എ കോളി
|
| 86 |
+
ഒ ഓലെസ്സും
|
| 87 |
+
എ ഷെരീഫ്
|
| 88 |
+
ഒ അബ്ദുറഹ്മാൻ
|
| 89 |
+
എ 13-ആം
|
| 90 |
+
l v
|
| 91 |
+
d the
|
| 92 |
+
##number## വിവരാവകാശനിയമം
|
| 93 |
+
##number## naturally
|
| 94 |
+
ഇ ചന്ദ്രശേഖരൻ
|
| 95 |
+
ഒ കോരൻ
|
| 96 |
+
ക ഉത്തർപ്രദേശിന്റെ
|
| 97 |
+
എ സുബാസ്കാരൻ
|
| 98 |
+
വ ഭാഗത്ത്
|
| 99 |
+
l indicus
|
| 100 |
+
ഒ മാധവന്റെ
|
| 101 |
+
ഇ യോനാത്ത്
|
| 102 |
+
h h
|
| 103 |
+
ച കി
|
| 104 |
+
വ കി
|
| 105 |
+
ഐ എ.
|
| 106 |
+
ഐ ഇ
|
| 107 |
+
##number## നിർമ്മാണാവകാശം
|
| 108 |
+
##number## bibcode
|
| 109 |
+
ഇ ഒ
|
| 110 |
+
b n
|
| 111 |
+
##number## 32-ബിറ്റുള്ള
|
| 112 |
+
ഒ എം
|
| 113 |
+
ഇ പത്മനാഭൻ
|
| 114 |
+
p p
|
| 115 |
+
a r
|
| 116 |
+
ഐ ഒ
|
| 117 |
+
എ സഹദേവൻ
|
| 118 |
+
f piliferus
|
| 119 |
+
ഐ ടി.കളും
|
| 120 |
+
a baileyana
|
| 121 |
+
ഒ രാജഗോപാൽ
|
| 122 |
+
s c
|
| 123 |
+
ഐ ആറിന്റെ
|
| 124 |
+
എ ടി
|
| 125 |
+
പ ദിശയിൽ
|
| 126 |
+
പ ഈഗാലിയോസ്
|
| 127 |
+
എ ഭീം
|
| 128 |
+
g kilgour
|
| 129 |
+
ഒ എ
|
| 130 |
+
എ അയ്യപ്പൻ
|
| 131 |
+
##number## മൂലതാളിൽ
|
| 132 |
+
ഇ ഒന്നാം
|
| 133 |
+
ഒ 11-ന്
|
| 134 |
+
r chopra
|
| 135 |
+
വ പ
|
| 136 |
+
എ കെ
|
| 137 |
+
എ വി
|
| 138 |
+
ഇ എസ്
|
| 139 |
+
##number## c4
|
| 140 |
+
എ ഗ്രിഫിൻ
|
| 141 |
+
d ramanaidu
|
| 142 |
+
എ ഹൈൻലൈൻ
|
| 143 |
+
r rahman
|
| 144 |
+
##number## ജംദേന
|
| 145 |
+
എ ഗോപാലകൃഷ്ണൻ
|
| 146 |
+
എ എച്ച്
|
| 147 |
+
എ ഹേമചന്ദ്രൻ
|
| 148 |
+
l ഇംഗ്ലീഷിൽ
|
| 149 |
+
ഏ ആർ
|
| 150 |
+
##number## സൂക്തങ്ങൾ
|
| 151 |
+
എ അച്യുതൻവക്കീലിന്റെ
|
| 152 |
+
ഏ ശ്രീധരമേനോൻ
|
| 153 |
+
ഇ ബി
|
nltk_data/tokenizers/punkt_tab/malayalam/ortho_context.tab
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nltk_data/tokenizers/punkt_tab/malayalam/sent_starters.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
പല
|
| 2 |
+
ഇവ
|
| 3 |
+
http
|
| 4 |
+
അവ
|
| 5 |
+
ആ
|
| 6 |
+
ആൺ
|
| 7 |
+
പഴയ
|
| 8 |
+
ഏൽ
|
| 9 |
+
ഈ
|
| 10 |
+
coordinates
|
| 11 |
+
അവർ
|
| 12 |
+
helier
|
| 13 |
+
പകൽ
|
| 14 |
+
ഇവർ
|
nltk_data/tokenizers/punkt_tab/norwegian/abbrev_types.txt
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
t
|
| 2 |
+
p.p
|
| 3 |
+
bk
|
| 4 |
+
cc
|
| 5 |
+
pga
|
| 6 |
+
e.e.o
|
| 7 |
+
o.h
|
| 8 |
+
dr
|
| 9 |
+
st
|
| 10 |
+
uh
|
| 11 |
+
kk
|
| 12 |
+
t.d
|
| 13 |
+
h.m
|
| 14 |
+
p
|
| 15 |
+
adm
|
| 16 |
+
nr
|
| 17 |
+
etc
|
| 18 |
+
t.h
|
| 19 |
+
dæhlie-triumf
|
| 20 |
+
ev
|
| 21 |
+
udv
|
| 22 |
+
anm
|
| 23 |
+
ø
|
| 24 |
+
osv
|
| 25 |
+
dm
|
| 26 |
+
hi
|
| 27 |
+
b.b
|
| 28 |
+
inc
|
| 29 |
+
r.c
|
| 30 |
+
d.v.s
|
| 31 |
+
ce
|
| 32 |
+
fr
|
| 33 |
+
chr
|
| 34 |
+
adm.dir
|
| 35 |
+
m.a
|
| 36 |
+
b
|
| 37 |
+
p.t
|
| 38 |
+
m.v
|
| 39 |
+
k
|
| 40 |
+
m.m
|
| 41 |
+
i.l
|
| 42 |
+
mill
|
| 43 |
+
h.e
|
| 44 |
+
d.å
|
| 45 |
+
bl.a
|
| 46 |
+
i.h.h.t
|
| 47 |
+
mrs
|
| 48 |
+
b.i.t
|
| 49 |
+
sam.pol
|
| 50 |
+
o.l
|
| 51 |
+
w
|
| 52 |
+
jfr
|
| 53 |
+
h.g
|
| 54 |
+
str
|
| 55 |
+
mil.org
|
| 56 |
+
c.j
|
| 57 |
+
sifre
|
| 58 |
+
l.t
|
| 59 |
+
t.v
|
| 60 |
+
ex
|
| 61 |
+
gj.v
|
| 62 |
+
pr
|
| 63 |
+
d.y
|
| 64 |
+
j.o
|
| 65 |
+
g.c
|
| 66 |
+
avd
|
| 67 |
+
o.s.v
|
| 68 |
+
pol
|
| 69 |
+
ca
|
| 70 |
+
f.eks
|
| 71 |
+
tjenesteforsømmelse
|
| 72 |
+
mr
|
| 73 |
+
d.c
|
| 74 |
+
sam
|
| 75 |
+
.e
|
| 76 |
+
h.h.v
|
| 77 |
+
f.v
|
| 78 |
+
fenomen
|
| 79 |
+
kl
|
| 80 |
+
hr
|
| 81 |
+
c.h
|
| 82 |
+
miljøvernavdelingen
|
| 83 |
+
h.c
|
| 84 |
+
startstreken
|
| 85 |
+
r
|
| 86 |
+
o.a
|
| 87 |
+
mrd
|
| 88 |
+
a.s
|
| 89 |
+
j.v
|
| 90 |
+
j
|
| 91 |
+
jr
|
| 92 |
+
f.w
|
| 93 |
+
kfr
|
| 94 |
+
l.h
|
| 95 |
+
bås
|
| 96 |
+
schlickernrieder
|
| 97 |
+
f.-eks
|
| 98 |
+
f
|
| 99 |
+
5.n7
|
| 100 |
+
c.c
|
| 101 |
+
fung
|
| 102 |
+
dvs
|
| 103 |
+
d.e
|
| 104 |
+
wc
|
| 105 |
+
f.å
|
| 106 |
+
th
|