Spaces:
Runtime error
Runtime error
Commit
·
92e7a8f
0
Parent(s):
tech-chat rag first push
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +37 -0
- Dockerfile +15 -0
- README.md +10 -0
- app.py +196 -0
- nltk_data/tokenizers/punkt.zip +3 -0
- nltk_data/tokenizers/punkt/.DS_Store +0 -0
- nltk_data/tokenizers/punkt/PY3/README +98 -0
- nltk_data/tokenizers/punkt/PY3/czech.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/danish.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/dutch.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/english.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/estonian.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/finnish.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/french.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/german.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/greek.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/italian.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/malayalam.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/norwegian.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/polish.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/portuguese.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/russian.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/slovene.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/spanish.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/swedish.pickle +3 -0
- nltk_data/tokenizers/punkt/PY3/turkish.pickle +3 -0
- nltk_data/tokenizers/punkt/README +98 -0
- nltk_data/tokenizers/punkt/czech.pickle +3 -0
- nltk_data/tokenizers/punkt/danish.pickle +3 -0
- nltk_data/tokenizers/punkt/dutch.pickle +3 -0
- nltk_data/tokenizers/punkt/english.pickle +3 -0
- nltk_data/tokenizers/punkt/estonian.pickle +3 -0
- nltk_data/tokenizers/punkt/finnish.pickle +3 -0
- nltk_data/tokenizers/punkt/french.pickle +3 -0
- nltk_data/tokenizers/punkt/german.pickle +3 -0
- nltk_data/tokenizers/punkt/greek.pickle +3 -0
- nltk_data/tokenizers/punkt/italian.pickle +3 -0
- nltk_data/tokenizers/punkt/malayalam.pickle +3 -0
- nltk_data/tokenizers/punkt/norwegian.pickle +3 -0
- nltk_data/tokenizers/punkt/polish.pickle +3 -0
- nltk_data/tokenizers/punkt/portuguese.pickle +3 -0
- nltk_data/tokenizers/punkt/russian.pickle +3 -0
- nltk_data/tokenizers/punkt/slovene.pickle +3 -0
- nltk_data/tokenizers/punkt/spanish.pickle +3 -0
- nltk_data/tokenizers/punkt/swedish.pickle +3 -0
- nltk_data/tokenizers/punkt/turkish.pickle +3 -0
- nltk_data/tokenizers/punkt_tab.zip +3 -0
- nltk_data/tokenizers/punkt_tab/README +98 -0
- nltk_data/tokenizers/punkt_tab/czech/abbrev_types.txt +118 -0
- nltk_data/tokenizers/punkt_tab/czech/collocations.tab +96 -0
.gitattributes
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
LMI[[:space:]]HUMAN[[:space:]]RESOURCE[[:space:]]POLICY.pdf filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
LMI-HUMAN-RESOURCE-POLICY.pdf filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# syntax=docker/dockerfile:1
FROM python:3.11-slim

WORKDIR /app

# git is kept in case requirements.txt pulls any VCS-sourced packages.
RUN apt-get update && apt-get install -y --no-install-recommends \
      git \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies first so this layer stays cached until
# requirements.txt itself changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Application code, data files, and the bundled NLTK tokenizer data.
COPY app.py .
COPY Admission_Requirement.pdf POSTGRADUATE_ADMISSIONS.pdf cleaned-dataset.jsonl ./
COPY nltk_data /app/nltk_data

# Hugging Face Spaces may run the container as an arbitrary non-root user, so
# the directories the app writes to at runtime (HF_HOME/XDG cache and TMPDIR,
# see app.py) must be world-writable. Restrict the permission widening to just
# those directories instead of `chmod -R 777 /app`.
RUN mkdir -p /app/cache /app/tmp \
    && chmod 644 Admission_Requirement.pdf POSTGRADUATE_ADMISSIONS.pdf cleaned-dataset.jsonl \
    && chmod -R 777 /app/cache /app/tmp

# Documentation only: the Spaces runtime expects the app on port 7860.
EXPOSE 7860
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Hr Policy Bot
|
| 3 |
+
emoji: 💻
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: pink
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import nltk
|
| 3 |
+
import logging
|
| 4 |
+
import json
|
| 5 |
+
import numpy as np
|
| 6 |
+
from sklearn.cluster import KMeans
|
| 7 |
+
# Use the nltk_data directory bundled into the image (see Dockerfile) instead
# of downloading tokenizer models at runtime.
nltk.data.path.append("/app/nltk_data")

# Redirect Hugging Face / generic caches and temp files into app-local
# directories that are made writable by the Dockerfile; the container user may
# not be able to write to the default home or /tmp locations.
os.environ["HF_HOME"] = "/app/cache"
os.environ["XDG_CACHE_HOME"] = "/app/cache"
os.environ["TMPDIR"] = "/app/tmp"

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
from fastapi import FastAPI, HTTPException
|
| 17 |
+
from pydantic import BaseModel
|
| 18 |
+
from langchain_community.document_loaders import PyPDFLoader
|
| 19 |
+
from langchain_community.vectorstores import FAISS
|
| 20 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 21 |
+
from langchain.prompts import PromptTemplate
|
| 22 |
+
from langchain.chains import LLMChain
|
| 23 |
+
from langchain_core.output_parsers import StrOutputParser
|
| 24 |
+
from langchain_community.retrievers import BM25Retriever
|
| 25 |
+
from langchain.retrievers import EnsembleRetriever
|
| 26 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 27 |
+
from langchain_core.documents import Document
|
| 28 |
+
|
| 29 |
+
# FastAPI application instance; served by uvicorn on port 7860 (see Dockerfile CMD).
app = FastAPI(title="HR Policy Bot")
|
| 30 |
+
|
| 31 |
+
class Question(BaseModel):
    # Request body for POST /ask: the user's question as plain text.
    text: str
|
| 33 |
+
|
| 34 |
+
def semantic_chunk_with_embeddings(documents, embeddings, max_chunk_size=1000, min_sentences=2, overlap_sentences=1):
|
| 35 |
+
"""Chunk documents into semantically related groups using embeddings and clustering."""
|
| 36 |
+
all_chunks = []
|
| 37 |
+
for doc in documents:
|
| 38 |
+
sentences = nltk.sent_tokenize(doc.page_content)
|
| 39 |
+
if len(sentences) < min_sentences:
|
| 40 |
+
all_chunks.append(Document(page_content=" ".join(sentences), metadata=doc.metadata))
|
| 41 |
+
continue
|
| 42 |
+
|
| 43 |
+
# Generate embeddings for each sentence
|
| 44 |
+
sentence_embeddings = embeddings.embed_documents(sentences)
|
| 45 |
+
sentence_embeddings = np.array(sentence_embeddings)
|
| 46 |
+
|
| 47 |
+
# Cluster sentences using KMeans (dynamically determine num clusters)
|
| 48 |
+
num_clusters = max(1, min(len(sentences) // min_sentences, 10)) # Cap at 10 clusters
|
| 49 |
+
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(sentence_embeddings)
|
| 50 |
+
labels = kmeans.labels_
|
| 51 |
+
|
| 52 |
+
# Group sentences by cluster
|
| 53 |
+
clusters = {}
|
| 54 |
+
for sentence, label in zip(sentences, labels):
|
| 55 |
+
if label not in clusters:
|
| 56 |
+
clusters[label] = []
|
| 57 |
+
clusters[label].append(sentence)
|
| 58 |
+
|
| 59 |
+
# Form chunks from clusters with overlap
|
| 60 |
+
for cluster_id, cluster_sentences in clusters.items():
|
| 61 |
+
current_chunk = ""
|
| 62 |
+
chunk_sentences = []
|
| 63 |
+
for i, sentence in enumerate(cluster_sentences):
|
| 64 |
+
if len(current_chunk) + len(sentence) < max_chunk_size:
|
| 65 |
+
current_chunk += sentence + " "
|
| 66 |
+
chunk_sentences.append(sentence)
|
| 67 |
+
else:
|
| 68 |
+
all_chunks.append(Document(page_content=current_chunk.strip(), metadata=doc.metadata))
|
| 69 |
+
# Add overlap
|
| 70 |
+
overlap = " ".join(chunk_sentences[-overlap_sentences:]) + " "
|
| 71 |
+
current_chunk = overlap + sentence + " "
|
| 72 |
+
chunk_sentences = chunk_sentences[-overlap_sentences:] + [sentence]
|
| 73 |
+
if current_chunk:
|
| 74 |
+
all_chunks.append(Document(page_content=current_chunk.strip(), metadata=doc.metadata))
|
| 75 |
+
|
| 76 |
+
return all_chunks
|
| 77 |
+
|
| 78 |
def load_rag_system():
    """Build the full RAG pipeline: LLM, embeddings, chunks, hybrid retriever, chain.

    Returns:
        tuple: ``(retriever, vectorstore, chain)`` where ``retriever`` is an
        EnsembleRetriever over FAISS + BM25, ``vectorstore`` is the FAISS index,
        and ``chain`` is the prompt -> Gemini -> string-parser LLMChain.

    Raises:
        Exception: re-raised if the Gemini chat model cannot be initialized
        (e.g. missing or invalid GOOGLE_API_KEY).
    """
    logger.info("Loading Gemini model...")
    try:
        llm = ChatGoogleGenerativeAI(
            model="gemini-2.0-flash",
            google_api_key=os.getenv("GOOGLE_API_KEY"),
            temperature=0.3,
            top_p=0.9,
            max_tokens=1024
        )
    except Exception as e:
        logger.error(f"Gemini loading failed: {str(e)}")
        raise

    # One embedding model shared by semantic chunking and FAISS retrieval.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Load and semantically chunk the source PDFs; missing files are skipped
    # with a warning rather than aborting startup.
    logger.info("Loading PDF...")
    pdf_paths = ["Admission_Requirement.pdf", "POSTGRADUATE_ADMISSIONS.pdf"]
    pages = []
    for path in pdf_paths:
        if os.path.exists(path):
            loader = PyPDFLoader(path)
            pages.extend(loader.load())
        else:
            logger.warning(f"PDF not found: {path}")

    pdf_docs = semantic_chunk_with_embeddings(pages, embeddings)

    # Load instruction/response pairs from JSONL, one JSON object per line.
    logger.info("Loading JSONL data...")
    jsonl_paths = ["cleaned-dataset.jsonl"]
    jsonl_docs = []
    for path in jsonl_paths:
        if os.path.exists(path):
            # Explicit encoding: the container locale's default is not guaranteed.
            with open(path, "r", encoding="utf-8") as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        # Bug fix: a blank or trailing newline used to crash
                        # json.loads("") and abort the whole startup.
                        continue
                    try:
                        data = json.loads(line)
                    except json.JSONDecodeError as e:
                        logger.warning(f"Skipping malformed JSONL line {line_num} in {path}: {e}")
                        continue
                    content = f"Instruction: {data['instruction']}\nResponse: {data['response']}"
                    jsonl_docs.append(Document(page_content=content, metadata={"source": "jsonl", "instruction": data["instruction"]}))
        else:
            logger.warning(f"JSONL not found: {path}")

    # Combine sources, de-duplicate by exact page content, and tag each
    # surviving document with a stable integer id.
    all_docs = pdf_docs + jsonl_docs
    unique_docs = list({doc.page_content: doc for doc in all_docs}.values())
    for i, doc in enumerate(unique_docs):
        doc.metadata["doc_id"] = i

    logger.info("Building vector store...")
    vectorstore = FAISS.from_documents(unique_docs, embedding=embeddings)
    faiss_retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
    bm25_retriever = BM25Retriever.from_documents(unique_docs)
    bm25_retriever.k = 4
    # Hybrid retrieval: equal weighting of dense (FAISS) and sparse (BM25) hits.
    retriever = EnsembleRetriever(retrievers=[faiss_retriever, bm25_retriever], weights=[0.5, 0.5])

    template = """
You are TechChat, an AI assistant created to provide accurate, concise, and helpful information about admissions to Kwame Nkrumah University of Science and Technology (KNUST). Your primary goal is to assist users with questions related to KNUST admissions, including application processes, requirements, deadlines, programs, and other relevant details.

### Instructions:
1. **KNUST Admissions Questions**: Use the provided context to answer questions about KNUST admissions clearly and accurately. If the context is sufficient, tailor your response to the specific details provided.
2. **Limited Context**: If the context lacks enough information to fully answer a KNUST admissions question, provide a general but accurate response based on your knowledge of KNUST admissions, and invite the user to provide more details for a more specific answer.
3. **Off-Topic Questions**: If the question is unrelated to KNUST admissions, respond politely with: "I'm sorry, that question is outside my focus on KNUST admissions. Feel free to ask about KNUST application processes, requirements, or programs, and I'll be happy to help!"
4. **Tone and Style**: Maintain a friendly, professional, and approachable tone. Avoid overly technical jargon unless necessary, and ensure responses are easy to understand.
5. **No Assumptions**: Do not invent information. If you cannot answer due to missing or unclear information, acknowledge it and encourage the user to clarify.

### Context:
{context}

### Question:
{question}

### Answer:
"""

    prompt = PromptTemplate.from_template(template)
    parser = StrOutputParser()
    chain = LLMChain(llm=llm, prompt=prompt, output_parser=parser)

    return retriever, vectorstore, chain
| 159 |
+
|
| 160 |
+
# Build the RAG pipeline once at import time so request handlers can reuse it;
# this makes startup slow but keeps per-request latency low.
logger.info("Initializing RAG system...")
retriever, vectorstore, chain = load_rag_system()
|
| 162 |
+
|
| 163 |
+
def rewrite_query(question):
    """Normalize known-ambiguous questions into a canonical retrieval query.

    Currently applies a single rule: a question mentioning both "dress" and
    "employees" (case-insensitively) is rewritten to an explicit dress-code
    query. Every other question passes through unchanged.
    """
    lowered = question.lower()
    if "dress" in lowered and "employees" in lowered:
        return "What is the dress code policy for employees regarding casual attire?"
    return question
|
| 167 |
+
|
| 168 |
+
@app.post("/ask")
async def ask(question: Question):
    """Answer a user question via the RAG pipeline.

    Rewrites the query, retrieves context with the hybrid retriever, applies a
    relevance gate based on the FAISS score of the top hit, then generates an
    answer with the LLM chain.

    Returns:
        dict: {"answer": str} — either the generated answer or a fallback
        message when the relevance gate rejects the question.

    Raises:
        HTTPException: 500 with the underlying error message on any failure.
    """
    try:
        logger.info(f"Received question: {question.text}")
        rewritten_question = rewrite_query(question.text)
        logger.info(f"Rewritten question: {rewritten_question}")
        context_docs = retriever.invoke(rewritten_question)
        logger.info(f"Retrieved {len(context_docs)} context documents")
        # Bug fix: the original recomputed the identical top-1 FAISS score once
        # per retrieved document (the loop variable was ignored); score it once.
        # NOTE(review): similarity_search_with_score returns a distance for the
        # default FAISS index (lower = more similar) — confirm the `< 0.25`
        # rejection threshold is oriented as intended.
        if context_docs:
            max_similarity = vectorstore.similarity_search_with_score(rewritten_question, k=1)[0][1]
        else:
            max_similarity = 0
        if max_similarity < 0.25:  # Lowered threshold slightly
            logger.info("Similarity too low, returning 'I don’t know'")
            return {"answer": "I'm not sure about that, but I'd be happy to help if you provide more details!"}
        context_text = "\n".join(doc.page_content for doc in context_docs)
        logger.info("Generating response...")
        response = chain.invoke({"context": context_text, "question": rewritten_question})
        # LLMChain returns a dict; keep only the text after the last "Answer:" marker.
        answer = response['text'].split("Answer:")[-1].strip()
        logger.info(f"Generated answer: {answer}")
        return {"answer": answer}
    except Exception as e:
        logger.error(f"Error processing question: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 190 |
+
|
| 191 |
+
@app.get("/")
async def root():
    """Liveness/landing endpoint: reports that the service is up."""
    status_message = "HR Policy Bot is running!"
    print('HR Policy Bot is actively running!')
    logger.info("Root endpoint accessed")
    return {"message": status_message}
|
| 196 |
+
|
nltk_data/tokenizers/punkt.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:51c3078994aeaf650bfc8e028be4fb42b4a0d177d41c012b6a983979653660ec
|
| 3 |
+
size 13905355
|
nltk_data/tokenizers/punkt/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
nltk_data/tokenizers/punkt/PY3/README
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
|
| 2 |
+
|
| 3 |
+
Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
|
| 4 |
+
been contributed by various people using NLTK for sentence boundary detection.
|
| 5 |
+
|
| 6 |
+
For information about how to use these models, please confer the tokenization HOWTO:
|
| 7 |
+
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
|
| 8 |
+
and chapter 3.8 of the NLTK book:
|
| 9 |
+
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
|
| 10 |
+
|
| 11 |
+
There are pretrained tokenizers for the following languages:
|
| 12 |
+
|
| 13 |
+
File Language Source Contents Size of training corpus(in tokens) Model contributed by
|
| 14 |
+
=======================================================================================================================================================================
|
| 15 |
+
czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
|
| 16 |
+
Literarni Noviny
|
| 17 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 18 |
+
danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
|
| 19 |
+
(Berlingske Avisdata, Copenhagen) Weekend Avisen
|
| 20 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 21 |
+
dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
|
| 22 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 23 |
+
english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
|
| 24 |
+
(American)
|
| 25 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 26 |
+
estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
|
| 27 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 28 |
+
finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
|
| 29 |
+
Text Bank (Suomen Kielen newspapers
|
| 30 |
+
Tekstipankki)
|
| 31 |
+
Finnish Center for IT Science
|
| 32 |
+
(CSC)
|
| 33 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 34 |
+
french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
|
| 35 |
+
(European)
|
| 36 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 37 |
+
german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
|
| 38 |
+
(Switzerland) CD-ROM
|
| 39 |
+
(Uses "ss"
|
| 40 |
+
instead of "ß")
|
| 41 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 42 |
+
greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
|
| 43 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 44 |
+
italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
|
| 45 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 46 |
+
norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
|
| 47 |
+
(Bokmål and Information Technologies,
|
| 48 |
+
Nynorsk) Bergen
|
| 49 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 50 |
+
polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
|
| 51 |
+
(http://www.nkjp.pl/)
|
| 52 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 53 |
+
portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
|
| 54 |
+
(Brazilian) (Linguateca)
|
| 55 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 56 |
+
slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
|
| 57 |
+
Slovene Academy for Arts
|
| 58 |
+
and Sciences
|
| 59 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 60 |
+
spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
|
| 61 |
+
(European)
|
| 62 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 63 |
+
swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
|
| 64 |
+
(and some other texts)
|
| 65 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 66 |
+
turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
|
| 67 |
+
(Türkçe Derlem Projesi)
|
| 68 |
+
University of Ankara
|
| 69 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 70 |
+
|
| 71 |
+
The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
|
| 72 |
+
Unicode using the codecs module.
|
| 73 |
+
|
| 74 |
+
Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
|
| 75 |
+
Computational Linguistics 32: 485-525.
|
| 76 |
+
|
| 77 |
+
---- Training Code ----
|
| 78 |
+
|
| 79 |
+
# import punkt
|
| 80 |
+
import nltk.tokenize.punkt
|
| 81 |
+
|
| 82 |
+
# Make a new Tokenizer
|
| 83 |
+
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
|
| 84 |
+
|
| 85 |
+
# Read in training corpus (one example: Slovene)
|
| 86 |
+
import codecs
|
| 87 |
+
text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
|
| 88 |
+
|
| 89 |
+
# Train tokenizer
|
| 90 |
+
tokenizer.train(text)
|
| 91 |
+
|
| 92 |
+
# Dump pickled tokenizer
|
| 93 |
+
import pickle
|
| 94 |
+
out = open("slovene.pickle","wb")
|
| 95 |
+
pickle.dump(tokenizer, out)
|
| 96 |
+
out.close()
|
| 97 |
+
|
| 98 |
+
---------
|
nltk_data/tokenizers/punkt/PY3/czech.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64b0734b6fbe8e8d7cac79f48d1dd9f853824e57c4e3594dadd74ba2c1d97f50
|
| 3 |
+
size 1119050
|
nltk_data/tokenizers/punkt/PY3/danish.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6189c7dd254e29e2bd406a7f6a4336297c8953214792466a790ea4444223ceb3
|
| 3 |
+
size 1191710
|
nltk_data/tokenizers/punkt/PY3/dutch.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fda0d6a13f02e8898daec7fe923da88e25abe081bcfa755c0e015075c215fe4c
|
| 3 |
+
size 693759
|
nltk_data/tokenizers/punkt/PY3/english.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5cad3758596392364e3be9803dbd7ebeda384b68937b488a01365f5551bb942c
|
| 3 |
+
size 406697
|
nltk_data/tokenizers/punkt/PY3/estonian.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b364f72538d17b146a98009ad239a8096ce6c0a8b02958c0bc776ecd0c58a25f
|
| 3 |
+
size 1499502
|
nltk_data/tokenizers/punkt/PY3/finnish.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6a4b5ff5500ee851c456f9dd40d5fc0d8c1859c88eb3178de1317d26b7d22833
|
| 3 |
+
size 1852226
|
nltk_data/tokenizers/punkt/PY3/french.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:28e3a4cd2971989b3cb9fd3433a6f15d17981e464db2be039364313b5de94f29
|
| 3 |
+
size 553575
|
nltk_data/tokenizers/punkt/PY3/german.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ddcbbe85e2042a019b1a6e37fd8c153286c38ba201fae0f5bfd9a3f74abae25c
|
| 3 |
+
size 1463575
|
nltk_data/tokenizers/punkt/PY3/greek.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:85dabc44ab90a5f208ef37ff6b4892ebe7e740f71fb4da47cfd95417ca3e22fd
|
| 3 |
+
size 876006
|
nltk_data/tokenizers/punkt/PY3/italian.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:68a94007b1e4ffdc4d1a190185ca5442c3dafeb17ab39d30329e84cd74a43947
|
| 3 |
+
size 615089
|
nltk_data/tokenizers/punkt/PY3/malayalam.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d
|
| 3 |
+
size 221207
|
nltk_data/tokenizers/punkt/PY3/norwegian.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4ff7a46d1438b311457d15d7763060b8d3270852c1850fd788c5cee194dc4a1d
|
| 3 |
+
size 1181271
|
nltk_data/tokenizers/punkt/PY3/polish.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:624900ae3ddfb4854a98c5d3b8b1c9bb719975f33fee61ce1441dab9f8a00718
|
| 3 |
+
size 1738386
|
nltk_data/tokenizers/punkt/PY3/portuguese.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:02a0b7b25c3c7471e1791b66a31bbb530afbb0160aee4fcecf0107652067b4a1
|
| 3 |
+
size 611919
|
nltk_data/tokenizers/punkt/PY3/russian.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:549762f8190024d89b511472df21a3a135eee5d9233e63ac244db737c2c61d7e
|
| 3 |
+
size 33020
|
nltk_data/tokenizers/punkt/PY3/slovene.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52ef2cc0ed27d79b3aa635cbbc40ad811883a75a4b8a8be1ae406972870fd864
|
| 3 |
+
size 734444
|
nltk_data/tokenizers/punkt/PY3/spanish.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:164a50fadc5a49f8ec7426eae11d3111ee752b48a3ef373d47745011192a5984
|
| 3 |
+
size 562337
|
nltk_data/tokenizers/punkt/PY3/swedish.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b0f7d538bfd5266633b09e842cd92e9e0ac10f1d923bf211e1497972ddc47318
|
| 3 |
+
size 979681
|
nltk_data/tokenizers/punkt/PY3/turkish.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ae68ef5863728ac5332e87eb1f6bae772ff32a13a4caa2b01a5c68103e853c5b
|
| 3 |
+
size 1017038
|
nltk_data/tokenizers/punkt/README
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
|
| 2 |
+
|
| 3 |
+
Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
|
| 4 |
+
been contributed by various people using NLTK for sentence boundary detection.
|
| 5 |
+
|
| 6 |
+
For information about how to use these models, please confer the tokenization HOWTO:
|
| 7 |
+
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
|
| 8 |
+
and chapter 3.8 of the NLTK book:
|
| 9 |
+
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
|
| 10 |
+
|
| 11 |
+
There are pretrained tokenizers for the following languages:
|
| 12 |
+
|
| 13 |
+
File Language Source Contents Size of training corpus(in tokens) Model contributed by
|
| 14 |
+
=======================================================================================================================================================================
|
| 15 |
+
czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
|
| 16 |
+
Literarni Noviny
|
| 17 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 18 |
+
danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
|
| 19 |
+
(Berlingske Avisdata, Copenhagen) Weekend Avisen
|
| 20 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 21 |
+
dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
|
| 22 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 23 |
+
english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
|
| 24 |
+
(American)
|
| 25 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 26 |
+
estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
|
| 27 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 28 |
+
finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
|
| 29 |
+
Text Bank (Suomen Kielen newspapers
|
| 30 |
+
Tekstipankki)
|
| 31 |
+
Finnish Center for IT Science
|
| 32 |
+
(CSC)
|
| 33 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 34 |
+
french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
|
| 35 |
+
(European)
|
| 36 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 37 |
+
german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
|
| 38 |
+
(Switzerland) CD-ROM
|
| 39 |
+
(Uses "ss"
|
| 40 |
+
instead of "ß")
|
| 41 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 42 |
+
greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
|
| 43 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 44 |
+
italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
|
| 45 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 46 |
+
norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
|
| 47 |
+
(Bokmål and Information Technologies,
|
| 48 |
+
Nynorsk) Bergen
|
| 49 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 50 |
+
polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
|
| 51 |
+
(http://www.nkjp.pl/)
|
| 52 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 53 |
+
portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
|
| 54 |
+
(Brazilian) (Linguateca)
|
| 55 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 56 |
+
slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
|
| 57 |
+
Slovene Academy for Arts
|
| 58 |
+
and Sciences
|
| 59 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 60 |
+
spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
|
| 61 |
+
(European)
|
| 62 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 63 |
+
swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
|
| 64 |
+
(and some other texts)
|
| 65 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 66 |
+
turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
|
| 67 |
+
(Türkçe Derlem Projesi)
|
| 68 |
+
University of Ankara
|
| 69 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 70 |
+
|
| 71 |
+
The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
|
| 72 |
+
Unicode using the codecs module.
|
| 73 |
+
|
| 74 |
+
Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
|
| 75 |
+
Computational Linguistics 32: 485-525.
|
| 76 |
+
|
| 77 |
+
---- Training Code ----
|
| 78 |
+
|
| 79 |
+
# import punkt
|
| 80 |
+
import nltk.tokenize.punkt
|
| 81 |
+
|
| 82 |
+
# Make a new Tokenizer
|
| 83 |
+
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
|
| 84 |
+
|
| 85 |
+
# Read in training corpus (one example: Slovene)
|
| 86 |
+
import codecs
|
| 87 |
+
text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
|
| 88 |
+
|
| 89 |
+
# Train tokenizer
|
| 90 |
+
tokenizer.train(text)
|
| 91 |
+
|
| 92 |
+
# Dump pickled tokenizer
|
| 93 |
+
import pickle
|
| 94 |
+
out = open("slovene.pickle","wb")
|
| 95 |
+
pickle.dump(tokenizer, out)
|
| 96 |
+
out.close()
|
| 97 |
+
|
| 98 |
+
---------
|
nltk_data/tokenizers/punkt/czech.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5ba73d293c7d7953956bcf02f3695ec5c1f0d527f2a3c38097f5593394fa1690
|
| 3 |
+
size 1265552
|
nltk_data/tokenizers/punkt/danish.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ea29760a0a9197f52ca59e78aeafc5a6f55d05258faf7db1709b2b9eb321ef20
|
| 3 |
+
size 1264725
|
nltk_data/tokenizers/punkt/dutch.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4a8e26b3d68c45c38e594d19e2d5677447bfdcaa636d3b1e7acfed0e9272d73c
|
| 3 |
+
size 742624
|
nltk_data/tokenizers/punkt/english.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dda37972ae88998a6fd3e3ec002697a6bd362b32d050fda7d7ca5276873092aa
|
| 3 |
+
size 433305
|
nltk_data/tokenizers/punkt/estonian.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3867fee26a36bdb197c64362aa13ac683f5f33fa4d0d225a5d56707582a55a1d
|
| 3 |
+
size 1596714
|
nltk_data/tokenizers/punkt/finnish.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1a9e17b3d5b4df76345d812b8a65b1da0767eda5086eadcc11e625eef0942835
|
| 3 |
+
size 1951656
|
nltk_data/tokenizers/punkt/french.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:de05f3d5647d3d2296626fb83f68428e4c6ad6e05a00ed4694c8bdc8f2f197ee
|
| 3 |
+
size 583482
|
nltk_data/tokenizers/punkt/german.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eab497fa085413130c8fd0fb13b929128930afe2f6a26ea8715c95df7088e97c
|
| 3 |
+
size 1526714
|
nltk_data/tokenizers/punkt/greek.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:21752a6762fad5cfe46fb5c45fad9a85484a0e8e81c67e6af6fb973cfc27d67c
|
| 3 |
+
size 1953106
|
nltk_data/tokenizers/punkt/italian.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dcb2717d7be5f26e860a92e05acf69b1123a5f4527cd7a269a9ab9e9e668c805
|
| 3 |
+
size 658331
|
nltk_data/tokenizers/punkt/malayalam.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d
|
| 3 |
+
size 221207
|
nltk_data/tokenizers/punkt/norwegian.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e4a97f8f9a03a0338dd746bcc89a0ae0f54ae43b835fa37d83e279e1ca794faf
|
| 3 |
+
size 1259779
|
nltk_data/tokenizers/punkt/polish.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:16127b6d10933427a3e90fb20e9be53e1fb371ff79a730c1030734ed80b90c92
|
| 3 |
+
size 2042451
|
nltk_data/tokenizers/punkt/portuguese.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bb01bf7c79a4eadc2178bbd209665139a0e4b38f2d1c44fef097de93955140e0
|
| 3 |
+
size 649051
|
nltk_data/tokenizers/punkt/russian.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bc984432fbe31f7000014f8047502476889169c60f09be5413ca09276b16c909
|
| 3 |
+
size 33027
|
nltk_data/tokenizers/punkt/slovene.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7dac650212b3787b39996c01bd2084115493e6f6ec390bab61f767525b08b8ea
|
| 3 |
+
size 832867
|
nltk_data/tokenizers/punkt/spanish.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:271dc6027c4aae056f72a9bfab5645cf67e198bf4f972895844e40f5989ccdc3
|
| 3 |
+
size 597831
|
nltk_data/tokenizers/punkt/swedish.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:40d50ebdad6caa87715f2e300b1217ec92c42de205a543cc4a56903bd2c9acfa
|
| 3 |
+
size 1034496
|
nltk_data/tokenizers/punkt/turkish.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d3ae47d76501d027698809d12e75292c9c392910488543342802f95db9765ccc
|
| 3 |
+
size 1225013
|
nltk_data/tokenizers/punkt_tab.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e57f64187974277726a3417ca6f181ec5403676c717672eef6a748a7b20e0106
|
| 3 |
+
size 4319076
|
nltk_data/tokenizers/punkt_tab/README
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
|
| 2 |
+
|
| 3 |
+
Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
|
| 4 |
+
been contributed by various people using NLTK for sentence boundary detection.
|
| 5 |
+
|
| 6 |
+
For information about how to use these models, please confer the tokenization HOWTO:
|
| 7 |
+
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
|
| 8 |
+
and chapter 3.8 of the NLTK book:
|
| 9 |
+
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
|
| 10 |
+
|
| 11 |
+
There are pretrained tokenizers for the following languages:
|
| 12 |
+
|
| 13 |
+
File Language Source Contents Size of training corpus(in tokens) Model contributed by
|
| 14 |
+
=======================================================================================================================================================================
|
| 15 |
+
czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
|
| 16 |
+
Literarni Noviny
|
| 17 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 18 |
+
danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
|
| 19 |
+
(Berlingske Avisdata, Copenhagen) Weekend Avisen
|
| 20 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 21 |
+
dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
|
| 22 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 23 |
+
english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
|
| 24 |
+
(American)
|
| 25 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 26 |
+
estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
|
| 27 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 28 |
+
finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
|
| 29 |
+
Text Bank (Suomen Kielen newspapers
|
| 30 |
+
Tekstipankki)
|
| 31 |
+
Finnish Center for IT Science
|
| 32 |
+
(CSC)
|
| 33 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 34 |
+
french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
|
| 35 |
+
(European)
|
| 36 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 37 |
+
german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
|
| 38 |
+
(Switzerland) CD-ROM
|
| 39 |
+
(Uses "ss"
|
| 40 |
+
instead of "ß")
|
| 41 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 42 |
+
greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
|
| 43 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 44 |
+
italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
|
| 45 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 46 |
+
norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
|
| 47 |
+
(Bokmål and Information Technologies,
|
| 48 |
+
Nynorsk) Bergen
|
| 49 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 50 |
+
polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
|
| 51 |
+
(http://www.nkjp.pl/)
|
| 52 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 53 |
+
portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
|
| 54 |
+
(Brazilian) (Linguateca)
|
| 55 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 56 |
+
slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
|
| 57 |
+
Slovene Academy for Arts
|
| 58 |
+
and Sciences
|
| 59 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 60 |
+
spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
|
| 61 |
+
(European)
|
| 62 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 63 |
+
swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
|
| 64 |
+
(and some other texts)
|
| 65 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 66 |
+
turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
|
| 67 |
+
(Türkçe Derlem Projesi)
|
| 68 |
+
University of Ankara
|
| 69 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 70 |
+
|
| 71 |
+
The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
|
| 72 |
+
Unicode using the codecs module.
|
| 73 |
+
|
| 74 |
+
Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
|
| 75 |
+
Computational Linguistics 32: 485-525.
|
| 76 |
+
|
| 77 |
+
---- Training Code ----
|
| 78 |
+
|
| 79 |
+
# import punkt
|
| 80 |
+
import nltk.tokenize.punkt
|
| 81 |
+
|
| 82 |
+
# Make a new Tokenizer
|
| 83 |
+
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
|
| 84 |
+
|
| 85 |
+
# Read in training corpus (one example: Slovene)
|
| 86 |
+
import codecs
|
| 87 |
+
text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
|
| 88 |
+
|
| 89 |
+
# Train tokenizer
|
| 90 |
+
tokenizer.train(text)
|
| 91 |
+
|
| 92 |
+
# Dump pickled tokenizer
|
| 93 |
+
import pickle
|
| 94 |
+
out = open("slovene.pickle","wb")
|
| 95 |
+
pickle.dump(tokenizer, out)
|
| 96 |
+
out.close()
|
| 97 |
+
|
| 98 |
+
---------
|
nltk_data/tokenizers/punkt_tab/czech/abbrev_types.txt
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
t
|
| 2 |
+
množ
|
| 3 |
+
např
|
| 4 |
+
j.h
|
| 5 |
+
man
|
| 6 |
+
ú
|
| 7 |
+
jug
|
| 8 |
+
dr
|
| 9 |
+
bl
|
| 10 |
+
ml
|
| 11 |
+
okr
|
| 12 |
+
st
|
| 13 |
+
uh
|
| 14 |
+
šp
|
| 15 |
+
judr
|
| 16 |
+
u.s.a
|
| 17 |
+
p
|
| 18 |
+
arg
|
| 19 |
+
žitě
|
| 20 |
+
st.celsia
|
| 21 |
+
etc
|
| 22 |
+
p.s
|
| 23 |
+
t.r
|
| 24 |
+
lok
|
| 25 |
+
mil
|
| 26 |
+
ict
|
| 27 |
+
n
|
| 28 |
+
tl
|
| 29 |
+
min
|
| 30 |
+
č
|
| 31 |
+
d
|
| 32 |
+
al
|
| 33 |
+
ravenně
|
| 34 |
+
mj
|
| 35 |
+
nar
|
| 36 |
+
plk
|
| 37 |
+
s.p
|
| 38 |
+
a.g
|
| 39 |
+
roč
|
| 40 |
+
b
|
| 41 |
+
zdi
|
| 42 |
+
r.s.c
|
| 43 |
+
přek
|
| 44 |
+
m
|
| 45 |
+
gen
|
| 46 |
+
csc
|
| 47 |
+
mudr
|
| 48 |
+
vic
|
| 49 |
+
š
|
| 50 |
+
sb
|
| 51 |
+
resp
|
| 52 |
+
tzn
|
| 53 |
+
iv
|
| 54 |
+
s.r.o
|
| 55 |
+
mar
|
| 56 |
+
w
|
| 57 |
+
čs
|
| 58 |
+
vi
|
| 59 |
+
tzv
|
| 60 |
+
ul
|
| 61 |
+
pen
|
| 62 |
+
zv
|
| 63 |
+
str
|
| 64 |
+
čp
|
| 65 |
+
org
|
| 66 |
+
rak
|
| 67 |
+
sv
|
| 68 |
+
pplk
|
| 69 |
+
u.s
|
| 70 |
+
prof
|
| 71 |
+
c.k
|
| 72 |
+
op
|
| 73 |
+
g
|
| 74 |
+
vii
|
| 75 |
+
kr
|
| 76 |
+
ing
|
| 77 |
+
j.o
|
| 78 |
+
drsc
|
| 79 |
+
m3
|
| 80 |
+
l
|
| 81 |
+
tr
|
| 82 |
+
ceo
|
| 83 |
+
ch
|
| 84 |
+
fuk
|
| 85 |
+
vl
|
| 86 |
+
viii
|
| 87 |
+
líp
|
| 88 |
+
hl.m
|
| 89 |
+
t.zv
|
| 90 |
+
phdr
|
| 91 |
+
o.k
|
| 92 |
+
tis
|
| 93 |
+
doc
|
| 94 |
+
kl
|
| 95 |
+
ard
|
| 96 |
+
čkd
|
| 97 |
+
pok
|
| 98 |
+
apod
|
| 99 |
+
r
|
| 100 |
+
př
|
| 101 |
+
a.s
|
| 102 |
+
j
|
| 103 |
+
jr
|
| 104 |
+
i.m
|
| 105 |
+
e
|
| 106 |
+
kupř
|
| 107 |
+
f
|
| 108 |
+
tř
|
| 109 |
+
xvi
|
| 110 |
+
mir
|
| 111 |
+
atď
|
| 112 |
+
vr
|
| 113 |
+
r.i.v
|
| 114 |
+
hl
|
| 115 |
+
kv
|
| 116 |
+
t.j
|
| 117 |
+
y
|
| 118 |
+
q.p.r
|
nltk_data/tokenizers/punkt_tab/czech/collocations.tab
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
i dejmala
|
| 2 |
+
##number## prosince
|
| 3 |
+
h steina
|
| 4 |
+
##number## listopadu
|
| 5 |
+
a dvořák
|
| 6 |
+
v klaus
|
| 7 |
+
i čnhl
|
| 8 |
+
##number## wladyslawowo
|
| 9 |
+
##number## letech
|
| 10 |
+
a jiráska
|
| 11 |
+
a dubček
|
| 12 |
+
##number## štrasburk
|
| 13 |
+
##number## juniorské
|
| 14 |
+
##number## století
|
| 15 |
+
##number## kola
|
| 16 |
+
##number## pád
|
| 17 |
+
##number## května
|
| 18 |
+
##number## týdne
|
| 19 |
+
v dlouhý
|
| 20 |
+
k design
|
| 21 |
+
##number## červenec
|
| 22 |
+
i ligy
|
| 23 |
+
##number## kolo
|
| 24 |
+
z svěrák
|
| 25 |
+
##number## mája
|
| 26 |
+
##number## šimková
|
| 27 |
+
a bělého
|
| 28 |
+
a bradáč
|
| 29 |
+
##number## ročníku
|
| 30 |
+
##number## dubna
|
| 31 |
+
a vivaldiho
|
| 32 |
+
v mečiara
|
| 33 |
+
c carrićre
|
| 34 |
+
##number## sjezd
|
| 35 |
+
##number## výroční
|
| 36 |
+
##number## kole
|
| 37 |
+
##number## narozenin
|
| 38 |
+
k maleevová
|
| 39 |
+
i čnfl
|
| 40 |
+
##number## pádě
|
| 41 |
+
##number## září
|
| 42 |
+
##number## výročí
|
| 43 |
+
a dvořáka
|
| 44 |
+
h g.
|
| 45 |
+
##number## ledna
|
| 46 |
+
a dvorský
|
| 47 |
+
h měsíc
|
| 48 |
+
##number## srpna
|
| 49 |
+
##number## tř.
|
| 50 |
+
a mozarta
|
| 51 |
+
##number## sudetoněmeckých
|
| 52 |
+
o sokolov
|
| 53 |
+
k škrach
|
| 54 |
+
v benda
|
| 55 |
+
##number## symfonie
|
| 56 |
+
##number## července
|
| 57 |
+
x šalda
|
| 58 |
+
c abrahama
|
| 59 |
+
a tichý
|
| 60 |
+
##number## místo
|
| 61 |
+
k bielecki
|
| 62 |
+
v havel
|
| 63 |
+
##number## etapu
|
| 64 |
+
a dubčeka
|
| 65 |
+
i liga
|
| 66 |
+
##number## světový
|
| 67 |
+
v klausem
|
| 68 |
+
##number## ženy
|
| 69 |
+
##number## létech
|
| 70 |
+
##number## minutě
|
| 71 |
+
##number## listopadem
|
| 72 |
+
##number## místě
|
| 73 |
+
o vlček
|
| 74 |
+
k peteraje
|
| 75 |
+
i sponzor
|
| 76 |
+
##number## června
|
| 77 |
+
##number## min.
|
| 78 |
+
##number## oprávněnou
|
| 79 |
+
##number## květnu
|
| 80 |
+
##number## aktu
|
| 81 |
+
##number## květnem
|
| 82 |
+
##number## října
|
| 83 |
+
i rynda
|
| 84 |
+
##number## února
|
| 85 |
+
i snfl
|
| 86 |
+
a mozart
|
| 87 |
+
z košler
|
| 88 |
+
a dvorskému
|
| 89 |
+
v marhoul
|
| 90 |
+
v mečiar
|
| 91 |
+
##number## ročník
|
| 92 |
+
##number## máje
|
| 93 |
+
v havla
|
| 94 |
+
k gott
|
| 95 |
+
s bacha
|
| 96 |
+
##number## ad
|