pfrimpong committed
Commit 92e7a8f · 0 parent(s)

tech-chat rag first push

This view is limited to 50 files because the commit contains too many changes.
Files changed (50):
  1. .gitattributes +37 -0
  2. Dockerfile +15 -0
  3. README.md +10 -0
  4. app.py +196 -0
  5. nltk_data/tokenizers/punkt.zip +3 -0
  6. nltk_data/tokenizers/punkt/.DS_Store +0 -0
  7. nltk_data/tokenizers/punkt/PY3/README +98 -0
  8. nltk_data/tokenizers/punkt/PY3/czech.pickle +3 -0
  9. nltk_data/tokenizers/punkt/PY3/danish.pickle +3 -0
  10. nltk_data/tokenizers/punkt/PY3/dutch.pickle +3 -0
  11. nltk_data/tokenizers/punkt/PY3/english.pickle +3 -0
  12. nltk_data/tokenizers/punkt/PY3/estonian.pickle +3 -0
  13. nltk_data/tokenizers/punkt/PY3/finnish.pickle +3 -0
  14. nltk_data/tokenizers/punkt/PY3/french.pickle +3 -0
  15. nltk_data/tokenizers/punkt/PY3/german.pickle +3 -0
  16. nltk_data/tokenizers/punkt/PY3/greek.pickle +3 -0
  17. nltk_data/tokenizers/punkt/PY3/italian.pickle +3 -0
  18. nltk_data/tokenizers/punkt/PY3/malayalam.pickle +3 -0
  19. nltk_data/tokenizers/punkt/PY3/norwegian.pickle +3 -0
  20. nltk_data/tokenizers/punkt/PY3/polish.pickle +3 -0
  21. nltk_data/tokenizers/punkt/PY3/portuguese.pickle +3 -0
  22. nltk_data/tokenizers/punkt/PY3/russian.pickle +3 -0
  23. nltk_data/tokenizers/punkt/PY3/slovene.pickle +3 -0
  24. nltk_data/tokenizers/punkt/PY3/spanish.pickle +3 -0
  25. nltk_data/tokenizers/punkt/PY3/swedish.pickle +3 -0
  26. nltk_data/tokenizers/punkt/PY3/turkish.pickle +3 -0
  27. nltk_data/tokenizers/punkt/README +98 -0
  28. nltk_data/tokenizers/punkt/czech.pickle +3 -0
  29. nltk_data/tokenizers/punkt/danish.pickle +3 -0
  30. nltk_data/tokenizers/punkt/dutch.pickle +3 -0
  31. nltk_data/tokenizers/punkt/english.pickle +3 -0
  32. nltk_data/tokenizers/punkt/estonian.pickle +3 -0
  33. nltk_data/tokenizers/punkt/finnish.pickle +3 -0
  34. nltk_data/tokenizers/punkt/french.pickle +3 -0
  35. nltk_data/tokenizers/punkt/german.pickle +3 -0
  36. nltk_data/tokenizers/punkt/greek.pickle +3 -0
  37. nltk_data/tokenizers/punkt/italian.pickle +3 -0
  38. nltk_data/tokenizers/punkt/malayalam.pickle +3 -0
  39. nltk_data/tokenizers/punkt/norwegian.pickle +3 -0
  40. nltk_data/tokenizers/punkt/polish.pickle +3 -0
  41. nltk_data/tokenizers/punkt/portuguese.pickle +3 -0
  42. nltk_data/tokenizers/punkt/russian.pickle +3 -0
  43. nltk_data/tokenizers/punkt/slovene.pickle +3 -0
  44. nltk_data/tokenizers/punkt/spanish.pickle +3 -0
  45. nltk_data/tokenizers/punkt/swedish.pickle +3 -0
  46. nltk_data/tokenizers/punkt/turkish.pickle +3 -0
  47. nltk_data/tokenizers/punkt_tab.zip +3 -0
  48. nltk_data/tokenizers/punkt_tab/README +98 -0
  49. nltk_data/tokenizers/punkt_tab/czech/abbrev_types.txt +118 -0
  50. nltk_data/tokenizers/punkt_tab/czech/collocations.tab +96 -0
.gitattributes ADDED
@@ -0,0 +1,37 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ LMI[[:space:]]HUMAN[[:space:]]RESOURCE[[:space:]]POLICY.pdf filter=lfs diff=lfs merge=lfs -text
+ LMI-HUMAN-RESOURCE-POLICY.pdf filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,15 @@
+ FROM python:3.11-slim
+ WORKDIR /app
+ RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+ COPY app.py .
+ COPY "Admission_Requirement.pdf" .
+ COPY "POSTGRADUATE_ADMISSIONS.pdf" .
+ COPY "cleaned-dataset.jsonl" .
+ RUN chmod 644 "Admission_Requirement.pdf" "POSTGRADUATE_ADMISSIONS.pdf" "cleaned-dataset.jsonl"
+ COPY nltk_data /app/nltk_data
+ RUN mkdir -p /app/cache /app/tmp
+ RUN chmod -R 777 /app
+ EXPOSE 7860
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
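
To try the image locally, a minimal sketch along these lines should work (assumes Docker is installed; the `hr-policy-bot` tag is a placeholder chosen here, and GOOGLE_API_KEY is the variable app.py reads at startup):

import subprocess

# Build the image from this Dockerfile, then run it with the exposed port forwarded.
subprocess.run(["docker", "build", "-t", "hr-policy-bot", "."], check=True)
subprocess.run(
    ["docker", "run", "--rm", "-p", "7860:7860",
     "-e", "GOOGLE_API_KEY=<your-key>",  # read by app.py at startup
     "hr-policy-bot"],
    check=True,
)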
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ title: Hr Policy Bot
+ emoji: 💻
+ colorFrom: blue
+ colorTo: pink
+ sdk: docker
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,196 @@
+ import os
+ import nltk
+ import logging
+ import json
+ import numpy as np
+ from sklearn.cluster import KMeans
+ nltk.data.path.append("/app/nltk_data")
+
+ os.environ["HF_HOME"] = "/app/cache"
+ os.environ["XDG_CACHE_HOME"] = "/app/cache"
+ os.environ["TMPDIR"] = "/app/tmp"
+
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+ logger = logging.getLogger(__name__)
+
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain.prompts import PromptTemplate
+ from langchain.chains import LLMChain
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_community.retrievers import BM25Retriever
+ from langchain.retrievers import EnsembleRetriever
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from langchain_core.documents import Document
+
+ app = FastAPI(title="HR Policy Bot")
+
+ class Question(BaseModel):
+     text: str
+
+ def semantic_chunk_with_embeddings(documents, embeddings, max_chunk_size=1000, min_sentences=2, overlap_sentences=1):
+     """Chunk documents into semantically related groups using embeddings and clustering."""
+     all_chunks = []
+     for doc in documents:
+         sentences = nltk.sent_tokenize(doc.page_content)
+         if len(sentences) < min_sentences:
+             all_chunks.append(Document(page_content=" ".join(sentences), metadata=doc.metadata))
+             continue
+
+         # Generate embeddings for each sentence
+         sentence_embeddings = embeddings.embed_documents(sentences)
+         sentence_embeddings = np.array(sentence_embeddings)
+
+         # Cluster sentences with KMeans; the cluster count scales with sentence count, capped at 10
+         num_clusters = max(1, min(len(sentences) // min_sentences, 10))
+         kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(sentence_embeddings)
+         labels = kmeans.labels_
+
+         # Group sentences by cluster
+         clusters = {}
+         for sentence, label in zip(sentences, labels):
+             if label not in clusters:
+                 clusters[label] = []
+             clusters[label].append(sentence)
+
+         # Form chunks from each cluster, carrying overlap_sentences of overlap between chunks
+         for cluster_id, cluster_sentences in clusters.items():
+             current_chunk = ""
+             chunk_sentences = []
+             for sentence in cluster_sentences:
+                 if len(current_chunk) + len(sentence) < max_chunk_size:
+                     current_chunk += sentence + " "
+                     chunk_sentences.append(sentence)
+                 else:
+                     all_chunks.append(Document(page_content=current_chunk.strip(), metadata=doc.metadata))
+                     # Start the next chunk with the trailing sentences as overlap
+                     overlap = " ".join(chunk_sentences[-overlap_sentences:]) + " "
+                     current_chunk = overlap + sentence + " "
+                     chunk_sentences = chunk_sentences[-overlap_sentences:] + [sentence]
+             if current_chunk:
+                 all_chunks.append(Document(page_content=current_chunk.strip(), metadata=doc.metadata))
+
+     return all_chunks
+
+ def load_rag_system():
+     logger.info("Loading Gemini model...")
+     try:
+         llm = ChatGoogleGenerativeAI(
+             model="gemini-2.0-flash",
+             google_api_key=os.getenv("GOOGLE_API_KEY"),
+             temperature=0.3,
+             top_p=0.9,
+             max_tokens=1024
+         )
+     except Exception as e:
+         logger.error(f"Gemini loading failed: {str(e)}")
+         raise
+
+     # Load embeddings for chunking and retrieval
+     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+
+     # Load PDFs
+     logger.info("Loading PDFs...")
+     pdf_paths = ["Admission_Requirement.pdf", "POSTGRADUATE_ADMISSIONS.pdf"]
+     pages = []
+     for path in pdf_paths:
+         if os.path.exists(path):
+             loader = PyPDFLoader(path)
+             pages.extend(loader.load())
+         else:
+             logger.warning(f"PDF not found: {path}")
+
+     pdf_docs = semantic_chunk_with_embeddings(pages, embeddings)
+
+     # Load JSONL instruction/response pairs
+     logger.info("Loading JSONL data...")
+     jsonl_paths = ["cleaned-dataset.jsonl"]
+     jsonl_docs = []
+     for path in jsonl_paths:
+         if os.path.exists(path):
+             with open(path, "r") as f:
+                 for line in f:
+                     data = json.loads(line.strip())
+                     content = f"Instruction: {data['instruction']}\nResponse: {data['response']}"
+                     jsonl_docs.append(Document(page_content=content, metadata={"source": "jsonl", "instruction": data["instruction"]}))
+         else:
+             logger.warning(f"JSONL not found: {path}")
+
+     # Combine documents, deduplicating on page content
+     all_docs = pdf_docs + jsonl_docs
+     unique_docs = list({doc.page_content: doc for doc in all_docs}.values())
+     for i, doc in enumerate(unique_docs):
+         doc.metadata["doc_id"] = i
+
+     logger.info("Building vector store...")
+     vectorstore = FAISS.from_documents(unique_docs, embedding=embeddings)
+     faiss_retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
+     bm25_retriever = BM25Retriever.from_documents(unique_docs)
+     bm25_retriever.k = 4
+     retriever = EnsembleRetriever(retrievers=[faiss_retriever, bm25_retriever], weights=[0.5, 0.5])
+
+     template = """
+ You are TechChat, an AI assistant created to provide accurate, concise, and helpful information about admissions to Kwame Nkrumah University of Science and Technology (KNUST). Your primary goal is to assist users with questions related to KNUST admissions, including application processes, requirements, deadlines, programs, and other relevant details.
+
+ ### Instructions:
+ 1. **KNUST Admissions Questions**: Use the provided context to answer questions about KNUST admissions clearly and accurately. If the context is sufficient, tailor your response to the specific details provided.
+ 2. **Limited Context**: If the context lacks enough information to fully answer a KNUST admissions question, provide a general but accurate response based on your knowledge of KNUST admissions, and invite the user to provide more details for a more specific answer.
+ 3. **Off-Topic Questions**: If the question is unrelated to KNUST admissions, respond politely with: "I'm sorry, that question is outside my focus on KNUST admissions. Feel free to ask about KNUST application processes, requirements, or programs, and I'll be happy to help!"
+ 4. **Tone and Style**: Maintain a friendly, professional, and approachable tone. Avoid overly technical jargon unless necessary, and ensure responses are easy to understand.
+ 5. **No Assumptions**: Do not invent information. If you cannot answer due to missing or unclear information, acknowledge it and encourage the user to clarify.
+
+ ### Context:
+ {context}
+
+ ### Question:
+ {question}
+
+ ### Answer:
+ """
+
+     prompt = PromptTemplate.from_template(template)
+     parser = StrOutputParser()
+     chain = LLMChain(llm=llm, prompt=prompt, output_parser=parser)
+
+     return retriever, vectorstore, chain
+
+ logger.info("Initializing RAG system...")
+ retriever, vectorstore, chain = load_rag_system()
+
+ def rewrite_query(question):
+     # Hard-coded rewrite for one known ambiguous phrasing (carried over from the HR-policy version of the bot)
+     if "dress" in question.lower() and "employees" in question.lower():
+         return "What is the dress code policy for employees regarding casual attire?"
+     return question
+
+ @app.post("/ask")
+ async def ask(question: Question):
+     try:
+         logger.info(f"Received question: {question.text}")
+         rewritten_question = rewrite_query(question.text)
+         logger.info(f"Rewritten question: {rewritten_question}")
+         context_docs = retriever.invoke(rewritten_question)
+         logger.info(f"Retrieved {len(context_docs)} context documents")
+         # similarity_search_with_score on a default FAISS store returns an L2 distance
+         # (lower = more similar), so query it once and bail out when the closest match is far; the threshold is heuristic.
+         top_score = vectorstore.similarity_search_with_score(rewritten_question, k=1)[0][1]
+         if top_score > 1.5:
+             logger.info("No sufficiently close match, returning fallback answer")
+             return {"answer": "I'm not sure about that, but I'd be happy to help if you provide more details!"}
+         context_text = "\n".join([doc.page_content for doc in context_docs])
+         logger.info("Generating response...")
+         response = chain.invoke({"context": context_text, "question": rewritten_question})
+         answer = response["text"].split("Answer:")[-1].strip()
+         logger.info(f"Generated answer: {answer}")
+         return {"answer": answer}
+     except Exception as e:
+         logger.error(f"Error processing question: {str(e)}")
+         raise HTTPException(status_code=500, detail=str(e))
+
+ @app.get("/")
+ async def root():
+     logger.info("Root endpoint accessed")
+     return {"message": "HR Policy Bot is running!"}
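
Once the container is up, the /ask endpoint defined above can be exercised with a short client sketch like this (assumes the server is reachable on localhost:7860 and the `requests` package is installed; the question text is just an example):

import requests

# POST a question to the /ask route; the JSON body matches the Question model above.
resp = requests.post(
    "http://localhost:7860/ask",
    json={"text": "What are the undergraduate admission requirements?"},
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["answer"])  # the endpoint returns {"answer": "..."}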
nltk_data/tokenizers/punkt.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:51c3078994aeaf650bfc8e028be4fb42b4a0d177d41c012b6a983979653660ec
+ size 13905355
nltk_data/tokenizers/punkt/.DS_Store ADDED
Binary file (6.15 kB)
nltk_data/tokenizers/punkt/PY3/README ADDED
@@ -0,0 +1,98 @@
+ Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
+
+ Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
+ been contributed by various people using NLTK for sentence boundary detection.
+
+ For information about how to use these models, please confer the tokenization HOWTO:
+ http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
+ and chapter 3.8 of the NLTK book:
+ http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
+
+ There are pretrained tokenizers for the following languages:
+
+ File Language Source Contents Size of training corpus (in tokens) Model contributed by
+ =======================================================================================================================================================================
+ czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
+ Literarni Noviny
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
+ (Berlingske Avisdata, Copenhagen) Weekend Avisen
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
+ (American)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
+ Text Bank (Suomen Kielen newspapers
+ Tekstipankki)
+ Finnish Center for IT Science
+ (CSC)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
+ (European)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
+ (Switzerland) CD-ROM
+ (Uses "ss"
+ instead of "ß")
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
+ (Bokmål and Information Technologies,
+ Nynorsk) Bergen
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
+ (http://www.nkjp.pl/)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
+ (Brazilian) (Linguateca)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
+ Slovene Academy for Arts
+ and Sciences
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
+ (European)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
+ (and some other texts)
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
+ (Türkçe Derlem Projesi)
+ University of Ankara
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+ The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
+ Unicode using the codecs module.
+
+ Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
+ Computational Linguistics 32: 485-525.
+
+ ---- Training Code ----
+
+ # import punkt
+ import nltk.tokenize.punkt
+
+ # Make a new Tokenizer
+ tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
+
+ # Read in training corpus (one example: Slovene)
+ import codecs
+ text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
+
+ # Train tokenizer
+ tokenizer.train(text)
+
+ # Dump pickled tokenizer
+ import pickle
+ out = open("slovene.pickle","wb")
+ pickle.dump(tokenizer, out)
+ out.close()
+
+ ---------
nltk_data/tokenizers/punkt/PY3/czech.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:64b0734b6fbe8e8d7cac79f48d1dd9f853824e57c4e3594dadd74ba2c1d97f50
+ size 1119050
nltk_data/tokenizers/punkt/PY3/danish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6189c7dd254e29e2bd406a7f6a4336297c8953214792466a790ea4444223ceb3
+ size 1191710
nltk_data/tokenizers/punkt/PY3/dutch.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fda0d6a13f02e8898daec7fe923da88e25abe081bcfa755c0e015075c215fe4c
+ size 693759
nltk_data/tokenizers/punkt/PY3/english.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5cad3758596392364e3be9803dbd7ebeda384b68937b488a01365f5551bb942c
+ size 406697
nltk_data/tokenizers/punkt/PY3/estonian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b364f72538d17b146a98009ad239a8096ce6c0a8b02958c0bc776ecd0c58a25f
+ size 1499502
nltk_data/tokenizers/punkt/PY3/finnish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a4b5ff5500ee851c456f9dd40d5fc0d8c1859c88eb3178de1317d26b7d22833
+ size 1852226
nltk_data/tokenizers/punkt/PY3/french.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:28e3a4cd2971989b3cb9fd3433a6f15d17981e464db2be039364313b5de94f29
+ size 553575
nltk_data/tokenizers/punkt/PY3/german.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ddcbbe85e2042a019b1a6e37fd8c153286c38ba201fae0f5bfd9a3f74abae25c
+ size 1463575
nltk_data/tokenizers/punkt/PY3/greek.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:85dabc44ab90a5f208ef37ff6b4892ebe7e740f71fb4da47cfd95417ca3e22fd
+ size 876006
nltk_data/tokenizers/punkt/PY3/italian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:68a94007b1e4ffdc4d1a190185ca5442c3dafeb17ab39d30329e84cd74a43947
+ size 615089
nltk_data/tokenizers/punkt/PY3/malayalam.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d
+ size 221207
nltk_data/tokenizers/punkt/PY3/norwegian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4ff7a46d1438b311457d15d7763060b8d3270852c1850fd788c5cee194dc4a1d
+ size 1181271
nltk_data/tokenizers/punkt/PY3/polish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:624900ae3ddfb4854a98c5d3b8b1c9bb719975f33fee61ce1441dab9f8a00718
+ size 1738386
nltk_data/tokenizers/punkt/PY3/portuguese.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:02a0b7b25c3c7471e1791b66a31bbb530afbb0160aee4fcecf0107652067b4a1
+ size 611919
nltk_data/tokenizers/punkt/PY3/russian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:549762f8190024d89b511472df21a3a135eee5d9233e63ac244db737c2c61d7e
+ size 33020
nltk_data/tokenizers/punkt/PY3/slovene.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:52ef2cc0ed27d79b3aa635cbbc40ad811883a75a4b8a8be1ae406972870fd864
+ size 734444
nltk_data/tokenizers/punkt/PY3/spanish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:164a50fadc5a49f8ec7426eae11d3111ee752b48a3ef373d47745011192a5984
+ size 562337
nltk_data/tokenizers/punkt/PY3/swedish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b0f7d538bfd5266633b09e842cd92e9e0ac10f1d923bf211e1497972ddc47318
+ size 979681
nltk_data/tokenizers/punkt/PY3/turkish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ae68ef5863728ac5332e87eb1f6bae772ff32a13a4caa2b01a5c68103e853c5b
+ size 1017038
nltk_data/tokenizers/punkt/README ADDED
@@ -0,0 +1,98 @@
+ (98 lines; content identical to nltk_data/tokenizers/punkt/PY3/README above)
nltk_data/tokenizers/punkt/czech.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5ba73d293c7d7953956bcf02f3695ec5c1f0d527f2a3c38097f5593394fa1690
+ size 1265552
nltk_data/tokenizers/punkt/danish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ea29760a0a9197f52ca59e78aeafc5a6f55d05258faf7db1709b2b9eb321ef20
+ size 1264725
nltk_data/tokenizers/punkt/dutch.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4a8e26b3d68c45c38e594d19e2d5677447bfdcaa636d3b1e7acfed0e9272d73c
+ size 742624
nltk_data/tokenizers/punkt/english.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dda37972ae88998a6fd3e3ec002697a6bd362b32d050fda7d7ca5276873092aa
+ size 433305
nltk_data/tokenizers/punkt/estonian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3867fee26a36bdb197c64362aa13ac683f5f33fa4d0d225a5d56707582a55a1d
+ size 1596714
nltk_data/tokenizers/punkt/finnish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1a9e17b3d5b4df76345d812b8a65b1da0767eda5086eadcc11e625eef0942835
+ size 1951656
nltk_data/tokenizers/punkt/french.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:de05f3d5647d3d2296626fb83f68428e4c6ad6e05a00ed4694c8bdc8f2f197ee
+ size 583482
nltk_data/tokenizers/punkt/german.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eab497fa085413130c8fd0fb13b929128930afe2f6a26ea8715c95df7088e97c
+ size 1526714
nltk_data/tokenizers/punkt/greek.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:21752a6762fad5cfe46fb5c45fad9a85484a0e8e81c67e6af6fb973cfc27d67c
+ size 1953106
nltk_data/tokenizers/punkt/italian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dcb2717d7be5f26e860a92e05acf69b1123a5f4527cd7a269a9ab9e9e668c805
+ size 658331
nltk_data/tokenizers/punkt/malayalam.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1f8cf58acbdb7f472ac40affc13663be42dafb47c15030c11ade0444c9e0e53d
+ size 221207
nltk_data/tokenizers/punkt/norwegian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e4a97f8f9a03a0338dd746bcc89a0ae0f54ae43b835fa37d83e279e1ca794faf
+ size 1259779
nltk_data/tokenizers/punkt/polish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:16127b6d10933427a3e90fb20e9be53e1fb371ff79a730c1030734ed80b90c92
+ size 2042451
nltk_data/tokenizers/punkt/portuguese.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bb01bf7c79a4eadc2178bbd209665139a0e4b38f2d1c44fef097de93955140e0
+ size 649051
nltk_data/tokenizers/punkt/russian.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bc984432fbe31f7000014f8047502476889169c60f09be5413ca09276b16c909
+ size 33027
nltk_data/tokenizers/punkt/slovene.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7dac650212b3787b39996c01bd2084115493e6f6ec390bab61f767525b08b8ea
+ size 832867
nltk_data/tokenizers/punkt/spanish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:271dc6027c4aae056f72a9bfab5645cf67e198bf4f972895844e40f5989ccdc3
+ size 597831
nltk_data/tokenizers/punkt/swedish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:40d50ebdad6caa87715f2e300b1217ec92c42de205a543cc4a56903bd2c9acfa
+ size 1034496
nltk_data/tokenizers/punkt/turkish.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d3ae47d76501d027698809d12e75292c9c392910488543342802f95db9765ccc
+ size 1225013
nltk_data/tokenizers/punkt_tab.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e57f64187974277726a3417ca6f181ec5403676c717672eef6a748a7b20e0106
+ size 4319076
nltk_data/tokenizers/punkt_tab/README ADDED
@@ -0,0 +1,98 @@
+ (98 lines; content identical to nltk_data/tokenizers/punkt/PY3/README above)
nltk_data/tokenizers/punkt_tab/czech/abbrev_types.txt ADDED
@@ -0,0 +1,118 @@
+ t
+ množ
+ např
+ j.h
+ man
+ ú
+ jug
+ dr
+ bl
+ ml
+ okr
+ st
+ uh
+ šp
+ judr
+ u.s.a
+ p
+ arg
+ žitě
+ st.celsia
+ etc
+ p.s
+ t.r
+ lok
+ mil
+ ict
+ n
+ tl
+ min
+ č
+ d
+ al
+ ravenně
+ mj
+ nar
+ plk
+ s.p
+ a.g
+ roč
+ b
+ zdi
+ r.s.c
+ přek
+ m
+ gen
+ csc
+ mudr
+ vic
+ š
+ sb
+ resp
+ tzn
+ iv
+ s.r.o
+ mar
+ w
+ čs
+ vi
+ tzv
+ ul
+ pen
+ zv
+ str
+ čp
+ org
+ rak
+ sv
+ pplk
+ u.s
+ prof
+ c.k
+ op
+ g
+ vii
+ kr
+ ing
+ j.o
+ drsc
+ m3
+ l
+ tr
+ ceo
+ ch
+ fuk
+ vl
+ viii
+ líp
+ hl.m
+ t.zv
+ phdr
+ o.k
+ tis
+ doc
+ kl
+ ard
+ čkd
+ pok
+ apod
+ r
+
+ a.s
+ j
+ jr
+ i.m
+ e
+ kupř
+ f
+
+ xvi
+ mir
+ atď
+ vr
+ r.i.v
+ hl
+ kv
+ t.j
+ y
+ q.p.r
nltk_data/tokenizers/punkt_tab/czech/collocations.tab ADDED
@@ -0,0 +1,96 @@
+ i dejmala
+ ##number## prosince
+ h steina
+ ##number## listopadu
+ a dvořák
+ v klaus
+ i čnhl
+ ##number## wladyslawowo
+ ##number## letech
+ a jiráska
+ a dubček
+ ##number## štrasburk
+ ##number## juniorské
+ ##number## století
+ ##number## kola
+ ##number## pád
+ ##number## května
+ ##number## týdne
+ v dlouhý
+ k design
+ ##number## červenec
+ i ligy
+ ##number## kolo
+ z svěrák
+ ##number## mája
+ ##number## šimková
+ a bělého
+ a bradáč
+ ##number## ročníku
+ ##number## dubna
+ a vivaldiho
+ v mečiara
+ c carrićre
+ ##number## sjezd
+ ##number## výroční
+ ##number## kole
+ ##number## narozenin
+ k maleevová
+ i čnfl
+ ##number## pádě
+ ##number## září
+ ##number## výročí
+ a dvořáka
+ h g.
+ ##number## ledna
+ a dvorský
+ h měsíc
+ ##number## srpna
+ ##number## tř.
+ a mozarta
+ ##number## sudetoněmeckých
+ o sokolov
+ k škrach
+ v benda
+ ##number## symfonie
+ ##number## července
+ x šalda
+ c abrahama
+ a tichý
+ ##number## místo
+ k bielecki
+ v havel
+ ##number## etapu
+ a dubčeka
+ i liga
+ ##number## světový
+ v klausem
+ ##number## ženy
+ ##number## létech
+ ##number## minutě
+ ##number## listopadem
+ ##number## místě
+ o vlček
+ k peteraje
+ i sponzor
+ ##number## června
+ ##number## min.
+ ##number## oprávněnou
+ ##number## květnu
+ ##number## aktu
+ ##number## květnem
+ ##number## října
+ i rynda
+ ##number## února
+ i snfl
+ a mozart
+ z košler
+ a dvorskému
+ v marhoul
+ v mečiar
+ ##number## ročník
+ ##number## máje
+ v havla
+ k gott
+ s bacha
+ ##number## ad