UnreliableTakesFlight committed on
Commit
f2d4ac2
·
verified ·
1 Parent(s): 9dcfe2f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -20
app.py CHANGED
@@ -4,6 +4,7 @@ HuggingFace Spaces / Gradio
4
  """
5
 
6
  import re
 
7
  import gradio as gr
8
  from datasets import load_dataset
9
  from rank_bm25 import BM25Okapi
@@ -32,30 +33,17 @@ print(f"Corpus hazir: {len(doc_ids):,} dokuman")
32
  # 2. TOKENİZERS
33
  # ══════════════════════════════════════════════════════════════════════
34
 
 
35
  def whitespace_tokenize(text):
36
- return re.findall(r'\b[a-z]+\b', text.lower())
37
 
38
-
39
- _SUFFIXES = [
40
- 'ization', 'isation', 'ation', 'tion', 'sion', 'ment', 'ness',
41
- 'ity', 'ical', 'ous', 'ful', 'less', 'ize', 'ise',
42
- 'ing', 'al', 'er', 'est', 'ly', 'ed',
43
- ]
44
 
45
  def bert_tokenize(text):
46
- tokens = []
47
- for word in re.findall(r"[a-z]+(?:-[a-z]+)*", text.lower()):
48
- for part in word.split('-'):
49
- matched = False
50
- for suf in sorted(_SUFFIXES, key=len, reverse=True):
51
- if len(part) > len(suf) + 2 and part.endswith(suf):
52
- tokens.append(part[:-len(suf)])
53
- tokens.append('##' + suf)
54
- matched = True
55
- break
56
- if not matched:
57
- tokens.append(part)
58
- return tokens
59
 
60
 
61
  # ══════════════════════════════════════════════════════════════════════
 
4
  """
5
 
6
  import re
7
+ from transformers import BertTokenizer
8
  import gradio as gr
9
  from datasets import load_dataset
10
  from rank_bm25 import BM25Okapi
 
33
  # 2. TOKENİZERS
34
  # ══════════════════════════════════════════════════════════════════════
35
 
36
+ # Whitespace tokenizer: Python split() bazlı
37
  def whitespace_tokenize(text):
38
+ return text.lower().split()
39
 
40
+ # BERT tokenizer: HuggingFace bert-base-uncased
41
+ print("BERT tokenizer yukleniyor...")
42
+ bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
43
+ print("BERT tokenizer hazir.")
 
 
44
 
45
  def bert_tokenize(text):
46
+ return bert_tokenizer.tokenize(text)
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
 
49
  # ══════════════════════════════════════════════════════════════════════