UnreliableTakesFlight committed on
Commit
9a2ca05
·
verified ·
1 Parent(s): 47c5eaf

first_commit

Browse files
Files changed (1) hide show
  1. app.py +157 -0
app.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tokenization Impact on Retrieval — TREC-COVID Demo
3
+ HuggingFace Spaces / Gradio
4
+ """
5
+
6
+ import re
7
+ import gradio as gr
8
+ from datasets import load_dataset
9
+ from rank_bm25 import BM25Okapi
10
+
11
+ # ══════════════════════════════════════════════════════════════════════
12
+ # 1. CORPUS İNDİR & İNDEKS KUR (uygulama başlarken bir kere)
13
+ # ══════════════════════════════════════════════════════════════════════
14
+
15
# Download the TREC-COVID corpus once, at import time. Everything built here
# is module-level state read by the indexing and retrieval code below.
print("TREC-COVID corpus indiriliyor...")
corpus_ds = load_dataset("BeIR/trec-covid", "corpus", split="corpus")

# corpus_title: document id -> title shown in the result list.
# corpus_dict:  document id -> text that gets tokenized and indexed.
corpus_title = {}
corpus_dict = {}
for doc in corpus_ds:
    did = str(doc["_id"])
    text = doc["text"]
    title = doc["title"]
    if title:
        corpus_title[did] = title
        corpus_dict[did] = title + " " + text
    else:
        # No title: display the first 120 characters of the body instead, but
        # index the body alone. Prepending that snippet to the body would
        # count its terms twice and add a word cut off at character 120.
        corpus_title[did] = text[:120]
        corpus_dict[did] = text

# Parallel lists: doc_texts[i] is the indexed text of doc_ids[i].
doc_ids = list(corpus_dict)
doc_texts = list(corpus_dict.values())
print(f"Corpus hazir: {len(doc_ids):,} dokuman")
29
+
30
+
31
+ # ══════════════════════════════════════════════════════════════════════
32
+ # 2. TOKENİZERS
33
+ # ══════════════════════════════════════════════════════════════════════
34
+
35
# Compiled once: the pattern is applied to every document at index time.
_WORD_RE = re.compile(r'\b[a-z]+\b')


def whitespace_tokenize(text):
    """Return the lowercase ``a-z`` words found in *text*.

    Despite the name, the text is not split on whitespace. It is lowercased
    and every run of ``a-z`` that is delimited by word boundaries on both
    sides becomes a token. Punctuation and digits are never part of a token.
    A letter run that directly touches a digit, an underscore, or any other
    word character outside ``a-z`` (e.g. ``covid19``) produces no token at
    all, because there is no word boundary between the letters and that
    character.

    Args:
        text: Input string (document text or query).

    Returns:
        List of tokens in order of appearance; empty if nothing matches.
    """
    return _WORD_RE.findall(text.lower())
37
+
38
+
39
_SUFFIXES = [
    'ization', 'isation', 'ation', 'tion', 'sion', 'ment', 'ness',
    'ity', 'ical', 'ous', 'ful', 'less', 'ize', 'ise',
    'ing', 'al', 'er', 'est', 'ly', 'ed',
]

# Longest suffix first, so that e.g. "ization" is tried before "ation" and
# "tion". Sorted once here instead of once per word inside bert_tokenize.
_SUFFIXES_LONGEST_FIRST = tuple(sorted(_SUFFIXES, key=len, reverse=True))

# A word is a run of a-z, optionally continued by hyphen-joined runs of a-z.
_HYPHENATED_WORD_RE = re.compile(r"[a-z]+(?:-[a-z]+)*")


def bert_tokenize(text):
    """Split *text* into WordPiece-looking tokens using a fixed suffix list.

    This is a rule-based imitation of subword tokenization; no vocabulary or
    model is involved. The text is lowercased, words made of ``a-z`` (with
    optional internal hyphens) are extracted, and each word is split at its
    hyphens. For every resulting part, the longest suffix from ``_SUFFIXES``
    that leaves a stem of at least three characters is cut off and emitted as
    a separate ``##suffix`` token after the stem. Parts with no such suffix
    are emitted unchanged.

    Args:
        text: Input string (document text or query).

    Returns:
        List of tokens in order of appearance; empty if nothing matches.
    """
    tokens = []
    for word in _HYPHENATED_WORD_RE.findall(text.lower()):
        for part in word.split('-'):
            for suf in _SUFFIXES_LONGEST_FIRST:
                # "+ 2" keeps the remaining stem at three characters or more.
                if len(part) > len(suf) + 2 and part.endswith(suf):
                    tokens.append(part[:-len(suf)])
                    tokens.append('##' + suf)
                    break
            else:
                # No suffix qualified: keep the part as a single token.
                tokens.append(part)
    return tokens
59
+
60
+
61
+ # ══════════════════════════════════════════════════════════════════════
62
+ # 3. BM25 İNDEKSLERİ
63
+ # ══════════════════════════════════════════════════════════════════════
64
+
65
# Build one BM25 index per tokenizer, both over the same doc_texts list, so
# score position i in either index refers to the same document.
print("BM25 indeksleri kuruluyor (birkaç dakika)...")
bm25_ws = BM25Okapi([whitespace_tokenize(t) for t in doc_texts])
bm25_bert = BM25Okapi([bert_tokenize(t) for t in doc_texts])
print("Hazir!")
69
+
70
+
71
+ # ══════════════════════════════════════════════════════════════════════
72
+ # 4. RETRIEVAL
73
+ # ══════════════════════════════════════════════════════════════════════
74
+
75
def retrieve(bm25, tokenize_fn, query, top_k=5):
    """Return the *top_k* highest-scoring documents for *query*.

    Args:
        bm25: Index object exposing ``get_scores(tokens)``. Score position
            ``i`` is looked up as ``doc_ids[i]``.
        tokenize_fn: Callable that turns the query string into a list of
            tokens. It should be the tokenizer the index was built with.
        query: Raw query string.
        top_k: Maximum number of results to return.

    Returns:
        List of ``(doc_id, title, score)`` tuples, best score first, with the
        score rounded to two decimals. Ties keep corpus order. The list is
        empty when the query produces no tokens.
    """
    import heapq  # local import so this function does not rely on file-level changes

    tokens = tokenize_fn(query)
    if not tokens:
        # Without tokens every document scores 0, and ranking would just
        # return the first top_k documents of the corpus as if they matched.
        return []

    scores = bm25.get_scores(tokens)
    # Same order as sorted(..., reverse=True)[:top_k], without sorting the
    # scores of the entire corpus.
    best = heapq.nlargest(top_k, enumerate(scores), key=lambda pair: pair[1])
    return [(doc_ids[i], corpus_title[doc_ids[i]], round(s, 2)) for i, s in best]
80
+
81
+
82
+ # ══════════════════════════════════════════════════════════════════════
83
+ # 5. GRADIO ARAYÜZ
84
+ # ══════════════════════════════════════════════════════════════════════
85
+
86
def _format_tokens(tokens, style):
    """Render *tokens* as backticked Markdown code spans joined by ' | '.

    For any style other than "ws", tokens starting with "##" are additionally
    wrapped in bold so the split-off suffixes stand out.
    """
    if style == "ws":
        return " | ".join(f"`{t}`" for t in tokens)
    return " | ".join(
        f"**`{t}`**" if t.startswith("##") else f"`{t}`" for t in tokens
    )


def _format_results(results):
    """Render ``(doc_id, title, score)`` tuples as a numbered Markdown list."""
    if not results:
        return "Sonuç bulunamadı."
    entries = [
        # The two spaces before "\n" are a Markdown hard line break; with a
        # single space the score would run on in the same line as the title.
        f"**{rank}.** {title}  \n`score: {score}`"
        for rank, (_did, title, score) in enumerate(results, 1)
    ]
    return "\n\n---\n\n".join(entries)


def search(query):
    """Run *query* against both BM25 indexes and format the outcome.

    Args:
        query: Text entered by the user. ``None``, empty, or whitespace-only
            input is rejected with a message instead of being searched.

    Returns:
        A pair of Markdown strings ``(whitespace_panel, bert_panel)``. Each
        panel lists the tokens the respective tokenizer produced for the
        query, followed by that index's top results.
    """
    if not query or not query.strip():
        return "Query boş olamaz.", "Query boş olamaz."

    ws_tokens = whitespace_tokenize(query)
    bert_tokens = bert_tokenize(query)
    ws_results = retrieve(bm25_ws, whitespace_tokenize, query)
    bert_results = retrieve(bm25_bert, bert_tokenize, query)

    ws_out = (
        f"### ⬜ Whitespace Tokens\n{_format_tokens(ws_tokens, 'ws')}\n\n---\n\n"
        f"### Top-5 Sonuçlar\n\n{_format_results(ws_results)}"
    )
    bert_out = (
        f"### 🔷 BERT-style Tokens\n{_format_tokens(bert_tokens, 'bert')}\n\n---\n\n"
        f"### Top-5 Sonuçlar\n\n{_format_results(bert_results)}"
    )
    return ws_out, bert_out
120
+
121
+
122
# Sample queries offered as one-click examples below the search box.
examples = [
    "what is the origin of COVID-19",
    "how does coronavirus spread among people",
    "COVID-19 symptoms fever cough loss of smell",
    "remdesivir antiviral treatment efficacy",
    "vaccine mRNA clinical trial efficacy",
    "coronavirus incubation period transmission",
    "comorbidities risk factors severe COVID",
]

with gr.Blocks(theme=gr.themes.Soft(), title="Tokenization Impact on Retrieval") as demo:
    # Built from adjacent literals so the Markdown does not depend on how the
    # source is indented.
    gr.Markdown(
        "# 🔍 Tokenization Impact on Retrieval Quality\n"
        "**TREC-COVID · BM25 · Whitespace vs BERT-style Tokenization**\n"
        "\n"
        "Assignment 15 — Information Retrieval\n"
    )

    with gr.Row():
        query_input = gr.Textbox(
            placeholder="e.g. how does coronavirus spread among people",
            label="Query",
            scale=5,
        )
        search_btn = gr.Button("Search 🔍", variant="primary", scale=1)

    gr.Examples(examples=examples, inputs=query_input, label="Example Queries")

    # Side-by-side panels: whitespace results on the left, BERT-style on the right.
    with gr.Row():
        ws_output = gr.Markdown(label="⬜ Whitespace BM25")
        bert_output = gr.Markdown(label="🔷 BERT-style BM25")

    # The button click and the textbox submit event run the same search.
    search_btn.click(fn=search, inputs=query_input, outputs=[ws_output, bert_output])
    query_input.submit(fn=search, inputs=query_input, outputs=[ws_output, bert_output])

demo.launch()