File size: 6,426 Bytes
9a2ca05
 
 
 
 
 
f2d4ac2
9a2ca05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f2d4ac2
9a2ca05
f2d4ac2
9a2ca05
f2d4ac2
 
 
 
9a2ca05
 
f2d4ac2
9a2ca05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f6ff18d
9a2ca05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86c723b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""
Tokenization Impact on Retrieval — TREC-COVID Demo
HuggingFace Spaces / Gradio
"""

import heapq
import re

import gradio as gr
from datasets import load_dataset
from rank_bm25 import BM25Okapi
from transformers import BertTokenizer

# ══════════════════════════════════════════════════════════════════════
# 1. CORPUS İNDİR & İNDEKS KUR (uygulama başlarken bir kere)
# ══════════════════════════════════════════════════════════════════════

# Download the corpus a single time at start-up; every structure below is
# derived from it and stays in memory for the lifetime of the app.
print("TREC-COVID corpus indiriliyor...")
corpus_ds = load_dataset("BeIR/trec-covid", "corpus", split="corpus")

corpus_title = {}  # document id -> title shown in the result list
corpus_dict = {}   # document id -> text that gets indexed (title + body)
for record in corpus_ds:
    key = str(record["_id"])
    # Untitled documents borrow the first 120 characters of their body.
    heading = record["title"] or record["text"][:120]
    corpus_title[key] = heading
    corpus_dict[key] = heading + " " + record["text"]

# Parallel lists: position i in both refers to the same document.
doc_ids = list(corpus_dict)
doc_texts = list(corpus_dict.values())
print(f"Corpus hazir: {len(doc_ids):,} dokuman")


# ══════════════════════════════════════════════════════════════════════
# 2. TOKENİZERS
# ══════════════════════════════════════════════════════════════════════

def whitespace_tokenize(text):
    """Lowercase *text* and split it on runs of whitespace (``str.split``)."""
    lowered = text.lower()
    return lowered.split()

# The HuggingFace "bert-base-uncased" tokenizer is loaded once at start-up
# and shared by every call to bert_tokenize().
print("BERT tokenizer yukleniyor...")
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
print("BERT tokenizer hazir.")


def bert_tokenize(text):
    """Return the token list the shared ``bert_tokenizer`` produces for *text*."""
    pieces = bert_tokenizer.tokenize(text)
    return pieces


# ══════════════════════════════════════════════════════════════════════
# 3. BM25 İNDEKSLERİ
# ══════════════════════════════════════════════════════════════════════

# Build one BM25 index per tokenizer over the same documents. Both indexes
# are fed doc_texts in order, so score position i in either index refers to
# doc_ids[i]. Every document is tokenized up front, which is why start-up is
# announced as taking a few minutes.
print("BM25 indeksleri kuruluyor (birkaç dakika)...")
bm25_ws = BM25Okapi([whitespace_tokenize(t) for t in doc_texts])
bm25_bert = BM25Okapi([bert_tokenize(t) for t in doc_texts])
print("Hazir!")


# ══════════════════════════════════════════════════════════════════════
# 4. RETRIEVAL
# ══════════════════════════════════════════════════════════════════════

def retrieve(bm25, tokenize_fn, query, top_k=5):
    """Return the ``top_k`` highest-scoring documents for *query*.

    Args:
        bm25: Index object exposing ``get_scores(tokens)``. The scores are
            matched by position against the module-level ``doc_ids`` list.
        tokenize_fn: Callable that turns the query string into the token
            list handed to ``bm25.get_scores``.
        query: Raw query text.
        top_k: Maximum number of hits to return.

    Returns:
        A list of ``(doc_id, title, score)`` tuples ordered best first, with
        each score rounded to two decimals. Equal scores keep their original
        (corpus) order.
    """
    scores = bm25.get_scores(tokenize_fn(query))
    # Only the best top_k entries are needed, so avoid sorting every score.
    # heapq.nlargest is documented as equivalent to
    # sorted(iterable, key=key, reverse=True)[:n], so tie order is unchanged.
    best = heapq.nlargest(top_k, enumerate(scores), key=lambda pair: pair[1])
    return [
        (doc_ids[idx], corpus_title[doc_ids[idx]], round(score, 2))
        for idx, score in best
    ]


# ══════════════════════════════════════════════════════════════════════
# 5. GRADIO ARAYÜZ
# ══════════════════════════════════════════════════════════════════════

def search(query):
    """Run *query* against both BM25 indexes and render the outcome as Markdown.

    Args:
        query: Raw query text from the UI textbox. ``None``, empty, or
            whitespace-only input is treated as a missing query.

    Returns:
        A ``(whitespace_markdown, bert_markdown)`` tuple. Each string shows
        the query's tokens under that tokenizer, followed by the hits
        returned by ``retrieve`` for the matching index. For a missing query
        both entries are the same error message.
    """
    # Check for None before calling .strip() so a programmatic caller that
    # sends no value gets the error message instead of an AttributeError.
    if not query or not query.strip():
        return "Query boş olamaz.", "Query boş olamaz."

    ws_tokens = whitespace_tokenize(query)
    bert_tokens = bert_tokenize(query)
    ws_results = retrieve(bm25_ws, whitespace_tokenize, query)
    bert_results = retrieve(bm25_bert, bert_tokenize, query)

    def format_tokens(tokens, style):
        # Tokens starting with "##" (the BERT tokenizer's sub-word
        # continuation marker) are bolded, but only in the BERT column;
        # whitespace tokens are shown as-is even if they begin with "##".
        return " | ".join(
            f"**`{t}`**" if style != "ws" and t.startswith("##") else f"`{t}`"
            for t in tokens
        )

    def format_results(results):
        # One numbered entry per hit; the document id is not displayed.
        return "\n\n---\n\n".join(
            f"**{rank}.** {title}  \n`score: {score}`"
            for rank, (_doc_id, title, score) in enumerate(results, 1)
        )

    ws_out = (
        f"### ⬜ Whitespace Tokens\n{format_tokens(ws_tokens, 'ws')}\n\n---\n\n"
        f"### Top-5 Sonuçlar\n\n{format_results(ws_results)}"
    )
    bert_out = (
        f"### 🔷 BERT-style Tokens\n{format_tokens(bert_tokens, 'bert')}\n\n---\n\n"
        f"### Top-5 Sonuçlar\n\n{format_results(bert_results)}"
    )

    return ws_out, bert_out


# Sample queries offered as one-click examples in the UI (passed to
# gr.Examples below, which fills the query textbox with the chosen entry).
examples = [
    "what is the origin of COVID-19",
    "how does coronavirus spread among people",
    "COVID-19 symptoms fever cough loss of smell",
    "remdesivir antiviral treatment efficacy",
    "vaccine mRNA clinical trial efficacy",
    "coronavirus incubation period transmission",
    "comorbidities risk factors severe COVID",
]

# Page layout: header, query row (textbox + button), example queries, then
# two side-by-side Markdown panes, one per tokenizer.
with gr.Blocks(theme=gr.themes.Soft(), title="Tokenization Impact on Retrieval") as demo:
    gr.Markdown("""
    # 🔍 Tokenization Impact on Retrieval Quality
    **TREC-COVID · BM25 · Whitespace vs BERT-style Tokenization**

    Midterm - Information Retrieval
    """)

    with gr.Row():
        query_input = gr.Textbox(
            placeholder="e.g. how does coronavirus spread among people",
            label="Query",
            scale=5,
        )
        search_btn = gr.Button("Search 🔍", variant="primary", scale=1)

    gr.Examples(examples=examples, inputs=query_input, label="Example Queries")

    with gr.Row():
        ws_output = gr.Markdown(label="⬜ Whitespace BM25")
        bert_output = gr.Markdown(label="🔷 BERT-style BM25")

    # Clicking the button and pressing Enter in the textbox run the same
    # handler and fill the same two output panes.
    search_btn.click(fn=search, inputs=query_input, outputs=[ws_output, bert_output])
    query_input.submit(fn=search, inputs=query_input, outputs=[ws_output, bert_output])

# NOTE(review): Gradio's docs describe share=True as unsupported when running
# on Hugging Face Spaces — confirm whether the public share link is still
# wanted for local runs before changing these flags.
demo.launch(share=True, debug=True)