Spaces:
Sleeping
Sleeping
File size: 6,426 Bytes
9a2ca05 f2d4ac2 9a2ca05 f2d4ac2 9a2ca05 f2d4ac2 9a2ca05 f2d4ac2 9a2ca05 f2d4ac2 9a2ca05 f6ff18d 9a2ca05 86c723b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | """
Tokenization Impact on Retrieval — TREC-COVID Demo
HuggingFace Spaces / Gradio
"""
import heapq
import re

import gradio as gr
from datasets import load_dataset
from rank_bm25 import BM25Okapi
from transformers import BertTokenizer
# ──────────────────────────────────────────────────────────────────────
# 1. DOWNLOAD CORPUS & BUILD INDEX (once, at application startup)
# ──────────────────────────────────────────────────────────────────────
print("TREC-COVID corpus indiriliyor...")
corpus_ds = load_dataset("BeIR/trec-covid", "corpus", split="corpus")

# Two lookups keyed by the stringified document id:
#   corpus_title -> text shown for a hit in the result list
#   corpus_dict  -> title + " " + body, i.e. the text that gets indexed
corpus_title = {}
corpus_dict = {}
for record in corpus_ds:
    key = str(record["_id"])
    body = record["text"]
    # An empty title falls back to the first 120 characters of the body.
    heading = record["title"] or body[:120]
    corpus_title[key] = heading
    corpus_dict[key] = heading + " " + body

# Ids and texts share the dict's insertion order, so a position in doc_texts
# maps back to its document id through doc_ids.
doc_ids = list(corpus_dict)
doc_texts = list(corpus_dict.values())
print(f"Corpus hazir: {len(doc_ids):,} dokuman")
# ──────────────────────────────────────────────────────────────────────
# 2. TOKENIZERS
# ──────────────────────────────────────────────────────────────────────
# Whitespace tokenizer: based on Python's str.split().
def whitespace_tokenize(text):
    """Lowercase *text* and split it on runs of whitespace.

    Only whitespace separates tokens, so punctuation stays attached to the
    neighbouring word. Blank input yields an empty list.
    """
    lowered = text.lower()
    return lowered.split()
# BERT tokenizer: HuggingFace bert-base-uncased.
# Loaded once at import time and shared by every bert_tokenize() call.
print("BERT tokenizer yukleniyor...")
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
print("BERT tokenizer hazir.")


def bert_tokenize(text):
    """Split *text* into tokens with the shared ``bert_tokenizer``.

    Returns whatever ``bert_tokenizer.tokenize`` produces for *text*.
    """
    pieces = bert_tokenizer.tokenize(text)
    return pieces
# ──────────────────────────────────────────────────────────────────────
# 3. BM25 INDEXES
# ──────────────────────────────────────────────────────────────────────
# Build one BM25 index per tokenizer over the same document texts. Both are
# built from doc_texts in order, so a score position maps to doc_ids.
# The log message had a mis-encoded character ("birkaΓ§"); restored to "birkaç".
print("BM25 indeksleri kuruluyor (birkaç dakika)...")
bm25_ws = BM25Okapi([whitespace_tokenize(text) for text in doc_texts])
bm25_bert = BM25Okapi([bert_tokenize(text) for text in doc_texts])
print("Hazir!")
# ──────────────────────────────────────────────────────────────────────
# 4. RETRIEVAL
# ──────────────────────────────────────────────────────────────────────
def retrieve(bm25, tokenize_fn, query, top_k=5):
    """Return the ``top_k`` highest-scoring documents for *query*.

    Args:
        bm25: Index object exposing ``get_scores(tokens)``. Each score's
            position is used as an index into the module-level ``doc_ids``.
        tokenize_fn: Callable that turns the query string into tokens.
        query: Raw query string.
        top_k: Maximum number of hits to return.

    Returns:
        A list of ``(doc_id, title, score)`` tuples, best first, with each
        score rounded to two decimals.
    """
    scores = bm25.get_scores(tokenize_fn(query))
    # Only the best top_k entries are needed, so pick them with a bounded heap
    # instead of sorting one score per corpus document on every query. Ties
    # resolve exactly as sorted(..., reverse=True)[:top_k] would.
    best = heapq.nlargest(top_k, enumerate(scores), key=lambda pair: pair[1])
    return [
        (doc_ids[pos], corpus_title[doc_ids[pos]], round(score, 2))
        for pos, score in best
    ]
# ──────────────────────────────────────────────────────────────────────
# 5. GRADIO INTERFACE
# ──────────────────────────────────────────────────────────────────────
def _format_tokens(tokens, style):
    """Render *tokens* as inline-code spans joined by ``" | "``.

    For any *style* other than ``"ws"``, tokens that start with ``##`` are
    additionally wrapped in bold so they stand out.
    """
    if style == "ws":
        return " | ".join(f"`{tok}`" for tok in tokens)
    return " | ".join(
        f"**`{tok}`**" if tok.startswith("##") else f"`{tok}`" for tok in tokens
    )


def _format_results(results):
    """Render ``(doc_id, title, score)`` hits as a numbered Markdown listing."""
    # Two spaces before the newline form a Markdown hard line break, so the
    # score is rendered on its own line beneath the title.
    return "\n\n---\n\n".join(
        f"**{rank}.** {title}  \n`score: {score}`"
        for rank, (_doc_id, title, score) in enumerate(results, 1)
    )


def search(query):
    """Run *query* against both BM25 indexes and format the two result panes.

    Args:
        query: Query text from the UI. ``None``, empty and whitespace-only
            values are rejected with an error message.

    Returns:
        A ``(whitespace_markdown, bert_markdown)`` tuple. Each element lists
        the query's tokens followed by the hits for that tokenizer; for a
        blank query both elements carry the same error message.
    """
    # `not query` also covers None, which would otherwise raise on .strip().
    if not query or not query.strip():
        blank_msg = "Query boş olamaz."
        return blank_msg, blank_msg

    ws_tokens = whitespace_tokenize(query)
    bert_tokens = bert_tokenize(query)
    ws_results = retrieve(bm25_ws, whitespace_tokenize, query)
    bert_results = retrieve(bm25_bert, bert_tokenize, query)

    # NOTE(review): the heading emoji were reconstructed from mis-encoded
    # bytes ("β¬", "π·") -- confirm they match the intended glyphs.
    ws_out = (
        f"### ⬜ Whitespace Tokens\n{_format_tokens(ws_tokens, 'ws')}\n\n---\n\n"
        f"### Top-5 Sonuçlar\n\n{_format_results(ws_results)}"
    )
    bert_out = (
        f"### 🔷 BERT-style Tokens\n{_format_tokens(bert_tokens, 'bert')}\n\n---\n\n"
        f"### Top-5 Sonuçlar\n\n{_format_results(bert_results)}"
    )
    return ws_out, bert_out
# Sample queries offered below the search box.
examples = [
    "what is the origin of COVID-19",
    "how does coronavirus spread among people",
    "COVID-19 symptoms fever cough loss of smell",
    "remdesivir antiviral treatment efficacy",
    "vaccine mRNA clinical trial efficacy",
    "coronavirus incubation period transmission",
    "comorbidities risk factors severe COVID",
]

# NOTE(review): the emoji and the "·" separators below were reconstructed from
# mis-encoded bytes ("π", "β¬", "π·", "Β·") -- confirm the intended glyphs.
with gr.Blocks(theme=gr.themes.Soft(), title="Tokenization Impact on Retrieval") as demo:
    # Page header. Built from explicit lines so the Markdown text does not
    # depend on source indentation.
    gr.Markdown(
        "\n"
        "# 🔍 Tokenization Impact on Retrieval Quality\n"
        "**TREC-COVID · BM25 · Whitespace vs BERT-style Tokenization**\n"
        "Midterm - Information Retrieval\n"
    )

    # Query row: text box plus a search button.
    with gr.Row():
        query_input = gr.Textbox(
            placeholder="e.g. how does coronavirus spread among people",
            label="Query",
            scale=5,
        )
        search_btn = gr.Button("Search 🔍", variant="primary", scale=1)

    gr.Examples(examples=examples, inputs=query_input, label="Example Queries")

    # Result row: one Markdown pane per tokenizer, side by side.
    with gr.Row():
        ws_output = gr.Markdown(label="⬜ Whitespace BM25")
        bert_output = gr.Markdown(label="🔷 BERT-style BM25")

    # Clicking the button and pressing Enter in the text box both run search().
    search_btn.click(fn=search, inputs=query_input, outputs=[ws_output, bert_output])
    query_input.submit(fn=search, inputs=query_input, outputs=[ws_output, bert_output])

demo.launch(share=True, debug=True)