"""
Tokenization Impact on Retrieval — TREC-COVID Demo
HuggingFace Spaces / Gradio

Builds two BM25 indexes over the BeIR/trec-covid corpus — one tokenized by
whitespace, one by the ``bert-base-uncased`` tokenizer — and serves a Gradio
UI that shows, side by side, how each tokenizer splits a query and which
documents each index ranks highest.
"""

import heapq
import re
from operator import itemgetter

import gradio as gr
from datasets import load_dataset
from rank_bm25 import BM25Okapi
from transformers import BertTokenizer

# ══════════════════════════════════════════════════════════════════════
# 1. DOWNLOAD THE CORPUS & BUILD THE LOOKUP TABLES (once, at app start-up)
# ══════════════════════════════════════════════════════════════════════

print("TREC-COVID corpus indiriliyor...")
corpus_ds = load_dataset("BeIR/trec-covid", "corpus", split="corpus")

corpus_title = {}  # doc id -> title shown in the result list
corpus_dict = {}   # doc id -> text that gets tokenized and indexed
for doc in corpus_ds:
    did = str(doc["_id"])
    if doc["title"]:
        title = doc["title"]
        indexed_text = title + " " + doc["text"]
    else:
        # No title: show the first 120 characters of the body instead, but
        # index the body alone. Prepending the display snippet would count
        # those characters twice in BM25 and could add a truncated word as
        # a spurious index term.
        title = doc["text"][:120]
        indexed_text = doc["text"]
    corpus_title[did] = title
    corpus_dict[did] = indexed_text

# Parallel lists: position i in the BM25 indexes corresponds to doc_ids[i].
doc_ids = list(corpus_dict)
doc_texts = list(corpus_dict.values())
print(f"Corpus hazir: {len(doc_ids):,} dokuman")

# ══════════════════════════════════════════════════════════════════════
# 2. TOKENIZERS
# ══════════════════════════════════════════════════════════════════════


def whitespace_tokenize(text: str) -> list:
    """Lower-case *text* and split it on runs of whitespace."""
    return text.lower().split()


# BERT tokenizer: HuggingFace bert-base-uncased
print("BERT tokenizer yukleniyor...")
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
print("BERT tokenizer hazir.")


def bert_tokenize(text: str) -> list:
    """Tokenize *text* with the module-level ``bert-base-uncased`` tokenizer."""
    return bert_tokenizer.tokenize(text)


# ══════════════════════════════════════════════════════════════════════
# 3. BM25 INDEXES
# ══════════════════════════════════════════════════════════════════════

# Both indexes are built over the same documents in the same order, so a
# position in either score vector maps to the same entry of doc_ids.
print("BM25 indeksleri kuruluyor (birkaç dakika)...")
bm25_ws = BM25Okapi([whitespace_tokenize(t) for t in doc_texts])
bm25_bert = BM25Okapi([bert_tokenize(t) for t in doc_texts])
print("Hazir!")

# ══════════════════════════════════════════════════════════════════════
# 4. RETRIEVAL
# ══════════════════════════════════════════════════════════════════════


def retrieve(bm25, tokenize_fn, query, top_k=5):
    """Return the *top_k* highest-scoring documents for *query*.

    Args:
        bm25: Index object exposing ``get_scores(tokens)``; its scores must
            be aligned with the module-level ``doc_ids`` list.
        tokenize_fn: Callable turning the query string into a token list.
            It should be the tokenizer the index was built with.
        query: Raw query string.
        top_k: Number of results to return.

    Returns:
        A list of ``(doc_id, title, score)`` tuples, best first, with the
        score rounded to two decimals.
    """
    scores = bm25.get_scores(tokenize_fn(query))
    # nlargest is documented as equivalent to
    # sorted(..., key=..., reverse=True)[:top_k], so ordering (including
    # ties) matches a full sort without sorting the entire corpus.
    best = heapq.nlargest(top_k, enumerate(scores), key=itemgetter(1))
    return [
        (doc_ids[i], corpus_title[doc_ids[i]], round(s, 2))
        for i, s in best
    ]


# ══════════════════════════════════════════════════════════════════════
# 5. GRADIO INTERFACE
# ══════════════════════════════════════════════════════════════════════


def _format_tokens(tokens, style):
    """Render *tokens* as a ``|``-separated Markdown list of code spans.

    With any *style* other than ``"ws"``, tokens starting with ``##`` are
    additionally wrapped in bold so they stand out from the others.
    """
    if style == "ws":
        return " | ".join(f"`{t}`" for t in tokens)
    return " | ".join(
        f"**`{t}`**" if t.startswith("##") else f"`{t}`" for t in tokens
    )


def _format_results(results):
    """Render ``(doc_id, title, score)`` tuples as a numbered Markdown list."""
    # NOTE(review): a Markdown hard line break needs two spaces before the
    # newline; only one is visible in the source — confirm the intent.
    entries = [
        f"**{rank}.** {title} \n`score: {score}`"
        for rank, (_, title, score) in enumerate(results, 1)
    ]
    return "\n\n---\n\n".join(entries)


def search(query):
    """Run *query* against both indexes and build the two Markdown panels.

    Args:
        query: Query text from the UI. ``None``, empty, or
            whitespace-only input is rejected.

    Returns:
        A ``(whitespace_markdown, bert_markdown)`` pair. Each panel lists
        the query tokens followed by the top results of that index; for a
        rejected query both elements carry the same error message.
    """
    # Guard against None as well as blank input so the callback never
    # raises on an unset textbox value.
    if not query or not query.strip():
        return "Query boş olamaz.", "Query boş olamaz."

    ws_tokens = whitespace_tokenize(query)
    bert_tokens = bert_tokenize(query)

    ws_results = retrieve(bm25_ws, whitespace_tokenize, query)
    bert_results = retrieve(bm25_bert, bert_tokenize, query)

    ws_out = f"### ⬜ Whitespace Tokens\n{_format_tokens(ws_tokens, 'ws')}\n\n---\n\n"
    ws_out += f"### Top-5 Sonuçlar\n\n{_format_results(ws_results)}"

    bert_out = f"### 🔷 BERT-style Tokens\n{_format_tokens(bert_tokens, 'bert')}\n\n---\n\n"
    bert_out += f"### Top-5 Sonuçlar\n\n{_format_results(bert_results)}"

    return ws_out, bert_out


examples = [
    "what is the origin of COVID-19",
    "how does coronavirus spread among people",
    "COVID-19 symptoms fever cough loss of smell",
    "remdesivir antiviral treatment efficacy",
    "vaccine mRNA clinical trial efficacy",
    "coronavirus incubation period transmission",
    "comorbidities risk factors severe COVID",
]

with gr.Blocks(theme=gr.themes.Soft(), title="Tokenization Impact on Retrieval") as demo:
    gr.Markdown("""
    # 🔍 Tokenization Impact on Retrieval Quality
    **TREC-COVID · BM25 · Whitespace vs BERT-style Tokenization**
    Midterm - Information Retrieval
    """)

    with gr.Row():
        query_input = gr.Textbox(
            placeholder="e.g. how does coronavirus spread among people",
            label="Query",
            scale=5,
        )
        search_btn = gr.Button("Search 🔍", variant="primary", scale=1)

    gr.Examples(examples=examples, inputs=query_input, label="Example Queries")

    with gr.Row():
        ws_output = gr.Markdown(label="⬜ Whitespace BM25")
        bert_output = gr.Markdown(label="🔷 BERT-style BM25")

    # The button click and pressing Enter in the textbox run the same search.
    search_btn.click(fn=search, inputs=query_input, outputs=[ws_output, bert_output])
    query_input.submit(fn=search, inputs=query_input, outputs=[ws_output, bert_output])

# NOTE(review): share=True is presumably unnecessary when hosted on
# HuggingFace Spaces — confirm before changing; kept as in the original.
demo.launch(share=True, debug=True)