Spaces:
Sleeping
Sleeping
| """ | |
| Tokenization Impact on Retrieval — TREC-COVID Demo | |
| HuggingFace Spaces / Gradio | |
| """ | |
import heapq
import re

import gradio as gr
from datasets import load_dataset
from rank_bm25 import BM25Okapi
from transformers import BertTokenizer
# ----------------------------------------------------------------------
# 1. DOWNLOAD THE CORPUS & BUILD LOOKUPS (once, when the app starts)
# ----------------------------------------------------------------------
print("TREC-COVID corpus indiriliyor...")
corpus_ds = load_dataset("BeIR/trec-covid", "corpus", split="corpus")

corpus_title = {}  # document id -> title shown in the result list
corpus_dict = {}   # document id -> "title + text" string that gets indexed

for record in corpus_ds:
    key = str(record["_id"])
    body = record["text"]
    # Documents without a title are labelled with the start of their text.
    heading = record["title"] or body[:120]
    corpus_title[key] = heading
    corpus_dict[key] = heading + " " + body

# Parallel lists: position i in doc_texts belongs to doc_ids[i].
doc_ids = list(corpus_dict)
doc_texts = list(corpus_dict.values())
print(f"Corpus hazir: {len(doc_ids):,} dokuman")
# ----------------------------------------------------------------------
# 2. TOKENIZERS
# ----------------------------------------------------------------------
# Whitespace tokenizer: built on Python's str.split().
def whitespace_tokenize(text):
    """Lower-case ``text`` and split it on runs of whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-cased tokens; empty when ``text`` has no
        non-whitespace characters.
    """
    lowered = text.lower()
    return lowered.split()
# BERT tokenizer: the HuggingFace "bert-base-uncased" checkpoint, loaded
# once at start-up and shared by every call to bert_tokenize().
_BERT_CHECKPOINT = "bert-base-uncased"

print("BERT tokenizer yukleniyor...")
bert_tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
print("BERT tokenizer hazir.")
def bert_tokenize(text):
    """Tokenize ``text`` with the module-level ``bert_tokenizer``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``bert_tokenizer.tokenize`` returns for ``text``.
    """
    pieces = bert_tokenizer.tokenize(text)
    return pieces
# ----------------------------------------------------------------------
# 3. BM25 INDEXES
# ----------------------------------------------------------------------
# NOTE(review): the non-ASCII characters in the message below look
# mis-decoded; confirm the intended text before changing the literal.
print("BM25 indeksleri kuruluyor (birkaΓ§ dakika)...")
# One index per tokenizer, both built over doc_texts in the same order, so
# a position in either index's score list refers to the same document.
bm25_ws = BM25Okapi(list(map(whitespace_tokenize, doc_texts)))
bm25_bert = BM25Okapi(list(map(bert_tokenize, doc_texts)))
print("Hazir!")
# ----------------------------------------------------------------------
# 4. RETRIEVAL
# ----------------------------------------------------------------------
def retrieve(bm25, tokenize_fn, query, top_k=5):
    """Return the ``top_k`` best-scoring documents for ``query``.

    Args:
        bm25: Index object exposing ``get_scores(tokens)``. Its scores are
            matched by position against the module-level ``doc_ids`` list.
        tokenize_fn: Callable that turns the raw query string into tokens.
        query: Raw query text.
        top_k: Maximum number of hits to return.

    Returns:
        A list of ``(doc_id, title, score)`` tuples ordered best first, with
        each score rounded to two decimals. Documents with equal scores keep
        their corpus order.
    """
    scores = bm25.get_scores(tokenize_fn(query))
    # Only the head of the ranking is needed, so pick the top_k entries
    # rather than sorting one score per document in the whole corpus.
    # nlargest() yields the same order as sorted(..., reverse=True)[:top_k].
    best = heapq.nlargest(top_k, enumerate(scores), key=lambda pair: pair[1])
    return [
        (doc_ids[idx], corpus_title[doc_ids[idx]], round(score, 2))
        for idx, score in best
    ]
# ----------------------------------------------------------------------
# 5. GRADIO INTERFACE
# ----------------------------------------------------------------------
def _format_tokens(tokens, mark_subwords=False):
    """Render tokens as inline-code Markdown spans separated by ``|``."""
    cells = []
    for tok in tokens:
        cell = f"`{tok}`"
        # Pieces that start with "##" are shown in bold so sub-word splits
        # stand out in the BERT panel.
        if mark_subwords and tok.startswith("##"):
            cell = f"**{cell}**"
        cells.append(cell)
    return " | ".join(cells)


def _format_results(results):
    """Render ``(doc_id, title, score)`` hits as a numbered Markdown list."""
    # Two trailing spaces before the newline form a Markdown hard line
    # break, which puts the score on its own line under the title.
    entries = [
        f"**{rank}.** {title}  \n`score: {score}`"
        for rank, (_doc_id, title, score) in enumerate(results, 1)
    ]
    return "\n\n---\n\n".join(entries)


def search(query):
    """Run ``query`` against both BM25 indexes and format the two panels.

    Args:
        query: Raw query text from the UI. ``None``, empty and
            whitespace-only values are all treated as an empty query.

    Returns:
        A ``(whitespace_markdown, bert_markdown)`` tuple. Each string lists
        the query's tokens under that tokenizer followed by its top hits.
        For an empty query both strings are the same error message.
    """
    # NOTE(review): the non-ASCII characters in the user-facing literals of
    # this function look mis-decoded; confirm the intended text before
    # changing them.
    if not query or not query.strip():
        return "Query boΕ olamaz.", "Query boΕ olamaz."

    ws_tokens = whitespace_tokenize(query)
    bert_tokens = bert_tokenize(query)
    ws_results = retrieve(bm25_ws, whitespace_tokenize, query)
    bert_results = retrieve(bm25_bert, bert_tokenize, query)

    ws_out = (
        f"### β¬ Whitespace Tokens\n{_format_tokens(ws_tokens)}\n\n---\n\n"
        f"### Top-5 SonuΓ§lar\n\n{_format_results(ws_results)}"
    )
    bert_out = (
        f"### π· BERT-style Tokens\n"
        f"{_format_tokens(bert_tokens, mark_subwords=True)}\n\n---\n\n"
        f"### Top-5 SonuΓ§lar\n\n{_format_results(bert_results)}"
    )
    return ws_out, bert_out
# Sample queries offered as one-click examples under the search box.
examples = [
    "what is the origin of COVID-19",
    "how does coronavirus spread among people",
    "COVID-19 symptoms fever cough loss of smell",
    "remdesivir antiviral treatment efficacy",
    "vaccine mRNA clinical trial efficacy",
    "coronavirus incubation period transmission",
    "comorbidities risk factors severe COVID",
]

# NOTE(review): the non-ASCII characters in the UI literals below look
# mis-decoded; confirm the intended text before changing them.
_HEADER_MD = """
# π Tokenization Impact on Retrieval Quality
**TREC-COVID Β· BM25 Β· Whitespace vs BERT-style Tokenization**
Midterm - Information Retrieval
"""

with gr.Blocks(theme=gr.themes.Soft(), title="Tokenization Impact on Retrieval") as demo:
    gr.Markdown(_HEADER_MD)

    # Query row: a wide textbox next to a narrow search button.
    with gr.Row():
        query_input = gr.Textbox(
            label="Query",
            placeholder="e.g. how does coronavirus spread among people",
            scale=5,
        )
        search_btn = gr.Button("Search π", variant="primary", scale=1)

    gr.Examples(examples=examples, inputs=query_input, label="Example Queries")

    # Result row: one Markdown panel per tokenizer, side by side.
    with gr.Row():
        ws_output = gr.Markdown(label="β¬ Whitespace BM25")
        bert_output = gr.Markdown(label="π· BERT-style BM25")

    # The button's click event and the textbox's submit event both run the
    # same search and fill the same two panels.
    for register in (search_btn.click, query_input.submit):
        register(fn=search, inputs=query_input, outputs=[ws_output, bert_output])

demo.launch(share=True, debug=True)