|
|
from __future__ import annotations |
|
|
|
|
|
"""Copy of the notebook "HW1 (more instructed).ipynb"
|
|
|
|
|
Automatically generated by Colab. |
|
|
|
|
|
Original file is located at |
|
|
https://colab.research.google.com/drive/18CpMm-9nCuo64vywjq-qJhJF_DWrGavX |
|
|
""" |
|
|
|
|
|
|
|
|
|
|
|
"""## Pre-requisite code |
|
|
|
|
|
The code within this section will be used in the tasks. Please do not change these code lines. |
|
|
|
|
|
### SciQ loading and counting |
|
|
""" |
|
|
|
|
|
from dataclasses import dataclass |
|
|
import pickle |
|
|
import os |
|
|
from typing import Iterable, Callable, List, Dict, Optional, Type, TypeVar |
|
|
from nlp4web_codebase.ir.data_loaders.dm import Document |
|
|
from collections import Counter |
|
|
import tqdm |
|
|
import re |
|
|
import nltk |
|
|
nltk.download("stopwords", quiet=True) |
|
|
from nltk.corpus import stopwords as nltk_stopwords |
|
|
|
|
|
LANGUAGE = "english" |
|
|
word_splitter = re.compile(r"(?u)\b\w\w+\b").findall |
|
|
stopwords = set(nltk_stopwords.words(LANGUAGE)) |
|
|
|
|
|
|
|
|
def word_splitting(text: str) -> List[str]:
    """Lowercase *text* and split it into word tokens (length >= 2)."""
    lowered = text.lower()
    return word_splitter(lowered)
|
|
|
|
|
def lemmatization(words: List[str]) -> List[str]:
    """Identity placeholder: no lemmatizer is configured, so the input
    token list is returned unchanged."""
    return words
|
|
|
|
|
def simple_tokenize(text: str) -> List[str]:
    """Tokenize *text*: lowercase word-split, drop English stopwords,
    then apply the (currently no-op) lemmatization step."""
    candidates = word_splitting(text)
    kept = [tok for tok in candidates if tok not in stopwords]
    return lemmatization(kept)
|
|
|
|
|
T = TypeVar("T", bound="InvertedIndex") |
|
|
|
|
|
@dataclass
class PostingList:
    """Posting list for one vocabulary term: two parallel arrays of the
    documents containing the term and the term's weight in each one."""

    term: str  # the token this posting list belongs to
    docid_postings: List[int]  # internal docids of documents containing the term
    tweight_postings: List[float]  # term weight per document (parallel to docid_postings)
|
|
|
|
|
|
|
|
@dataclass
class InvertedIndex:
    """In-memory inverted index: one PostingList per term plus the
    vocabulary and document-ID mappings needed to interpret it."""

    posting_lists: List[PostingList]  # tid -> posting list
    vocab: Dict[str, int]  # term -> term ID (tid)
    cid2docid: Dict[str, int]  # collection ID -> internal docid
    collection_ids: List[str]  # docid -> collection ID
    doc_texts: Optional[List[str]] = None  # docid -> raw text (if stored)

    def save(self, output_dir: str) -> None:
        """Pickle the whole index to <output_dir>/index.pkl, creating
        the directory if necessary."""
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, "index.pkl"), "wb") as f:
            pickle.dump(self, f)

    @classmethod
    def from_saved(cls: Type[T], saved_dir: str) -> T:
        """Load an index previously written by `save`.

        Fix: the original constructed an empty throwaway instance that was
        immediately overwritten by `pickle.load`; load directly instead.

        NOTE(review): unpickling an untrusted file can execute arbitrary
        code; only load index files you created yourself.
        """
        with open(os.path.join(saved_dir, "index.pkl"), "rb") as f:
            index = pickle.load(f)
        return index
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class Counting:
    """Raw counting statistics produced by `run_counting` (TFs still
    un-regularized in the posting lists)."""

    posting_lists: List[PostingList]  # tid -> posting list holding raw TFs
    vocab: Dict[str, int]  # term -> term ID (tid)
    cid2docid: Dict[str, int]  # collection ID -> internal docid
    collection_ids: List[str]  # docid -> collection ID
    dfs: List[int]  # tid -> document frequency
    dls: List[int]  # docid -> document length (number of kept tokens)
    avgdl: float  # average document length over the corpus
    nterms: int  # total number of term occurrences in the corpus
    doc_texts: Optional[List[str]] = None  # docid -> raw text (if stored)
|
|
|
|
|
def run_counting(
    documents: Iterable[Document],
    tokenize_fn: Callable[[str], List[str]] = simple_tokenize,
    store_raw: bool = True,
    ndocs: Optional[int] = None,
    show_progress_bar: bool = True,
) -> Counting:
    """Counting TFs, DFs, doc_lengths, etc.

    Args:
        documents: documents to count over; duplicates (same
            collection_id) are skipped.
        tokenize_fn: maps raw text to a token list.
        store_raw: keep the raw document texts in the result.
        ndocs: total number of documents (progress bar only).
        show_progress_bar: display a tqdm progress bar.

    Returns:
        A Counting with per-term posting lists (raw TFs), vocabulary,
        docid mappings, DFs, document lengths, average document length
        and the total term-occurrence count.
    """
    posting_lists: List[PostingList] = []
    vocab: Dict[str, int] = {}
    cid2docid: Dict[str, int] = {}
    collection_ids: List[str] = []
    dfs: List[int] = []  # tid -> number of documents containing the term
    dls: List[int] = []  # docid -> document length
    nterms: int = 0
    # Decide once up front instead of re-assigning None on every iteration.
    doc_texts: Optional[List[str]] = [] if store_raw else None
    for doc in tqdm.tqdm(
        documents,
        desc="Counting",
        total=ndocs,
        disable=not show_progress_bar,
    ):
        if doc.collection_id in cid2docid:
            continue  # skip duplicate documents
        collection_ids.append(doc.collection_id)
        docid = cid2docid.setdefault(doc.collection_id, len(cid2docid))
        toks = tokenize_fn(doc.text)
        tok2tf = Counter(toks)
        dls.append(sum(tok2tf.values()))
        for tok, tf in tok2tf.items():
            nterms += tf
            tid = vocab.get(tok, None)
            if tid is None:
                posting_lists.append(
                    PostingList(term=tok, docid_postings=[], tweight_postings=[])
                )
                tid = vocab.setdefault(tok, len(vocab))
                # Bug fix: a newly seen term already occurs in the current
                # document, so its DF starts at 1 (the original appended 0,
                # undercounting every term's DF by one).
                dfs.append(1)
            else:
                dfs[tid] += 1
            posting_lists[tid].docid_postings.append(docid)
            posting_lists[tid].tweight_postings.append(tf)
        if store_raw:
            doc_texts.append(doc.text)
    return Counting(
        posting_lists=posting_lists,
        vocab=vocab,
        cid2docid=cid2docid,
        collection_ids=collection_ids,
        dfs=dfs,
        dls=dls,
        # Guard against an empty corpus (ZeroDivisionError in the original).
        avgdl=sum(dls) / len(dls) if dls else 0.0,
        nterms=nterms,
        doc_texts=doc_texts,
    )
|
|
|
|
|
from nlp4web_codebase.ir.data_loaders.sciq import load_sciq |
|
|
sciq = load_sciq() |
|
|
counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus)) |
|
|
|
|
|
"""### BM25 Index""" |
|
|
|
|
|
|
|
|
from dataclasses import asdict, dataclass |
|
|
import math |
|
|
import os |
|
|
from typing import Iterable, List, Optional, Type |
|
|
import tqdm |
|
|
from nlp4web_codebase.ir.data_loaders.dm import Document |
|
|
|
|
|
|
|
|
@dataclass
class BM25Index(InvertedIndex):
    """Inverted index whose posting lists store pre-computed BM25 term weights."""

    @staticmethod
    def tokenize(text: str) -> List[str]:
        """Tokenize queries/documents the same way the index was built."""
        return simple_tokenize(text)

    @staticmethod
    def cache_term_weights(
        posting_lists: List[PostingList],
        total_docs: int,
        avgdl: float,
        dfs: List[int],
        dls: List[int],
        k1: float,
        b: float,
    ) -> None:
        """Convert the raw TFs in *posting_lists* to BM25 weights in place."""
        N = total_docs
        for tid, posting_list in enumerate(
            tqdm.tqdm(posting_lists, desc="Regularizing TFs")
        ):
            idf = BM25Index.calc_idf(df=dfs[tid], N=N)
            for i in range(len(posting_list.docid_postings)):
                docid = posting_list.docid_postings[i]
                tf = posting_list.tweight_postings[i]
                dl = dls[docid]
                regularized_tf = BM25Index.calc_regularized_tf(
                    tf=tf, dl=dl, avgdl=avgdl, k1=k1, b=b
                )
                posting_list.tweight_postings[i] = regularized_tf * idf

    @staticmethod
    def calc_regularized_tf(
        tf: int, dl: float, avgdl: float, k1: float, b: float
    ) -> float:
        """BM25 TF saturation term.

        NOTE(review): the textbook formula multiplies the numerator by
        (k1 + 1); omitting it scales every score by the same constant and
        does not change document rankings, so it is kept as-is.
        """
        return tf / (tf + k1 * (1 - b + b * dl / avgdl))

    @staticmethod
    def calc_idf(df: int, N: int) -> float:
        """BM25 IDF with 0.5 smoothing (always positive)."""
        return math.log(1 + (N - df + 0.5) / (df + 0.5))

    @classmethod
    def build_from_documents(
        cls: Type[BM25Index],
        documents: Iterable[Document],
        store_raw: bool = True,
        output_dir: Optional[str] = None,
        ndocs: Optional[int] = None,
        show_progress_bar: bool = True,
        k1: float = 0.9,
        b: float = 0.4,
    ) -> BM25Index:
        """Count term statistics over *documents* and build a BM25 index.

        If *output_dir* is given, the built index is also saved there.
        """
        counting = run_counting(
            documents=documents,
            tokenize_fn=BM25Index.tokenize,
            store_raw=store_raw,
            ndocs=ndocs,
            show_progress_bar=show_progress_bar,
        )
        posting_lists = counting.posting_lists
        total_docs = len(counting.cid2docid)
        BM25Index.cache_term_weights(
            posting_lists=posting_lists,
            total_docs=total_docs,
            avgdl=counting.avgdl,
            dfs=counting.dfs,
            dls=counting.dls,
            k1=k1,
            b=b,
        )
        index = BM25Index(
            posting_lists=posting_lists,
            vocab=counting.vocab,
            cid2docid=counting.cid2docid,
            collection_ids=counting.collection_ids,
            doc_texts=counting.doc_texts,
        )
        # Bug fix: the output_dir parameter was accepted but silently
        # ignored; honor it (default None keeps the old behavior).
        if output_dir is not None:
            index.save(output_dir)
        return index
|
|
|
|
|
bm25_index = BM25Index.build_from_documents( |
|
|
documents=iter(sciq.corpus), |
|
|
ndocs=12160, |
|
|
show_progress_bar=True, |
|
|
) |
|
|
bm25_index.save("output/bm25_index") |
|
|
|
|
|
|
|
|
"""### BM25 Retriever""" |
|
|
|
|
|
from nlp4web_codebase.ir.models import BaseRetriever |
|
|
from typing import Type |
|
|
from abc import abstractmethod |
|
|
|
|
|
|
|
|
class BaseInvertedIndexRetriever(BaseRetriever):
    """Retriever backed by a Python-list-based inverted index."""

    @property
    @abstractmethod
    def index_class(self) -> Type[InvertedIndex]:
        pass

    def __init__(self, index_dir: str) -> None:
        self.index = self.index_class.from_saved(index_dir)

    def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
        """Return the cached weight of each query term in document *cid*."""
        tokens = self.index.tokenize(query)
        target_docid = self.index.cid2docid[cid]
        weights: Dict[str, float] = {}
        for token in tokens:
            tid = self.index.vocab.get(token)
            if tid is None:
                continue
            plist = self.index.posting_lists[tid]
            for docid, weight in zip(plist.docid_postings, plist.tweight_postings):
                if docid == target_docid:
                    weights[token] = weight
                    break
        return weights

    def score(self, query: str, cid: str) -> float:
        """Sum of cached term weights of the query terms in document *cid*."""
        return sum(self.get_term_weights(query=query, cid=cid).values())

    def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
        """Score documents by summing cached term weights over the query
        terms; return the top-k as {collection_id: score}."""
        scores: Dict[int, float] = {}
        for token in self.index.tokenize(query):
            tid = self.index.vocab.get(token)
            if tid is None:
                continue
            plist = self.index.posting_lists[tid]
            for docid, weight in zip(plist.docid_postings, plist.tweight_postings):
                scores[docid] = scores.get(docid, 0) + weight
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        return {self.index.collection_ids[d]: s for d, s in ranked[:topk]}
|
|
|
|
|
|
|
|
class BM25Retriever(BaseInvertedIndexRetriever):
    """Retriever over a saved BM25Index (list-based posting lists)."""

    @property
    def index_class(self) -> Type[BM25Index]:
        return BM25Index
|
|
|
|
|
bm25_retriever = BM25Retriever(index_dir="output/bm25_index") |
|
|
bm25_retriever.retrieve("What type of diseases occur when the immune system attacks normal body cells?") |
|
|
|
|
|
"""# TASK1: tune b and k1 (4 points) |
|
|
|
|
|
Tune b and k1 on the **dev** split of SciQ using the metric MAP@10. The evaluation function (`evaluate_map`) is provided. Record the values in `plots_k1` and `plots_b`. Do it in a greedy manner: as the influence from b is larger, please first tune b (with k1 fixed to the default value 0.9) and use the best value of b to further tune k1.
|
|
|
|
|
$${\displaystyle {\text{score}}(D,Q)=\sum _{i=1}^{n}{\text{IDF}}(q_{i})\cdot {\frac {f(q_{i},D)\cdot (k_{1}+1)}{f(q_{i},D)+k_{1}\cdot \left(1-b+b\cdot {\frac {|D|}{\text{avgdl}}}\right)}}}$$ |
|
|
""" |
|
|
|
|
|
from nlp4web_codebase.ir.data_loaders import Split |
|
|
import pytrec_eval |
|
|
|
|
|
|
|
|
def evaluate_map(rankings: Dict[str, Dict[str, float]], split=Split.dev) -> float:
    """Mean Average Precision at cutoff 10 for the given rankings.

    Args:
        rankings: query_id -> {collection_id: score}.
        split: SciQ split whose relevance judgements to evaluate against.

    Returns:
        MAP@10 averaged over all evaluated queries.
    """
    metric = "map_cut_10"
    qrels = sciq.get_qrels_dict(split)
    # Fix: reuse the qrels fetched above (the original called
    # sciq.get_qrels_dict a second time and left `qrels` unused).
    evaluator = pytrec_eval.RelevanceEvaluator(qrels, (metric,))
    qps = evaluator.evaluate(rankings)
    return float(np.mean([qp[metric] for qp in qps.values()]))
|
|
|
|
|
"""Example of using the pre-requisite code:""" |
|
|
|
|
|
|
|
|
from nlp4web_codebase.ir.data_loaders.sciq import load_sciq |
|
|
sciq = load_sciq() |
|
|
counting = run_counting(documents=iter(sciq.corpus), ndocs=len(sciq.corpus)) |
|
|
|
|
|
|
|
|
bm25_index = BM25Index.build_from_documents( |
|
|
documents=iter(sciq.corpus), |
|
|
ndocs=12160, |
|
|
show_progress_bar=True |
|
|
) |
|
|
bm25_index.save("output/bm25_index") |
|
|
|
|
|
|
|
|
bm25_retriever = BM25Retriever(index_dir="output/bm25_index") |
|
|
print(bm25_retriever.retrieve("What type of diseases occur when the immune system attacks normal body cells?")) |
|
|
|
|
|
import tqdm |
|
|
import numpy as np |
|
|
|
|
|
plots_b: Dict[str, List[float]] = { |
|
|
"X": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], |
|
|
"Y": [] |
|
|
} |
|
|
plots_k1: Dict[str, List[float]] = { |
|
|
"X": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], |
|
|
"Y": [] |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Greedy hyperparameter tuning on the dev split: first b (k1 fixed at 0.9),
# then k1 using the best b found.
result = {}
best_b = 0.0  # b VALUE achieving the best dev MAP so far
best_map_b = 0.0  # best dev MAP observed while tuning b
for b in plots_b["X"]:
    bm25_index = BM25Index.build_from_documents(
        documents=iter(sciq.corpus),
        ndocs=12160,
        show_progress_bar=True,
        k1=0.9,  # keep k1 at its default while tuning b
        b=b,
    )
    bm25_index.save("output/bm25_index")
    bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
    for query in sciq.get_split_queries(Split.dev):
        result[query.query_id] = bm25_retriever.retrieve(query.text)
    # Evaluate once per setting (the original called evaluate_map 3x here).
    map_score = evaluate_map(result)
    if map_score > best_map_b:
        # Bug fix: remember the b VALUE, not the MAP score. The original
        # stored the MAP score in best_b and then passed that score (~0.78)
        # as the b parameter in the k1 loop below.
        best_map_b = map_score
        best_b = b
    plots_b["Y"].append(map_score)

best_k1 = 0.9  # k1 value achieving the best dev MAP so far
best_map_k1 = 0.0
for k1 in plots_k1["X"]:
    bm25_index = BM25Index.build_from_documents(
        documents=iter(sciq.corpus),
        ndocs=12160,
        show_progress_bar=True,
        k1=k1,
        b=best_b,  # best b value found above
    )
    bm25_index.save("output/bm25_index")
    bm25_retriever = BM25Retriever(index_dir="output/bm25_index")
    for query in sciq.get_split_queries(Split.dev):
        result[query.query_id] = bm25_retriever.retrieve(query.text)
    map_score = evaluate_map(result)
    if map_score > best_map_k1:
        best_map_k1 = map_score
        best_k1 = k1
    plots_k1["Y"].append(map_score)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print(plots_k1["Y"][9]) |
|
|
print(plots_b["Y"][1]) |
|
|
|
|
|
|
|
|
print(plots_k1) |
|
|
print(plots_b) |
|
|
|
|
|
from matplotlib import pyplot as plt |
|
|
plt.plot(plots_b["X"], plots_b["Y"], label="b") |
|
|
plt.plot(plots_k1["X"], plots_k1["Y"], label="k1") |
|
|
plt.ylabel("MAP") |
|
|
plt.legend() |
|
|
plt.grid() |
|
|
plt.show() |
|
|
|
|
|
"""Let's check the effectiveness gain on test after this tuning on dev""" |
|
|
|
|
|
default_map = 0.7849 |
|
|
best_b = plots_b["X"][np.argmax(plots_b["Y"])] |
|
|
best_k1 = plots_k1["X"][np.argmax(plots_k1["Y"])] |
|
|
bm25_index = BM25Index.build_from_documents( |
|
|
documents=iter(sciq.corpus), |
|
|
ndocs=12160, |
|
|
show_progress_bar=True, |
|
|
k1=best_k1, |
|
|
b=best_b |
|
|
) |
|
|
bm25_index.save("output/bm25_index") |
|
|
bm25_retriever = BM25Retriever(index_dir="output/bm25_index") |
|
|
rankings = {} |
|
|
for query in sciq.get_split_queries(Split.test): |
|
|
ranking = bm25_retriever.retrieve(query=query.text) |
|
|
rankings[query.query_id] = ranking |
|
|
optimized_map = evaluate_map(rankings, split=Split.test) |
|
|
print(default_map, optimized_map) |
|
|
|
|
|
"""# TASK2: CSC matrix and `CSCBM25Index` (12 points) |
|
|
|
|
|
Recall that we use Python lists to implement posting lists, mapping term IDs to the documents in which they appear. This is inefficient due to its naive design. Actually [Compressed Sparse Column matrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csc_matrix.html) is very suitable for storing the posting lists and can boost the efficiency. |
|
|
|
|
|
## TASK2.1: learn about `scipy.sparse.csc_matrix` (2 point) |
|
|
|
|
|
Convert the matrix \begin{bmatrix} |
|
|
0 & 1 & 0 & 3 \\ |
|
|
10 & 2 & 1 & 0 \\ |
|
|
0 & 0 & 0 & 9 |
|
|
\end{bmatrix} to a `csc_matrix` by specifying `data`, `indices`, `indptr` and `shape`. |
|
|
""" |
|
|
|
|
|
from scipy.sparse._csc import csc_matrix |
|
|
input_matrix = [[0, 1, 0, 3], [10, 2, 1, 0], [0, 0, 0, 9]] |
|
|
data = None |
|
|
indices = None |
|
|
indptr = None |
|
|
shape = None |
|
|
|
|
|
|
|
|
|
|
|
# CSC ("compressed sparse column") stores the matrix column by column:
data = [10, 1, 2, 1, 3, 9]  # non-zero values scanned column by column
indices = [1, 0, 1, 2, 0, 2]  # row index of each entry in `data`
indptr = [0, 1, 3, 4, 6]  # data[indptr[j]:indptr[j+1]] are column j's values
shape = (3, 4)  # 3 rows x 4 columns

output_matrix = csc_matrix((data, indices, indptr), shape=shape)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print((output_matrix.indices + output_matrix.data).tolist()[2]) |
|
|
print((output_matrix.indices + output_matrix.data).tolist()[-1]) |
|
|
|
|
|
|
|
|
print((output_matrix.indices + output_matrix.data).tolist()) |
|
|
|
|
|
"""## TASK2.2: implement `CSCBM25Index` (4 points) |
|
|
|
|
|
Implement `CSCBM25Index` by completing the missing code. Note that `CSCInvertedIndex` is similar to `InvertedIndex` which we talked about during the class. The main difference is posting lists are represented by a CSC sparse matrix. |
|
|
""" |
|
|
|
|
|
@dataclass
class CSCInvertedIndex:
    """Inverted index whose posting lists are stored in a single CSC
    sparse matrix (rows = term IDs, columns = document IDs)."""

    posting_lists_matrix: csc_matrix  # term weights, shape (n_terms, n_docs)
    vocab: Dict[str, int]  # term -> term ID (tid)
    cid2docid: Dict[str, int]  # collection ID -> internal docid
    collection_ids: List[str]  # docid -> collection ID
    doc_texts: Optional[List[str]] = None  # docid -> raw text (if stored)

    def save(self, output_dir: str) -> None:
        """Pickle the whole index to <output_dir>/index.pkl, creating
        the directory if necessary."""
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, "index.pkl"), "wb") as f:
            pickle.dump(self, f)

    @classmethod
    def from_saved(cls: "Type[T]", saved_dir: str) -> "T":
        """Load an index previously written by `save`.

        Fix: the original built an empty throwaway instance that was
        immediately overwritten by `pickle.load`; load directly instead.

        NOTE(review): unpickling an untrusted file can execute arbitrary
        code; only load index files you created yourself.
        """
        with open(os.path.join(saved_dir, "index.pkl"), "rb") as f:
            index = pickle.load(f)
        return index
|
|
|
|
|
def convert_to_csc(posting_lists, num_docs):
    """Pack posting lists into a CSC matrix of shape (n_terms, num_docs):
    entry (tid, docid) holds the term's cached weight in that document."""
    values = []
    row_ids = []  # term IDs (matrix rows)
    col_ids = []  # document IDs (matrix columns)
    for tid, posting in enumerate(posting_lists):
        for docid, weight in zip(posting.docid_postings, posting.tweight_postings):
            values.append(weight)
            row_ids.append(tid)
            col_ids.append(docid)
    return csc_matrix(
        (values, (row_ids, col_ids)),
        shape=(len(posting_lists), num_docs),
        dtype=np.float32,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class CSCBM25Index(CSCInvertedIndex):
    """BM25 index whose cached term weights live in a CSC sparse matrix."""

    @staticmethod
    def tokenize(text: str) -> List[str]:
        """Tokenize queries/documents the same way the index was built."""
        return simple_tokenize(text)

    @staticmethod
    def cache_term_weights(
        posting_lists: List[PostingList],
        total_docs: int,
        avgdl: float,
        dfs: List[int],
        dls: List[int],
        k1: float,
        b: float,
    ) -> csc_matrix:
        """Compute BM25 term weights in place, then pack them into a CSC matrix."""
        N = total_docs
        for tid, posting_list in enumerate(
            tqdm.tqdm(posting_lists, desc="Regularizing TFs")
        ):
            idf = CSCBM25Index.calc_idf(df=dfs[tid], N=N)
            for i in range(len(posting_list.docid_postings)):
                docid = posting_list.docid_postings[i]
                tf = posting_list.tweight_postings[i]
                dl = dls[docid]
                regularized_tf = CSCBM25Index.calc_regularized_tf(
                    tf=tf, dl=dl, avgdl=avgdl, k1=k1, b=b
                )
                posting_list.tweight_postings[i] = regularized_tf * idf
        return convert_to_csc(posting_lists, N)

    @staticmethod
    def calc_regularized_tf(
        tf: int, dl: float, avgdl: float, k1: float, b: float
    ) -> float:
        """BM25 TF saturation term (constant (k1+1) numerator factor
        omitted; this rescales scores uniformly and preserves rankings)."""
        return tf / (tf + k1 * (1 - b + b * dl / avgdl))

    @staticmethod
    def calc_idf(df: int, N: int) -> float:
        """BM25 IDF with 0.5 smoothing (always positive)."""
        return math.log(1 + (N - df + 0.5) / (df + 0.5))

    @classmethod
    def build_from_documents(
        cls: Type[CSCBM25Index],
        documents: Iterable[Document],
        store_raw: bool = True,
        output_dir: Optional[str] = None,
        ndocs: Optional[int] = None,
        show_progress_bar: bool = True,
        k1: float = 0.9,
        b: float = 0.4,
    ) -> CSCBM25Index:
        """Build a CSC-backed BM25 index over *documents*.

        If *output_dir* is given, the built index is also saved there.
        """
        counting = run_counting(
            documents=documents,
            tokenize_fn=CSCBM25Index.tokenize,
            store_raw=store_raw,
            ndocs=ndocs,
            show_progress_bar=show_progress_bar,
        )
        posting_lists = counting.posting_lists
        total_docs = len(counting.cid2docid)
        posting_lists_matrix = CSCBM25Index.cache_term_weights(
            posting_lists=posting_lists,
            total_docs=total_docs,
            avgdl=counting.avgdl,
            dfs=counting.dfs,
            dls=counting.dls,
            k1=k1,
            b=b,
        )
        index = CSCBM25Index(
            posting_lists_matrix=posting_lists_matrix,
            vocab=counting.vocab,
            cid2docid=counting.cid2docid,
            collection_ids=counting.collection_ids,
            doc_texts=counting.doc_texts,
        )
        # Bug fix: the output_dir parameter was accepted but silently
        # ignored; honor it (default None keeps the old behavior).
        if output_dir is not None:
            index.save(output_dir)
        return index
|
|
|
|
|
csc_bm25_index = CSCBM25Index.build_from_documents( |
|
|
documents=iter(sciq.corpus), |
|
|
ndocs=12160, |
|
|
show_progress_bar=True, |
|
|
k1=0.9, |
|
|
b=0.8 |
|
|
) |
|
|
csc_bm25_index.save("output/csc_bm25_index") |
|
|
|
|
|
print(len(str(os.path.getsize("output/csc_bm25_index/index.pkl")))) |
|
|
print(os.path.getsize("output/csc_bm25_index/index.pkl") // int(1e5)) |
|
|
|
|
|
|
|
|
print(len(str(os.path.getsize("output/csc_bm25_index/index.pkl")))) |
|
|
print(os.path.getsize("output/csc_bm25_index/index.pkl") // int(1e5)) |
|
|
|
|
|
|
|
|
print(os.path.getsize("output/csc_bm25_index/index.pkl")) |
|
|
|
|
|
"""We can compare the size of the CSC-based index with the Python-list-based index:""" |
|
|
|
|
|
print(os.path.getsize("output/bm25_index/index.pkl")) |
|
|
|
|
|
"""## TASK2.3: implement `CSCInvertedIndexRetriever` (6 points) |
|
|
|
|
|
Implement `CSCInvertedIndexRetriever` by completing the missing code. |
|
|
""" |
|
|
|
|
|
from nlp4web_codebase.ir.models import BaseRetriever |
|
|
from typing import Type |
|
|
from abc import abstractmethod |
|
|
|
|
|
|
|
|
class BaseInvertedIndexRetriever(BaseRetriever):
    """Retriever backed by a list-based inverted index.

    NOTE(review): this is a byte-identical re-execution of the class
    defined earlier in the notebook (duplicated cell); kept unchanged.
    """

    @property
    @abstractmethod
    def index_class(self) -> Type[InvertedIndex]:
        pass

    def __init__(self, index_dir: str) -> None:
        self.index = self.index_class.from_saved(index_dir)

    def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
        """Return the cached weight of each query term in document *cid*."""
        toks = self.index.tokenize(query)
        target_docid = self.index.cid2docid[cid]
        term_weights = {}
        for tok in toks:
            if tok not in self.index.vocab:
                continue
            tid = self.index.vocab[tok]
            posting_list = self.index.posting_lists[tid]
            for docid, tweight in zip(
                posting_list.docid_postings, posting_list.tweight_postings
            ):
                if docid == target_docid:
                    term_weights[tok] = tweight
                    break
        return term_weights

    def score(self, query: str, cid: str) -> float:
        """Sum of cached query-term weights in document *cid*."""
        return sum(self.get_term_weights(query=query, cid=cid).values())

    def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
        """Score all matching documents and return the top-k as
        {collection_id: score}."""
        toks = self.index.tokenize(query)
        docid2score: Dict[int, float] = {}
        for tok in toks:
            if tok not in self.index.vocab:
                continue
            tid = self.index.vocab[tok]
            posting_list = self.index.posting_lists[tid]
            for docid, tweight in zip(
                posting_list.docid_postings, posting_list.tweight_postings
            ):
                docid2score.setdefault(docid, 0)
                docid2score[docid] += tweight
        docid2score = dict(
            sorted(docid2score.items(), key=lambda pair: pair[1], reverse=True)[:topk]
        )
        return {
            self.index.collection_ids[docid]: score
            for docid, score in docid2score.items()
        }
|
|
|
|
|
|
|
|
class BM25Retriever(BaseInvertedIndexRetriever):
    """Retriever over a saved BM25Index (duplicate of the earlier
    notebook cell; kept unchanged)."""

    @property
    def index_class(self) -> Type[BM25Index]:
        return BM25Index
|
|
|
|
|
bm25_retriever = BM25Retriever(index_dir="output/bm25_index") |
|
|
bm25_retriever.retrieve("What type of diseases occur when the immune system attacks normal body cells?") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bm25_retriever = BM25Retriever(index_dir="output/bm25_index") |
|
|
query = "What type of diseases occur when the immune system attacks normal body cells?" |
|
|
print(bm25_retriever.get_term_weights(query=query, cid="train-2006")) |
|
|
print(bm25_retriever.retrieve(query)) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class BaseCSCInvertedIndexRetriever(BaseRetriever):
    """Retriever backed by a CSC-matrix inverted index."""

    @property
    @abstractmethod
    def index_class(self) -> Type[CSCInvertedIndex]:
        pass

    def __init__(self, index_dir: str) -> None:
        self.index = self.index_class.from_saved(index_dir)

    def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
        """Look up each query term's cached weight in document *cid*
        directly via CSC matrix indexing."""
        tokens = self.index.tokenize(query)
        target_docid = self.index.cid2docid.get(cid, None)
        if target_docid is None:
            return {}
        weights: Dict[str, float] = {}
        for token in tokens:
            tid = self.index.vocab.get(token)
            if tid is None:
                continue
            weight = self.index.posting_lists_matrix[tid, target_docid]
            if weight != 0:
                weights[token] = weight
        return weights

    def score(self, query: str, cid: str) -> float:
        """Sum of cached query-term weights in document *cid*."""
        return sum(self.get_term_weights(query=query, cid=cid).values())

    def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
        """Score documents by accumulating cached term weights over the
        query terms; return the top-k as {collection_id: score}."""
        scores: Dict[int, float] = {}
        for token in self.index.tokenize(query):
            tid = self.index.vocab.get(token)
            if tid is None:
                continue
            # One matrix row holds the whole posting list of this term.
            term_row = self.index.posting_lists_matrix.getrow(tid)
            for docid, weight in zip(term_row.indices, term_row.data):
                scores[docid] = scores.get(docid, 0) + weight
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:topk]
        return {self.index.collection_ids[d]: s for d, s in ranked}
|
|
|
|
|
class CSCBM25Retriever(BaseCSCInvertedIndexRetriever):
    """Retriever over a saved CSCBM25Index."""

    @property
    def index_class(self) -> Type[CSCBM25Index]:
        return CSCBM25Index
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
csc_bm25_retriever = CSCBM25Retriever(index_dir="output/csc_bm25_index") |
|
|
query = "Who proposed the theory of evolution by natural selection?" |
|
|
print(csc_bm25_retriever.get_term_weights(query=query, cid="train-2006")) |
|
|
print(csc_bm25_retriever.retrieve(query)) |
|
|
|
|
|
|
|
|
csc_bm25_retriever = CSCBM25Retriever(index_dir="output/csc_bm25_index") |
|
|
query = "What are the differences between immunodeficiency and autoimmune diseases?" |
|
|
print(csc_bm25_retriever.get_term_weights(query=query, cid="train-1691")) |
|
|
print(csc_bm25_retriever.retrieve("What are the differences between immunodeficiency and autoimmune diseases?")) |
|
|
|
|
|
"""# TASK3: a search-engine demo based on Huggingface space (4 points) |
|
|
|
|
|
## TASK3.1: create the gradio app (2 point) |
|
|
|
|
|
Create a gradio app to demo the BM25 search engine index on SciQ. The app should have a single input variable for the query (of type `str`) and a single output variable for the returned ranking (of type `List[Hit]` in the code below). Please use the BM25 system with default k1 and b values. |
|
|
|
|
|
Hint: it should use a "search" function of signature: |
|
|
|
|
|
```python |
|
|
def search(query: str) -> List[Hit]: |
|
|
... |
|
|
``` |
|
|
""" |
|
|
|
|
|
|
|
|
|
|
|
import gradio as gr |
|
|
from typing import TypedDict |
|
|
|
|
|
class Hit(TypedDict):
    """One search result returned by the demo app."""

    cid: str  # collection ID of the hit document
    score: float  # retrieval score
    text: str  # raw document text (or a placeholder if not stored)

# Placeholder for the gradio app; assigned further below.
demo: Optional[gr.Interface] = None
# Expected return type of the `search` function.
return_type = List[Hit]
|
|
|
|
|
|
|
|
def search(query: str) -> List[Hit]:
    """Run BM25 retrieval for *query* and return the hits with their
    collection ID, score and raw text."""
    hits: List[Hit] = []
    for cid, score in bm25_retriever.retrieve(query).items():
        docid = bm25_retriever.index.cid2docid.get(cid, None)
        if docid is None:
            continue
        if bm25_retriever.index.doc_texts:
            text = bm25_retriever.index.doc_texts[docid]
        else:
            text = "Text not available."
        hits.append({"cid": cid, "score": score, "text": text})
    return hits
|
|
|
|
|
demo = gr.Interface( |
|
|
fn=search, |
|
|
inputs=gr.Textbox(lines=2, placeholder="Enter your query here...", label="Search Query"), |
|
|
outputs=gr.Textbox(label="Search Results", lines=10), |
|
|
title="BM25 Search Engine", |
|
|
description=""" |
|
|
BM25 |
|
|
""" |
|
|
) |
|
|
|
|
|
|
|
|
demo.launch() |
|
|
|
|
|
|
|
|
import requests
import json

headers = {"Content-Type": "application/json"}
data = {"data": ["What type of organism is commonly used in preparation of foods such as cheese and yogurt?"]}
# Submit the query to the locally running gradio app's predict endpoint.
response = requests.post(f"{demo.local_api_url.strip('/')}/call/predict", headers=headers, data=json.dumps(data))
event_id = response.json()["event_id"]
# Stream the result for the submitted event.
response = requests.get(f"{demo.local_api_url.strip('/')}/call/predict/{event_id}", stream=True)
lines = list(response.iter_lines())
# SECURITY NOTE(review): eval() on data received over the network can
# execute arbitrary code; ast.literal_eval would be a safer way to parse
# the repr'd hit list. Left as-is to match the notebook's grading cells.
print(eval(json.loads(lines[1].decode("UTF-8").replace("data:", ""))[0]))
|
|
|
|
|
|
|
|
import requests |
|
|
import json |
|
|
|
|
|
headers = {"Content-Type": "application/json"} |
|
|
data = {"data": ["What are the differences between immunodeficiency and autoimmune diseases?"]} |
|
|
response = requests.post(f"{demo.local_api_url.strip('/')}/call/predict", headers=headers, data=json.dumps(data)) |
|
|
event_id = response.json()["event_id"] |
|
|
response = requests.get(f"{demo.local_api_url.strip('/')}/call/predict/{event_id}", stream=True) |
|
|
lines = list(response.iter_lines()) |
|
|
print(eval(json.loads(lines[1].decode("UTF-8").replace("data:", ""))[0])) |
|
|
|
|
|
"""## TASK3.2: upload it to Huggingface Space (2 point) |
|
|
|
|
|
Upload your gradio app to Huggingface Space. Put your URL to the Space app in the variable `hf_space_url`. |
|
|
|
|
|
IMPORTANT!!! You can get this URL from: |
|
|
|
|
|
*Your Space page* -> *"three dots" on the top right* -> "embedd this space" -> "Direct URL" |
|
|
|
|
|
An example URL (not for our task) is: https://stabilityai-stable-diffusion-3-5-large.hf.space (from https://huggingface.co/spaces/stabilityai/stable-diffusion-3.5-large) |
|
|
""" |
|
|
|
|
|
hf_space_url: Optional[str] = None |
|
|
|
|
|
hf_space_url: Optional[str] = "https://intelava-nlp4web.hf.space" |
|
|
|
|
|
|
|
|
|
|
|
import requests |
|
|
import json |
|
|
|
|
|
print(hf_space_url) |
|
|
headers = {"Content-Type": "application/json"} |
|
|
data = {"data": ["What are the differences between immunodeficiency and autoimmune diseases?"]} |
|
|
response = requests.post(f"{hf_space_url.strip('/')}/call/predict", headers=headers, data=json.dumps(data)) |
|
|
event_id = response.json()["event_id"] |
|
|
response = requests.get(f"{hf_space_url.strip('/')}/call/predict/{event_id}", stream=True) |
|
|
lines = list(response.iter_lines()) |
|
|
print(eval(json.loads(lines[1].decode("UTF-8").replace("data:", ""))[0])) |
|
|
|
|
|
|
|
|
import requests |
|
|
import json |
|
|
|
|
|
headers = {"Content-Type": "application/json"} |
|
|
data = {"data": ["Changes from a less-ordered state to a more-ordered state (such as a liquid to a solid) are always what?"]} |
|
|
response = requests.post(f"{hf_space_url.strip('/')}/call/predict", headers=headers, data=json.dumps(data)) |
|
|
event_id = response.json()["event_id"] |
|
|
response = requests.get(f"{hf_space_url.strip('/')}/call/predict/{event_id}", stream=True) |
|
|
lines = list(response.iter_lines()) |
|
|
print(eval(json.loads(lines[1].decode("UTF-8").replace("data:", ""))[0])) |
|
|
|
|
|
|