CCSS_Alignment / core /bm25_utility.py
meruem123's picture
Upload 15 files
34a1c85 verified
from core.preprocessing_pipeline import preprocessing_pipeline
from rank_bm25 import BM25Okapi
import pandas as pd
def _preprocess_standard(raw_text):
    """Return *raw_text* after running it through ``preprocessing_pipeline``."""
    return preprocessing_pipeline(raw_text).preprocess()


# Read the standards table; rows containing any missing value are dropped in place.
df = pd.read_csv('data/CCSS Common Core Standards(English Standards).csv')
df.dropna(inplace=True)

# Overwrite the raw standard text with its preprocessed form, so the same
# column serves both as the BM25 corpus and as the text returned to callers.
df['State Standard'] = df['State Standard'].apply(_preprocess_standard)

# BM25Okapi takes one token list per document: lowercase, then split on whitespace.
tokenized_docs = [
    standard_text.lower().split()
    for standard_text in df['State Standard']
]
bm25 = BM25Okapi(tokenized_docs)
class bm25_utility:
    """Rank the loaded standards against a free-text query using BM25.

    Scoring uses the module-level ``bm25`` index and result rows are read
    from the module-level ``df`` DataFrame; both are built when the module
    is loaded.

    Args:
        text: Raw query text. It is passed through ``preprocessing_pipeline``
            before being scored.
        top_n: Number of highest-scoring rows to return (default 5).
    """

    def __init__(self, text, top_n=5):
        self.text = text
        self.top_n = top_n

    def retrieve_top_n_bm25(self):
        """Return the ``top_n`` best-matching standards for ``self.text``.

        Returns:
            A list of dicts ordered by descending BM25 score. Each dict has
            the keys ``"ID"``, ``"Category"``, ``"Sub Category"``,
            ``"standard"`` (the preprocessed text stored in ``df``) and
            ``"score"`` (the BM25 score rounded to 4 decimal places).
        """
        preprocessed_text = preprocessing_pipeline(self.text).preprocess()
        # The corpus was tokenized with ``doc.lower().split()``; the query must
        # be lowercased the same way or tokens containing uppercase characters
        # would never match the index.
        tokenized_query = preprocessed_text.lower().split()
        scores = bm25.get_scores(tokenized_query)

        # ``sorted`` is stable, so equally scored rows keep their corpus order.
        top_indices = sorted(
            range(len(scores)), key=lambda i: scores[i], reverse=True
        )[:self.top_n]

        # Columns read from df: ID, Category, Sub Category, State Standard.
        # Scores are positional (one per tokenized document), hence ``iloc``.
        results = []
        for idx in top_indices:
            row = df.iloc[idx]
            results.append({
                "ID": row["ID"],
                "Category": row["Category"],
                "Sub Category": row["Sub Category"],
                "standard": row["State Standard"],
                "score": round(scores[idx], 4),
            })
        return results
if __name__ == "__main__":
    # Demo query. Guarded so that importing this module from other code does
    # not run a retrieval and print to stdout as a side effect.
    query = "Identify the main idea of a text"
    bm25_utility_instance = bm25_utility(query, top_n=5)
    top_n_results = bm25_utility_instance.retrieve_top_n_bm25()
    print(top_n_results)