Spaces:
Sleeping
Sleeping
| from core.preprocessing_pipeline import preprocessing_pipeline | |
| from rank_bm25 import BM25Okapi | |
| import pandas as pd | |
# Read the standards table and discard, in place, every row with a missing value.
df = pd.read_csv('data/CCSS Common Core Standards(English Standards).csv')
df.dropna(inplace=True)

# Replace each standard's text with the output of the project preprocessing
# pipeline, so the indexed documents and later queries share one normalization.
df['State Standard'] = df['State Standard'].apply(
    lambda raw_text: preprocessing_pipeline(raw_text).preprocess()
)

# BM25Okapi is given one token list per standard: lowercase, whitespace-split.
tokenized_docs = [standard.lower().split() for standard in df['State Standard']]
bm25 = BM25Okapi(tokenized_docs)
class bm25_utility:
    """Rank the standards in ``df`` against a free-text query using BM25.

    Uses the module-level ``bm25`` index and the module-level ``df`` table.
    The index is built from ``df['State Standard']``, so position ``i`` in the
    score array corresponds to ``df.iloc[i]``.
    """

    def __init__(self, text, top_n=5):
        """Store the query and the result limit.

        Args:
            text: Raw query text. It is preprocessed when retrieval runs.
            top_n: Maximum number of results to return. ``None`` means
                "return every document"; zero or a negative value yields
                an empty result list.
        """
        self.text = text
        self.top_n = top_n

    @staticmethod
    def _top_indices(scores, top_n):
        """Return the positions of the ``top_n`` highest scores, best first.

        Ties keep their original order (lower position first), matching a
        stable descending sort.
        """
        import heapq  # local import: the file's import block stays untouched

        positions = range(len(scores))
        if top_n is None:
            # No limit requested: rank every document.
            return sorted(positions, key=scores.__getitem__, reverse=True)
        if top_n <= 0:
            # A negative slice bound would silently drop results from the
            # *end* of the ranking instead of returning nothing.
            return []
        # Avoids sorting the whole corpus when only a few hits are needed.
        return heapq.nlargest(top_n, positions, key=scores.__getitem__)

    def retrieve_top_n_bm25(self):
        """Score every indexed standard against the query and return the best.

        The query goes through the same preprocessing pipeline as the indexed
        documents and is then split on whitespace.

        Returns:
            A list of dicts, best match first, with the keys ``"ID"``,
            ``"Category"``, ``"Sub Category"``, ``"standard"`` (the
            preprocessed standard text) and ``"score"`` (BM25 score rounded
            to 4 decimal places).
        """
        preprocessed_text = preprocessing_pipeline(self.text).preprocess()
        tokenized_query = preprocessed_text.split()
        scores = bm25.get_scores(tokenized_query)

        # Columns read from each row: ID, Category, Sub Category, State Standard.
        results = []
        for idx in self._top_indices(scores, self.top_n):
            row = df.iloc[idx]
            results.append({
                "ID": row["ID"],
                "Category": row["Category"],
                "Sub Category": row["Sub Category"],
                "standard": row["State Standard"],
                "score": round(scores[idx], 4),
            })
        return results
if __name__ == "__main__":
    # Demo query. Guarded so that it runs only when this file is executed
    # directly; importing the module no longer triggers a retrieval and a
    # print as a side effect.
    query = "Identify the main idea of a text"
    bm25_utility_instance = bm25_utility(query, top_n=5)
    top_n_results = bm25_utility_instance.retrieve_top_n_bm25()
    print(top_n_results)