Spaces:

meruem123
/

CCSS_Alignment

Sleeping

App Files Files Community

meruem123 commited on Jul 25, 2025

Commit

34a1c85

verified ·

1 Parent(s): 703a7c0

Upload 15 files

Browse files

Files changed (15) hide show

Readme.md +52 -0
app.py +71 -0
core/__init__.py +0 -0
core/__pycache__/__init__.cpython-312.pyc +0 -0
core/__pycache__/bm25_utility.cpython-312.pyc +0 -0
core/__pycache__/preprocessing_pipeline.cpython-312.pyc +0 -0
core/__pycache__/splade_utility.cpython-312.pyc +0 -0
core/bm25_utility.py +49 -0
core/main.py +5 -0
core/preprocessing_pipeline.py +42 -0
core/splade_utility.py +69 -0
data/CCSS Common Core Standards(English Standards).csv +0 -0
data/data.csv +0 -0
notebook/ccss_standard_mapper.ipynb +1120 -0
requirements.txt +24 -0

Readme.md ADDED Viewed

	@@ -0,0 +1,52 @@

+# 📚 CCSS Alignment with BM25 & SPLADE
+This project allows you to align input educational text (lesson plans, learning objectives) with Common Core State Standards (ELA) using two retrieval techniques:
+- **BM25** (sparse lexical search)
+- **SPLADE** (sparse transformer embeddings)
+## 🚀 How to Run the App
+Make sure you're in the project root folder, then run:
+```bash
+streamlit run app.py
+```
+You will be able to:
+- Select either BM25 or SPLADE
+- Input a query (e.g., "identify key ideas and details")
+- View top-matching CCSS standards
+- Compare accuracy between both retrieval models
+## 🧪 Sample Starter Code for app.py
+```python
+import streamlit as st
+from core.bm25_utility import bm25_utility
+from core.splade_utility import SpladeUtility
+query = st.text_input("Enter your query:")
+method = st.selectbox("Choose retrieval method", ["BM25", "SPLADE"])
+if st.button("Get Standards"):
+    if method == "BM25":
+        results = bm25_utility(query).retrieve_top_n_bm25()
+    else:
+        results = SpladeUtility(query).retrieve_top_n_splade()
+    for r in results:
+        st.write(f"**{r['ID']}** - {r['standard']} (Score: {r['score']})")
+```
+## 📝 Notes
+- Ensure that model weights for SPLADE are downloaded or cached.
+- Make sure you're using cleaned and preprocessed CCSS data for accurate matching.
+- Streamlit interface supports rapid switching between BM25 and SPLADE for testing.
+---
+**Author**: Shivendra Gupta
+**Purpose**: Educational NLP for aligning teaching content to learning standards.

app.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import streamlit as st
+import pandas as pd
+import json
+import matplotlib.pyplot as plt
+from core.splade_utility import splade_utility
+from core.bm25_utility import bm25_utility
+# ==== Import your models and functions ====
+# Assume these are already defined in your notebook/script
+# from your_code import retrieve_top_n_bm25, retrieve_top_n_splade, evaluate_top1_on_state_standard
+# Dummy accuracy values (replace with real ones from your code)
+bm25_accuracy = 0.9959
+splade_accuracy = 0.9797
+# Dummy placeholder functions (replace with your actual ones)
+def retrieve_top_n_bm25(query, top_n=5):
+    bm25_utility_instance = bm25_utility(query, top_n=5)
+    top_n_results = bm25_utility_instance.retrieve_top_n_bm25()
+    return top_n_results
+def retrieve_top_n_splade(query, top_n=5):
+    splade_utility_instance = splade_utility(query, top_n=top_n)
+    return splade_utility_instance.retrieve_top_n_splade()
+# ==== Streamlit UI ====
+st.set_page_config(page_title="CCSS Alignment", layout="centered")
+st.title("📚 CCSS Alignment Search")
+# Select model
+model_choice = st.radio("Select Retrieval Model:", ["BM25", "SPLADE"])
+# Accuracy bar chart
+st.subheader("🎯 Model Top-1 Accuracy")
+fig, ax = plt.subplots()
+ax.bar(["BM25", "SPLADE"], [bm25_accuracy, splade_accuracy], color=["skyblue", "lightgreen"])
+ax.set_ylim([0.9, 1.01])
+ax.set_ylabel("Top-1 Accuracy")
+for i, acc in enumerate([bm25_accuracy, splade_accuracy]):
+    ax.text(i, acc + 0.001, f"{acc:.4f}", ha='center', fontsize=10)
+st.pyplot(fig)
+# Query input
+st.subheader("🔍 Try a Query")
+query = st.text_area("Enter a lesson or objective text:", height=100)
+# Search button
+if st.button("Search"):
+    st.subheader("📄 Top Results")
+    if model_choice == "BM25":
+        results = retrieve_top_n_bm25(query, top_n=5)
+    else:
+        results = retrieve_top_n_splade(query, top_n=5)
+    if results:
+        for i, r in enumerate(results, 1):
+            st.markdown(f"""
+            **Rank {i}**
+            - **Standard**: {r['standard']}
+            - **ID**: {r.get('ID', 'N/A')}
+            - **Category**: {r.get('Category', 'N/A')}
+            - **Sub Category**: {r.get('Sub Category', 'N/A')}
+            - **Score**: `{r['score']}`
+            """)
+    else:
+        st.warning("No results found.")

core/__init__.py ADDED Viewed

File without changes

core/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (161 Bytes). View file

core/__pycache__/bm25_utility.cpython-312.pyc ADDED Viewed

Binary file (2.77 kB). View file

core/__pycache__/preprocessing_pipeline.cpython-312.pyc ADDED Viewed

Binary file (3.25 kB). View file

core/__pycache__/splade_utility.cpython-312.pyc ADDED Viewed

Binary file (4.6 kB). View file

core/bm25_utility.py ADDED Viewed

	@@ -0,0 +1,49 @@

+from core.preprocessing_pipeline import preprocessing_pipeline
+from rank_bm25 import BM25Okapi
+import pandas as pd
+# Load the dataset
+df = pd.read_csv('data/CCSS Common Core Standards(English Standards).csv')
+df.dropna(inplace=True)
+df['State Standard'] = df['State Standard'].apply(lambda x: preprocessing_pipeline(x).preprocess())
+# Tokenize the documents for BM25
+tokenized_docs = [doc.lower().split() for doc in df['State Standard']]
+bm25 = BM25Okapi(tokenized_docs)
+class bm25_utility:
+    def __init__(self,text,top_n=5):
+        self.text = text
+        self.top_n = top_n
+    def retrieve_top_n_bm25(self):
+        preprocessing_pipeline_instance = preprocessing_pipeline(self.text)
+        preprocessed_text = preprocessing_pipeline_instance.preprocess()
+        tokenized_query = preprocessed_text.split()
+        scores = bm25.get_scores(tokenized_query)
+        top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:self.top_n]
+        # ID	Category	Sub Category	State Standard
+        results = []
+        for idx in top_indices:
+            row = df.iloc[idx]
+            results.append({
+                "ID": row["ID"],
+                "Category": row["Category"],
+                "Sub Category": row["Sub Category"],
+                "standard": row["State Standard"],
+                "score": round(scores[idx], 4)
+            })
+        return results
+query = "Identify the main idea of a text"
+bm25_utility_instance = bm25_utility(query, top_n=5)
+top_n_results = bm25_utility_instance.retrieve_top_n_bm25()
+print(top_n_results)

core/main.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from preprocessing_pipeline import preprocessing_pipeline
+preprocessing_pipeline_instance = preprocessing_pipeline("i am a student and i am learning how to code. hi, how are you? and what are you doing?")
+preprocessed_text = preprocessing_pipeline_instance.preprocess()
+print(preprocessed_text)

core/preprocessing_pipeline.py ADDED Viewed

	@@ -0,0 +1,42 @@

+import re
+import pandas as pd
+import re
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+stop_words = set(stopwords.words('english'))
+class preprocessing_pipeline:
+    def __init__(self,text):
+        self.text = text
+    def preprocess(self):
+        self.text = self.clean_text(self.text)
+        self.text = self.lowercase(self.text)
+        self.text = self.remove_punctuation(self.text)
+        self.text = self.remove_stopwords(self.text)
+        self.text = self.lemmatize_tokens(self.text)
+        return self.text
+    def clean_text(self , text: str) -> str:
+        text = text.strip()
+        text = text.replace("\n", " ").replace("\xa0", " ")
+        text = text.replace("“", "\"").replace("”", "\"").replace("–", "-")
+        return text
+    def lowercase(self, text: str) -> str:
+        return text.lower()
+    def remove_punctuation(self, text: str) -> str:
+        return re.sub(r"[^\w\s]", "", text)
+    def remove_stopwords(self, text: str) -> str:
+        tokens = word_tokenize(text)
+        return ' '.join([word for word in tokens if word not in stop_words])
+    def lemmatize_tokens(self, text: str) -> str:
+        tokens = word_tokenize(text)
+        lemmatizer = WordNetLemmatizer()
+        return ' '.join([lemmatizer.lemmatize(token) for token in tokens])

core/splade_utility.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import pandas as pd
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+import torch
+import torch.nn.functional as F
+model_name = "naver/splade-cocondenser-ensembledistil"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForMaskedLM.from_pretrained(model_name)
+model.eval()
+df = pd.read_csv('data/CCSS Common Core Standards(English Standards).csv')
+df.dropna(inplace=True)
+# Reset index to align doc IDs
+class splade_utility:
+    def __init__(self, query, top_n=5):
+        self.query = query
+        self.top_n = top_n
+    @staticmethod
+    def get_splade_sparse_vector(text):
+        with torch.no_grad():
+            inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
+            logits = model(**inputs).logits.squeeze(0)  # [seq_len, vocab_size]
+            relu_out = F.relu(logits)
+            splade_weights = torch.log1p(relu_out).max(dim=0).values
+            indices = torch.nonzero(splade_weights).squeeze()
+            return {
+                tokenizer.convert_ids_to_tokens([i.item()])[0]: splade_weights[i].item()
+                for i in indices
+            }
+    def dot_product_sparse(self , query_vec, doc_vec):
+        return sum(query_vec.get(term, 0.0) * doc_vec.get(term, 0.0) for term in query_vec)
+    def retrieve_top_n_splade(self):
+        query_vec = self.get_splade_sparse_vector(self.query)
+        scores = [
+            (self.dot_product_sparse(query_vec, doc_vec), idx)
+            for idx, doc_vec in enumerate(splade_doc_vectors)
+        ]
+        top_matches = sorted(scores, reverse=True)[:self.top_n]
+        results = []
+        for score, idx in top_matches:
+            results.append({
+                "score": round(score, 4),
+                "standard": df.iloc[idx]["State Standard"],
+                "ID": df.iloc[idx]["ID"],
+                "Category": df.iloc[idx]["Category"],
+                "Sub Category": df.iloc[idx]["Sub Category"]
+            })
+        return results
+df = df.reset_index(drop=True)
+# Get list of standard texts
+standard_texts = df["State Standard"].astype(str).tolist()
+# Compute sparse vectors
+splade_doc_vectors = [splade_utility.get_splade_sparse_vector(text) for text in (standard_texts)]
+# Example usage
+query = "determine main idea text explain supported key detail summarize text"
+splade_instance = splade_utility(query)
+results = splade_instance.retrieve_top_n_splade()
+print(results)

data/CCSS Common Core Standards(English Standards).csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/data.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

notebook/ccss_standard_mapper.ipynb ADDED Viewed

	@@ -0,0 +1,1120 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "e2192e55",
+   "metadata": {},
+   "source": [
+    "#  Project: CCSS Standard Alignment using BM25 and SPLADE\n",
+    "\n",
+    "---\n",
+    "\n",
+    "## Background\n",
+    "\n",
+    "### BM25 (Best Matching 25)\n",
+    "\n",
+    "BM25 is a **traditional lexical retrieval model** used in information retrieval systems (like search engines). It ranks documents based on the **term frequency–inverse document frequency (TF-IDF)** concept, with additional normalization for document length.\n",
+    "\n",
+    "**Core Characteristics:**\n",
+    "- Lexical-only: matches exact words (not synonyms/paraphrases)\n",
+    "- Scores documents using a tunable function of:\n",
+    "  - **Term frequency (TF)** – how often a query term appears in the doc\n",
+    "  - **Inverse Document Frequency (IDF)** – how rare the term is overall\n",
+    "  - **Document length normalization**\n",
+    "- Fast and interpretable\n",
+    "\n",
+    "**Strengths:**\n",
+    "- Simple and fast\n",
+    "- Strong for keyword-heavy queries\n",
+    "- Works well on small datasets\n",
+    "\n",
+    "**Limitations:**\n",
+    "- Cannot understand synonyms, rephrasing, or context\n",
+    "\n",
+    "---\n",
+    "\n",
+    "### SPLADE (Sparse Lexical and Expansion Model)\n",
+    "\n",
+    "SPLADE is a **neural sparse retriever** that combines the **interpretability of sparse vectors** with the **semantic power of transformers (like BERT)**.\n",
+    "\n",
+    "**How it works:**\n",
+    "- Instead of dense embeddings (like BERT or SBERT), SPLADE generates **sparse term-weighted vectors**\n",
+    "- These vectors can:\n",
+    "  - Activate terms **not explicitly in the query** (semantic expansion)\n",
+    "  - Assign importance scores to vocabulary terms\n",
+    "- Supports use of **inverted indexes** like BM25, but with neural knowledge\n",
+    "\n",
+    "**Strengths:**\n",
+    "- Captures paraphrasing and synonyms\n",
+    "- Sparse and interpretable\n",
+    "- Works better on natural language queries\n",
+    "\n",
+    "**Limitations:**\n",
+    "- Slower than BM25\n",
+    "- Requires GPU for efficient inference\n",
+    "\n",
+    "---\n",
+    "\n",
+    "## Project Overview\n",
+    "\n",
+    "### Goal:\n",
+    "\n",
+    "Build a system that **automatically aligns educational content (e.g., lesson descriptions, learning objectives)** to the most relevant **Common Core State Standards (CCSS)** for English Language Arts (ELA).\n",
+    "\n",
+    "---\n",
+    "\n",
+    "### Approach:\n",
+    "\n",
+    "We implement and compare **two retrieval pipelines**:\n",
+    "\n",
+    "| Component     | Pipeline 1           | Pipeline 2             |\n",
+    "|---------------|----------------------|------------------------|\n",
+    "| Model         | BM25                 | SPLADE                 |\n",
+    "| Representation | Token frequency      | Sparse transformer weights |\n",
+    "| Input         | Free-form text       | Free-form text         |\n",
+    "| Output        | Top-N most relevant CCSS standards with scores |\n",
+    "\n",
+    "---\n",
+    "\n",
+    "### Dataset:\n",
+    "\n",
+    "- Source: `CCSS Common Core Standards.xlsx`\n",
+    "- Focus: Only **ELA standards**\n",
+    "- Fields used: `ID`, `Sub Category`, `State Standard`\n",
+    "\n",
+    "---\n",
+    "\n",
+    "### Output Format:\n",
+    "\n",
+    "Each pipeline returns a list of matches:\n",
+    "```json\n",
+    "[\n",
+    "  {\n",
+    "    \"rank\": 1,\n",
+    "    \"score\": 10.87,\n",
+    "    \"ID\": \"4.RI.2\",\n",
+    "    \"Category\": \"Reading Informational\",\n",
+    "    \"Sub Category\": \"Key Ideas and Details\",\n",
+    "    \"State Standard\": \"Determine the main idea of a text...\"\n",
+    "  },\n",
+    "  ...\n",
+    "]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "cfa8b1b6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import re\n",
+    "from nltk.tokenize import word_tokenize\n",
+    "from nltk.corpus import stopwords\n",
+    "from nltk.stem import WordNetLemmatizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "id": "748918e3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stop_words = set(stopwords.words('english'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "id": "3cf17d78",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv('/Users/shivendragupta/Desktop/internship25/CCSS/data/CCSS Common Core Standards(English Standards).csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "id": "ee2b47e5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ID</th>\n",
+       "      <th>Category</th>\n",
+       "      <th>Sub Category</th>\n",
+       "      <th>State Standard</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>K.RL.1</td>\n",
+       "      <td>Reading Literature</td>\n",
+       "      <td>Key Ideas and Details</td>\n",
+       "      <td>With prompting and support, ask and answer que...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>K.RL.2</td>\n",
+       "      <td>Reading Literature</td>\n",
+       "      <td>Key Ideas and Details</td>\n",
+       "      <td>With prompting and support, retell familiar st...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>K.RL.3</td>\n",
+       "      <td>Reading Literature</td>\n",
+       "      <td>Key Ideas and Details</td>\n",
+       "      <td>With prompting and support, identify character...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>K.RL.4</td>\n",
+       "      <td>Reading Literature</td>\n",
+       "      <td>Craft and Structure</td>\n",
+       "      <td>Ask and answer questions about unknown words i...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>K.RL.5</td>\n",
+       "      <td>Reading Literature</td>\n",
+       "      <td>Craft and Structure</td>\n",
+       "      <td>Recognize common types of texts (e.g., storybo...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       ID            Category           Sub Category  \\\n",
+       "0  K.RL.1  Reading Literature  Key Ideas and Details   \n",
+       "1  K.RL.2  Reading Literature  Key Ideas and Details   \n",
+       "2  K.RL.3  Reading Literature  Key Ideas and Details   \n",
+       "3  K.RL.4  Reading Literature    Craft and Structure   \n",
+       "4  K.RL.5  Reading Literature    Craft and Structure   \n",
+       "\n",
+       "                                      State Standard  \n",
+       "0  With prompting and support, ask and answer que...  \n",
+       "1  With prompting and support, retell familiar st...  \n",
+       "2  With prompting and support, identify character...  \n",
+       "3  Ask and answer questions about unknown words i...  \n",
+       "4  Recognize common types of texts (e.g., storybo...  "
+      ]
+     },
+     "execution_count": 63,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()  # Display the first few rows of the DataFrame"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "id": "3958653b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(1486, 4)"
+      ]
+     },
+     "execution_count": 64,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "id": "0e747290",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "ID                501\n",
+       "Category          501\n",
+       "Sub Category      501\n",
+       "State Standard    501\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 65,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.isna().sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "id": "34001c04",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.dropna(inplace=True)  # Drop rows with any NaN values"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5e02750b",
+   "metadata": {},
+   "source": [
+    "# ```Preprocessing data```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "id": "506a332c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def clean_text(text: str) -> str:\n",
+    "    text = text.strip()\n",
+    "    text = text.replace(\"\\n\", \" \").replace(\"\\xa0\", \" \")\n",
+    "    text = text.replace(\"“\", \"\\\"\").replace(\"”\", \"\\\"\").replace(\"–\", \"-\")\n",
+    "    return text"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a97a9cfb",
+   "metadata": {},
+   "source": [
+    "## ```Lower Casing```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "2f843d49",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def lowercase(text: str) -> str:\n",
+    "    return text.lower()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4bc931f1",
+   "metadata": {},
+   "source": [
+    "## ```Removing Punctuation```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "734d4b30",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def remove_punctuation(text: str) -> str:\n",
+    "    return re.sub(r\"[^\\w\\s]\", \"\", text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d12ab012",
+   "metadata": {},
+   "source": [
+    "##  ``` Removing Stop Words ```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "b925980c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def remove_stopwords(text: str) -> str:\n",
+    "    tokens = word_tokenize(text)\n",
+    "    return ' '.join([word for word in tokens if word not in stop_words])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "814c3818",
+   "metadata": {},
+   "source": [
+    "## ``` Lemmatization ```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "70500704",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lemmatizer = WordNetLemmatizer()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "id": "7e287fc2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def lemmatize_tokens(text: str) -> str:\n",
+    "    tokens = word_tokenize(text)\n",
+    "    return ' '.join([lemmatizer.lemmatize(token) for token in tokens])\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "39bd33a6",
+   "metadata": {},
+   "source": [
+    "## ``` PipeLine ```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "id": "b443ffec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def preprocessing_pipeline(text: str) -> str:\n",
+    "    text = clean_text(text)\n",
+    "    text = lowercase(text)\n",
+    "    text = remove_punctuation(text)\n",
+    "    text = remove_stopwords(text)\n",
+    "    text = lemmatize_tokens(text)\n",
+    "    return text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "id": "13a7d65a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['State Standard'] = df['State Standard'].apply(preprocessing_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "id": "f5a8cb5d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0      prompting support ask answer question key deta...\n",
+       "1      prompting support retell familiar story includ...\n",
+       "2      prompting support identify character setting m...\n",
+       "3                  ask answer question unknown word text\n",
+       "4           recognize common type text eg storybook poem\n",
+       "                             ...                        \n",
+       "980    use technology including internet produce publ...\n",
+       "981    conduct short well sustained research project ...\n",
+       "982    gather relevant information multiple authorita...\n",
+       "983    draw evidence informational text support analy...\n",
+       "984    write routinely extended time frame time refle...\n",
+       "Name: State Standard, Length: 985, dtype: object"
+      ]
+     },
+     "execution_count": 70,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['State Standard']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ecb1369e",
+   "metadata": {},
+   "source": [
+    "## ``` BM25 Retreiver Function ```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "id": "41e30b91",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from rank_bm25 import BM25Okapi"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 74,
+   "id": "d34de1f1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenized_docs = [doc.lower().split() for doc in df['State Standard']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "id": "32594e27",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bm25 = BM25Okapi(tokenized_docs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 155,
+   "id": "3d552a24",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def retrieve_top_n_bm25(query: str, top_n=5):\n",
+    "    query_tokens = preprocessing_pipeline(query)\n",
+    "    tokenized_query = query_tokens.split()\n",
+    "    \n",
+    "    scores = bm25.get_scores(tokenized_query)\n",
+    "\n",
+    "    top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]\n",
+    "\n",
+    "\n",
+    "    # ID\tCategory\tSub Category\tState Standard\n",
+    "\n",
+    "    results = []\n",
+    "    for idx in top_indices:\n",
+    "        row = df.iloc[idx]\n",
+    "        results.append({\n",
+    "            \"ID\": row[\"ID\"],\n",
+    "            \"Category\": row[\"Category\"],\n",
+    "            \"Sub Category\": row[\"Sub Category\"],\n",
+    "            \"standard\": row[\"State Standard\"],\n",
+    "            \"score\": round(scores[idx], 4)\n",
+    "\n",
+    "        })\n",
+    "    return results\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 123,
+   "id": "5a18deb6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query = \"Identify the main idea of a text\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 124,
+   "id": "11954a8a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "results = retrieve_top_n_bm25(query, top_n=5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "49538e11",
+   "metadata": {},
+   "source": [
+    "## ``` Top 5 Results from BM25 Retrieval ```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 125,
+   "id": "f7d12c4d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[{'ID': '1.RI.2',\n",
+       "  'Category': 'Reading Informational',\n",
+       "  'Sub Category': 'Key Ideas and Details',\n",
+       "  'State Standard': 'identify main topic retell key detail text',\n",
+       "  'score': 10.666},\n",
+       " {'ID': '3.RI.2',\n",
+       "  'Category': 'Reading Informational',\n",
+       "  'Sub Category': 'Key Ideas and Details',\n",
+       "  'State Standard': 'determine main idea text recount key detail explain support main idea',\n",
+       "  'score': 10.0953},\n",
+       " {'ID': 'K.RI.2',\n",
+       "  'Category': 'Reading Informational',\n",
+       "  'Sub Category': 'Key Ideas and Details',\n",
+       "  'State Standard': 'prompting support identify main topic retell key detail text',\n",
+       "  'score': 9.8043},\n",
+       " {'ID': '2.RI.6',\n",
+       "  'Category': 'Reading Informational',\n",
+       "  'Sub Category': 'Craft and Structure',\n",
+       "  'State Standard': 'identify main purpose text including author want answer explain describe',\n",
+       "  'score': 9.4236},\n",
+       " {'ID': '2.RI.2',\n",
+       "  'Category': 'Reading Informational',\n",
+       "  'Sub Category': 'Key Ideas and Details',\n",
+       "  'State Standard': 'identify main topic multiparagraph text well focus specific paragraph within text',\n",
+       "  'score': 9.3944}]"
+      ]
+     },
+     "execution_count": 125,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "results"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1dd7ac6e",
+   "metadata": {},
+   "source": [
+    "## ``` Using Splade sparse retreiver```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 126,
+   "id": "f8c3fee3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "be42ffa9ef0949679ea06670a3436378",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer_config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e638218262224627be57d394b0bb8d07",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "vocab.txt: 0.00B [00:00, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "448331ce71bf4b98b07f0291f734fd97",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer.json: 0.00B [00:00, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0b4b7108d21c44ca96368b6b6f137002",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1f86b5b8152645bcb5318b15d57243d8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "config.json:   0%|          | 0.00/670 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d9e4d08d968e4a6a8bd93476a68e1f83",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "BertForMaskedLM(\n",
+       "  (bert): BertModel(\n",
+       "    (embeddings): BertEmbeddings(\n",
+       "      (word_embeddings): Embedding(30522, 768, padding_idx=0)\n",
+       "      (position_embeddings): Embedding(512, 768)\n",
+       "      (token_type_embeddings): Embedding(2, 768)\n",
+       "      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
+       "      (dropout): Dropout(p=0.1, inplace=False)\n",
+       "    )\n",
+       "    (encoder): BertEncoder(\n",
+       "      (layer): ModuleList(\n",
+       "        (0-11): 12 x BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSdpaSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
+       "              (dropout): Dropout(p=0.1, inplace=False)\n",
+       "            )\n",
+       "          )\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "            (intermediate_act_fn): GELUActivation()\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
+       "            (dropout): Dropout(p=0.1, inplace=False)\n",
+       "          )\n",
+       "        )\n",
+       "      )\n",
+       "    )\n",
+       "  )\n",
+       "  (cls): BertOnlyMLMHead(\n",
+       "    (predictions): BertLMPredictionHead(\n",
+       "      (transform): BertPredictionHeadTransform(\n",
+       "        (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "        (transform_act_fn): GELUActivation()\n",
+       "        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
+       "      )\n",
+       "      (decoder): Linear(in_features=768, out_features=30522, bias=True)\n",
+       "    )\n",
+       "  )\n",
+       ")"
+      ]
+     },
+     "execution_count": 126,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "afa41fd273a147f08a2017dad5455866",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from transformers import AutoTokenizer, AutoModelForMaskedLM\n",
+    "import torch\n",
+    "import torch.nn.functional as F\n",
+    "\n",
+    "model_name = \"naver/splade-cocondenser-ensembledistil\"\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+    "model = AutoModelForMaskedLM.from_pretrained(model_name)\n",
+    "model.eval()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 127,
+   "id": "0dcffefe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_splade_sparse_vector(text):\n",
+    "    with torch.no_grad():\n",
+    "        inputs = tokenizer(text, return_tensors=\"pt\", truncation=True, max_length=512)\n",
+    "        logits = model(**inputs).logits.squeeze(0)  # [seq_len, vocab_size]\n",
+    "        relu_out = F.relu(logits)\n",
+    "        splade_weights = torch.log1p(relu_out).max(dim=0).values\n",
+    "        indices = torch.nonzero(splade_weights).squeeze()\n",
+    "        return {\n",
+    "            tokenizer.convert_ids_to_tokens([i.item()])[0]: splade_weights[i].item()\n",
+    "            for i in indices\n",
+    "        }\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 128,
+   "id": "0493ae98",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 985/985 [00:39<00:00, 24.77it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from tqdm import tqdm\n",
+    "\n",
+    "# Reset index to align doc IDs\n",
+    "df = df.reset_index(drop=True)\n",
+    "\n",
+    "# Get list of standard texts\n",
+    "standard_texts = df[\"State Standard\"].astype(str).tolist()\n",
+    "\n",
+    "# Compute sparse vectors\n",
+    "splade_doc_vectors = [get_splade_sparse_vector(text) for text in tqdm(standard_texts)]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 129,
+   "id": "42c8b47b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def dot_product_sparse(query_vec, doc_vec):\n",
+    "    return sum(query_vec.get(term, 0.0) * doc_vec.get(term, 0.0) for term in query_vec)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 131,
+   "id": "6254a5e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def retrieve_top_n_splade(query, top_n=5):\n",
+    "    query_vec = get_splade_sparse_vector(query)\n",
+    "    scores = [\n",
+    "        (dot_product_sparse(query_vec, doc_vec), idx)\n",
+    "        for idx, doc_vec in enumerate(splade_doc_vectors)\n",
+    "    ]\n",
+    "    \n",
+    "    top_matches = sorted(scores, reverse=True)[:top_n]\n",
+    "    \n",
+    "    results = []\n",
+    "    for score, idx in top_matches:\n",
+    "        results.append({\n",
+    "            \"rank\": len(results) + 1,\n",
+    "            \"score\": round(score, 4),\n",
+    "            \"standard\": df.iloc[idx][\"State Standard\"],\n",
+    "            \"ID\": df.iloc[idx][\"ID\"],\n",
+    "            \"Category\": df.iloc[idx][\"Category\"],\n",
+    "            \"Sub Category\": df.iloc[idx][\"Sub Category\"]\n",
+    "        })\n",
+    "    return results\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 146,
+   "id": "26f27920",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query = \"Identify the main idea of a text\"\n",
+    "results = retrieve_top_n_splade(query)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 147,
+   "id": "6f736b83",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[{'rank': 1,\n",
+       "  'score': 21.3089,\n",
+       "  'standard': 'determine main idea text explain supported key detail summarize text',\n",
+       "  'ID': '4.RI.2',\n",
+       "  'Category': 'Reading Informational',\n",
+       "  'Sub Category': 'Key Ideas and Details'},\n",
+       " {'rank': 2,\n",
+       "  'score': 20.8493,\n",
+       "  'standard': 'determine main idea text recount key detail explain support main idea',\n",
+       "  'ID': '3.RI.2',\n",
+       "  'Category': 'Reading Informational',\n",
+       "  'Sub Category': 'Key Ideas and Details'},\n",
+       " {'rank': 3,\n",
+       "  'score': 20.2714,\n",
+       "  'standard': 'determine two main idea text explain supported key detail summarize text',\n",
+       "  'ID': '5.RI.2',\n",
+       "  'Category': 'Reading Informational',\n",
+       "  'Sub Category': 'Key Ideas and Details'},\n",
+       " {'rank': 4,\n",
+       "  'score': 17.5151,\n",
+       "  'standard': 'determine main idea supporting detail text read aloud information presented diverse medium format including visually quantitatively orally',\n",
+       "  'ID': '3.SL.2',\n",
+       "  'Category': 'Speaking & Listening',\n",
+       "  'Sub Category': 'Comprehension and Collaboration'},\n",
+       " {'rank': 5,\n",
+       "  'score': 17.512,\n",
+       "  'standard': 'identify main purpose text including author want answer explain describe',\n",
+       "  'ID': '2.RI.6',\n",
+       "  'Category': 'Reading Informational',\n",
+       "  'Sub Category': 'Craft and Structure'}]"
+      ]
+     },
+     "execution_count": 147,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 148,
+   "id": "f6232e19",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def evaluate_top1_accuracy(df, retrieve_fn):\n",
+    "    correct = 0\n",
+    "    total = len(df)\n",
+    "\n",
+    "    for i in range(total):\n",
+    "        query = df.loc[i, \"State Standard\"]\n",
+    "        expected = query.strip().lower()\n",
+    "\n",
+    "        results = retrieve_fn(query, top_n=1)\n",
+    "        predicted = results[0][\"standard\"].strip().lower()\n",
+    "\n",
+    "        if predicted == expected:\n",
+    "            correct += 1\n",
+    "\n",
+    "    accuracy = round(correct / total, 4)\n",
+    "    print(f\"Top-1 Accuracy: {accuracy}\")\n",
+    "    return accuracy\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 156,
+   "id": "5d653426",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Top-1 Accuracy: 0.9959\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "0.9959"
+      ]
+     },
+     "execution_count": 156,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# For BM25\n",
+    "evaluate_top1_accuracy(df, retrieve_top_n_bm25)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 153,
+   "id": "6f9e5c59",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Top-1 Accuracy: 0.9797\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "0.9797"
+      ]
+     },
+     "execution_count": 153,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# For SPLADE\n",
+    "evaluate_top1_accuracy(df, retrieve_top_n_splade)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a2b5800b",
+   "metadata": {},
+   "source": [
+    "## Comparison: BM25 vs SPLADE for CCSS Alignment\n",
+    "\n",
+    "**Query:**  \n",
+    "> *\"Identify the main idea of a text\"*\n",
+    "\n",
+    "---\n",
+    "\n",
+    "### Top-5 Results: **BM25**\n",
+    "\n",
+    "| Rank | ID      | Category             | Sub Category         | State Standard                                                                 | Score   |\n",
+    "|------|---------|----------------------|----------------------|----------------------------------------------------------------------------------|---------|\n",
+    "| 1    | 1.RI.2  | Reading Informational| Key Ideas and Details| identify main topic retell key detail text                                       | 10.666  |\n",
+    "| 2    | 3.RI.2  | Reading Informational| Key Ideas and Details| determine main idea text recount key detail explain support main idea           | 10.0953 |\n",
+    "| 3    | K.RI.2  | Reading Informational| Key Ideas and Details| prompting support identify main topic retell key detail text                    | 9.8043  |\n",
+    "| 4    | 2.RI.6  | Reading Informational| Craft and Structure  | identify main purpose text including author want answer explain describe       | 9.4236  |\n",
+    "| 5    | 2.RI.2  | Reading Informational| Key Ideas and Details| identify main topic multiparagraph text well focus specific paragraph within text | 9.3944  |\n",
+    "\n",
+    "---\n",
+    "\n",
+    "### Top-5 Results: **SPLADE (Sparse Embedding Model)**\n",
+    "\n",
+    "| Rank | ID      | Category             | Sub Category         | State Standard                                                                 | Score   |\n",
+    "|------|---------|----------------------|----------------------|----------------------------------------------------------------------------------|---------|\n",
+    "| 1    | 4.RI.2  | Reading Informational| Key Ideas and Details| determine main idea text explain supported key detail summarize text           | 21.3089 |\n",
+    "| 2    | 3.RI.2  | Reading Informational| Key Ideas and Details| determine main idea text recount key detail explain support main idea           | 20.8493 |\n",
+    "| 3    | 5.RI.2  | Reading Informational| Key Ideas and Details| determine two main idea text explain supported key detail summarize text       | 20.2714 |\n",
+    "| 4    | 3.SL.2  | Speaking & Listening | Comprehension and Collaboration | determine main idea supporting detail text read aloud information presented diverse medium format including visually quantitatively orally | 17.5151 |\n",
+    "| 5    | 2.RI.6  | Reading Informational| Craft and Structure  | identify main purpose text including author want answer explain describe       | 17.512  |\n",
+    "\n",
+    "---\n",
+    "\n",
+    "### Insights:\n",
+    "\n",
+    "- Both **BM25 and SPLADE** correctly rank **\"3.RI.2\"** and **\"2.RI.6\"** in the top-5.\n",
+    "- **SPLADE ranks more abstract or paraphrased variants** (e.g., \"summarize\", \"supported key detail\") higher due to its semantic understanding.\n",
+    "- SPLADE retrieves **higher-level matches** like **\"5.RI.2\"** and **\"4.RI.2\"**, which are **semantically related** but not lexically identical.\n",
+    "- BM25 relies on **exact term overlap**, favoring simpler phrasings like \"identify main topic\".\n",
+    "\n",
+    "---\n",
+    "\n",
+    "### Conclusion:\n",
+    "\n",
+    "| Feature                  | BM25                       | SPLADE                           |\n",
+    "|--------------------------|----------------------------|----------------------------------|\n",
+    "| Matching Type            | Exact lexical match        | Semantic sparse match            |\n",
+    "| Interpretability         | High (term overlap)     | High (per-term weights)       |\n",
+    "| Handles Paraphrasing     | No                      | Yes                           |\n",
+    "| Use Case Fit             | Good for short, exact queries | Great for natural language input |\n",
+    "\n",
+    "---\n",
+    "\n",
+    "### Top-1 Accuracy\n",
+    "\n",
+    "| Model   | Top-1 Accuracy |\n",
+    "|---------|----------------|\n",
+    "| BM25    | **0.9959**     |\n",
+    "| SPLADE  | **0.9797**     |\n",
+    "\n",
+    "---\n",
+    "\n",
+    "### Insights\n",
+    "\n",
+    "- **BM25** achieves near-perfect accuracy due to exact term matching, especially since queries are identical to indexed documents.\n",
+    "- **SPLADE** performs slightly lower because it may **re-rank paraphrases or semantic neighbors**, even when the original text is present.\n",
+    "\n",
+    "---\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "venv"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,24 @@

+# Core libraries
+pandas
+numpy
+scikit-learn
+# NLP
+nltk
+transformers
+torch
+# BM25
+rank_bm25
+# Streamlit UI
+streamlit
+# Plotting (optional, if using matplotlib in app.py)
+matplotlib
+# For reading Excel
+openpyxl
+# Ensure compatibility with tokenizers
+tokenizers