Harshit Jain committed on
Commit ·
b42a15f
1
Parent(s): 597fb81
Added spellcheck module + 'Did you mean' suggestions with custom vocab
Browse files- app.py +8 -2
- data/embeddings.npy +1 -1
- requirements.txt +1 -0
- retriever.py +8 -1
- spellcheck.py +35 -0
app.py
CHANGED
|
@@ -3,6 +3,7 @@ from dash import dcc, html
|
|
| 3 |
from dash.dependencies import Input, Output, State, MATCH
|
| 4 |
import dash_bootstrap_components as dbc
|
| 5 |
from retriever import retrieve
|
|
|
|
| 6 |
|
| 7 |
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
|
| 8 |
|
|
@@ -38,11 +39,16 @@ def search_callback(_, query):
|
|
| 38 |
if not query:
|
| 39 |
return dbc.Alert("Please enter a query.", color="warning")
|
| 40 |
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
if not results:
|
| 43 |
return dbc.Alert("No results found.", color="danger")
|
| 44 |
|
| 45 |
-
cards = []
|
| 46 |
for idx, r in enumerate(results):
|
| 47 |
meta = r["metadata"]
|
| 48 |
title = meta.get("name") or f"Document {meta.get('document_id')}"
|
|
|
|
| 3 |
from dash.dependencies import Input, Output, State, MATCH
|
| 4 |
import dash_bootstrap_components as dbc
|
| 5 |
from retriever import retrieve
|
| 6 |
+
from spellcheck import autocorrect_query
|
| 7 |
|
| 8 |
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
|
| 9 |
|
|
|
|
| 39 |
if not query:
|
| 40 |
return dbc.Alert("Please enter a query.", color="warning")
|
| 41 |
|
| 42 |
+
corrected_query, suggestion = autocorrect_query(query)
|
| 43 |
+
results = retrieve(corrected_query, top_k=5)
|
| 44 |
+
|
| 45 |
+
cards = []
|
| 46 |
+
if suggestion:
|
| 47 |
+
cards.append(dbc.Alert(suggestion, color="info"))
|
| 48 |
+
|
| 49 |
if not results:
|
| 50 |
return dbc.Alert("No results found.", color="danger")
|
| 51 |
|
|
|
|
| 52 |
for idx, r in enumerate(results):
|
| 53 |
meta = r["metadata"]
|
| 54 |
title = meta.get("name") or f"Document {meta.get('document_id')}"
|
data/embeddings.npy
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1313408
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cc7063629c99f9ba0cb1063dc07a70706c82c217864ceb9b44c5d8e64f96967d
|
| 3 |
size 1313408
|
requirements.txt
CHANGED
|
@@ -3,3 +3,4 @@ dash-bootstrap-components==1.6.0
|
|
| 3 |
sentence-transformers>=2.6.0
|
| 4 |
numpy>=1.23
|
| 5 |
huggingface-hub>=0.23.0
|
|
|
|
|
|
| 3 |
sentence-transformers>=2.6.0
|
| 4 |
numpy>=1.23
|
| 5 |
huggingface-hub>=0.23.0
|
| 6 |
+
pyspellchecker
|
retriever.py
CHANGED
|
@@ -5,6 +5,7 @@ from sentence_transformers import SentenceTransformer
|
|
| 5 |
from collections import defaultdict
|
| 6 |
|
| 7 |
from preprocess import load_json, extract_text, chunk_text
|
|
|
|
| 8 |
|
| 9 |
# --------- Paths ---------
|
| 10 |
DATA_DIR = Path("data")
|
|
@@ -64,13 +65,14 @@ def _load_or_build():
|
|
| 64 |
if not (EMB_FILE.exists() and CHUNK_FILE.exists() and DOC_FILE.exists() and META_FILE.exists()):
|
| 65 |
_build_index()
|
| 66 |
print("🔄 Loading precomputed data...")
|
| 67 |
-
embeddings = np.load(EMB_FILE)
|
| 68 |
with open(CHUNK_FILE, "rb") as f:
|
| 69 |
chunks, chunk_to_doc_idx = pickle.load(f)
|
| 70 |
with open(DOC_FILE, "rb") as f:
|
| 71 |
documents = pickle.load(f)
|
| 72 |
with open(META_FILE, "rb") as f:
|
| 73 |
metadata = pickle.load(f)
|
|
|
|
| 74 |
return chunks, chunk_to_doc_idx, documents, metadata, embeddings
|
| 75 |
|
| 76 |
# Load on import
|
|
@@ -78,6 +80,11 @@ chunks, chunk_to_doc_idx, documents, metadata, embeddings = _load_or_build()
|
|
| 78 |
|
| 79 |
# --------- Retrieval ---------
|
| 80 |
def retrieve(query, top_k=5):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
q_emb = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
|
| 82 |
scores = (embeddings @ q_emb.T).squeeze()
|
| 83 |
|
|
|
|
| 5 |
from collections import defaultdict
|
| 6 |
|
| 7 |
from preprocess import load_json, extract_text, chunk_text
|
| 8 |
+
from spellcheck import autocorrect_query, load_custom_vocab
|
| 9 |
|
| 10 |
# --------- Paths ---------
|
| 11 |
DATA_DIR = Path("data")
|
|
|
|
| 65 |
if not (EMB_FILE.exists() and CHUNK_FILE.exists() and DOC_FILE.exists() and META_FILE.exists()):
|
| 66 |
_build_index()
|
| 67 |
print("🔄 Loading precomputed data...")
|
| 68 |
+
embeddings = np.load(EMB_FILE, allow_pickle=True)
|
| 69 |
with open(CHUNK_FILE, "rb") as f:
|
| 70 |
chunks, chunk_to_doc_idx = pickle.load(f)
|
| 71 |
with open(DOC_FILE, "rb") as f:
|
| 72 |
documents = pickle.load(f)
|
| 73 |
with open(META_FILE, "rb") as f:
|
| 74 |
metadata = pickle.load(f)
|
| 75 |
+
load_custom_vocab(documents)
|
| 76 |
return chunks, chunk_to_doc_idx, documents, metadata, embeddings
|
| 77 |
|
| 78 |
# Load on import
|
|
|
|
| 80 |
|
| 81 |
# --------- Retrieval ---------
|
| 82 |
def retrieve(query, top_k=5):
|
| 83 |
+
# Autocorrect step
|
| 84 |
+
query, suggestion = autocorrect_query(query)
|
| 85 |
+
if suggestion:
|
| 86 |
+
print(suggestion) # Logs correction suggestion in console
|
| 87 |
+
|
| 88 |
q_emb = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
|
| 89 |
scores = (embeddings @ q_emb.T).squeeze()
|
| 90 |
|
spellcheck.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# spellcheck.py
|
| 2 |
+
from spellchecker import SpellChecker
|
| 3 |
+
|
| 4 |
+
spell = SpellChecker()
|
| 5 |
+
custom_vocab = set()
|
| 6 |
+
|
| 7 |
+
def load_custom_vocab(docs: list[str]) -> None:
    """Populate the module-level ``custom_vocab`` set from document texts.

    Every whitespace-separated token is lower-cased and added to
    ``custom_vocab``, and the tokens are also registered with the
    ``SpellChecker`` word-frequency model so that valid domain terms are
    treated as known words instead of being "corrected" away.

    NOTE(review): assumes each element of ``docs`` is a plain string
    (it is unpickled in retriever.py) — confirm against the caller.
    """
    # No `global` needed: we only mutate the set, we never rebind the name.
    words = {token.lower() for doc in docs for token in doc.split()}
    custom_vocab.update(words)
    # Teach the spell checker the domain vocabulary so candidates()/
    # correction() do not push queries away from valid project terms.
    spell.word_frequency.load_words(words)
| 14 |
+
def autocorrect_query(query: str) -> tuple[str, str]:
    """Autocorrect each whitespace-separated word of *query*.

    Returns ``(corrected_query, suggestion)`` where *suggestion* is a
    ``"Did you mean '...'?"`` message, or ``""`` when nothing changed.
    Candidates found in the project's custom vocabulary are preferred
    over generic dictionary corrections.
    """
    corrected: list[str] = []

    for word in query.split():
        # pyspellchecker >= 0.6 returns None (not an empty set) when it
        # has no candidates at all — guard before iterating.
        candidates = spell.candidates(word) or set()
        # Prefer corrections present in the custom vocab; sort so the
        # pick is deterministic (sets have no stable iteration order).
        custom_matches = sorted(c for c in candidates if c in custom_vocab)
        if custom_matches:
            corrected.append(custom_matches[0])
        else:
            # correction() may also return None for unknown words —
            # fall back to the original token unchanged.
            corrected.append(spell.correction(word) or word)

    corrected_query = " ".join(corrected)
    if corrected_query != query:
        return corrected_query, f"Did you mean '{corrected_query}'?"
    return query, ""