Spaces:

asaduzzaman607
/

memphis-search

Sleeping

App Files Community

asaduzzaman607 committed on Dec 8, 2025

Commit

55816d9

verified ·

1 Parent(s): 84950ba

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -33

app.py CHANGED Viewed

@@ -288,11 +288,7 @@ def load_stopwords(path: str):
 STOPWORDS = load_stopwords(STOPWORD_FILE)
-# --- DEBUG: confirm key words are in the stopword set ---
-debug_words = ["a", "almost", "available", "because", "described", "zero", "able"]
 print("Loaded", len(STOPWORDS), "raw stopwords from file.")
-for w in debug_words:
-    print(f"'{w}' in STOPWORDS:", w in STOPWORDS)
 # ---------- TOKENIZE WITH STRICT STOPWORD REMOVAL ----------
@@ -317,14 +313,6 @@ def tokenize(text: str):
     return stems
-# --- OPTIONAL: quick debug of a sample query ---
-test_q = "Almost available because described?"
-print("DEBUG tokenize test:")
-print("  RAW:", WORD_RE.findall(test_q.lower()))
-print("  AFTER STOPWORDS:", [t for t in WORD_RE.findall(test_q.lower()) if t not in STOPWORDS])
-print("  FINAL STEMS:", tokenize(test_q))
 # ---------- SAFE LOG HELPER ----------
 def log_weight(tf: int) -> float:
     try:
@@ -421,11 +409,11 @@ def compute_bigram_coverage(candidate_docs, query_terms):
 # ---------- CORE SEARCH ----------
-def search_core(query: str, top_k: int = 10):
     terms = tokenize(query)
     if not terms:
         # caller handles the "only stopwords" messaging
-        return []
     # OR semantics over terms
     candidate_docs = set()
@@ -434,15 +422,16 @@ def search_core(query: str, top_k: int = 10):
         if postings:
             candidate_docs.update(postings.keys())
-    if not candidate_docs:
-        return []
     # SPECIAL: if query involves mascot, require mascot term to appear
     if "mascot" in terms:
         mascot_docs = set(POSTINGS.get("mascot", {}).keys())
         candidate_docs = candidate_docs & mascot_docs
-        if not candidate_docs:
-            return []
     # Query weights
     q_tf = Counter(terms)
@@ -451,7 +440,7 @@ def search_core(query: str, top_k: int = 10):
         if t in IDF:
             q_w[t] = log_weight(tf) * IDF[t]
     if not q_w:
-        return []
     q_norm = math.sqrt(sum(w * w for w in q_w.values())) or 1.0
@@ -467,7 +456,7 @@ def search_core(query: str, top_k: int = 10):
             scores[doc_id] += wq * wd
     if not scores:
-        return []
     coverage = compute_term_coverage(candidate_docs, terms)
     bigram_cov = compute_bigram_coverage(candidate_docs, terms)
@@ -487,27 +476,32 @@ def search_core(query: str, top_k: int = 10):
         ranked.append((final_score, doc_id))
     ranked.sort(reverse=True)
     ranked = ranked[:top_k]
     rows = []
     for rank, (score, doc_id) in enumerate(ranked, start=1):
         url = URL_MAP.get(doc_id, "")
         rows.append((rank, doc_id, score, url))
-    return rows
 # ---------- HTML RENDER ----------
-def format_results_html(query: str, rows):
     if not query.strip():
         return "<p style='color:#888'>Type a query and press <b>Submit</b> to see results.</p>"
-    if not rows:
-        return f"<p>No results found for <b>{query}</b>.</p>"
     html = [
-        f"<p>Showing top <b>{len(rows)}</b> results for "
         f"<span style='background:#fff3cd;border-radius:999px;padding:2px 10px;'>{query}</span></p>"
     ]
     html.append(
         """
         <table class="results-table">
@@ -552,10 +546,10 @@ def gradio_search(query, top_k):
     try:
         k = int(top_k)
     except Exception:
-        k = 10
     if not query:
-        return format_results_html(query, [])
     # STRICT stopword-only check: if nothing survives tokenize(), show message
     stemmed_terms = tokenize(query)
@@ -567,7 +561,7 @@ def gradio_search(query, top_k):
         )
     try:
-        rows = search_core(query, k)
     except Exception as e:
         return (
             "<h3 style='color:red;'>Search error</h3>"
@@ -577,7 +571,7 @@ def gradio_search(query, top_k):
             + "</pre>"
         )
-    return format_results_html(query, rows)
 # ---------- CSS ----------
@@ -636,11 +630,11 @@ with gr.Blocks(title="Memphis.edu Search Engine") as demo:
                 lines=1,
             )
             top_k = gr.Slider(
-                label="Top K results",
                 minimum=1,
-                maximum=40,
                 step=1,
-                value=10,
             )
             with gr.Row():

 STOPWORDS = load_stopwords(STOPWORD_FILE)
 print("Loaded", len(STOPWORDS), "raw stopwords from file.")
 # ---------- TOKENIZE WITH STRICT STOPWORD REMOVAL ----------
     return stems
 # ---------- SAFE LOG HELPER ----------
 def log_weight(tf: int) -> float:
     try:
 # ---------- CORE SEARCH ----------
+def search_core(query: str, top_k: int = 50):
     terms = tokenize(query)
     if not terms:
         # caller handles the "only stopwords" messaging
+        return [], 0
     # OR semantics over terms
     candidate_docs = set()
         if postings:
             candidate_docs.update(postings.keys())
     # SPECIAL: if query involves mascot, require mascot term to appear
     if "mascot" in terms:
         mascot_docs = set(POSTINGS.get("mascot", {}).keys())
         candidate_docs = candidate_docs & mascot_docs
+    if not candidate_docs:
+        return [], 0
+    # total pages retrieved for this query (after any special filters)
+    total_matching = len(candidate_docs)
     # Query weights
     q_tf = Counter(terms)
         if t in IDF:
             q_w[t] = log_weight(tf) * IDF[t]
     if not q_w:
+        return [], total_matching
     q_norm = math.sqrt(sum(w * w for w in q_w.values())) or 1.0
             scores[doc_id] += wq * wd
     if not scores:
+        return [], total_matching
     coverage = compute_term_coverage(candidate_docs, terms)
     bigram_cov = compute_bigram_coverage(candidate_docs, terms)
         ranked.append((final_score, doc_id))
     ranked.sort(reverse=True)
+    # cap at 50 regardless of slider, to satisfy "top 50" requirement
+    top_k = min(int(top_k), 50)
     ranked = ranked[:top_k]
     rows = []
     for rank, (score, doc_id) in enumerate(ranked, start=1):
         url = URL_MAP.get(doc_id, "")
         rows.append((rank, doc_id, score, url))
+    return rows, total_matching
 # ---------- HTML RENDER ----------
+def format_results_html(query: str, rows, total_matching: int):
     if not query.strip():
         return "<p style='color:#888'>Type a query and press <b>Submit</b> to see results.</p>"
+    if total_matching == 0:
+        return f"<p>No results found for <b>{query}</b>. (0 matching documents)</p>"
     html = [
+        f"<p>Found <b>{total_matching}</b> matching documents. "
+        f"Showing top <b>{len(rows)}</b> for "
         f"<span style='background:#fff3cd;border-radius:999px;padding:2px 10px;'>{query}</span></p>"
     ]
     html.append(
         """
         <table class="results-table">
     try:
         k = int(top_k)
     except Exception:
+        k = 50
     if not query:
+        return format_results_html(query, [], 0)
     # STRICT stopword-only check: if nothing survives tokenize(), show message
     stemmed_terms = tokenize(query)
         )
     try:
+        rows, total = search_core(query, k)
     except Exception as e:
         return (
             "<h3 style='color:red;'>Search error</h3>"
             + "</pre>"
         )
+    return format_results_html(query, rows, total)
 # ---------- CSS ----------
                 lines=1,
             )
             top_k = gr.Slider(
+                label="Top K results (max 50)",
                 minimum=1,
+                maximum=50,
                 step=1,
+                value=50,
             )
             with gr.Row():