asaduzzaman607 committed on
Commit
55816d9
·
verified ·
1 Parent(s): 84950ba

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -33
app.py CHANGED
@@ -288,11 +288,7 @@ def load_stopwords(path: str):
288
 
289
  STOPWORDS = load_stopwords(STOPWORD_FILE)
290
 
291
- # --- DEBUG: confirm key words are in the stopword set ---
292
- debug_words = ["a", "almost", "available", "because", "described", "zero", "able"]
293
  print("Loaded", len(STOPWORDS), "raw stopwords from file.")
294
- for w in debug_words:
295
- print(f"'{w}' in STOPWORDS:", w in STOPWORDS)
296
 
297
 
298
  # ---------- TOKENIZE WITH STRICT STOPWORD REMOVAL ----------
@@ -317,14 +313,6 @@ def tokenize(text: str):
317
  return stems
318
 
319
 
320
- # --- OPTIONAL: quick debug of a sample query ---
321
- test_q = "Almost available because described?"
322
- print("DEBUG tokenize test:")
323
- print(" RAW:", WORD_RE.findall(test_q.lower()))
324
- print(" AFTER STOPWORDS:", [t for t in WORD_RE.findall(test_q.lower()) if t not in STOPWORDS])
325
- print(" FINAL STEMS:", tokenize(test_q))
326
-
327
-
328
  # ---------- SAFE LOG HELPER ----------
329
  def log_weight(tf: int) -> float:
330
  try:
@@ -421,11 +409,11 @@ def compute_bigram_coverage(candidate_docs, query_terms):
421
 
422
 
423
  # ---------- CORE SEARCH ----------
424
- def search_core(query: str, top_k: int = 10):
425
  terms = tokenize(query)
426
  if not terms:
427
  # caller handles the "only stopwords" messaging
428
- return []
429
 
430
  # OR semantics over terms
431
  candidate_docs = set()
@@ -434,15 +422,16 @@ def search_core(query: str, top_k: int = 10):
434
  if postings:
435
  candidate_docs.update(postings.keys())
436
 
437
- if not candidate_docs:
438
- return []
439
-
440
  # SPECIAL: if query involves mascot, require mascot term to appear
441
  if "mascot" in terms:
442
  mascot_docs = set(POSTINGS.get("mascot", {}).keys())
443
  candidate_docs = candidate_docs & mascot_docs
444
- if not candidate_docs:
445
- return []
 
 
 
 
446
 
447
  # Query weights
448
  q_tf = Counter(terms)
@@ -451,7 +440,7 @@ def search_core(query: str, top_k: int = 10):
451
  if t in IDF:
452
  q_w[t] = log_weight(tf) * IDF[t]
453
  if not q_w:
454
- return []
455
 
456
  q_norm = math.sqrt(sum(w * w for w in q_w.values())) or 1.0
457
 
@@ -467,7 +456,7 @@ def search_core(query: str, top_k: int = 10):
467
  scores[doc_id] += wq * wd
468
 
469
  if not scores:
470
- return []
471
 
472
  coverage = compute_term_coverage(candidate_docs, terms)
473
  bigram_cov = compute_bigram_coverage(candidate_docs, terms)
@@ -487,27 +476,32 @@ def search_core(query: str, top_k: int = 10):
487
  ranked.append((final_score, doc_id))
488
 
489
  ranked.sort(reverse=True)
 
 
490
  ranked = ranked[:top_k]
491
 
492
  rows = []
493
  for rank, (score, doc_id) in enumerate(ranked, start=1):
494
  url = URL_MAP.get(doc_id, "")
495
  rows.append((rank, doc_id, score, url))
496
- return rows
 
497
 
498
 
499
  # ---------- HTML RENDER ----------
500
- def format_results_html(query: str, rows):
501
  if not query.strip():
502
  return "<p style='color:#888'>Type a query and press <b>Submit</b> to see results.</p>"
503
 
504
- if not rows:
505
- return f"<p>No results found for <b>{query}</b>.</p>"
506
 
507
  html = [
508
- f"<p>Showing top <b>{len(rows)}</b> results for "
 
509
  f"<span style='background:#fff3cd;border-radius:999px;padding:2px 10px;'>{query}</span></p>"
510
  ]
 
511
  html.append(
512
  """
513
  <table class="results-table">
@@ -552,10 +546,10 @@ def gradio_search(query, top_k):
552
  try:
553
  k = int(top_k)
554
  except Exception:
555
- k = 10
556
 
557
  if not query:
558
- return format_results_html(query, [])
559
 
560
  # STRICT stopword-only check: if nothing survives tokenize(), show message
561
  stemmed_terms = tokenize(query)
@@ -567,7 +561,7 @@ def gradio_search(query, top_k):
567
  )
568
 
569
  try:
570
- rows = search_core(query, k)
571
  except Exception as e:
572
  return (
573
  "<h3 style='color:red;'>Search error</h3>"
@@ -577,7 +571,7 @@ def gradio_search(query, top_k):
577
  + "</pre>"
578
  )
579
 
580
- return format_results_html(query, rows)
581
 
582
 
583
  # ---------- CSS ----------
@@ -636,11 +630,11 @@ with gr.Blocks(title="Memphis.edu Search Engine") as demo:
636
  lines=1,
637
  )
638
  top_k = gr.Slider(
639
- label="Top K results",
640
  minimum=1,
641
- maximum=40,
642
  step=1,
643
- value=10,
644
  )
645
 
646
  with gr.Row():
 
288
 
289
  STOPWORDS = load_stopwords(STOPWORD_FILE)
290
 
 
 
291
  print("Loaded", len(STOPWORDS), "raw stopwords from file.")
 
 
292
 
293
 
294
  # ---------- TOKENIZE WITH STRICT STOPWORD REMOVAL ----------
 
313
  return stems
314
 
315
 
 
 
 
 
 
 
 
 
316
  # ---------- SAFE LOG HELPER ----------
317
  def log_weight(tf: int) -> float:
318
  try:
 
409
 
410
 
411
  # ---------- CORE SEARCH ----------
412
+ def search_core(query: str, top_k: int = 50):
413
  terms = tokenize(query)
414
  if not terms:
415
  # caller handles the "only stopwords" messaging
416
+ return [], 0
417
 
418
  # OR semantics over terms
419
  candidate_docs = set()
 
422
  if postings:
423
  candidate_docs.update(postings.keys())
424
 
 
 
 
425
  # SPECIAL: if query involves mascot, require mascot term to appear
426
  if "mascot" in terms:
427
  mascot_docs = set(POSTINGS.get("mascot", {}).keys())
428
  candidate_docs = candidate_docs & mascot_docs
429
+
430
+ if not candidate_docs:
431
+ return [], 0
432
+
433
+ # total pages retrieved for this query (after any special filters)
434
+ total_matching = len(candidate_docs)
435
 
436
  # Query weights
437
  q_tf = Counter(terms)
 
440
  if t in IDF:
441
  q_w[t] = log_weight(tf) * IDF[t]
442
  if not q_w:
443
+ return [], total_matching
444
 
445
  q_norm = math.sqrt(sum(w * w for w in q_w.values())) or 1.0
446
 
 
456
  scores[doc_id] += wq * wd
457
 
458
  if not scores:
459
+ return [], total_matching
460
 
461
  coverage = compute_term_coverage(candidate_docs, terms)
462
  bigram_cov = compute_bigram_coverage(candidate_docs, terms)
 
476
  ranked.append((final_score, doc_id))
477
 
478
  ranked.sort(reverse=True)
479
+ # cap at 50 regardless of slider, to satisfy "top 50" requirement
480
+ top_k = min(int(top_k), 50)
481
  ranked = ranked[:top_k]
482
 
483
  rows = []
484
  for rank, (score, doc_id) in enumerate(ranked, start=1):
485
  url = URL_MAP.get(doc_id, "")
486
  rows.append((rank, doc_id, score, url))
487
+
488
+ return rows, total_matching
489
 
490
 
491
  # ---------- HTML RENDER ----------
492
+ def format_results_html(query: str, rows, total_matching: int):
493
  if not query.strip():
494
  return "<p style='color:#888'>Type a query and press <b>Submit</b> to see results.</p>"
495
 
496
+ if total_matching == 0:
497
+ return f"<p>No results found for <b>{query}</b>. (0 matching documents)</p>"
498
 
499
  html = [
500
+ f"<p>Found <b>{total_matching}</b> matching documents. "
501
+ f"Showing top <b>{len(rows)}</b> for "
502
  f"<span style='background:#fff3cd;border-radius:999px;padding:2px 10px;'>{query}</span></p>"
503
  ]
504
+
505
  html.append(
506
  """
507
  <table class="results-table">
 
546
  try:
547
  k = int(top_k)
548
  except Exception:
549
+ k = 50
550
 
551
  if not query:
552
+ return format_results_html(query, [], 0)
553
 
554
  # STRICT stopword-only check: if nothing survives tokenize(), show message
555
  stemmed_terms = tokenize(query)
 
561
  )
562
 
563
  try:
564
+ rows, total = search_core(query, k)
565
  except Exception as e:
566
  return (
567
  "<h3 style='color:red;'>Search error</h3>"
 
571
  + "</pre>"
572
  )
573
 
574
+ return format_results_html(query, rows, total)
575
 
576
 
577
  # ---------- CSS ----------
 
630
  lines=1,
631
  )
632
  top_k = gr.Slider(
633
+ label="Top K results (max 50)",
634
  minimum=1,
635
+ maximum=50,
636
  step=1,
637
+ value=50,
638
  )
639
 
640
  with gr.Row():