relaxed match
Browse files
backend/populate_vec_db_and_seach.py
CHANGED
|
@@ -70,7 +70,78 @@ def get_paragraphs(book_name_sup: str):
|
|
| 70 |
print(f"[WARNING] No paragraphs found for book '{book_name_sup}'.", flush=True)
|
| 71 |
|
| 72 |
return selected_paragraphs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
#------------------------------ Function to check if collection already exists in qdrant -------------------------
|
| 76 |
def does_collection_exist(collection_name , client):
|
|
@@ -135,7 +206,8 @@ def create_populate_collection_if_not_exist(book_name_sup):
|
|
| 135 |
create_collection(collection_name, client, model, EMBEDDING_MODEL_NAME)
|
| 136 |
print("[DEBUG] collection created")
|
| 137 |
|
| 138 |
-
selected_paragraphs = get_paragraphs(book_name_sup)
|
|
|
|
| 139 |
print(f"[DEBUG] fetched {len(selected_paragraphs)} paragraphs")
|
| 140 |
|
| 141 |
upload_embeddings(selected_paragraphs, client, model, collection_name)
|
|
|
|
| 70 |
print(f"[WARNING] No paragraphs found for book '{book_name_sup}'.", flush=True)
|
| 71 |
|
| 72 |
return selected_paragraphs
|
| 73 |
+
#---------------------------------------- Function to get paragraph relaxed ---------------------------------------
def get_paragraphs_relaxed(book_name_sup: str):
    """Collect all paragraphs of a book using relaxed (substring) name matching.

    Streams the "Navanjana/Gutenberg_books" dataset and returns the paragraphs
    (those longer than 20 characters) of the first contiguous run of rows whose
    ``book_name`` contains *book_name_sup*, case-insensitively.

    NOTE(review): assumes rows belonging to one book are contiguous in the
    stream — ``takewhile`` stops at the first non-matching row after the run
    begins. Confirm the dataset ordering guarantees this.

    Returns an empty list if the dataset fails to load or no row matches.
    """
    # Fixed: message previously said "get_paragraphs called", which was misleading.
    print(f"[DEBUG] get_paragraphs_relaxed called with book_name_sup={book_name_sup}", flush=True)

    # Pre-process user input for relaxed matching (lowercase, stripped)
    target_clean = book_name_sup.lower().strip()

    # 1. Try loading the dataset (streaming avoids downloading the whole corpus)
    try:
        print("[DEBUG] Loading dataset…", flush=True)
        dataset = load_dataset(
            "Navanjana/Gutenberg_books",
            split="train",
            streaming=True
        )
        print("[DEBUG] Dataset loaded successfully (streaming mode).", flush=True)
    except Exception as e:
        print(f"[ERROR] Failed to load dataset: {e}", flush=True)
        return []

    # Helper for safe, relaxed matching: True when the cleaned target is a
    # substring of the row's book name (case-insensitive); rows without a
    # book name never match.
    def is_match(row):
        row_book = row.get("book_name")
        if not row_book:
            return False
        return target_clean in row_book.lower()

    # 2. Drop rows while they do NOT match the target…
    print("[DEBUG] Starting dropwhile (scanning for first relaxed match)…", flush=True)
    start_stream = dropwhile(lambda x: not is_match(x), dataset)

    # 3. …then take rows while they DO match the target.
    print("[DEBUG] Starting takewhile (reading matching rows)…", flush=True)
    book_stream = takewhile(is_match, start_stream)

    selected_paragraphs = []
    row_count = 0
    match_count = 0  # rows that carried a non-empty "paragraph" field

    # 4. Iterate through streaming rows
    print("[DEBUG] Iterating through book_stream…", flush=True)
    for row in book_stream:
        row_count += 1

        # Log the actual book name found to verify the relaxed match
        if row_count == 1:
            print(f"[DEBUG] First match found on book: '{row.get('book_name')}'", flush=True)

        if row_count <= 3:
            print(f"[DEBUG] Sample row #{row_count}: {row}", flush=True)

        text = row.get("paragraph")
        if text:
            match_count += 1
            # Skip very short fragments (headings, page numbers, …).
            # Folded the previous redundant `if text and len(text) > 20`
            # into this nested check — text is already known truthy here.
            if len(text) > 20:
                selected_paragraphs.append(text)

        if row_count % 500 == 0:
            print(f"[DEBUG] Processed {row_count} rows, paragraphs collected: {len(selected_paragraphs)}", flush=True)

    # 5. Summary
    print(f"[DEBUG] Finished streaming. Total matching rows: {match_count}", flush=True)
    print(f"[DEBUG] Total selected paragraphs (len > 20): {len(selected_paragraphs)}", flush=True)

    if len(selected_paragraphs) == 0:
        print(f"[WARNING] No paragraphs found for criteria '{book_name_sup}'.", flush=True)

    return selected_paragraphs
|
| 145 |
|
| 146 |
#------------------------------ Function to check if collection already exists in qdrant -------------------------
|
| 147 |
def does_collection_exist(collection_name , client):
|
|
|
|
| 206 |
create_collection(collection_name, client, model, EMBEDDING_MODEL_NAME)
|
| 207 |
print("[DEBUG] collection created")
|
| 208 |
|
| 209 |
+
#selected_paragraphs = get_paragraphs(book_name_sup)
|
| 210 |
+
selected_paragraphs = get_paragraphs_relaxed(book_name_sup)
|
| 211 |
print(f"[DEBUG] fetched {len(selected_paragraphs)} paragraphs")
|
| 212 |
|
| 213 |
upload_embeddings(selected_paragraphs, client, model, collection_name)
|