rag-rag committed on
Commit
3cdfed5
·
1 Parent(s): 69d9717

relaxed match

Browse files
backend/populate_vec_db_and_seach.py CHANGED
@@ -70,7 +70,78 @@ def get_paragraphs(book_name_sup: str):
70
  print(f"[WARNING] No paragraphs found for book '{book_name_sup}'.", flush=True)
71
 
72
  return selected_paragraphs
 
 
 
 
 
 
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  #------------------------------ Function to check of collection already exists in qdrant -------------------------
76
  def does_collection_exist(collection_name , client):
@@ -135,7 +206,8 @@ def create_populate_collection_if_not_exist(book_name_sup):
135
  create_collection(collection_name, client, model, EMBEDDING_MODEL_NAME)
136
  print("[DEBUG] collection created")
137
 
138
- selected_paragraphs = get_paragraphs(book_name_sup)
 
139
  print(f"[DEBUG] fetched {len(selected_paragraphs)} paragraphs")
140
 
141
  upload_embeddings(selected_paragraphs, client, model, collection_name)
 
70
  print(f"[WARNING] No paragraphs found for book '{book_name_sup}'.", flush=True)
71
 
72
  return selected_paragraphs
73
#---------------------------------------- Function to get paragraph relaxed ---------------------------------------
def get_paragraphs_relaxed(book_name_sup: str, dataset=None):
    """Return the paragraphs of the first book whose name contains *book_name_sup*.

    Matching is "relaxed": case-insensitive substring containment against each
    row's ``book_name``. The stream is scanned with ``dropwhile`` (skip rows
    until the first match) followed by ``takewhile`` (keep rows while they
    match), which assumes all rows of one book are contiguous in the dataset —
    TODO confirm that ordering guarantee for Navanjana/Gutenberg_books.

    Args:
        book_name_sup: Book-name fragment to search for.
        dataset: Optional iterable of row dicts (each with ``book_name`` and
            ``paragraph`` keys). When ``None`` (the default), the
            ``Navanjana/Gutenberg_books`` train split is streamed from the Hub,
            preserving the original behavior. Injectable for testing.

    Returns:
        list[str]: Matching paragraphs longer than 20 characters, or ``[]``
        when the dataset fails to load or nothing matches.
    """
    # BUGFIX: the original message named the wrong function ("get_paragraphs").
    print(f"[DEBUG] get_paragraphs_relaxed called with book_name_sup={book_name_sup}", flush=True)

    # Normalise user input once for relaxed matching (lowercase, stripped).
    target_clean = book_name_sup.lower().strip()

    # 1. Load the dataset only when no iterable was injected.
    if dataset is None:
        try:
            print("[DEBUG] Loading dataset…", flush=True)
            dataset = load_dataset(
                "Navanjana/Gutenberg_books",
                split="train",
                streaming=True
            )
            print("[DEBUG] Dataset loaded successfully (streaming mode).", flush=True)
        except Exception as e:
            print(f"[ERROR] Failed to load dataset: {e}", flush=True)
            return []

    # Helper for safe, relaxed matching: True when the cleaned target is a
    # substring of the row's book name (case-insensitive); missing/empty
    # book names never match.
    def is_match(row):
        row_book = row.get("book_name")
        if not row_book:
            return False
        return target_clean in row_book.lower()

    # 2. Drop rows while they do NOT match the target (find the book's start).
    #    NOTE(review): if nothing matches, this scans the entire stream.
    print("[DEBUG] Starting dropwhile (scanning for first relaxed match)…", flush=True)
    start_stream = dropwhile(lambda x: not is_match(x), dataset)

    # 3. Take rows while they DO match the target (read the book's rows).
    print("[DEBUG] Starting takewhile (reading matching rows)…", flush=True)
    book_stream = takewhile(lambda x: is_match(x), start_stream)

    selected_paragraphs = []
    row_count = 0
    match_count = 0

    # 4. Iterate through the streaming rows of the matched book.
    print("[DEBUG] Iterating through book_stream…", flush=True)
    for row in book_stream:
        row_count += 1

        # Log the actual book name found to verify the match.
        if row_count == 1:
            print(f"[DEBUG] First match found on book: '{row.get('book_name')}'", flush=True)

        if row_count <= 3:
            print(f"[DEBUG] Sample row #{row_count}: {row}", flush=True)

        text = row.get("paragraph")
        if text:
            match_count += 1
            # Keep only substantial paragraphs (more than 20 characters);
            # nested check replaces the original redundant `text and len(text)` test.
            if len(text) > 20:
                selected_paragraphs.append(text)

        if row_count % 500 == 0:
            print(f"[DEBUG] Processed {row_count} rows, paragraphs collected: {len(selected_paragraphs)}", flush=True)

    # 5. Summary.
    print(f"[DEBUG] Finished streaming. Total matching rows: {match_count}", flush=True)
    print(f"[DEBUG] Total selected paragraphs (len > 20): {len(selected_paragraphs)}", flush=True)

    if len(selected_paragraphs) == 0:
        print(f"[WARNING] No paragraphs found for criteria '{book_name_sup}'.", flush=True)

    return selected_paragraphs
145
 
146
  #------------------------------ Function to check of collection already exists in qdrant -------------------------
147
  def does_collection_exist(collection_name , client):
 
206
  create_collection(collection_name, client, model, EMBEDDING_MODEL_NAME)
207
  print("[DEBUG] collection created")
208
 
209
+ #selected_paragraphs = get_paragraphs(book_name_sup)
210
+ selected_paragraphs = get_paragraphs_relaxed(book_name_sup)
211
  print(f"[DEBUG] fetched {len(selected_paragraphs)} paragraphs")
212
 
213
  upload_embeddings(selected_paragraphs, client, model, collection_name)