Bohaska committed on
Commit
0fcec84
·
1 Parent(s): b5ace46

chunk semantic issue search, fix issue titles

Browse files
app.py CHANGED
@@ -3,31 +3,39 @@ from FlagEmbedding import BGEM3FlagModel
3
  import numpy as np
4
  import json
5
  import os
6
- import re # Added for strict search context extraction
7
 
8
  # --- Configuration and Global Data Loading ---
9
 
10
  # Determine the directory of the script to load files relative to it
11
  script_dir = os.path.dirname(os.path.abspath(__file__))
12
 
13
- # Define paths for issue embedding types
14
  issue_embeddings_paths = {
15
- 'semantic': os.path.join(script_dir, 'ns_issues_semantic_bge-m3.npy'), # Renamed from fuzzy
16
- 'loose': os.path.join(script_dir, 'ns_issues_loose_bge-m3.npy'), # Renamed from direct
 
 
17
  }
18
  issue_titles_path = os.path.join(script_dir, 'issue_titles.json')
19
 
20
- # Define paths for GA resolution embedding types
 
 
 
 
 
 
 
 
21
  ga_embeddings_paths = {
22
- 'semantic': os.path.join(script_dir, 'ns_ga_resolutions_semantic_bge-m3.npy'), # Renamed from fuzzy
23
- 'loose': os.path.join(script_dir, 'ns_ga_resolutions_loose_bge-m3.npy'), # Renamed from direct
24
  }
25
  ga_resolutions_path = os.path.join(script_dir, 'parsed_ga_resolutions.json')
26
 
27
  print("Loading BGE-M3 model...")
28
  try:
29
- # Use 'BAAI/bge-m3' to let FlagEmbedding handle downloading/caching.
30
- # If you prefer to force a local path, change it here.
31
  model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
32
  print("Model loaded successfully.")
33
  except Exception as e:
@@ -35,72 +43,100 @@ except Exception as e:
35
  print("Please ensure you have an internet connection or the model is cached locally.")
36
  model = None # Indicate model loading failed
37
 
38
- # Issue data storage for all types
39
  issue_all_embeddings = {
40
- 'semantic': None,
41
- 'loose': None,
42
  }
43
  issue_titles = {}
44
- all_issue_raw_texts = [] # New: To store raw issue texts for strict search
 
 
 
 
 
 
45
 
46
  print("Loading issue data...")
47
  try:
48
- if model: # Only attempt to load embeddings if model is available
49
- # Load available embedding types for issues
50
- for embed_type, path in issue_embeddings_paths.items():
51
- if os.path.exists(path):
52
- if embed_type == 'loose': # Only sparse is loaded as list of objects now
53
- # Load sparse dictionaries: it's a NumPy object array, convert to list of objects
54
- issue_all_embeddings[embed_type] = np.load(path, allow_pickle=True).tolist()
55
- else: # Dense
56
- issue_all_embeddings[embed_type] = np.load(path)
57
- print(
58
- f" Loaded {embed_type} issue embeddings from {path} (Shape: {issue_all_embeddings[embed_type].shape if hasattr(issue_all_embeddings[embed_type], 'shape') else len(issue_all_embeddings[embed_type])})")
59
  else:
60
- print(f" Warning: {embed_type} issue embeddings not found at {path}. Skipping.")
61
- issue_all_embeddings[embed_type] = None # Ensure it's explicitly None if not found
 
 
 
 
62
 
 
 
63
  with open(issue_titles_path, encoding='utf-8') as file:
64
  issue_titles = json.load(file)
65
- print(f"Issue data loaded: {len(issue_titles)} issues.")
66
-
67
- # --- Load raw issue texts for strict search ---
68
- # The issue text files are in 'small_scripts/make_embedding/NationStates-Issue-Megathread/002 - Issue Megalist (MAIN)'
69
- issues_input_dir = os.path.join(script_dir, 'small_scripts', 'make_embedding',
70
- 'NationStates-Issue-Megathread', '002 - Issue Megalist (MAIN)')
71
- issue_files_for_raw_load = []
72
- file_pattern = re.compile(r'(\d+) TO (\d+)\.txt')
73
-
74
- if os.path.isdir(issues_input_dir):
75
- for filename in os.listdir(issues_input_dir):
76
- if filename.endswith('.txt'):
77
- match = file_pattern.match(filename)
78
- if match:
79
- start_num = int(match.group(1))
80
- issue_files_for_raw_load.append((start_num, filename))
81
- issue_files_for_raw_load.sort(key=lambda x: x[0])
82
- issue_files_for_raw_load = [os.path.join(issues_input_dir, filename) for _, filename in issue_files_for_raw_load]
83
-
84
- for filepath in issue_files_for_raw_load:
85
- with open(filepath, 'r', encoding='utf-8') as file:
86
- issues_text_in_file = file.read()
87
- # Split issues by the separator and remove any empty strings resulting from multiple separators
88
- issues_list_in_file = [
89
- issue.strip() for issue in issues_text_in_file.split("[hr][/hr]") if issue.strip()
90
- ]
91
- all_issue_raw_texts.extend(issues_list_in_file)
92
- print(f" Loaded {len(all_issue_raw_texts)} raw issue texts for strict search.")
 
 
 
 
 
 
 
 
93
  else:
94
- print(f" Warning: Issue text directory '{issues_input_dir}' not found. Strict issue search will not work.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
  except FileNotFoundError as e:
97
  print(f"Error loading issue data: {e}")
98
- print(
99
- f"Please ensure embedding files and '{os.path.basename(issue_titles_path)}' are in the same directory as app.py")
100
  except Exception as e:
101
  print(f"Error loading issue data: {e}")
102
 
103
- # GA resolution data storage for all types
104
  ga_all_embeddings = {
105
  'semantic': None,
106
  'loose': None,
@@ -110,149 +146,198 @@ ga_resolutions_data = []
110
  print("Loading GA resolution data...")
111
  try:
112
  if model: # Only attempt to load embeddings if model is available
113
- # Load available embedding types for GA resolutions
114
  for embed_type, path in ga_embeddings_paths.items():
115
  if os.path.exists(path):
116
- if embed_type == 'loose': # Only sparse is loaded as list of objects now
117
  ga_all_embeddings[embed_type] = np.load(path, allow_pickle=True).tolist()
118
- else: # Dense
119
  ga_all_embeddings[embed_type] = np.load(path)
120
- print(
121
- f" Loaded {embed_type} GA embeddings from {path} (Shape: {ga_all_embeddings[embed_type].shape if hasattr(ga_all_embeddings[embed_type], 'shape') else len(ga_all_embeddings[embed_type])})")
122
  else:
123
- print(f" Warning: {embed_type} GA embeddings not found at {path}. Skipping.")
124
- ga_all_embeddings[embed_type] = None # Ensure it's explicitly None if not found
125
 
126
- with open(ga_resolutions_path, encoding='utf-8') as file:
127
- ga_resolutions_data = json.load(file) # List of dictionaries
128
- print(f"GA resolution data loaded: {len(ga_resolutions_data)} resolutions.")
 
 
 
129
  except FileNotFoundError as e:
130
  print(f"Error loading GA resolution data: {e}")
131
- print(
132
- f"Please ensure GA embedding files and '{os.path.basename(ga_resolutions_path)}' are in the same directory as app.py")
133
  except Exception as e:
134
  print(f"Error loading GA resolution data: {e}")
135
 
136
 
137
- # --- Search Functions ---
138
-
139
- def _perform_search(search_term: str, corpus_embeddings_dict: dict, search_type: str):
140
- """
141
- Helper function to perform an embedding-based search given the search term, corpus embeddings, and search type.
142
- Returns sorted list of (index, similarity_score).
143
- """
144
- if not model:
145
- raise ValueError("Model failed to load. Cannot perform search.")
146
- if not search_term:
147
- raise ValueError("Please enter a search term.")
148
-
149
- corpus_embeddings = corpus_embeddings_dict.get(search_type)
150
- if corpus_embeddings is None:
151
- raise ValueError(f"Corpus data for search type '{search_type}' not loaded. Cannot perform search.")
152
-
153
- # Encode the search term for relevant types
154
- query_embeddings = model.encode([search_term],
155
- return_dense=True,
156
- return_sparse=True,
157
- return_colbert_vecs=False)
158
-
159
- similarity_scores = []
160
-
161
- if search_type == 'semantic': # Renamed from 'fuzzy'
162
- query_vec = query_embeddings['dense_vecs'] # Shape: (1, embedding_dim)
163
- # Perform dot product for dense similarity
164
- similarity_scores = (query_vec @ corpus_embeddings.T)[0] # Result shape: (num_docs,)
165
- elif search_type == 'loose': # Renamed from 'direct'
166
- # 'lexical_weights' is a list of dictionaries, even for a single query.
167
- # We need the first (and only) dictionary from this list.
168
- if 'lexical_weights' not in query_embeddings or not query_embeddings['lexical_weights']:
169
- raise ValueError("Lexical weights (sparse) not returned for query. Model or configuration issue.")
170
- query_sparse_dict = query_embeddings['lexical_weights'][0]
171
-
172
- # Iterate through each document's sparse dictionary and compute score
173
- for doc_sparse_dict in corpus_embeddings: # corpus_embeddings is a list of sparse dictionaries
174
- score = model.compute_lexical_matching_score(query_sparse_dict, doc_sparse_dict)
175
- similarity_scores.append(score)
176
- similarity_scores = np.array(similarity_scores) # Convert to numpy array
177
- else:
178
- # This function should only be called for embedding-based searches
179
- raise ValueError(f"Unsupported embedding search type: {search_type}")
180
-
181
- # Pair index with similarity score
182
- indexed_similarities = [(i, score) for i, score in enumerate(similarity_scores)]
183
-
184
- # Sort by similarity score in descending order
185
- sorted_similarities = sorted(indexed_similarities, key=lambda item: item[1], reverse=True)
186
-
187
- return sorted_similarities
188
 
189
  def _extract_context(text: str, query: str):
190
- """Extracts the first line containing the query and highlights all mentions of it."""
191
  text_lines = text.split('\n')
192
  query_lower = query.lower()
193
-
194
  for line in text_lines:
195
  if query_lower in line.lower():
196
- # Found the first line containing the query
197
- # Highlight all occurrences of the query in this line
198
  highlighted_line = re.sub(re.escape(query), lambda m: f"**{m.group(0)}**", line, flags=re.IGNORECASE)
199
  return f'> {highlighted_line}'
200
- return "" # Should not be reached if strict search already found a match
 
201
 
 
202
 
203
- def get_issue_similarity_rankings(search_term: str, search_type: str = 'semantic'): # Renamed default
204
- """Searches issues and returns formatted results."""
 
 
 
 
 
205
  try:
206
- if not search_term:
 
 
207
  return "Please enter a search term."
208
 
209
- if search_type == 'strict':
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  if not all_issue_raw_texts:
211
  return "Raw issue texts not loaded. Strict search is unavailable."
212
 
213
  strict_matches = []
214
- search_term_lower = search_term.lower()
215
  for i, issue_text in enumerate(all_issue_raw_texts):
216
- if search_term_lower in issue_text.lower():
217
- strict_matches.append((i, 1.0)) # Use 1.0 as a dummy score for strict matches
218
 
219
- similarity_text = f"# Top 20 Issue Search Results (Strict)\n"
220
  if not strict_matches:
221
- return similarity_text + "No exact matches found."
 
222
 
223
- search_ranking = 1
224
- for index, sim_score in strict_matches[:20]: # Still limit to top 20
225
  issue_title = issue_titles.get(str(index), f"Unknown Issue (Index {index})")
226
- context = _extract_context(all_issue_raw_texts[index], search_term)
227
- similarity_text += f"{search_ranking}. {issue_title}\n{context}\n\n"
228
- search_ranking += 1
229
- return similarity_text
230
 
231
- else: # Embedding-based search
232
- sorted_similarities = _perform_search(search_term, issue_all_embeddings, search_type)
233
-
234
- similarity_text = f"# Top 20 Issue Search Results ({search_type.capitalize()})\n"
235
- if not sorted_similarities:
236
- return similarity_text + "No issues found."
237
-
238
- search_ranking = 1
239
- for index, sim_score in sorted_similarities[:20]:
240
- # issue_titles is a dict, needs string key
241
- issue_title = issue_titles.get(str(index), f"Unknown Issue (Index {index})")
242
- similarity_text += f"{search_ranking}. {issue_title}, Similarity: {sim_score:.4f}\n"
243
- search_ranking += 1
244
- return similarity_text
245
 
246
  except Exception as e:
247
  return f"An error occurred during issue search: {e}"
248
 
249
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  def search_ga_resolutions(search_term: str, hide_repealed: bool, hide_repeal_category: bool,
251
- search_type: str = 'semantic'): # Renamed default
252
- """
253
- Searches GA resolutions, filters repealed and/or repeal category if requested,
254
- and returns formatted results with links and status.
255
- """
256
  try:
257
  if not search_term:
258
  return "Please enter a search term."
@@ -260,31 +345,28 @@ def search_ga_resolutions(search_term: str, hide_repealed: bool, hide_repeal_cat
260
  if search_type == 'strict':
261
  if not ga_resolutions_data:
262
  return "GA resolution data not loaded. Strict search is unavailable."
263
-
264
  strict_matches = []
265
- search_term_lower = search_term.lower()
266
  for i, resolution in enumerate(ga_resolutions_data):
267
- resolution_body = resolution.get('body', '')
268
- if search_term_lower in resolution_body.lower():
269
- # Apply filters immediately for strict search
270
  status = resolution.get('status')
271
  category = resolution.get('category')
272
  if hide_repealed and status == "Repealed":
273
  continue
274
  if hide_repeal_category and category == "Repeal":
275
  continue
276
- strict_matches.append((i, 1.0)) # Dummy score
277
 
278
- similarity_text = f"# Top 20 GA Resolution Search Results (Strict)\n"
279
  if not strict_matches:
280
  status_msgs = []
281
  if hide_repealed: status_msgs.append("Repealed")
282
  if hide_repeal_category: status_msgs.append("Repeal Category")
283
  filter_msg = " (Filtered out " + " and ".join(status_msgs) + ")" if status_msgs else ""
284
- return similarity_text + f"No exact matches found{filter_msg}."
285
 
286
- search_ranking = 1
287
- for index, sim_score in strict_matches[:20]:
288
  resolution = ga_resolutions_data[index]
289
  title = resolution.get('title', 'Untitled Resolution')
290
  res_id = resolution.get('id', 'N/A')
@@ -292,135 +374,104 @@ def search_ga_resolutions(search_term: str, hide_repealed: bool, hide_repeal_cat
292
  status = resolution.get('status')
293
  status_marker = "[REPEALED] " if status == "Repealed" else ""
294
  url = f"https://www.nationstates.net/page=WA_past_resolution/id={res_id}/council={council}"
295
-
296
  context = _extract_context(resolution.get('body', ''), search_term)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
 
298
- similarity_text += f"{search_ranking}. {status_marker}[#{res_id} {title}]({url}), Match: {sim_score:.4f}\n{context}\n"
299
- search_ranking += 1
300
- return similarity_text
301
-
302
- else: # Embedding-based search
303
- raw_sorted_similarities = _perform_search(search_term, ga_all_embeddings, search_type)
304
-
305
- # --- Filtering ---
306
- filtered_indexed_similarities = []
307
- for index, score in raw_sorted_similarities:
308
- # Ensure index is valid
309
- if index < len(ga_resolutions_data):
310
- resolution = ga_resolutions_data[index]
311
- status = resolution.get('status')
312
- category = resolution.get('category')
313
-
314
- # Apply filters
315
- if hide_repealed and status == "Repealed":
316
- continue
317
- if hide_repeal_category and category == "Repeal":
318
- continue
319
- filtered_indexed_similarities.append((index, score))
320
-
321
- # The list is already sorted, no re-sort needed after filtering.
322
-
323
- # --- Formatting Results ---
324
- similarity_text = f"# Top 20 GA Resolution Search Results ({search_type.capitalize()})\n"
325
- if not filtered_indexed_similarities:
326
- status_msgs = []
327
- if hide_repealed: status_msgs.append("Repealed")
328
- if hide_repeal_category: status_msgs.append("Repeal Category")
329
-
330
- filter_msg = " (Filtered out " + " and ".join(status_msgs) + ")" if status_msgs else ""
331
- return similarity_text + f"No matching resolutions found{filter_msg}."
332
-
333
- search_ranking = 1
334
- # Get top 20 results from the sorted and filtered list
335
- for index, sim_score in filtered_indexed_similarities[:20]:
336
- resolution = ga_resolutions_data[index]
337
-
338
- title = resolution.get('title', 'Untitled Resolution')
339
- res_id = resolution.get('id', 'N/A')
340
- council = resolution.get('council', 1)
341
- status = resolution.get('status')
342
-
343
- # Add [REPEALED] marker if the status is "Repealed"
344
- status_marker = "[REPEALED] " if status == "Repealed" else ""
345
-
346
- # Construct the NationStates URL
347
- url = f"https://www.nationstates.net/page=WA_past_resolution/id={res_id}/council={council}"
348
-
349
- # Format as Markdown link with the status marker
350
- similarity_text += f"{search_ranking}. {status_marker}[#{res_id} {title}]({url}), Similarity: {sim_score:.4f}\n"
351
-
352
- search_ranking += 1
353
-
354
- return similarity_text
355
  except Exception as e:
356
  return f"An error occurred during GA resolution search: {e}"
357
 
358
 
359
  # --- Gradio Interface ---
360
 
361
- """
362
- For information on how to customize the Gradio Blocks and Tabs, peruse the gradio docs:
363
- https://www.gradio.app/docs/blocks
364
- https://www.gradio.app/docs/tabs
365
- https://www.gradio.app/docs/interface (used within tabs)
366
- """
367
-
368
  with gr.Blocks() as demo:
369
  gr.Markdown("""
370
  # NationStates Semantic Search
371
- Search through NationStates issues/GA resolutions using semantic search.
372
- Search time depends on how long your query is. For single words or sentences, expect an answer in less than 5 seconds. For long paragraphs/blocks of text, it might take up to a minute for the AI search engine to finish.
373
  """)
374
 
375
  with gr.Tabs() as tabs:
 
376
  with gr.TabItem("Issue Search"):
377
- gr.Markdown(f"""
378
  ### Search NationStates Issues
379
- Search through first {len(issue_titles)} issues. This uses semantic search, which finds related concepts/ideas, not as good with exact keywords. Feel free to try words, sentences, or paragraphs!
 
 
380
  """)
381
  issue_search_interface = gr.Interface(
382
- fn=get_issue_similarity_rankings,
383
  inputs=[
384
- gr.Textbox(label="Search term", placeholder="What issue are you looking for?"),
385
- gr.Radio(["semantic", "loose", "strict"], label="Search Type", value="semantic", # Updated options
386
- info="Choose search type: 'semantic' for conceptual similarity, 'loose' for keyword matching, 'strict' for exact phrase matching.") # Updated info
 
 
387
  ],
388
  outputs=gr.Markdown(),
389
  examples=[
390
- # Examples for Issue Search (search_term, search_type)
391
- ["coffee", "semantic"],
392
- ["land value tax", "loose"],
393
- ["Elon Musk", "loose"],
394
- ["After an corrupted election, citizens demand the real results, after discovering it was rigged. ",
395
- "semantic"],
396
- [
397
- "Eureka! A new scientific law regarding the universe's expansion may have just been discovered at the University of @@CAPITAL@@. Unfortunately, tempers are flaring over who should get naming credit. Maxtopian grad student Georgie Bubble claims the work alone while his boss Dr.@@RANDOMNAME1@@ claims that all work in the University is @@NAME@@’s collectively. Your Minister of Education has elevated this to your desk.",
398
- "semantic"],
399
- ["tax", "strict"], # New example for strict
400
- ["environmental protection", "strict"] # New example for strict
401
  ],
402
  title=None,
403
  description=None,
404
  submit_btn="Search Issues",
405
- article="Made by [Jiangbei](www.nationstates.net/nation=jiangbei). Issue data from [Valentine Z](https://www.nationstates.net/nation=valentine_z). Powered by [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3)."
406
  )
407
 
 
408
  with gr.TabItem("GA Resolution Search"):
409
- gr.Markdown(f"""
410
- ### Search NationStates General Assembly Resolutions
411
- Search through first {len(ga_resolutions_data)} General Assembly resolutions. This uses semantic search, which finds related concepts/ideas, not as good with exact keywords. Feel free to try words, sentences, or paragraphs!
412
- """)
413
-
414
- # Define inputs for the GA search interface
415
  ga_search_term_input = gr.Textbox(label="Search term", placeholder="What are you looking for?")
416
  ga_hide_repealed_checkbox = gr.Checkbox(value=True, label="Hide repealed resolutions")
417
  ga_hide_repeal_category_checkbox = gr.Checkbox(value=True, label="Hide repeals")
418
- ga_search_type_radio = gr.Radio(["semantic", "loose", "strict"], label="Search Type", value="semantic", # Updated options
419
- info="Choose search type: 'semantic' for conceptual similarity, 'loose' for keyword matching, 'strict' for exact phrase matching.") # Updated info
420
 
421
  ga_search_interface = gr.Interface(
422
  fn=search_ga_resolutions,
423
- # Pass inputs in the order expected by the function
424
  inputs=[
425
  ga_search_term_input,
426
  ga_hide_repealed_checkbox,
@@ -429,23 +480,20 @@ with gr.Blocks() as demo:
429
  ],
430
  outputs=gr.Markdown(),
431
  examples=[
432
- # Examples for GA Resolution Search (search_term, hide_repealed, hide_repeal_category, search_type)
433
  ["condemn genocide", True, True, "semantic"],
434
  ["rights of animals", True, True, "loose"],
435
  ["regulating space mining", True, True, "semantic"],
436
  ["founding of the World Assembly", True, True, "semantic"],
437
  ["environmental protection", True, True, "semantic"],
438
- ["human rights", True, True, "strict"], # New example for strict
439
- ["World Assembly", True, True, "strict"] # New example for strict
440
  ],
441
  title=None,
442
  description=None,
443
  submit_btn="Search Resolutions",
444
- article="Made by [Jiangbei](www.nationstates.net/nation=jiangbei). GA Resolution data parsed from NationStates. Powered by [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3)."
445
  )
446
 
447
  # --- Launch App ---
448
  if __name__ == "__main__":
449
- # Set share=True to make the app accessible externally (requires ngrok)
450
- # share=False is default and runs locally
451
  demo.launch()
 
3
  import numpy as np
4
  import json
5
  import os
6
+ import re
7
 
8
  # --- Configuration and Global Data Loading ---
9
 
10
  # Determine the directory of the script to load files relative to it
11
  script_dir = os.path.dirname(os.path.abspath(__file__))
12
 
13
+ # Original issue-level artifacts (kept for sparse/loose and strict)
14
  issue_embeddings_paths = {
15
+ # We will still attempt to load original dense (semantic) if present,
16
+ # but semantic search will use component-level embeddings. This is optional.
17
+ 'semantic': os.path.join(script_dir, 'ns_issues_semantic_bge-m3.npy'),
18
+ 'loose': os.path.join(script_dir, 'ns_issues_loose_bge-m3.npy'),
19
  }
20
  issue_titles_path = os.path.join(script_dir, 'issue_titles.json')
21
 
22
+ # Component-level artifacts (used for semantic only)
23
+ issue_components_paths = {
24
+ 'semantic': os.path.join(script_dir, 'ns_issue_components_semantic_bge-m3.npy'),
25
+ # There is intentionally no component-level 'loose' artifact; loose search stays issue-level.
26
+ }
27
+ issue_components_meta_path = os.path.join(script_dir, 'ns_issue_components_meta.json')
28
+ issue_titles_components_path = os.path.join(script_dir, 'issue_titles_components.json')
29
+
30
+ # GA resolution artifacts (unchanged)
31
  ga_embeddings_paths = {
32
+ 'semantic': os.path.join(script_dir, 'ns_ga_resolutions_semantic_bge-m3.npy'),
33
+ 'loose': os.path.join(script_dir, 'ns_ga_resolutions_loose_bge-m3.npy'),
34
  }
35
  ga_resolutions_path = os.path.join(script_dir, 'parsed_ga_resolutions.json')
36
 
37
  print("Loading BGE-M3 model...")
38
  try:
 
 
39
  model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
40
  print("Model loaded successfully.")
41
  except Exception as e:
 
43
  print("Please ensure you have an internet connection or the model is cached locally.")
44
  model = None # Indicate model loading failed
45
 
46
+ # Issue data storage (issue-level and component-level)
47
  issue_all_embeddings = {
48
+ 'semantic': None, # optional legacy dense; not used for semantic queries in this app
49
+ 'loose': None, # issue-level sparse, used for loose search
50
  }
51
  issue_titles = {}
52
+ all_issue_raw_texts = [] # For strict search (issue-level)
53
+
54
+ issue_components_embeddings = {
55
+ 'semantic': None, # dense component-level embedding matrix
56
+ }
57
+ issue_components_meta = [] # list of dicts aligned to component rows
58
+ issue_titles_components = {}
59
 
60
  print("Loading issue data...")
61
  try:
62
+ # Load issue-level embeddings (kept for sparse/loose and optional legacy dense)
63
+ for embed_type, path in issue_embeddings_paths.items():
64
+ if os.path.exists(path):
65
+ if embed_type == 'loose':
66
+ issue_all_embeddings[embed_type] = np.load(path, allow_pickle=True).tolist()
 
 
 
 
 
 
67
  else:
68
+ issue_all_embeddings[embed_type] = np.load(path)
69
+ shape_or_len = issue_all_embeddings[embed_type].shape if hasattr(issue_all_embeddings[embed_type], 'shape') else len(issue_all_embeddings[embed_type])
70
+ print(f" Loaded {embed_type} issue embeddings from {path} (Shape/Len: {shape_or_len})")
71
+ else:
72
+ print(f" Warning: {embed_type} issue embeddings not found at {path}.")
73
+ issue_all_embeddings[embed_type] = None
74
 
75
+ # Load titles (issue-level)
76
+ if os.path.exists(issue_titles_path):
77
  with open(issue_titles_path, encoding='utf-8') as file:
78
  issue_titles = json.load(file)
79
+ print(f"Issue titles loaded: {len(issue_titles)} issues.")
80
+ else:
81
+ print(f" Warning: issue_titles.json not found at {issue_titles_path}")
82
+
83
+ # Load raw issue texts for strict search
84
+ issues_input_dir = os.path.join(script_dir, 'small_scripts', 'make_embedding',
85
+ 'NationStates-Issue-Megathread', '002 - Issue Megalist (MAIN)')
86
+ issue_files_for_raw_load = []
87
+ file_pattern = re.compile(r'(\d+) TO (\d+)\.txt')
88
+
89
+ if os.path.isdir(issues_input_dir):
90
+ for filename in os.listdir(issues_input_dir):
91
+ if filename.endswith('.txt'):
92
+ match = file_pattern.match(filename)
93
+ if match:
94
+ start_num = int(match.group(1))
95
+ issue_files_for_raw_load.append((start_num, filename))
96
+ issue_files_for_raw_load.sort(key=lambda x: x[0])
97
+ issue_files_for_raw_load = [os.path.join(issues_input_dir, filename) for _, filename in issue_files_for_raw_load]
98
+
99
+ for filepath in issue_files_for_raw_load:
100
+ with open(filepath, 'r', encoding='utf-8') as file:
101
+ issues_text_in_file = file.read()
102
+ issues_list_in_file = [
103
+ issue.strip() for issue in issues_text_in_file.split("[hr][/hr]") if issue.strip()
104
+ ]
105
+ all_issue_raw_texts.extend(issues_list_in_file)
106
+ print(f" Loaded {len(all_issue_raw_texts)} raw issue texts for strict search.")
107
+ else:
108
+ print(f" Warning: Issue text directory '{issues_input_dir}' not found. Strict issue search will not work.")
109
+
110
+ # Load component-level artifacts (semantic only)
111
+ for embed_type, path in issue_components_paths.items():
112
+ if os.path.exists(path):
113
+ issue_components_embeddings[embed_type] = np.load(path)
114
+ print(f" Loaded component {embed_type} embeddings from {path} (Shape: {issue_components_embeddings[embed_type].shape})")
115
  else:
116
+ print(f" Warning: component {embed_type} embeddings not found at {path}.")
117
+
118
+ if os.path.exists(issue_components_meta_path):
119
+ with open(issue_components_meta_path, encoding='utf-8') as f:
120
+ issue_components_meta = json.load(f)
121
+ print(f" Loaded component meta: {len(issue_components_meta)} items.")
122
+ else:
123
+ print(f" Warning: component meta not found at {issue_components_meta_path}.")
124
+
125
+ if os.path.exists(issue_titles_components_path):
126
+ with open(issue_titles_components_path, encoding='utf-8') as f:
127
+ issue_titles_components = json.load(f)
128
+ print(f" Loaded component issue titles: {len(issue_titles_components)}")
129
+ else:
130
+ # Fallback to issue-level titles if component titles not present
131
+ issue_titles_components = issue_titles
132
 
133
  except FileNotFoundError as e:
134
  print(f"Error loading issue data: {e}")
135
+ print(f"Please ensure embedding files and '{os.path.basename(issue_titles_path)}' are in the same directory as app.py")
 
136
  except Exception as e:
137
  print(f"Error loading issue data: {e}")
138
 
139
+ # GA resolution data storage (unchanged)
140
  ga_all_embeddings = {
141
  'semantic': None,
142
  'loose': None,
 
146
  print("Loading GA resolution data...")
147
  try:
148
  if model: # Only attempt to load embeddings if model is available
 
149
  for embed_type, path in ga_embeddings_paths.items():
150
  if os.path.exists(path):
151
+ if embed_type == 'loose':
152
  ga_all_embeddings[embed_type] = np.load(path, allow_pickle=True).tolist()
153
+ else:
154
  ga_all_embeddings[embed_type] = np.load(path)
155
+ shape_or_len = ga_all_embeddings[embed_type].shape if hasattr(ga_all_embeddings[embed_type], 'shape') else len(ga_all_embeddings[embed_type])
156
+ print(f" Loaded {embed_type} GA embeddings from {path} (Shape/Len: {shape_or_len})")
157
  else:
158
+ print(f" Warning: {embed_type} GA embeddings not found at {path}.")
159
+ ga_all_embeddings[embed_type] = None
160
 
161
+ if os.path.exists(ga_resolutions_path):
162
+ with open(ga_resolutions_path, encoding='utf-8') as file:
163
+ ga_resolutions_data = json.load(file)
164
+ print(f"GA resolution data loaded: {len(ga_resolutions_data)} resolutions.")
165
+ else:
166
+ print(f" Warning: GA data file not found at {ga_resolutions_path}")
167
  except FileNotFoundError as e:
168
  print(f"Error loading GA resolution data: {e}")
169
+ print(f"Please ensure GA embedding files and '{os.path.basename(ga_resolutions_path)}' are in the same directory as app.py")
 
170
  except Exception as e:
171
  print(f"Error loading GA resolution data: {e}")
172
 
173
 
174
+ # --- Search Utilities ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
  def _extract_context(text: str, query: str):
177
+ """Extracts the first line containing the query and highlights all mentions of it (case-insensitive)."""
178
  text_lines = text.split('\n')
179
  query_lower = query.lower()
 
180
  for line in text_lines:
181
  if query_lower in line.lower():
 
 
182
  highlighted_line = re.sub(re.escape(query), lambda m: f"**{m.group(0)}**", line, flags=re.IGNORECASE)
183
  return f'> {highlighted_line}'
184
+ return ""
185
+
186
 
187
+ # --- Issue Search (Component-level semantic, Issue-level loose/strict) ---
188
 
189
def _render_issue_results(header, rows, empty_message):
    """Join *header* with the result *rows*, or with *empty_message* when there are no rows."""
    return "\n".join([header, *rows] if rows else [header, empty_message])


def _search_issues_semantic(query, scope):
    """Rank issue components (descriptions/options) by dense similarity to *query*."""
    if not model:
        return "Model failed to load. Cannot perform search."
    corpus = issue_components_embeddings.get('semantic')
    if corpus is None or not len(issue_components_meta):
        return "Component-level semantic embeddings or metadata not loaded. Cannot run semantic search."

    # Only the dense vector is used here, so the sparse output is not requested.
    encoded = model.encode([query],
                           return_dense=True,
                           return_sparse=False,
                           return_colbert_vecs=False)
    scores = (encoded['dense_vecs'] @ corpus.T)[0]  # one score per component row

    # Any scope other than 'descriptions'/'options' (i.e. 'both') keeps every component.
    wanted_type = {'descriptions': 'desc', 'options': 'option'}.get(scope)
    # zip() stops at the shorter sequence, so a metadata/embedding length
    # mismatch cannot raise IndexError.
    candidates = [
        (idx, score)
        for idx, (score, meta) in enumerate(zip(scores, issue_components_meta))
        if wanted_type is None or meta.get('component_type') == wanted_type
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    rows = []
    for rank, (idx, score) in enumerate(candidates[:20], start=1):
        meta = issue_components_meta[idx]
        issue_idx = meta['issue_index']
        title = issue_titles_components.get(str(issue_idx), f"Issue {issue_idx}")
        if meta['component_type'] == 'desc':
            label = f"{title} — Description"
        else:
            label = f"{title} — Option {meta['option_index']}"
        rows.append(f"{rank}. {label}, Similarity: {score:.4f}")
    return _render_issue_results(f"# Top 20 Issue Results (Semantic, scope={scope})",
                                 rows, "No matches found.")


def _search_issues_loose(query):
    """Rank whole issues by sparse (lexical-weight) overlap with *query*."""
    if not model:
        return "Model failed to load. Cannot perform search."
    corpus_sparse = issue_all_embeddings.get('loose')
    if corpus_sparse is None:
        return "Issue-level sparse embeddings not loaded. Cannot run loose search."

    # Only lexical weights are used here, so the dense output is not requested.
    encoded = model.encode([query],
                           return_dense=False,
                           return_sparse=True,
                           return_colbert_vecs=False)
    lexical = encoded.get('lexical_weights')
    if not lexical:
        return "Sparse query failed (no lexical weights)."
    q_sparse = lexical[0]

    # Keep only positive scores: a zero score is not a keyword hit and
    # should not be listed as a "top" result.
    ranked = [
        (idx, score)
        for idx, score in enumerate(
            model.compute_lexical_matching_score(q_sparse, doc) for doc in corpus_sparse
        )
        if score > 0
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    rows = [
        f"{rank}. {issue_titles.get(str(idx), f'Unknown Issue (Index {idx})')}, Similarity: {score:.4f}"
        for rank, (idx, score) in enumerate(ranked[:20], start=1)
    ]
    return _render_issue_results("# Top 20 Issue Results (Loose keyword, scope ignored)",
                                 rows, "No matches found.")


def _search_issues_strict(query):
    """List issues whose raw text contains *query* (case-insensitive), in corpus order."""
    if not all_issue_raw_texts:
        return "Raw issue texts not loaded. Strict search is unavailable."

    needle = query.lower()
    matches = [idx for idx, text in enumerate(all_issue_raw_texts) if needle in text.lower()]

    rows = []
    for rank, idx in enumerate(matches[:20], start=1):
        issue_title = issue_titles.get(str(idx), f"Unknown Issue (Index {idx})")
        context = _extract_context(all_issue_raw_texts[idx], query)
        rows.append(f"{rank}. {issue_title}\n{context}\n")
    return _render_issue_results("# Top 20 Issue Search Results (Strict exact/substring)",
                                 rows, "No exact matches found.")


def search_issues(query: str, search_type: str = 'semantic', scope: str = 'both'):
    """
    Issue search dispatcher:
    - semantic: component-level dense with scope (descriptions | options | both).
    - loose: issue-level sparse (scope is ignored).
    - strict: issue-level exact/substring match over raw texts (scope is ignored).

    Always returns a Markdown/plain-text string; failures are reported in the
    returned text rather than raised.
    """
    try:
        # A whitespace-only query would match every issue in strict mode, so it
        # is treated as empty. The query itself is not stripped, so deliberate
        # padding such as " tax " still works.
        if not query or not query.strip():
            return "Please enter a search term."

        # The model is checked inside the semantic/loose helpers only:
        # strict search is plain substring matching and works without it.
        if search_type == 'semantic':
            return _search_issues_semantic(query, scope)
        if search_type == 'loose':
            return _search_issues_loose(query)
        if search_type == 'strict':
            return _search_issues_strict(query)
        return f"Unsupported search type: {search_type}"

    except Exception as e:
        return f"An error occurred during issue search: {e}"
302
 
303
 
304
+ # --- GA Resolution Search (unchanged logic) ---
305
+
306
+ def _perform_search_ga(search_term: str, corpus_embeddings_dict: dict, search_type: str):
307
+ if not model:
308
+ raise ValueError("Model failed to load. Cannot perform search.")
309
+ if not search_term:
310
+ raise ValueError("Please enter a search term.")
311
+
312
+ corpus_embeddings = corpus_embeddings_dict.get(search_type)
313
+ if corpus_embeddings is None:
314
+ raise ValueError(f"Corpus data for search type '{search_type}' not loaded. Cannot perform search.")
315
+
316
+ query_embeddings = model.encode([search_term],
317
+ return_dense=True,
318
+ return_sparse=True,
319
+ return_colbert_vecs=False)
320
+
321
+ if search_type == 'semantic':
322
+ query_vec = query_embeddings['dense_vecs'] # Shape: (1, embedding_dim)
323
+ similarity_scores = (query_vec @ corpus_embeddings.T)[0]
324
+ elif search_type == 'loose':
325
+ if 'lexical_weights' not in query_embeddings or not query_embeddings['lexical_weights']:
326
+ raise ValueError("Lexical weights (sparse) not returned for query. Model or configuration issue.")
327
+ query_sparse_dict = query_embeddings['lexical_weights'][0]
328
+ similarity_scores = np.array([
329
+ model.compute_lexical_matching_score(query_sparse_dict, doc_sparse_dict)
330
+ for doc_sparse_dict in corpus_embeddings
331
+ ])
332
+ else:
333
+ raise ValueError(f"Unsupported embedding search type: {search_type}")
334
+
335
+ indexed_similarities = [(i, score) for i, score in enumerate(similarity_scores)]
336
+ sorted_similarities = sorted(indexed_similarities, key=lambda item: item[1], reverse=True)
337
+ return sorted_similarities
338
+
339
def _ga_is_hidden(resolution, hide_repealed, hide_repeal_category):
    """Return True when the active filters exclude *resolution* from the results."""
    if hide_repealed and resolution.get('status') == "Repealed":
        return True
    return bool(hide_repeal_category and resolution.get('category') == "Repeal")


def _ga_filter_note(hide_repealed, hide_repeal_category):
    """Describe the active filters for the 'no results' message ('' when none are on)."""
    status_msgs = []
    if hide_repealed:
        status_msgs.append("Repealed")
    if hide_repeal_category:
        status_msgs.append("Repeal Category")
    return " (Filtered out " + " and ".join(status_msgs) + ")" if status_msgs else ""


def _ga_resolution_link(resolution):
    """Format one resolution as a Markdown link, prefixed with [REPEALED] when applicable."""
    title = resolution.get('title', 'Untitled Resolution')
    res_id = resolution.get('id', 'N/A')
    council = resolution.get('council', 1)
    status_marker = "[REPEALED] " if resolution.get('status') == "Repealed" else ""
    url = f"https://www.nationstates.net/page=WA_past_resolution/id={res_id}/council={council}"
    return f"{status_marker}[#{res_id} {title}]({url})"


def search_ga_resolutions(search_term: str, hide_repealed: bool, hide_repeal_category: bool,
                          search_type: str = 'semantic'):
    """Search GA resolutions and return the top 20 hits as Markdown.

    Args:
        search_term: Query text.
        hide_repealed: Drop resolutions whose status is "Repealed".
        hide_repeal_category: Drop resolutions whose category is "Repeal".
        search_type: 'strict' for case-insensitive substring matching on the
            resolution body; any other value is forwarded to
            ``_perform_search_ga`` (embedding-based search).

    Returns:
        A Markdown string. Errors are reported in the returned text, not raised.
    """
    try:
        # A whitespace-only query would match every body in strict mode.
        if not search_term or not search_term.strip():
            return "Please enter a search term."

        if search_type == 'strict':
            if not ga_resolutions_data:
                return "GA resolution data not loaded. Strict search is unavailable."

            ql = search_term.lower()
            # `or ''` guards against a body that is present but None.
            strict_matches = [
                i for i, resolution in enumerate(ga_resolutions_data)
                if ql in (resolution.get('body') or '').lower()
                and not _ga_is_hidden(resolution, hide_repealed, hide_repeal_category)
            ]

            out = ["# Top 20 GA Resolution Search Results (Strict)"]
            if not strict_matches:
                filter_msg = _ga_filter_note(hide_repealed, hide_repeal_category)
                return "\n".join(out + [f"No exact matches found{filter_msg}."])

            for rank, index in enumerate(strict_matches[:20], start=1):
                resolution = ga_resolutions_data[index]
                context = _extract_context(resolution.get('body') or '', search_term)
                out.append(f"{rank}. {_ga_resolution_link(resolution)}, Match: 1.0000\n{context}\n")
            return "\n".join(out)

        # Embedding-based GA search
        raw_sorted = _perform_search_ga(search_term, ga_all_embeddings, search_type)

        # Filter by status/category; indices with no matching resolution record are skipped.
        filtered = [
            (index, score) for index, score in raw_sorted
            if index < len(ga_resolutions_data)
            and not _ga_is_hidden(ga_resolutions_data[index], hide_repealed, hide_repeal_category)
        ]

        out = [f"# Top 20 GA Resolution Search Results ({search_type.capitalize()})"]
        if not filtered:
            filter_msg = _ga_filter_note(hide_repealed, hide_repeal_category)
            return "\n".join(out + [f"No matching resolutions found{filter_msg}."])

        for rank, (index, score) in enumerate(filtered[:20], start=1):
            link = _ga_resolution_link(ga_resolutions_data[index])
            out.append(f"{rank}. {link}, Similarity: {score:.4f}")
        return "\n".join(out)

    except Exception as e:
        return f"An error occurred during GA resolution search: {e}"
419
 
420
 
421
  # --- Gradio Interface ---
422
 
 
 
 
 
 
 
 
423
# Two-tab UI: one gr.Interface per corpus, both rendering Markdown results.
with gr.Blocks() as demo:
    gr.Markdown("""
    # NationStates Semantic Search
    Search NationStates issues and GA resolutions. Choose semantic for conceptual similarity, loose for keyword matching, and strict for exact substring queries.
    """)

    with gr.Tabs() as tabs:
        # Issue Search Tab
        with gr.TabItem("Issue Search"):
            gr.Markdown("""
            ### Search NationStates Issues
            - Semantic: component-level (descriptions and/or options), honors Scope.
            - Loose: issue-level keywords (Scope is ignored).
            - Strict: issue-level exact/substring (Scope is ignored).
            """)
            issue_search_interface = gr.Interface(
                fn=search_issues,
                inputs=[
                    gr.Textbox(label="Search term", placeholder="What issue or option are you looking for?"),
                    gr.Radio(["semantic", "loose", "strict"], label="Search Type", value="semantic",
                             info="semantic: conceptual (component-level); loose: keyword (issue-level); strict: exact substring (issue-level)"),
                    gr.Radio(["both", "descriptions", "options"], label="Scope (semantic only)", value="both",
                             info="Only applies to semantic search; ignored for loose and strict.")
                ],
                outputs=gr.Markdown(),
                examples=[
                    ["coffee", "semantic", "both"],
                    ["land value tax", "semantic", "descriptions"],
                    ["chainsaw maniacs", "semantic", "options"],
                    ["Elon Musk", "loose", "both"],
                    ["environmental protection", "strict", "both"]
                ],
                title=None,
                description=None,
                submit_btn="Search Issues",
                # The link needs an explicit scheme; without it Markdown treats
                # the target as a path relative to the app.
                article="Made by [Jiangbei](https://www.nationstates.net/nation=jiangbei). Issues powered by component-level semantic (BAAI/bge-m3) and issue-level sparse keywords."
            )

        # GA Resolution Search Tab
        with gr.TabItem("GA Resolution Search"):
            gr.Markdown("""
            ### Search NationStates General Assembly Resolutions
            Use semantic for concepts, loose for keyword matching, or strict for exact substring.
            """)
            ga_search_term_input = gr.Textbox(label="Search term", placeholder="What are you looking for?")
            ga_hide_repealed_checkbox = gr.Checkbox(value=True, label="Hide repealed resolutions")
            ga_hide_repeal_category_checkbox = gr.Checkbox(value=True, label="Hide repeals")
            ga_search_type_radio = gr.Radio(["semantic", "loose", "strict"], label="Search Type", value="semantic",
                                            info="semantic: conceptual similarity; loose: keyword matching; strict: exact substring")

            ga_search_interface = gr.Interface(
                fn=search_ga_resolutions,
                # Order matches search_ga_resolutions' parameters and the
                # four-column example rows below.
                inputs=[
                    ga_search_term_input,
                    ga_hide_repealed_checkbox,
                    ga_hide_repeal_category_checkbox,
                    ga_search_type_radio
                ],
                outputs=gr.Markdown(),
                examples=[
                    ["condemn genocide", True, True, "semantic"],
                    ["rights of animals", True, True, "loose"],
                    ["regulating space mining", True, True, "semantic"],
                    ["founding of the World Assembly", True, True, "semantic"],
                    ["environmental protection", True, True, "semantic"],
                    ["human rights", True, True, "strict"],
                    ["World Assembly", True, True, "strict"]
                ],
                title=None,
                description=None,
                submit_btn="Search Resolutions",
                article="Made by [Jiangbei](https://www.nationstates.net/nation=jiangbei). GA data parsed from NationStates. Powered by BAAI/bge-m3."
            )

# --- Launch App ---
if __name__ == "__main__":
    demo.launch()
issue_titles.json CHANGED
The diff for this file is too large to render. See raw diff
 
issue_titles_components.json ADDED
The diff for this file is too large to render. See raw diff
 
ns_issue_components_meta.json ADDED
The diff for this file is too large to render. See raw diff
 
ns_issue_components_semantic_bge-m3.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f5128a11cd81849b9eafd4f312e323a84edacf88177f9cfd28ae0c2a589232b
3
+ size 16728192
ns_issues_loose_bge-m3.npy CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:99d43b309e690e846e24ac5ec3b4406f53842f132df6fa3bb11659494cfd3772
3
- size 8418649
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1921105894133a81c5a79d60fb9670b48f8dc14d43f14cc02cf9a5405e7ed312
3
+ size 8416495
small_scripts/make_embedding/embedding.py CHANGED
@@ -1,230 +1,340 @@
 
 
1
  import os
2
  import re
 
3
  import numpy as np
4
  from FlagEmbedding import BGEM3FlagModel
5
 
6
- # --- Configuration ---
7
- # IMPORTANT: Adjust MODEL_PATH to your model's actual local path.
8
  MODEL_PATH = '../../../../Downloads/bge-m3'
9
-
10
- # Output directory for the final consolidated .npy files.
11
- # If this script is in 'project_root/scripts/', and app.py is in 'project_root/',
12
- # then '../' would be appropriate here. If both are in the same directory, use '.'
13
  OUTPUT_DIR = '../../'
14
-
15
- # Temporary cache directory for per-file embeddings (relative to script location)
16
  CACHE_DIR = './.issue_embeddings_cache'
17
 
18
- # --- Embedding Generation Control ---
19
- # Set to True to re-embed all files regardless of cached files.
20
- # If False, existing cached files will be skipped unless they are in CHANGED_FILES.
21
  RE_EMBED_ALL = False
22
- # List of specific filenames (e.g., '0000 TO 0025.txt') to re-embed.
23
- # Only effective if RE_EMBED_ALL is False.
24
- CHANGED_FILES = [] # e.g., ['0000 TO 0025.txt', '0026 TO 0050.txt']
25
 
 
 
 
 
 
26
 
27
- # --- Helper Functions ---
28
  def get_issue_files(directory="."):
29
- """Gets and sorts issue files by their starting number from the filename pattern."""
30
  issue_files = []
31
- # Regex to extract the first number from filenames like "0000 TO 0025.txt"
32
  file_pattern = re.compile(r'(\d+) TO (\d+)\.txt')
33
-
34
  if not os.path.isdir(directory):
35
  print(f"Error: Directory '{directory}' not found.")
36
  return []
37
-
38
  for filename in os.listdir(directory):
39
  if filename.endswith('.txt'):
40
  match = file_pattern.match(filename)
41
  if match:
42
  start_num = int(match.group(1))
43
  issue_files.append((start_num, filename))
44
-
45
- # Sort by the extracted starting number to ensure correct global order
46
  issue_files.sort(key=lambda x: x[0])
47
- return [os.path.join(directory, filename) for _, filename in issue_files] # Return full paths
48
-
49
 
50
  def ensure_dirs(dirs):
51
- """Ensures that a list of directories exists."""
52
  for d in dirs:
53
  os.makedirs(d, exist_ok=True)
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
- # --- Main Embedding Function ---
57
- def encode_issues():
58
  print("Initializing BGEM3FlagModel...")
59
- # Setting use_fp16 to True speeds up computation with a slight performance degradation
60
  try:
61
  model = BGEM3FlagModel(MODEL_PATH, use_fp16=True)
62
  print("Model loaded.")
63
  except Exception as e:
64
  print(f"Error loading model from {MODEL_PATH}: {e}")
65
- print("Please ensure the model is downloaded to the specified path.")
66
  return
67
 
68
- issues_input_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'NationStates-Issue-Megathread/002 - Issue Megalist (MAIN)')
 
69
  issue_files = get_issue_files(issues_input_dir)
70
  if not issue_files:
71
- print(
72
- f"No issue files found matching the pattern 'NNNN TO NNNN.txt' in '{issues_input_dir}'. Please ensure files are present.")
73
  return
74
 
75
- # Prepare cache directories for individual file embeddings
76
- cache_dense_dir = os.path.join(CACHE_DIR, 'dense')
77
- cache_sparse_dir = os.path.join(CACHE_DIR, 'sparse')
78
- # Removed cache_colbert_dir
79
  ensure_dirs([cache_dense_dir, cache_sparse_dir])
80
-
81
- # Ensure output directory for final consolidated files exists
82
  os.makedirs(OUTPUT_DIR, exist_ok=True)
83
 
84
- print(f"Found {len(issue_files)} issue files to process. Starting embedding process...")
 
 
 
 
85
 
86
- # Process each issue file individually
87
- for i, filepath in enumerate(issue_files): # filepath is now full path
88
- filename = os.path.basename(filepath)
89
- print(f"\nProcessing file {i + 1}/{len(issue_files)}: {filename}")
90
 
91
- # Define cache paths for the embeddings of this specific file
92
- base_name = os.path.splitext(filename)[0] # e.g., "0000 TO 0025"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  file_cache_dense_path = os.path.join(cache_dense_dir, f"{base_name}.npy")
94
- file_cache_sparse_path = os.path.join(cache_sparse_dir, f"{base_name}.npy")
95
- # Removed file_cache_colbert_path
96
 
97
- # Check if re-embedding is needed for this file based on configuration
98
- is_cached = (os.path.exists(file_cache_dense_path) and
99
- os.path.exists(file_cache_sparse_path)) # Removed colbert cache check
 
 
100
 
 
101
  if not RE_EMBED_ALL and filename not in CHANGED_FILES and is_cached:
102
- print(f" Skipping {filename} (cached embeddings exist and no re-embed flags are set).")
103
- continue # Skip to next file
104
-
105
- try:
106
- with open(filepath, 'r', encoding='utf-8') as file:
107
- issues_text_in_file = file.read()
108
- # Split issues by the separator and remove any empty strings resulting from multiple separators
109
- issues_list_in_file = [
110
- issue.strip() for issue in issues_text_in_file.split("[hr][/hr]") if issue.strip()
111
- ]
112
-
113
- if not issues_list_in_file:
114
- print(f" Warning: No issues found in {filename} after splitting. Skipping encoding for this file.")
115
- continue # Skip to next file if no content
116
-
117
- print(f" Found {len(issues_list_in_file)} issues in {filename}. Encoding...")
118
-
119
- # Encode only Dense and Sparse vector types
120
- embeddings = model.encode(issues_list_in_file,
121
- batch_size=12, # Adjust batch_size based on your GPU/CPU memory
122
- max_length=8192, # Max length of input sequence
123
- return_dense=True,
124
- return_sparse=True, # This will return 'lexical_weights' for BGE-M3
125
- return_colbert_vecs=False) # <--- REMOVED COLBERT GENERATION
126
-
127
- # Save Semantic (Dense) Embeddings
128
- np.save(file_cache_dense_path, embeddings['dense_vecs'])
129
-
130
- # --- Save Loose (Sparse) Embeddings ---
131
- # 'lexical_weights' is a list of dictionaries, one for each item in the batch
132
- sparse_list_of_dicts = embeddings.get('lexical_weights')
133
-
134
- # Save this list of sparse dictionaries as a NumPy object array
135
- # This allows storing Python objects (dictionaries) in a NumPy array.
136
- np.save(file_cache_sparse_path, np.array(sparse_list_of_dicts, dtype=object), allow_pickle=True)
137
-
138
-
139
- print(f" Encoded and cached {len(issues_list_in_file)} issues from {filename}.")
140
-
141
- except Exception as e:
142
- print(f" Error processing {filename}: {e}")
143
- import traceback
144
- traceback.print_exc() # Print full traceback for debugging
145
- continue # Continue to the next file even if one fails
146
-
147
- print("\n--- Consolidation Phase: Combining cached embeddings ---")
148
-
149
- # Initialize lists to collect all embeddings in the correct global order
150
- final_semantic_embeddings_list = [] # Renamed from final_dense_embeddings_list
151
- final_loose_embeddings_list = [] # Renamed from final_sparse_embeddings_list
152
- # Removed final_colbert_embeddings_list
153
-
154
- # Re-get sorted file paths to ensure correct order for consolidation
155
- issue_files_for_consolidation = get_issue_files(issues_input_dir)
156
 
157
- global_issue_index = 0
158
- # Iterate through files again to load from cache and consolidate in sorted order
159
- for i, filepath in enumerate(issue_files_for_consolidation):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  filename = os.path.basename(filepath)
161
  base_name = os.path.splitext(filename)[0]
162
- file_cache_dense_path = os.path.join(cache_dense_dir, f"{base_name}.npy")
163
  file_cache_sparse_path = os.path.join(cache_sparse_dir, f"{base_name}.npy")
164
- # Removed file_cache_colbert_path
165
-
166
- # Only load if all cached embedding files for this issue file are present
167
- if (os.path.exists(file_cache_dense_path) and
168
- os.path.exists(file_cache_sparse_path)): # Removed colbert cache check
169
-
170
- # Load and append to the lists
171
- final_semantic_embeddings_list.append(np.load(file_cache_dense_path)) # Renamed
172
-
173
- # Load sparse dictionaries: it's a NumPy object array, convert to list of dicts
174
- loaded_sparse_dicts_for_file = np.load(file_cache_sparse_path, allow_pickle=True).tolist()
175
- final_loose_embeddings_list.extend(loaded_sparse_dicts_for_file) # Renamed
176
 
177
- # Removed loading ColBERT arrays
178
- # loaded_colbert_arrays_for_file = np.load(file_cache_colbert_path, allow_pickle=True).tolist()
179
- # final_colbert_embeddings_list.extend(loaded_colbert_arrays_for_file)
 
180
 
181
- # Count issues in this file to correctly update global_issue_index
182
- # We need to re-read the raw file to get the count
183
- with open(filepath, 'r', encoding='utf-8') as file:
184
- issues_text_in_file = file.read()
185
- issue_count_in_file = len(
186
- [issue.strip() for issue in issues_text_in_file.split("[hr][/hr]") if issue.strip()])
187
-
188
- global_issue_index += issue_count_in_file
189
  else:
190
- print(
191
- f" Warning: Cached embedding files for {filename} are incomplete or missing. Skipping in consolidation. This may affect global issue indexing.")
192
-
193
- if not final_semantic_embeddings_list: # Renamed
194
- print("No embeddings were successfully loaded for consolidation. No output files generated.")
195
- return
196
-
197
- # --- Final Save Phase ---
198
- # Concatenate all collected embeddings into single large NumPy arrays
199
- print("Concatenating and saving final consolidated embeddings...")
200
-
201
- # Semantic (Dense) embeddings
202
- final_semantic_array = np.vstack(final_semantic_embeddings_list) # Renamed
203
- np.save(os.path.join(OUTPUT_DIR, 'ns_issues_semantic_bge-m3.npy'), final_semantic_array) # Renamed file
204
- print(
205
- f" Saved semantic embeddings to {os.path.join(OUTPUT_DIR, 'ns_issues_semantic_bge-m3.npy')} (Shape: {final_semantic_array.shape})") # Renamed file and type
206
-
207
- # Loose (Sparse) embeddings (now a list of dictionaries, saved as object array)
208
- if final_loose_embeddings_list: # Renamed
209
- # Save the list of dictionaries as a NumPy object array
210
- final_loose_array = np.array(final_loose_embeddings_list, dtype=object) # Renamed
211
- np.save(os.path.join(OUTPUT_DIR, 'ns_issues_loose_bge-m3.npy'), final_loose_array, allow_pickle=True) # Renamed file
212
- print(
213
- f" Saved loose embeddings to {os.path.join(OUTPUT_DIR, 'ns_issues_loose_bge-m3.npy')} (Total objects: {len(final_loose_array)}, type: {type(final_loose_array)})") # Renamed file and type
214
- else:
215
- print(" No loose embeddings to save.") # Renamed
216
-
217
- # Removed ColBERT embeddings saving
218
- # if final_colbert_embeddings_list:
219
- # final_colbert_array = np.array(final_colbert_embeddings_list, dtype=object)
220
- # np.save(os.path.join(OUTPUT_DIR, 'ns_issues_colbert_bge-m3.npy'), final_colbert_array, allow_pickle=True)
221
- # print(f" Saved ColBERT embeddings to {os.path.join(OUTPUT_DIR, 'ns_issues_colbert_bge-m3.npy')} (Total objects: {len(final_colbert_array)}, type: {type(final_colbert_array)})")
222
- # else:
223
- # print(" No ColBERT embeddings to save.")
224
-
225
- print("\nEmbedding generation complete!")
226
-
227
 
228
- # Call this function to start the embedding process.
229
  if __name__ == "__main__":
230
- encode_issues()
 
1
+ # filename: encode_issues_components_and_sparse.py
2
+
3
  import os
4
  import re
5
+ import json
6
  import numpy as np
7
  from FlagEmbedding import BGEM3FlagModel
8
 
 
 
9
  MODEL_PATH = '../../../../Downloads/bge-m3'
 
 
 
 
10
  OUTPUT_DIR = '../../'
 
 
11
  CACHE_DIR = './.issue_embeddings_cache'
12
 
 
 
 
13
  RE_EMBED_ALL = False
14
+ CHANGED_FILES = []
15
+
16
+ ISSUE_SPLIT_MARKER = "[hr][/hr]"
17
 
18
# Matches one bracketed tag, opening or closing, e.g. "[b]", "[/b]", "[anchor=12]".
BB_TAG_RE = re.compile(r'\[(?:\/)?[^\]]+\]')


def strip_bbcode(s: str) -> str:
    """Return *s* with every bracketed BBCode-style tag removed.

    Used so that header and description detection compares the visible text
    only. Surrounding whitespace is left untouched.
    """
    without_tags = re.sub(BB_TAG_RE, '', s)
    return without_tags
23
 
 
24
def get_issue_files(directory="."):
    """Return the paths of the issue files in *directory*, ordered by starting number.

    Issue files are named ``<start> TO <end>.txt`` (e.g. ``0000 TO 0025.txt``).
    The whole filename must match, so leftovers such as
    ``0000 TO 0025.txt.bak.txt`` are ignored.

    Args:
        directory: Directory to scan (not recursive).

    Returns:
        List of ``os.path.join(directory, filename)`` paths sorted by the
        numeric start value; an empty list when *directory* does not exist
        (an error line is printed in that case).
    """
    if not os.path.isdir(directory):
        print(f"Error: Directory '{directory}' not found.")
        return []

    file_pattern = re.compile(r'(\d+) TO (\d+)\.txt')
    issue_files = []
    for filename in os.listdir(directory):
        # fullmatch anchors both ends; match() alone accepted trailing junk.
        match = file_pattern.fullmatch(filename)
        if match:
            issue_files.append((int(match.group(1)), filename))

    # Sort on (start number, filename) so ties do not depend on the
    # arbitrary order returned by os.listdir.
    issue_files.sort()
    return [os.path.join(directory, filename) for _, filename in issue_files]
 
38
 
39
def ensure_dirs(dirs):
    """Create each directory in *dirs* (with missing parents); existing ones are left as-is."""
    for target in dirs:
        os.makedirs(target, exist_ok=True)
42
 
43
def _split_raw_issues(raw_text):
    """Split *raw_text* on ISSUE_SPLIT_MARKER and return the stripped, non-empty blocks."""
    blocks = []
    for piece in raw_text.split(ISSUE_SPLIT_MARKER):
        trimmed = piece.strip()
        if trimmed:
            blocks.append(trimmed)
    return blocks
45
+
46
+ def _extract_title(issue_block):
47
+ for line in issue_block.splitlines():
48
+ line = line.strip()
49
+ if line:
50
+ return line
51
+ return "Untitled Issue"
52
+
53
def find_header_index(header: str, lines):
    """Return the index of the first line equal to *header*, or -1 when absent.

    Each line is compared after removing BBCode tags and surrounding
    whitespace; the comparison is case-insensitive.
    """
    wanted = header.lower()
    return next(
        (pos for pos, raw in enumerate(lines)
         if strip_bbcode(raw).strip().lower() == wanted),
        -1,
    )
61
+
62
def is_placeholder_issue(issue_block):
    """Return True when *issue_block* is only a 'TBD' stub with no real content.

    A block is a placeholder when either:
    - it has exactly one non-blank line and that line contains 'TBD', or
    - every non-blank line is an anchor title line ('[b][anchor=' prefix)
      containing 'TBD'.
    """
    content = [raw.strip() for raw in issue_block.splitlines() if raw.strip()]
    if not content:
        return False
    if len(content) == 1:
        return 'TBD' in content[0]
    # Several lines: a stub only if none of them is anything but a TBD title line.
    return all(text.startswith('[b][anchor=') and 'TBD' in text for text in content)
75
+
76
def _parse_issue_strict(issue_block: str, global_issue_index: int):
    """Split one issue block into its description and its option lines.

    The block must contain a "The Issue" header followed later by a
    "The Debate" header (both located with ``find_header_index``).

    Args:
        issue_block: Raw text of a single issue.
        global_issue_index: Index used only in diagnostics.

    Returns:
        ``(desc_text, option_lines)``: the single description line found
        between the two headers, and every non-blank stripped line after
        "The Debate".

    Raises:
        ValueError: a header is missing or out of order, there is no
            description line, or there is more than one description line.
    """
    lines = issue_block.splitlines()

    i_issue = find_header_index("The Issue", lines)
    i_debate = find_header_index("The Debate", lines)

    if i_issue == -1 or i_debate == -1 or i_debate <= i_issue:
        print(f"Parse error: missing 'The Issue' or 'The Debate' in issue #{global_issue_index}")
        raise ValueError(f"Parse error in issue #{global_issue_index}")

    between = lines[i_issue + 1:i_debate]
    # Lines that still have visible text once BBCode tags are removed.
    described = [raw for raw in between if strip_bbcode(raw).strip()]

    if len(described) > 1:
        print(f"Parse error: issue #{global_issue_index} has {len(described)} non-empty description lines (expected 1)")
        print(f"Description lines (raw): {described}")
        raise ValueError(f"Parse error in issue #{global_issue_index}")

    if described:
        desc_text = strip_bbcode(described[0]).strip()
    else:
        # Every line is blank once tags are removed. Fall back to the first
        # non-blank raw line as written: stripping tags from it again would
        # always give an empty description.
        first_raw = next((raw for raw in between if raw.strip()), None)
        if first_raw is None:
            print(f"Parse error: issue #{global_issue_index} has no usable description lines")
            raise ValueError(f"Parse error in issue #{global_issue_index}")
        desc_text = first_raw.strip()

    option_lines = [raw.strip() for raw in lines[i_debate + 1:] if raw.strip()]
    return desc_text, option_lines
112
+
113
+
114
+ import re
115
+
116
def format_issue_title_markdown(issue_block):
    """Format the title line of *issue_block* as Markdown.

    Only the first non-blank line is inspected. When it carries an
    ``[anchor=N]`` tag the result is ``#N: [title](forum link ending in #N)``;
    otherwise the cleaned title text alone is returned. All BBCode tags are
    removed from the title text.

    Raises:
        ValueError: the block has no non-blank line.
    """
    first_line = next(
        (raw.strip() for raw in issue_block.splitlines() if raw.strip()),
        None,
    )
    if first_line is None:
        print(f"Could not find issue title in {issue_block}")
        raise ValueError("Parse error in issue title")

    anchor_hit = re.search(r'\[anchor=(\d+)\]', first_line)

    # Preferred form: [anchor=1379]#1379[/anchor]: <title>
    titled = re.search(r'\[anchor=(\d+)\]\#\d+\[\/anchor\]:\s*(.*)', first_line)
    if titled:
        visible = titled.group(2).strip()
    else:
        # Otherwise take whatever follows the first colon, or the whole line.
        _, colon, tail = first_line.partition(':')
        visible = tail.strip() if colon else first_line

    visible = re.sub(r'\[\/?[^\]]+\]', '', visible).strip()

    if anchor_hit is None:
        return visible
    anchor = anchor_hit.group(1)
    return f"#{anchor}: [{visible}](https://forum.nationstates.net/viewtopic.php?f=13&t=88#{anchor})"
153
 
154
+ def encode_issues_components_and_sparse():
 
155
  print("Initializing BGEM3FlagModel...")
 
156
  try:
157
  model = BGEM3FlagModel(MODEL_PATH, use_fp16=True)
158
  print("Model loaded.")
159
  except Exception as e:
160
  print(f"Error loading model from {MODEL_PATH}: {e}")
 
161
  return
162
 
163
+ issues_input_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
164
+ 'NationStates-Issue-Megathread/002 - Issue Megalist (MAIN)')
165
  issue_files = get_issue_files(issues_input_dir)
166
  if not issue_files:
167
+ print(f"No issue files found in '{issues_input_dir}'.")
 
168
  return
169
 
170
+ cache_dense_dir = os.path.join(CACHE_DIR, 'dense_components')
171
+ cache_sparse_dir = os.path.join(CACHE_DIR, 'sparse_issues')
 
 
172
  ensure_dirs([cache_dense_dir, cache_sparse_dir])
 
 
173
  os.makedirs(OUTPUT_DIR, exist_ok=True)
174
 
175
+ # --- Component-level dense (semantic) ---
176
+ perfile_component_texts = []
177
+ perfile_component_meta = []
178
+ all_issue_titles = []
179
+ global_issue_index_offset = 0
180
 
181
+ # --- Issue-level sparse (loose) ---
182
+ perfile_issue_texts = []
183
+ titles_dict = {}
 
184
 
185
+ print(f"Parsing and preparing issue blocks from {len(issue_files)} files...")
186
+ for i, filepath in enumerate(issue_files):
187
+ filename = os.path.basename(filepath)
188
+ print(f" [{i+1}/{len(issue_files)}] Parsing file: {filename}")
189
+ with open(filepath, 'r', encoding='utf-8') as f:
190
+ raw = f.read()
191
+ issue_blocks = _split_raw_issues(raw)
192
+ file_components_texts = []
193
+ file_components_meta = []
194
+ file_issue_texts = []
195
+ file_issue_titles = []
196
+
197
+ for local_issue_idx, issue_block in enumerate(issue_blocks):
198
+ if is_placeholder_issue(issue_block):
199
+ continue # Skip placeholder/empty issues
200
+
201
+ title_line = _extract_title(issue_block)
202
+ this_issue_global_idx = global_issue_index_offset + local_issue_idx
203
+
204
+ titles_dict[str(this_issue_global_idx)] = format_issue_title_markdown(issue_block)
205
+
206
+ try:
207
+ desc_text, option_texts = _parse_issue_strict(issue_block, this_issue_global_idx)
208
+ except Exception as e:
209
+ print(f"Aborting due to parse error in issue #{this_issue_global_idx}")
210
+ raise
211
+
212
+ # Dense: description and options as separate components
213
+ file_components_texts.append(desc_text)
214
+ file_components_meta.append({
215
+ "issue_index": this_issue_global_idx,
216
+ "component_type": "desc",
217
+ "option_index": None
218
+ })
219
+ for opt_idx, opt_text in enumerate(option_texts, start=1):
220
+ file_components_texts.append(opt_text)
221
+ file_components_meta.append({
222
+ "issue_index": this_issue_global_idx,
223
+ "component_type": "option",
224
+ "option_index": opt_idx
225
+ })
226
+
227
+ # Sparse: whole issue block (not chunked)
228
+ file_issue_texts.append(issue_block)
229
+ file_issue_titles.append(title_line)
230
+
231
+ perfile_component_texts.append(file_components_texts)
232
+ perfile_component_meta.append(file_components_meta)
233
+ perfile_issue_texts.append(file_issue_texts)
234
+ global_issue_index_offset += len(issue_blocks)
235
+
236
+ # --- Dense embedding for components ---
237
+ print("\nStarting dense (semantic) embedding for components...")
238
+ all_dense_chunks = []
239
+ all_meta = []
240
+ for i, filepath in enumerate(issue_files):
241
+ filename = os.path.basename(filepath)
242
+ base_name = os.path.splitext(filename)[0]
243
  file_cache_dense_path = os.path.join(cache_dense_dir, f"{base_name}.npy")
 
 
244
 
245
+ texts = perfile_component_texts[i]
246
+ metas = perfile_component_meta[i]
247
+ if not texts:
248
+ print(f" [Dense] Skipping file {filename} (no components to embed).")
249
+ continue
250
 
251
+ is_cached = os.path.exists(file_cache_dense_path)
252
  if not RE_EMBED_ALL and filename not in CHANGED_FILES and is_cached:
253
+ print(f" [Dense] Loading cached embeddings for {filename} ({len(texts)} components).")
254
+ dense_vecs = np.load(file_cache_dense_path)
255
+ else:
256
+ print(f" [Dense] Embedding {len(texts)} components from {filename}...")
257
+ embeddings = model.encode(
258
+ texts,
259
+ batch_size=12,
260
+ max_length=8192,
261
+ return_dense=True,
262
+ return_sparse=False, # Only dense for components
263
+ return_colbert_vecs=False
264
+ )
265
+ dense_vecs = embeddings['dense_vecs']
266
+ np.save(file_cache_dense_path, dense_vecs)
267
+ print(f" [Dense] Saved cache for {filename} ({dense_vecs.shape[0]} components).")
268
+
269
+ all_dense_chunks.append(dense_vecs)
270
+ all_meta.extend(metas)
271
+
272
+ if not all_dense_chunks:
273
+ print("No component embeddings produced.")
274
+ return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
 
276
+ final_dense = np.vstack(all_dense_chunks)
277
+ dense_out = os.path.join(OUTPUT_DIR, 'ns_issue_components_semantic_bge-m3.npy')
278
+ meta_out = os.path.join(OUTPUT_DIR, 'ns_issue_components_meta.json')
279
+ titles_out = os.path.join(OUTPUT_DIR, 'issue_titles_components.json')
280
+
281
+ np.save(dense_out, final_dense)
282
+ with open(meta_out, 'w', encoding='utf-8') as f:
283
+ json.dump(all_meta, f, ensure_ascii=False)
284
+ with open(titles_out, 'w', encoding='utf-8') as f:
285
+ # Only titles for non-placeholder issues
286
+ json.dump(titles_dict, f, ensure_ascii=False)
287
+
288
+ print(f"\nDense embedding complete. Saved:")
289
+ print(f" Dense: {dense_out} shape={final_dense.shape}")
290
+ print(f" Meta: {meta_out} items={len(all_meta)}")
291
+ print(f" Titles: {titles_out} issues={len(titles_dict)}")
292
+
293
+ # --- Sparse embedding for whole issues, cached per file ---
294
+ print("\nStarting sparse (loose) embedding for whole issues (per file)...")
295
+ sparse_out = os.path.join(OUTPUT_DIR, 'ns_issues_loose_bge-m3.npy')
296
+ titles_sparse_out = os.path.join(OUTPUT_DIR, 'issue_titles.json')
297
+
298
+ all_sparse_chunks = []
299
+ for i, filepath in enumerate(issue_files):
300
  filename = os.path.basename(filepath)
301
  base_name = os.path.splitext(filename)[0]
 
302
  file_cache_sparse_path = os.path.join(cache_sparse_dir, f"{base_name}.npy")
 
 
 
 
 
 
 
 
 
 
 
 
303
 
304
+ issue_texts = perfile_issue_texts[i]
305
+ if not issue_texts:
306
+ print(f" [Sparse] Skipping file {filename} (no issues to embed).")
307
+ continue
308
 
309
+ is_cached = os.path.exists(file_cache_sparse_path)
310
+ if not RE_EMBED_ALL and filename not in CHANGED_FILES and is_cached:
311
+ print(f" [Sparse] Loading cached sparse embeddings for {filename} ({len(issue_texts)} issues).")
312
+ sparse_dicts = np.load(file_cache_sparse_path, allow_pickle=True).tolist()
 
 
 
 
313
  else:
314
+ print(f" [Sparse] Embedding {len(issue_texts)} issues from {filename}...")
315
+ embeddings = model.encode(
316
+ issue_texts,
317
+ batch_size=12,
318
+ max_length=8192,
319
+ return_dense=False,
320
+ return_sparse=True,
321
+ return_colbert_vecs=False
322
+ )
323
+ sparse_dicts = embeddings['lexical_weights']
324
+ np.save(file_cache_sparse_path, np.array(sparse_dicts, dtype=object), allow_pickle=True)
325
+ print(f" [Sparse] Saved cache for {filename} ({len(sparse_dicts)} issues).")
326
+
327
+ all_sparse_chunks.extend(sparse_dicts)
328
+
329
+ np.save(sparse_out, np.array(all_sparse_chunks, dtype=object), allow_pickle=True)
330
+ # Save the same issue-title mapping (titles_dict) alongside the sparse output
331
+ with open(titles_sparse_out, 'w', encoding='utf-8') as f:
332
+ json.dump(titles_dict, f, ensure_ascii=False)
333
+
334
+ print(f"\nSparse embedding complete. Saved:")
335
+ print(f" Sparse: {sparse_out} count={len(all_sparse_chunks)}")
336
+ print(f" Titles (sparse): {titles_sparse_out} issues={len(titles_dict)}")
337
+ print("Embedding generation (components dense, issues sparse, strict) complete!")
 
 
 
 
 
 
 
 
 
 
 
 
 
338
 
 
339
  if __name__ == "__main__":
340
+ encode_issues_components_and_sparse()