Spaces:

Bohaska
/

ns_issue_search

Running

App Files Files Community

Bohaska committed on Sep 18, 2025

Commit

0fcec84

1 Parent(s): b5ace46

chunk semantic issue search, fix issue titles

Browse files

Files changed (7) hide show

app.py +318 -270
issue_titles.json +0 -0
issue_titles_components.json +0 -0
ns_issue_components_meta.json +0 -0
ns_issue_components_semantic_bge-m3.npy +3 -0
ns_issues_loose_bge-m3.npy +2 -2
small_scripts/make_embedding/embedding.py +279 -169

app.py CHANGED Viewed

@@ -3,31 +3,39 @@ from FlagEmbedding import BGEM3FlagModel
 import numpy as np
 import json
 import os
-import re # Added for strict search context extraction
 # --- Configuration and Global Data Loading ---
 # Determine the directory of the script to load files relative to it
 script_dir = os.path.dirname(os.path.abspath(__file__))
-# Define paths for issue embedding types
 issue_embeddings_paths = {
-    'semantic': os.path.join(script_dir, 'ns_issues_semantic_bge-m3.npy'), # Renamed from fuzzy
-    'loose': os.path.join(script_dir, 'ns_issues_loose_bge-m3.npy'),       # Renamed from direct
 }
 issue_titles_path = os.path.join(script_dir, 'issue_titles.json')
-# Define paths for GA resolution embedding types
 ga_embeddings_paths = {
-    'semantic': os.path.join(script_dir, 'ns_ga_resolutions_semantic_bge-m3.npy'), # Renamed from fuzzy
-    'loose': os.path.join(script_dir, 'ns_ga_resolutions_loose_bge-m3.npy'),       # Renamed from direct
 }
 ga_resolutions_path = os.path.join(script_dir, 'parsed_ga_resolutions.json')
 print("Loading BGE-M3 model...")
 try:
-    # Use 'BAAI/bge-m3' to let FlagEmbedding handle downloading/caching.
-    # If you prefer to force a local path, change it here.
     model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
     print("Model loaded successfully.")
 except Exception as e:
@@ -35,72 +43,100 @@ except Exception as e:
     print("Please ensure you have an internet connection or the model is cached locally.")
     model = None  # Indicate model loading failed
-# Issue data storage for all types
 issue_all_embeddings = {
-    'semantic': None,
-    'loose': None,
 }
 issue_titles = {}
-all_issue_raw_texts = [] # New: To store raw issue texts for strict search
 print("Loading issue data...")
 try:
-    if model:  # Only attempt to load embeddings if model is available
-        # Load available embedding types for issues
-        for embed_type, path in issue_embeddings_paths.items():
-            if os.path.exists(path):
-                if embed_type == 'loose': # Only sparse is loaded as list of objects now
-                    # Load sparse dictionaries: it's a NumPy object array, convert to list of objects
-                    issue_all_embeddings[embed_type] = np.load(path, allow_pickle=True).tolist()
-                else: # Dense
-                    issue_all_embeddings[embed_type] = np.load(path)
-                print(
-                    f"  Loaded {embed_type} issue embeddings from {path} (Shape: {issue_all_embeddings[embed_type].shape if hasattr(issue_all_embeddings[embed_type], 'shape') else len(issue_all_embeddings[embed_type])})")
             else:
-                print(f"  Warning: {embed_type} issue embeddings not found at {path}. Skipping.")
-                issue_all_embeddings[embed_type] = None  # Ensure it's explicitly None if not found
         with open(issue_titles_path, encoding='utf-8') as file:
             issue_titles = json.load(file)
-        print(f"Issue data loaded: {len(issue_titles)} issues.")
-        # --- Load raw issue texts for strict search ---
-        # The issue text files are in 'small_scripts/make_embedding/NationStates-Issue-Megathread/002 - Issue Megalist (MAIN)'
-        issues_input_dir = os.path.join(script_dir, 'small_scripts', 'make_embedding',
-                                        'NationStates-Issue-Megathread', '002 - Issue Megalist (MAIN)')
-        issue_files_for_raw_load = []
-        file_pattern = re.compile(r'(\d+) TO (\d+)\.txt')
-        if os.path.isdir(issues_input_dir):
-            for filename in os.listdir(issues_input_dir):
-                if filename.endswith('.txt'):
-                    match = file_pattern.match(filename)
-                    if match:
-                        start_num = int(match.group(1))
-                        issue_files_for_raw_load.append((start_num, filename))
-            issue_files_for_raw_load.sort(key=lambda x: x[0])
-            issue_files_for_raw_load = [os.path.join(issues_input_dir, filename) for _, filename in issue_files_for_raw_load]
-            for filepath in issue_files_for_raw_load:
-                with open(filepath, 'r', encoding='utf-8') as file:
-                    issues_text_in_file = file.read()
-                    # Split issues by the separator and remove any empty strings resulting from multiple separators
-                    issues_list_in_file = [
-                        issue.strip() for issue in issues_text_in_file.split("[hr][/hr]") if issue.strip()
-                    ]
-                    all_issue_raw_texts.extend(issues_list_in_file)
-            print(f"  Loaded {len(all_issue_raw_texts)} raw issue texts for strict search.")
         else:
-            print(f"  Warning: Issue text directory '{issues_input_dir}' not found. Strict issue search will not work.")
 except FileNotFoundError as e:
     print(f"Error loading issue data: {e}")
-    print(
-        f"Please ensure embedding files and '{os.path.basename(issue_titles_path)}' are in the same directory as app.py")
 except Exception as e:
     print(f"Error loading issue data: {e}")
-# GA resolution data storage for all types
 ga_all_embeddings = {
     'semantic': None,
     'loose': None,
@@ -110,149 +146,198 @@ ga_resolutions_data = []
 print("Loading GA resolution data...")
 try:
     if model:  # Only attempt to load embeddings if model is available
-        # Load available embedding types for GA resolutions
         for embed_type, path in ga_embeddings_paths.items():
             if os.path.exists(path):
-                if embed_type == 'loose': # Only sparse is loaded as list of objects now
                     ga_all_embeddings[embed_type] = np.load(path, allow_pickle=True).tolist()
-                else: # Dense
                     ga_all_embeddings[embed_type] = np.load(path)
-                print(
-                    f"  Loaded {embed_type} GA embeddings from {path} (Shape: {ga_all_embeddings[embed_type].shape if hasattr(ga_all_embeddings[embed_type], 'shape') else len(ga_all_embeddings[embed_type])})")
             else:
-                print(f"  Warning: {embed_type} GA embeddings not found at {path}. Skipping.")
-                ga_all_embeddings[embed_type] = None  # Ensure it's explicitly None if not found
-        with open(ga_resolutions_path, encoding='utf-8') as file:
-            ga_resolutions_data = json.load(file)  # List of dictionaries
-        print(f"GA resolution data loaded: {len(ga_resolutions_data)} resolutions.")
 except FileNotFoundError as e:
     print(f"Error loading GA resolution data: {e}")
-    print(
-        f"Please ensure GA embedding files and '{os.path.basename(ga_resolutions_path)}' are in the same directory as app.py")
 except Exception as e:
     print(f"Error loading GA resolution data: {e}")
-# --- Search Functions ---
-def _perform_search(search_term: str, corpus_embeddings_dict: dict, search_type: str):
-    """
-    Helper function to perform an embedding-based search given the search term, corpus embeddings, and search type.
-    Returns sorted list of (index, similarity_score).
-    """
-    if not model:
-        raise ValueError("Model failed to load. Cannot perform search.")
-    if not search_term:
-        raise ValueError("Please enter a search term.")
-    corpus_embeddings = corpus_embeddings_dict.get(search_type)
-    if corpus_embeddings is None:
-        raise ValueError(f"Corpus data for search type '{search_type}' not loaded. Cannot perform search.")
-    # Encode the search term for relevant types
-    query_embeddings = model.encode([search_term],
-                                    return_dense=True,
-                                    return_sparse=True,
-                                    return_colbert_vecs=False)
-    similarity_scores = []
-    if search_type == 'semantic': # Renamed from 'fuzzy'
-        query_vec = query_embeddings['dense_vecs']  # Shape: (1, embedding_dim)
-        # Perform dot product for dense similarity
-        similarity_scores = (query_vec @ corpus_embeddings.T)[0]  # Result shape: (num_docs,)
-    elif search_type == 'loose': # Renamed from 'direct'
-        # 'lexical_weights' is a list of dictionaries, even for a single query.
-        # We need the first (and only) dictionary from this list.
-        if 'lexical_weights' not in query_embeddings or not query_embeddings['lexical_weights']:
-            raise ValueError("Lexical weights (sparse) not returned for query. Model or configuration issue.")
-        query_sparse_dict = query_embeddings['lexical_weights'][0]
-        # Iterate through each document's sparse dictionary and compute score
-        for doc_sparse_dict in corpus_embeddings:  # corpus_embeddings is a list of sparse dictionaries
-            score = model.compute_lexical_matching_score(query_sparse_dict, doc_sparse_dict)
-            similarity_scores.append(score)
-        similarity_scores = np.array(similarity_scores)  # Convert to numpy array
-    else:
-        # This function should only be called for embedding-based searches
-        raise ValueError(f"Unsupported embedding search type: {search_type}")
-    # Pair index with similarity score
-    indexed_similarities = [(i, score) for i, score in enumerate(similarity_scores)]
-    # Sort by similarity score in descending order
-    sorted_similarities = sorted(indexed_similarities, key=lambda item: item[1], reverse=True)
-    return sorted_similarities
 def _extract_context(text: str, query: str):
-    """Extracts the first line containing the query and highlights all mentions of it."""
     text_lines = text.split('\n')
     query_lower = query.lower()
     for line in text_lines:
         if query_lower in line.lower():
-            # Found the first line containing the query
-            # Highlight all occurrences of the query in this line
             highlighted_line = re.sub(re.escape(query), lambda m: f"**{m.group(0)}**", line, flags=re.IGNORECASE)
             return f'> {highlighted_line}'
-    return "" # Should not be reached if strict search already found a match
-def get_issue_similarity_rankings(search_term: str, search_type: str = 'semantic'): # Renamed default
-    """Searches issues and returns formatted results."""
     try:
-        if not search_term:
             return "Please enter a search term."
-        if search_type == 'strict':
             if not all_issue_raw_texts:
                 return "Raw issue texts not loaded. Strict search is unavailable."
             strict_matches = []
-            search_term_lower = search_term.lower()
             for i, issue_text in enumerate(all_issue_raw_texts):
-                if search_term_lower in issue_text.lower():
-                    strict_matches.append((i, 1.0)) # Use 1.0 as a dummy score for strict matches
-            similarity_text = f"# Top 20 Issue Search Results (Strict)\n"
             if not strict_matches:
-                return similarity_text + "No exact matches found."
-            search_ranking = 1
-            for index, sim_score in strict_matches[:20]: # Still limit to top 20
                 issue_title = issue_titles.get(str(index), f"Unknown Issue (Index {index})")
-                context = _extract_context(all_issue_raw_texts[index], search_term)
-                similarity_text += f"{search_ranking}. {issue_title}\n{context}\n\n"
-                search_ranking += 1
-            return similarity_text
-        else: # Embedding-based search
-            sorted_similarities = _perform_search(search_term, issue_all_embeddings, search_type)
-            similarity_text = f"# Top 20 Issue Search Results ({search_type.capitalize()})\n"
-            if not sorted_similarities:
-                return similarity_text + "No issues found."
-            search_ranking = 1
-            for index, sim_score in sorted_similarities[:20]:
-                # issue_titles is a dict, needs string key
-                issue_title = issue_titles.get(str(index), f"Unknown Issue (Index {index})")
-                similarity_text += f"{search_ranking}. {issue_title}, Similarity: {sim_score:.4f}\n"
-                search_ranking += 1
-            return similarity_text
     except Exception as e:
         return f"An error occurred during issue search: {e}"
 def search_ga_resolutions(search_term: str, hide_repealed: bool, hide_repeal_category: bool,
-                          search_type: str = 'semantic'): # Renamed default
-    """
-    Searches GA resolutions, filters repealed and/or repeal category if requested,
-    and returns formatted results with links and status.
-    """
     try:
         if not search_term:
             return "Please enter a search term."
@@ -260,31 +345,28 @@ def search_ga_resolutions(search_term: str, hide_repealed: bool, hide_repeal_cat
         if search_type == 'strict':
             if not ga_resolutions_data:
                 return "GA resolution data not loaded. Strict search is unavailable."
             strict_matches = []
-            search_term_lower = search_term.lower()
             for i, resolution in enumerate(ga_resolutions_data):
-                resolution_body = resolution.get('body', '')
-                if search_term_lower in resolution_body.lower():
-                    # Apply filters immediately for strict search
                     status = resolution.get('status')
                     category = resolution.get('category')
                     if hide_repealed and status == "Repealed":
                         continue
                     if hide_repeal_category and category == "Repeal":
                         continue
-                    strict_matches.append((i, 1.0)) # Dummy score
-            similarity_text = f"# Top 20 GA Resolution Search Results (Strict)\n"
             if not strict_matches:
                 status_msgs = []
                 if hide_repealed: status_msgs.append("Repealed")
                 if hide_repeal_category: status_msgs.append("Repeal Category")
                 filter_msg = " (Filtered out " + " and ".join(status_msgs) + ")" if status_msgs else ""
-                return similarity_text + f"No exact matches found{filter_msg}."
-            search_ranking = 1
-            for index, sim_score in strict_matches[:20]:
                 resolution = ga_resolutions_data[index]
                 title = resolution.get('title', 'Untitled Resolution')
                 res_id = resolution.get('id', 'N/A')
@@ -292,135 +374,104 @@ def search_ga_resolutions(search_term: str, hide_repealed: bool, hide_repeal_cat
                 status = resolution.get('status')
                 status_marker = "[REPEALED] " if status == "Repealed" else ""
                 url = f"https://www.nationstates.net/page=WA_past_resolution/id={res_id}/council={council}"
                 context = _extract_context(resolution.get('body', ''), search_term)
-                similarity_text += f"{search_ranking}. {status_marker}[#{res_id} {title}]({url}), Match: {sim_score:.4f}\n{context}\n"
-                search_ranking += 1
-            return similarity_text
-        else: # Embedding-based search
-            raw_sorted_similarities = _perform_search(search_term, ga_all_embeddings, search_type)
-            # --- Filtering ---
-            filtered_indexed_similarities = []
-            for index, score in raw_sorted_similarities:
-                # Ensure index is valid
-                if index < len(ga_resolutions_data):
-                    resolution = ga_resolutions_data[index]
-                    status = resolution.get('status')
-                    category = resolution.get('category')
-                    # Apply filters
-                    if hide_repealed and status == "Repealed":
-                        continue
-                    if hide_repeal_category and category == "Repeal":
-                        continue
-                    filtered_indexed_similarities.append((index, score))
-            # The list is already sorted, no re-sort needed after filtering.
-            # --- Formatting Results ---
-            similarity_text = f"# Top 20 GA Resolution Search Results ({search_type.capitalize()})\n"
-            if not filtered_indexed_similarities:
-                status_msgs = []
-                if hide_repealed: status_msgs.append("Repealed")
-                if hide_repeal_category: status_msgs.append("Repeal Category")
-                filter_msg = " (Filtered out " + " and ".join(status_msgs) + ")" if status_msgs else ""
-                return similarity_text + f"No matching resolutions found{filter_msg}."
-            search_ranking = 1
-            # Get top 20 results from the sorted and filtered list
-            for index, sim_score in filtered_indexed_similarities[:20]:
-                resolution = ga_resolutions_data[index]
-                title = resolution.get('title', 'Untitled Resolution')
-                res_id = resolution.get('id', 'N/A')
-                council = resolution.get('council', 1)
-                status = resolution.get('status')
-                # Add [REPEALED] marker if the status is "Repealed"
-                status_marker = "[REPEALED] " if status == "Repealed" else ""
-                # Construct the NationStates URL
-                url = f"https://www.nationstates.net/page=WA_past_resolution/id={res_id}/council={council}"
-                # Format as Markdown link with the status marker
-                similarity_text += f"{search_ranking}. {status_marker}[#{res_id} {title}]({url}), Similarity: {sim_score:.4f}\n"
-                search_ranking += 1
-            return similarity_text
     except Exception as e:
         return f"An error occurred during GA resolution search: {e}"
 # --- Gradio Interface ---
-"""
-For information on how to customize the Gradio Blocks and Tabs, peruse the gradio docs:
-https://www.gradio.app/docs/blocks
-https://www.gradio.app/docs/tabs
-https://www.gradio.app/docs/interface (used within tabs)
-"""
 with gr.Blocks() as demo:
     gr.Markdown("""
     # NationStates Semantic Search
-    Search through NationStates issues/GA resolutions using semantic search.
-    Search time depends on how long your query is. For single words or sentences, expect an answer in less than 5 seconds. For long paragraphs/blocks of text, it might take up to a minute for the AI search engine to finish.
     """)
     with gr.Tabs() as tabs:
         with gr.TabItem("Issue Search"):
-            gr.Markdown(f"""
             ### Search NationStates Issues
-            Search through first {len(issue_titles)} issues. This uses semantic search, which finds related concepts/ideas, not as good with exact keywords. Feel free to try words, sentences, or paragraphs!
             """)
             issue_search_interface = gr.Interface(
-                fn=get_issue_similarity_rankings,
                 inputs=[
-                    gr.Textbox(label="Search term", placeholder="What issue are you looking for?"),
-                    gr.Radio(["semantic", "loose", "strict"], label="Search Type", value="semantic", # Updated options
-                             info="Choose search type: 'semantic' for conceptual similarity, 'loose' for keyword matching, 'strict' for exact phrase matching.") # Updated info
                 ],
                 outputs=gr.Markdown(),
                 examples=[
-                    # Examples for Issue Search (search_term, search_type)
-                    ["coffee", "semantic"],
-                    ["land value tax", "loose"],
-                    ["Elon Musk", "loose"],
-                    ["After an corrupted election, citizens demand the real results, after discovering it was rigged. ",
-                     "semantic"],
-                    [
-                        "Eureka! A new scientific law regarding the universe's expansion may have just been discovered at the University of @@CAPITAL@@. Unfortunately, tempers are flaring over who should get naming credit. Maxtopian grad student Georgie Bubble claims the work alone while his boss Dr.@@RANDOMNAME1@@ claims that all work in the University is @@NAME@@’s collectively. Your Minister of Education has elevated this to your desk.",
-                        "semantic"],
-                    ["tax", "strict"], # New example for strict
-                    ["environmental protection", "strict"] # New example for strict
                 ],
                 title=None,
                 description=None,
                 submit_btn="Search Issues",
-                article="Made by [Jiangbei](www.nationstates.net/nation=jiangbei). Issue data from [Valentine Z](https://www.nationstates.net/nation=valentine_z). Powered by [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3)."
             )
         with gr.TabItem("GA Resolution Search"):
-            gr.Markdown(f"""
-             ### Search NationStates General Assembly Resolutions
-             Search through first {len(ga_resolutions_data)} General Assembly resolutions. This uses semantic search, which finds related concepts/ideas, not as good with exact keywords. Feel free to try words, sentences, or paragraphs!
-             """)
-            # Define inputs for the GA search interface
             ga_search_term_input = gr.Textbox(label="Search term", placeholder="What are you looking for?")
             ga_hide_repealed_checkbox = gr.Checkbox(value=True, label="Hide repealed resolutions")
             ga_hide_repeal_category_checkbox = gr.Checkbox(value=True, label="Hide repeals")
-            ga_search_type_radio = gr.Radio(["semantic", "loose", "strict"], label="Search Type", value="semantic", # Updated options
-                                            info="Choose search type: 'semantic' for conceptual similarity, 'loose' for keyword matching, 'strict' for exact phrase matching.") # Updated info
             ga_search_interface = gr.Interface(
                 fn=search_ga_resolutions,
-                # Pass inputs in the order expected by the function
                 inputs=[
                     ga_search_term_input,
                     ga_hide_repealed_checkbox,
@@ -429,23 +480,20 @@ with gr.Blocks() as demo:
                 ],
                 outputs=gr.Markdown(),
                 examples=[
-                    # Examples for GA Resolution Search (search_term, hide_repealed, hide_repeal_category, search_type)
                     ["condemn genocide", True, True, "semantic"],
                     ["rights of animals", True, True, "loose"],
                     ["regulating space mining", True, True, "semantic"],
                     ["founding of the World Assembly", True, True, "semantic"],
                     ["environmental protection", True, True, "semantic"],
-                    ["human rights", True, True, "strict"], # New example for strict
-                    ["World Assembly", True, True, "strict"] # New example for strict
                 ],
                 title=None,
                 description=None,
                 submit_btn="Search Resolutions",
-                article="Made by [Jiangbei](www.nationstates.net/nation=jiangbei). GA Resolution data parsed from NationStates. Powered by [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3)."
             )
 # --- Launch App ---
 if __name__ == "__main__":
-    # Set share=True to make the app accessible externally (requires ngrok)
-    # share=False is default and runs locally
     demo.launch()

 import numpy as np
 import json
 import os
+import re
 # --- Configuration and Global Data Loading ---
 # Determine the directory of the script to load files relative to it
 script_dir = os.path.dirname(os.path.abspath(__file__))
+# Original issue-level artifacts (kept for sparse/loose and strict)
 issue_embeddings_paths = {
+    # We will still attempt to load original dense (semantic) if present,
+    # but semantic search will use component-level embeddings. This is optional.
+    'semantic': os.path.join(script_dir, 'ns_issues_semantic_bge-m3.npy'),
+    'loose': os.path.join(script_dir, 'ns_issues_loose_bge-m3.npy'),
 }
 issue_titles_path = os.path.join(script_dir, 'issue_titles.json')
+# Component-level artifacts (used for semantic only)
+issue_components_paths = {
+    'semantic': os.path.join(script_dir, 'ns_issue_components_semantic_bge-m3.npy'),
+    # There is intentionally no component-level 'loose' entry: loose search stays issue-level.
+}
+issue_components_meta_path = os.path.join(script_dir, 'ns_issue_components_meta.json')
+issue_titles_components_path = os.path.join(script_dir, 'issue_titles_components.json')
+# GA resolution artifacts (unchanged)
 ga_embeddings_paths = {
+    'semantic': os.path.join(script_dir, 'ns_ga_resolutions_semantic_bge-m3.npy'),
+    'loose': os.path.join(script_dir, 'ns_ga_resolutions_loose_bge-m3.npy'),
 }
 ga_resolutions_path = os.path.join(script_dir, 'parsed_ga_resolutions.json')
 print("Loading BGE-M3 model...")
 try:
     model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
     print("Model loaded successfully.")
 except Exception as e:
     print("Please ensure you have an internet connection or the model is cached locally.")
     model = None  # Indicate model loading failed
+# Issue data storage (issue-level and component-level)
 issue_all_embeddings = {
+    'semantic': None,  # optional legacy dense; not used for semantic queries in this app
+    'loose': None,     # issue-level sparse, used for loose search
 }
 issue_titles = {}
+all_issue_raw_texts = []  # For strict search (issue-level)
+issue_components_embeddings = {
+    'semantic': None,  # dense component-level embedding matrix
+}
+issue_components_meta = []  # list of dicts aligned to component rows
+issue_titles_components = {}
 print("Loading issue data...")
 try:
+    # Load issue-level embeddings (kept for sparse/loose and optional legacy dense)
+    for embed_type, path in issue_embeddings_paths.items():
+        if os.path.exists(path):
+            if embed_type == 'loose':
+                issue_all_embeddings[embed_type] = np.load(path, allow_pickle=True).tolist()
             else:
+                issue_all_embeddings[embed_type] = np.load(path)
+            shape_or_len = issue_all_embeddings[embed_type].shape if hasattr(issue_all_embeddings[embed_type], 'shape') else len(issue_all_embeddings[embed_type])
+            print(f"  Loaded {embed_type} issue embeddings from {path} (Shape/Len: {shape_or_len})")
+        else:
+            print(f"  Warning: {embed_type} issue embeddings not found at {path}.")
+            issue_all_embeddings[embed_type] = None
+    # Load titles (issue-level)
+    if os.path.exists(issue_titles_path):
         with open(issue_titles_path, encoding='utf-8') as file:
             issue_titles = json.load(file)
+        print(f"Issue titles loaded: {len(issue_titles)} issues.")
+    else:
+        print(f"  Warning: issue_titles.json not found at {issue_titles_path}")
+    # Load raw issue texts for strict search
+    issues_input_dir = os.path.join(script_dir, 'small_scripts', 'make_embedding',
+                                    'NationStates-Issue-Megathread', '002 - Issue Megalist (MAIN)')
+    issue_files_for_raw_load = []
+    file_pattern = re.compile(r'(\d+) TO (\d+)\.txt')
+    if os.path.isdir(issues_input_dir):
+        for filename in os.listdir(issues_input_dir):
+            if filename.endswith('.txt'):
+                match = file_pattern.match(filename)
+                if match:
+                    start_num = int(match.group(1))
+                    issue_files_for_raw_load.append((start_num, filename))
+        issue_files_for_raw_load.sort(key=lambda x: x[0])
+        issue_files_for_raw_load = [os.path.join(issues_input_dir, filename) for _, filename in issue_files_for_raw_load]
+        for filepath in issue_files_for_raw_load:
+            with open(filepath, 'r', encoding='utf-8') as file:
+                issues_text_in_file = file.read()
+                issues_list_in_file = [
+                    issue.strip() for issue in issues_text_in_file.split("[hr][/hr]") if issue.strip()
+                ]
+                all_issue_raw_texts.extend(issues_list_in_file)
+        print(f"  Loaded {len(all_issue_raw_texts)} raw issue texts for strict search.")
+    else:
+        print(f"  Warning: Issue text directory '{issues_input_dir}' not found. Strict issue search will not work.")
+    # Load component-level artifacts (semantic only)
+    for embed_type, path in issue_components_paths.items():
+        if os.path.exists(path):
+            issue_components_embeddings[embed_type] = np.load(path)
+            print(f"  Loaded component {embed_type} embeddings from {path} (Shape: {issue_components_embeddings[embed_type].shape})")
         else:
+            print(f"  Warning: component {embed_type} embeddings not found at {path}.")
+    if os.path.exists(issue_components_meta_path):
+        with open(issue_components_meta_path, encoding='utf-8') as f:
+            issue_components_meta = json.load(f)
+        print(f"  Loaded component meta: {len(issue_components_meta)} items.")
+    else:
+        print(f"  Warning: component meta not found at {issue_components_meta_path}.")
+    if os.path.exists(issue_titles_components_path):
+        with open(issue_titles_components_path, encoding='utf-8') as f:
+            issue_titles_components = json.load(f)
+        print(f"  Loaded component issue titles: {len(issue_titles_components)}")
+    else:
+        # Fallback to issue-level titles if component titles not present
+        issue_titles_components = issue_titles
 except FileNotFoundError as e:
     print(f"Error loading issue data: {e}")
+    print(f"Please ensure embedding files and '{os.path.basename(issue_titles_path)}' are in the same directory as app.py")
 except Exception as e:
     print(f"Error loading issue data: {e}")
+# GA resolution data storage (unchanged)
 ga_all_embeddings = {
     'semantic': None,
     'loose': None,
 print("Loading GA resolution data...")
 try:
     if model:  # Only attempt to load embeddings if model is available
         for embed_type, path in ga_embeddings_paths.items():
             if os.path.exists(path):
+                if embed_type == 'loose':
                     ga_all_embeddings[embed_type] = np.load(path, allow_pickle=True).tolist()
+                else:
                     ga_all_embeddings[embed_type] = np.load(path)
+                shape_or_len = ga_all_embeddings[embed_type].shape if hasattr(ga_all_embeddings[embed_type], 'shape') else len(ga_all_embeddings[embed_type])
+                print(f"  Loaded {embed_type} GA embeddings from {path} (Shape/Len: {shape_or_len})")
             else:
+                print(f"  Warning: {embed_type} GA embeddings not found at {path}.")
+                ga_all_embeddings[embed_type] = None
+        if os.path.exists(ga_resolutions_path):
+            with open(ga_resolutions_path, encoding='utf-8') as file:
+                ga_resolutions_data = json.load(file)
+            print(f"GA resolution data loaded: {len(ga_resolutions_data)} resolutions.")
+        else:
+            print(f"  Warning: GA data file not found at {ga_resolutions_path}")
 except FileNotFoundError as e:
     print(f"Error loading GA resolution data: {e}")
+    print(f"Please ensure GA embedding files and '{os.path.basename(ga_resolutions_path)}' are in the same directory as app.py")
 except Exception as e:
     print(f"Error loading GA resolution data: {e}")
+# --- Search Utilities ---
 def _extract_context(text: str, query: str):
+    """Extracts the first line containing the query and highlights all mentions of it (case-insensitive)."""
     text_lines = text.split('\n')
     query_lower = query.lower()
     for line in text_lines:
         if query_lower in line.lower():
             highlighted_line = re.sub(re.escape(query), lambda m: f"**{m.group(0)}**", line, flags=re.IGNORECASE)
             return f'> {highlighted_line}'
+    return ""
+# --- Issue Search (Component-level semantic, Issue-level loose/strict) ---
def _search_issues_semantic(query: str, scope: str) -> str:
    """Rank issue components by dense similarity to ``query`` and format the top 20."""
    corpus = issue_components_embeddings.get('semantic')
    if corpus is None or not len(issue_components_meta):
        return "Component-level semantic embeddings or metadata not loaded. Cannot run semantic search."
    # Only the dense vector is used below, so sparse weights are not requested.
    query_embeddings = model.encode([query],
                                    return_dense=True,
                                    return_sparse=False,
                                    return_colbert_vecs=False)
    q = query_embeddings['dense_vecs']  # shape (1, d)
    scores = (q @ corpus.T)[0]  # shape (N_components,)
    # Scope filter: None means "both", i.e. keep every component type.
    wanted_type = {'descriptions': 'desc', 'options': 'option'}.get(scope)
    filtered = [
        (i, s) for i, s in enumerate(scores)
        if wanted_type is None or issue_components_meta[i].get('component_type') == wanted_type
    ]
    filtered.sort(key=lambda x: x[1], reverse=True)
    out = [f"# Top 20 Issue Results (Semantic, scope={scope})"]
    if not filtered:
        out.append("No matches found.")
        return "\n".join(out)
    for rank, (idx, score) in enumerate(filtered[:20], start=1):
        meta = issue_components_meta[idx]
        issue_idx = meta['issue_index']
        title = issue_titles_components.get(str(issue_idx), f"Issue {issue_idx}")
        if meta['component_type'] == 'desc':
            label = f"{title} — Description"
        else:
            # .get(): tolerate metadata entries that carry no option index.
            label = f"{title} — Option {meta.get('option_index')}"
        out.append(f"{rank}. {label}, Similarity: {score:.4f}")
    return "\n".join(out)


def _search_issues_loose(query: str) -> str:
    """Rank whole issues by sparse lexical matching score and format the top 20."""
    corpus_sparse = issue_all_embeddings.get('loose')
    if corpus_sparse is None:
        return "Issue-level sparse embeddings not loaded. Cannot run loose search."
    query_embeddings = model.encode([query],
                                    return_dense=True,
                                    return_sparse=True,
                                    return_colbert_vecs=False)
    if 'lexical_weights' not in query_embeddings or not query_embeddings['lexical_weights']:
        return "Sparse query failed (no lexical weights)."
    q_sparse = query_embeddings['lexical_weights'][0]
    scores = [model.compute_lexical_matching_score(q_sparse, d) for d in corpus_sparse]
    indexed = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
    out = ["# Top 20 Issue Results (Loose keyword, scope ignored)"]
    if not indexed:
        out.append("No matches found.")
        return "\n".join(out)
    for rank, (idx, score) in enumerate(indexed[:20], start=1):
        title = issue_titles.get(str(idx), f"Unknown Issue (Index {idx})")
        out.append(f"{rank}. {title}, Similarity: {score:.4f}")
    return "\n".join(out)


def _search_issues_strict(query: str) -> str:
    """List the first 20 issues whose raw text contains ``query`` (case-insensitive)."""
    if not all_issue_raw_texts:
        return "Raw issue texts not loaded. Strict search is unavailable."
    ql = query.lower()
    strict_matches = [i for i, issue_text in enumerate(all_issue_raw_texts) if ql in issue_text.lower()]
    out = ["# Top 20 Issue Search Results (Strict exact/substring)"]
    if not strict_matches:
        out.append("No exact matches found.")
        return "\n".join(out)
    for rank, index in enumerate(strict_matches[:20], start=1):
        issue_title = issue_titles.get(str(index), f"Unknown Issue (Index {index})")
        context = _extract_context(all_issue_raw_texts[index], query)
        out.append(f"{rank}. {issue_title}\n{context}\n")
    return "\n".join(out)


def search_issues(query: str, search_type: str = 'semantic', scope: str = 'both'):
    """
    Issue search dispatcher:
    - semantic: component-level dense with scope (descriptions | options | both).
    - loose: issue-level sparse (scope is ignored).
    - strict: issue-level exact/substring match over raw texts (scope is ignored).

    Returns a Markdown string: the ranked results, or a human-readable message
    when the search cannot run. Exceptions are caught and reported as text so
    the UI always receives something to render.
    """
    try:
        if not model:
            return "Model failed to load. Cannot perform search."
        # Whitespace-only input carries no search intent; the query itself is
        # passed on unstripped so deliberate padding still works in strict mode.
        if not query or not query.strip():
            return "Please enter a search term."
        if search_type == 'semantic':
            return _search_issues_semantic(query, scope)
        if search_type == 'loose':
            return _search_issues_loose(query)
        if search_type == 'strict':
            return _search_issues_strict(query)
        return f"Unsupported search type: {search_type}"
    except Exception as e:
        return f"An error occurred during issue search: {e}"
+# --- GA Resolution Search (unchanged logic) ---
def _perform_search_ga(search_term: str, corpus_embeddings_dict: dict, search_type: str):
    """Score every corpus entry against ``search_term`` and return them best-first.

    Args:
        search_term: The user's query text.
        corpus_embeddings_dict: Mapping of search type to the loaded corpus
            (or ``None`` when that corpus is unavailable).
        search_type: ``'semantic'`` (dense dot product) or ``'loose'``
            (sparse lexical matching).

    Returns:
        A list of ``(corpus_index, score)`` tuples sorted by descending score.

    Raises:
        ValueError: If the model is missing, the query is empty, the search
            type is unsupported, the corpus is not loaded, or the model
            returned no lexical weights for a loose search.
    """
    if not model:
        raise ValueError("Model failed to load. Cannot perform search.")
    if not search_term:
        raise ValueError("Please enter a search term.")
    # Validate the type first: otherwise an unknown type is reported as a
    # missing corpus and the query is never checked against the right message.
    if search_type not in ('semantic', 'loose'):
        raise ValueError(f"Unsupported embedding search type: {search_type}")
    corpus_embeddings = corpus_embeddings_dict.get(search_type)
    if corpus_embeddings is None:
        raise ValueError(f"Corpus data for search type '{search_type}' not loaded. Cannot perform search.")
    wants_sparse = search_type == 'loose'
    # Sparse weights are only computed when the loose path will use them.
    query_embeddings = model.encode([search_term],
                                    return_dense=True,
                                    return_sparse=wants_sparse,
                                    return_colbert_vecs=False)
    if wants_sparse:
        if 'lexical_weights' not in query_embeddings or not query_embeddings['lexical_weights']:
            raise ValueError("Lexical weights (sparse) not returned for query. Model or configuration issue.")
        query_sparse_dict = query_embeddings['lexical_weights'][0]
        similarity_scores = np.array([
            model.compute_lexical_matching_score(query_sparse_dict, doc_sparse_dict)
            for doc_sparse_dict in corpus_embeddings
        ])
    else:
        query_vec = query_embeddings['dense_vecs']  # Shape: (1, embedding_dim)
        similarity_scores = (query_vec @ corpus_embeddings.T)[0]
    return sorted(enumerate(similarity_scores), key=lambda item: item[1], reverse=True)
 def search_ga_resolutions(search_term: str, hide_repealed: bool, hide_repeal_category: bool,
+                          search_type: str = 'semantic'):
     try:
         if not search_term:
             return "Please enter a search term."
         if search_type == 'strict':
             if not ga_resolutions_data:
                 return "GA resolution data not loaded. Strict search is unavailable."
             strict_matches = []
+            ql = search_term.lower()
             for i, resolution in enumerate(ga_resolutions_data):
+                body = resolution.get('body', '')
+                if ql in body.lower():
                     status = resolution.get('status')
                     category = resolution.get('category')
                     if hide_repealed and status == "Repealed":
                         continue
                     if hide_repeal_category and category == "Repeal":
                         continue
+                    strict_matches.append(i)
+            out = [f"# Top 20 GA Resolution Search Results (Strict)"]
             if not strict_matches:
                 status_msgs = []
                 if hide_repealed: status_msgs.append("Repealed")
                 if hide_repeal_category: status_msgs.append("Repeal Category")
                 filter_msg = " (Filtered out " + " and ".join(status_msgs) + ")" if status_msgs else ""
+                return "\n".join(out + [f"No exact matches found{filter_msg}."])
+            for rank, index in enumerate(strict_matches[:20], start=1):
                 resolution = ga_resolutions_data[index]
                 title = resolution.get('title', 'Untitled Resolution')
                 res_id = resolution.get('id', 'N/A')
                 status = resolution.get('status')
                 status_marker = "[REPEALED] " if status == "Repealed" else ""
                 url = f"https://www.nationstates.net/page=WA_past_resolution/id={res_id}/council={council}"
                 context = _extract_context(resolution.get('body', ''), search_term)
+                out.append(f"{rank}. {status_marker}[#{res_id} {title}]({url}), Match: 1.0000\n{context}\n")
+            return "\n".join(out)
+        # Embedding-based GA search
+        raw_sorted = _perform_search_ga(search_term, ga_all_embeddings, search_type)
+        # Filter by status/category
+        filtered = []
+        for index, score in raw_sorted:
+            if index >= len(ga_resolutions_data):
+                continue
+            resolution = ga_resolutions_data[index]
+            status = resolution.get('status')
+            category = resolution.get('category')
+            if hide_repealed and status == "Repealed":
+                continue
+            if hide_repeal_category and category == "Repeal":
+                continue
+            filtered.append((index, score))
+        out = [f"# Top 20 GA Resolution Search Results ({search_type.capitalize()})"]
+        if not filtered:
+            status_msgs = []
+            if hide_repealed: status_msgs.append("Repealed")
+            if hide_repeal_category: status_msgs.append("Repeal Category")
+            filter_msg = " (Filtered out " + " and ".join(status_msgs) + ")" if status_msgs else ""
+            return "\n".join(out + [f"No matching resolutions found{filter_msg}."])
+        for rank, (index, score) in enumerate(filtered[:20], start=1):
+            resolution = ga_resolutions_data[index]
+            title = resolution.get('title', 'Untitled Resolution')
+            res_id = resolution.get('id', 'N/A')
+            council = resolution.get('council', 1)
+            status = resolution.get('status')
+            status_marker = "[REPEALED] " if status == "Repealed" else ""
+            url = f"https://www.nationstates.net/page=WA_past_resolution/id={res_id}/council={council}"
+            out.append(f"{rank}. {status_marker}[#{res_id} {title}]({url}), Similarity: {score:.4f}")
+        return "\n".join(out)
     except Exception as e:
         return f"An error occurred during GA resolution search: {e}"
 # --- Gradio Interface ---
 with gr.Blocks() as demo:
     gr.Markdown("""
     # NationStates Semantic Search
+    Search NationStates issues and GA resolutions. Choose semantic for conceptual similarity, loose for keyword matching, and strict for exact substring queries.
     """)
     with gr.Tabs() as tabs:
+        # Issue Search Tab
         with gr.TabItem("Issue Search"):
+            gr.Markdown("""
             ### Search NationStates Issues
+            - Semantic: component-level (descriptions and/or options), honors Scope.
+            - Loose: issue-level keywords (Scope is ignored).
+            - Strict: issue-level exact/substring (Scope is ignored).
             """)
             issue_search_interface = gr.Interface(
+                fn=search_issues,
                 inputs=[
+                    gr.Textbox(label="Search term", placeholder="What issue or option are you looking for?"),
+                    gr.Radio(["semantic", "loose", "strict"], label="Search Type", value="semantic",
+                             info="semantic: conceptual (component-level); loose: keyword (issue-level); strict: exact substring (issue-level)"),
+                    gr.Radio(["both", "descriptions", "options"], label="Scope (semantic only)", value="both",
+                             info="Only applies to semantic search; ignored for loose and strict.")
                 ],
                 outputs=gr.Markdown(),
                 examples=[
+                    ["coffee", "semantic", "both"],
+                    ["land value tax", "semantic", "descriptions"],
+                    ["chainsaw maniacs", "semantic", "options"],
+                    ["Elon Musk", "loose", "both"],
+                    ["environmental protection", "strict", "both"]
                 ],
                 title=None,
                 description=None,
                 submit_btn="Search Issues",
+                article="Made by [Jiangbei](www.nationstates.net/nation=jiangbei). Issues powered by component-level semantic (BAAI/bge-m3) and issue-level sparse keywords."
             )
+        # GA Resolution Search Tab
         with gr.TabItem("GA Resolution Search"):
+            gr.Markdown("""
+            ### Search NationStates General Assembly Resolutions
+            Use semantic for concepts, loose for keyword matching, or strict for exact substring.
+            """)
             ga_search_term_input = gr.Textbox(label="Search term", placeholder="What are you looking for?")
             ga_hide_repealed_checkbox = gr.Checkbox(value=True, label="Hide repealed resolutions")
             ga_hide_repeal_category_checkbox = gr.Checkbox(value=True, label="Hide repeals")
+            ga_search_type_radio = gr.Radio(["semantic", "loose", "strict"], label="Search Type", value="semantic",
+                                            info="semantic: conceptual similarity; loose: keyword matching; strict: exact substring")
             ga_search_interface = gr.Interface(
                 fn=search_ga_resolutions,
                 inputs=[
                     ga_search_term_input,
                     ga_hide_repealed_checkbox,
                 ],
                 outputs=gr.Markdown(),
                 examples=[
                     ["condemn genocide", True, True, "semantic"],
                     ["rights of animals", True, True, "loose"],
                     ["regulating space mining", True, True, "semantic"],
                     ["founding of the World Assembly", True, True, "semantic"],
                     ["environmental protection", True, True, "semantic"],
+                    ["human rights", True, True, "strict"],
+                    ["World Assembly", True, True, "strict"]
                 ],
                 title=None,
                 description=None,
                 submit_btn="Search Resolutions",
+                article="Made by [Jiangbei](www.nationstates.net/nation=jiangbei). GA data parsed from NationStates. Powered by BAAI/bge-m3."
             )
 # --- Launch App ---
 if __name__ == "__main__":
     demo.launch()

issue_titles.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

issue_titles_components.json ADDED Viewed

The diff for this file is too large to render. See raw diff

ns_issue_components_meta.json ADDED Viewed

The diff for this file is too large to render. See raw diff

ns_issue_components_semantic_bge-m3.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1f5128a11cd81849b9eafd4f312e323a84edacf88177f9cfd28ae0c2a589232b
+size 16728192

ns_issues_loose_bge-m3.npy CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:99d43b309e690e846e24ac5ec3b4406f53842f132df6fa3bb11659494cfd3772
-size 8418649

 version https://git-lfs.github.com/spec/v1
+oid sha256:1921105894133a81c5a79d60fb9670b48f8dc14d43f14cc02cf9a5405e7ed312
+size 8416495

small_scripts/make_embedding/embedding.py CHANGED Viewed

@@ -1,230 +1,340 @@
 import os
 import re
 import numpy as np
 from FlagEmbedding import BGEM3FlagModel
-# --- Configuration ---
-# IMPORTANT: Adjust MODEL_PATH to your model's actual local path.
 MODEL_PATH = '../../../../Downloads/bge-m3'
-# Output directory for the final consolidated .npy files.
-# If this script is in 'project_root/scripts/', and app.py is in 'project_root/',
-# then '../' would be appropriate here. If both are in the same directory, use '.'
 OUTPUT_DIR = '../../'
-# Temporary cache directory for per-file embeddings (relative to script location)
 CACHE_DIR = './.issue_embeddings_cache'
-# --- Embedding Generation Control ---
-# Set to True to re-embed all files regardless of cached files.
-# If False, existing cached files will be skipped unless they are in CHANGED_FILES.
 RE_EMBED_ALL = False
-# List of specific filenames (e.g., '0000 TO 0025.txt') to re-embed.
-# Only effective if RE_EMBED_ALL is False.
-CHANGED_FILES = []  # e.g., ['0000 TO 0025.txt', '0026 TO 0050.txt']
-# --- Helper Functions ---
 def get_issue_files(directory="."):
-    """Gets and sorts issue files by their starting number from the filename pattern."""
     issue_files = []
-    # Regex to extract the first number from filenames like "0000 TO 0025.txt"
     file_pattern = re.compile(r'(\d+) TO (\d+)\.txt')
     if not os.path.isdir(directory):
         print(f"Error: Directory '{directory}' not found.")
         return []
     for filename in os.listdir(directory):
         if filename.endswith('.txt'):
             match = file_pattern.match(filename)
             if match:
                 start_num = int(match.group(1))
                 issue_files.append((start_num, filename))
-    # Sort by the extracted starting number to ensure correct global order
     issue_files.sort(key=lambda x: x[0])
-    return [os.path.join(directory, filename) for _, filename in issue_files]  # Return full paths
 def ensure_dirs(dirs):
-    """Ensures that a list of directories exists."""
     for d in dirs:
         os.makedirs(d, exist_ok=True)
-# --- Main Embedding Function ---
-def encode_issues():
     print("Initializing BGEM3FlagModel...")
-    # Setting use_fp16 to True speeds up computation with a slight performance degradation
     try:
         model = BGEM3FlagModel(MODEL_PATH, use_fp16=True)
         print("Model loaded.")
     except Exception as e:
         print(f"Error loading model from {MODEL_PATH}: {e}")
-        print("Please ensure the model is downloaded to the specified path.")
         return
-    issues_input_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'NationStates-Issue-Megathread/002 - Issue Megalist (MAIN)')
     issue_files = get_issue_files(issues_input_dir)
     if not issue_files:
-        print(
-            f"No issue files found matching the pattern 'NNNN TO NNNN.txt' in '{issues_input_dir}'. Please ensure files are present.")
         return
-    # Prepare cache directories for individual file embeddings
-    cache_dense_dir = os.path.join(CACHE_DIR, 'dense')
-    cache_sparse_dir = os.path.join(CACHE_DIR, 'sparse')
-    # Removed cache_colbert_dir
     ensure_dirs([cache_dense_dir, cache_sparse_dir])
-    # Ensure output directory for final consolidated files exists
     os.makedirs(OUTPUT_DIR, exist_ok=True)
-    print(f"Found {len(issue_files)} issue files to process. Starting embedding process...")
-    # Process each issue file individually
-    for i, filepath in enumerate(issue_files):  # filepath is now full path
-        filename = os.path.basename(filepath)
-        print(f"\nProcessing file {i + 1}/{len(issue_files)}: {filename}")
-        # Define cache paths for the embeddings of this specific file
-        base_name = os.path.splitext(filename)[0]  # e.g., "0000 TO 0025"
         file_cache_dense_path = os.path.join(cache_dense_dir, f"{base_name}.npy")
-        file_cache_sparse_path = os.path.join(cache_sparse_dir, f"{base_name}.npy")
-        # Removed file_cache_colbert_path
-        # Check if re-embedding is needed for this file based on configuration
-        is_cached = (os.path.exists(file_cache_dense_path) and
-                     os.path.exists(file_cache_sparse_path))  # Removed colbert cache check
         if not RE_EMBED_ALL and filename not in CHANGED_FILES and is_cached:
-            print(f"  Skipping {filename} (cached embeddings exist and no re-embed flags are set).")
-            continue  # Skip to next file
-        try:
-            with open(filepath, 'r', encoding='utf-8') as file:
-                issues_text_in_file = file.read()
-                # Split issues by the separator and remove any empty strings resulting from multiple separators
-                issues_list_in_file = [
-                    issue.strip() for issue in issues_text_in_file.split("[hr][/hr]") if issue.strip()
-                ]
-            if not issues_list_in_file:
-                print(f"  Warning: No issues found in {filename} after splitting. Skipping encoding for this file.")
-                continue  # Skip to next file if no content
-            print(f"  Found {len(issues_list_in_file)} issues in {filename}. Encoding...")
-            # Encode only Dense and Sparse vector types
-            embeddings = model.encode(issues_list_in_file,
-                                      batch_size=12,  # Adjust batch_size based on your GPU/CPU memory
-                                      max_length=8192,  # Max length of input sequence
-                                      return_dense=True,
-                                      return_sparse=True,  # This will return 'lexical_weights' for BGE-M3
-                                      return_colbert_vecs=False) # <--- REMOVED COLBERT GENERATION
-            # Save Semantic (Dense) Embeddings
-            np.save(file_cache_dense_path, embeddings['dense_vecs'])
-            # --- Save Loose (Sparse) Embeddings ---
-            # 'lexical_weights' is a list of dictionaries, one for each item in the batch
-            sparse_list_of_dicts = embeddings.get('lexical_weights')
-            # Save this list of sparse dictionaries as a NumPy object array
-            # This allows storing Python objects (dictionaries) in a NumPy array.
-            np.save(file_cache_sparse_path, np.array(sparse_list_of_dicts, dtype=object), allow_pickle=True)
-            print(f"  Encoded and cached {len(issues_list_in_file)} issues from {filename}.")
-        except Exception as e:
-            print(f"  Error processing {filename}: {e}")
-            import traceback
-            traceback.print_exc()  # Print full traceback for debugging
-            continue  # Continue to the next file even if one fails
-    print("\n--- Consolidation Phase: Combining cached embeddings ---")
-    # Initialize lists to collect all embeddings in the correct global order
-    final_semantic_embeddings_list = [] # Renamed from final_dense_embeddings_list
-    final_loose_embeddings_list = []  # Renamed from final_sparse_embeddings_list
-    # Removed final_colbert_embeddings_list
-    # Re-get sorted file paths to ensure correct order for consolidation
-    issue_files_for_consolidation = get_issue_files(issues_input_dir)
-    global_issue_index = 0
-    # Iterate through files again to load from cache and consolidate in sorted order
-    for i, filepath in enumerate(issue_files_for_consolidation):
         filename = os.path.basename(filepath)
         base_name = os.path.splitext(filename)[0]
-        file_cache_dense_path = os.path.join(cache_dense_dir, f"{base_name}.npy")
         file_cache_sparse_path = os.path.join(cache_sparse_dir, f"{base_name}.npy")
-        # Removed file_cache_colbert_path
-        # Only load if all cached embedding files for this issue file are present
-        if (os.path.exists(file_cache_dense_path) and
-                os.path.exists(file_cache_sparse_path)): # Removed colbert cache check
-            # Load and append to the lists
-            final_semantic_embeddings_list.append(np.load(file_cache_dense_path)) # Renamed
-            # Load sparse dictionaries: it's a NumPy object array, convert to list of dicts
-            loaded_sparse_dicts_for_file = np.load(file_cache_sparse_path, allow_pickle=True).tolist()
-            final_loose_embeddings_list.extend(loaded_sparse_dicts_for_file) # Renamed
-            # Removed loading ColBERT arrays
-            # loaded_colbert_arrays_for_file = np.load(file_cache_colbert_path, allow_pickle=True).tolist()
-            # final_colbert_embeddings_list.extend(loaded_colbert_arrays_for_file)
-            # Count issues in this file to correctly update global_issue_index
-            # We need to re-read the raw file to get the count
-            with open(filepath, 'r', encoding='utf-8') as file:
-                issues_text_in_file = file.read()
-                issue_count_in_file = len(
-                    [issue.strip() for issue in issues_text_in_file.split("[hr][/hr]") if issue.strip()])
-            global_issue_index += issue_count_in_file
         else:
-            print(
-                f"  Warning: Cached embedding files for {filename} are incomplete or missing. Skipping in consolidation. This may affect global issue indexing.")
-    if not final_semantic_embeddings_list: # Renamed
-        print("No embeddings were successfully loaded for consolidation. No output files generated.")
-        return
-    # --- Final Save Phase ---
-    # Concatenate all collected embeddings into single large NumPy arrays
-    print("Concatenating and saving final consolidated embeddings...")
-    # Semantic (Dense) embeddings
-    final_semantic_array = np.vstack(final_semantic_embeddings_list) # Renamed
-    np.save(os.path.join(OUTPUT_DIR, 'ns_issues_semantic_bge-m3.npy'), final_semantic_array) # Renamed file
-    print(
-        f"  Saved semantic embeddings to {os.path.join(OUTPUT_DIR, 'ns_issues_semantic_bge-m3.npy')} (Shape: {final_semantic_array.shape})") # Renamed file and type
-    # Loose (Sparse) embeddings (now a list of dictionaries, saved as object array)
-    if final_loose_embeddings_list: # Renamed
-        # Save the list of dictionaries as a NumPy object array
-        final_loose_array = np.array(final_loose_embeddings_list, dtype=object) # Renamed
-        np.save(os.path.join(OUTPUT_DIR, 'ns_issues_loose_bge-m3.npy'), final_loose_array, allow_pickle=True) # Renamed file
-        print(
-            f"  Saved loose embeddings to {os.path.join(OUTPUT_DIR, 'ns_issues_loose_bge-m3.npy')} (Total objects: {len(final_loose_array)}, type: {type(final_loose_array)})") # Renamed file and type
-    else:
-        print("  No loose embeddings to save.") # Renamed
-    # Removed ColBERT embeddings saving
-    # if final_colbert_embeddings_list:
-    #     final_colbert_array = np.array(final_colbert_embeddings_list, dtype=object)
-    #     np.save(os.path.join(OUTPUT_DIR, 'ns_issues_colbert_bge-m3.npy'), final_colbert_array, allow_pickle=True)
-    #     print(f"  Saved ColBERT embeddings to {os.path.join(OUTPUT_DIR, 'ns_issues_colbert_bge-m3.npy')} (Total objects: {len(final_colbert_array)}, type: {type(final_colbert_array)})")
-    # else:
-    #     print("  No ColBERT embeddings to save.")
-    print("\nEmbedding generation complete!")
-# Call this function to start the embedding process.
 if __name__ == "__main__":
-    encode_issues()

+# filename: encode_issues_components_and_sparse.py
 import os
 import re
+import json
 import numpy as np
 from FlagEmbedding import BGEM3FlagModel
 MODEL_PATH = '../../../../Downloads/bge-m3'
 OUTPUT_DIR = '../../'
 CACHE_DIR = './.issue_embeddings_cache'
 RE_EMBED_ALL = False
+CHANGED_FILES = []
+ISSUE_SPLIT_MARKER = "[hr][/hr]"
BB_TAG_RE = re.compile(r'\[(?:\/)?[^\]]+\]')  # one opening or closing BBCode tag, e.g. [b] or [/b]


def strip_bbcode(s: str) -> str:
    """Return ``s`` with every BBCode tag removed; the text between tags is kept."""
    # Tags are removed so that markup wrapped around a header or description
    # line does not get in the way of detecting it.
    without_tags = BB_TAG_RE.sub('', s)
    return without_tags
 def get_issue_files(directory="."):
     issue_files = []
     file_pattern = re.compile(r'(\d+) TO (\d+)\.txt')
     if not os.path.isdir(directory):
         print(f"Error: Directory '{directory}' not found.")
         return []
     for filename in os.listdir(directory):
         if filename.endswith('.txt'):
             match = file_pattern.match(filename)
             if match:
                 start_num = int(match.group(1))
                 issue_files.append((start_num, filename))
     issue_files.sort(key=lambda x: x[0])
+    return [os.path.join(directory, filename) for _, filename in issue_files]
 def ensure_dirs(dirs):
     for d in dirs:
         os.makedirs(d, exist_ok=True)
def _split_raw_issues(raw_text):
    """Split ``raw_text`` on ISSUE_SPLIT_MARKER; return the trimmed, non-empty pieces."""
    trimmed = (piece.strip() for piece in raw_text.split(ISSUE_SPLIT_MARKER))
    return [piece for piece in trimmed if piece]
+def _extract_title(issue_block):
+    for line in issue_block.splitlines():
+        line = line.strip()
+        if line:
+            return line
+    return "Untitled Issue"
def find_header_index(header: str, lines):
    """Return the index of the first line equal to ``header``, or -1 if none matches.

    Each line is compared after removing BBCode tags and surrounding
    whitespace; the comparison ignores case.
    """
    wanted = header.lower()
    return next(
        (pos for pos, raw in enumerate(lines) if strip_bbcode(raw).strip().lower() == wanted),
        -1,
    )
def is_placeholder_issue(issue_block):
    """Return True when ``issue_block`` is only a 'TBD' stub with no real content.

    A block counts as a placeholder when its single non-blank line mentions
    'TBD', or when every non-blank line is an anchor title line
    (starting with '[b][anchor=') that mentions 'TBD'.
    """
    content = [raw.strip() for raw in issue_block.splitlines() if raw.strip()]
    if not content:
        # Nothing at all: not treated as a placeholder.
        return False
    if len(content) == 1 and 'TBD' in content[0]:
        return True
    return all(entry.startswith('[b][anchor=') and 'TBD' in entry for entry in content)
def _parse_issue_strict(issue_block: str, global_issue_index: int):
    """Split one issue block into its description text and its option lines.

    The description is the single non-empty line (after BBCode removal) found
    between the "The Issue" and "The Debate" headers. The options are all
    non-blank lines after "The Debate", trimmed but with BBCode left in place.

    Args:
        issue_block: Raw text of one issue.
        global_issue_index: Issue number, used only in error messages.

    Returns:
        A ``(desc_text, option_lines)`` tuple.

    Raises:
        ValueError: If either header is missing or out of order, if the
            description section has no non-blank line, or if it has more than
            one line that is non-empty after BBCode removal.
    """
    all_lines = issue_block.splitlines()
    issue_pos = find_header_index("The Issue", all_lines)
    debate_pos = find_header_index("The Debate", all_lines)
    headers_in_order = issue_pos != -1 and debate_pos != -1 and debate_pos > issue_pos
    if not headers_in_order:
        print(f"Parse error: missing 'The Issue' or 'The Debate' in issue #{global_issue_index}")
        raise ValueError(f"Parse error in issue #{global_issue_index}")

    raw_section = all_lines[issue_pos + 1:debate_pos]
    cleaned_section = [strip_bbcode(raw).strip() for raw in raw_section]
    filled = [pos for pos, text in enumerate(cleaned_section) if text]

    if len(filled) > 1:
        offending = [raw_section[pos] for pos in filled]
        print(f"Parse error: issue #{global_issue_index} has {len(filled)} non-empty description lines (expected 1)")
        print(f"Description lines (raw): {offending}")
        raise ValueError(f"Parse error in issue #{global_issue_index}")

    if filled:
        desc_text = cleaned_section[filled[0]]
    else:
        # Every line is empty once BBCode is removed: fall back to the first
        # line that is non-blank in its raw form, if there is one.
        first_raw = next((raw for raw in raw_section if raw.strip()), None)
        if not first_raw:
            print(f"Parse error: issue #{global_issue_index} has no usable description lines")
            raise ValueError(f"Parse error in issue #{global_issue_index}")
        desc_text = strip_bbcode(first_raw).strip()

    option_lines = [raw.strip() for raw in all_lines[debate_pos + 1:] if raw.strip()]
    return desc_text, option_lines
+import re
def format_issue_title_markdown(issue_block):
    """
    Build a markdown title from the first non-empty line of an issue block.

    The title line is expected to look like
    ``[b][anchor=1379]#1379[/anchor]: Some Title[/b]``. When an anchor number
    is present the result is ``#<anchor>: [<title>](<forum link>#<anchor>)``;
    otherwise only the cleaned title text is returned. All BBCode tags are
    removed from the title.

    Args:
        issue_block: Raw text of a single issue.

    Returns:
        The formatted title string.

    Raises:
        ValueError: If the block contains no non-empty line.
    """
    bbcode_tag = r'\[\/?[^\]]+\]'

    # The title is always taken from the first non-empty line.
    first_line = next(
        (text for text in map(str.strip, issue_block.splitlines()) if text),
        None,
    )
    if first_line is None:
        print(f"Could not find issue title in {issue_block}")
        raise ValueError("Parse error in issue title")

    # Anchor number, e.g. [anchor=1379]
    anchor_match = re.search(r'\[anchor=(\d+)\]', first_line)
    anchor = anchor_match.group(1) if anchor_match else None

    # Visible title, i.e. the text after "[anchor=1379]#1379[/anchor]:"
    title_match = re.search(r'\[anchor=(\d+)\]\#\d+\[\/anchor\]:\s*(.*)', first_line)
    if title_match:
        title_text = title_match.group(2)
    else:
        # Fallback: text after the first colon. Tags are stripped first so a
        # colon inside a tag (e.g. [url=https://...]) is not mistaken for the
        # number/title separator.
        plain_line = re.sub(bbcode_tag, '', first_line)
        _, colon, after_colon = plain_line.partition(':')
        title_text = after_colon if colon else plain_line
    title_text = re.sub(bbcode_tag, '', title_text).strip()

    if not title_text:
        # Never emit an empty title / empty link text: use the issue number
        # when known, otherwise whatever visible text the line has.
        title_text = f"#{anchor}" if anchor else re.sub(bbcode_tag, '', first_line).strip()

    if anchor:
        return f"#{anchor}: [{title_text}](https://forum.nationstates.net/viewtopic.php?f=13&t=88#{anchor})"
    # No anchor found: just return the cleaned title.
    return title_text
def encode_issues_components_and_sparse():
    """
    Parse every issue file and write dense component and sparse issue embeddings.

    Steps:
      1. Load the BGE-M3 model from ``MODEL_PATH`` and list the issue files.
      2. Parse each file into issue blocks, skipping placeholder issues. Each
         remaining issue contributes one "desc" component, one "option"
         component per option line, and its whole block as one sparse text.
      3. Dense-embed the components file by file, using the cache under
         ``CACHE_DIR/dense_components``.
      4. Sparse-embed the whole issue blocks file by file, using the cache
         under ``CACHE_DIR/sparse_issues``.

    A per-file cache is reused only when ``RE_EMBED_ALL`` is false, the file
    is not listed in ``CHANGED_FILES``, the cache file exists, and it holds
    exactly as many rows as were just parsed; otherwise the file is
    re-embedded so vectors can never drift out of step with the metadata.

    Outputs written to ``OUTPUT_DIR``:
      - ns_issue_components_semantic_bge-m3.npy (stacked dense vectors)
      - ns_issue_components_meta.json (one entry per dense row)
      - issue_titles_components.json and issue_titles.json (same title dict)
      - ns_issues_loose_bge-m3.npy (object array of lexical-weight dicts)

    Returns early (after printing a message) if the model cannot be loaded,
    no issue files are found, or no components were produced.

    Raises:
        Exception: Whatever ``_parse_issue_strict`` raises is re-raised after
            an abort message is printed.
    """
    print("Initializing BGEM3FlagModel...")
    try:
        model = BGEM3FlagModel(MODEL_PATH, use_fp16=True)
        print("Model loaded.")
    except Exception as e:
        print(f"Error loading model from {MODEL_PATH}: {e}")
        return

    issues_input_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                    'NationStates-Issue-Megathread/002 - Issue Megalist (MAIN)')
    issue_files = get_issue_files(issues_input_dir)
    if not issue_files:
        print(f"No issue files found in '{issues_input_dir}'.")
        return

    cache_dense_dir = os.path.join(CACHE_DIR, 'dense_components')
    cache_sparse_dir = os.path.join(CACHE_DIR, 'sparse_issues')
    ensure_dirs([cache_dense_dir, cache_sparse_dir])
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # --- Parse all files up front ---
    # One inner list per issue file, in the same order as issue_files.
    perfile_component_texts = []   # dense (semantic) inputs
    perfile_component_meta = []
    perfile_issue_texts = []       # sparse (loose) inputs
    titles_dict = {}
    global_issue_index_offset = 0

    print(f"Parsing and preparing issue blocks from {len(issue_files)} files...")
    for i, filepath in enumerate(issue_files):
        filename = os.path.basename(filepath)
        print(f"  [{i+1}/{len(issue_files)}] Parsing file: {filename}")
        with open(filepath, 'r', encoding='utf-8') as f:
            raw = f.read()
        issue_blocks = _split_raw_issues(raw)

        file_components_texts = []
        file_components_meta = []
        file_issue_texts = []
        for local_issue_idx, issue_block in enumerate(issue_blocks):
            if is_placeholder_issue(issue_block):
                continue  # Skip placeholder/empty issues

            # NOTE(review): the result is not used anywhere; the call is kept
            # only in case _extract_title validates the block - confirm and
            # remove if it has no side effects.
            _extract_title(issue_block)

            # Global index counts every block, including skipped placeholders.
            this_issue_global_idx = global_issue_index_offset + local_issue_idx
            titles_dict[str(this_issue_global_idx)] = format_issue_title_markdown(issue_block)
            try:
                desc_text, option_texts = _parse_issue_strict(issue_block, this_issue_global_idx)
            except Exception:
                print(f"Aborting due to parse error in issue #{this_issue_global_idx}")
                raise

            # Dense: description and options as separate components
            file_components_texts.append(desc_text)
            file_components_meta.append({
                "issue_index": this_issue_global_idx,
                "component_type": "desc",
                "option_index": None
            })
            for opt_idx, opt_text in enumerate(option_texts, start=1):
                file_components_texts.append(opt_text)
                file_components_meta.append({
                    "issue_index": this_issue_global_idx,
                    "component_type": "option",
                    "option_index": opt_idx
                })

            # Sparse: whole issue block (not chunked)
            file_issue_texts.append(issue_block)

        perfile_component_texts.append(file_components_texts)
        perfile_component_meta.append(file_components_meta)
        perfile_issue_texts.append(file_issue_texts)
        global_issue_index_offset += len(issue_blocks)

    # --- Dense embedding for components ---
    print("\nStarting dense (semantic) embedding for components...")
    all_dense_chunks = []
    all_meta = []
    for filepath, texts, metas in zip(issue_files, perfile_component_texts, perfile_component_meta):
        filename = os.path.basename(filepath)
        base_name = os.path.splitext(filename)[0]
        file_cache_dense_path = os.path.join(cache_dense_dir, f"{base_name}.npy")
        if not texts:
            print(f"  [Dense] Skipping file {filename} (no components to embed).")
            continue

        dense_vecs = None
        is_cached = os.path.exists(file_cache_dense_path)
        if not RE_EMBED_ALL and filename not in CHANGED_FILES and is_cached:
            print(f"  [Dense] Loading cached embeddings for {filename} ({len(texts)} components).")
            cached_vecs = np.load(file_cache_dense_path)
            # Reuse the cache only if it has one row per parsed component;
            # otherwise rows would no longer line up with the metadata.
            if cached_vecs.shape[:1] == (len(texts),):
                dense_vecs = cached_vecs
            else:
                print(f"  [Dense] Cached embeddings for {filename} have shape {cached_vecs.shape} "
                      f"but {len(texts)} components were parsed; re-embedding.")

        if dense_vecs is None:
            print(f"  [Dense] Embedding {len(texts)} components from {filename}...")
            embeddings = model.encode(
                texts,
                batch_size=12,
                max_length=8192,
                return_dense=True,
                return_sparse=False,  # Only dense for components
                return_colbert_vecs=False
            )
            dense_vecs = embeddings['dense_vecs']
            np.save(file_cache_dense_path, dense_vecs)
            print(f"  [Dense] Saved cache for {filename} ({dense_vecs.shape[0]} components).")

        all_dense_chunks.append(dense_vecs)
        all_meta.extend(metas)

    if not all_dense_chunks:
        print("No component embeddings produced.")
        return

    final_dense = np.vstack(all_dense_chunks)
    dense_out = os.path.join(OUTPUT_DIR, 'ns_issue_components_semantic_bge-m3.npy')
    meta_out = os.path.join(OUTPUT_DIR, 'ns_issue_components_meta.json')
    titles_out = os.path.join(OUTPUT_DIR, 'issue_titles_components.json')
    np.save(dense_out, final_dense)
    with open(meta_out, 'w', encoding='utf-8') as f:
        json.dump(all_meta, f, ensure_ascii=False)
    with open(titles_out, 'w', encoding='utf-8') as f:
        # Only titles for non-placeholder issues
        json.dump(titles_dict, f, ensure_ascii=False)
    print("\nDense embedding complete. Saved:")
    print(f"  Dense: {dense_out} shape={final_dense.shape}")
    print(f"  Meta: {meta_out} items={len(all_meta)}")
    print(f"  Titles: {titles_out} issues={len(titles_dict)}")

    # --- Sparse embedding for whole issues, cached per file ---
    print("\nStarting sparse (loose) embedding for whole issues (per file)...")
    sparse_out = os.path.join(OUTPUT_DIR, 'ns_issues_loose_bge-m3.npy')
    titles_sparse_out = os.path.join(OUTPUT_DIR, 'issue_titles.json')
    all_sparse_chunks = []
    for filepath, issue_texts in zip(issue_files, perfile_issue_texts):
        filename = os.path.basename(filepath)
        base_name = os.path.splitext(filename)[0]
        file_cache_sparse_path = os.path.join(cache_sparse_dir, f"{base_name}.npy")
        if not issue_texts:
            print(f"  [Sparse] Skipping file {filename} (no issues to embed).")
            continue

        sparse_dicts = None
        is_cached = os.path.exists(file_cache_sparse_path)
        if not RE_EMBED_ALL and filename not in CHANGED_FILES and is_cached:
            print(f"  [Sparse] Loading cached sparse embeddings for {filename} ({len(issue_texts)} issues).")
            # allow_pickle is required for the object array of dicts; only
            # point this at cache files this script wrote itself.
            cached_dicts = np.load(file_cache_sparse_path, allow_pickle=True).tolist()
            # Same staleness guard as the dense stage: one entry per issue.
            if isinstance(cached_dicts, list) and len(cached_dicts) == len(issue_texts):
                sparse_dicts = cached_dicts
            else:
                print(f"  [Sparse] Cached sparse embeddings for {filename} do not match the "
                      f"{len(issue_texts)} parsed issues; re-embedding.")

        if sparse_dicts is None:
            print(f"  [Sparse] Embedding {len(issue_texts)} issues from {filename}...")
            embeddings = model.encode(
                issue_texts,
                batch_size=12,
                max_length=8192,
                return_dense=False,
                return_sparse=True,
                return_colbert_vecs=False
            )
            sparse_dicts = embeddings['lexical_weights']
            np.save(file_cache_sparse_path, np.array(sparse_dicts, dtype=object), allow_pickle=True)
            print(f"  [Sparse] Saved cache for {filename} ({len(sparse_dicts)} issues).")

        all_sparse_chunks.extend(sparse_dicts)

    np.save(sparse_out, np.array(all_sparse_chunks, dtype=object), allow_pickle=True)
    # NOTE(review): titles_dict is keyed by global issue index, which counts
    # skipped placeholder issues, whereas the sparse array has one entry per
    # non-placeholder issue - confirm how consumers map array positions to
    # title keys.
    with open(titles_sparse_out, 'w', encoding='utf-8') as f:
        json.dump(titles_dict, f, ensure_ascii=False)
    print("\nSparse embedding complete. Saved:")
    print(f"  Sparse: {sparse_out} count={len(all_sparse_chunks)}")
    print(f"  Titles (sparse): {titles_sparse_out} issues={len(titles_dict)}")
    print("Embedding generation (components dense, issues sparse, strict) complete!")
 if __name__ == "__main__":
+    encode_issues_components_and_sparse()