# ns_issue_search/small_scripts/make_issue_titles.py
# Bohaska
# add issue id in beginning of title
# 99f4036
import os
import re
import json
# --- Configuration ---
# Name of the raw issue titles text file. generate_issue_titles_json() joins
# this onto the directory that contains this script, so the file is expected
# to sit next to the script.
RAW_TITLES_FILE = 'issue_titles_raw.txt'
# Directory that the generated issue_titles.json file is written into.
# NOTE(review): presumably this has to be the same directory that embedding.py
# (its OUTPUT_DIR) writes to and that app.py reads from - confirm against
# those files.
# If this script is in 'project_root/scripts/' and app.py is in 'project_root/',
# then '../' would be appropriate here. If both are in the same directory, use '.'.
OUTPUT_DIR = '../'
def _parse_issue_titles(lines):
    """Build the {issue_number: title} mapping from raw title lines.

    Args:
        lines: Iterable of text lines (for example an open text file).

    Returns:
        A dict whose keys are issue numbers as strings and whose values are
        titles of the form "#<number>: <title text>". Blank lines are ignored;
        lines that do not look like "#<digits>:<title>" are reported on stdout
        and skipped. If an issue number appears more than once, the last
        occurrence wins and a warning is printed.
    """
    # '#' followed by digits, then ':', then everything after that is the title.
    line_pattern = re.compile(r'^#(\d+):\s*(.*)')
    titles = {}
    for line_num, raw_line in enumerate(lines, 1):
        line = raw_line.strip()
        if not line:
            continue  # Skip empty lines
        match = line_pattern.match(line)
        if match is None:
            print(f" Skipping line {line_num}: '{line}' - Does not match expected format '#X: '")
            continue
        issue_number, issue_title_raw = match.groups()
        if issue_number in titles:
            # Previously a duplicate silently replaced the earlier title.
            print(f" Warning: line {line_num} redefines issue #{issue_number}; keeping the later title.")
        # The issue id is kept at the beginning of the stored title.
        titles[issue_number] = f"#{issue_number}: " + issue_title_raw.strip()
    return titles


def generate_issue_titles_json():
    """
    Read the raw issue titles file and write them out as issue_titles.json.

    The input file is RAW_TITLES_FILE, located next to this script. The output
    is 'issue_titles.json' inside OUTPUT_DIR, in the format
    {issue_number (str): "#<issue_number>: <title>" (str)}. A relative
    OUTPUT_DIR is resolved against this script's directory (not the current
    working directory), so the result does not depend on where the script is
    launched from; an absolute OUTPUT_DIR is used as-is.

    Lines that do not start with '#X:' are skipped with a message. If the
    input file is missing, or no valid titles are found, a message is printed
    and no JSON file is written.

    Returns:
        None. Progress and errors are reported on stdout; unexpected
        exceptions are caught, printed with a traceback, and not re-raised.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    raw_titles_path = os.path.join(script_dir, RAW_TITLES_FILE)
    # os.path.join discards script_dir when OUTPUT_DIR is absolute, so absolute
    # settings keep working; normpath collapses segments such as 'scripts/../'.
    output_dir = os.path.normpath(os.path.join(script_dir, OUTPUT_DIR))
    output_json_path = os.path.join(output_dir, 'issue_titles.json')

    print(f"Attempting to read raw titles from: {raw_titles_path}")
    if not os.path.exists(raw_titles_path):
        print(f"Error: Raw titles file not found at '{raw_titles_path}'. Please create it.")
        return

    try:
        # 'utf-8-sig' drops a leading byte-order mark if one is present;
        # with plain 'utf-8' the BOM stays attached to the first line, which
        # then fails the '^#' match and the first title is lost.
        with open(raw_titles_path, 'r', encoding='utf-8-sig') as f:
            issue_titles_dict = _parse_issue_titles(f)

        if not issue_titles_dict:
            print("No valid issue titles were extracted. 'issue_titles.json' will not be created.")
            return

        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)
        with open(output_json_path, 'w', encoding='utf-8') as f:
            json.dump(issue_titles_dict, f, ensure_ascii=False, indent=2)
        print(f"Successfully generated issue titles to: {output_json_path} ({len(issue_titles_dict)} titles)")
    except Exception as e:
        # Top-level boundary of a command-line script: report and carry on
        # rather than crash with an unhandled exception.
        print(f"An error occurred during title generation: {e}")
        import traceback
        traceback.print_exc()
# Generate the JSON only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    generate_issue_titles_json()