Spaces:

Bohaska
/

ns_issue_search

Running

ns_issue_search / small_scripts /make_issue_titles.py

Bohaska

add issue id in beginning of title

99f4036 7 months ago

2.89 kB

	import os
	import re
	import json

	# --- Configuration ---
	# File name of the raw issue-titles text file. It is looked up in the
	# directory that contains this script.
	RAW_TITLES_FILE = "issue_titles_raw.txt"

	# Directory that receives the generated issue_titles.json.
	# Keep this in sync with the OUTPUT_DIR used by embedding.py and with the
	# location app.py expects. With this script in 'project_root/scripts/' and
	# app.py in 'project_root/', '../' is appropriate; if both live in the same
	# directory, use '.' instead.
	# NOTE: a relative value is resolved against the current working directory
	# at run time, not against this script's own location.
	OUTPUT_DIR = "../"


	def generate_issue_titles_json(raw_titles_path=None, output_dir=None):
	    """Build ``issue_titles.json`` from a raw text file of issue titles.

	    Every non-empty input line is expected to look like ``#<number>: <title>``.
	    Matching lines are collected into a mapping of
	    ``{"<number>": "#<number>: <title>"}`` (the issue id is kept at the start
	    of the stored title) and written as UTF-8 JSON. Lines that do not match
	    are reported on stdout and skipped. If the same issue number appears more
	    than once, the last occurrence wins.

	    Args:
	        raw_titles_path: Path of the raw titles file. Defaults to
	            ``RAW_TITLES_FILE`` located next to this script.
	        output_dir: Directory in which ``issue_titles.json`` is written; it is
	            created if missing. Defaults to ``OUTPUT_DIR``. A relative path is
	            resolved against the current working directory.

	    Returns:
	        None. Progress and errors are reported via ``print``; no exception is
	        propagated to the caller.
	    """
	    # Resolve defaults lazily so explicit arguments never touch the module
	    # configuration.
	    if raw_titles_path is None:
	        script_dir = os.path.dirname(os.path.abspath(__file__))
	        raw_titles_path = os.path.join(script_dir, RAW_TITLES_FILE)
	    if output_dir is None:
	        output_dir = OUTPUT_DIR
	    output_json_path = os.path.join(output_dir, 'issue_titles.json')

	    issue_titles_dict = {}

	    # Matches lines like "#0: Some Title": '#', digits, ':', one whitespace
	    # character, then the whole remainder of the line as the title.
	    # (The previous pattern used "(.)" and kept only the first character.)
	    line_pattern = re.compile(r'^#(\d+):\s(.*)')

	    print(f"Attempting to read raw titles from: {raw_titles_path}")

	    if not os.path.exists(raw_titles_path):
	        print(f"Error: Raw titles file not found at '{raw_titles_path}'. Please create it.")
	        return

	    try:
	        with open(raw_titles_path, 'r', encoding='utf-8') as f:
	            for line_num, line in enumerate(f, 1):
	                line = line.strip()
	                if not line:
	                    continue  # Skip empty lines

	                match = line_pattern.match(line)
	                if match is None:
	                    print(f" Skipping line {line_num}: '{line}' - Does not match expected format '#X: '")
	                    continue

	                issue_number, issue_title_raw = match.groups()
	                # Keep the text after ': ' as-is apart from surrounding
	                # whitespace, and prefix it with the issue id.
	                issue_titles_dict[issue_number] = f"#{issue_number}: " + issue_title_raw.strip()

	        if not issue_titles_dict:
	            print("No valid issue titles were extracted. 'issue_titles.json' will not be created.")
	            return

	        # Ensure the output directory exists. An empty string means "current
	        # directory", which os.makedirs would reject.
	        if output_dir:
	            os.makedirs(output_dir, exist_ok=True)

	        with open(output_json_path, 'w', encoding='utf-8') as f:
	            json.dump(issue_titles_dict, f, ensure_ascii=False, indent=2)
	        print(f"Successfully generated issue titles to: {output_json_path} ({len(issue_titles_dict)} titles)")

	    except Exception as e:
	        # Top-level boundary of a one-shot script: report the failure with a
	        # full traceback instead of crashing.
	        print(f"An error occurred during title generation: {e}")
	        import traceback
	        traceback.print_exc()


	# Script entry point: generate the JSON file only when this module is run
	# directly, not when it is imported.
	if __name__ == "__main__":
	    generate_issue_titles_json()