"""Build ``issue_titles.json`` from a raw text file of issue titles."""

import json
import os
import re
import traceback
from typing import Dict, Iterable, Optional

# --- Configuration ---
# Name of the raw issue titles text file; it is looked up next to this script.
RAW_TITLES_FILE = 'issue_titles_raw.txt'

# Output directory for the final issue_titles.json file.
# This should match the OUTPUT_DIR that embedding.py and app.py expect.
# A relative value is resolved against this script's directory (not the
# current working directory): if this script is in 'project_root/scripts/' and
# app.py is in 'project_root/', use '../'; if both are in the same directory,
# use '.'.
OUTPUT_DIR = '../'

# Matches lines like "#0: Some Title": '#', digits, ':', optional whitespace,
# then everything that follows is captured as the title.
_LINE_PATTERN = re.compile(r'^#(\d+):\s*(.*)')


def _parse_issue_titles(lines: Iterable[str]) -> Dict[str, str]:
    """Return ``{issue_number: "#<issue_number>: <title>"}`` parsed from *lines*.

    Blank lines are ignored. Lines that do not start with '#<digits>:' are
    reported on stdout and skipped. When an issue number appears more than
    once, the last occurrence wins and a warning is printed.
    """
    titles: Dict[str, str] = {}
    for line_num, raw_line in enumerate(lines, 1):
        line = raw_line.strip()
        if not line:
            continue  # Skip empty lines

        match = _LINE_PATTERN.match(line)
        if match is None:
            print(f" Skipping line {line_num}: '{line}' - Does not match expected format '#X: '")
            continue

        issue_number, issue_title_raw = match.groups()
        if issue_number in titles:
            # Surface accidental duplicates instead of overwriting silently.
            print(f" Warning: line {line_num} repeats issue #{issue_number}; keeping the later title.")
        # Keep all text after ': ' as-is; only surrounding whitespace is trimmed.
        titles[issue_number] = f"#{issue_number}: " + issue_title_raw.strip()
    return titles


def generate_issue_titles_json(
    raw_titles_path: Optional[str] = None,
    output_dir: Optional[str] = None,
) -> None:
    """Parse a raw issue-titles text file and write ``issue_titles.json``.

    The JSON maps each issue number (as a string) to "#<number>: <title>".
    Lines that do not start with '#X: ' are skipped with a message.

    Args:
        raw_titles_path: Path of the raw titles file. Defaults to
            RAW_TITLES_FILE located in this script's directory.
        output_dir: Directory that receives 'issue_titles.json'; it is created
            if missing. Defaults to OUTPUT_DIR resolved against this script's
            directory. An explicitly passed value is used as given.

    Returns:
        None. Progress and errors are reported on stdout; no JSON file is
        written when the input is missing or contains no valid titles.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    if raw_titles_path is None:
        raw_titles_path = os.path.join(script_dir, RAW_TITLES_FILE)
    if output_dir is None:
        # Anchor the default to the script location so the result does not
        # depend on the directory the script happens to be launched from.
        output_dir = os.path.normpath(os.path.join(script_dir, OUTPUT_DIR))
    output_json_path = os.path.join(output_dir, 'issue_titles.json')

    print(f"Attempting to read raw titles from: {raw_titles_path}")

    if not os.path.exists(raw_titles_path):
        print(f"Error: Raw titles file not found at '{raw_titles_path}'. Please create it.")
        return

    try:
        with open(raw_titles_path, 'r', encoding='utf-8') as f:
            issue_titles_dict = _parse_issue_titles(f)

        if not issue_titles_dict:
            print("No valid issue titles were extracted. 'issue_titles.json' will not be created.")
            return

        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)

        with open(output_json_path, 'w', encoding='utf-8') as f:
            json.dump(issue_titles_dict, f, ensure_ascii=False, indent=2)

        print(f"Successfully generated issue titles to: {output_json_path} ({len(issue_titles_dict)} titles)")

    except Exception as e:
        # Top-level boundary of a command-line script: report the failure with
        # a traceback instead of propagating it.
        print(f"An error occurred during title generation: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    generate_issue_titles_json()