# Spaces: Bohaska / ns_issue_search (Running)
# File size: 2,887 Bytes

import os
import re
import json

# --- Configuration ---
# Name of the text file holding the raw issue titles. It is looked up in the
# same directory as this script.
RAW_TITLES_FILE = "issue_titles_raw.txt"

# Directory that receives the generated issue_titles.json file.
# It should match the OUTPUT_DIR used by embedding.py and the location that
# app.py expects. With this script in 'project_root/scripts/' and app.py in
# 'project_root/', '../' is the appropriate value; if both live in the same
# directory, use '.' instead.
OUTPUT_DIR = "../"


def generate_issue_titles_json():
    """
    Build 'issue_titles.json' from the raw issue titles text file.

    Every non-empty line of RAW_TITLES_FILE is expected to look like
    '#<number>: <title>'. Matching lines are collected into a dict that maps
    the issue number (str) to the display string '#<number>: <title>' (the
    '#<number>: ' prefix is kept in the value). The dict is then written as
    UTF-8 JSON to 'issue_titles.json' inside OUTPUT_DIR.

    Both RAW_TITLES_FILE and OUTPUT_DIR are resolved relative to the directory
    containing this script, so the result does not depend on the current
    working directory. An absolute OUTPUT_DIR is used as-is.

    Lines that do not match the expected format are reported and skipped.
    If an issue number appears more than once, a warning is printed and the
    later title wins. If the input file is missing, or no valid titles are
    found, a message is printed and nothing is written. Any other error is
    printed together with its traceback instead of being raised.

    Returns:
        None.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    raw_titles_path = os.path.join(script_dir, RAW_TITLES_FILE)
    # os.path.join discards script_dir when OUTPUT_DIR is absolute, so absolute
    # output locations keep working; normpath collapses the '..' for readable logs.
    output_dir = os.path.normpath(os.path.join(script_dir, OUTPUT_DIR))
    output_json_path = os.path.join(output_dir, 'issue_titles.json')

    issue_titles_dict = {}

    # Matches lines like "#0: Some Title": '#', digits, ':', optional
    # whitespace, then the rest of the line captured as the title.
    line_pattern = re.compile(r'^#(\d+):\s*(.*)')

    print(f"Attempting to read raw titles from: {raw_titles_path}")

    if not os.path.exists(raw_titles_path):
        print(f"Error: Raw titles file not found at '{raw_titles_path}'. Please create it.")
        return

    try:
        with open(raw_titles_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue  # Skip empty lines

                match = line_pattern.match(line)
                if not match:
                    print(f"  Skipping line {line_num}: '{line}' - Does not match expected format '#X: '")
                    continue

                issue_number, issue_title_raw = match.groups()

                # A repeated number overwrites the earlier entry; say so
                # instead of dropping the earlier title silently.
                if issue_number in issue_titles_dict:
                    print(f"  Warning: line {line_num} repeats issue #{issue_number}; keeping the later title.")

                # The stored value keeps the '#<number>: ' prefix in front of the title.
                issue_titles_dict[issue_number] = f"#{issue_number}: " + issue_title_raw.strip()

        if not issue_titles_dict:
            print("No valid issue titles were extracted. 'issue_titles.json' will not be created.")
            return

        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)

        with open(output_json_path, 'w', encoding='utf-8') as f:
            json.dump(issue_titles_dict, f, ensure_ascii=False, indent=2)
        print(f"Successfully generated issue titles to: {output_json_path} ({len(issue_titles_dict)} titles)")

    except Exception as e:
        # Top-level boundary of a command-line script: report the failure with
        # full context rather than letting it propagate.
        print(f"An error occurred during title generation: {e}")
        import traceback
        traceback.print_exc()


# Script entry point: generate the JSON file only when this module is run
# directly, not when it is imported.
if __name__ == '__main__':
    generate_issue_titles_json()