Spaces:
Running
Running
import json
import os
import re
import traceback
# --- Configuration ---

# Name of the text file that holds the raw issue titles. The generator looks
# for it in the same directory as this script.
RAW_TITLES_FILE = "issue_titles_raw.txt"

# Directory that receives the generated issue_titles.json file. Keep it in
# sync with OUTPUT_DIR in embedding.py and with the location app.py expects.
# Example: with this script in 'project_root/scripts/' and app.py in
# 'project_root/', '../' is appropriate; if both share a directory, use '.'.
OUTPUT_DIR = "../"
def _parse_issue_title_lines(lines):
    """Parse raw ``#<number>:<title>`` lines into a number -> title mapping.

    Args:
        lines: Iterable of text lines (for example an open text file).

    Returns:
        A dict mapping the issue number (as a string of digits) to the
        normalized title ``"#<number>: <title>"``. Blank lines are ignored;
        lines that do not start with ``#<digits>:`` are reported and skipped.
        When an issue number occurs more than once, the last occurrence wins
        and a warning is printed.
    """
    # '#' + digits + ':' at the start of the line; whitespace after the colon
    # is optional, and everything that follows is the title.
    line_pattern = re.compile(r'^#(\d+):\s*(.*)')
    issue_titles = {}
    for line_num, raw_line in enumerate(lines, 1):
        line = raw_line.strip()
        if not line:
            continue  # Skip empty lines
        match = line_pattern.match(line)
        if match is None:
            print(f" Skipping line {line_num}: '{line}' - Does not match expected format '#X: '")
            continue
        issue_number, issue_title_raw = match.groups()
        if issue_number in issue_titles:
            # Surface the overwrite instead of silently dropping the earlier title.
            print(f" Warning: line {line_num} repeats issue #{issue_number}; keeping the later title.")
        issue_titles[issue_number] = f"#{issue_number}: " + issue_title_raw.strip()
    return issue_titles


def generate_issue_titles_json():
    """Convert the raw issue-titles text file into ``issue_titles.json``.

    Reads ``RAW_TITLES_FILE`` from the directory containing this script,
    parses every ``#<number>:<title>`` line, and writes the result as
    ``{issue_number (str): "#<number>: <title>" (str)}`` to
    ``issue_titles.json`` inside ``OUTPUT_DIR``.

    A relative ``OUTPUT_DIR`` is resolved against this script's directory
    (the same anchor used for the input file), so the output location does
    not depend on the current working directory. An absolute ``OUTPUT_DIR``
    is used unchanged.

    Returns:
        None. Progress and problems are reported on stdout. No JSON file is
        written when the input file is missing or contains no valid titles.
        Unexpected errors are printed together with a traceback rather than
        propagated.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    raw_titles_path = os.path.join(script_dir, RAW_TITLES_FILE)
    # os.path.join discards script_dir when OUTPUT_DIR is absolute.
    output_dir = os.path.normpath(os.path.join(script_dir, OUTPUT_DIR))
    output_json_path = os.path.join(output_dir, 'issue_titles.json')

    print(f"Attempting to read raw titles from: {raw_titles_path}")
    if not os.path.exists(raw_titles_path):
        print(f"Error: Raw titles file not found at '{raw_titles_path}'. Please create it.")
        return

    try:
        # 'utf-8-sig' reads plain UTF-8 too, but drops a leading BOM that
        # would otherwise stop the first line from matching '^#'.
        with open(raw_titles_path, 'r', encoding='utf-8-sig') as f:
            issue_titles_dict = _parse_issue_title_lines(f)

        if not issue_titles_dict:
            print("No valid issue titles were extracted. 'issue_titles.json' will not be created.")
            return

        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)
        with open(output_json_path, 'w', encoding='utf-8') as f:
            json.dump(issue_titles_dict, f, ensure_ascii=False, indent=2)
        print(f"Successfully generated issue titles to: {output_json_path} ({len(issue_titles_dict)} titles)")
    except Exception as e:
        # Top-level boundary of the script: report the failure with full detail.
        print(f"An error occurred during title generation: {e}")
        traceback.print_exc()
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    generate_issue_titles_json()