import os
import csv
import re
from typing import Sequence, Tuple

# When the last title found in a file equals this exact text, the whole
# record is discarded (all three fields are returned as empty strings).
# NOTE(review): presumably this is an example title echoed back from the
# prompt template rather than a real answer -- confirm with the data owner.
PLACEHOLDER_TITLE = "Unveiling the Legacy of Ancient Rome: Rise, Glory, and Downfall."


def _build_key_pattern(letters):
    """
    Compile a regex matching a whole line of the form  "<key>": "<value>",

    `letters` is the key with its underscores removed (e.g. "coverprompt").
    Any number of underscores (including none) is allowed between two
    consecutive letters, so "cover_prompt", "coverprompt" and
    "c_o_v_e_r__prompt" all match. Matching is case-insensitive, the
    trailing comma is optional, and group 1 captures the value text.
    """
    key = "_*".join(re.escape(ch) for ch in letters)
    # The flag is passed to re.compile() instead of being embedded as an
    # inline "(?i)" after "^": Python 3.11+ raises re.error for a global
    # inline flag that is not at the very start of the expression.
    return re.compile(r'^\s*"' + key + r'"\s*:\s*"(.*)",?\s*$', re.IGNORECASE)


# Compiled once at import time; reused for every file.
_TITLE_RE = _build_key_pattern("title")
_COVER_RE = _build_key_pattern("coverprompt")
_VIDEO_RE = _build_key_pattern("videoprompt")


def find_last_occurrence(lines: Sequence[str], pattern) -> str:
    """
    Return group 1 of the last line in `lines` that matches `pattern`.

    The lines are scanned from the end towards the start; each line is
    stripped of surrounding whitespace and matched from its beginning
    (re.match semantics).

    Args:
        lines: The lines to search.
        pattern: A regex string or an already compiled pattern. It must
            contain at least one capturing group.

    Returns:
        The text captured by group 1 of the first match found when
        scanning backwards, or an empty string if no line matches.
    """
    # re.compile() returns an already-compiled pattern unchanged, so both
    # strings and compiled patterns are accepted; resolve it once, not per line.
    regex = re.compile(pattern)
    for line in reversed(lines):
        match = regex.match(line.strip())
        if match:
            return match.group(1)
    return ""  # Nothing matched


def extract_prompts_from_file(file_path: str) -> Tuple[str, str, str]:
    """
    Extract the last "title", "cover_prompt" and "video_prompt" values
    found in a text file.

    Each key is looked up independently, searching from the end of the
    file for a line shaped like  "key": "value",  where the key match is
    case-insensitive and may contain any number of underscores between
    its letters.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A (title, cover_prompt, video_prompt) tuple. A key that is not
        found yields an empty string. If the title (ignoring surrounding
        whitespace) equals PLACEHOLDER_TITLE, all three values are empty.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()

    title = find_last_occurrence(lines, _TITLE_RE)
    cover_prompt = find_last_occurrence(lines, _COVER_RE)
    video_prompt = find_last_occurrence(lines, _VIDEO_RE)

    # A placeholder title invalidates the whole record.
    if title.strip() == PLACEHOLDER_TITLE:
        return "", "", ""

    return title, cover_prompt, video_prompt


def process_txt_files(input_folder: str, output_csv: str) -> None:
    """
    Build a CSV summarising the prompts found in every .txt file of a folder.

    1. Lists the entries of `input_folder` (not recursive) and keeps those
       whose name ends with ".txt", case-insensitively.
    2. For each file, takes the last "title", "cover_prompt" and
       "video_prompt" values via extract_prompts_from_file().
    3. Writes one row per file with the columns:
       user prompt, title, cover prompt, video prompt
       where "user prompt" is the file name without its extension.

    Args:
        input_folder: Folder containing the .txt files.
        output_csv: Path of the CSV file to write (overwritten if present).
            Its parent directory is created when missing.
    """
    out_dir = os.path.dirname(output_csv)
    if out_dir:
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(out_dir, exist_ok=True)

    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["user prompt", "title", "cover prompt", "video prompt"])

        # os.listdir() returns names in arbitrary order; sort them so the
        # row order of the CSV is reproducible between runs and machines.
        for filename in sorted(os.listdir(input_folder)):
            if not filename.lower().endswith(".txt"):
                continue

            full_path = os.path.join(input_folder, filename)
            title, cover_prompt, video_prompt = extract_prompts_from_file(full_path)

            # The user prompt is the file name without the extension.
            user_prompt = os.path.splitext(filename)[0]
            writer.writerow([user_prompt, title, cover_prompt, video_prompt])


def main() -> None:
    """Run the extraction for every configured (type, rag) combination."""
    # 1) Input folder. Other folder names that have been used here:
    #    creation_outputs_ai_concrete_rag_50_testset
    #    baseline_concrete_outputs_2
    #    creation_outputs_ai_concrete_rag_50_tags_4_testset
    #    creation_rag_cot_prompt_ai_abstract_rag_50_testset_deepseek
    #    creation_rag_cot_prompt_ai_concrete_rag_50_testset_deepseekr1
    #    baseline_concrete_outputs_deepseekr1
    typs = ["concrete"]  # "concrete",
    rags = [50]  # 0,20,40,60,80,120,140

    for typ in typs:
        for rag in rags:
            input_folder_path = f"creation_rag_cot_prompt_ai_{typ}_rag_{rag}_tags_1_testset"

            # 2) Output CSV path. Other paths that have been used here:
            #    output_prompt_baseline/prompt_baseline_abstract_2.csv
            #    output_prompt_rag_more
            #    output_prompt_baseline/prompt_baseline_concrete_gpt4o.csv
            #    output_prompt_rag_more/prompt_ai_concrete_rag_50_testset_deepseekr1.csv
            output_csv_file = f"output_prompt_rag_more/prompt_ai_{typ}_rag_{rag}_tags_1_testset.csv"

            process_txt_files(input_folder_path, output_csv_file)
            print("Processing complete! Results written to:", output_csv_file)


if __name__ == "__main__":
    main()