LLMPopcorn / output_data_processing_json.py

Upload output_data_processing_json.py with huggingface_hub

0d1e8bd verified 3 days ago

4.48 kB

	import os
	import csv
	import re

	def find_last_occurrence(lines, pattern):
	    """
	    Search backwards through ``lines`` for the last line matching ``pattern``.

	    Each line is stripped of surrounding whitespace before matching, and the
	    match is anchored at the start of the stripped line (``re.match``).

	    Args:
	        lines: Sequence of text lines (must support ``reversed``).
	        pattern: Regular expression (string or compiled pattern) containing
	            at least one capturing group.

	    Returns:
	        The text captured by group 1 on the last matching line, or an empty
	        string if no line matches.
	    """
	    # Compile once instead of handing the raw pattern to re.match per line.
	    regex = re.compile(pattern)
	    for line in reversed(lines):
	        match = regex.match(line.strip())
	        if match:
	            # An optional group that did not participate yields None;
	            # normalise so callers always receive a string.
	            return match.group(1) or ""
	    return ""  # Nothing matched

	def extract_prompts_from_file(file_path):
	    """
	    Extract the last "title", "cover_prompt" and "video_prompt" values from a file.

	    The file is scanned from the end for lines of the form
	    ``"title": "xxx"``, ``"cover_prompt": "xxx"`` and ``"video_prompt": "xxx"``
	    (optionally followed by a comma). Key matching is:
	    - case insensitive
	    - tolerant of any number of underscores between the letters of the key
	      (e.g. ``cover_prompt``, ``coverprompt``, ``cover__prompt``)

	    If the extracted title equals
	    "Unveiling the Legacy of Ancient Rome: Rise, Glory, and Downfall."
	    all three values are returned as empty strings.

	    Args:
	        file_path: Path of the UTF-8 text file to read.

	    Returns:
	        A ``(title, cover_prompt, video_prompt)`` tuple of strings; a value
	        is an empty string when its key was not found.
	    """

	    def key_line_regex(letters):
	        # Allow zero or more underscores between consecutive key letters.
	        key = "_*".join(letters)
	        # The case-insensitive flag is passed to re.compile: an inline
	        # "(?i)" placed after "^" is rejected by Python 3.11+.
	        return re.compile(
	            r'^\s*"' + key + r'"\s*:\s*"(.*)",?\s*$',
	            re.IGNORECASE,
	        )

	    patterns = {
	        "title": key_line_regex("title"),
	        "cover_prompt": key_line_regex("coverprompt"),
	        "video_prompt": key_line_regex("videoprompt"),
	    }

	    # Read the text
	    with open(file_path, 'r', encoding='utf-8') as f:
	        lines = f.read().splitlines()

	    # Walk the file once from the end; the first hit seen for each key is
	    # therefore its last occurrence in the file.
	    found = {}
	    for line in reversed(lines):
	        stripped = line.strip()
	        for name, regex in patterns.items():
	            if name in found:
	                continue
	            match = regex.match(stripped)
	            if match:
	                found[name] = match.group(1)
	        if len(found) == len(patterns):
	            break

	    title = found.get("title", "")
	    cover_prompt = found.get("cover_prompt", "")
	    video_prompt = found.get("video_prompt", "")

	    # If the found title is the specified text, set all three to empty
	    if title.strip() == "Unveiling the Legacy of Ancient Rome: Rise, Glory, and Downfall.":
	        return "", "", ""

	    return title, cover_prompt, video_prompt


	def process_txt_files(input_folder, output_csv):
	    """
	    Collect the prompts extracted from every .txt file in a folder into a CSV.

	    1. List all .txt files (extension matched case-insensitively) in
	       input_folder, in sorted order so the CSV is reproducible.
	    2. For each file, search backwards for the last occurrence of
	       "title": "...", "cover_prompt": "...", "video_prompt": "..."
	    3. Output CSV: user prompt, title, cover prompt, video prompt

	    Args:
	        input_folder: Directory containing the .txt files.
	        output_csv: Path of the CSV file to write; its parent directory is
	            created if it does not exist.

	    Raises:
	        OSError: If input_folder cannot be listed or a file cannot be
	            read or written.
	    """
	    # List the inputs first so that a missing input folder does not leave a
	    # header-only CSV behind. os.listdir returns entries in arbitrary order.
	    txt_files = sorted(
	        name for name in os.listdir(input_folder)
	        if name.lower().endswith(".txt")
	    )

	    out_dir = os.path.dirname(output_csv)
	    if out_dir:
	        os.makedirs(out_dir, exist_ok=True)

	    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
	        writer = csv.writer(csvfile)
	        writer.writerow(["user prompt", "title", "cover prompt", "video prompt"])

	        for filename in txt_files:
	            full_path = os.path.join(input_folder, filename)

	            # Extract text corresponding to JSON keys
	            title, cover_prompt, video_prompt = extract_prompts_from_file(full_path)

	            # User prompt is the filename without the extension
	            user_prompt = os.path.splitext(filename)[0]

	            writer.writerow([user_prompt, title, cover_prompt, video_prompt])


	if __name__ == "__main__":
	    # Script entry point: for every (type, rag) combination, read the
	    # matching input folder of .txt files and write one CSV of prompts.

	    # 1) Change to your txt folder path. Folder names used in earlier runs:
	    #    creation_outputs_ai_concrete_rag_50_testset
	    #    baseline_concrete_outputs_2
	    #    creation_outputs_ai_concrete_rag_50_tags_4_testset
	    #    creation_rag_cot_prompt_ai_abstract_rag_50_testset_deepseek
	    #    creation_rag_cot_prompt_ai_concrete_rag_50_testset_deepseekr1
	    #    baseline_concrete_outputs_deepseekr1
	    typs = ["concrete"]  # "concrete",
	    rags = [50]  # 0,20,40,60,80,120,140
	    for typ in typs:
	        for rag in rags:
	            input_folder_path = f"creation_rag_cot_prompt_ai_{typ}_rag_{rag}_tags_1_testset"

	            # 2) Change output CSV path. Paths used in earlier runs:
	            #    output_prompt_baseline/prompt_baseline_abstract_2.csv
	            #    output_prompt_rag_more
	            #    output_prompt_baseline/prompt_baseline_concrete_gpt4o.csv
	            #    output_prompt_rag_more/prompt_ai_concrete_rag_50_testset_deepseekr1.csv
	            output_csv_file = f"output_prompt_rag_more/prompt_ai_{typ}_rag_{rag}_tags_1_testset.csv"

	            process_txt_files(input_folder_path, output_csv_file)
	            # Report each generated file, not only the last one.
	            print("Processing complete! Results written to:", output_csv_file)