Upload output_data_processing_json.py with huggingface_hub
Browse files- output_data_processing_json.py +108 -0
output_data_processing_json.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import csv
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
def find_last_occurrence(lines, pattern):
    """Scan *lines* from the end and return the first regex capture found.

    Each candidate line is stripped of surrounding whitespace before being
    matched (anchored at the start, via ``re.match``).  Returns the text of
    capture group 1 for the last matching line, or an empty string when no
    line matches.
    """
    for candidate in lines[::-1]:
        hit = re.match(pattern, candidate.strip())
        if hit is not None:
            return hit.group(1)
    # Nothing matched anywhere in the file.
    return ""
|
| 15 |
+
|
| 16 |
+
def extract_prompts_from_file(file_path):
    """Pull the last "title", "cover_prompt" and "video_prompt" values from a file.

    The file is scanned from the end, so the *last* occurrence of each key
    wins.  Matching is case-insensitive and tolerates runs of underscores
    inserted between the characters of a key (e.g. ``t_itle``,
    ``cover__prompt``).

    If the extracted title equals the placeholder
    "Unveiling the Legacy of Ancient Rome: Rise, Glory, and Downfall."
    all three returned values are reset to empty strings.

    Returns:
        tuple[str, str, str]: (title, cover_prompt, video_prompt); empty
        strings for any key that is not found.
    """

    def _fuzzy_key(key):
        # Allow zero or more underscores between consecutive characters,
        # e.g. "title" -> t(?:_+)?i(?:_+)?t(?:_+)?l(?:_+)?e
        return r"(?:_+)?".join(re.escape(ch) for ch in key)

    def _kv_pattern(key):
        # BUGFIX: the inline (?i) flag must be the very first element of the
        # pattern.  The original wrote r'^(?i)...', which raises re.error
        # ("global flags not at the start of the expression") on Python 3.11+.
        return r'(?i)^\s*"' + _fuzzy_key(key) + r'"\s*:\s*"(.*)",?\s*$'

    title_pattern = _kv_pattern("title")
    cover_pattern = _kv_pattern("coverprompt")
    video_pattern = _kv_pattern("videoprompt")

    # Read the whole file as a list of lines.
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()

    # Search from the end of the file for each of the three keys.
    title = find_last_occurrence(lines, title_pattern)
    cover_prompt = find_last_occurrence(lines, cover_pattern)
    video_prompt = find_last_occurrence(lines, video_pattern)

    # This specific title is treated as a sentinel: discard all three values.
    if title.strip() == "Unveiling the Legacy of Ancient Rome: Rise, Glory, and Downfall.":
        title = ""
        cover_prompt = ""
        video_prompt = ""

    return title, cover_prompt, video_prompt
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def process_txt_files(input_folder, output_csv):
    """Collect prompt data from every .txt file in *input_folder* into a CSV.

    For each text file, the last occurrence of the "title", "cover_prompt"
    and "video_prompt" entries is extracted via
    ``extract_prompts_from_file`` and written as one CSV row:
    (user prompt, title, cover prompt, video prompt), where the user prompt
    is the file name without its extension.
    """
    # Make sure the destination directory exists before opening the CSV.
    parent = os.path.dirname(output_csv)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)

    with open(output_csv, 'w', newline='', encoding='utf-8') as out:
        csv_writer = csv.writer(out)
        csv_writer.writerow(["user prompt", "title", "cover prompt", "video prompt"])

        # Visit every .txt file in the input folder.
        for entry in os.listdir(input_folder):
            if not entry.lower().endswith(".txt"):
                continue
            source_path = os.path.join(input_folder, entry)

            # Extract the text bound to the three JSON-style keys.
            title, cover_prompt, video_prompt = extract_prompts_from_file(source_path)

            # The file-name stem doubles as the original user prompt.
            stem = os.path.splitext(entry)[0]
            csv_writer.writerow([stem, title, cover_prompt, video_prompt])
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
if __name__ == "__main__":
    # 1) Input folders follow the naming scheme below; adjust `prompt_types`
    #    and `rag_sizes` to select which experiment outputs to process.
    #    Previously processed folders, for reference:
    #      creation_outputs_ai_concrete_rag_50_testset
    #      baseline_concrete_outputs_2
    #      creation_outputs_ai_concrete_rag_50_tags_4_testset
    #      creation_rag_cot_prompt_ai_abstract_rag_50_testset_deepseek
    #      creation_rag_cot_prompt_ai_concrete_rag_50_testset_deepseekr1
    #      baseline_concrete_outputs_deepseekr1
    prompt_types = ["concrete"]  # e.g. "concrete", "abstract"
    rag_sizes = [50]  # e.g. 0, 20, 40, 60, 80, 120, 140

    for prompt_type in prompt_types:
        for rag_size in rag_sizes:
            input_folder_path = f"creation_rag_cot_prompt_ai_{prompt_type}_rag_{rag_size}_tags_1_testset"

            # 2) Output CSV path.  Earlier runs wrote to, e.g.:
            #      output_prompt_baseline/prompt_baseline_abstract_2.csv
            #      output_prompt_baseline/prompt_baseline_concrete_gpt4o.csv
            #      output_prompt_rag_more/prompt_ai_concrete_rag_50_testset_deepseekr1.csv
            output_csv_file = f"output_prompt_rag_more/prompt_ai_{prompt_type}_rag_{rag_size}_tags_1_testset.csv"

            process_txt_files(input_folder_path, output_csv_file)
            print("Processing complete! Results written to:", output_csv_file)
|