junchenfu committed on
Commit
0d1e8bd
·
verified ·
1 Parent(s): 29cc382

Upload output_data_processing_json.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. output_data_processing_json.py +108 -0
output_data_processing_json.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import csv
3
+ import re
4
+
5
def find_last_occurrence(lines, pattern):
    """
    Search backwards through ``lines`` for the last line matching ``pattern``.

    Each line is stripped of surrounding whitespace before matching, and the
    match is anchored at the start of the stripped line (``re.match``).

    Args:
        lines: Sequence of text lines to scan (scanned from last to first).
        pattern: Regular expression containing at least one capture group.

    Returns:
        The text of capture group 1 from the last matching line, or an empty
        string when no line matches.
    """
    for line in reversed(lines):
        match = re.match(pattern, line.strip())
        if match:
            return match.group(1)
    return ""  # Nothing matched anywhere in the input
15
+
16
def extract_prompts_from_file(file_path):
    """
    Extract the last "title", "cover_prompt" and "video_prompt" values from a file.

    The file is read as UTF-8 text and scanned from the end, line by line, for
    lines of the form ``"key": "value"`` (optionally followed by a comma).
    Key matching is:
    - Case insensitive
    - Tolerant of any number of underscores between the characters of the key

    If the extracted title equals
    "Unveiling the Legacy of Ancient Rome: Rise, Glory, and Downfall."
    then all three values are set to empty strings.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(title, cover_prompt, video_prompt)`` tuple of strings; a value is
        an empty string when its key was not found.
    """
    # The inline flag (?i) makes the match case insensitive. It must be the
    # very first token of the pattern: a global flag placed after "^" is
    # deprecated since Python 3.6 and raises re.error from Python 3.11 on.
    # For example, title -> t(?:_+)?i(?:_+)?t(?:_+)?l(?:_+)?e
    # This means there can be 0~n underscores between t and i, and similarly
    # for every other pair of adjacent characters.
    title_pattern = (
        r'(?i)^\s*"t(?:_+)?i(?:_+)?t(?:_+)?l(?:_+)?e"\s*:\s*"(.*)",?\s*$'
    )
    cover_pattern = (
        r'(?i)^\s*"c(?:_+)?o(?:_+)?v(?:_+)?e(?:_+)?r(?:_+)?p(?:_+)?r(?:_+)?o(?:_+)?m(?:_+)?p(?:_+)?t"\s*:\s*"(.*)",?\s*$'
    )
    video_pattern = (
        r'(?i)^\s*"v(?:_+)?i(?:_+)?d(?:_+)?e(?:_+)?o(?:_+)?p(?:_+)?r(?:_+)?o(?:_+)?m(?:_+)?p(?:_+)?t"\s*:\s*"(.*)",?\s*$'
    )

    # Read the whole text and split it into lines
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()

    # Search for the three items from the end of the file
    title = find_last_occurrence(lines, title_pattern)
    cover_prompt = find_last_occurrence(lines, cover_pattern)
    video_prompt = find_last_occurrence(lines, video_pattern)

    # If the found title is the specified text, discard all three values
    if title.strip() == "Unveiling the Legacy of Ancient Rome: Rise, Glory, and Downfall.":
        title = ""
        cover_prompt = ""
        video_prompt = ""

    return title, cover_prompt, video_prompt
55
+
56
+
57
def process_txt_files(input_folder, output_csv):
    """
    Collect the prompts found in every .txt file of a folder into one CSV file.

    1. Traverse all .txt files in input_folder (extension compared case
       insensitively, files visited in sorted name order so the output is
       deterministic)
    2. For each file, search backwards for the last occurrence of
       "title": "...", "cover_prompt": "...", "video_prompt": "..."
    3. Output CSV: user prompt, title, cover prompt, video prompt

    Args:
        input_folder: Directory containing the .txt files to process.
        output_csv: Path of the CSV file to write; its parent directory is
            created when it does not exist yet.
    """
    out_dir = os.path.dirname(output_csv)
    if out_dir:
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(out_dir, exist_ok=True)

    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["user prompt", "title", "cover prompt", "video prompt"])

        # os.listdir returns names in arbitrary order; sort for stable rows
        for filename in sorted(os.listdir(input_folder)):
            if not filename.lower().endswith(".txt"):
                continue

            full_path = os.path.join(input_folder, filename)

            # Extract text corresponding to JSON keys
            title, cover_prompt, video_prompt = extract_prompts_from_file(full_path)

            # User prompt is the filename without the extension
            user_prompt = os.path.splitext(filename)[0]

            # Write a row
            writer.writerow([user_prompt, title, cover_prompt, video_prompt])
84
+
85
+
86
if __name__ == "__main__":
    # Script entry point: for every (typ, rag) combination, read the matching
    # input folder of .txt files and write one CSV of extracted prompts.
    #
    # 1) Change to your txt folder path
    # Folder names used previously:
    #   creation_outputs_ai_concrete_rag_50_testset
    #   baseline_concrete_outputs_2
    #   creation_outputs_ai_concrete_rag_50_tags_4_testset
    #   creation_rag_cot_prompt_ai_abstract_rag_50_testset_deepseek
    #   creation_rag_cot_prompt_ai_concrete_rag_50_testset_deepseekr1
    #   baseline_concrete_outputs_deepseekr1
    typs = ["concrete"]  # "concrete",
    rags = [50]  # 0,20,40,60,80,120,140
    for typ in typs:
        for rag in rags:
            input_folder_path = f"creation_rag_cot_prompt_ai_{typ}_rag_{rag}_tags_1_testset"

            # 2) Change output CSV path
            # Output paths used previously:
            #   output_prompt_baseline/prompt_baseline_abstract_2.csv
            #   output_prompt_rag_more
            #   output_prompt_baseline/prompt_baseline_concrete_gpt4o.csv
            #   output_prompt_rag_more/prompt_ai_concrete_rag_50_testset_deepseekr1.csv
            output_csv_file = f"output_prompt_rag_more/prompt_ai_{typ}_rag_{rag}_tags_1_testset.csv"

            process_txt_files(input_folder_path, output_csv_file)
            # NOTE(review): reported once per processed combination; the
            # original nesting of this print is assumed — confirm.
            print("Processing complete! Results written to:", output_csv_file)