File size: 4,109 Bytes
67543b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# Quick sanity check: load the locally cached HF dataset, show its summary,
# and pause so the output can be inspected before the rest of the script runs.
from datasets import load_dataset

data = load_dataset("/map-vepfs/siwei/coig/hf/test_dataset")
print(data)
input()  # debug pause — press Enter to continue


import json

import requests
from tqdm import tqdm

# SECURITY NOTE(review): hard-coded API credential and a plain-HTTP endpoint.
# Move the key into an environment variable / secrets store and rotate it;
# prefer an HTTPS endpoint so the bearer token is not sent in cleartext.
api_key = "sk-lNZE6m8qs8dbhu6GDb5763Ea728041B08fB9D8EfB98fD57f"
api_url = "http://180.184.175.69:3000/v1/chat/completions"

def get_GPT_4_judgment(openai_api_url, openai_api_key, messages):
    """Send a chat request to an OpenAI-compatible chat-completions endpoint.

    :param openai_api_url: full URL of the /v1/chat/completions endpoint
    :param openai_api_key: bearer token placed in the Authorization header
    :param messages: either a plain prompt string (wrapped into a single
        user turn) or a ready-made list of {"role", "content"} dicts
    :return: the raw HTTP response body as text; callers are expected to
        json.loads() it and extract choices[0].message.content themselves
    """
    if isinstance(messages, str):
        # Wrap a bare prompt into the single-turn chat format.
        messages = [{"role": "user", "content": messages}]

    payload = json.dumps({
        "model": "gpt-4o",
        "messages": messages,
        "temperature": 0,  # deterministic judgments
    })
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {openai_api_key}'
    }
    # NOTE(review): no timeout is set, so a hung server blocks forever.
    # Consider requests.post(..., timeout=...) once callers handle Timeout.
    response = requests.post(openai_api_url, headers=headers, data=payload)
    return response.text


def save_json(data, file_path, indent=4, ensure_ascii=False):
    """Serialize *data* to a JSON file, reporting the outcome on stdout.

    Any failure (unserializable data, unwritable path, ...) is caught and
    printed rather than raised — this is a best-effort saver.

    :param data: JSON-serializable object (typically a dict or list)
    :param file_path: destination file path
    :param indent: pretty-print indentation level (default 4)
    :param ensure_ascii: escape non-ASCII characters when True; the default
        False writes CJK and other Unicode text as-is
    """
    try:
        serialized = json.dumps(data, indent=indent, ensure_ascii=ensure_ascii)
        with open(file_path, 'w', encoding='utf-8') as fh:
            fh.write(serialized)
        print(f"JSON文件已成功保存到 {file_path}")
    except Exception as e:
        print(f"保存JSON文件时出错: {e}")


def load_json(file_path):
    """Read a JSON file and return the parsed payload.

    :param file_path: path to the JSON file
    :return: the parsed data (dict or list), or None when the file is
        missing, is not valid JSON, or any other error occurs; each
        failure mode prints a diagnostic message instead of raising
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as fh:
            return json.load(fh)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except json.JSONDecodeError:
        print(f"Error: File '{file_path}' is not a valid JSON file.")
    except Exception as e:
        print(f"Unexpected error: {e}")
    return None

if __name__ == '__main__':
    # Stage 1: select long, prose-like Wikipedia articles from a local
    # HF dataset dump (hard-coded cluster path).
    data = load_dataset('/map-vepfs/huggingface/datasets/OpenLLM-France/wikipedia', split="train")
    select_data = []
    for item in tqdm(data):
        # Keep articles that are long (> 1000 space-separated tokens),
        # do not mention 'Calendar', and contain few '|' characters
        # (presumably to skip table-heavy pages — TODO confirm).
        if len(item['text'].split(' ')) > 1000 and 'Calendar' not in item['text'] and item['text'].count('|') < 10:

            select_data.append(item)

            if len(select_data) > 10000:
                break
        # print(item.key())
    # NOTE(review): the loop breaks only after collecting 10001 items;
    # the slice below trims the overshoot back to exactly 10k.
    select_data = select_data[:10000]
    print('over')
    # NOTE(review): debug pause — the script blocks here until Enter is pressed.
    input()
    save_json(select_data, './wiki/wiki_data_10k.json', indent=4, ensure_ascii=False)     
    # Stage 2: re-load the saved selection and split each article into
    # sections on the '##' marker; '###' is first rewritten to '---',
    # presumably so deeper headings don't get split as well — TODO confirm.
    data = load_json('./wiki/wiki_data_10k.json')
    new_data = []
    for item in tqdm(data):
        text = item['text']
        if '###' in text:
            text = text.replace('###', '---')
        text_splits = text.split('##')
        item['text_splits'] = text_splits
        new_data.append(item)
    save_json(new_data, './wiki/wiki_data_10k.json', indent=4, ensure_ascii=False)

    # Stage 3 (disabled): summarize each section into bullet points via the
    # GPT-4o endpoint and save incrementally. Note it references
    # `text_splits` from the Stage-2 loop, not a per-item value — would need
    # fixing before re-enabling.
    # data = new_data
    # new_data = []
    # for item in tqdm(data):
    #     bullet_points = []   
    #     for ts in text_splits:
    #         prompt2 = f"Give you a text: {ts} \n\n Please help me summarize the headline of the text in bullet points, and each bullet point only have 1-2 sentences."

    #         messages = [
    #             {"role": "user", 
    #             "content":  prompt2},
    #         ]

    #         response = get_GPT_4_judgment(api_url, api_key, messages)
    #         response = json.loads(response)
    #         response = response['choices'][0]['message']['content']
    #         bullet_points.append(response)
    #     item['bullet_points'] = bullet_points
    #     new_data.append(item)

    #     save_json(new_data, './wiki/wiki_data_summarize_300.json', indent=4, ensure_ascii=False)