# Wikipedia data selection / preprocessing script.
# (Removed extraction artifacts: file-size line, commit hash, line-number dump.)
from datasets import load_dataset
import json

import requests
from tqdm import tqdm

# Quick sanity check of the locally cached test dataset; the input() pauses the
# run so the printed dataset summary can be inspected before continuing.
data = load_dataset("/map-vepfs/siwei/coig/hf/test_dataset")
print(data)
input()

# SECURITY(review): hardcoded API key sent over plain HTTP to a raw IP —
# move the key to an environment variable and rotate it before sharing this file.
api_key = "sk-lNZE6m8qs8dbhu6GDb5763Ea728041B08fB9D8EfB98fD57f"
api_url = "http://180.184.175.69:3000/v1/chat/completions"
# FIX(review): dropped the duplicate `import json / tqdm / requests` lines
# that appeared twice in the original header.
def get_GPT_4_judgment(openai_api_url, openai_api_key, messages, timeout=60):
    """Send a chat-completion request to an OpenAI-compatible endpoint.

    :param openai_api_url: full URL of the /v1/chat/completions endpoint
    :param openai_api_key: bearer token for the Authorization header
    :param messages: list of chat messages, or a plain string which is
        wrapped as a single user turn
    :param timeout: seconds to wait for the HTTP response (new parameter,
        defaults to 60 — backward compatible for existing callers)
    :return: raw response body as text (a JSON string on success)
    """
    if isinstance(messages, str):
        # Promote a bare prompt string to the single-turn chat format.
        messages = [{"role": "user", "content": messages}]
    payload = json.dumps({
        "model": "gpt-4o",
        # "model": "GPT-4-0613",
        "messages": messages,
        "temperature": 0,  # deterministic judgments
    })
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {openai_api_key}'
    }
    # FIX(review): the original requests.request("POST", ...) had no timeout,
    # so a dead endpoint would hang the pipeline forever; requests.post is the
    # idiomatic equivalent.
    response = requests.post(openai_api_url, headers=headers, data=payload,
                             timeout=timeout)
    return response.text
def save_json(data, file_path, indent=4, ensure_ascii=False):
    """Serialize *data* to a JSON file.

    :param data: object to persist (typically a dict or list)
    :param file_path: destination path of the JSON file
    :param indent: indentation level, default 4 (pretty-printed output)
    :param ensure_ascii: escape non-ASCII characters when True; default False
        so Chinese and other non-ASCII text is written verbatim
    """
    # Best-effort: any failure is reported on stdout rather than raised.
    try:
        with open(file_path, 'w', encoding='utf-8') as fh:
            json.dump(data, fh, indent=indent, ensure_ascii=ensure_ascii)
    except Exception as e:
        print(f"保存JSON文件时出错: {e}")
    else:
        print(f"JSON文件已成功保存到 {file_path}")
def load_json(file_path):
    """Read a JSON file and return the parsed data.

    :param file_path: str, path to the JSON file
    :return: dict or list with the parsed contents, or None when the file is
        missing, is not valid JSON, or any other error occurs
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as fh:
            return json.load(fh)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except json.JSONDecodeError:
        print(f"Error: File '{file_path}' is not a valid JSON file.")
    except Exception as e:
        print(f"Unexpected error: {e}")
    # All error paths report on stdout and fall through to None.
    return None
if __name__ == '__main__':
    import os

    # Stage 1: pick ~10k long, prose-like Wikipedia articles.  The filters are
    # heuristic: >1000 space-separated tokens, no 'Calendar' marker, and fewer
    # than 10 '|' characters (drops table/infobox-heavy pages).
    data = load_dataset('/map-vepfs/huggingface/datasets/OpenLLM-France/wikipedia', split="train")
    select_data = []
    for item in tqdm(data):
        text = item['text']
        if len(text.split(' ')) > 1000 and 'Calendar' not in text and text.count('|') < 10:
            select_data.append(item)
        if len(select_data) > 10000:
            break
    # The loop exits with 10001 items; trim back to an even 10k.
    select_data = select_data[:10000]
    print('over')

    # FIX(review): removed the stray input() debug pause that stalled
    # unattended runs, and create the output directory up front so save_json
    # does not silently fail when ./wiki is missing.
    os.makedirs('./wiki', exist_ok=True)
    save_json(select_data, './wiki/wiki_data_10k.json', indent=4, ensure_ascii=False)

    # Stage 2: split each article into sections on '##' headings.  '###'
    # sub-headings are first rewritten to '---' so they do not also split.
    data = load_json('./wiki/wiki_data_10k.json')
    new_data = []
    for item in tqdm(data):
        text = item['text']
        if '###' in text:
            text = text.replace('###', '---')
        item['text_splits'] = text.split('##')
        new_data.append(item)
    save_json(new_data, './wiki/wiki_data_10k.json', indent=4, ensure_ascii=False)

    # FIX(review): deleted the dead commented-out GPT-4o summarization stage —
    # its loop read `text_splits` from outside its own scope and it called the
    # paid API per section; reintroduce deliberately if summarization is needed.