| """ |
| - Convert html to markdown with basic data cleaning. |
| - Deduplication. |
| |
| Usage: |
| python3 -m fastchat.data.clean_sharegpt --in sharegpt_html.json --out sharegpt_clean.json |
| """ |
| import argparse |
| from concurrent.futures import ProcessPoolExecutor |
| import json |
| import logging |
| import re |
| from typing import Dict, Union |
|
|
| import bs4 |
| import markdownify |
| from tqdm import tqdm |
|
|
|
|
# Patterns for stripping ShareGPT UI artifacts around the markdown conversion.
# Raw strings avoid invalid-escape-sequence warnings for \s, \d, \g on
# modern Python; behavior is unchanged.
div_pattern = re.compile(r"<div.*?>")
span_pattern = re.compile(r"<span.*?>")
# Matches a fenced code block whose fence line was rendered as
# "<lang>Copy code<body>"; group 1 = language tag, group 2 = code body.
code_lang_pattern = re.compile(
    r"```\s*" + r"(.*?)" + r"(?:Copy code)+" + r"(.+?)" + r"\s*?```", re.DOTALL
)
# Replacement template: re.sub still expands \g<N> refs and \n escapes
# inside a raw string, so this produces "```<lang>\n<body>\n```".
code_lang_format = r"```\g<1>\n\g<2>\n```"
# "N / M" regeneration counter prepended by the ShareGPT UI.
regenerate_pattern = re.compile(r"\d+ / \d+")
copy_chars_pattern = re.compile(r"Copy\d+ chars / \d+ words")
copy_code_pattern = re.compile(r"```(.*?)Copy code\s*```")
|
|
|
|
def reformat_code(val: str) -> str:
    """Rewrite "<lang>Copy code<body>" fenced blocks as proper markdown fences."""
    return code_lang_pattern.sub(code_lang_format, val)
|
|
|
|
def html_to_markdown(val: str) -> str:
    """Convert one ShareGPT HTML message into cleaned markdown text."""
    # Drop opening div/span tags before handing the HTML to markdownify.
    val = div_pattern.sub("", val)
    val = span_pattern.sub("", val)
    val = markdownify.markdownify(val).strip()
    val = reformat_code(val)

    # Strip a leading "N / M" regeneration counter, if one is present.
    match = regenerate_pattern.search(val)
    if match is not None and match.start() == 0:
        val = val[match.end() :]

    # Remove leftover "Copy ..." UI artifacts.
    val = copy_chars_pattern.sub("", val)
    val = copy_code_pattern.sub("", val)

    # Collapse runs of blank lines and trim surrounding whitespace.
    return val.replace("\n\n\n", "\n").strip()
|
|
|
|
def contain_blocked_words(val: str) -> bool:
    """Return True if the text mentions any blocked keyword (case-insensitive)."""
    lowered = val.lower()
    return any(word in lowered for word in ("openai", "chatgpt"))
|
|
|
|
def clean_html_one_sample(sample):
    """Clean one ShareGPT sample in place and report why it should be kept or dropped.

    Returns (sample, error_code) with error_code:
      0 = ok, 1 = too short after trimming, 2 = roles do not alternate
      human/gpt, 3 = contains blocked words, 4 = HTML parser failure.
    """
    roles = ["human", "gpt"]
    convs = sample["conversations"]

    if len(convs) <= 1:
        return (sample, 1)

    # Drop a leading non-human turn so conversations start with "human".
    if convs[0]["from"] != "human":
        convs = sample["conversations"] = convs[1:]
        if len(convs) <= 1:
            return (sample, 1)

    # Drop a trailing human turn so conversations end with "gpt".
    if convs[-1]["from"] == "human":
        convs = sample["conversations"] = convs[:-1]
        if len(convs) <= 1:
            return (sample, 1)

    for idx, turn in enumerate(convs):
        # Roles must strictly alternate human, gpt, human, gpt, ...
        if turn["from"] != roles[idx % 2]:
            return (sample, 2)

        if contain_blocked_words(turn["value"]):
            return (sample, 3)

        try:
            cleaned = html_to_markdown(turn["value"])
        except (bs4.builder.ParserRejectedMarkup, AssertionError):
            return (sample, 4)

        turn["value"] = cleaned

    return (sample, 0)
|
|
|
|
def clean_html_all(content, begin, end):
    """
    Clean the source html files.

    Converts each sample's HTML to markdown in parallel, then drops samples
    that are too short, mis-formatted, blocked, unparsable, or duplicates
    (by id, or by the value of the second message plus conversation length).
    Returns the list of surviving samples.
    """
    cnt_skip = 0
    cnt_blocked_words = 0
    cnt_wrong_format = 0
    cnt_parser_error = 0
    cnt_too_short = 0
    cnt_id_duplication = 0
    cnt_value_duplication = 0

    content = content[begin:end]
    processed = []
    # The HTML-to-markdown conversion is CPU-bound, so fan out across processes.
    with ProcessPoolExecutor() as executor:
        for result in tqdm(
            executor.map(clean_html_one_sample, content), total=len(content)
        ):
            processed.append(result)

    visited = {}
    new_content = []
    for sample, error_code in tqdm(processed):
        cid = sample["id"]
        skipped = True

        if error_code != 0:
            if error_code == 1:
                print(f"id {cid} is too short")
                cnt_too_short += 1
            elif error_code == 2:
                print(f"id {cid} has a wrong format")
                cnt_wrong_format += 1
            elif error_code == 3:
                print(f"id {cid} contains blocked words")
                cnt_blocked_words += 1
            elif error_code == 4:
                print(f"id {cid} contains parser errors")
                cnt_parser_error += 1
            else:
                raise ValueError(f"Invalid error_code: {error_code}")
        elif cid in visited:
            print(f"id {cid} is an id duplication of {visited[cid]}")
            cnt_id_duplication += 1
        else:
            # Value-level dedup key; conversations has >= 2 entries here
            # because shorter samples were rejected with error_code 1.
            key = (sample["conversations"][1]["value"], len(sample["conversations"]))
            if key in visited:
                print(f"id {cid} is a value duplication of {visited[key]}")
                cnt_value_duplication += 1
            else:
                # Record both keys so later samples can match either one.
                visited[cid] = visited[key] = cid
                skipped = False

        if not skipped:
            new_content.append(sample)
        else:
            cnt_skip += 1

    print(
        f"total: {len(content)}, skip: {cnt_skip}, new: {len(new_content)}, "
        f"cnt_blocked_words: {cnt_blocked_words}, cnt_parser_error: {cnt_parser_error}, "
        f"cnt_wrong_format: {cnt_wrong_format}, "
        f"cnt_too_short: {cnt_too_short}, cnt_id_duplication: {cnt_id_duplication}, "
        f"cnt_value_duplication: {cnt_value_duplication}, "
    )

    return new_content
|
|
|
|
def main(args):
    """Load samples from args["in_file"], clean them, and write JSON to args["out_file"]."""
    # Context managers close the file handles; the original open() calls
    # leaked them (never closed, output possibly unflushed on error).
    with open(args["in_file"], "r") as fin:
        content = json.load(fin)
    content = clean_html_all(content, args["begin"], args["end"])
    with open(args["out_file"], "w") as fout:
        json.dump(content, fout, indent=2)
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point; see the module docstring for usage.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--in-file", type=str, required=True)
    arg_parser.add_argument("--out-file", type=str, default="sharegpt_clean.json")
    arg_parser.add_argument("--begin", type=int)
    arg_parser.add_argument("--end", type=int)
    arg_parser.add_argument("--debug", action="store_true")
    main(vars(arg_parser.parse_args()))
|
|