Spaces:
Sleeping
Sleeping
| import json | |
| import re | |
| def refine_title(title: str, strict_title_match=False): | |
| # TODO : 这里可能需要更复杂的title清洗逻辑 | |
| # 删除title中的空格与换行符 | |
| title = re.sub(r'\s+', '', title) | |
| if not strict_title_match: | |
| try: | |
| # 优先提取阿拉伯数字章节编号(如1.1,2等) | |
| new_title = re.search(r"\d+\.\d+|\d+", title).group() | |
| except: | |
| try: | |
| # 其次提取中文数字章节编号(如六、二十四等) | |
| new_title = re.search(r'[一二三四五六七八九零十百]+', title).group() | |
| except Exception: | |
| new_title = title | |
| title = new_title | |
| return title | |
| def merge_qa_pair(vqa_jsonl, output_jsonl, strict_title_match=False): | |
| already_complete_count = 0 | |
| question_list = [] | |
| answer_list = [] | |
| with open(vqa_jsonl, 'r', encoding='utf-8') as vqa_file: | |
| for line in vqa_file: | |
| data = json.loads(line) | |
| if data["question"] != "": | |
| question_list.append(data) | |
| else: | |
| # 用于支持题目在前面,答案在后面的pdf | |
| answer_list.append(data) | |
| with open(output_jsonl, 'w', encoding='utf-8') as out_file: | |
| chapter_id = 0 | |
| chapter_title = "" | |
| label = float('inf') | |
| questions = {} | |
| answers = {} | |
| for data in question_list: | |
| label_match = re.search(r'\d+', data["label"]) | |
| if label_match: | |
| data["label"] = label_match.group() | |
| if data["chapter_title"] == "": | |
| data["chapter_title"] = chapter_title | |
| try: | |
| data["label"] = int(data["label"]) | |
| except Exception: | |
| continue | |
| if data["chapter_title"] != "" and data["chapter_title"] != chapter_title: | |
| if data["label"] < label: | |
| chapter_id += 1 | |
| chapter_title = data["chapter_title"] | |
| else: | |
| # 如果题号增加,章节标题却发生变化,说明可能错误提取了子标题。因此继续使用之前的章节标题。 | |
| data["chapter_title"] = chapter_title | |
| label = data["label"] | |
| data["original_chapter_title"] = data["chapter_title"] | |
| data["chapter_title"] = refine_title(data["chapter_title"], strict_title_match) | |
| if data['label'] > 0: | |
| # 已经完整的题目直接写入out_file | |
| if data["answer"] or data["solution"]: | |
| already_complete_count += 1 | |
| qa_pair = { | |
| "question_chapter_title": data["original_chapter_title"], | |
| "answer_chapter_title": data["original_chapter_title"], | |
| "label": data['label'], | |
| "question": data["question"], | |
| "answer": data["answer"], | |
| "solution": data.get("solution", "") | |
| } | |
| out_file.write(json.dumps(qa_pair, ensure_ascii=False) + '\n') | |
| else: | |
| questions[(data["chapter_title"], data['label'])] = data | |
| chapter_id = 0 | |
| chapter_title = "" | |
| label = float('inf') | |
| for data in answer_list: | |
| label_match = re.search(r'\d+', data["label"]) | |
| if label_match: | |
| data["label"] = label_match.group() | |
| if data["chapter_title"] == "": | |
| data["chapter_title"] = chapter_title | |
| try: | |
| data["label"] = int(data["label"]) | |
| except Exception: | |
| continue | |
| if data["chapter_title"] != "" and data["chapter_title"] != chapter_title: | |
| if data["label"] < label: | |
| chapter_id += 1 | |
| chapter_title = data["chapter_title"] | |
| else: | |
| # 如果题号增加,章节标题却发生变化,说明可能错误提取了子标题。因此继续使用之前的章节标题。 | |
| data["chapter_title"] = chapter_title | |
| label = data["label"] | |
| data["original_chapter_title"] = data["chapter_title"] | |
| data["chapter_title"] = refine_title(data["chapter_title"], strict_title_match) | |
| # 动态更新,防止错误的重复label覆盖掉之前的solution或answer | |
| if data['label'] > 0: | |
| if not answers.get((data["chapter_title"], data['label'])): | |
| answers[(data["chapter_title"], data['label'])] = data | |
| else: | |
| if not answers[(data["chapter_title"], data['label'])].get("solution") and data.get("solution"): | |
| answers[(data["chapter_title"], data['label'])]["solution"] = data["solution"] | |
| if not answers[(data["chapter_title"], data['label'])].get("answer") and data.get("answer"): | |
| answers[(data["chapter_title"], data['label'])]["answer"] = data["answer"] | |
| for label in questions: | |
| if label in answers: | |
| qa_pair = { | |
| "question_chapter_title": questions[label]["original_chapter_title"], | |
| "answer_chapter_title": answers[label]["original_chapter_title"], | |
| "label": label[1], | |
| "question": questions[label]["question"], | |
| "answer": answers[label]["answer"], | |
| "solution": answers[label].get("solution", "") | |
| } | |
| out_file.write(json.dumps(qa_pair, ensure_ascii=False) + '\n') | |
| print(f"Merged QA pairs: {len(questions.keys() & answers.keys()) + already_complete_count}") | |
| def jsonl_to_md(jsonl_file, md_file): | |
| with open(jsonl_file, 'r', encoding='utf-8') as in_file, open(md_file, 'w', encoding='utf-8') as out_file: | |
| for line in in_file: | |
| data = json.loads(line) | |
| out_file.write(f"### Question {data['label']}\n\n") | |
| out_file.write(f"{data['question']}\n\n") | |
| out_file.write(f"**Answer:** {data['answer']}\n\n") | |
| if data.get('solution'): | |
| out_file.write(f"**Solution:**\n\n{data['solution']}\n\n") | |
| out_file.write("---\n\n") |