Tsukihjy/testcase / testcase-data /Ours /get_content_wrong.py
Tsukihjy's picture
download
raw
973 Bytes
import re
# Compiled once at import time and reused on every call.
#
# Each alternative only needs ONE non-whitespace character next to its
# marker to decide whether a match exists. The earlier form used greedy
# runs such as ``[^\s]+\.jpg``, which forces the regex engine to backtrack
# over every long whitespace-free run (typical for Chinese text) and makes
# the search quadratic. Because only the yes/no answer is used, the
# single-character form accepts exactly the same inputs in linear time.
_LINK_OR_IMAGE_RE = re.compile(
    r'https?://\S'              # http:// or https:// followed by something
    r'|www\.\S'                 # bare "www." host followed by something
    r'|\S\.(?:jpe?g|png|gif)'   # <something>.jpg / .jpeg / .png / .gif
)


def contains_link_or_image(text):
    """Return True if *text* appears to contain a web link or image file name.

    A match is any of:
      * ``http://`` or ``https://`` followed by a non-whitespace character,
      * ``www.`` followed by a non-whitespace character,
      * a non-whitespace character followed by ``.jpg``, ``.jpeg``,
        ``.png`` or ``.gif``.

    The extensions are matched case-sensitively and are not required to end
    the token, so ``"a.jpgx"`` matches while ``"a.JPG"`` does not.

    Args:
        text: The string to scan.

    Returns:
        ``True`` when at least one of the patterns occurs, else ``False``.
    """
    return _LINK_OR_IMAGE_RE.search(text) is not None
def contains_html_tags(text):
    """Return True if *text* contains something shaped like an HTML tag.

    A "tag" here is a ``<``, then one or more characters other than ``>``,
    then a closing ``>``. No check is made that the content is a real tag
    name, so comparison text such as ``"1 < 2 and 3 > 2"`` also matches.

    Args:
        text: The string to scan.

    Returns:
        ``True`` when such a span is found, else ``False``.
    """
    tag_like = r'<[^>]+>'
    return re.search(tag_like, text) is not None
import json
# Dataset to scan for entries whose query contains markup, links or images.
DATA_PATH = "/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/tcb_v7_add_data_no_en.jsonl"


def _load_records(path):
    """Load the records stored at *path*.

    The file is first parsed as a single JSON document. If that fails
    (the path has a ``.jsonl`` extension, and real JSON Lines content makes
    ``json.loads`` fail with "Extra data"), every non-blank line is parsed
    as its own JSON value instead.

    Args:
        path: Location of the UTF-8 encoded data file.

    Returns:
        The decoded JSON document, or a list with one decoded value per line.

    Raises:
        json.JSONDecodeError: If the content is valid in neither form.
    """
    # The context manager closes the handle even if reading fails.
    with open(path, "r", encoding="utf-8") as handle:
        raw = handle.read()
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # Split on "\n" only: str.splitlines() would also break on
        # separators that may legally appear unescaped inside JSON strings.
        return [json.loads(line) for line in raw.split("\n") if line.strip()]


add_data = _load_records(DATA_PATH)

# Collect the ids of all records whose query text needs a manual check.
check_list = []
for item in add_data:
    tcb_id = item['tcb_id']
    content = item['query']
    if contains_html_tags(content) or contains_link_or_image(content):
        check_list.append(tcb_id)

# NOTE(review): the header is printed once, before the id list. The
# indentation of the original script was lost, so confirm the print was not
# meant to run once per flagged record.
print("包含 HTML 标签")
# str() keeps the join working if the ids are stored as non-string values.
print("\n".join(map(str, check_list)))

Xet Storage Details

Size:
973 Bytes
·
Xet hash:
e6f2793db252c0530ed876a8df9e9d94dbe787ec5191e462a8a880a7f5ed79b6

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.