| import re | |
def contains_link_or_image(text):
    """Report whether *text* contains a web link or an image file name.

    A hit is any of the following, found anywhere in the string:

    * ``http://`` or ``https://`` followed by at least one non-whitespace
      character;
    * ``www.`` followed by at least one non-whitespace character;
    * a run of non-whitespace characters followed by ``.jpg``, ``.png``,
      ``.gif`` or ``.jpeg``.

    Matching is case-insensitive, so ``HTTPS://...``, ``WWW....`` and
    ``photo.JPG`` are detected as well.

    Args:
        text: The string to scan.

    Returns:
        True if any of the patterns matches, otherwise False.
    """
    # Pattern for web links and image file names.
    url_pattern = r'(https?://[^\s]+)|(www\.[^\s]+)|([^\s]+\.jpg|[^\s]+\.png|[^\s]+\.gif|[^\s]+\.jpeg)'
    # URL schemes, host names and file extensions are routinely written in
    # upper or mixed case, so ignore case rather than miss those inputs.
    return re.search(url_pattern, text, flags=re.IGNORECASE) is not None
def contains_html_tags(text):
    """Report whether *text* contains something shaped like an HTML tag.

    The check looks for a ``<``, then one or more characters that are not
    ``>``, then a closing ``>``. It does not validate tag names, so any such
    bracketed span counts (for example ``1 < 2 and 3 > 2``), while an empty
    ``<>`` does not.

    Args:
        text: The string to scan.

    Returns:
        True if such a span occurs anywhere in text, otherwise False.
    """
    # Pattern for an opening angle bracket, a non-empty body, and a closing one.
    tag_match = re.compile(r'<[^>]+>').search(text)
    return tag_match is not None
| import json | |
def _load_records(path):
    """Read a list of records from a JSON document or a JSON Lines file.

    Args:
        path: Location of a UTF-8 encoded file.

    Returns:
        The parsed top-level value when the file is a single JSON document
        (a lone object is wrapped in a one-element list), otherwise a list
        with one parsed value per non-blank line.

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If a line is not valid JSON during the
            line-by-line fallback.
    """
    # The context manager closes the handle even if reading fails.
    with open(path, "r", encoding="utf-8") as handle:
        raw = handle.read()
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        # Not a single JSON document: treat it as JSON Lines. Split on "\n"
        # only, because str.splitlines() also breaks on characters such as
        # U+2028 that may legally appear inside JSON strings.
        return [json.loads(line) for line in raw.split("\n") if line.strip()]
    # A lone object (e.g. a one-record JSON Lines file) is wrapped so the
    # caller always iterates over records rather than over dict keys.
    return [parsed] if isinstance(parsed, dict) else parsed


# Records to audit; each one is read through its 'tcb_id' and 'query' keys.
add_data = _load_records("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/tcb_v7_add_data_no_en.jsonl")

# Ids of records whose query contains an HTML-like tag, a link, or an image name.
check_list = []
for item in add_data:
    tcb_id = item['tcb_id']
    content = item['query']
    if contains_html_tags(content) or contains_link_or_image(content):
        check_list.append(tcb_id)
        # NOTE(review): the original indentation was lost, so it is unclear
        # whether this message belongs here (once per hit) or after the loop
        # (once in total) - confirm. It is also printed for link/image hits,
        # although the text only mentions HTML tags.
        print("包含 HTML 标签")
# str() keeps the join from raising TypeError if the ids are not strings.
print("\n".join(map(str, check_list)))
Xet Storage Details
- Size:
- 973 Bytes
- Xet hash:
- e6f2793db252c0530ed876a8df9e9d94dbe787ec5191e462a8a880a7f5ed79b6
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.