liangyi_LLaMA_Factory / data /preprocess_data /generate_fine_grained_data.py
Mickey25's picture
Upload folder using huggingface_hub
46b244e verified
#!/usr/bin/env python3
"""
将粗粒度的订单抽取数据转换为细粒度的训练数据
每个input只对应一个特定字段的抽取任务
支持JSON和JSONL两种输入格式
"""
import json
import random
import os
from typing import Dict, List, Any
# 业务范围数据定义
RESOURCE_SUBRESOURCES = {
"七里扬帆": [
"七里扬帆草莓采摘入园票", "七里扬帆小火车", "七里扬帆葫芦山庄餐饮", "七里扬帆门票",
"七里扬帆游船", "三江口游线(游船)", "七里扬帆停车场", "七里扬帆葫芦峡漂流", "七里扬帆包船"
],
"三都渔村": [
"三都渔村门票", "三都渔村玻璃水滑道", "三都渔村婚礼表演"
],
"严州古城": [
"严州古城门票", "严州古城展馆联票+电瓶车", "严州古城摇橹船"
],
"交运公司": [
"交运包车", "交运租车"
],
"千岛湖好运岛": [
"千岛湖好运岛草莓采摘(2斤)", "千岛湖好运岛门票", "千岛湖好运岛游船", "千岛湖好运岛停车场"
],
"千鹤妇女精神教育基地": [
"千鹤门票", "千鹤景区讲解项目", "千鹤会务"
],
"大慈岩": [
"大慈岩豆腐包制作体验", "大慈岩索道", "大慈岩玻璃栈道", "大慈岩丛林速滑(旱滑道)下行",
"大慈岩餐饮", "大慈岩中餐", "大慈岩门票", "大慈岩停车场"
],
"宿江公司": [
"江清月近人实景演艺门票", "江清月近人白天场"
],
"导服中心": [
"扬帆旅行社全陪导游", "扬帆旅行社地接导游", "大慈岩景区导服", "灵栖洞景区导服",
"新叶古村景区导服", "千岛湖好运岛景区导服", "七里扬帆景区导服", "严州古城景区导服",
"新安江景区导服", "千鹤景区导服"
],
"新叶古村": [
"新叶古村草莓采摘入园票", "新叶古村门票", "新叶古村餐饮", "新叶古村停车场"
],
"新安江": [
"新安江草莓采摘入园票", "新安江船餐", "新安江中餐", "新安江游船"
],
"景澜酒店": [
"住宿", "会务"
],
"汉庭酒店": [
"住宿", "餐饮"
],
"灵栖洞": [
"灵栖洞豆腐包制作体验", "考拉森林丛林探险", "灵栖洞西游魔毯", "灵栖洞极速滑道",
"灵栖洞餐饮", "灵栖洞中餐", "灵栖洞门票", "灵栖洞手划船", "灵栖洞停车场", "灵栖洞喊泉"
],
"玉泉寺": [
"玉泉寺门票"
],
"雷迪森酒店": [
"住宿", "会务"
]
}
# 所有combo/items类型的枚举值定义
COMBO_ITEMS_ENUM_VALUES = {
"business_items": [
"三都渔村婚礼表演", "喊泉", "小火车单程", "小火车往返", "考拉森林丛林探险亲子线",
"考拉森林丛林探险成人线", "草莓采摘入园票", "草莓采摘(2斤)", "豆腐包制作体验"
],
"guide_items": [
"七里扬帆景区导服", "严州古城景区导服", "千岛湖好运岛景区导服", "千鹤景区导服",
"大慈岩景区导服", "小小讲解员体验", "扬帆旅行社全陪导游", "扬帆旅行社地接导游",
"新叶古村景区导服", "新安江景区导服", "民兵体验", "灵栖洞景区导服", "讲解费"
],
"parking_items": [
"中型", "中巴车", "大型", "大客车", "大巴车", "大车", "小型车", "小客车", "小车", "摩托车"
],
"recreational_combo": [
"中远程及新增市场价格", "协议不成团价", "协议成团价", "团队价", "学生团队价", "挂牌价",
"旅投员工价", "景酒打包价", "老年团队价", "门市价", "非协议不成团价", "非协议成团价"
],
"recreational_items": [
"丛林速滑(旱滑道)下行", "极速滑道", "玻璃栈道", "玻璃水滑道", "索道上下行",
"索道上行", "索道下行", "西游魔毯"
],
"ship_combo": [
"中远程及新增市场价格", "团队优惠价", "团队优惠价_小扬帆_1500", "团队优惠价_小扬帆_600",
"团队优惠价_小扬帆_800", "学生团队价", "广德", "建德市民", "散客挂牌价", "旅投员工价",
"景酒打包价", "正常价", "特殊价", "研学团", "老年团队价"
],
"ship_items": [
"七里扬帆游船", "三江口游线", "千岛湖好运岛游船", "小扬帆", "快艇1号", "快艇2号",
"扬帆16号", "扬帆2号", "扬帆3号", "扬帆之星", "摇橹船", "新安江竹筏漂流", "新安江龙舟漂游",
"梦幻新安江", "江南秘境1号", "江南秘境2号", "江清月近人实景演艺船票", "浙旅江南秘境",
"船票", "葫芦峡漂流", "诗韵新安江(新安江-严州古城航线含自助茶歇、简餐)"
],
"ticket_combo": [
"", "30年教师证", "70周岁", "中远程及新增市场价格", "儿童", "其他", "军官证", "协议不成团价",
"协议成团价", "在校大学生", "学生团队价", "导游证", "广德", "建德就业", "建德市民",
"接待免票", "新闻记者证", "旅投员工价", "景酒打包价", "杭州市民卡", "杭州文旅卡/惠民卡",
"残疾证", "消防员证", "献血荣誉证", "病故军属家人", "研学团", "老年团队价",
"萧山、临平、西湖管委会免票", "退役证", "钱江分免票", "门市价", "青少年优惠",
"非协议不成团价", "非协议成团价", "非建德户籍免票", "高层次人才"
],
"ticket_items": [
"展馆", "江清月近人", "江清月近人白天场", "门票"
],
"meal_standard_table": [
300, 400, 500, 600, 700, 800, 1000
],
"meal_standard": [
0, 20, 25, 30, 35, 40, 45, 50, 55, 60, 18
],
"meal_types": [
"中餐", "前程似锦宴", "金玉满堂宴", "阖家团圆宴"
],
"guide_types": [
"团队价", "散客"
],
"meeting_place": [
"基地教室1", "基地教室2", "基地教室3", "基地教室5"
],
"duration_hours": [
"全天", "半天"
],
"device_name": [
"投影"
],
"vehicle_type": [
"14座", "18座", "23-37座", "49-56座"
],
"travel_route": [
"", "三都", "下涯", "乾潭", "大同", "大慈岩", "大洋", "好运岛", "安仁", "寿昌", "新叶", "李家", "杨村桥", "梅城", "灵栖洞", "航头", "莲花", "长林", "马目"
],
"rent_vehicle_type": [
"15座全顺", "17座考斯特", "19座金龙中巴", "23座", "28座", "34座", "39座", "50座", "54座", "56座", "5座小汽车", "7座商务车"
],
"rent_travel_route": [
"其他", "建德市"
],
"room_type": [
"总套套房", "行政大床房", "行政套房", "豌豆星球太空梦亲子套房", "豌豆星球太空梦亲子房", "豪华双床房", "豪华大床房", "露台亲子房", "露台双床房", "露台大床房", "高级双床房", "高级大床房"
],
"room_agreement": [
"协议价", "门市价", "团队价", "散客价"
],
"meeting_room_name": [
"彩虹厅", "新安厅", "新安厅半厅", "沧滩厅", "白沙厅"
],
"meeting_agreement": [
"协议价", "挂牌价"
],
"coffee_break": [
"包含", "不包含", "可选"
],
"devices": [
"投影仪", "音响", "麦克风", "白板", "LED屏", "无"
],
"devices_agreement": [
"协议价", "门市价", "团队价", "散客价"
]
}
# 资源详细信息结构定义 - 按二级资源分类的所有字段及其枚举值
RESOURCE_DETAIL_STRUCTURE = {
# 七里扬帆停车场
"七里扬帆停车场": {
"parking_items": ["大车", "小车"]
},
# 七里扬帆包船
"七里扬帆包船": {
"ship_combo": ["团队优惠价", "团队优惠价_小扬帆_1500", "团队优惠价_小扬帆_600", "团队优惠价_小扬帆_800", "散客挂牌价"],
"ship_items": ["小扬帆", "快艇1号", "快艇2号", "扬帆16号", "扬帆2号", "扬帆3号", "扬帆之星", "江南秘境1号", "江南秘境2号", "浙旅江南秘境"]
},
# 七里扬帆小火车
"七里扬帆小火车": {
"business_items": ["小火车单程", "小火车往返"],
"recreational_combo": ["中远程及新增市场价格", "学生团队价", "旅投员工价", "景酒打包价", "老年团队价"]
},
# 七里扬帆景区导服
"七里扬帆景区导服": {
"guide_items": ["七里扬帆景区导服"]
},
# 七里扬帆游船
"七里扬帆游船": {
"ship_combo": ["中远程及新增市场价格", "学生团队价", "旅投员工价", "景酒打包价", "研学团", "老年团队价"],
"ship_items": ["七里扬帆游船"]
},
# 七里扬帆草莓采摘入园票
"七里扬帆草莓采摘入园票": {
"business_items": ["草莓采摘入园票"]
},
# 七里扬帆葫芦山庄餐饮
"七里扬帆葫芦山庄餐饮": {
"meal_standard": [0, 20, 25, 30, 35, 40, 45, 50, 55, 60],
"meal_standard_table": [300, 500, 600, 800, 1000]
},
# 七里扬帆葫芦峡漂流
"七里扬帆葫芦峡漂流": {
"ship_combo": ["建德市民"],
"ship_items": ["葫芦峡漂流"]
},
# 七里扬帆门票
"七里扬帆门票": {
"ticket_combo": ["30年教师证", "70周岁", "中远程及新增市场价格", "儿童", "其他", "军官证", "在校大学生", "学生团队价", "导游证", "建德就业", "建德市民", "接待免票", "新闻记者证", "旅投员工价", "景酒打包价", "杭州市民卡", "杭州文旅卡/惠民卡", "残疾证", "消防员证", "献血荣誉证", "病故军属家人", "研学团", "老年团队价", "萧山、临平、西湖管委会免票", "退役证", "钱江分免票", "青少年优惠", "非建德户籍免票", "高层次人才"],
"ticket_items": ["门票"]
},
# 三江口游线(游船)
"三江口游线(游船)": {
"ship_combo": ["中远程及新增市场价格", "学生团队价", "旅投员工价", "景酒打包价", "研学团", "老年团队价"],
"ship_items": ["三江口游线"]
},
# 三都渔村婚礼表演
"三都渔村婚礼表演": {
"business_items": ["三都渔村婚礼表演"],
"recreational_combo": ["团队价", "挂牌价"]
},
# 三都渔村玻璃水滑道
"三都渔村玻璃水滑道": {
"recreational_combo": ["中远程及新增市场价格", "学生团队价", "旅投员工价", "景酒打包价", "老年团队价"],
"recreational_items": ["玻璃水滑道"]
},
# 三都渔村门票
"三都渔村门票": {
"ticket_combo": [""],
"ticket_items": ["门票"]
},
# 严州古城展馆联票+电瓶车
"严州古城展馆联票+电瓶车": {
"ticket_combo": ["研学团"],
"ticket_items": ["展馆"]
},
# 严州古城摇橹船
"严州古城摇橹船": {
"ship_combo": ["中远程及新增市场价格", "学生团队价", "旅投员工价", "景酒打包价", "研学团", "老年团队价"],
"ship_items": ["摇橹船"]
},
# 严州古城景区导服
"严州古城景区导服": {
"guide_items": ["严州古城景区导服"]
},
# 严州古城门票
"严州古城门票": {
"ticket_combo": [""],
"ticket_items": ["门票"]
},
# 交运包车
"交运包车": {
"travel_route": ["", "三都", "下涯", "乾潭", "大同", "大慈岩", "大洋", "好运岛", "安仁", "寿昌", "新叶", "李家", "杨村桥", "梅城", "灵栖洞", "航头", "莲花", "长林", "马目"],
"vehicle_type": ["14座", "18座", "23-37座", "49-56座"]
},
# 交运租车
"交运租车": {
"rent_travel_route": ["其他", "建德市"],
"rent_vehicle_type": ["15座全顺", "17座考斯特", "19座金龙中巴", "23座", "28座", "34座", "39座", "50座", "54座", "56座", "5座小汽车", "7座商务车"]
},
# 会务
"会务": {
"meeting_agreement": ["协议价", "挂牌价"],
"meeting_room_name": ["彩虹厅", "新安厅", "新安厅半厅", "沧滩厅", "白沙厅"]
},
# 住宿
"住宿": {
"room_type": ["总套套房", "行政大床房", "行政套房", "豌豆星球太空梦亲子套房", "豌豆星球太空梦亲子房", "豪华双床房", "豪华大床房", "露台亲子房", "露台双床房", "露台大床房", "高级双床房", "高级大床房"],
"room_agreement": ["协议价", "门市价", "团队价", "散客价"]
},
# 千岛湖好运岛停车场
"千岛湖好运岛停车场": {
"parking_items": ["中巴车", "大巴车", "小型车"]
},
# 千岛湖好运岛景区导服
"千岛湖好运岛景区导服": {
"guide_items": ["千岛湖好运岛景区导服"]
},
# 千岛湖好运岛游船
"千岛湖好运岛游船": {
"ship_combo": ["中远程及新增市场价格", "学生团队价", "广德", "旅投员工价", "景酒打包价", "老年团队价"],
"ship_items": ["千岛湖好运岛游船"]
},
# 千岛湖好运岛草莓采摘(2斤)
"千岛湖好运岛草莓采摘(2斤)": {
"business_items": ["草莓采摘(2斤)"]
},
# 千岛湖好运岛门票
"千岛湖好运岛门票": {
"ticket_combo": ["30年教师证", "70周岁", "中远程及新增市场价格", "儿童", "其他", "军官证", "在校大学生", "学生团队价", "导游证", "广德", "建德就业", "建德市民", "接待免票", "新闻记者证", "旅投员工价", "景酒打包价", "杭州市民卡", "杭州文旅卡/惠民卡", "残疾证", "消防员证", "献血荣誉证", "病故军属家人", "老年团队价", "萧山、临平、西湖管委会免票", "退役证", "钱江分免票", "青少年优惠", "非建德户籍免票", "高层次人才"],
"ticket_items": ["门票"]
},
# 千鹤会务
"千鹤会务": {
"device_name": ["投影"],
"duration_hours": ["全天", "半天"],
"meeting_place": ["基地教室1", "基地教室2", "基地教室3", "基地教室5"]
},
# 千鹤景区导服
"千鹤景区导服": {
"guide_items": ["千鹤景区导服"]
},
# 千鹤景区讲解项目
"千鹤景区讲解项目": {
"guide_items": ["小小讲解员体验", "民兵体验", "讲解费"],
"guide_types": ["团队价", "散客"]
},
# 千鹤门票
"千鹤门票": {
"ticket_items": ["门票"]
},
# 大慈岩丛林速滑(旱滑道)下行
"大慈岩丛林速滑(旱滑道)下行": {
"recreational_combo": ["中远程及新增市场价格", "学生团队价", "旅投员工价", "景酒打包价", "老年团队价"],
"recreational_items": ["丛林速滑(旱滑道)下行"]
},
# 大慈岩中餐
"大慈岩中餐": {
"meal_types": ["中餐"]
},
# 大慈岩停车场
"大慈岩停车场": {
"parking_items": ["大客车", "小客车", "摩托车"]
},
# 大慈岩景区导服
"大慈岩景区导服": {
"guide_items": ["大慈岩景区导服"]
},
# 大慈岩玻璃栈道
"大慈岩玻璃栈道": {
"recreational_combo": ["中远程及新增市场价格", "学生团队价", "旅投员工价", "景酒打包价", "老年团队价"],
"recreational_items": ["玻璃栈道"]
},
# 大慈岩索道
"大慈岩索道": {
"recreational_combo": ["中远程及新增市场价格", "协议不成团价", "协议成团价", "学生团队价", "旅投员工价", "景酒打包价", "老年团队价", "门市价", "非协议不成团价", "非协议成团价"],
"recreational_items": ["索道上下行", "索道上行", "索道下行"]
},
# 大慈岩豆腐包制作体验
"大慈岩豆腐包制作体验": {
"business_items": ["豆腐包制作体验"],
"recreational_combo": ["中远程及新增市场价格", "学生团队价", "旅投员工价", "景酒打包价", "老年团队价"]
},
# 大慈岩门票
"大慈岩门票": {
"ticket_combo": ["30年教师证", "70周岁", "中远程及新增市场价格", "儿童", "其他", "军官证", "协议不成团价", "协议成团价", "在校大学生", "学生团队价", "导游证", "广德", "建德就业", "建德市民", "接待免票", "新闻记者证", "旅投员工价", "景酒打包价", "杭州市民卡", "杭州文旅卡/惠民卡", "残疾证", "消防员证", "献血荣誉证", "病故军属家人", "老年团队价", "萧山、临平、西湖管委会免票", "退役证", "钱江分免票", "门市价", "青少年优惠", "非协议不成团价", "非协议成团价", "非建德户籍免票", "高层次人才"],
"ticket_items": ["门票"]
},
# 大慈岩餐饮
"大慈岩餐饮": {
"meal_standard": [0, 20, 25, 30, 35, 40, 45, 50, 55, 60],
"meal_standard_table": [300, 500, 600, 800, 1000]
},
# 扬帆旅行社全陪导游
"扬帆旅行社全陪导游": {
"guide_items": ["扬帆旅行社全陪导游"]
},
# 扬帆旅行社地接导游
"扬帆旅行社地接导游": {
"guide_items": ["扬帆旅行社地接导游"]
},
# 新叶古村停车场
"新叶古村停车场": {
"parking_items": ["大车", "小车"]
},
# 新叶古村景区导服
"新叶古村景区导服": {
"guide_items": ["新叶古村景区导服"]
},
# 新叶古村草莓采摘入园票
"新叶古村草莓采摘入园票": {
"business_items": ["草莓采摘入园票"]
},
# 新叶古村门票
"新叶古村门票": {
"ticket_combo": ["30年教师证", "70周岁", "中远程及新增市场价格", "儿童", "其他", "军官证", "在校大学生", "学生团队价", "导游证", "广德", "建德就业", "建德市民", "接待免票", "新闻记者证", "旅投员工价", "景酒打包价", "杭州市民卡", "杭州文旅卡/惠民卡", "残疾证", "消防员证", "献血荣誉证", "病故军属家人", "研学团", "老年团队价", "萧山、临平、西湖管委会免票", "退役证", "钱江分免票", "青少年优惠", "非建德户籍免票", "高层次人才"],
"ticket_items": ["门票"]
},
# 新叶古村餐饮
"新叶古村餐饮": {
"meal_standard": [0, 20, 25, 30, 35, 40, 45, 50, 55, 60],
"meal_standard_table": [300, 500, 600, 800, 1000]
},
# 新安江中餐
"新安江中餐": {
"meal_types": ["中餐"]
},
# 新安江景区导服
"新安江景区导服": {
"guide_items": ["新安江景区导服"]
},
# 新安江游船
"新安江游船": {
"ship_combo": ["中远程及新增市场价格", "学生团队价", "广德", "旅投员工价", "景酒打包价", "研学团", "老年团队价"],
"ship_items": ["新安江竹筏漂流", "新安江龙舟漂游", "梦幻新安江", "江清月近人实景演艺船票", "诗韵新安江(新安江-严州古城航线含自助茶歇、简餐)"]
},
# 新安江船餐
"新安江船餐": {
"meal_types": ["前程似锦宴", "金玉满堂宴", "阖家团圆宴"]
},
# 新安江草莓采摘入园票
"新安江草莓采摘入园票": {
"business_items": ["草莓采摘入园票"]
},
# 江清月近人实景演艺门票
"江清月近人实景演艺门票": {
"ticket_combo": ["30年教师证", "70周岁", "中远程及新增市场价格", "儿童", "其他", "军官证", "在校大学生", "学生团队价", "导游证", "广德", "建德就业", "建德市民", "接待免票", "新闻记者证", "旅投员工价", "景酒打包价", "杭州市民卡", "杭州文旅卡/惠民卡", "残疾证", "消防员证", "献血荣誉证", "病故军属家人", "研学团", "老年团队价", "萧山、临平、西湖管委会免票", "退役证", "钱江分免票", "非建德户籍免票", "高层次人才"],
"ticket_items": ["江清月近人"]
},
# 江清月近人白天场
"江清月近人白天场": {
"ticket_combo": ["研学团"],
"ticket_items": ["江清月近人白天场"]
},
# 灵栖洞中餐
"灵栖洞中餐": {
"meal_types": ["中餐"]
},
# 灵栖洞停车场
"灵栖洞停车场": {
"parking_items": ["中型", "大型", "小车"]
},
# 灵栖洞喊泉
"灵栖洞喊泉": {
"business_items": ["喊泉"]
},
# 灵栖洞手划船
"灵栖洞手划船": {
"ship_combo": ["正常价", "特殊价"],
"ship_items": ["船票"]
},
# 灵栖洞景区导服
"灵栖洞景区导服": {
"guide_items": ["灵栖洞景区导服"]
},
# 灵栖洞极速滑道
"灵栖洞极速滑道": {
"recreational_combo": ["中远程及新增市场价格", "学生团队价", "旅投员工价", "景酒打包价", "老年团队价"],
"recreational_items": ["极速滑道"]
},
# 灵栖洞西游魔毯
"灵栖洞西游魔毯": {
"recreational_combo": ["中远程及新增市场价格", "学生团队价", "旅投员工价", "景酒打包价", "老年团队价"],
"recreational_items": ["西游魔毯"]
},
# 灵栖洞豆腐包制作体验
"灵栖洞豆腐包制作体验": {
"business_items": ["豆腐包制作体验"],
"recreational_combo": ["中远程及新增市场价格", "学生团队价", "旅投员工价", "景酒打包价", "老年团队价"]
},
# 灵栖洞门票
"灵栖洞门票": {
"ticket_combo": ["30年教师证", "70周岁", "中远程及新增市场价格", "儿童", "其他", "军官证", "在校大学生", "学生团队价", "导游证", "广德", "建德就业", "建德市民", "接待免票", "新闻记者证", "旅投员工价", "景酒打包价", "杭州市民卡", "杭州文旅卡/惠民卡", "残疾证", "消防员证", "献血荣誉证", "病故军属家人", "研学团", "老年团队价", "萧山、临平、西湖管委会免票", "退役证", "钱江分免票", "青少年优惠", "非建德户籍免票", "高层次人才"],
"ticket_items": ["门票"]
},
# 灵栖洞餐饮
"灵栖洞餐饮": {
"meal_standard": [0, 20, 25, 30, 35, 40, 45, 50, 55, 60],
"meal_standard_table": [300, 400, 500, 600, 700, 800, 1000]
},
# 玉泉寺门票
"玉泉寺门票": {
"ticket_items": ["门票"]
},
# 考拉森林丛林探险
"考拉森林丛林探险": {
"business_items": ["考拉森林丛林探险亲子线", "考拉森林丛林探险成人线"],
"recreational_combo": ["中远程及新增市场价格", "学生团队价", "旅投员工价", "景酒打包价", "老年团队价"]
},
# 餐饮
"餐饮": {
"meal_standard": [18]
}
}
def get_all_resource_names() -> List[str]:
"""获取所有可能的资源名称"""
all_names = []
for resource_main, sub_resources in RESOURCE_SUBRESOURCES.items():
for sub_resource in sub_resources:
all_names.append(f"{resource_main}-{sub_resource}")
return all_names
def get_resource_detail_structure(resource_name: str) -> Dict[str, List]:
"""获取指定资源的详细信息结构"""
return RESOURCE_DETAIL_STRUCTURE.get(resource_name, {})
def get_field_description(field_type: str) -> str:
"""获取字段类型的中文描述"""
field_descriptions = {
"business_items": "业务项目",
"guide_items": "导游项目",
"parking_items": "停车类型",
"recreational_combo": "娱乐套餐类型",
"recreational_items": "娱乐项目",
"ship_combo": "船票套餐类型",
"ship_items": "船只类型",
"ticket_combo": "门票套餐类型",
"ticket_items": "票型",
"meal_standard": "餐饮标准",
"meal_standard_table": "桌餐标准",
"meal_types": "餐饮类型",
"guide_types": "导游类型",
"meeting_place": "会议场所",
"duration_hours": "时长",
"device_name": "设备名称",
"vehicle_type": "车辆类型",
"travel_route": "行程路线",
"rent_vehicle_type": "租车类型",
"rent_travel_route": "租车路线",
"room_type": "房型",
"room_agreement": "房价协议",
"meeting_room_name": "会议室名称",
"meeting_agreement": "会议协议",
"coffee_break": "茶歇",
"devices": "设备",
"devices_agreement": "设备协议"
}
return field_descriptions.get(field_type, field_type)
def load_original_data(file_path: str) -> List[Dict]:
"""加载原始训练数据,支持JSON和JSONL格式"""
if not os.path.exists(file_path):
raise FileNotFoundError(f"文件不存在: {file_path}")
# 根据文件扩展名判断格式
file_ext = os.path.splitext(file_path)[1].lower()
if file_ext == '.jsonl':
return load_jsonl_data(file_path)
elif file_ext == '.json':
return load_json_data(file_path)
else:
# 尝试自动检测格式
return auto_detect_and_load(file_path)
def load_json_data(file_path: str) -> List[Dict]:
"""加载JSON格式数据"""
print(f"检测到JSON格式文件: {file_path}")
with open(file_path, 'r', encoding='utf-8') as f:
return json.load(f)
def load_jsonl_data(file_path: str) -> List[Dict]:
"""加载JSONL格式数据"""
print(f"检测到JSONL格式文件: {file_path}")
data = []
with open(file_path, 'r', encoding='utf-8') as f:
for line_num, line in enumerate(f, 1):
line = line.strip()
if not line: # 跳过空行
continue
try:
item = json.loads(line)
data.append(item)
except json.JSONDecodeError as e:
print(f"警告: 第{line_num}行JSON解析失败: {e}")
print(f"问题行内容: {line[:100]}...")
continue
return data
def auto_detect_and_load(file_path: str) -> List[Dict]:
"""自动检测文件格式并加载"""
print(f"自动检测文件格式: {file_path}")
# 读取文件前几行来判断格式
with open(file_path, 'r', encoding='utf-8') as f:
first_line = f.readline().strip()
f.seek(0) # 重置文件指针
if first_line.startswith('['):
# JSON数组格式
print("检测到JSON数组格式")
return load_json_data(file_path)
elif first_line.startswith('{'):
# 可能是JSONL格式
print("检测到JSONL格式")
return load_jsonl_data(file_path)
else:
raise ValueError(f"无法识别的文件格式: {file_path}")
def create_field_specific_instruction(field_name: str, field_description: str, resource_name: str = None) -> str:
"""为特定字段创建指令"""
# 特殊处理resource_names字段,包含业务范围
if field_name == "resource_names":
all_resource_names = get_all_resource_names()
resource_list = "、".join(all_resource_names)
return f"""请从OCR文本中抽取旅行订单中的所有资源名称。
可识别的资源名称包括:{resource_list}
资源别称表述:
- 大慈岩丛林速滑(旱滑道)下行:可能表述为"旱滑道"、"丛林速滑"
- 灵栖洞西游魔毯:可能表述为"飞天魔毯"、"灵栖洞魔毯"
- 灵栖洞极速滑道:可能表述为"灵栖洞滑道"、"灵栖洞速滑"
- 考拉森林丛林探险:可能表述为"考拉森林"、"丛林探险"
- 三江口游线(游船):可能表述为"富春江游船"
- 严州古城:可能表述为"梅城"
- 三都渔村婚礼表演:可能表述为"三都渔村九姓渔氏水上婚礼表演"、"九姓渔氏"、"婚礼表演"
资源内在联系规则:
1. 七里扬帆景区:
- 出现"七里扬帆"(无三江口)信息时,包含:七里扬帆-七里扬帆门票、七里扬帆-七里扬帆游船
- 出现"三江口"信息时,包含:七里扬帆-三江口游线(游船),但不包含七里扬帆门票和七里扬帆游船
- 出现"葫芦峡漂流"信息时,包含:七里扬帆-七里扬帆门票、七里扬帆-七里扬帆游船
2. 灵栖洞景区:
- 出现"灵栖洞"信息时,包含:灵栖洞-灵栖洞门票、灵栖洞-灵栖洞手划船
严格按照以下JSON格式输出:
{{
"resource_names": ["资源主体-资源名称1", "资源主体-资源名称2"] 或 []
}}"""
# 特殊处理resource_detail字段,包含业务范围
if field_name == "resource_detail" and resource_name:
detail_structure = get_resource_detail_structure(resource_name)
if detail_structure:
field_lines = []
for field_type, enum_values in detail_structure.items():
if enum_values:
enum_list = "、".join(f'"{str(v)}"' for v in enum_values)
field_lines.append(f"{field_type} ({get_field_description(field_type)}): {enum_list}")
else:
field_lines.append(f"{field_type} ({get_field_description(field_type)}): null")
fields_info = "\n".join(field_lines)
# 构建JSON示例结构
json_fields = []
for field_type in detail_structure.keys():
json_fields.append(f' "{field_type}": 选择值或null')
json_structure = ",\n".join(json_fields)
return f"""请从OCR文本中抽取旅行订单中{resource_name}的详细信息。
提取字段及可选值:
{fields_info}
严格按照以下JSON格式输出:
{{
"resource_detail": {{
{json_structure}
}}
}}"""
else:
return f"""请从OCR文本中抽取旅行订单中{resource_name}的详细信息。
严格按照以下JSON格式输出:
{{
"resource_detail": {{}}
}}"""
# 基础字段的指令定义
field_instructions = {
"team_size": """请从OCR文本中抽取旅行订单的总人数信息。
严格按照以下JSON格式输出:
{
"team_size": 整数或null
}
注意:订单信息中的导游员、讲解员、司机、领队等人员,不包含在总人数中。
""",
"start_date": """请从OCR文本中抽取旅行订单的开始日期信息。
严格按照以下JSON格式输出:
{
"start_date": "YYYY-MM-DD"或null
}""",
"end_date": """请从OCR文本中抽取旅行订单的结束日期信息。
严格按照以下JSON格式输出:
{
"end_date": "YYYY-MM-DD"或null
}""",
"payment_method": """请从OCR文本中抽取旅行订单的支付方式信息。
严格按照以下JSON格式输出:
{
"payment_method": "支付方式"或null
}""",
"customer_name": """请从OCR文本中抽取旅行订单的客户名称(通常是旅行社或公司名称)。
严格按照以下JSON格式输出:
{
"customer_name": "客户名称"或null
}""",
"customer_market": """请从OCR文本中抽取旅行订单的客户地区,按照'省-市'格式。
严格按照以下JSON格式输出:
{
"customer_market": "省-市"或null
}""",
"customer_type": """请从OCR文本中抽取旅行订单的客户类型(如旅行社、机构等)。
严格按照以下JSON格式输出:
{
"customer_type": "客户类型"或null
}""",
"notes": """请从OCR文本中抽取旅行订单的备注信息。
严格按照以下JSON格式输出:
{
"notes": "备注信息"或null
}""",
"contacts": """请从OCR文本中抽取旅行订单的联系人信息。
提取字段:
name (联系人姓名)
phone (联系电话)
idcard (身份证号码,如果没有则为null)
严格按照以下JSON格式输出:
{
"contacts": {
"data": [
{
"name": "姓名",
"phone": "电话",
"idcard": "身份证号"或null
}
]
}
}""",
"resource_start_time": f"""请从OCR文本中抽取旅行订单中{resource_name or '该资源'}的开始时间。
严格按照以下JSON格式输出:
{{
"resource_start_time": "YYYY-MM-DD"或null
}}""",
"resource_end_time": f"""请从OCR文本中抽取旅行订单中{resource_name or '该资源'}的结束时间。
严格按照以下JSON格式输出:
{{
"resource_end_time": "YYYY-MM-DD"或null
}}
""",
"resource_team_size": f"""请从OCR文本中抽取旅行订单中{resource_name or '该资源'}的使用人数。
严格按照以下JSON格式输出:
{{
"resource_team_size": 整数或null
}}
注意:订单信息中的导游员、讲解员、司机、领队等人员,不包含在总人数中。
"""
}
return field_instructions.get(field_name, f"""请从OCR文本中抽取旅行订单的{field_description}信息。
严格按照以下JSON格式输出:
{{
"{field_name}": "值"或null
}}""")
def extract_field_value(output_json: Dict, field_path: List[str]) -> Any:
"""从完整输出中提取特定字段的值"""
current = output_json
try:
for key in field_path:
if isinstance(key, int):
# 处理数组索引
if isinstance(current, list) and 0 <= key < len(current):
current = current[key]
else:
return None
else:
# 处理字典键
current = current[key]
return current
except (KeyError, TypeError, IndexError):
return None
def format_output_value(value: Any, field_name: str) -> str:
"""格式化输出值,包含字段名"""
if value is None:
return f'"{field_name}": null'
if field_name in ["contacts", "resource_names", "resource_detail"]:
# 对于复杂对象,返回包含字段名的JSON字符串
return f'"{field_name}": {json.dumps(value, ensure_ascii=False)}'
elif field_name in ["team_size", "resource_team_size"] and isinstance(value, (int, float)):
# team_size字段保持为整数格式
return f'"{field_name}": {value}'
elif isinstance(value, str):
return f'"{field_name}": "{value}"'
elif isinstance(value, (int, float)):
return f'"{field_name}": "{str(value)}"'
else:
return f'"{field_name}": {json.dumps(value, ensure_ascii=False)}'
def extract_resource_names(resource_results: Dict) -> List[str]:
"""从resource_results中提取所有资源名称"""
if not resource_results or "data" not in resource_results:
return []
resource_names = []
for resource in resource_results["data"]:
if "resource_name" in resource:
resource_names.append(resource["resource_name"])
return resource_names
def find_resource_by_name(resource_results: Dict, target_name: str) -> Dict:
"""根据资源名称查找对应的资源信息"""
if not resource_results or "data" not in resource_results:
return {}
for resource in resource_results["data"]:
if resource.get("resource_name") == target_name:
return resource
return {}
def generate_fine_grained_data(original_data: List[Dict]) -> List[Dict]:
"""生成细粒度训练数据"""
fine_grained_data = []
# 定义基础字段(非资源相关)
basic_field_definitions = [
("team_size", "总人数", ["nonresource_results", "team_size"]),
("start_date", "开始日期", ["nonresource_results", "start_date"]),
("end_date", "结束日期", ["nonresource_results", "end_date"]),
("payment_method", "支付方式", ["nonresource_results", "payment_method"]),
("customer_name", "客户名称", ["nonresource_results", "customer_name"]),
("customer_market", "客户地区", ["nonresource_results", "customer_market"]),
("customer_type", "客户类型", ["nonresource_results", "customer_type"]),
("notes", "备注", ["nonresource_results", "notes"]),
("contacts", "联系人信息", ["contacts"])
]
for item in original_data:
input_text = item["input"]
# 解析原始输出
try:
output_json = json.loads(item["output"])
except json.JSONDecodeError:
print(f"跳过无效JSON: {item['output'][:100]}...")
continue
# 1. 为基础字段创建训练样本
for field_name, field_description, field_path in basic_field_definitions:
field_value = extract_field_value(output_json, field_path)
sample = {
"instruction": create_field_specific_instruction(field_name, field_description),
"input": input_text,
"output": format_output_value(field_value, field_name)
}
fine_grained_data.append(sample)
# 2. 创建资源名称列表抽取任务
resource_results = extract_field_value(output_json, ["resource_results"])
resource_names = extract_resource_names(resource_results)
sample = {
"instruction": create_field_specific_instruction("resource_names", "资源名称列表"),
"input": input_text,
"output": format_output_value(resource_names, "resource_names")
}
fine_grained_data.append(sample)
# 3. 为每个资源创建详细的抽取任务
for resource_name in resource_names:
resource_info = find_resource_by_name(resource_results, resource_name)
# 资源开始时间
start_time = resource_info.get("start_time")
sample = {
"instruction": create_field_specific_instruction("resource_start_time", "开始时间", resource_name),
"input": input_text,
"output": format_output_value(start_time, "resource_start_time")
}
fine_grained_data.append(sample)
# 资源结束时间
end_time = resource_info.get("end_time")
sample = {
"instruction": create_field_specific_instruction("resource_end_time", "结束时间", resource_name),
"input": input_text,
"output": format_output_value(end_time, "resource_end_time")
}
fine_grained_data.append(sample)
# 资源使用人数
team_size = resource_info.get("team_size")
sample = {
"instruction": create_field_specific_instruction("resource_team_size", "使用人数", resource_name),
"input": input_text,
"output": format_output_value(team_size, "resource_team_size")
}
fine_grained_data.append(sample)
# 资源详细信息
detail = resource_info.get("detail", {})
# 从资源名称中提取具体的资源类型(去掉资源主体前缀)
resource_type = resource_name
if "-" in resource_name:
resource_type = resource_name.split("-", 1)[1] # 取第二部分作为资源类型
sample = {
"instruction": create_field_specific_instruction("resource_detail", "详细信息", resource_type),
"input": input_text,
"output": format_output_value(detail, "resource_detail")
}
fine_grained_data.append(sample)
return fine_grained_data
def save_fine_grained_data(data: List[Dict], output_file: str, format_type: str = 'auto'):
"""保存细粒度数据,支持JSON和JSONL格式"""
if format_type == 'auto':
# 根据输出文件扩展名自动选择格式
file_ext = os.path.splitext(output_file)[1].lower()
if file_ext == '.jsonl':
format_type = 'jsonl'
else:
format_type = 'json'
if format_type == 'jsonl':
print(f"正在保存为JSONL格式: {output_file}")
with open(output_file, 'w', encoding='utf-8') as f:
for item in data:
f.write(json.dumps(item, ensure_ascii=False) + '\n')
else:
print(f"正在保存为JSON格式: {output_file}")
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=2)
def main():
import argparse
# 创建命令行参数解析器
parser = argparse.ArgumentParser(description='将粗粒度订单数据转换为细粒度训练数据')
parser.add_argument('--input', '-i', type=str,
default="/home/ziqiang/order_info/database/test/ocr_text_orders_20250812.jsonl",
help='输入文件路径 (支持JSON和JSONL格式)')
parser.add_argument('--output', '-o', type=str,
default="/home/ziqiang/order_info/database/dev/ocr_text_orders_20250812_re.jsonl",
help='输出文件路径')
parser.add_argument('--format', '-f', type=str, choices=['json', 'jsonl', 'auto'],
default='auto', help='输出格式 (auto: 根据文件扩展名自动选择)')
parser.add_argument('--shuffle', action='store_true', default=True,
help='是否打乱数据顺序')
args = parser.parse_args()
# 加载原始数据
print("正在加载原始数据...")
try:
original_data = load_original_data(args.input)
print(f"加载了 {len(original_data)} 条原始数据")
except Exception as e:
print(f"加载数据失败: {e}")
return
# 生成细粒度数据
print("正在生成细粒度训练数据...")
fine_grained_data = generate_fine_grained_data(original_data)
print(f"生成了 {len(fine_grained_data)} 条细粒度数据")
# 打乱数据顺序
if args.shuffle:
random.shuffle(fine_grained_data)
print("已打乱数据顺序")
# 保存细粒度数据
try:
save_fine_grained_data(fine_grained_data, args.output, args.format)
print(f"完成!细粒度数据已保存到 {args.output}")
except Exception as e:
print(f"保存数据失败: {e}")
return
# 输出统计信息
field_counts = {}
for item in fine_grained_data:
instruction = item["instruction"]
# 统计基础字段
for field_name in ["总人数", "开始日期", "结束日期", "支付方式", "客户名称", "客户地区", "客户类型", "备注", "联系人信息"]:
if field_name in instruction:
field_counts[field_name] = field_counts.get(field_name, 0) + 1
break
# 统计资源相关字段
for field_name in ["资源名称列表", "开始时间", "结束时间", "使用人数", "详细信息"]:
if field_name in instruction and ("资源" in instruction or "resource" in instruction.lower()):
field_counts[f"资源_{field_name}"] = field_counts.get(f"资源_{field_name}", 0) + 1
break
print("\n各字段的样本数量:")
for field, count in field_counts.items():
print(f" {field}: {count} 条")
if __name__ == "__main__":
main()