File size: 9,072 Bytes
ba016aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import datetime
from sqlalchemy import Column, Integer, String, Float, DateTime, Text, LargeBinary, ForeignKey
from sqlalchemy.orm import relationship
from database import Base

# Fixed UTC+8 offset (Beijing time) used for every timestamp default below.
_TZ_BEIJING = datetime.timezone(datetime.timedelta(hours=8))


def _now_beijing():
    """Return the current UTC+8 wall-clock time as a naive ``datetime``.

    The current time is taken in the ``_TZ_BEIJING`` zone and the ``tzinfo``
    is then stripped, so the result carries Beijing local time with no
    timezone attached.
    """
    aware_now = datetime.datetime.now(tz=_TZ_BEIJING)
    return aware_now.replace(tzinfo=None)


class VectorMatchTask(Base):
    """ORM mapping for the ``vector_match_task`` table.

    One row per vector-matching task: its matching configuration, row and
    vector counters, per-stage progress values, timestamps, and archive /
    soft-delete flags. The ``comment`` string on each column is the
    database-side column comment.
    """

    __tablename__ = "vector_match_task"
    __table_args__ = {
        "comment": "向量匹配任务表",
    }

    # -- identity -----------------------------------------------------------
    id = Column(
        Integer, primary_key=True, autoincrement=True,
        comment="主键ID",
    )
    # Task number; the column comment gives the format YYYYMMDDHHMMSSmmm.
    task_code = Column(
        String(30), unique=True, nullable=False, index=True,
        comment="任务编号,格式:YYYYMMDDHHMMSSmmm",
    )

    # -- matching configuration ---------------------------------------------
    match_mode = Column(
        String(50), nullable=False, default="two_file",
        comment="匹配模式:two_file/history/standard",
    )
    candidate_scope = Column(
        String(50), nullable=False, default="current_task_target",
        comment="候选范围:current_task_target/history/standard",
    )
    # Both dataset references point at vector_dataset.id and are optional.
    source_dataset_id = Column(
        Integer, ForeignKey("vector_dataset.id"), nullable=True,
        comment="源数据集ID",
    )
    target_dataset_id = Column(
        Integer, ForeignKey("vector_dataset.id"), nullable=True,
        comment="目标候选集ID",
    )
    # Top-K candidates kept per source row, before and after reranking.
    top_k = Column(
        Integer, default=10,
        comment="每条源数据保留的Top-K候选数",
    )
    rerank_top_k = Column(
        Integer, default=3,
        comment="Reranker重排序后保留的Top-K数",
    )
    # Minimum similarity threshold.
    min_threshold = Column(
        Float, default=0.70,
        comment="最低相似度阈值",
    )

    # -- state and counters -------------------------------------------------
    status = Column(
        String(20), default="pending",
        comment="任务状态:pending/running/completed/failed",
    )
    source_row_count = Column(
        Integer, default=0,
        comment="源数据行数",
    )
    target_row_count = Column(
        Integer, default=0,
        comment="目标候选行数",
    )
    # Per the column comments: "high" means score >= 0.90, "low confidence"
    # means score < 0.70.
    high_match_count = Column(
        Integer, default=0,
        comment="高度匹配数量(score>=0.90)",
    )
    low_confidence_count = Column(
        Integer, default=0,
        comment="低置信数量(score<0.70)",
    )
    # Vectors reused via text_hash versus newly generated ones.
    reused_vectors = Column(
        Integer, default=0,
        comment="通过text_hash复用的向量数",
    )
    new_vectors = Column(
        Integer, default=0,
        comment="新生成的向量数",
    )

    # -- per-stage progress (column comments state a 0-100 range) -----------
    progress_parse_source = Column(
        Integer, default=0,
        comment="解析源数据集进度(0-100)",
    )
    progress_parse_target = Column(
        Integer, default=0,
        comment="解析目标候选集进度(0-100)",
    )
    progress_vectorize = Column(
        Integer, default=0,
        comment="向量化进度(0-100)",
    )
    progress_load_candidates = Column(
        Integer, default=0,
        comment="加载候选范围进度(0-100)",
    )
    progress_similarity = Column(
        Integer, default=0,
        comment="相似度计算进度(0-100)",
    )
    progress_rerank = Column(
        Integer, default=0,
        comment="Reranker重排序进度(0-100)",
    )
    progress_save_results = Column(
        Integer, default=0,
        comment="保存结果进度(0-100)",
    )

    # -- bookkeeping --------------------------------------------------------
    # Both timestamps default to _now_beijing; updated_time is also refreshed
    # through onupdate.
    created_time = Column(
        DateTime, default=_now_beijing,
        comment="创建时间",
    )
    updated_time = Column(
        DateTime, default=_now_beijing, onupdate=_now_beijing,
        comment="更新时间",
    )
    # Integer flags: 0 = not archived / not deleted, 1 = archived / deleted.
    is_archived = Column(
        Integer, default=0,
        comment="是否归档:0=未归档,1=已归档",
    )
    is_delete = Column(
        Integer, default=0,
        comment="是否删除:0=未删除,1=已删除",
    )

    # -- relationships ------------------------------------------------------
    # Two foreign keys target vector_dataset, so each relationship names the
    # column it follows.
    source_dataset = relationship(
        "VectorDataset",
        foreign_keys=[source_dataset_id],
    )
    target_dataset = relationship(
        "VectorDataset",
        foreign_keys=[target_dataset_id],
    )
    results = relationship(
        "MatchResult",
        back_populates="task",
    )


class VectorDataset(Base):
    """ORM mapping for the ``vector_dataset`` table.

    One row per dataset (the table comment describes it as an uploaded or
    logical dataset). Rows belonging to the dataset are reachable through
    ``rows``.
    """

    __tablename__ = "vector_dataset"
    __table_args__ = {
        "comment": "向量数据集表(上传或逻辑数据集)",
    }

    # -- identity and ownership ---------------------------------------------
    id = Column(
        Integer, primary_key=True, autoincrement=True,
        comment="主键ID",
    )
    # Optional link to the owning vector_match_task row.
    task_id = Column(
        Integer, ForeignKey("vector_match_task.id"), nullable=True,
        comment="所属任务ID",
    )

    # -- origin of the data -------------------------------------------------
    name = Column(
        String(255), nullable=False,
        comment="数据集名称",
    )
    # Uploaded file name and Excel sheet name; both optional.
    file_name = Column(
        String(255), nullable=True,
        comment="上传文件名",
    )
    sheet_name = Column(
        String(100), nullable=True,
        comment="Excel工作表名",
    )

    # -- classification -----------------------------------------------------
    # Role is "source" or "target" (candidate) per the column comment.
    dataset_role = Column(
        String(20), nullable=False,
        comment="数据集角色:source(源)/target(目标候选)",
    )
    data_scope = Column(
        String(20), default="task",
        comment="数据范围:task/history/standard",
    )
    # Fields taking part in vectorization; the column comment says JSON.
    vector_fields = Column(
        Text, nullable=True,
        comment="参与向量化的字段列表(JSON)",
    )
    row_count = Column(
        Integer, default=0,
        comment="数据行数",
    )

    # -- bookkeeping --------------------------------------------------------
    # Soft-delete marker: 0 = active, 1 = deleted.
    is_delete = Column(
        Integer, default=0, nullable=False, index=True,
        comment="软删除标记:0=有效,1=已删除",
    )
    # Both timestamps default to _now_beijing; updated_time is also refreshed
    # through onupdate.
    created_time = Column(
        DateTime, default=_now_beijing,
        comment="创建时间",
    )
    updated_time = Column(
        DateTime, default=_now_beijing, onupdate=_now_beijing,
        comment="更新时间",
    )

    # -- relationships ------------------------------------------------------
    rows = relationship(
        "VectorDataRow",
        back_populates="dataset",
    )


class VectorDataRow(Base):
    """ORM mapping for the ``vector_data_row`` table.

    One row per data record of a dataset. Each row belongs to exactly one
    ``VectorDataset`` and has at most one ``VectorEmbedding`` (``embedding``
    is declared with ``uselist=False``).
    """

    __tablename__ = "vector_data_row"
    __table_args__ = {
        "comment": "向量数据行表(单行物料/申报项等)",
    }

    # -- identity and ownership ---------------------------------------------
    id = Column(
        Integer, primary_key=True, autoincrement=True,
        comment="主键ID",
    )
    dataset_id = Column(
        Integer, ForeignKey("vector_dataset.id"), nullable=False, index=True,
        comment="所属数据集ID",
    )
    # NOTE(review): plain indexed integer with no ForeignKey, unlike
    # VectorDataset.task_id -- confirm this is intentional.
    task_id = Column(
        Integer, nullable=True, index=True,
        comment="所属任务ID",
    )

    # -- classification -----------------------------------------------------
    dataset_role = Column(
        String(20), nullable=False, index=True,
        comment="数据集角色:source/target",
    )
    data_scope = Column(
        String(20), default="task", index=True,
        comment="数据范围:task/history/standard",
    )

    # -- content ------------------------------------------------------------
    # Row number within the Excel sheet.
    row_number = Column(
        Integer, nullable=False,
        comment="Excel中的行号",
    )
    # Concatenated raw text of the row.
    raw_text = Column(
        Text, nullable=False,
        comment="拼接后的原始文本",
    )
    # SHA256 hash of the text, used for vector reuse (per column comment).
    text_hash = Column(
        String(64), nullable=True, index=True,
        comment="文本SHA256哈希,用于向量复用",
    )
    # Individual field values; the column comment says JSON.
    field_values = Column(
        Text, nullable=True,
        comment="各字段值(JSON)",
    )

    # -- bookkeeping --------------------------------------------------------
    # Soft-delete marker: 0 = active, 1 = deleted.
    is_delete = Column(
        Integer, default=0, nullable=False, index=True,
        comment="软删除标记:0=有效,1=已删除",
    )
    # Both timestamps default to _now_beijing; updated_time is also refreshed
    # through onupdate.
    created_time = Column(
        DateTime, default=_now_beijing,
        comment="创建时间",
    )
    updated_time = Column(
        DateTime, default=_now_beijing, onupdate=_now_beijing,
        comment="更新时间",
    )

    # -- relationships ------------------------------------------------------
    dataset = relationship(
        "VectorDataset",
        back_populates="rows",
    )
    embedding = relationship(
        "VectorEmbedding",
        back_populates="data_row",
        uselist=False,
    )


class VectorEmbedding(Base):
    """ORM mapping for the ``vector_embedding`` table.

    Stores the embedding of a single ``VectorDataRow``. ``data_row_id`` is
    unique, so a data row has at most one embedding row.
    """

    __tablename__ = "vector_embedding"
    __table_args__ = {
        "comment": "向量嵌入表(与数据行一对一)",
    }

    # -- identity and ownership ---------------------------------------------
    id = Column(
        Integer, primary_key=True, autoincrement=True,
        comment="主键ID",
    )
    data_row_id = Column(
        Integer, ForeignKey("vector_data_row.id"),
        unique=True, nullable=False, index=True,
        comment="关联 vector_data_row.id",
    )

    # -- vector payload -----------------------------------------------------
    # Text hash matching the one on the data row (per column comment).
    text_hash = Column(
        String(64), nullable=False, index=True,
        comment="与行一致的文本哈希",
    )
    # Binary blob; the column comment describes it as a float32 array.
    embedding = Column(
        LargeBinary(length=65536), nullable=False,
        comment="float32 数组二进制存储",
    )
    # Name of the model that produced the vector, and the vector dimension.
    model_name = Column(
        String(100), nullable=True,
        comment="生成向量所用模型名",
    )
    dimension = Column(
        Integer, nullable=True,
        comment="向量维度",
    )

    # -- bookkeeping --------------------------------------------------------
    # Soft-delete marker: 0 = active, 1 = deleted.
    is_delete = Column(
        Integer, default=0, nullable=False, index=True,
        comment="软删除标记:0=有效,1=已删除",
    )
    # Both timestamps default to _now_beijing; updated_time is also refreshed
    # through onupdate.
    created_time = Column(
        DateTime, default=_now_beijing,
        comment="创建时间",
    )
    updated_time = Column(
        DateTime, default=_now_beijing, onupdate=_now_beijing,
        comment="更新时间",
    )

    # -- relationships ------------------------------------------------------
    data_row = relationship(
        "VectorDataRow",
        back_populates="embedding",
    )


class MatchResult(Base):
    """ORM mapping for the ``match_result`` table.

    One row per (source row, candidate row) pairing produced by a task,
    together with its similarity / rerank scores, ranks, match level and
    manual-confirmation state.
    """

    __tablename__ = "match_result"
    __table_args__ = {
        "comment": "匹配结果表(源行与候选的关联及得分)",
    }

    # -- identity and ownership ---------------------------------------------
    id = Column(
        Integer, primary_key=True, autoincrement=True,
        comment="主键ID",
    )
    task_id = Column(
        Integer, ForeignKey("vector_match_task.id"), nullable=False, index=True,
        comment="所属任务ID",
    )
    # Both row references point at vector_data_row.id.
    source_row_id = Column(
        Integer, ForeignKey("vector_data_row.id"), nullable=False,
        comment="源数据行ID",
    )
    target_row_id = Column(
        Integer, ForeignKey("vector_data_row.id"), nullable=False,
        comment="目标候选行ID",
    )

    # -- scores and ranks ---------------------------------------------------
    # Cosine similarity; the column comment states a 0-1 range.
    similarity_score = Column(
        Float, nullable=False,
        comment="余弦相似度分数(0-1)",
    )
    # Reranker score; higher means more relevant (per column comment).
    rerank_score = Column(
        Float, nullable=True,
        comment="Reranker精排分数,越高越相关",
    )
    # Rank by similarity, where 1 is the most similar.
    rank = Column(
        Integer, nullable=False,
        comment="排名(1=最相似)",
    )
    rerank_rank = Column(
        Integer, nullable=True,
        comment="Reranker重排后的排名",
    )

    # -- classification -----------------------------------------------------
    candidate_scope = Column(
        String(50), nullable=True,
        comment="候选来源范围",
    )
    match_level = Column(
        String(20), nullable=True,
        comment="匹配等级:high/possible/low_confidence/no_match",
    )
    # Manual confirmation: 0 = unconfirmed, 1 = confirmed, -1 = ignored.
    is_confirmed = Column(
        Integer, default=0,
        comment="是否已人工确认:0=未确认,1=已确认,-1=已忽略",
    )

    # -- bookkeeping --------------------------------------------------------
    # Soft-delete marker: 0 = active, 1 = deleted.
    is_delete = Column(
        Integer, default=0, nullable=False, index=True,
        comment="软删除标记:0=有效,1=已删除",
    )
    # Both timestamps default to _now_beijing; updated_time is also refreshed
    # through onupdate.
    created_time = Column(
        DateTime, default=_now_beijing,
        comment="创建时间",
    )
    updated_time = Column(
        DateTime, default=_now_beijing, onupdate=_now_beijing,
        comment="更新时间",
    )

    # -- relationships ------------------------------------------------------
    task = relationship(
        "VectorMatchTask",
        back_populates="results",
    )
    # Two foreign keys target vector_data_row, so each relationship names the
    # column it follows.
    source_row = relationship(
        "VectorDataRow",
        foreign_keys=[source_row_id],
    )
    target_row = relationship(
        "VectorDataRow",
        foreign_keys=[target_row_id],
    )