# Spaces: Running
# File size: 9,072 Bytes | 010f0b1
import datetime
from sqlalchemy import Column, Integer, String, Float, DateTime, Text, LargeBinary, ForeignKey
from sqlalchemy.orm import relationship
from database import Base
# Fixed UTC+08:00 offset consumed by _now_beijing() below.
_TZ_BEIJING = datetime.timezone(datetime.timedelta(hours=8))


def _now_beijing():
    """Return the current time at UTC+08:00 as a naive ``datetime``.

    The clock is read in the ``_TZ_BEIJING`` offset and the timezone info
    is then stripped, so the returned value carries no ``tzinfo``.
    """
    aware_now = datetime.datetime.now(tz=_TZ_BEIJING)
    return aware_now.replace(tzinfo=None)
class VectorMatchTask(Base):
    """ORM model for the ``vector_match_task`` table (vector matching tasks).

    Each row stores the configuration of one matching task, its status,
    result counters and per-stage progress values. The ``comment=`` strings
    are emitted into the database schema and are kept verbatim.
    """

    __tablename__ = "vector_match_task"
    __table_args__ = {"comment": "向量匹配任务表"}

    # ---- identity ---------------------------------------------------------
    id = Column(Integer, primary_key=True, autoincrement=True, comment="主键ID")
    # Unique task code; the schema comment gives the format YYYYMMDDHHMMSSmmm.
    task_code = Column(
        String(30),
        unique=True,
        nullable=False,
        index=True,
        comment="任务编号,格式:YYYYMMDDHHMMSSmmm",
    )

    # ---- matching configuration -------------------------------------------
    # Match mode: two_file / history / standard.
    match_mode = Column(
        String(50),
        nullable=False,
        default="two_file",
        comment="匹配模式:two_file/history/standard",
    )
    # Candidate scope: current_task_target / history / standard.
    candidate_scope = Column(
        String(50),
        nullable=False,
        default="current_task_target",
        comment="候选范围:current_task_target/history/standard",
    )
    # Both dataset columns reference vector_dataset.id and may be NULL.
    source_dataset_id = Column(
        Integer,
        ForeignKey("vector_dataset.id"),
        nullable=True,
        comment="源数据集ID",
    )
    target_dataset_id = Column(
        Integer,
        ForeignKey("vector_dataset.id"),
        nullable=True,
        comment="目标候选集ID",
    )
    # Number of top-K candidates kept per source row.
    top_k = Column(Integer, default=10, comment="每条源数据保留的Top-K候选数")
    # Number of candidates kept after reranking.
    rerank_top_k = Column(
        Integer,
        default=3,
        comment="Reranker重排序后保留的Top-K数",
    )
    # Minimum similarity threshold.
    min_threshold = Column(Float, default=0.70, comment="最低相似度阈值")

    # ---- status and counters ----------------------------------------------
    # Task status: pending / running / completed / failed.
    status = Column(
        String(20),
        default="pending",
        comment="任务状态:pending/running/completed/failed",
    )
    source_row_count = Column(Integer, default=0, comment="源数据行数")
    target_row_count = Column(Integer, default=0, comment="目标候选行数")
    # Count of high matches (score >= 0.90 per the schema comment).
    high_match_count = Column(
        Integer,
        default=0,
        comment="高度匹配数量(score>=0.90)",
    )
    # Count of low-confidence matches (score < 0.70 per the schema comment).
    low_confidence_count = Column(
        Integer,
        default=0,
        comment="低置信数量(score<0.70)",
    )
    # Vectors reused via text_hash versus newly generated vectors.
    reused_vectors = Column(Integer, default=0, comment="通过text_hash复用的向量数")
    new_vectors = Column(Integer, default=0, comment="新生成的向量数")

    # ---- per-stage progress (0-100 per the schema comments) ---------------
    progress_parse_source = Column(
        Integer,
        default=0,
        comment="解析源数据集进度(0-100)",
    )
    progress_parse_target = Column(
        Integer,
        default=0,
        comment="解析目标候选集进度(0-100)",
    )
    progress_vectorize = Column(Integer, default=0, comment="向量化进度(0-100)")
    progress_load_candidates = Column(
        Integer,
        default=0,
        comment="加载候选范围进度(0-100)",
    )
    progress_similarity = Column(
        Integer,
        default=0,
        comment="相似度计算进度(0-100)",
    )
    progress_rerank = Column(
        Integer,
        default=0,
        comment="Reranker重排序进度(0-100)",
    )
    progress_save_results = Column(
        Integer,
        default=0,
        comment="保存结果进度(0-100)",
    )

    # ---- bookkeeping ------------------------------------------------------
    # Timestamps are filled by _now_beijing on insert (and on update for
    # updated_time).
    created_time = Column(DateTime, default=_now_beijing, comment="创建时间")
    updated_time = Column(
        DateTime,
        default=_now_beijing,
        onupdate=_now_beijing,
        comment="更新时间",
    )
    # NOTE(review): the other tables in this file declare is_delete with
    # nullable=False and index=True; neither flag column here sets them.
    # Confirm whether that difference is intentional.
    is_archived = Column(
        Integer,
        default=0,
        comment="是否归档:0=未归档,1=已归档",
    )
    is_delete = Column(
        Integer,
        default=0,
        comment="是否删除:0=未删除,1=已删除",
    )

    # ---- relationships ----------------------------------------------------
    # foreign_keys is spelled out because two columns point at vector_dataset.
    source_dataset = relationship(
        "VectorDataset",
        foreign_keys=[source_dataset_id],
    )
    target_dataset = relationship(
        "VectorDataset",
        foreign_keys=[target_dataset_id],
    )
    results = relationship("MatchResult", back_populates="task")
class VectorDataset(Base):
    """ORM model for the ``vector_dataset`` table.

    The table comment describes it as holding uploaded or logical datasets.
    The ``comment=`` strings are emitted into the database schema and are
    kept verbatim.
    """

    __tablename__ = "vector_dataset"
    __table_args__ = {"comment": "向量数据集表(上传或逻辑数据集)"}

    # ---- identity and ownership -------------------------------------------
    id = Column(Integer, primary_key=True, autoincrement=True, comment="主键ID")
    # Optional reference to the owning vector_match_task row.
    task_id = Column(
        Integer,
        ForeignKey("vector_match_task.id"),
        nullable=True,
        comment="所属任务ID",
    )

    # ---- descriptive fields -----------------------------------------------
    name = Column(String(255), nullable=False, comment="数据集名称")
    file_name = Column(String(255), nullable=True, comment="上传文件名")
    # Excel worksheet name.
    sheet_name = Column(String(100), nullable=True, comment="Excel工作表名")
    # Dataset role: source / target (target candidates).
    dataset_role = Column(
        String(20),
        nullable=False,
        comment="数据集角色:source(源)/target(目标候选)",
    )
    # Data scope: task / history / standard.
    data_scope = Column(
        String(20),
        default="task",
        comment="数据范围:task/history/standard",
    )
    # List of fields used for vectorisation, stored as JSON text.
    vector_fields = Column(
        Text,
        nullable=True,
        comment="参与向量化的字段列表(JSON)",
    )
    row_count = Column(Integer, default=0, comment="数据行数")

    # ---- bookkeeping ------------------------------------------------------
    # Soft-delete flag: 0 = active, 1 = deleted.
    is_delete = Column(
        Integer,
        default=0,
        nullable=False,
        index=True,
        comment="软删除标记:0=有效,1=已删除",
    )
    created_time = Column(DateTime, default=_now_beijing, comment="创建时间")
    updated_time = Column(
        DateTime,
        default=_now_beijing,
        onupdate=_now_beijing,
        comment="更新时间",
    )

    # ---- relationships ----------------------------------------------------
    rows = relationship("VectorDataRow", back_populates="dataset")
class VectorDataRow(Base):
    """ORM model for the ``vector_data_row`` table.

    One row per data line of a dataset; the table comment gives a single
    material or declaration item as examples. The ``comment=`` strings are
    emitted into the database schema and are kept verbatim.
    """

    __tablename__ = "vector_data_row"
    __table_args__ = {"comment": "向量数据行表(单行物料/申报项等)"}

    # ---- identity and ownership -------------------------------------------
    id = Column(Integer, primary_key=True, autoincrement=True, comment="主键ID")
    dataset_id = Column(
        Integer,
        ForeignKey("vector_dataset.id"),
        nullable=False,
        index=True,
        comment="所属数据集ID",
    )
    # NOTE(review): indexed plain integer with no ForeignKey, unlike
    # dataset_id above. Confirm this is deliberate.
    task_id = Column(Integer, nullable=True, index=True, comment="所属任务ID")
    # Dataset role: source / target.
    dataset_role = Column(
        String(20),
        nullable=False,
        index=True,
        comment="数据集角色:source/target",
    )
    # Data scope: task / history / standard.
    data_scope = Column(
        String(20),
        default="task",
        index=True,
        comment="数据范围:task/history/standard",
    )

    # ---- row content ------------------------------------------------------
    # Row number within the Excel sheet.
    row_number = Column(Integer, nullable=False, comment="Excel中的行号")
    # Concatenated raw text.
    raw_text = Column(Text, nullable=False, comment="拼接后的原始文本")
    # SHA256 hash of the text, used for vector reuse (per the schema comment).
    text_hash = Column(
        String(64),
        nullable=True,
        index=True,
        comment="文本SHA256哈希,用于向量复用",
    )
    # Individual field values stored as JSON text.
    field_values = Column(Text, nullable=True, comment="各字段值(JSON)")

    # ---- bookkeeping ------------------------------------------------------
    # Soft-delete flag: 0 = active, 1 = deleted.
    is_delete = Column(
        Integer,
        default=0,
        nullable=False,
        index=True,
        comment="软删除标记:0=有效,1=已删除",
    )
    created_time = Column(DateTime, default=_now_beijing, comment="创建时间")
    updated_time = Column(
        DateTime,
        default=_now_beijing,
        onupdate=_now_beijing,
        comment="更新时间",
    )

    # ---- relationships ----------------------------------------------------
    dataset = relationship("VectorDataset", back_populates="rows")
    # uselist=False: the attribute is a single VectorEmbedding, not a list.
    embedding = relationship(
        "VectorEmbedding",
        back_populates="data_row",
        uselist=False,
    )
class VectorEmbedding(Base):
    """ORM model for the ``vector_embedding`` table.

    Stores the binary embedding for a data row; ``data_row_id`` is unique,
    so each data row has at most one embedding row. The ``comment=``
    strings are emitted into the database schema and are kept verbatim.
    """

    __tablename__ = "vector_embedding"
    __table_args__ = {"comment": "向量嵌入表(与数据行一对一)"}

    # ---- identity and ownership -------------------------------------------
    id = Column(Integer, primary_key=True, autoincrement=True, comment="主键ID")
    # Unique reference to vector_data_row.id.
    data_row_id = Column(
        Integer,
        ForeignKey("vector_data_row.id"),
        unique=True,
        nullable=False,
        index=True,
        comment="关联 vector_data_row.id",
    )

    # ---- embedding payload ------------------------------------------------
    # Text hash; the schema comment says it matches the row's hash.
    text_hash = Column(
        String(64),
        nullable=False,
        index=True,
        comment="与行一致的文本哈希",
    )
    # Binary storage of a float32 array (per the schema comment).
    embedding = Column(
        LargeBinary(length=65536),
        nullable=False,
        comment="float32 数组二进制存储",
    )
    # Name of the model that produced the vector.
    model_name = Column(String(100), nullable=True, comment="生成向量所用模型名")
    # Vector dimension.
    dimension = Column(Integer, nullable=True, comment="向量维度")

    # ---- bookkeeping ------------------------------------------------------
    # Soft-delete flag: 0 = active, 1 = deleted.
    is_delete = Column(
        Integer,
        default=0,
        nullable=False,
        index=True,
        comment="软删除标记:0=有效,1=已删除",
    )
    created_time = Column(DateTime, default=_now_beijing, comment="创建时间")
    updated_time = Column(
        DateTime,
        default=_now_beijing,
        onupdate=_now_beijing,
        comment="更新时间",
    )

    # ---- relationships ----------------------------------------------------
    data_row = relationship("VectorDataRow", back_populates="embedding")
class MatchResult(Base):
    """ORM model for the ``match_result`` table.

    Links a source row to a candidate row within a task and records the
    scores and ranks of that pairing. The ``comment=`` strings are emitted
    into the database schema and are kept verbatim.
    """

    __tablename__ = "match_result"
    __table_args__ = {"comment": "匹配结果表(源行与候选的关联及得分)"}

    # ---- identity and ownership -------------------------------------------
    id = Column(Integer, primary_key=True, autoincrement=True, comment="主键ID")
    task_id = Column(
        Integer,
        ForeignKey("vector_match_task.id"),
        nullable=False,
        index=True,
        comment="所属任务ID",
    )
    # Both row columns reference vector_data_row.id.
    source_row_id = Column(
        Integer,
        ForeignKey("vector_data_row.id"),
        nullable=False,
        comment="源数据行ID",
    )
    target_row_id = Column(
        Integer,
        ForeignKey("vector_data_row.id"),
        nullable=False,
        comment="目标候选行ID",
    )

    # ---- scores and ranks -------------------------------------------------
    # Cosine similarity score (0-1 per the schema comment).
    similarity_score = Column(
        Float,
        nullable=False,
        comment="余弦相似度分数(0-1)",
    )
    # Reranker score; higher means more relevant (per the schema comment).
    rerank_score = Column(
        Float,
        nullable=True,
        comment="Reranker精排分数,越高越相关",
    )
    # Rank, where 1 is the most similar.
    rank = Column(Integer, nullable=False, comment="排名(1=最相似)")
    # Rank after reranking.
    rerank_rank = Column(Integer, nullable=True, comment="Reranker重排后的排名")

    # ---- classification ---------------------------------------------------
    # Scope the candidate came from.
    candidate_scope = Column(String(50), nullable=True, comment="候选来源范围")
    # Match level: high / possible / low_confidence / no_match.
    match_level = Column(
        String(20),
        nullable=True,
        comment="匹配等级:high/possible/low_confidence/no_match",
    )
    # Manual confirmation: 0 = unconfirmed, 1 = confirmed, -1 = ignored.
    is_confirmed = Column(
        Integer,
        default=0,
        comment="是否已人工确认:0=未确认,1=已确认,-1=已忽略",
    )

    # ---- bookkeeping ------------------------------------------------------
    # Soft-delete flag: 0 = active, 1 = deleted.
    is_delete = Column(
        Integer,
        default=0,
        nullable=False,
        index=True,
        comment="软删除标记:0=有效,1=已删除",
    )
    created_time = Column(DateTime, default=_now_beijing, comment="创建时间")
    updated_time = Column(
        DateTime,
        default=_now_beijing,
        onupdate=_now_beijing,
        comment="更新时间",
    )

    # ---- relationships ----------------------------------------------------
    task = relationship("VectorMatchTask", back_populates="results")
    # foreign_keys is spelled out because two columns point at vector_data_row.
    source_row = relationship("VectorDataRow", foreign_keys=[source_row_id])
    target_row = relationship("VectorDataRow", foreign_keys=[target_row_id])
|