# ORM models for the vector-matching feature: tasks, datasets, data rows,
# embeddings and match results.

import datetime

from sqlalchemy import (
    Column,
    Integer,
    String,
    Float,
    DateTime,
    Text,
    LargeBinary,
    ForeignKey,
)
from sqlalchemy.orm import relationship

from database import Base

# Fixed UTC+8 offset used for every timestamp default in this module.
_TZ_BEIJING = datetime.timezone(datetime.timedelta(hours=8))


def _now_beijing():
    """Return the current UTC+8 wall-clock time as a naive ``datetime``."""
    aware_now = datetime.datetime.now(tz=_TZ_BEIJING)
    # The tzinfo is dropped so the DateTime columns receive a naive value.
    return aware_now.replace(tzinfo=None)


class VectorMatchTask(Base):
    """Row of the ``vector_match_task`` table: one vector-matching task."""

    __tablename__ = "vector_match_task"
    __table_args__ = {"comment": "向量匹配任务表"}

    # Surrogate primary key.
    id = Column(Integer, primary_key=True, autoincrement=True, comment="主键ID")
    # Task number, format YYYYMMDDHHMMSSmmm.
    task_code = Column(
        String(30),
        unique=True,
        nullable=False,
        index=True,
        comment="任务编号,格式:YYYYMMDDHHMMSSmmm",
    )
    # Matching mode: two_file / history / standard.
    match_mode = Column(
        String(50),
        nullable=False,
        default="two_file",
        comment="匹配模式:two_file/history/standard",
    )
    # Candidate scope: current_task_target / history / standard.
    candidate_scope = Column(
        String(50),
        nullable=False,
        default="current_task_target",
        comment="候选范围:current_task_target/history/standard",
    )

    # NOTE(review): vector_match_task and vector_dataset reference each other
    # through foreign keys (these two columns and VectorDataset.task_id).
    # Verify that table create/drop ordering works on the target database;
    # SQLAlchemy may need use_alter / named constraints for such cycles.
    source_dataset_id = Column(
        Integer,
        ForeignKey("vector_dataset.id"),
        nullable=True,
        comment="源数据集ID",
    )
    target_dataset_id = Column(
        Integer,
        ForeignKey("vector_dataset.id"),
        nullable=True,
        comment="目标候选集ID",
    )

    # Matching parameters.
    top_k = Column(Integer, default=10, comment="每条源数据保留的Top-K候选数")
    rerank_top_k = Column(Integer, default=3, comment="Reranker重排序后保留的Top-K数")
    min_threshold = Column(Float, default=0.70, comment="最低相似度阈值")

    # Task state: pending / running / completed / failed.
    status = Column(
        String(20),
        default="pending",
        comment="任务状态:pending/running/completed/failed",
    )

    # Counters.
    source_row_count = Column(Integer, default=0, comment="源数据行数")
    target_row_count = Column(Integer, default=0, comment="目标候选行数")
    high_match_count = Column(Integer, default=0, comment="高度匹配数量(score>=0.90)")
    low_confidence_count = Column(Integer, default=0, comment="低置信数量(score<0.70)")
    reused_vectors = Column(Integer, default=0, comment="通过text_hash复用的向量数")
    new_vectors = Column(Integer, default=0, comment="新生成的向量数")

    # Per-stage progress, each documented as 0-100.
    progress_parse_source = Column(Integer, default=0, comment="解析源数据集进度(0-100)")
    progress_parse_target = Column(Integer, default=0, comment="解析目标候选集进度(0-100)")
    progress_vectorize = Column(Integer, default=0, comment="向量化进度(0-100)")
    progress_load_candidates = Column(Integer, default=0, comment="加载候选范围进度(0-100)")
    progress_similarity = Column(Integer, default=0, comment="相似度计算进度(0-100)")
    progress_rerank = Column(Integer, default=0, comment="Reranker重排序进度(0-100)")
    progress_save_results = Column(Integer, default=0, comment="保存结果进度(0-100)")

    # Creation / last-update timestamps, filled by _now_beijing.
    created_time = Column(DateTime, default=_now_beijing, comment="创建时间")
    updated_time = Column(
        DateTime,
        default=_now_beijing,
        onupdate=_now_beijing,
        comment="更新时间",
    )

    # Integer flags: 0 = no, 1 = yes.
    is_archived = Column(Integer, default=0, comment="是否归档:0=未归档,1=已归档")
    is_delete = Column(Integer, default=0, comment="是否删除:0=未删除,1=已删除")

    # Datasets referenced by source_dataset_id / target_dataset_id; the
    # explicit foreign_keys tell the two links to VectorDataset apart.
    source_dataset = relationship("VectorDataset", foreign_keys=[source_dataset_id])
    target_dataset = relationship("VectorDataset", foreign_keys=[target_dataset_id])
    # Match results of this task (paired with MatchResult.task).
    results = relationship("MatchResult", back_populates="task")


class VectorDataset(Base):
    """Row of the ``vector_dataset`` table: an uploaded or logical dataset."""

    __tablename__ = "vector_dataset"
    __table_args__ = {"comment": "向量数据集表(上传或逻辑数据集)"}

    # Surrogate primary key.
    id = Column(Integer, primary_key=True, autoincrement=True, comment="主键ID")
    # Owning task (optional).
    task_id = Column(
        Integer,
        ForeignKey("vector_match_task.id"),
        nullable=True,
        comment="所属任务ID",
    )
    # Dataset name, uploaded file name and Excel sheet name.
    name = Column(String(255), nullable=False, comment="数据集名称")
    file_name = Column(String(255), nullable=True, comment="上传文件名")
    sheet_name = Column(String(100), nullable=True, comment="Excel工作表名")
    # Dataset role: source / target (candidate).
    dataset_role = Column(
        String(20),
        nullable=False,
        comment="数据集角色:source(源)/target(目标候选)",
    )
    # Data scope: task / history / standard.
    data_scope = Column(
        String(20),
        default="task",
        comment="数据范围:task/history/standard",
    )
    # List of fields taking part in vectorization, stored as JSON text.
    vector_fields = Column(Text, nullable=True, comment="参与向量化的字段列表(JSON)")
    # Number of data rows.
    row_count = Column(Integer, default=0, comment="数据行数")
    # Soft-delete marker: 0 = active, 1 = deleted.
    is_delete = Column(
        Integer,
        default=0,
        nullable=False,
        index=True,
        comment="软删除标记:0=有效,1=已删除",
    )
    # Creation / last-update timestamps, filled by _now_beijing.
    created_time = Column(DateTime, default=_now_beijing, comment="创建时间")
    updated_time = Column(
        DateTime,
        default=_now_beijing,
        onupdate=_now_beijing,
        comment="更新时间",
    )

    # Data rows of this dataset (paired with VectorDataRow.dataset).
    rows = relationship("VectorDataRow", back_populates="dataset")


class VectorDataRow(Base):
    """Row of the ``vector_data_row`` table: a single data row of a dataset."""

    __tablename__ = "vector_data_row"
    __table_args__ = {"comment": "向量数据行表(单行物料/申报项等)"}

    # Surrogate primary key.
    id = Column(Integer, primary_key=True, autoincrement=True, comment="主键ID")
    # Owning dataset.
    dataset_id = Column(
        Integer,
        ForeignKey("vector_dataset.id"),
        nullable=False,
        index=True,
        comment="所属数据集ID",
    )
    # Owning task id; plain indexed integer, no foreign key is declared here.
    task_id = Column(Integer, nullable=True, index=True, comment="所属任务ID")
    # Dataset role: source / target.
    dataset_role = Column(
        String(20),
        nullable=False,
        index=True,
        comment="数据集角色:source/target",
    )
    # Data scope: task / history / standard.
    data_scope = Column(
        String(20),
        default="task",
        index=True,
        comment="数据范围:task/history/standard",
    )
    # Row number in the Excel sheet.
    row_number = Column(Integer, nullable=False, comment="Excel中的行号")
    # Concatenated raw text of the row.
    raw_text = Column(Text, nullable=False, comment="拼接后的原始文本")
    # SHA256 hash of the text, used for vector reuse.
    text_hash = Column(
        String(64),
        nullable=True,
        index=True,
        comment="文本SHA256哈希,用于向量复用",
    )
    # Individual field values, stored as JSON text.
    field_values = Column(Text, nullable=True, comment="各字段值(JSON)")
    # Soft-delete marker: 0 = active, 1 = deleted.
    is_delete = Column(
        Integer,
        default=0,
        nullable=False,
        index=True,
        comment="软删除标记:0=有效,1=已删除",
    )
    # Creation / last-update timestamps, filled by _now_beijing.
    created_time = Column(DateTime, default=_now_beijing, comment="创建时间")
    updated_time = Column(
        DateTime,
        default=_now_beijing,
        onupdate=_now_beijing,
        comment="更新时间",
    )

    # Owning dataset (paired with VectorDataset.rows).
    dataset = relationship("VectorDataset", back_populates="rows")
    # Scalar (uselist=False) link to this row's embedding.
    embedding = relationship(
        "VectorEmbedding",
        back_populates="data_row",
        uselist=False,
    )


class VectorEmbedding(Base):
    """Row of the ``vector_embedding`` table: the embedding of one data row."""

    __tablename__ = "vector_embedding"
    __table_args__ = {"comment": "向量嵌入表(与数据行一对一)"}

    # Surrogate primary key.
    id = Column(Integer, primary_key=True, autoincrement=True, comment="主键ID")
    # Referenced vector_data_row.id; unique, so at most one embedding per row.
    data_row_id = Column(
        Integer,
        ForeignKey("vector_data_row.id"),
        unique=True,
        nullable=False,
        index=True,
        comment="关联 vector_data_row.id",
    )
    # Text hash matching the one on the data row.
    text_hash = Column(
        String(64),
        nullable=False,
        index=True,
        comment="与行一致的文本哈希",
    )
    # Binary storage of a float32 array.
    embedding = Column(
        LargeBinary(length=65536),
        nullable=False,
        comment="float32 数组二进制存储",
    )
    # Name of the model that produced the vector.
    model_name = Column(String(100), nullable=True, comment="生成向量所用模型名")
    # Vector dimension.
    dimension = Column(Integer, nullable=True, comment="向量维度")
    # Soft-delete marker: 0 = active, 1 = deleted.
    is_delete = Column(
        Integer,
        default=0,
        nullable=False,
        index=True,
        comment="软删除标记:0=有效,1=已删除",
    )
    # Creation / last-update timestamps, filled by _now_beijing.
    created_time = Column(DateTime, default=_now_beijing, comment="创建时间")
    updated_time = Column(
        DateTime,
        default=_now_beijing,
        onupdate=_now_beijing,
        comment="更新时间",
    )

    # Data row this embedding belongs to (paired with VectorDataRow.embedding).
    data_row = relationship("VectorDataRow", back_populates="embedding")


class MatchResult(Base):
    """Row of the ``match_result`` table: a source/candidate pair and its scores."""

    __tablename__ = "match_result"
    __table_args__ = {"comment": "匹配结果表(源行与候选的关联及得分)"}

    # Surrogate primary key.
    id = Column(Integer, primary_key=True, autoincrement=True, comment="主键ID")
    # Owning task.
    task_id = Column(
        Integer,
        ForeignKey("vector_match_task.id"),
        nullable=False,
        index=True,
        comment="所属任务ID",
    )
    # Source data row and target candidate row.
    source_row_id = Column(
        Integer,
        ForeignKey("vector_data_row.id"),
        nullable=False,
        comment="源数据行ID",
    )
    target_row_id = Column(
        Integer,
        ForeignKey("vector_data_row.id"),
        nullable=False,
        comment="目标候选行ID",
    )
    # Cosine similarity score (0-1).
    similarity_score = Column(Float, nullable=False, comment="余弦相似度分数(0-1)")
    # Reranker score; higher means more relevant.
    rerank_score = Column(Float, nullable=True, comment="Reranker精排分数,越高越相关")
    # Rank (1 = most similar).
    rank = Column(Integer, nullable=False, comment="排名(1=最相似)")
    # Rank after reranking.
    rerank_rank = Column(Integer, nullable=True, comment="Reranker重排后的排名")
    # Scope the candidate came from.
    candidate_scope = Column(String(50), nullable=True, comment="候选来源范围")
    # Match level: high / possible / low_confidence / no_match.
    match_level = Column(
        String(20),
        nullable=True,
        comment="匹配等级:high/possible/low_confidence/no_match",
    )
    # Manual confirmation: 0 = unconfirmed, 1 = confirmed, -1 = ignored.
    is_confirmed = Column(
        Integer,
        default=0,
        comment="是否已人工确认:0=未确认,1=已确认,-1=已忽略",
    )
    # Soft-delete marker: 0 = active, 1 = deleted.
    is_delete = Column(
        Integer,
        default=0,
        nullable=False,
        index=True,
        comment="软删除标记:0=有效,1=已删除",
    )
    # Creation / last-update timestamps, filled by _now_beijing.
    created_time = Column(DateTime, default=_now_beijing, comment="创建时间")
    updated_time = Column(
        DateTime,
        default=_now_beijing,
        onupdate=_now_beijing,
        comment="更新时间",
    )

    # Owning task (paired with VectorMatchTask.results).
    task = relationship("VectorMatchTask", back_populates="results")
    # Rows referenced by source_row_id / target_row_id; the explicit
    # foreign_keys tell the two links to VectorDataRow apart.
    source_row = relationship("VectorDataRow", foreign_keys=[source_row_id])
    target_row = relationship("VectorDataRow", foreign_keys=[target_row_id])