Spaces:
Running
Running
| import datetime | |
| from sqlalchemy import Column, Integer, String, Float, DateTime, Text, LargeBinary, ForeignKey | |
| from sqlalchemy.orm import relationship | |
| from database import Base | |
# Fixed UTC+8 offset used for all timestamps in this module.
_TZ_BEIJING = datetime.timezone(datetime.timedelta(hours=8))


def _now_beijing() -> datetime.datetime:
    """Return the current UTC+8 wall-clock time as a naive datetime.

    The time is first taken in the fixed UTC+8 zone and the tzinfo is then
    stripped, so the returned value carries no offset information.
    """
    aware_now = datetime.datetime.now(tz=_TZ_BEIJING)
    return aware_now.replace(tzinfo=None)
class VectorMatchTask(Base):
    """ORM model for the ``vector_match_task`` table (vector matching tasks).

    Each row holds a task's configuration, its result counters, per-stage
    progress values (0-100) and audit/flag columns.
    """

    __tablename__ = "vector_match_task"
    __table_args__ = {"comment": "向量匹配任务表"}

    # --- Identity -----------------------------------------------------------
    id = Column(Integer, primary_key=True, autoincrement=True, comment="主键ID")
    # Task number, format YYYYMMDDHHMMSSmmm.
    task_code = Column(
        String(30),
        unique=True,
        nullable=False,
        index=True,
        comment="任务编号,格式:YYYYMMDDHHMMSSmmm",
    )

    # --- Matching configuration --------------------------------------------
    # Match mode: two_file / history / standard.
    match_mode = Column(
        String(50),
        nullable=False,
        default="two_file",
        comment="匹配模式:two_file/history/standard",
    )
    # Candidate scope: current_task_target / history / standard.
    candidate_scope = Column(
        String(50),
        nullable=False,
        default="current_task_target",
        comment="候选范围:current_task_target/history/standard",
    )
    source_dataset_id = Column(
        Integer,
        ForeignKey("vector_dataset.id"),
        nullable=True,
        comment="源数据集ID",
    )
    target_dataset_id = Column(
        Integer,
        ForeignKey("vector_dataset.id"),
        nullable=True,
        comment="目标候选集ID",
    )
    # Number of Top-K candidates kept per source row.
    top_k = Column(Integer, default=10, comment="每条源数据保留的Top-K候选数")
    # Number of Top-K candidates kept after reranking.
    rerank_top_k = Column(Integer, default=3, comment="Reranker重排序后保留的Top-K数")
    # Minimum similarity threshold.
    min_threshold = Column(Float, default=0.70, comment="最低相似度阈值")

    # --- Status and counters -----------------------------------------------
    # Task status: pending / running / completed / failed.
    status = Column(
        String(20),
        default="pending",
        comment="任务状态:pending/running/completed/failed",
    )
    source_row_count = Column(Integer, default=0, comment="源数据行数")
    target_row_count = Column(Integer, default=0, comment="目标候选行数")
    # Count of high matches (score >= 0.90).
    high_match_count = Column(Integer, default=0, comment="高度匹配数量(score>=0.90)")
    # Count of low-confidence matches (score < 0.70).
    low_confidence_count = Column(Integer, default=0, comment="低置信数量(score<0.70)")
    # Vectors reused via text_hash versus vectors newly generated.
    reused_vectors = Column(Integer, default=0, comment="通过text_hash复用的向量数")
    new_vectors = Column(Integer, default=0, comment="新生成的向量数")

    # --- Per-stage progress (0-100) ----------------------------------------
    progress_parse_source = Column(Integer, default=0, comment="解析源数据集进度(0-100)")
    progress_parse_target = Column(Integer, default=0, comment="解析目标候选集进度(0-100)")
    progress_vectorize = Column(Integer, default=0, comment="向量化进度(0-100)")
    progress_load_candidates = Column(Integer, default=0, comment="加载候选范围进度(0-100)")
    progress_similarity = Column(Integer, default=0, comment="相似度计算进度(0-100)")
    progress_rerank = Column(Integer, default=0, comment="Reranker重排序进度(0-100)")
    progress_save_results = Column(Integer, default=0, comment="保存结果进度(0-100)")

    # --- Audit columns and flags -------------------------------------------
    created_time = Column(DateTime, default=_now_beijing, comment="创建时间")
    updated_time = Column(
        DateTime,
        default=_now_beijing,
        onupdate=_now_beijing,
        comment="更新时间",
    )
    # Archive flag: 0 = not archived, 1 = archived.
    is_archived = Column(Integer, default=0, comment="是否归档:0=未归档,1=已归档")
    # Delete flag: 0 = not deleted, 1 = deleted.
    is_delete = Column(Integer, default=0, comment="是否删除:0=未删除,1=已删除")

    # --- Relationships -----------------------------------------------------
    # Two foreign keys point at vector_dataset, so each relationship names the
    # column it joins on.
    source_dataset = relationship("VectorDataset", foreign_keys=[source_dataset_id])
    target_dataset = relationship("VectorDataset", foreign_keys=[target_dataset_id])
    results = relationship("MatchResult", back_populates="task")
class VectorDataset(Base):
    """ORM model for the ``vector_dataset`` table (uploaded or logical datasets).

    A dataset may be attached to a task through ``task_id`` and owns the
    ``VectorDataRow`` records exposed through ``rows``.
    """

    __tablename__ = "vector_dataset"
    __table_args__ = {"comment": "向量数据集表(上传或逻辑数据集)"}

    id = Column(Integer, primary_key=True, autoincrement=True, comment="主键ID")
    # vector_match_task also holds foreign keys to vector_dataset.id, so the
    # two tables depend on each other. Naming this constraint and marking it
    # use_alter lets create_all()/drop_all() add and remove it via ALTER
    # instead of failing to order the tables.
    task_id = Column(
        Integer,
        ForeignKey(
            "vector_match_task.id",
            name="fk_vector_dataset_task_id",
            use_alter=True,
        ),
        nullable=True,
        comment="所属任务ID",
    )
    name = Column(String(255), nullable=False, comment="数据集名称")
    # Uploaded file name and Excel sheet name.
    file_name = Column(String(255), nullable=True, comment="上传文件名")
    sheet_name = Column(String(100), nullable=True, comment="Excel工作表名")
    # Dataset role: source / target (candidate).
    dataset_role = Column(
        String(20),
        nullable=False,
        comment="数据集角色:source(源)/target(目标候选)",
    )
    # Data scope: task / history / standard.
    data_scope = Column(String(20), default="task", comment="数据范围:task/history/standard")
    # List of fields used for vectorization, stored as JSON text.
    vector_fields = Column(Text, nullable=True, comment="参与向量化的字段列表(JSON)")
    row_count = Column(Integer, default=0, comment="数据行数")
    # Soft-delete flag: 0 = active, 1 = deleted.
    is_delete = Column(
        Integer,
        default=0,
        nullable=False,
        index=True,
        comment="软删除标记:0=有效,1=已删除",
    )
    created_time = Column(DateTime, default=_now_beijing, comment="创建时间")
    updated_time = Column(
        DateTime,
        default=_now_beijing,
        onupdate=_now_beijing,
        comment="更新时间",
    )

    rows = relationship("VectorDataRow", back_populates="dataset")
class VectorDataRow(Base):
    """ORM model for the ``vector_data_row`` table (one data row per record).

    Stores the concatenated text of a row, its hash and the raw field values,
    and links to the owning dataset and to at most one embedding.
    """

    __tablename__ = "vector_data_row"
    __table_args__ = {"comment": "向量数据行表(单行物料/申报项等)"}

    id = Column(Integer, primary_key=True, autoincrement=True, comment="主键ID")
    dataset_id = Column(
        Integer,
        ForeignKey("vector_dataset.id"),
        nullable=False,
        index=True,
        comment="所属数据集ID",
    )
    # Plain indexed integer; no foreign-key constraint is declared here.
    task_id = Column(Integer, nullable=True, index=True, comment="所属任务ID")
    # Dataset role: source / target.
    dataset_role = Column(
        String(20),
        nullable=False,
        index=True,
        comment="数据集角色:source/target",
    )
    # Data scope: task / history / standard.
    data_scope = Column(
        String(20),
        default="task",
        index=True,
        comment="数据范围:task/history/standard",
    )
    # Row number in the Excel sheet.
    row_number = Column(Integer, nullable=False, comment="Excel中的行号")
    # Concatenated raw text of the row.
    raw_text = Column(Text, nullable=False, comment="拼接后的原始文本")
    # SHA256 hash of the text, used for vector reuse.
    text_hash = Column(
        String(64),
        nullable=True,
        index=True,
        comment="文本SHA256哈希,用于向量复用",
    )
    # Individual field values, stored as JSON text.
    field_values = Column(Text, nullable=True, comment="各字段值(JSON)")
    # Soft-delete flag: 0 = active, 1 = deleted.
    is_delete = Column(
        Integer,
        default=0,
        nullable=False,
        index=True,
        comment="软删除标记:0=有效,1=已删除",
    )
    created_time = Column(DateTime, default=_now_beijing, comment="创建时间")
    updated_time = Column(
        DateTime,
        default=_now_beijing,
        onupdate=_now_beijing,
        comment="更新时间",
    )

    dataset = relationship("VectorDataset", back_populates="rows")
    # uselist=False exposes the embedding as a single object, not a list.
    embedding = relationship(
        "VectorEmbedding",
        back_populates="data_row",
        uselist=False,
    )
class VectorEmbedding(Base):
    """ORM model for the ``vector_embedding`` table.

    Holds the binary embedding for a data row; ``data_row_id`` is unique, so
    each data row has at most one embedding record.
    """

    __tablename__ = "vector_embedding"
    __table_args__ = {"comment": "向量嵌入表(与数据行一对一)"}

    id = Column(Integer, primary_key=True, autoincrement=True, comment="主键ID")
    # References vector_data_row.id; unique enforces the one-to-one link.
    data_row_id = Column(
        Integer,
        ForeignKey("vector_data_row.id"),
        unique=True,
        nullable=False,
        index=True,
        comment="关联 vector_data_row.id",
    )
    # Text hash, the same value as on the data row.
    text_hash = Column(
        String(64),
        nullable=False,
        index=True,
        comment="与行一致的文本哈希",
    )
    # float32 array stored as raw binary.
    embedding = Column(
        LargeBinary(length=65536),
        nullable=False,
        comment="float32 数组二进制存储",
    )
    # Name of the model that produced the vector, and its dimension.
    model_name = Column(String(100), nullable=True, comment="生成向量所用模型名")
    dimension = Column(Integer, nullable=True, comment="向量维度")
    # Soft-delete flag: 0 = active, 1 = deleted.
    is_delete = Column(
        Integer,
        default=0,
        nullable=False,
        index=True,
        comment="软删除标记:0=有效,1=已删除",
    )
    created_time = Column(DateTime, default=_now_beijing, comment="创建时间")
    updated_time = Column(
        DateTime,
        default=_now_beijing,
        onupdate=_now_beijing,
        comment="更新时间",
    )

    data_row = relationship("VectorDataRow", back_populates="embedding")
class MatchResult(Base):
    """ORM model for the ``match_result`` table.

    Links a source row to a candidate target row within a task and records
    the similarity score, optional rerank score, ranks and review state.
    """

    __tablename__ = "match_result"
    __table_args__ = {"comment": "匹配结果表(源行与候选的关联及得分)"}

    id = Column(Integer, primary_key=True, autoincrement=True, comment="主键ID")
    task_id = Column(
        Integer,
        ForeignKey("vector_match_task.id"),
        nullable=False,
        index=True,
        comment="所属任务ID",
    )
    source_row_id = Column(
        Integer,
        ForeignKey("vector_data_row.id"),
        nullable=False,
        comment="源数据行ID",
    )
    target_row_id = Column(
        Integer,
        ForeignKey("vector_data_row.id"),
        nullable=False,
        comment="目标候选行ID",
    )

    # --- Scores and ranks --------------------------------------------------
    # Cosine similarity score (0-1).
    similarity_score = Column(Float, nullable=False, comment="余弦相似度分数(0-1)")
    # Reranker score; higher means more relevant.
    rerank_score = Column(Float, nullable=True, comment="Reranker精排分数,越高越相关")
    # Rank, where 1 is the most similar.
    rank = Column(Integer, nullable=False, comment="排名(1=最相似)")
    # Rank after reranking.
    rerank_rank = Column(Integer, nullable=True, comment="Reranker重排后的排名")

    # --- Classification and review state -----------------------------------
    candidate_scope = Column(String(50), nullable=True, comment="候选来源范围")
    # Match level: high / possible / low_confidence / no_match.
    match_level = Column(
        String(20),
        nullable=True,
        comment="匹配等级:high/possible/low_confidence/no_match",
    )
    # Manual confirmation: 0 = unconfirmed, 1 = confirmed, -1 = ignored.
    is_confirmed = Column(
        Integer,
        default=0,
        comment="是否已人工确认:0=未确认,1=已确认,-1=已忽略",
    )
    # Soft-delete flag: 0 = active, 1 = deleted.
    is_delete = Column(
        Integer,
        default=0,
        nullable=False,
        index=True,
        comment="软删除标记:0=有效,1=已删除",
    )
    created_time = Column(DateTime, default=_now_beijing, comment="创建时间")
    updated_time = Column(
        DateTime,
        default=_now_beijing,
        onupdate=_now_beijing,
        comment="更新时间",
    )

    # --- Relationships -----------------------------------------------------
    task = relationship("VectorMatchTask", back_populates="results")
    # Two foreign keys point at vector_data_row, so each relationship names
    # the column it joins on.
    source_row = relationship("VectorDataRow", foreign_keys=[source_row_id])
    target_row = relationship("VectorDataRow", foreign_keys=[target_row_id])