# Hugging Face Space: teryryy/vector-match-api (status: Running)
# File size: 9,072 bytes; revision 010f0b1

import datetime
from sqlalchemy import Column, Integer, String, Float, DateTime, Text, LargeBinary, ForeignKey
from sqlalchemy.orm import relationship
from database import Base

# Fixed UTC+08:00 offset in which `_now_beijing` reads the clock.
_TZ_BEIJING = datetime.timezone(datetime.timedelta(hours=8))


def _now_beijing():
    """Return the current UTC+8 wall-clock time as a naive ``datetime``.

    The clock is read in the fixed UTC+08:00 zone and the ``tzinfo`` is then
    dropped, so the returned value carries no timezone information.
    """
    aware_now = datetime.datetime.now(tz=_TZ_BEIJING)
    return aware_now.replace(tzinfo=None)


class VectorMatchTask(Base):
    """ORM model for the ``vector_match_task`` table (vector matching tasks).

    One row holds a task's matching configuration, its row/result counters,
    a 0-100 progress value per pipeline stage, and archive/delete flags.
    """

    __tablename__ = "vector_match_task"
    __table_args__ = {"comment": "向量匹配任务表"}

    # --- Identity -----------------------------------------------------------
    id = Column(
        Integer, primary_key=True, autoincrement=True,
        comment="主键ID",
    )
    task_code = Column(
        String(30), unique=True, nullable=False, index=True,
        comment="任务编号，格式：YYYYMMDDHHMMSSmmm",
    )

    # --- Matching configuration ---------------------------------------------
    match_mode = Column(
        String(50), nullable=False, default="two_file",
        comment="匹配模式：two_file/history/standard",
    )
    candidate_scope = Column(
        String(50), nullable=False, default="current_task_target",
        comment="候选范围：current_task_target/history/standard",
    )
    source_dataset_id = Column(
        Integer, ForeignKey("vector_dataset.id"), nullable=True,
        comment="源数据集ID",
    )
    target_dataset_id = Column(
        Integer, ForeignKey("vector_dataset.id"), nullable=True,
        comment="目标候选集ID",
    )
    top_k = Column(
        Integer, default=10,
        comment="每条源数据保留的Top-K候选数",
    )
    rerank_top_k = Column(
        Integer, default=3,
        comment="Reranker重排序后保留的Top-K数",
    )
    min_threshold = Column(
        Float, default=0.70,
        comment="最低相似度阈值",
    )
    status = Column(
        String(20), default="pending",
        comment="任务状态：pending/running/completed/failed",
    )

    # --- Row and outcome counters -------------------------------------------
    source_row_count = Column(
        Integer, default=0,
        comment="源数据行数",
    )
    target_row_count = Column(
        Integer, default=0,
        comment="目标候选行数",
    )
    high_match_count = Column(
        Integer, default=0,
        comment="高度匹配数量(score>=0.90)",
    )
    low_confidence_count = Column(
        Integer, default=0,
        comment="低置信数量(score<0.70)",
    )
    reused_vectors = Column(
        Integer, default=0,
        comment="通过text_hash复用的向量数",
    )
    new_vectors = Column(
        Integer, default=0,
        comment="新生成的向量数",
    )

    # --- Per-stage progress, each documented as a 0-100 value ----------------
    progress_parse_source = Column(
        Integer, default=0,
        comment="解析源数据集进度(0-100)",
    )
    progress_parse_target = Column(
        Integer, default=0,
        comment="解析目标候选集进度(0-100)",
    )
    progress_vectorize = Column(
        Integer, default=0,
        comment="向量化进度(0-100)",
    )
    progress_load_candidates = Column(
        Integer, default=0,
        comment="加载候选范围进度(0-100)",
    )
    progress_similarity = Column(
        Integer, default=0,
        comment="相似度计算进度(0-100)",
    )
    progress_rerank = Column(
        Integer, default=0,
        comment="Reranker重排序进度(0-100)",
    )
    progress_save_results = Column(
        Integer, default=0,
        comment="保存结果进度(0-100)",
    )

    # --- Timestamps and flags -----------------------------------------------
    created_time = Column(
        DateTime, default=_now_beijing,
        comment="创建时间",
    )
    updated_time = Column(
        DateTime, default=_now_beijing, onupdate=_now_beijing,
        comment="更新时间",
    )
    is_archived = Column(
        Integer, default=0,
        comment="是否归档：0=未归档,1=已归档",
    )
    is_delete = Column(
        Integer, default=0,
        comment="是否删除：0=未删除,1=已删除",
    )

    # Two foreign keys point at vector_dataset, so each relationship names the
    # column it follows.
    source_dataset = relationship(
        "VectorDataset",
        foreign_keys=[source_dataset_id],
    )
    target_dataset = relationship(
        "VectorDataset",
        foreign_keys=[target_dataset_id],
    )
    results = relationship(
        "MatchResult",
        back_populates="task",
    )


class VectorDataset(Base):
    """ORM model for the ``vector_dataset`` table.

    Per the table comment, a dataset is either an uploaded file or a logical
    dataset; its individual rows are reachable through ``rows``.
    """

    __tablename__ = "vector_dataset"
    __table_args__ = {"comment": "向量数据集表（上传或逻辑数据集）"}

    id = Column(
        Integer, primary_key=True, autoincrement=True,
        comment="主键ID",
    )
    # NOTE(review): this FK and VectorMatchTask.source_dataset_id /
    # target_dataset_id reference each other's tables, i.e. a foreign-key
    # cycle. Confirm metadata create_all()/drop_all() behave as intended on the
    # target database (SQLAlchemy documents use_alter / named constraints for
    # this situation).
    task_id = Column(
        Integer, ForeignKey("vector_match_task.id"), nullable=True,
        comment="所属任务ID",
    )
    name = Column(
        String(255), nullable=False,
        comment="数据集名称",
    )
    file_name = Column(
        String(255), nullable=True,
        comment="上传文件名",
    )
    sheet_name = Column(
        String(100), nullable=True,
        comment="Excel工作表名",
    )
    dataset_role = Column(
        String(20), nullable=False,
        comment="数据集角色：source(源)/target(目标候选)",
    )
    data_scope = Column(
        String(20), default="task",
        comment="数据范围：task/history/standard",
    )
    # Stored as text; the column comment says it holds a JSON field list.
    vector_fields = Column(
        Text, nullable=True,
        comment="参与向量化的字段列表(JSON)",
    )
    row_count = Column(
        Integer, default=0,
        comment="数据行数",
    )
    is_delete = Column(
        Integer, default=0, nullable=False, index=True,
        comment="软删除标记：0=有效，1=已删除",
    )
    created_time = Column(
        DateTime, default=_now_beijing,
        comment="创建时间",
    )
    updated_time = Column(
        DateTime, default=_now_beijing, onupdate=_now_beijing,
        comment="更新时间",
    )

    rows = relationship(
        "VectorDataRow",
        back_populates="dataset",
    )


class VectorDataRow(Base):
    """ORM model for the ``vector_data_row`` table (one row of a dataset).

    Stores the concatenated text of the row, its hash, and the per-field
    values, and links to the owning dataset and to at most one embedding.
    """

    __tablename__ = "vector_data_row"
    __table_args__ = {"comment": "向量数据行表（单行物料/申报项等）"}

    id = Column(
        Integer, primary_key=True, autoincrement=True,
        comment="主键ID",
    )
    dataset_id = Column(
        Integer, ForeignKey("vector_dataset.id"), nullable=False, index=True,
        comment="所属数据集ID",
    )
    # Plain indexed integer: no ForeignKey is declared on this column.
    task_id = Column(
        Integer, nullable=True, index=True,
        comment="所属任务ID",
    )
    dataset_role = Column(
        String(20), nullable=False, index=True,
        comment="数据集角色：source/target",
    )
    data_scope = Column(
        String(20), default="task", index=True,
        comment="数据范围：task/history/standard",
    )
    row_number = Column(
        Integer, nullable=False,
        comment="Excel中的行号",
    )
    raw_text = Column(
        Text, nullable=False,
        comment="拼接后的原始文本",
    )
    # Column comment describes this as a SHA256 of the text, used to reuse vectors.
    text_hash = Column(
        String(64), nullable=True, index=True,
        comment="文本SHA256哈希，用于向量复用",
    )
    field_values = Column(
        Text, nullable=True,
        comment="各字段值(JSON)",
    )
    is_delete = Column(
        Integer, default=0, nullable=False, index=True,
        comment="软删除标记：0=有效，1=已删除",
    )
    created_time = Column(
        DateTime, default=_now_beijing,
        comment="创建时间",
    )
    updated_time = Column(
        DateTime, default=_now_beijing, onupdate=_now_beijing,
        comment="更新时间",
    )

    dataset = relationship(
        "VectorDataset",
        back_populates="rows",
    )
    # uselist=False exposes the embedding as a scalar rather than a list.
    embedding = relationship(
        "VectorEmbedding",
        back_populates="data_row",
        uselist=False,
    )


class VectorEmbedding(Base):
    """ORM model for the ``vector_embedding`` table.

    Each record belongs to exactly one data row (``data_row_id`` is unique)
    and stores the embedding as a binary blob together with its metadata.
    """

    __tablename__ = "vector_embedding"
    __table_args__ = {"comment": "向量嵌入表（与数据行一对一）"}

    id = Column(
        Integer, primary_key=True, autoincrement=True,
        comment="主键ID",
    )
    data_row_id = Column(
        Integer, ForeignKey("vector_data_row.id"),
        unique=True, nullable=False, index=True,
        comment="关联 vector_data_row.id",
    )
    text_hash = Column(
        String(64), nullable=False, index=True,
        comment="与行一致的文本哈希",
    )
    # Column comment describes the payload as a float32 array in binary form.
    embedding = Column(
        LargeBinary(length=65536), nullable=False,
        comment="float32 数组二进制存储",
    )
    model_name = Column(
        String(100), nullable=True,
        comment="生成向量所用模型名",
    )
    dimension = Column(
        Integer, nullable=True,
        comment="向量维度",
    )
    is_delete = Column(
        Integer, default=0, nullable=False, index=True,
        comment="软删除标记：0=有效，1=已删除",
    )
    created_time = Column(
        DateTime, default=_now_beijing,
        comment="创建时间",
    )
    updated_time = Column(
        DateTime, default=_now_beijing, onupdate=_now_beijing,
        comment="更新时间",
    )

    data_row = relationship(
        "VectorDataRow",
        back_populates="embedding",
    )


class MatchResult(Base):
    """ORM model for the ``match_result`` table.

    Links one source row to one candidate (target) row within a task and
    records the similarity score, optional rerank score, ranks, match level
    and manual-confirmation state.
    """

    __tablename__ = "match_result"
    __table_args__ = {"comment": "匹配结果表（源行与候选的关联及得分）"}

    id = Column(
        Integer, primary_key=True, autoincrement=True,
        comment="主键ID",
    )
    task_id = Column(
        Integer, ForeignKey("vector_match_task.id"), nullable=False, index=True,
        comment="所属任务ID",
    )
    source_row_id = Column(
        Integer, ForeignKey("vector_data_row.id"), nullable=False,
        comment="源数据行ID",
    )
    target_row_id = Column(
        Integer, ForeignKey("vector_data_row.id"), nullable=False,
        comment="目标候选行ID",
    )

    # --- Scores and ranks ---------------------------------------------------
    similarity_score = Column(
        Float, nullable=False,
        comment="余弦相似度分数(0-1)",
    )
    rerank_score = Column(
        Float, nullable=True,
        comment="Reranker精排分数，越高越相关",
    )
    rank = Column(
        Integer, nullable=False,
        comment="排名(1=最相似)",
    )
    rerank_rank = Column(
        Integer, nullable=True,
        comment="Reranker重排后的排名",
    )

    # --- Classification and review state ------------------------------------
    candidate_scope = Column(
        String(50), nullable=True,
        comment="候选来源范围",
    )
    match_level = Column(
        String(20), nullable=True,
        comment="匹配等级：high/possible/low_confidence/no_match",
    )
    # Tri-state per the column comment: 0 = unconfirmed, 1 = confirmed, -1 = ignored.
    is_confirmed = Column(
        Integer, default=0,
        comment="是否已人工确认：0=未确认,1=已确认,-1=已忽略",
    )
    is_delete = Column(
        Integer, default=0, nullable=False, index=True,
        comment="软删除标记：0=有效，1=已删除",
    )
    created_time = Column(
        DateTime, default=_now_beijing,
        comment="创建时间",
    )
    updated_time = Column(
        DateTime, default=_now_beijing, onupdate=_now_beijing,
        comment="更新时间",
    )

    task = relationship(
        "VectorMatchTask",
        back_populates="results",
    )
    # Two foreign keys point at vector_data_row, so each relationship names
    # the column it follows.
    source_row = relationship(
        "VectorDataRow",
        foreign_keys=[source_row_id],
    )
    target_row = relationship(
        "VectorDataRow",
        foreign_keys=[target_row_id],
    )