Spaces:

teryryy
/

vector-match-api

Running

App Files Files Community

vector-match-api / hf-vector-match-api /models.py

teryryy

Upload 13 files

010f0b1 verified 2 days ago

raw

history blame contribute delete

9.07 kB

	import datetime
	from sqlalchemy import Column, Integer, String, Float, DateTime, Text, LargeBinary, ForeignKey
	from sqlalchemy.orm import relationship
	from database import Base

	# Fixed UTC+8 offset; consumed by _now_beijing() below.
	_TZ_BEIJING = datetime.timezone(datetime.timedelta(hours=8))


	def _now_beijing():
	    """Return the current UTC+8 wall-clock time as a naive ``datetime``.

	    The timestamp is taken in the ``_TZ_BEIJING`` zone and then stripped of
	    its ``tzinfo``, so the returned value carries no timezone information.
	    """
	    aware_now = datetime.datetime.now(tz=_TZ_BEIJING)
	    # Drop tzinfo while keeping the UTC+8 wall-clock fields unchanged.
	    return aware_now.replace(tzinfo=None)


	class VectorMatchTask(Base):
	    """ORM model mapped to the ``vector_match_task`` table.

	    One row describes a vector matching task: its configuration (mode,
	    candidate scope, Top-K sizes, threshold), its status, result counters,
	    per-stage progress values, and soft-delete / archive flags.
	    """

	    __tablename__ = "vector_match_task"
	    __table_args__ = {"comment": "向量匹配任务表"}

	    # --- Identity -----------------------------------------------------
	    id = Column(
	        Integer, primary_key=True, autoincrement=True, comment="主键ID"
	    )
	    task_code = Column(
	        String(30), unique=True, nullable=False, index=True,
	        comment="任务编号，格式：YYYYMMDDHHMMSSmmm",
	    )

	    # --- Matching configuration ----------------------------------------
	    match_mode = Column(
	        String(50), nullable=False, default="two_file",
	        comment="匹配模式：two_file/history/standard",
	    )
	    candidate_scope = Column(
	        String(50), nullable=False, default="current_task_target",
	        comment="候选范围：current_task_target/history/standard",
	    )
	    # NOTE(review): vector_dataset.task_id points back at this table, so the
	    # two tables reference each other via foreign keys — confirm the
	    # create/drop ordering behaves as intended on the target database.
	    source_dataset_id = Column(
	        Integer, ForeignKey("vector_dataset.id"), nullable=True,
	        comment="源数据集ID",
	    )
	    target_dataset_id = Column(
	        Integer, ForeignKey("vector_dataset.id"), nullable=True,
	        comment="目标候选集ID",
	    )
	    top_k = Column(
	        Integer, default=10, comment="每条源数据保留的Top-K候选数"
	    )
	    rerank_top_k = Column(
	        Integer, default=3, comment="Reranker重排序后保留的Top-K数"
	    )
	    min_threshold = Column(Float, default=0.70, comment="最低相似度阈值")

	    # --- Status and counters --------------------------------------------
	    status = Column(
	        String(20), default="pending",
	        comment="任务状态：pending/running/completed/failed",
	    )
	    source_row_count = Column(Integer, default=0, comment="源数据行数")
	    target_row_count = Column(Integer, default=0, comment="目标候选行数")
	    high_match_count = Column(
	        Integer, default=0, comment="高度匹配数量(score>=0.90)"
	    )
	    low_confidence_count = Column(
	        Integer, default=0, comment="低置信数量(score<0.70)"
	    )
	    reused_vectors = Column(
	        Integer, default=0, comment="通过text_hash复用的向量数"
	    )
	    new_vectors = Column(Integer, default=0, comment="新生成的向量数")

	    # --- Per-stage progress (0-100 according to the column comments) ----
	    progress_parse_source = Column(
	        Integer, default=0, comment="解析源数据集进度(0-100)"
	    )
	    progress_parse_target = Column(
	        Integer, default=0, comment="解析目标候选集进度(0-100)"
	    )
	    progress_vectorize = Column(
	        Integer, default=0, comment="向量化进度(0-100)"
	    )
	    progress_load_candidates = Column(
	        Integer, default=0, comment="加载候选范围进度(0-100)"
	    )
	    progress_similarity = Column(
	        Integer, default=0, comment="相似度计算进度(0-100)"
	    )
	    progress_rerank = Column(
	        Integer, default=0, comment="Reranker重排序进度(0-100)"
	    )
	    progress_save_results = Column(
	        Integer, default=0, comment="保存结果进度(0-100)"
	    )

	    # --- Audit timestamps (naive values produced by _now_beijing) -------
	    created_time = Column(
	        DateTime, default=_now_beijing, comment="创建时间"
	    )
	    updated_time = Column(
	        DateTime, default=_now_beijing, onupdate=_now_beijing,
	        comment="更新时间",
	    )

	    # --- Flags ----------------------------------------------------------
	    is_archived = Column(
	        Integer, default=0, comment="是否归档：0=未归档,1=已归档"
	    )
	    # NOTE(review): unlike is_delete on the other tables in this module,
	    # this column is nullable and not indexed — confirm that is intended.
	    is_delete = Column(
	        Integer, default=0, comment="是否删除：0=未删除,1=已删除"
	    )

	    # --- Relationships --------------------------------------------------
	    # Two foreign keys target vector_dataset, so each relationship names
	    # the column it follows explicitly.
	    source_dataset = relationship(
	        "VectorDataset", foreign_keys=[source_dataset_id]
	    )
	    target_dataset = relationship(
	        "VectorDataset", foreign_keys=[target_dataset_id]
	    )
	    results = relationship("MatchResult", back_populates="task")


	class VectorDataset(Base):
	    """ORM model mapped to the ``vector_dataset`` table.

	    Describes a dataset (uploaded file / sheet or logical dataset per the
	    table comment), its role and scope, and owns the ``rows`` collection of
	    ``VectorDataRow`` objects.
	    """

	    __tablename__ = "vector_dataset"
	    __table_args__ = {"comment": "向量数据集表（上传或逻辑数据集）"}

	    # --- Identity and ownership ----------------------------------------
	    id = Column(
	        Integer, primary_key=True, autoincrement=True, comment="主键ID"
	    )
	    task_id = Column(
	        Integer, ForeignKey("vector_match_task.id"), nullable=True,
	        comment="所属任务ID",
	    )

	    # --- Origin of the data ----------------------------------------------
	    name = Column(String(255), nullable=False, comment="数据集名称")
	    file_name = Column(String(255), nullable=True, comment="上传文件名")
	    sheet_name = Column(
	        String(100), nullable=True, comment="Excel工作表名"
	    )

	    # --- Classification --------------------------------------------------
	    dataset_role = Column(
	        String(20), nullable=False,
	        comment="数据集角色：source(源)/target(目标候选)",
	    )
	    data_scope = Column(
	        String(20), default="task",
	        comment="数据范围：task/history/standard",
	    )

	    # --- Content description ---------------------------------------------
	    # Stored as text; the column comment states the payload is JSON.
	    vector_fields = Column(
	        Text, nullable=True, comment="参与向量化的字段列表(JSON)"
	    )
	    row_count = Column(Integer, default=0, comment="数据行数")

	    # --- Soft delete and audit timestamps --------------------------------
	    is_delete = Column(
	        Integer, default=0, nullable=False, index=True,
	        comment="软删除标记：0=有效，1=已删除",
	    )
	    created_time = Column(
	        DateTime, default=_now_beijing, comment="创建时间"
	    )
	    updated_time = Column(
	        DateTime, default=_now_beijing, onupdate=_now_beijing,
	        comment="更新时间",
	    )

	    # --- Relationships ---------------------------------------------------
	    rows = relationship("VectorDataRow", back_populates="dataset")


	class VectorDataRow(Base):
	    """ORM model mapped to the ``vector_data_row`` table.

	    Represents a single row of a dataset: its concatenated raw text, a text
	    hash, the per-field values, and an optional one-to-one
	    ``VectorEmbedding``.
	    """

	    __tablename__ = "vector_data_row"
	    __table_args__ = {"comment": "向量数据行表（单行物料/申报项等）"}

	    # --- Identity and ownership ----------------------------------------
	    id = Column(
	        Integer, primary_key=True, autoincrement=True, comment="主键ID"
	    )
	    dataset_id = Column(
	        Integer, ForeignKey("vector_dataset.id"), nullable=False, index=True,
	        comment="所属数据集ID",
	    )
	    # NOTE(review): no ForeignKey is declared on task_id here, unlike the
	    # task_id columns of VectorDataset and MatchResult — confirm intended.
	    task_id = Column(
	        Integer, nullable=True, index=True, comment="所属任务ID"
	    )

	    # --- Classification --------------------------------------------------
	    dataset_role = Column(
	        String(20), nullable=False, index=True,
	        comment="数据集角色：source/target",
	    )
	    data_scope = Column(
	        String(20), default="task", index=True,
	        comment="数据范围：task/history/standard",
	    )

	    # --- Row payload -----------------------------------------------------
	    row_number = Column(Integer, nullable=False, comment="Excel中的行号")
	    raw_text = Column(Text, nullable=False, comment="拼接后的原始文本")
	    text_hash = Column(
	        String(64), nullable=True, index=True,
	        comment="文本SHA256哈希，用于向量复用",
	    )
	    field_values = Column(Text, nullable=True, comment="各字段值(JSON)")

	    # --- Soft delete and audit timestamps --------------------------------
	    is_delete = Column(
	        Integer, default=0, nullable=False, index=True,
	        comment="软删除标记：0=有效，1=已删除",
	    )
	    created_time = Column(
	        DateTime, default=_now_beijing, comment="创建时间"
	    )
	    updated_time = Column(
	        DateTime, default=_now_beijing, onupdate=_now_beijing,
	        comment="更新时间",
	    )

	    # --- Relationships ---------------------------------------------------
	    dataset = relationship("VectorDataset", back_populates="rows")
	    # uselist=False exposes the embedding as a scalar, not a list.
	    embedding = relationship(
	        "VectorEmbedding", back_populates="data_row", uselist=False
	    )


	class VectorEmbedding(Base):
	    """ORM model mapped to the ``vector_embedding`` table.

	    Stores the binary embedding for one ``VectorDataRow``; the unique
	    constraint on ``data_row_id`` limits each data row to one embedding.
	    """

	    __tablename__ = "vector_embedding"
	    __table_args__ = {"comment": "向量嵌入表（与数据行一对一）"}

	    # --- Identity and ownership ----------------------------------------
	    id = Column(
	        Integer, primary_key=True, autoincrement=True, comment="主键ID"
	    )
	    data_row_id = Column(
	        Integer, ForeignKey("vector_data_row.id"), unique=True,
	        nullable=False, index=True,
	        comment="关联 vector_data_row.id",
	    )

	    # --- Embedding payload -----------------------------------------------
	    text_hash = Column(
	        String(64), nullable=False, index=True,
	        comment="与行一致的文本哈希",
	    )
	    # Raw bytes; the column comment describes them as a float32 array.
	    embedding = Column(
	        LargeBinary(length=65536), nullable=False,
	        comment="float32 数组二进制存储",
	    )
	    model_name = Column(
	        String(100), nullable=True, comment="生成向量所用模型名"
	    )
	    dimension = Column(Integer, nullable=True, comment="向量维度")

	    # --- Soft delete and audit timestamps --------------------------------
	    is_delete = Column(
	        Integer, default=0, nullable=False, index=True,
	        comment="软删除标记：0=有效，1=已删除",
	    )
	    created_time = Column(
	        DateTime, default=_now_beijing, comment="创建时间"
	    )
	    updated_time = Column(
	        DateTime, default=_now_beijing, onupdate=_now_beijing,
	        comment="更新时间",
	    )

	    # --- Relationships ---------------------------------------------------
	    data_row = relationship("VectorDataRow", back_populates="embedding")


	class MatchResult(Base):
	    """ORM model mapped to the ``match_result`` table.

	    Links one source row to one candidate (target) row within a task and
	    records the similarity score, optional rerank score, rankings, match
	    level, and manual confirmation state.
	    """

	    __tablename__ = "match_result"
	    __table_args__ = {"comment": "匹配结果表（源行与候选的关联及得分）"}

	    # --- Identity and ownership ----------------------------------------
	    id = Column(
	        Integer, primary_key=True, autoincrement=True, comment="主键ID"
	    )
	    task_id = Column(
	        Integer, ForeignKey("vector_match_task.id"), nullable=False,
	        index=True,
	        comment="所属任务ID",
	    )
	    source_row_id = Column(
	        Integer, ForeignKey("vector_data_row.id"), nullable=False,
	        comment="源数据行ID",
	    )
	    target_row_id = Column(
	        Integer, ForeignKey("vector_data_row.id"), nullable=False,
	        comment="目标候选行ID",
	    )

	    # --- Scores and rankings ---------------------------------------------
	    similarity_score = Column(
	        Float, nullable=False, comment="余弦相似度分数(0-1)"
	    )
	    rerank_score = Column(
	        Float, nullable=True, comment="Reranker精排分数，越高越相关"
	    )
	    rank = Column(Integer, nullable=False, comment="排名(1=最相似)")
	    rerank_rank = Column(
	        Integer, nullable=True, comment="Reranker重排后的排名"
	    )

	    # --- Classification and review state ---------------------------------
	    candidate_scope = Column(
	        String(50), nullable=True, comment="候选来源范围"
	    )
	    match_level = Column(
	        String(20), nullable=True,
	        comment="匹配等级：high/possible/low_confidence/no_match",
	    )
	    is_confirmed = Column(
	        Integer, default=0,
	        comment="是否已人工确认：0=未确认,1=已确认,-1=已忽略",
	    )

	    # --- Soft delete and audit timestamps --------------------------------
	    is_delete = Column(
	        Integer, default=0, nullable=False, index=True,
	        comment="软删除标记：0=有效，1=已删除",
	    )
	    created_time = Column(
	        DateTime, default=_now_beijing, comment="创建时间"
	    )
	    updated_time = Column(
	        DateTime, default=_now_beijing, onupdate=_now_beijing,
	        comment="更新时间",
	    )

	    # --- Relationships ---------------------------------------------------
	    task = relationship("VectorMatchTask", back_populates="results")
	    # Two foreign keys target vector_data_row, so each relationship names
	    # the column it follows explicitly.
	    source_row = relationship(
	        "VectorDataRow", foreign_keys=[source_row_id]
	    )
	    target_row = relationship(
	        "VectorDataRow", foreign_keys=[target_row_id]
	    )