|
|
from typing import List, Optional

import pandas as pd

from bio_requests.rag_request import RagRequest
from dto.bio_document import BaseBioDocument
from utils.bio_logger import bio_logger as logger
|
|
|
|
|
|
|
|
# Load the journal table once at import time; only the columns needed for the
# ISSN -> 5-year impact factor lookup are kept.
df = pd.read_excel("config/2023JCR(完整).xlsx")

# Take an explicit copy of the column subset so the assignment below writes to
# an independent frame instead of a slice of the original (this avoids pandas'
# SettingWithCopyWarning and any ambiguity about which object is modified).
df = df[["ISSN", "5年IF", "EISSN"]].copy()

# Values that cannot be parsed as numbers become NaN and are then replaced by
# 0.01, so every row ends up with a numeric "5年IF".
df["5年IF"] = pd.to_numeric(df["5年IF"], errors="coerce").fillna(0.01)
|
|
|
|
|
|
|
|
class RerankService:
    """Order retrieved documents by the 5-year impact factor of their journal.

    Impact factors are read from the module-level ``df`` table. A document's
    ISSN is matched against the "ISSN" column first and, when that finds
    nothing, against the "EISSN" column.
    """

    def __init__(self):
        # Still exposed as an attribute, exactly as before.
        self.df = df
        # Build the lookups once so rerank() performs a dictionary access per
        # document instead of scanning the whole table (twice) per document.
        # NOTE: these are snapshots of ``self.df`` taken at construction time.
        self._if_by_issn = self._build_lookup("ISSN")
        self._if_by_eissn = self._build_lookup("EISSN")

    def _build_lookup(self, key_column):
        """Return a dict mapping ``key_column`` values to their "5年IF" value.

        Rows with a missing key are skipped (a missing key can never equal a
        document's ISSN), and for duplicated keys the first row wins, which
        mirrors taking element ``[0]`` of a filtered column.
        """
        rows = self.df.dropna(subset=[key_column]).drop_duplicates(
            subset=[key_column], keep="first"
        )
        return dict(zip(rows[key_column].to_numpy(), rows["5年IF"].to_numpy()))

    def _lookup_impact_factor(self, document):
        """Return the impact factor for ``document``, or ``None`` if unknown."""
        try:
            issn = document.journal["issn"]
        except (AttributeError, KeyError, TypeError):
            # No usable journal metadata: treat the score as unknown rather
            # than aborting the rerank of every other document.
            return None

        score = self._if_by_issn.get(issn)
        if score is None:
            # Fall back to the electronic ISSN column.
            score = self._if_by_eissn.get(issn)
        return score

    async def rerank(
        self,
        rag_request: RagRequest,
        documents: Optional[List[BaseBioDocument]] = None,
    ) -> List[BaseBioDocument]:
        """Return ``documents`` de-duplicated and sorted by impact factor.

        Args:
            rag_request: The request being served. Reranking only happens when
                its ``data_source`` is non-empty and contains ``"pubmed"``.
            documents: Documents to rerank. Defaults to an empty list; a fresh
                list is created per call so no state is shared between calls.

        Returns:
            The input list unchanged when reranking is skipped. Otherwise a
            new list with one document per ``bio_id``, sorted by ``if_score``
            in descending order; documents whose score is ``None`` are sorted
            as if their score were 0.01.

        Side effects:
            Sets ``if_score`` on every input document when reranking runs.
        """
        if documents is None:
            documents = []

        if not rag_request.data_source or "pubmed" not in rag_request.data_source:
            logger.info("RerankService: data_source is not pubmed, skip rerank")
            return documents
        logger.info("RerankService: start rerank")

        for document in documents:
            document.if_score = self._lookup_impact_factor(document)

        # De-duplicate by bio_id: the last document seen for an id is kept, at
        # the position where that id first appeared.
        documents = list({doc.bio_id: doc for doc in documents}.values())

        return sorted(
            documents,
            key=lambda doc: doc.if_score if doc.if_score is not None else 0.01,
            reverse=True,
        )
|
|
|