# Source: Hugging Face Space "Orias171/doc-ai-api" (status: Sleeping)
# File size: 8,496 Bytes; revision dbe2c62

from collections import Counter
from statistics import mean, multimode

# ===============================
# MAIN FUNCTION
# ===============================
def mergeLinesToParagraphs(baseJson):
    """
    Group consecutive line records into paragraph records.

    Reads ``baseJson["general"]`` and ``baseJson["lines"]``. Lines are
    visited in order; a line joins the currently open group when
    ``canMerge`` accepts it against the line immediately before it,
    otherwise the open group is closed and the line opens a new one.
    Every closed group is converted by ``buildParagraph`` and numbered
    starting from 1.

    Once all paragraphs exist, ``recomputeCommonsInGeneralAfterMerge``
    refreshes the "common*" entries of ``general``. That helper updates
    the ``general`` dict in place, so the caller's
    ``baseJson["general"]`` object is modified as well.

    Returns a new dict with the keys "general" and "paragraphs".
    """
    general = baseJson["general"]
    lines = baseJson["lines"]

    paragraphs = []
    pending = []

    for index, line in enumerate(lines):
        # A group is only closed when one is open and the current line
        # is rejected relative to its direct predecessor.
        if pending and not canMerge(lines[index - 1], line, index - 1, index):
            paragraphs.append(buildParagraph(pending, len(paragraphs) + 1, general))
            pending = []
        pending.append(line)

    # Close whatever group is still open after the last line.
    if pending:
        paragraphs.append(buildParagraph(pending, len(paragraphs) + 1, general))

    # The helper mutates `general` in place; its return value is not needed.
    recomputeCommonsInGeneralAfterMerge({"general": general, "paragraphs": paragraphs})

    return {"general": general, "paragraphs": paragraphs}



# ===============================
# MERGE CONDITION FUNCTIONS
# ===============================

def canMerge(prev, curr, idx_prev=None, idx_curr=None):
    """
    Decide whether line ``curr`` may be merged into the paragraph that
    currently ends with line ``prev``.

    The checks run in this order and the first decisive one wins:

    1. ``curr`` carries a marker (``isNewPara``)          -> False
    2. font sizes differ too much (``isSameFontSize``)     -> False
    3. no style combination matches (``isSameStyle``)      -> False
    4. the lines are not close enough (``isNear``)         -> False
    5. both lines have the same alignment                  -> True
    6. the alignment change is a rejected one (``isBadAlign``) -> False
    7. ``prev``'s alignment allows a merge
       (``canMergeWithAlign`` / ``canMergeWithLeft``)      -> True
    8. anything else                                       -> False (logged)

    Args:
        prev: line dict preceding ``curr``.
        curr: candidate line dict.
        idx_prev: optional 0-based index of ``prev``, used only for the
            log label.
        idx_curr: optional 0-based index of ``curr``, used only for the
            log label.

    Returns:
        bool: True when the lines belong to the same paragraph.

    Only the final fallback rejection is printed.
    """
    if isNewPara(curr):
        return False

    if not isSameFontSize(prev, curr):
        return False

    if not isSameStyle(prev, curr):
        return False

    if not isNear(prev, curr):
        return False

    if isSameAlign(prev, curr):
        return True

    if isBadAlign(prev, curr):
        return False

    if canMergeWithAlign(prev) or canMergeWithLeft(prev, curr):
        return True

    # Build the 1-based "[prev->curr]" label only here, where it is used,
    # and only when both indices were supplied (a lone idx_prev used to
    # raise TypeError on `None + 1`).
    if idx_prev is not None and idx_curr is not None:
        pair = f"[{idx_prev+1}->{idx_curr+1}]"
    else:
        pair = ""

    print(f"{pair} Merge=False | Reason: Fallback")
    return False


# Check MarkerText
def isNewPara(line):
    """Return True when the line has a non-blank "MarkerText".

    A missing key, ``None``, the empty string and a single space all
    count as "no marker".
    """
    blank_markers = (None, "", " ")
    marker = line.get("MarkerText")
    return marker not in blank_markers

# Check FontSize
def isSameFontSize(prev, curr):
    """Return True when the two "FontSize" values differ by at most 0.7."""
    gap = prev["FontSize"] - curr["FontSize"]
    return -0.7 <= gap <= 0.7


# Check Style
def isSameStyle(prev, curr):
    """Return True when any of the four full-"Style" comparisons matches.

    The comparisons are tried in order (line/line, line/first word,
    last word/line, last word/first word) and evaluation stops at the
    first match.
    """
    checks = (isSameLineStyle, isSameFirstStyle, isSameLastStyle, isSameWordStyle)
    return any(check(prev, curr) for check in checks)

def isSameFStyle(prev, curr):
    """Return True when any of the four ``*FStyle`` comparisons matches.

    The comparisons are tried in order (line/line, line/first word,
    last word/line, last word/first word) and evaluation stops at the
    first match.
    """
    checks = (isSameLineFStyle, isSameFirstFStyle, isSameLastFStyle, isSameWordFStyle)
    return any(check(prev, curr) for check in checks)

def isSameCase(prev, curr):
    """Return True when any of the four ``*Case`` comparisons matches.

    The comparisons are tried in order (line/line, line/first word,
    last word/line, last word/first word) and evaluation stops at the
    first match.
    """
    checks = (isSameLineCase, isSameFirstCase, isSameLastCase, isSameWordCase)
    return any(check(prev, curr) for check in checks)

# Line - Line
def isSameLineStyle(prev, curr):
    """Return True when both lines have exactly the same "Style" value."""
    style_before = prev["Style"]
    style_after = curr["Style"]
    return style_before == style_after

def isSameLineFStyle(prev, curr):
    """Return True when both lines' "Style" values agree modulo 1000.

    ``Style % 1000`` keeps the last three decimal digits, i.e. the part
    the ``mergeStyle`` docstring calls FontStyle.
    """
    font_before = prev["Style"] % 1000
    font_after = curr["Style"] % 1000
    return font_before == font_after

def isSameLineCase(prev, curr):
    """Return True when both lines share the same thousands part of "Style".

    The ``mergeStyle`` docstring describes Style as
    ``CaseStyle*1000 + FontStyle``, so the case part is obtained with
    floor division. True division (``/``) would keep the remainder as a
    fraction and reduce this check to plain Style equality.
    """
    return prev["Style"] // 1000 == curr["Style"] // 1000

# First - Line
def isSameFirstStyle(prev, curr):
    """Return True when ``prev``'s line "Style" equals the "Style" of
    ``curr``'s first word (``curr["Words"]["First"]``)."""
    line_style = prev["Style"]
    first_word_style = curr["Words"]["First"]["Style"]
    return line_style == first_word_style

def isSameFirstFStyle(prev, curr):
    """Return True when ``prev``'s line "Style" and the "Style" of
    ``curr``'s first word agree modulo 1000."""
    line_font = prev["Style"] % 1000
    first_word_font = curr["Words"]["First"]["Style"] % 1000
    return line_font == first_word_font

def isSameFirstCase(prev, curr):
    """Return True when ``prev``'s line "Style" and the "Style" of
    ``curr``'s first word share the same thousands part.

    Floor division isolates the thousands part; true division (``/``)
    would make this identical to a full Style equality check.
    """
    return prev["Style"] // 1000 == curr["Words"]["First"]["Style"] // 1000

# Last - Line
def isSameLastStyle(prev, curr):
    """Return True when the "Style" of ``prev``'s last word
    (``prev["Words"]["Last"]``) equals ``curr``'s line "Style"."""
    last_word_style = prev["Words"]["Last"]["Style"]
    line_style = curr["Style"]
    return last_word_style == line_style

def isSameLastFStyle(prev, curr):
    """Return True when the "Style" of ``prev``'s last word and
    ``curr``'s line "Style" agree modulo 1000."""
    last_word_font = prev["Words"]["Last"]["Style"] % 1000
    line_font = curr["Style"] % 1000
    return last_word_font == line_font

def isSameLastCase(prev, curr):
    """Return True when the "Style" of ``prev``'s last word and
    ``curr``'s line "Style" share the same thousands part.

    Floor division isolates the thousands part; true division (``/``)
    would make this identical to a full Style equality check.
    """
    return prev["Words"]["Last"]["Style"] // 1000 == curr["Style"] // 1000

# Last - First
def isSameWordStyle(prev, curr):
    """Return True when the "Style" of ``prev``'s last word equals the
    "Style" of ``curr``'s first word."""
    last_word_style = prev["Words"]["Last"]["Style"]
    first_word_style = curr["Words"]["First"]["Style"]
    return last_word_style == first_word_style

def isSameWordFStyle(prev, curr):
    """Return True when the "Style" of ``prev``'s last word and the
    "Style" of ``curr``'s first word agree modulo 1000."""
    last_word_font = prev["Words"]["Last"]["Style"] % 1000
    first_word_font = curr["Words"]["First"]["Style"] % 1000
    return last_word_font == first_word_font

def isSameWordCase(prev, curr):
    """Return True when the "Style" of ``prev``'s last word and the
    "Style" of ``curr``'s first word share the same thousands part.

    Floor division isolates the thousands part; true division (``/``)
    would make this identical to a full Style equality check.
    """
    return prev["Words"]["Last"]["Style"] // 1000 == curr["Words"]["First"]["Style"] // 1000


# Linespace
def isNear(prev, curr):
    """Return True when ``curr`` is close enough to ``prev`` to merge.

    Returns False when either line has no "Position" entry or ``curr``
    has no "LineHeight". Otherwise all three conditions must hold:

    * ``curr`` Top  <  2 x ``prev`` Top
    * ``curr`` Top  <  2 x ``curr`` Bot,  or ``curr`` Bot <= 3.0
    * ``curr`` Top  <  5 x ``curr`` LineHeight

    NOTE(review): "Top"/"Bot" look like spacing measurements rather than
    absolute coordinates - confirm against the extractor.
    """
    has_positions = "Position" in prev and "Position" in curr
    if not has_positions or "LineHeight" not in curr:
        return False

    # Read every value up front so a malformed record fails the same way
    # regardless of which condition would have decided the result.
    line_height = curr["LineHeight"]
    prev_top = prev["Position"]["Top"]
    gap_above = curr["Position"]["Top"]
    gap_below = curr["Position"]["Bot"]

    if not (gap_above < prev_top * 2):
        return False
    if not (gap_above < gap_below * 2) and not (gap_below <= 3.0):
        return False
    return gap_above < line_height * 5


def isSameAlign(prev, curr):
    """Return True when both lines have the same "Align" value.

    A missing key reads as ``None``, so two lines without "Align" also
    compare equal.
    """
    align_before = prev.get("Align")
    align_after = curr.get("Align")
    return align_before == align_after

def isBadAlign(prev, curr):
    """Return True when ``curr`` is right-aligned but ``prev`` is not.

    The comparison ignores letter case. The other alignment checks in
    this module use capitalised values ("Right", "Left", "Justify",
    "Center"), so an exact match on lowercase "right" would never fire
    for such data. Lowercase input keeps working as before.
    """
    def _normalized(line):
        # Non-string values (e.g. a missing key -> None) are compared as-is.
        value = line.get("Align")
        return value.lower() if isinstance(value, str) else value

    return _normalized(prev) != "right" and _normalized(curr) == "right"

def isNoSameAlign0(prev):
    """Return True when ``prev`` has "Align" equal to "Justify"."""
    alignment = prev.get("Align")
    return alignment == "Justify"

def isNoSameAlignC(prev):
    """Return True when ``prev`` has "Align" equal to "Center"."""
    alignment = prev.get("Align")
    return alignment == "Center"

def isNoSameAlignR(prev):
    """Return True when ``prev`` has "Align" equal to "Right"."""
    alignment = prev.get("Align")
    return alignment == "Right"

def isNoSameAlignL(prev, curr):
    """Return True when ``prev`` is "Left"-aligned and ``curr`` is
    "Justify"-aligned."""
    if prev.get("Align") != "Left":
        return False
    return curr.get("Align") == "Justify"

def canMergeWithAlign(prev):
    """Return True when ``prev``'s alignment alone permits a merge.

    That is the case when ``prev`` is "Justify", "Center" or "Right"
    aligned, checked in that order.
    """
    if isNoSameAlign0(prev):
        return True
    if isNoSameAlignC(prev):
        return True
    return isNoSameAlignR(prev)

def canMergeWithLeft(prev, curr):
    """Return True for the Left -> Justify alignment transition.

    Thin wrapper around ``isNoSameAlignL``.
    """
    left_then_justify = isNoSameAlignL(prev, curr)
    return left_then_justify


# ===============================
# PARAGRAPH BUILDER
# ===============================

def buildParagraph(lines, para_id, general=None):
    """
    Build one paragraph dict from a non-empty list of merged line dicts.

    Args:
        lines: line dicts making up the paragraph, in reading order.
            Each must provide "Text" and "Style"; "FontSize",
            "MarkerText", "MarkerType" and "Align" are read with
            ``.get`` so a missing key yields ``None`` instead of
            raising, matching how the merge predicates read them.
        para_id: value stored under "Paragraph".
        general: optional document-level dict; its "commonFontSize" is
            used as a tie-breaker between equally frequent font sizes.

    Returns:
        dict with the keys "Paragraph", "Text", "MarkerText",
        "MarkerType", "Style", "FontSize" and "Align".

    Field rules:
        * Text: line texts joined with single spaces.
        * MarkerText / MarkerType: taken from the first line.
        * Style: digit-wise minimum over all line styles (``mergeStyle``).
        * FontSize: the single most frequent size; on a tie, the mode
          closest to ``general["commonFontSize"]`` when available, else
          the mean of all sizes; rounded to one decimal. 12.0 when no
          line has a font size.
        * Align: the most common alignment, falling back to the last
          line's alignment when that is falsy.
    """
    first_line = lines[0]

    text = " ".join(ln["Text"] for ln in lines)
    marker_text = first_line.get("MarkerText")
    marker_type = first_line.get("MarkerType")

    # Style: digit-wise minimum across all lines.
    style = mergeStyle([ln["Style"] for ln in lines])

    fs_values = [ln["FontSize"] for ln in lines if ln.get("FontSize") is not None]

    if fs_values:
        modes = multimode(fs_values)  # every value tied for most frequent
        if len(modes) == 1:
            font_size = modes[0]
        elif general and general.get("commonFontSize") is not None:
            # Several modes: pick the one closest to the document-wide size.
            target = general["commonFontSize"]
            font_size = min(modes, key=lambda x: abs(x - target))
        else:
            font_size = mean(fs_values)
        font_size = round(font_size, 1)
    else:
        font_size = 12.0

    align = mostCommon([ln.get("Align") for ln in lines]) or lines[-1].get("Align")

    return {
        "Paragraph": para_id,
        "Text": text,
        "MarkerText": marker_text,
        "MarkerType": marker_type,
        "Style": style,
        "FontSize": font_size,
        "Align": align,
    }


# ===============================
# HELPERS
# ===============================

def mergeStyle(styles):
    """
    Combine several style codes by taking the digit-wise minimum.

    styles: list of numbers of up to 4 digits
            (CaseStyle*1000 + FontStyle). Each one is zero-padded to
            4 characters, then for each of the first four positions the
            smallest digit across all styles is kept.

    Returns the resulting digits as an int.
    """
    padded = [str(value).zfill(4) for value in styles]

    merged = 0
    for position in range(4):
        lowest = min(int(text[position]) for text in padded)
        merged = merged * 10 + lowest
    return merged


def mostCommon(values):
    """Return the most frequent item of ``values``, or None when empty.

    On a tie the item that appears first in ``values`` wins.
    """
    if values:
        ranked = Counter(values).most_common(1)
        if ranked:
            winner, _count = ranked[0]
            return winner
    return None


# ===============================
# RESOLVE COMMONS
# ===============================

def recomputeCommonsInGeneralAfterMerge(mergedJson):
    """
    Refresh the "common*" entries of mergedJson['general'] from the
    paragraph list, in place, and return ``mergedJson``.

    Updated fields:
      - commonFontSize: the most frequent paragraph font size
        (None when no paragraph has one)
      - commonFontSizes: [{FontSize, Count}, ...] ordered by Count,
        highest first
      - commonMarkers: marker types among the 10 most frequent whose
        count reaches max(1, int(0.5% of the paragraph count))

    Paragraphs with a ``None`` font size or a falsy marker type are
    ignored for the respective statistic.
    """
    paragraphs = mergedJson.get("paragraphs", [])
    total = len(paragraphs)

    # --- Tally font sizes and marker types ---
    size_tally = Counter(
        para["FontSize"] for para in paragraphs if para.get("FontSize") is not None
    )
    marker_tally = Counter(
        para["MarkerType"] for para in paragraphs if para.get("MarkerType")
    )

    # --- Font sizes, most frequent first ---
    ranked_sizes = []
    for size, count in size_tally.most_common():
        ranked_sizes.append({"FontSize": round(size, 1), "Count": count})
    top_size = ranked_sizes[0]["FontSize"] if ranked_sizes else None

    # --- Markers that are frequent enough ---
    min_count = max(1, int(total * 0.005))
    frequent_markers = []
    for marker, count in marker_tally.most_common(10):
        if count >= min_count:
            frequent_markers.append(marker)

    # --- Overwrite the entries in general ---
    general = mergedJson["general"]
    general.update({
        "commonFontSize": top_size,
        "commonFontSizes": ranked_sizes,
        "commonMarkers": frequent_markers,
    })
    return mergedJson