Spaces:

teryryy
/

vector-match-api

Running

File size: 2,320 Bytes

ba016aa

import os
import hashlib
import json
import openpyxl
from typing import List, Dict, Tuple


# Uploaded files live in "data/uploads" under the parent of the directory that
# contains this module; the directory is created at import time if missing.
_BASE_DIR = os.path.dirname(os.path.dirname(__file__))
UPLOAD_DIR = os.path.join(_BASE_DIR, "data", "uploads")
os.makedirs(UPLOAD_DIR, exist_ok=True)


def save_upload_file(file_bytes: bytes, filename: str) -> str:
    """Write uploaded bytes into UPLOAD_DIR and return the stored file's path.

    Only the final path component of ``filename`` is used, so a name such as
    ``"../../etc/passwd"`` or an absolute path cannot place the file outside
    UPLOAD_DIR. Plain file names behave exactly as before.

    Args:
        file_bytes: Raw content of the uploaded file.
        filename: Name supplied for the upload; directory parts are discarded.

    Returns:
        The path of the written file inside UPLOAD_DIR.

    Raises:
        ValueError: If ``filename`` has no usable final component
            (empty, ``"."`` or ``".."``).
    """
    # Normalise Windows separators first so basename() also strips
    # "dir\\name" style prefixes when running on POSIX.
    safe_name = os.path.basename(filename.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise ValueError(f"Invalid upload filename: {filename!r}")

    filepath = os.path.join(UPLOAD_DIR, safe_name)
    with open(filepath, "wb") as f:
        f.write(file_bytes)
    return filepath


# Header names that get_sheet_info leaves out of its "columns" listing
# (row-number / identifier style columns). Matching is exact and
# case-sensitive, hence the explicit spelling variants.
EXCLUDED_FIELDS = {
    "序号", "行号", "编号", "行", "#",
    "id", "ID", "Id",
    "no", "No", "NO",
}

def get_sheet_info(filepath: str) -> Dict:
    """Return the sheet names and first-row column headers of a workbook.

    Args:
        filepath: Path of the Excel workbook to inspect.

    Returns:
        A dict with three keys:

        * ``"sheet_names"``: the workbook's sheet names.
        * ``"all_columns"``: sheet name -> every header taken from the
          sheet's first row. A falsy header cell (e.g. empty) is replaced by
          ``"列<n>"`` where ``n`` is the 1-based column position.
        * ``"columns"``: sheet name -> the same headers minus those whose
          stripped text is in EXCLUDED_FIELDS.
    """
    wb = openpyxl.load_workbook(filepath, read_only=True)
    # A read-only workbook must be closed explicitly; try/finally guarantees
    # that even when reading one of the sheets raises.
    try:
        result = {"sheet_names": wb.sheetnames, "columns": {}, "all_columns": {}}
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            # An empty sheet yields no rows at all -> no headers.
            first_row = next(
                ws.iter_rows(min_row=1, max_row=1, values_only=True), ()
            )
            headers = [
                str(c) if c else f"列{i+1}" for i, c in enumerate(first_row)
            ]
            result["all_columns"][sheet_name] = headers
            result["columns"][sheet_name] = [
                h for h in headers if h.strip() not in EXCLUDED_FIELDS
            ]
        return result
    finally:
        wb.close()


def parse_excel_rows(
    filepath: str,
    sheet_name: str,
    vector_fields: List[str],
) -> List[Dict]:
    """Read a sheet's data rows and build the text record for each one.

    The first row is taken as the header row (falsy header cells become
    ``"列<n>"``, ``n`` being the 1-based column position). For every later
    row, the non-empty values of ``vector_fields`` are joined with a single
    space; rows whose joined text is blank are skipped.

    Args:
        filepath: Path of the Excel workbook.
        sheet_name: Name of the sheet to read.
        vector_fields: Header names whose values make up ``raw_text``, in
            the order given. Names not present in the sheet are ignored.

    Returns:
        One dict per kept row with the keys:

        * ``"row_number"``: 1-based position of the row in the sheet
          iteration (the header row is 1).
        * ``"raw_text"``: the space-joined field values.
        * ``"text_hash"``: SHA-256 hex digest of ``raw_text`` (UTF-8).
        * ``"field_values"``: JSON string of all header -> cell-text pairs
          of the row (``None`` cells become ``""``).

    Raises:
        KeyError: Propagated from the workbook lookup when ``sheet_name``
            does not exist (as raised by openpyxl).
    """
    wb = openpyxl.load_workbook(filepath, read_only=True)
    # A read-only workbook must be closed explicitly; try/finally guarantees
    # that even when the sheet lookup or row parsing raises.
    try:
        ws = wb[sheet_name]
        rows_data = []
        headers = []
        for row_idx, row in enumerate(ws.iter_rows(values_only=True)):
            if row_idx == 0:
                headers = [str(c) if c else f"列{i+1}" for i, c in enumerate(row)]
                continue

            # zip() stops at the shorter side, so cells beyond the last
            # header are ignored; on duplicate headers the later cell wins.
            row_dict = {
                header: "" if val is None else str(val)
                for header, val in zip(headers, row)
            }

            text_parts = [
                row_dict[field] for field in vector_fields if row_dict.get(field)
            ]
            raw_text = " ".join(text_parts)
            if not raw_text.strip():
                continue

            rows_data.append({
                "row_number": row_idx + 1,
                "raw_text": raw_text,
                "text_hash": hashlib.sha256(raw_text.encode("utf-8")).hexdigest(),
                "field_values": json.dumps(row_dict, ensure_ascii=False),
            })
        return rows_data
    finally:
        wb.close()