File size: 2,320 Bytes
ba016aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import os
import hashlib
import json
import openpyxl
from typing import List, Dict, Tuple


# Directory that receives uploaded files:
# <parent of this file's directory>/data/uploads.
UPLOAD_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data", "uploads")
# Module-level side effect: create the directory (and any missing parents) at
# import time; exist_ok=True makes this a no-op when it already exists.
os.makedirs(UPLOAD_DIR, exist_ok=True)


def save_upload_file(file_bytes: bytes, filename: str) -> str:
    """Write uploaded bytes into ``UPLOAD_DIR`` and return the stored path.

    Only the final path component of ``filename`` is used, so a name that
    carries directory parts (``../x``, ``/etc/x``, ``a\\b``) cannot place the
    file outside ``UPLOAD_DIR``. An existing file with the same name is
    overwritten.

    Args:
        file_bytes: Raw content of the uploaded file.
        filename: Client-supplied file name.

    Returns:
        The path of the file that was written.

    Raises:
        ValueError: If ``filename`` has no usable final component
            (empty, ``.`` or ``..``).
        OSError: If the file cannot be opened or written.
    """
    # Treat backslashes as separators too, so Windows-style paths are reduced
    # to their last component even when running on POSIX.
    safe_name = os.path.basename(filename.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise ValueError(f"invalid upload filename: {filename!r}")

    filepath = os.path.join(UPLOAD_DIR, safe_name)
    with open(filepath, "wb") as f:
        f.write(file_bytes)
    return filepath


# Header names that get_sheet_info omits from its "columns" lists: Chinese and
# English labels for serial-number / row-number / ID style columns.
# Matching is exact and case-sensitive against the stripped header text.
EXCLUDED_FIELDS = {"序号", "行号", "编号", "id", "ID", "Id", "no", "No", "NO", "行", "#"}

def get_sheet_info(filepath: str) -> Dict:
    """Return the sheet names and header columns of an Excel workbook.

    The first row of every sheet is taken as its header row. A header cell
    whose value is falsy (``None``, empty string, ``0``) is replaced by the
    placeholder ``列<n>``, where ``n`` is the 1-based column position.

    Args:
        filepath: Path of the workbook to inspect.

    Returns:
        A dict with three keys:
        ``sheet_names`` - list of sheet names in the workbook;
        ``all_columns`` - mapping of sheet name to every header string;
        ``columns`` - mapping of sheet name to the headers whose stripped
        text is not in ``EXCLUDED_FIELDS``.
    """
    wb = openpyxl.load_workbook(filepath, read_only=True)
    # try/finally: the workbook is explicitly closed even when reading a
    # sheet raises, instead of only on the success path.
    try:
        result = {"sheet_names": wb.sheetnames, "columns": {}, "all_columns": {}}
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            # An empty sheet yields no rows at all; fall back to no headers.
            first_row = next(
                ws.iter_rows(min_row=1, max_row=1, values_only=True), ()
            )
            headers = [
                str(c) if c else f"列{i+1}" for i, c in enumerate(first_row)
            ]
            result["all_columns"][sheet_name] = headers
            result["columns"][sheet_name] = [
                h for h in headers if h.strip() not in EXCLUDED_FIELDS
            ]
    finally:
        wb.close()
    return result


def parse_excel_rows(
    filepath: str,
    sheet_name: str,
    vector_fields: List[str],
) -> List[Dict]:
    """Read the data rows of one sheet and build a text record for each.

    The first row is the header row; falsy header cells become ``列<n>``
    (1-based column position). Every later row is converted to a
    ``header -> str`` mapping (``None`` cells become ``""``). The non-empty
    values of ``vector_fields`` are joined with single spaces, in the order
    given, to form the row's text. Rows whose text is empty or whitespace-only
    are skipped.

    Args:
        filepath: Path of the workbook to read.
        sheet_name: Name of the sheet to parse.
        vector_fields: Header names whose values make up the row text.

    Returns:
        One dict per kept row with the keys ``row_number`` (1-based sheet row,
        the header being row 1), ``raw_text``, ``text_hash`` (SHA-256 hex
        digest of the UTF-8 encoded ``raw_text``) and ``field_values`` (JSON
        string of the full row mapping).

    Raises:
        KeyError: If ``sheet_name`` is not a sheet of the workbook.
    """
    wb = openpyxl.load_workbook(filepath, read_only=True)
    # try/finally: close the workbook even if the sheet lookup or a row
    # conversion raises, instead of only on the success path.
    try:
        ws = wb[sheet_name]
        rows_data = []
        headers = []
        for row_idx, row in enumerate(ws.iter_rows(values_only=True)):
            if row_idx == 0:
                headers = [str(c) if c else f"列{i+1}" for i, c in enumerate(row)]
                continue

            # zip stops at the shorter side, so cells beyond the header row's
            # width are ignored.
            row_dict = {
                header: str(val) if val is not None else ""
                for header, val in zip(headers, row)
            }

            text_parts = [
                row_dict[field] for field in vector_fields if row_dict.get(field)
            ]
            raw_text = " ".join(text_parts)
            if not raw_text.strip():
                continue

            text_hash = hashlib.sha256(raw_text.encode("utf-8")).hexdigest()
            rows_data.append({
                "row_number": row_idx + 1,
                "raw_text": raw_text,
                "text_hash": text_hash,
                "field_values": json.dumps(row_dict, ensure_ascii=False),
            })
    finally:
        wb.close()
    return rows_data