import os
import hashlib
import json
import openpyxl
from typing import List, Dict, Tuple

# Uploaded workbooks are stored under <parent of this file's directory>/data/uploads.
UPLOAD_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data", "uploads")
os.makedirs(UPLOAD_DIR, exist_ok=True)


def save_upload_file(file_bytes: bytes, filename: str) -> str:
    """Write uploaded bytes into ``UPLOAD_DIR`` and return the saved path.

    Only the final path component of *filename* is used, so a client-supplied
    name such as ``"../../etc/passwd"`` or an absolute path cannot place the
    file outside ``UPLOAD_DIR``.

    Args:
        file_bytes: Raw content of the uploaded file.
        filename: Client-supplied file name; any directory part is discarded.

    Returns:
        The path of the written file inside ``UPLOAD_DIR``.

    Raises:
        ValueError: If *filename* has no usable final component
            (empty, ``"."`` or ``".."``).
    """
    # Normalise Windows separators first so "..\\..\\x" is also reduced to "x"
    # when running on POSIX, where a backslash is an ordinary character.
    safe_name = os.path.basename(filename.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise ValueError(f"invalid upload filename: {filename!r}")

    filepath = os.path.join(UPLOAD_DIR, safe_name)
    with open(filepath, "wb") as f:
        f.write(file_bytes)
    return filepath


# Header names (index-like columns) that are left out of the "columns" list
# returned by get_sheet_info.
EXCLUDED_FIELDS = {"序号", "行号", "编号", "id", "ID", "Id", "no", "No", "NO", "行", "#"}


def _header_names(row) -> List[str]:
    """Turn a header row into column names, filling blanks with ``列<n>``.

    Only truly empty cells (``None`` or an empty string) get a placeholder;
    falsy-but-real headers such as ``0`` are kept as ``"0"``.
    """
    return [
        str(cell) if cell is not None and str(cell) != "" else f"列{i+1}"
        for i, cell in enumerate(row)
    ]


def get_sheet_info(filepath: str) -> Dict:
    """Return the sheet names and header columns of an Excel workbook.

    Args:
        filepath: Path of the workbook to inspect.

    Returns:
        A dict with three keys:
        ``"sheet_names"`` - list of sheet names;
        ``"all_columns"`` - sheet name -> every header of the first row;
        ``"columns"`` - sheet name -> headers whose stripped text is not in
        ``EXCLUDED_FIELDS``.
    """
    wb = openpyxl.load_workbook(filepath, read_only=True)
    try:
        result = {"sheet_names": wb.sheetnames, "columns": {}, "all_columns": {}}
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            # A sheet without any rows simply yields no headers.
            first_row = next(
                iter(ws.iter_rows(min_row=1, max_row=1, values_only=True)), ()
            )
            headers = _header_names(first_row)
            result["all_columns"][sheet_name] = headers
            result["columns"][sheet_name] = [
                h for h in headers if h.strip() not in EXCLUDED_FIELDS
            ]
    finally:
        # Close even when reading fails so the file handle opened by
        # load_workbook is not left open.
        wb.close()
    return result


def parse_excel_rows(
    filepath: str,
    sheet_name: str,
    vector_fields: List[str],
) -> List[Dict]:
    """Read the data rows of one sheet and build text records from them.

    The first row is treated as the header. For every following row the values
    of *vector_fields* (in the given order, skipping empty ones) are joined
    with a single space; rows whose joined text is blank are skipped.

    Args:
        filepath: Path of the workbook to read.
        sheet_name: Name of the sheet to parse.
        vector_fields: Header names whose values make up ``raw_text``.

    Returns:
        One dict per kept row with the keys ``"row_number"`` (1-based row
        position in the sheet, header included), ``"raw_text"``,
        ``"text_hash"`` (SHA-256 hex digest of the UTF-8 encoded text) and
        ``"field_values"`` (JSON string of all header -> value pairs).

    Raises:
        KeyError: If *sheet_name* does not exist in the workbook.
    """
    wb = openpyxl.load_workbook(filepath, read_only=True)
    try:
        ws = wb[sheet_name]
        rows_data = []
        headers: List[str] = []
        for row_idx, row in enumerate(ws.iter_rows(values_only=True)):
            if row_idx == 0:
                headers = _header_names(row)
                continue

            # zip() stops at the shorter side, so cells beyond the header
            # width are ignored.
            row_dict = {
                header: str(val) if val is not None else ""
                for header, val in zip(headers, row)
            }

            text_parts = [
                row_dict[field] for field in vector_fields if row_dict.get(field)
            ]
            raw_text = " ".join(text_parts)
            if not raw_text.strip():
                continue

            text_hash = hashlib.sha256(raw_text.encode("utf-8")).hexdigest()
            rows_data.append({
                "row_number": row_idx + 1,
                "raw_text": raw_text,
                "text_hash": text_hash,
                "field_values": json.dumps(row_dict, ensure_ascii=False),
            })
    finally:
        # Close on every exit path, including a missing sheet.
        wb.close()
    return rows_data