vector-match-api / services /excel_service.py
teryryy's picture
Upload 13 files
ba016aa verified
import os
import hashlib
import json
import openpyxl
from typing import List, Dict, Tuple
# Uploads are stored in "data/uploads" under the parent of the directory that
# contains this module.
_BASE_DIR = os.path.dirname(os.path.dirname(__file__))
UPLOAD_DIR = os.path.join(_BASE_DIR, "data", "uploads")

# Created at import time so the directory exists before any file is saved.
os.makedirs(UPLOAD_DIR, exist_ok=True)
def save_upload_file(file_bytes: bytes, filename: str) -> str:
    """Write uploaded bytes into ``UPLOAD_DIR`` and return the saved path.

    Only the final path component of ``filename`` is used. A name such as
    ``"../../x"`` or an absolute path therefore cannot place the file outside
    ``UPLOAD_DIR``. An existing file with the same name is overwritten.

    Args:
        file_bytes: Raw content to write.
        filename: Name to store the file under (presumably supplied by the
            uploading client, so it is treated as untrusted).

    Returns:
        The path of the written file inside ``UPLOAD_DIR``.

    Raises:
        ValueError: If ``filename`` has no usable final component
            (empty, ``"."`` or ``".."``).
    """
    # Drop any directory part so the join below cannot escape UPLOAD_DIR.
    safe_name = os.path.basename(filename)
    if safe_name in ("", ".", ".."):
        raise ValueError(f"Invalid upload filename: {filename!r}")

    filepath = os.path.join(UPLOAD_DIR, safe_name)
    with open(filepath, "wb") as f:
        f.write(file_bytes)
    return filepath
# Header names treated as row identifiers/counters rather than content.
# get_sheet_info omits them from its "columns" list.
EXCLUDED_FIELDS = {
    # Chinese labels: serial number, row number, code/number, row
    "序号", "行号", "编号", "行",
    # "id" spellings
    "id", "ID", "Id",
    # "no" spellings
    "no", "No", "NO",
    "#",
}
def get_sheet_info(filepath: str) -> Dict:
    """Return the sheet names and first-row column headers of a workbook.

    Args:
        filepath: Path of the workbook to open with ``openpyxl``.

    Returns:
        A dict with three keys:

        * ``"sheet_names"``: the workbook's sheet names.
        * ``"all_columns"``: sheet name -> every header from the first row.
          A falsy cell (e.g. empty) is named ``列<n>`` where ``n`` is its
          1-based position.
        * ``"columns"``: sheet name -> the same headers minus those whose
          stripped text is in ``EXCLUDED_FIELDS``.
    """
    wb = openpyxl.load_workbook(filepath, read_only=True)
    # A read-only workbook must be closed explicitly; do it in ``finally`` so
    # the file handle is released even if reading a sheet raises.
    try:
        result = {"sheet_names": wb.sheetnames, "columns": {}, "all_columns": {}}
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            # Only the first row is needed; a sheet with no rows gives no headers.
            first_row = next(
                ws.iter_rows(min_row=1, max_row=1, values_only=True), ()
            )
            headers = [
                str(cell) if cell else f"列{idx + 1}"
                for idx, cell in enumerate(first_row)
            ]
            result["all_columns"][sheet_name] = headers
            result["columns"][sheet_name] = [
                h for h in headers if h.strip() not in EXCLUDED_FIELDS
            ]
        return result
    finally:
        wb.close()
def parse_excel_rows(
    filepath: str,
    sheet_name: str,
    vector_fields: List[str],
) -> List[Dict]:
    """Read the data rows of one sheet and build a text record for each.

    The first row supplies the column headers (a falsy header cell is named
    ``列<n>``, ``n`` being its 1-based position). Every later row is turned into
    a header -> string mapping, with ``None`` cells becoming ``""``. The values
    of ``vector_fields`` that are non-empty are joined with single spaces, in
    the order given. Rows whose joined text is blank are skipped.

    Args:
        filepath: Path of the workbook to open with ``openpyxl``.
        sheet_name: Name of the sheet to read.
        vector_fields: Header names whose values make up ``raw_text``. Names
            not present in the sheet are ignored.

    Returns:
        One dict per kept row with the keys:

        * ``"row_number"``: 1-based position of the row in the iteration,
          counting the header row as 1.
        * ``"raw_text"``: the joined field text.
        * ``"text_hash"``: SHA-256 hex digest of the UTF-8 encoded
          ``raw_text``.
        * ``"field_values"``: JSON string of the full header -> value mapping.

        The list is empty when the sheet has no rows.

    Raises:
        Whatever ``wb[sheet_name]`` raises for an unknown sheet name
        (openpyxl documents ``KeyError`` -- confirm against the installed
        version).
    """
    wb = openpyxl.load_workbook(filepath, read_only=True)
    # A read-only workbook must be closed explicitly; ``finally`` releases the
    # file handle even when the sheet lookup or row parsing raises.
    try:
        ws = wb[sheet_name]
        rows = ws.iter_rows(values_only=True)

        header_row = next(rows, None)
        if header_row is None:
            return []
        headers = [
            str(cell) if cell else f"列{idx + 1}"
            for idx, cell in enumerate(header_row)
        ]

        rows_data = []
        # The header was row 1, so the remaining rows are numbered from 2.
        for row_number, row in enumerate(rows, start=2):
            # zip stops at the shorter side, so cells beyond the header width
            # are dropped; with duplicate header names the last column wins.
            row_dict = {
                header: "" if val is None else str(val)
                for header, val in zip(headers, row)
            }
            raw_text = " ".join(
                row_dict[field] for field in vector_fields if row_dict.get(field)
            )
            if not raw_text.strip():
                continue
            rows_data.append({
                "row_number": row_number,
                "raw_text": raw_text,
                "text_hash": hashlib.sha256(raw_text.encode("utf-8")).hexdigest(),
                "field_values": json.dumps(row_dict, ensure_ascii=False),
            })
        return rows_data
    finally:
        wb.close()