Spaces:

teryryy
/

vector-match-api

Running

App Files Files Community

vector-match-api / services /excel_service.py

teryryy

Upload 13 files

ba016aa verified 3 days ago

raw

history blame contribute delete

2.32 kB

	import os
	import hashlib
	import json
	import openpyxl
	from typing import List, Dict, Tuple


	# Directory that receives uploaded files: "<parent of this file's directory>/data/uploads".
	UPLOAD_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data", "uploads")
	# Created at import time; exist_ok=True makes this a no-op when it already exists.
	os.makedirs(UPLOAD_DIR, exist_ok=True)


	def save_upload_file(file_bytes: bytes, filename: str) -> str:
	    """Write uploaded bytes into ``UPLOAD_DIR`` and return the saved path.

	    Only the final path component of *filename* is used, so a name such as
	    ``"../../etc/passwd"`` or an absolute path cannot place the file outside
	    ``UPLOAD_DIR``.

	    Args:
	        file_bytes: Raw content of the uploaded file.
	        filename: Client-supplied file name; any directory part is discarded.

	    Returns:
	        The path of the file that was written inside ``UPLOAD_DIR``.

	    Raises:
	        ValueError: If *filename* has no usable final component
	            (empty, ``"."`` or ``".."``).
	    """
	    # Treat "\\" as a separator too, so Windows-style client paths are reduced
	    # to their last component regardless of the host OS.
	    safe_name = os.path.basename(filename.replace("\\", "/"))
	    if safe_name in ("", ".", ".."):
	        raise ValueError(f"invalid upload filename: {filename!r}")

	    filepath = os.path.join(UPLOAD_DIR, safe_name)
	    with open(filepath, "wb") as f:
	        f.write(file_bytes)
	    return filepath


	# Header names treated as row-index / identifier columns. get_sheet_info
	# leaves any header whose stripped text is in this set out of its "columns"
	# listing (it is still reported under "all_columns").
	EXCLUDED_FIELDS = {
	    # Chinese labels: serial number, row number, number/code, row.
	    "序号",
	    "行号",
	    "编号",
	    "行",
	    # "id" spellings.
	    "id",
	    "ID",
	    "Id",
	    # "no" spellings.
	    "no",
	    "No",
	    "NO",
	    # Bare hash sign.
	    "#",
	}

	def get_sheet_info(filepath: str) -> Dict:
	    """Collect the sheet names and first-row headers of a workbook.

	    Args:
	        filepath: Path of the workbook file to inspect.

	    Returns:
	        A dict with three keys:

	        * ``"sheet_names"``: the workbook's sheet names.
	        * ``"all_columns"``: sheet name -> every header taken from row 1.
	        * ``"columns"``: sheet name -> the same headers minus those whose
	          stripped text is in ``EXCLUDED_FIELDS``.

	        A sheet with no first row maps to an empty list under both keys.
	    """
	    wb = openpyxl.load_workbook(filepath, read_only=True)
	    # The workbook is opened in read-only mode and must be closed explicitly;
	    # try/finally guarantees that even when reading a sheet raises.
	    try:
	        result = {"sheet_names": wb.sheetnames, "columns": {}, "all_columns": {}}
	        for sheet_name in wb.sheetnames:
	            ws = wb[sheet_name]
	            # At most one row is requested; fall back to an empty tuple
	            # when the sheet yields nothing.
	            first_row = next(
	                iter(ws.iter_rows(min_row=1, max_row=1, values_only=True)), ()
	            )
	            # Falsy header cells get a positional placeholder name, built
	            # the same way parse_excel_rows builds its headers.
	            headers = [
	                str(c) if c else f"列{i+1}" for i, c in enumerate(first_row)
	            ]
	            result["all_columns"][sheet_name] = headers
	            result["columns"][sheet_name] = [
	                h for h in headers if h.strip() not in EXCLUDED_FIELDS
	            ]
	    finally:
	        wb.close()
	    return result


	def parse_excel_rows(
	    filepath: str,
	    sheet_name: str,
	    vector_fields: List[str],
	) -> List[Dict]:
	    """Read the data rows of one sheet and build a text record for each.

	    The first row supplies the column headers. For every later row, the
	    non-empty values of the columns named in *vector_fields* are joined with
	    single spaces (in *vector_fields* order) to form the row's text. Rows
	    whose joined text is blank are skipped.

	    Args:
	        filepath: Path of the workbook file to read.
	        sheet_name: Name of the sheet to read; it is looked up with
	            ``wb[sheet_name]``, which is expected to raise ``KeyError`` when
	            the sheet does not exist.
	        vector_fields: Header names whose values make up the row text.

	    Returns:
	        One dict per kept row with the keys:

	        * ``"row_number"``: 1-based position of the row (header row is 1).
	        * ``"raw_text"``: the joined text.
	        * ``"text_hash"``: SHA-256 hex digest of the UTF-8 encoded text.
	        * ``"field_values"``: JSON string of all header -> value pairs
	          for the row (values stringified, ``None`` becomes ``""``).
	    """
	    wb = openpyxl.load_workbook(filepath, read_only=True)
	    # The workbook is opened in read-only mode and must be closed explicitly;
	    # try/finally guarantees that even on a missing sheet or a failure
	    # part-way through the rows.
	    try:
	        ws = wb[sheet_name]
	        rows_data = []
	        headers = []
	        for row_idx, row in enumerate(ws.iter_rows(values_only=True)):
	            if row_idx == 0:
	                # Falsy header cells get a positional placeholder name,
	                # built the same way get_sheet_info builds its headers.
	                headers = [
	                    str(c) if c else f"列{i+1}" for i, c in enumerate(row)
	                ]
	                continue

	            # zip stops at the shorter input, so cells beyond the last
	            # header are dropped; a repeated header keeps the later value.
	            row_dict = {
	                header: "" if val is None else str(val)
	                for header, val in zip(headers, row)
	            }

	            raw_text = " ".join(
	                row_dict[field] for field in vector_fields if row_dict.get(field)
	            )
	            if not raw_text.strip():
	                continue

	            rows_data.append({
	                # row_idx is 0-based with the header at 0.
	                "row_number": row_idx + 1,
	                "raw_text": raw_text,
	                "text_hash": hashlib.sha256(raw_text.encode("utf-8")).hexdigest(),
	                "field_values": json.dumps(row_dict, ensure_ascii=False),
	            })
	    finally:
	        wb.close()
	    return rows_data