Spaces:
Running
Running
| import os | |
| import hashlib | |
| import json | |
| import openpyxl | |
| from typing import List, Dict, Tuple | |
# Uploaded files are stored in "data/uploads" under the parent of the
# directory that contains this module.
UPLOAD_DIR = os.path.join(
    os.path.dirname(os.path.dirname(__file__)),
    "data",
    "uploads",
)

# Create the directory at import time; an already existing one is fine.
os.makedirs(UPLOAD_DIR, exist_ok=True)
def save_upload_file(file_bytes: bytes, filename: str) -> str:
    """Write an uploaded file into ``UPLOAD_DIR`` and return its path.

    Only the final path component of *filename* is used, so a supplied name
    such as ``"../../app.py"`` or ``"/etc/passwd"`` cannot place the file
    outside ``UPLOAD_DIR``. An existing file with the same name is
    overwritten.

    Args:
        file_bytes: Raw content of the uploaded file.
        filename: Name supplied with the upload. Any directory part
            (separated by forward slashes or backslashes) is discarded.

    Returns:
        The path of the written file inside ``UPLOAD_DIR``.

    Raises:
        ValueError: If *filename* has no usable final component
            (empty, ``"."`` or ``".."``).
    """
    # Normalise backslashes first: on POSIX, os.path.basename does not treat
    # them as separators, so a Windows-style "..\\..\\x" would slip through.
    safe_name = os.path.basename(filename.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise ValueError(f"invalid upload filename: {filename!r}")

    filepath = os.path.join(UPLOAD_DIR, safe_name)
    with open(filepath, "wb") as f:
        f.write(file_bytes)
    return filepath
# Header names that merely number or identify a row. get_sheet_info leaves
# them out of its "columns" listing (they still appear in "all_columns").
EXCLUDED_FIELDS = {
    # Chinese: serial number, row number, number/code, row
    "序号", "行号", "编号", "行",
    # "id" in the listed capitalisations
    "id", "ID", "Id",
    # "no" in the listed capitalisations
    "no", "No", "NO",
    "#",
}
def get_sheet_info(filepath: str) -> Dict:
    """Collect sheet names and header-row column names from a workbook.

    The first row of every sheet is treated as the header row. A header
    cell whose value is falsy (``None``, empty string, ``0``) is replaced
    by the placeholder ``列<n>``, where ``<n>`` is the 1-based column
    position.

    Args:
        filepath: Path of the workbook to open with openpyxl.

    Returns:
        A dict with three keys:

        * ``"sheet_names"``: the workbook's sheet names.
        * ``"all_columns"``: sheet name -> every header of that sheet.
        * ``"columns"``: sheet name -> headers whose stripped text is not
          in ``EXCLUDED_FIELDS``.
    """
    wb = openpyxl.load_workbook(filepath, read_only=True)
    # try/finally so the workbook is closed even when reading a sheet raises;
    # previously an exception left the file handle open.
    try:
        result = {"sheet_names": wb.sheetnames, "columns": {}, "all_columns": {}}
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            # Only row 1 is requested; a sheet without rows yields no headers.
            first_row = next(
                ws.iter_rows(min_row=1, max_row=1, values_only=True), ()
            )
            headers = [
                str(c) if c else f"列{i+1}" for i, c in enumerate(first_row)
            ]
            result["all_columns"][sheet_name] = headers
            result["columns"][sheet_name] = [
                h for h in headers if h.strip() not in EXCLUDED_FIELDS
            ]
    finally:
        wb.close()
    return result
def parse_excel_rows(
    filepath: str,
    sheet_name: str,
    vector_fields: List[str],
) -> List[Dict]:
    """Read the data rows of one sheet and build a text record for each.

    The first row is the header row. A falsy header cell (``None``, empty
    string, ``0``) is named ``列<n>``, with ``<n>`` the 1-based column
    position. Every following row is turned into a header -> string mapping
    (``None`` cells become ``""``; cells beyond the last header are ignored;
    with duplicate header names the later column wins). The non-empty values
    of *vector_fields* are joined with single spaces to form the row text.
    Rows whose text is empty or whitespace-only are skipped.

    Args:
        filepath: Path of the workbook to open with openpyxl.
        sheet_name: Name of the sheet to read.
        vector_fields: Header names whose values make up the row text, in
            the order they should be joined. Names missing from the sheet
            are ignored.

    Returns:
        One dict per kept row with the keys:

        * ``"row_number"``: 1-based row position (the header is row 1).
        * ``"raw_text"``: the joined text of the selected fields.
        * ``"text_hash"``: SHA-256 hex digest of ``raw_text`` (UTF-8).
        * ``"field_values"``: JSON string of the full header -> value
          mapping of the row.
    """
    wb = openpyxl.load_workbook(filepath, read_only=True)
    # try/finally so the workbook is closed even when the sheet lookup or a
    # row read raises; previously e.g. an unknown sheet name leaked the handle.
    try:
        ws = wb[sheet_name]
        rows_data = []
        headers = []
        for row_idx, row in enumerate(ws.iter_rows(values_only=True)):
            if row_idx == 0:
                headers = [
                    str(c) if c else f"列{i+1}" for i, c in enumerate(row)
                ]
                continue

            # zip stops at the shorter side, so surplus cells are dropped.
            row_dict = {
                header: "" if val is None else str(val)
                for header, val in zip(headers, row)
            }

            text_parts = [
                row_dict[field] for field in vector_fields if row_dict.get(field)
            ]
            raw_text = " ".join(text_parts)
            if not raw_text.strip():
                continue

            rows_data.append({
                "row_number": row_idx + 1,
                "raw_text": raw_text,
                "text_hash": hashlib.sha256(raw_text.encode("utf-8")).hexdigest(),
                "field_values": json.dumps(row_dict, ensure_ascii=False),
            })
    finally:
        wb.close()
    return rows_data