Scoooooott committed on
Commit
2ccb4a8
·
1 Parent(s): 50d1ba3

Refactor app.py for Docker, optimize requirements, update Dockerfile

Browse files
Files changed (3) hide show
  1. Dockerfile +1 -0
  2. app.py +179 -0
  3. requirements.txt +4 -0
Dockerfile CHANGED
@@ -3,4 +3,5 @@ WORKDIR /code
3
  COPY ./requirements.txt /code/requirements.txt
4
  RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
5
  COPY . .
 
6
  CMD ["python", "app.py"]
 
3
  COPY ./requirements.txt /code/requirements.txt
4
  RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
5
  COPY . .
6
+ EXPOSE 8000
7
  CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import decimal
3
+ from datetime import datetime, date
4
+ from statistics import mean
5
+ import requests
6
+ import tempfile
7
+ from openpyxl import load_workbook
8
+ from frictionless import Resource, Detector
9
+ from frictionless.formats import ExcelControl
10
+ import logging
11
+
12
+ from mcp.server.fastmcp import FastMCP
13
+
14
# --- 0. Logging configuration ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- 1. Initialize the MCP server ---
# Bind to 0.0.0.0:8000 so the server is reachable from outside the
# container (the accompanying Dockerfile EXPOSEs port 8000).
mcp = FastMCP("excel-analyzer-pro", host="0.0.0.0", port=8000)
20
+
21
+ # --- 2. 核心逻辑函数 ---
22
def get_sheets_logic(file_url):
    """Download an Excel workbook from *file_url* and return its sheet names.

    Parameters:
        file_url: HTTP(S) URL of an .xlsx file.

    Returns:
        list[str]: The workbook's sheet names, in workbook order.

    Raises:
        requests.HTTPError: If the download returns an error status.
    """
    resp = requests.get(file_url, timeout=60)
    # Bug fix: the original wrote error responses (404/500 HTML bodies)
    # straight into the temp file and then failed later inside openpyxl
    # with a confusing error. Fail fast on bad HTTP status instead
    # (profile_data_logic already does this; keep the two consistent).
    resp.raise_for_status()

    with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as f_in:
        f_in.write(resp.content)
        tmp_path = f_in.name

    try:
        wb = load_workbook(tmp_path, read_only=True)
        try:
            return wb.sheetnames
        finally:
            wb.close()
    finally:
        # Bug fix: the original leaked the temp file whenever
        # load_workbook raised (e.g. on a non-xlsx payload).
        os.unlink(tmp_path)
32
+
33
def profile_data_logic(file_url, sheet_name, detail_level, sample_rows=5):
    """Analyze an Excel/CSV file and return a structured data profile.

    Downloads the workbook at *file_url*, re-saves it with computed cell
    values (resolving formulas), then uses frictionless to infer a schema
    and compute per-column statistics.

    Parameters:
        file_url: HTTP(S) URL of the workbook to analyze.
        sheet_name: Worksheet to profile; falsy lets frictionless pick
            the default sheet.
        detail_level: "basic" (name/type only), "standard" (adds missing
            rate and numeric stats), or "full" (additionally adds
            enumeration detection and value samples).
        sample_rows: Number of sample values per column at "full" level.

    Returns:
        dict: On success, a report with "file_summary" and
        "schema_analysis" keys; on failure, a dict with "status",
        "error_type" and "message" keys (exceptions are not re-raised).
    """
    src_tmp_name = None
    normalized_tmp_name = None

    try:
        logger.info(f"正在分析, 模式: {detail_level}")

        # 1. Download the file into temp space.
        resp = requests.get(file_url, timeout=60)
        resp.raise_for_status()

        # An .xlsx suffix ensures openpyxl and frictionless recognize the format.
        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as f_in:
            f_in.write(resp.content)
            src_tmp_name = f_in.name

        # 2. Reserve a second temp path for the normalized copy.
        # delete=False is deliberate: only the path is needed here; the
        # file content is written below by wb.save().
        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as f_out:
            normalized_tmp_name = f_out.name

        # 3. Re-save with computed cell values (data_only=True) so the
        # normalized copy contains formula results instead of formulas.
        wb = load_workbook(src_tmp_name, data_only=True)
        if sheet_name and sheet_name not in wb.sheetnames:
            return {"status": "error", "message": f"Sheet '{sheet_name}' 不存在"}
        wb.save(normalized_tmp_name)

        # Core frictionless detection: fill merged cells, sample up to
        # 1000 rows for type inference at 0.9 confidence.
        control = ExcelControl(sheet=sheet_name, fill_merged_cells=True) if sheet_name else ExcelControl(fill_merged_cells=True)

        detector = Detector(sample_size=1000, field_confidence=0.9)
        resource = Resource(path=normalized_tmp_name, control=control, detector=detector)
        resource.infer(stats=True)

        with resource:
            schema = resource.schema
            # NOTE(review): read_rows() materializes the entire sheet in
            # memory — fine for modest files, confirm for very large ones.
            all_rows = resource.read_rows()
            total_rows = len(all_rows)

            fields_analysis = []
            for index, field in enumerate(schema.fields):
                # --- [Basic level] field name and inferred type ---
                analysis = {
                    "name": field.name,
                    "type": field.type
                }

                # At "basic" level skip the heavier per-column computation.
                if detail_level == "basic":
                    fields_analysis.append(analysis)
                    continue

                # Collect non-null values for the remaining analysis.
                col_data = []
                null_count = 0
                for row in all_rows:
                    # NOTE(review): frictionless Row is dict-like (keyed
                    # by field name); positional access row[index] may
                    # need to be row[field.name] — confirm against the
                    # installed frictionless version.
                    val = row[index] if len(row) > index else None
                    if val is not None:
                        col_data.append(val)
                    else:
                        null_count += 1

                # --- [Standard level] data quality and numeric stats ---
                analysis["missing_rate"] = f"{(null_count / total_rows) * 100:.2f}%" if total_rows > 0 else "0%"

                if field.type in ['integer', 'number'] and col_data:
                    # Keep only values that are actually numeric.
                    numeric_values = [
                        float(v) for v in col_data
                        if isinstance(v, (int, float, decimal.Decimal))
                    ]
                    if numeric_values:
                        analysis["numeric_stats"] = {
                            "min": min(numeric_values),
                            "max": max(numeric_values),
                            "avg": round(mean(numeric_values), 2)
                        }

                # --- [Full level] enumeration detection and samples ---
                if detail_level == "full":
                    # Treat a column with few (<=15) distinct values,
                    # and at least one value, as an enumeration.
                    unique_vals = list(set(col_data))
                    if 0 < len(unique_vals) <= 15:
                        analysis["is_enumeration"] = True
                        analysis["enum_values"] = [str(v) for v in unique_vals]
                    else:
                        analysis["is_enumeration"] = False

                    # Sample extraction: dates become ISO strings, any
                    # other value is stringified.
                    samples = []
                    for v in col_data[:sample_rows]:
                        if isinstance(v, (datetime, date)):
                            samples.append(v.isoformat())
                        else:
                            samples.append(str(v))
                    analysis["samples"] = samples

                fields_analysis.append(analysis)

        # 4. Assemble the final report.
        report = {
            "file_summary": {
                "total_rows": total_rows,
                "total_fields": len(fields_analysis),
                "detail_level_applied": detail_level
            },
            "schema_analysis": fields_analysis
        }

        logger.info(f"分析成功: 识别到 {len(fields_analysis)} 列")
        return report

    except Exception as e:
        # Broad catch is deliberate: this is the MCP tool boundary and
        # the caller expects a structured error dict, not an exception.
        logger.error(f"分析异常: {str(e)}")
        return {
            "status": "failed",
            "error_type": type(e).__name__,
            "message": str(e)
        }

    finally:
        # 5. Clean up temp files so repeated calls don't fill the disk.
        for path in (src_tmp_name, normalized_tmp_name):
            if path and os.path.exists(path):
                try:
                    os.unlink(path)
                except Exception as cleanup_err:
                    logger.warning(f"清理临时文件失败: {cleanup_err}")
165
+
166
# --- 3. Register tools with MCP ---
# NOTE(review): FastMCP typically surfaces a tool function's docstring as
# the tool description shown to clients — confirm the English wording
# below is acceptable for end users (originals were in Chinese).

@mcp.tool()
def get_excel_structure(fileUrl: str) -> list[str]:
    """Get the names of all worksheets in the Excel file."""
    return get_sheets_logic(fileUrl)

@mcp.tool()
def profile_sheet_data(fileUrl: str, sheet_name: str = None, detail_level: str = "standard", sample_rows: int = 5) -> dict:
    """Deeply profile the data of the specified worksheet."""
    return profile_data_logic(fileUrl, sheet_name, detail_level, sample_rows)

# Hugging Face (Spaces) needs this entry point; SSE transport matches the
# host/port configured on the FastMCP instance above.
if __name__ == "__main__":
    mcp.run(transport="sse")
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ mcp[fastapi]
2
+ requests
3
+ openpyxl
4
+ frictionless