Scoooooott committed on
Commit
2ccb4a8
·
1 Parent(s): 50d1ba3

Refactor app.py for Docker, optimize requirements, update Dockerfile

Browse files
Files changed (3) hide show
  1. Dockerfile +1 -0
  2. app.py +179 -0
  3. requirements.txt +4 -0
Dockerfile CHANGED
@@ -3,4 +3,5 @@ WORKDIR /code
3
  COPY ./requirements.txt /code/requirements.txt
4
  RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
5
  COPY . .
 
6
  CMD ["python", "app.py"]
 
3
  COPY ./requirements.txt /code/requirements.txt
4
  RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
5
  COPY . .
6
+ EXPOSE 8000
7
  CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import decimal
3
+ from datetime import datetime, date
4
+ from statistics import mean
5
+ import requests
6
+ import tempfile
7
+ from openpyxl import load_workbook
8
+ from frictionless import Resource, Detector
9
+ from frictionless.formats import ExcelControl
10
+ import logging
11
+
12
+ from mcp.server.fastmcp import FastMCP
13
+
14
# --- 0. Logging configuration ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- 1. Initialize the MCP server ---
# Bind to 0.0.0.0:8000 so the server is reachable from outside the
# container (the accompanying Dockerfile EXPOSEs port 8000).
mcp = FastMCP("excel-analyzer-pro", host="0.0.0.0", port=8000)
20
+
21
+ # --- 2. 核心逻辑函数 ---
22
def get_sheets_logic(file_url):
    """Download an Excel workbook from *file_url* and return its sheet names.

    Parameters:
        file_url: HTTP(S) URL of an .xlsx file.

    Returns:
        list[str]: The workbook's sheet names, in workbook order.

    Raises:
        requests.HTTPError: If the download returns an error status.
    """
    resp = requests.get(file_url, timeout=60)
    # Bug fix: the original wrote error responses (404/500 HTML bodies)
    # straight into the temp file and then failed later inside openpyxl
    # with a confusing error. Fail fast on bad HTTP status instead
    # (profile_data_logic already does this; keep the two consistent).
    resp.raise_for_status()

    with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as f_in:
        f_in.write(resp.content)
        tmp_path = f_in.name

    try:
        wb = load_workbook(tmp_path, read_only=True)
        try:
            return wb.sheetnames
        finally:
            wb.close()
    finally:
        # Bug fix: the original leaked the temp file whenever
        # load_workbook raised (e.g. on a non-xlsx payload).
        os.unlink(tmp_path)
32
+
33
def profile_data_logic(file_url, sheet_name, detail_level, sample_rows=5):
    """Analyze an Excel/CSV file and return a structured data profile.

    Downloads the workbook at *file_url*, re-saves it with computed cell
    values (resolving formulas), then uses frictionless to infer a schema
    and compute per-column statistics.

    Parameters:
        file_url: HTTP(S) URL of the workbook to analyze.
        sheet_name: Worksheet to profile; falsy lets frictionless pick
            the default sheet.
        detail_level: "basic" (name/type only), "standard" (adds missing
            rate and numeric stats), or "full" (additionally adds
            enumeration detection and value samples).
        sample_rows: Number of sample values per column at "full" level.

    Returns:
        dict: On success, a report with "file_summary" and
        "schema_analysis" keys; on failure, a dict with "status",
        "error_type" and "message" keys (exceptions are not re-raised).
    """
    src_tmp_name = None
    normalized_tmp_name = None

    try:
        logger.info(f"正在分析, 模式: {detail_level}")

        # 1. Download the file into temp space.
        resp = requests.get(file_url, timeout=60)
        resp.raise_for_status()

        # An .xlsx suffix ensures openpyxl and frictionless recognize the format.
        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as f_in:
            f_in.write(resp.content)
            src_tmp_name = f_in.name

        # 2. Reserve a second temp path for the normalized copy.
        # delete=False is deliberate: only the path is needed here; the
        # file content is written below by wb.save().
        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as f_out:
            normalized_tmp_name = f_out.name

        # 3. Re-save with computed cell values (data_only=True) so the
        # normalized copy contains formula results instead of formulas.
        wb = load_workbook(src_tmp_name, data_only=True)
        if sheet_name and sheet_name not in wb.sheetnames:
            return {"status": "error", "message": f"Sheet '{sheet_name}' 不存在"}
        wb.save(normalized_tmp_name)

        # Core frictionless detection: fill merged cells, sample up to
        # 1000 rows for type inference at 0.9 confidence.
        control = ExcelControl(sheet=sheet_name, fill_merged_cells=True) if sheet_name else ExcelControl(fill_merged_cells=True)

        detector = Detector(sample_size=1000, field_confidence=0.9)
        resource = Resource(path=normalized_tmp_name, control=control, detector=detector)
        resource.infer(stats=True)

        with resource:
            schema = resource.schema
            # NOTE(review): read_rows() materializes the entire sheet in
            # memory — fine for modest files, confirm for very large ones.
            all_rows = resource.read_rows()
            total_rows = len(all_rows)

            fields_analysis = []
            for index, field in enumerate(schema.fields):
                # --- [Basic level] field name and inferred type ---
                analysis = {
                    "name": field.name,
                    "type": field.type
                }

                # At "basic" level skip the heavier per-column computation.
                if detail_level == "basic":
                    fields_analysis.append(analysis)
                    continue

                # Collect non-null values for the remaining analysis.
                col_data = []
                null_count = 0
                for row in all_rows:
                    # NOTE(review): frictionless Row is dict-like (keyed
                    # by field name); positional access row[index] may
                    # need to be row[field.name] — confirm against the
                    # installed frictionless version.
                    val = row[index] if len(row) > index else None
                    if val is not None:
                        col_data.append(val)
                    else:
                        null_count += 1

                # --- [Standard level] data quality and numeric stats ---
                analysis["missing_rate"] = f"{(null_count / total_rows) * 100:.2f}%" if total_rows > 0 else "0%"

                if field.type in ['integer', 'number'] and col_data:
                    # Keep only values that are actually numeric.
                    numeric_values = [
                        float(v) for v in col_data
                        if isinstance(v, (int, float, decimal.Decimal))
                    ]
                    if numeric_values:
                        analysis["numeric_stats"] = {
                            "min": min(numeric_values),
                            "max": max(numeric_values),
                            "avg": round(mean(numeric_values), 2)
                        }

                # --- [Full level] enumeration detection and samples ---
                if detail_level == "full":
                    # Treat a column with few (<=15) distinct values,
                    # and at least one value, as an enumeration.
                    unique_vals = list(set(col_data))
                    if 0 < len(unique_vals) <= 15:
                        analysis["is_enumeration"] = True
                        analysis["enum_values"] = [str(v) for v in unique_vals]
                    else:
                        analysis["is_enumeration"] = False

                    # Sample extraction: dates become ISO strings, any
                    # other value is stringified.
                    samples = []
                    for v in col_data[:sample_rows]:
                        if isinstance(v, (datetime, date)):
                            samples.append(v.isoformat())
                        else:
                            samples.append(str(v))
                    analysis["samples"] = samples

                fields_analysis.append(analysis)

        # 4. Assemble the final report.
        report = {
            "file_summary": {
                "total_rows": total_rows,
                "total_fields": len(fields_analysis),
                "detail_level_applied": detail_level
            },
            "schema_analysis": fields_analysis
        }

        logger.info(f"分析成功: 识别到 {len(fields_analysis)} 列")
        return report

    except Exception as e:
        # Broad catch is deliberate: this is the MCP tool boundary and
        # the caller expects a structured error dict, not an exception.
        logger.error(f"分析异常: {str(e)}")
        return {
            "status": "failed",
            "error_type": type(e).__name__,
            "message": str(e)
        }

    finally:
        # 5. Clean up temp files so repeated calls don't fill the disk.
        for path in (src_tmp_name, normalized_tmp_name):
            if path and os.path.exists(path):
                try:
                    os.unlink(path)
                except Exception as cleanup_err:
                    logger.warning(f"清理临时文件失败: {cleanup_err}")
165
+
166
# --- 3. Register tools with MCP ---
# NOTE(review): FastMCP typically surfaces a tool function's docstring as
# the tool description shown to clients — confirm the English wording
# below is acceptable for end users (originals were in Chinese).

@mcp.tool()
def get_excel_structure(fileUrl: str) -> list[str]:
    """Get the names of all worksheets in the Excel file."""
    return get_sheets_logic(fileUrl)

@mcp.tool()
def profile_sheet_data(fileUrl: str, sheet_name: str = None, detail_level: str = "standard", sample_rows: int = 5) -> dict:
    """Deeply profile the data of the specified worksheet."""
    return profile_data_logic(fileUrl, sheet_name, detail_level, sample_rows)

# Hugging Face (Spaces) needs this entry point; SSE transport matches the
# host/port configured on the FastMCP instance above.
if __name__ == "__main__":
    mcp.run(transport="sse")
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ mcp[fastapi]
2
+ requests
3
+ openpyxl
4
+ frictionless