fengxb30 committed on
Commit
877acbb
·
verified ·
1 Parent(s): 6870358

Delete finai_xbrl_pipeline.ipynb

Browse files
Files changed (1) hide show
  1. finai_xbrl_pipeline.ipynb +0 -333
finai_xbrl_pipeline.ipynb DELETED
@@ -1,333 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "id": "8b6363a9",
6
- "metadata": {},
7
- "source": [
8
- "# FinAI — SEC XBRL 下载与解析流水线\n",
9
- "\n",
10
- "**说明**:本 Notebook 包含:\n",
11
- "\n",
12
- "- 自动下载目标公司的 XBRL(10‑K / 10‑Q)文件\n",
13
- "- 解析 XBRL(.xml/.xbrl/.xsd)并导出结构化 JSON/CSV\n",
14
- "- 为后续 FinGPT/LoRA 微调准备数据格式与建议\n",
15
- "\n",
16
- "**使用方法**:在本地或 Colab 上运行每个单元格。若在 Colab 中运行请在第一格安装依赖(已包含)。\n",
17
- "\n",
18
- "目标公司:AAPL, MSFT, GOOGL, JPM, GS, XOM, WMT, TSLA\n"
19
- ]
20
- },
21
- {
22
- "cell_type": "code",
23
- "id": "a04aab59",
24
- "metadata": {
25
- "jupyter": {
26
- "is_executing": true
27
- }
28
- },
29
- "source": [
30
- "# 安装依赖(在本地可用 pip 安装;在 Colab 可直接运行)\n",
31
- "!pip install tqdm requests lxml beautifulsoup4 pandas xbrl-parser || true\n",
32
- "\n",
33
- "# 小提示:在某些环境中,xbrl-parser 可能不可用或不可维护,\n",
34
- "# 使用 BeautifulSoup + lxml 解析 XML/XBRL 是稳妥方案。\n"
35
- ],
36
- "outputs": [],
37
- "execution_count": null
38
- },
39
- {
40
- "cell_type": "code",
41
- "id": "b27e39a1",
42
- "metadata": {
43
- "ExecuteTime": {
44
- "end_time": "2025-11-12T04:29:37.420996Z",
45
- "start_time": "2025-11-12T04:29:36.325209Z"
46
- }
47
- },
48
- "source": [
49
- "import os\n",
50
- "import requests\n",
51
- "import json\n",
52
- "from bs4 import BeautifulSoup\n",
53
- "from tqdm import tqdm\n",
54
- "import pandas as pd\n",
55
- "\n",
56
- "# 目标公司(CIK 保留 10 位格式)\n",
57
- "companies = {\n",
58
- " \"AAPL\": {\"name\": \"Apple Inc.\", \"cik\": \"0000320193\"},\n",
59
- " \"MSFT\": {\"name\": \"Microsoft Corporation\", \"cik\": \"0000789019\"},\n",
60
- " \"GOOGL\": {\"name\": \"Alphabet Inc.\", \"cik\": \"0001652044\"},\n",
61
- " \"JPM\": {\"name\": \"JPMorgan Chase & Co.\", \"cik\": \"0000019617\"},\n",
62
- " \"GS\": {\"name\": \"Goldman Sachs Group Inc.\", \"cik\": \"0000886982\"},\n",
63
- " \"XOM\": {\"name\": \"Exxon Mobil Corporation\", \"cik\": \"0000034088\"},\n",
64
- " \"WMT\": {\"name\": \"Walmart Inc.\", \"cik\": \"0000104169\"},\n",
65
- " \"TSLA\": {\"name\": \"Tesla Inc.\", \"cik\": \"0001318605\"}\n",
66
- "}\n",
67
- "\n",
68
- "HEADERS = {\"User-Agent\": \"FinAI-Agent/1.0 (16696065317@163.com)\"}\n",
69
- "SAVE_ROOT = \"./sec_xbrl_data\"\n",
70
- "os.makedirs(SAVE_ROOT, exist_ok=True)\n"
71
- ],
72
- "outputs": [],
73
- "execution_count": 2
74
- },
75
- {
76
- "cell_type": "code",
77
- "id": "31227534",
78
- "metadata": {
79
- "ExecuteTime": {
80
- "end_time": "2025-11-12T04:30:09.569648Z",
81
- "start_time": "2025-11-12T04:30:09.559109Z"
82
- }
83
- },
84
- "source": [
85
- "def download_xbrl_for_company(cik_10, ticker, max_filings=5, target_forms=('10-K','10-Q')):\n",
86
- " \"\"\"下载指定公司最近的若干 10-K/10-Q 的 XBRL 相关文件(.xml .xbrl .xsd)\n",
87
- " cik_10: 字符串,10 位 CIK(例 '0000320193')\n",
88
- " max_filings: 最多抓取多少份对应表单(按最近排序)\n",
89
- " 返回值:下载的文件路径列表\n",
90
- " \"\"\"\n",
91
- " base_submission = f\"https://data.sec.gov/submissions/CIK{cik_10}.json\"\n",
92
- " resp = requests.get(base_submission, headers=HEADERS)\n",
93
- " if resp.status_code != 200:\n",
94
- " print(f\"无法获取提交索引: {cik_10} -> HTTP {resp.status_code}\")\n",
95
- " return []\n",
96
- " data = resp.json()\n",
97
- " recent = data.get('filings', {}).get('recent', {})\n",
98
- " forms = recent.get('form', [])\n",
99
- " accessions = recent.get('accessionNumber', [])\n",
100
- "\n",
101
- " selected = []\n",
102
- " for f, a in zip(forms, accessions):\n",
103
- " if f in target_forms:\n",
104
- " selected.append(a)\n",
105
- " if len(selected) >= max_filings:\n",
106
- " break\n",
107
- "\n",
108
- " downloaded = []\n",
109
- " cik_num = str(int(cik_10)) # remove leading zeros for URL\n",
110
- " company_dir = os.path.join(SAVE_ROOT, ticker)\n",
111
- " os.makedirs(company_dir, exist_ok=True)\n",
112
- "\n",
113
- " for acc in tqdm(selected, desc=f\"{ticker} filings\"):\n",
114
- " acc_id = acc.replace('-', '')\n",
115
- " idx_url = f\"https://www.sec.gov/Archives/edgar/data/{cik_num}/{acc_id}/index.json\"\n",
116
- " r = requests.get(idx_url, headers=HEADERS)\n",
117
- " if r.status_code != 200:\n",
118
- " continue\n",
119
- " items = r.json().get('directory', {}).get('item', [])\n",
120
- " for item in items:\n",
121
- " if item.get('name', '').lower().endswith(('.xbrl', '.xml', '.xsd')):\n",
122
- " url = f\"https://www.sec.gov/Archives/edgar/data/{cik_num}/{acc_id}/{item['name']}\"\n",
123
- " local_path = os.path.join(company_dir, f\"{acc}_{item['name']}\")\n",
124
- " if not os.path.exists(local_path):\n",
125
- " try:\n",
126
- " respf = requests.get(url, headers=HEADERS)\n",
127
- " if respf.status_code == 200:\n",
128
- " with open(local_path, 'wb') as fp:\n",
129
- " fp.write(respf.content)\n",
130
- " downloaded.append(local_path)\n",
131
- " except Exception as e:\n",
132
- " print('下载异常', e)\n",
133
- " else:\n",
134
- " downloaded.append(local_path)\n",
135
- " return downloaded\n",
136
- "\n",
137
- "# 示例调用(请在有网络的环境运行)\n",
138
- "# downloaded = download_xbrl_for_company('0000320193', 'AAPL', max_filings=3)\n",
139
- "# print(downloaded)\n"
140
- ],
141
- "outputs": [],
142
- "execution_count": 10
143
- },
144
- {
145
- "cell_type": "code",
146
- "id": "eaf17825",
147
- "metadata": {
148
- "jupyter": {
149
- "is_executing": true
150
- }
151
- },
152
- "source": [
153
- "# 批量下载所有目标公司(请在有网络环境下运行)\n",
154
- "all_downloads = {}\n",
155
- "for ticker, info in companies.items():\n",
156
- " print('\\n---', ticker, info['name'])\n",
157
- " files = download_xbrl_for_company(info['cik'], ticker, max_filings=5)\n",
158
- " all_downloads[ticker] = files\n",
159
- "\n",
160
- "# 保存一份索引\n",
161
- "with open(os.path.join(SAVE_ROOT, 'download_index.json'), 'w', encoding='utf-8') as f:\n",
162
- " json.dump(all_downloads, f, indent=2, ensure_ascii=False)\n",
163
- "\n",
164
- "print('\\n下载完成。请检查', SAVE_ROOT)\n"
165
- ],
166
- "outputs": [],
167
- "execution_count": null
168
- },
169
- {
170
- "cell_type": "code",
171
- "id": "afafa968",
172
- "metadata": {
173
- "ExecuteTime": {
174
- "end_time": "2025-11-12T05:10:24.104088Z",
175
- "start_time": "2025-11-12T05:10:24.083690Z"
176
- }
177
- },
178
- "source": [
179
- "def parse_xbrl_to_facts(filepath):\n",
180
- " \"\"\"将 XBRL/XML 文件解析为一个字典 facts,尽量提取有意义的 tag 和 value\n",
181
- " 注意:XBRL 里会有命名空间和带前缀的标签,这里做简单处理以便后续使用。\n",
182
- " \"\"\"\n",
183
- " with open(filepath, 'rb') as f:\n",
184
- " content = f.read()\n",
185
- " try:\n",
186
- " soup = BeautifulSoup(content, 'lxml-xml')\n",
187
- " except Exception:\n",
188
- " soup = BeautifulSoup(content, 'xml')\n",
189
- " facts = []\n",
190
- " # 查找可能是事实(amount 或者有 contextRef 的节点)\n",
191
- " for tag in soup.find_all():\n",
192
- " name = tag.name\n",
193
- " text = (tag.get_text() or '').strip()\n",
194
- " if not text:\n",
195
- " continue\n",
196
- " # 过滤掉长文本标签(例如文档注释),保留可能的数值或简短事实\n",
197
- " if len(text) > 1000:\n",
198
- " continue\n",
199
- " # 有些 XBRL 事实带 contextRef 或 unitRef\n",
200
- " attrs = dict(tag.attrs)\n",
201
- " facts.append({\n",
202
- " 'tag': name,\n",
203
- " 'text': text,\n",
204
- " 'attrs': attrs\n",
205
- " })\n",
206
- " return facts\n",
207
- "\n",
208
- "# 示例:解析单个文件(请替换为真实下载后的路径)\n",
209
- "# facts = parse_xbrl_to_facts('./sec_xbrl_data/AAPL/0000320193-...-cal.xml')\n",
210
- "# print(len(facts))\n"
211
- ],
212
- "outputs": [],
213
- "execution_count": 14
214
- },
215
- {
216
- "cell_type": "code",
217
- "execution_count": null,
218
- "id": "95e236d1",
219
- "metadata": {},
220
- "outputs": [],
221
- "source": [
222
- "# 将目录下所有下载的 XBRL 文件解析并存储为结构化 JSON/CSV\n",
223
- "import glob\n",
224
- "\n",
225
- "all_parsed = {}\n",
226
- "for ticker in companies.keys():\n",
227
- " company_dir = os.path.join(SAVE_ROOT, ticker)\n",
228
- " if not os.path.exists(company_dir):\n",
229
- " continue\n",
230
- " all_parsed[ticker] = {}\n",
231
- " for fp in glob.glob(os.path.join(company_dir, '*')):\n",
232
- " try:\n",
233
- " facts = parse_xbrl_to_facts(fp)\n",
234
- " all_parsed[ticker][os.path.basename(fp)] = facts\n",
235
- " except Exception as e:\n",
236
- " print('解析失败', fp, e)\n",
237
- "\n",
238
- "with open(os.path.join(SAVE_ROOT, 'parsed_facts.json'), 'w', encoding='utf-8') as f:\n",
239
- " json.dump(all_parsed, f, ensure_ascii=False, indent=2)\n",
240
- "\n",
241
- "print('解析并保存完成。')\n"
242
- ]
243
- },
244
- {
245
- "cell_type": "code",
246
- "execution_count": null,
247
- "id": "1c339487",
248
- "metadata": {},
249
- "outputs": [],
250
- "source": [
251
- "# 从解析结果中抽取数值型的事实示例(尝试解析数值与单位)\n",
252
- "import re\n",
253
- "\n",
254
- "def extract_numeric_facts(facts_list):\n",
255
- " numeric = []\n",
256
- " for f in facts_list:\n",
257
- " text = f['text']\n",
258
- " # 简单判断是否为数字(包括带逗号和括号表示负数的情况)\n",
259
- " if re.match(r'^[\\(\\)\\d,\\.-]+$', text.strip()):\n",
260
- " # 清洗数字形式\n",
261
- " cleaned = text.replace('(', '-').replace(')', '').replace(',', '')\n",
262
- " try:\n",
263
- " val = float(cleaned)\n",
264
- " except:\n",
265
- " continue\n",
266
- " numeric.append({'tag': f['tag'], 'value': val, 'attrs': f['attrs']})\n",
267
- " return numeric\n",
268
- "\n",
269
- "# 示例读取 parsed_facts.json(如已生成)\n",
270
- "# with open(os.path.join(SAVE_ROOT,'parsed_facts.json'),'r',encoding='utf-8') as f:\n",
271
- "# data = json.load(f)\n",
272
- "# demo = []\n",
273
- "# for k,v in data.get('AAPL',{}).items():\n",
274
- "# demo.extend(extract_numeric_facts(v))\n",
275
- "# print(demo[:20])\n"
276
- ]
277
- },
278
- {
279
- "cell_type": "markdown",
280
- "id": "facb30a9",
281
- "metadata": {},
282
- "source": "\n"
283
- },
284
- {
285
- "cell_type": "code",
286
- "execution_count": null,
287
- "id": "23e44084",
288
- "metadata": {},
289
- "outputs": [],
290
- "source": [
291
- "# 示例:从 parsed_facts.json 构造训练样本并保存为 JSONL(监督微调格式)\n",
292
- "import random\n",
293
- "\n",
294
- "OUT_TRAIN = os.path.join(SAVE_ROOT, 'fingpt_train.jsonl')\n",
295
- "\n",
296
- "with open(os.path.join(SAVE_ROOT,'parsed_facts.json'),'r',encoding='utf-8') as f:\n",
297
- " parsed = json.load(f)\n",
298
- "\n",
299
- "samples = []\n",
300
- "for ticker, files in parsed.items():\n",
301
- " for fname, facts in files.items():\n",
302
- " # 简单拼接 facts 的若干条为示例输入\n",
303
- " small = facts[:30]\n",
304
- " context = json.dumps(small, ensure_ascii=False)\n",
305
- " inst = f\"请根据下列 XBRL 提取的事实,简要总结公司的关键财务数字(最多 60 字):\\n{context}\"\n",
306
- " resp = \"<在运行时由人工/规则生成真实标签或使用自动规则生成占位标签>\"\n",
307
- " samples.append({'instruction': inst, 'response': resp})\n",
308
- "\n",
309
- "# 保存前 1000 条(示例)\n",
310
- "with open(OUT_TRAIN, 'w', encoding='utf-8') as fout:\n",
311
- " for s in samples[:1000]:\n",
312
- " fout.write(json.dumps(s, ensure_ascii=False) + '\\n')\n",
313
- "\n",
314
- "print('训练样本示例已保存:', OUT_TRAIN)\n"
315
- ]
316
- },
317
- {
318
- "cell_type": "markdown",
319
- "id": "9b6f419b",
320
- "metadata": {},
321
- "source": "\n"
322
- }
323
- ],
324
- "metadata": {
325
- "kernelspec": {
326
- "name": "python3",
327
- "language": "python",
328
- "display_name": "Python 3 (ipykernel)"
329
- }
330
- },
331
- "nbformat": 4,
332
- "nbformat_minor": 5
333
- }