Corin1998 committed on
Commit
52f6947
·
verified ·
1 Parent(s): 56ab7e4

Update pipelines/storage.py

Browse files
Files changed (1) hide show
  1. pipelines/storage.py +17 -11
pipelines/storage.py CHANGED
@@ -4,35 +4,40 @@ import json
4
  import pandas as pd
5
  from huggingface_hub import HfApi
6
 
 
7
  def _as_parquet_bytes(record: dict) -> bytes:
8
  df = pd.DataFrame([record])
9
  buf = io.BytesIO()
10
  df.to_parquet(buf, index=False)
11
  return buf.getvalue()
12
 
 
13
  def persist_to_hf(
14
- dataset_repo: str,
15
- record: dict,
16
- anon_pdf_bytes: bytes,
17
- paraquet_path: str,
18
- json_path: str,
19
- pdf_path: str,
20
  ):
21
  token = os.environ.get("HF_TOKEN")
22
  if not token:
23
- return ["error":"HF_TOKEN not set"]
 
24
 
25
  api = HfApi(token=token)
26
 
 
27
  pq_bytes = _as_parquet_bytes(record)
28
  api.upload_file(
29
  path_or_fileobj=pq_bytes,
30
- path_in_repo=paraquet_path,
31
  repo_id=dataset_repo,
32
  repo_type="dataset",
33
  commit_message="Add candidate parquet record",
34
  )
35
 
 
36
  js_bytes = json.dumps(record, ensure_ascii=False, indent=2).encode("utf-8")
37
  api.upload_file(
38
  path_or_fileobj=js_bytes,
@@ -40,14 +45,15 @@ def persist_to_hf(
40
  repo_id=dataset_repo,
41
  repo_type="dataset",
42
  commit_message="Add candidate JSON record",
43
- )
44
 
 
45
  api.upload_file(
46
  path_or_fileobj=anon_pdf_bytes,
47
  path_in_repo=pdf_path,
48
  repo_id=dataset_repo,
49
  repo_type="dataset",
50
- commit_message="Add anonymized PDF",
51
  )
52
 
53
- return{"status": "ok", "dataset_repo": dataset_repo, "files": [paraquet_path, json_path, pdf_path]}
 
4
  import pandas as pd
5
  from huggingface_hub import HfApi
6
 
7
+
8
  def _as_parquet_bytes(record: dict) -> bytes:
9
  df = pd.DataFrame([record])
10
  buf = io.BytesIO()
11
  df.to_parquet(buf, index=False)
12
  return buf.getvalue()
13
 
14
+
15
  def persist_to_hf(
16
+ dataset_repo: str,
17
+ record: dict,
18
+ anon_pdf_bytes: bytes,
19
+ parquet_path: str,
20
+ json_path: str,
21
+ pdf_path: str,
22
  ):
23
  token = os.environ.get("HF_TOKEN")
24
  if not token:
25
+ # ここが原因でした。辞書で返します。 (This was the cause of the error; return a dict instead.)
26
+ return {"error": "HF_TOKEN not set"}
27
 
28
  api = HfApi(token=token)
29
 
30
+ # 1) Parquet
31
  pq_bytes = _as_parquet_bytes(record)
32
  api.upload_file(
33
  path_or_fileobj=pq_bytes,
34
+ path_in_repo=parquet_path,
35
  repo_id=dataset_repo,
36
  repo_type="dataset",
37
  commit_message="Add candidate parquet record",
38
  )
39
 
40
+ # 2) JSON
41
  js_bytes = json.dumps(record, ensure_ascii=False, indent=2).encode("utf-8")
42
  api.upload_file(
43
  path_or_fileobj=js_bytes,
 
45
  repo_id=dataset_repo,
46
  repo_type="dataset",
47
  commit_message="Add candidate JSON record",
48
+ )
49
 
50
+ # 3) PDF
51
  api.upload_file(
52
  path_or_fileobj=anon_pdf_bytes,
53
  path_in_repo=pdf_path,
54
  repo_id=dataset_repo,
55
  repo_type="dataset",
56
+ commit_message="Add anonymized PDF",
57
  )
58
 
59
+ return {"status": "ok", "dataset_repo": dataset_repo, "files": [parquet_path, json_path, pdf_path]}