pzweuj committed on
Commit
b44ba53
·
verified ·
1 Parent(s): 4d1536e

Update Dockerfile

Browse files
Files changed (1) hide show
  1. Dockerfile +28 -35
Dockerfile CHANGED
@@ -1,15 +1,5 @@
1
  # TransVar API for HuggingFace Spaces
2
  # 仓库: https://github.com/pzweuj/TransVar2API
3
- #
4
- # 使用方法:
5
- # 1. 首次构建: 设置 HF_DATASET="" (从UCSC下载)
6
- # 2. 后续构建: 设置 HF_DATASET="your-username/transvar_db" (从Dataset下载)
7
- #
8
- # 数据集需要包含以下结构:
9
- # hg38/hg38.fa
10
- # hg38/ncbiRefSeq.txt.gz
11
- # hg19/hg19.fa
12
- # hg19/ncbiRefSeq.txt.gz
13
 
14
  FROM python:3.9-slim
15
 
@@ -49,36 +39,39 @@ RUN pip3 install --no-cache-dir transvar
49
  # 创建数据目录
50
  RUN mkdir -p /data/transvar_db/refseq_hg38 /data/transvar_db/refseq_hg19
51
 
52
- # ========== 下载/复制参考基因组数据 ==========
53
- WORKDIR /data
54
-
55
  # 如果设置了 HF_DATASET,从数据集下载
56
  RUN if [ -n "$HF_DATASET" ]; then \
57
- echo "Downloading from HF Dataset: $HF_DATASET"; \
58
- python3 -c "
59
  import os
60
- import sys
61
- from huggingface_hub import hf_hub_download
62
 
63
  dataset = os.environ.get('HF_DATASET', '')
64
- files = {
65
- 'hg38/hg38.fa': 'refseq_hg38/hg38.fa',
66
- 'hg38/ncbiRefSeq.txt.gz': 'refseq_hg38/ncbiRefSeq.txt.gz',
67
- 'hg19/hg19.fa': 'refseq_hg19/hg19.fa',
68
- 'hg19/ncbiRefSeq.txt.gz': 'refseq_hg19/ncbiRefSeq.txt.gz',
69
- }
70
-
71
- for remote, local in files.items():
72
- try:
73
- path = hf_hub_download(repo_id=dataset, filename=remote, repo_type='dataset')
74
- os.makedirs(os.path.dirname(f'/data/transvar_db/{local}'), exist_ok=True)
75
- import shutil
76
- shutil.copy(path, f'/data/transvar_db/{local}')
77
- print(f'Downloaded: {remote}')
78
- except Exception as e:
79
- print(f'Failed to download {remote}: {e}')
80
- sys.exit(1)
81
- "; fi
 
 
 
 
 
 
82
 
83
  # 如果没有设置 HF_DATASET,从 UCSC 下载(首次构建)
84
  WORKDIR /data/transvar_db/refseq_hg38
 
1
  # TransVar API for HuggingFace Spaces
2
  # 仓库: https://github.com/pzweuj/TransVar2API
 
 
 
 
 
 
 
 
 
 
3
 
4
  FROM python:3.9-slim
5
 
 
39
  # 创建数据目录
40
  RUN mkdir -p /data/transvar_db/refseq_hg38 /data/transvar_db/refseq_hg19
41
 
42
+ # ========== 下载数据脚本 ==========
 
 
43
  # 如果设置了 HF_DATASET,从数据集下载
44
  RUN if [ -n "$HF_DATASET" ]; then \
45
+ echo "Downloading from HF Dataset: $HF_DATASET" && \
46
+ python3 << 'PYEOF'
47
  import os
48
+ from huggingface_hub import hf_hub_download, snapshot_download
 
49
 
50
  dataset = os.environ.get('HF_DATASET', '')
51
+ target_dir = '/data/transvar_db'
52
+
53
+ # 尝试使用 snapshot_download 下载整个目录
54
+ try:
55
+ local_path = snapshot_download(repo_id=dataset, repo_type='dataset', cache_dir='/tmp/hf_cache')
56
+ # 复制到目标目录
57
+ os.system(f'cp -r {local_path}/* {target_dir}/')
58
+ print(f'Downloaded from Dataset: {local_path}')
59
+ except Exception as e:
60
+ print(f'snapshot_download failed: {e}')
61
+ # 尝试单独下载文件
62
+ files = ['hg38/hg38.fa', 'hg38/ncbiRefSeq.txt.gz', 'hg19/hg19.fa', 'hg19/ncbiRefSeq.txt.gz']
63
+ for f in files:
64
+ try:
65
+ path = hf_hub_download(repo_id=dataset, filename=f, repo_type='dataset')
66
+ subdir = f.split('/')[0] # hg38 or hg19
67
+ filename = f.split('/')[1]
68
+ os.makedirs(f'{target_dir}/refseq_{subdir}', exist_ok=True)
69
+ os.system(f'cp {path} {target_dir}/refseq_{subdir}/{filename}')
70
+ print(f'Downloaded: {f}')
71
+ except Exception as e2:
72
+ print(f'Failed to download {f}: {e2}')
73
+ PYEOF
74
+ fi
75
 
76
  # 如果没有设置 HF_DATASET,从 UCSC 下载(首次构建)
77
  WORKDIR /data/transvar_db/refseq_hg38