howard9963 committed
Commit bc261ee · verified · 1 parent: e1a9ade

Upload app.py

Files changed (1): app.py (+64 -0)
app.py CHANGED
@@ -12,6 +12,19 @@ import re
from typing import Tuple, Optional, List, Dict
from dataclasses import dataclass

+ os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "0")  # disable the Rust downloader to avoid single-flight/Reqwest errors
+ HF_BASE = "/data/.huggingface" if (os.path.isdir("/data") and os.access("/data", os.W_OK)) \
+     else os.path.join(tempfile.gettempdir(), "huggingface")
+ os.environ.setdefault("HF_HOME", HF_BASE)
+ os.environ.setdefault("TRANSFORMERS_CACHE", os.path.join(HF_BASE, "transformers"))
+ os.environ.setdefault("HF_HUB_CACHE", os.path.join(HF_BASE, "hub"))
+ for _d in (os.environ["HF_HOME"], os.environ["TRANSFORMERS_CACHE"], os.environ["HF_HUB_CACHE"]):
+     os.makedirs(_d, exist_ok=True)
+
+ print(f"🧠 [LOG] Loading the model from Hugging Face (cache/weights under {os.environ['HF_HOME']}): {model_repo_id}")
+ local_model, local_tokenizer = load_model_from_repo(model_repo_id)
+ print("✅ [LOG] Model ready (local inference)")
+
import fitz  # PyMuPDF (pymupdf)
import gradio as gr
 
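A note on ordering in the hunk above: HF_HOME, TRANSFORMERS_CACHE, and HF_HUB_CACHE are read once, when huggingface_hub/transformers are first imported, so the setdefault calls must run before those libraries load. A minimal standalone sketch of the same fallback logic (the pick_cache_base name is illustrative, not from app.py):

import os
import tempfile

def pick_cache_base() -> str:
    # Prefer the persistent /data disk (persistent storage on HF Spaces)
    # when it exists and is writable; otherwise fall back to an ephemeral
    # temp directory that survives only until the Space restarts.
    if os.path.isdir("/data") and os.access("/data", os.W_OK):
        return "/data/.huggingface"
    return os.path.join(tempfile.gettempdir(), "huggingface")

base = pick_cache_base()
os.environ.setdefault("HF_HOME", base)  # root of all Hugging Face caches
os.environ.setdefault("HF_HUB_CACHE", os.path.join(base, "hub"))
os.makedirs(os.environ["HF_HUB_CACHE"], exist_ok=True)

# Only now import libraries that read these variables at import time.
import transformers  # noqa: E402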
 
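For reference, the CPU budget string built in the second hunk below floors the available RAM to whole GiB, keeps 70% of that (90% on the CPU-only path), and never goes below 4 GiB. The same arithmetic in isolation (cpu_budget_gib is an illustrative name; psutil is the dependency app.py already uses):

import psutil

def cpu_budget_gib(fraction: float = 0.7, floor_gib: int = 4) -> str:
    # Floor available memory to whole GiB, then keep a fraction of it.
    avail_gib = psutil.virtual_memory().available // (1024 ** 3)
    return f"{max(floor_gib, int(avail_gib * fraction))}GiB"

# e.g. with 32 GiB available: int(32 * 0.7) = 22 -> "22GiB"
print(cpu_budget_gib())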
@@ -81,6 +94,57 @@ HF_TOKEN = os.getenv("HF_TOKEN")  # set this if the model is gated
_hf_tok = None
_hf_model = None

+ # ======== Load the LLM (4-bit by default on a T4, offload to /data) ========
+ def load_model_from_repo(repo_id: str):
+     # 1) Prefetch the repo to /data first
+     local_dir = _prefetch_repo(repo_id)
+
+     # 2) Tokenizer
+     tok = AutoTokenizer.from_pretrained(local_dir, use_fast=True)
+     if tok.pad_token is None:
+         tok.pad_token = tok.eos_token
+
+     # 3) 4-bit settings (the T4 has no bf16, so fp16 is the compute dtype)
+     qconf = BitsAndBytesConfig(
+         load_in_4bit=True,
+         bnb_4bit_use_double_quant=True,
+         bnb_4bit_quant_type="nf4",
+         bnb_4bit_compute_dtype=torch.float16,
+     )
+
+     # 4) Keep the offload folder under /data as well
+     offload_folder = os.path.join(os.environ["HF_HOME"], "offload")
+     os.makedirs(offload_folder, exist_ok=True)
+
+     # 5) Memory budget
+     max_memory = {}
+     if torch.cuda.is_available():
+         num_gpus = torch.cuda.device_count()
+         # Leave headroom for the KV cache and other processes; lower 14 to 12 to be more conservative
+         per_gpu_budget_gib = 14
+         for i in range(num_gpus):
+             max_memory[i] = f"{per_gpu_budget_gib}GiB"
+         # Cap CPU memory too, for offload
+         max_memory["cpu"] = f"{max(4, int(psutil.virtual_memory().available // (1024**3) * 0.7))}GiB"
+         device_map = "auto"  # let transformers spread layers across the GPUs/CPU
+         compute_dtype = torch.float16  # no bf16 on a T4, so compute in fp16
+     else:
+         # No GPU: run everything on the CPU
+         max_memory["cpu"] = f"{max(4, int(psutil.virtual_memory().available // (1024**3) * 0.9))}GiB"
+         device_map = {"": "cpu"}
+         compute_dtype = torch.float32
+
+     mdl = AutoModelForCausalLM.from_pretrained(
+         local_dir,
+         quantization_config=qconf,
+         device_map=device_map,  # use the map chosen above instead of hard-coding "auto"
+         max_memory=max_memory,
+         offload_folder=offload_folder,
+         low_cpu_mem_usage=True,
+         use_safetensors=True,
+     )
+     return mdl, tok
+
def _ensure_local_model(logs: Optional[List[str]] = None):
    global _hf_tok, _hf_model
    if _hf_tok is not None and _hf_model is not None:
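_prefetch_repo is called by load_model_from_repo but its definition is not part of this commit's hunks. A sketch of what such a helper might look like, assuming it wraps huggingface_hub.snapshot_download so the weights land under HF_HUB_CACHE and from_pretrained can load entirely from disk (the body below is a guess, not the actual code in app.py):

import os
from huggingface_hub import snapshot_download

def _prefetch_repo(repo_id: str) -> str:
    # Download (or reuse) the full repo snapshot in the local cache and
    # return its directory; the token is only needed for gated repos.
    return snapshot_download(repo_id, token=os.getenv("HF_TOKEN"))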