Spaces:
Paused
Paused
Upload hf_backend/config.py with huggingface_hub
Browse files- hf_backend/config.py +171 -0
hf_backend/config.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@dataclass
class AppConfig:
    """Runtime settings for the backend, one attribute per configurable value.

    Instances are normally built by ``load_config()``, which fills every
    field from environment variables. Field order is part of the
    constructor's positional interface and must not be changed.
    """

    # Hugging Face dataset repository and access tokens.
    dataset_repo_id: str
    hf_token: str
    shared_token: str

    # Outgoing e-mail settings (Resend API and addresses).
    resend_api_key: str
    resend_api_base: str
    from_email: str
    kindle_email_default: str

    # Translation defaults; see resolve_model() / resolve_language().
    translation_model: str
    default_language: str
    batch_size: int

    # EPUB processing knobs.
    epub_accumulated_num: int
    epub_backmatter_skip_after_percent: int
    epub_backmatter_titles: str  # comma-separated list of section titles

    # Gemini review thresholds.
    gemini_review_mode: str
    gemini_review_min_chinese_ratio: float
    gemini_review_length_ratio_min: float
    gemini_review_length_ratio_max: float

    # Translation backend endpoint, converter binary and provider API keys.
    api_base: str
    ebook_convert_bin: str
    custom_api: str
    gemini_key: str
    claude_key: str
    deepl_key: str
    caiyun_key: str
    openai_key: str

    # Paths/prefixes inside the dataset repo and the local scratch directory.
    manifest_path: str
    inbox_prefix: str
    done_prefix: str
    work_dir: Path

    # Job scheduling, locking and resume limits.
    max_jobs_per_run: int
    lock_timeout_minutes: int
    auto_resume_stale_minutes: int
    max_auto_resume_attempts: int
    epub_checkpoint_interval: int

    # Weekly source download templates (contain a ``{date}`` placeholder
    # in their default values).
    weekly_url_template: str
    weekly_filename_template: str

    # HTTP fetch settings and book-source endpoints.
    fetch_user_agent: str
    fetch_timeout_seconds: int
    standard_ebooks_search_url: str
    project_gutenberg_search_url: str
    internet_archive_advancedsearch_url: str
    internet_archive_metadata_url_template: str
    src_a_search_url: str
    src_a_base_url: str
    src_b_base_url: str

    @property
    def repo_root(self) -> Path:
        """Return the directory two levels above this source file."""
        return Path(__file__).resolve().parent.parent

    @property
    def make_book_script(self) -> Path:
        """Return the path ``<repo_root>/bilingual_book_maker/make_book.py``."""
        return self.repo_root / "bilingual_book_maker" / "make_book.py"

    def resolve_model(self, preferred: str | None = None) -> str:
        """Return the translation model name to use.

        Args:
            preferred: Caller-requested model. ``None``, an empty string or a
                whitespace-only string all mean "use ``translation_model``".

        Returns:
            The stripped model name, with the alias ``"google"`` mapped to
            ``"gemini"``.
        """
        # Strip *before* deciding on the fallback so that a blank-but-truthy
        # value such as "  " does not resolve to an empty model name.
        model = (preferred or "").strip() or self.translation_model.strip()
        if model == "google":
            return "gemini"
        return model

    def resolve_language(self, preferred: str | None = None) -> str:
        """Return the target language to use.

        Args:
            preferred: Caller-requested language. ``None``, an empty string or
                a whitespace-only string all mean "use ``default_language``".

        Returns:
            The stripped language code.
        """
        # Same blank-input handling as resolve_model().
        return (preferred or "").strip() or self.default_language.strip()
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def _require_env(name: str) -> str:
    """Return the stripped value of env var *name*; raise if missing or blank."""
    value = os.getenv(name, "").strip()
    if not value:
        raise RuntimeError(f"{name} is required")
    return value


def _env_int(name: str, default: int) -> int:
    """Return env var *name* parsed as int; unset or blank yields *default*."""
    raw = os.getenv(name, "").strip()
    if not raw:
        return default
    try:
        return int(raw)
    except ValueError as err:
        raise ValueError(f"{name} must be an integer, got {raw!r}") from err


def _env_float(name: str, default: float) -> float:
    """Return env var *name* parsed as float; unset or blank yields *default*."""
    raw = os.getenv(name, "").strip()
    if not raw:
        return default
    try:
        return float(raw)
    except ValueError as err:
        raise ValueError(f"{name} must be a number, got {raw!r}") from err


def load_config() -> AppConfig:
    """Build an ``AppConfig`` from environment variables.

    String settings are read with ``os.getenv`` and stripped. Numeric
    settings fall back to their default when the variable is unset or blank.

    Side effects:
        Creates the working directory (``WORK_DIR``, default
        ``/tmp/ebook-work``) including parents if it does not exist.

    Returns:
        A fully populated ``AppConfig``.

    Raises:
        RuntimeError: If ``HF_DATASET_REPO_ID``, ``HF_TOKEN`` or
            ``RUN_SHARED_TOKEN`` is missing or blank (checked in that order).
        ValueError: If a numeric variable holds a value that cannot be
            parsed; the message names the offending variable.
    """
    dataset_repo_id = _require_env("HF_DATASET_REPO_ID")
    hf_token = _require_env("HF_TOKEN")
    shared_token = _require_env("RUN_SHARED_TOKEN")

    work_dir = Path(os.getenv("WORK_DIR", "/tmp/ebook-work")).resolve()
    work_dir.mkdir(parents=True, exist_ok=True)

    # The router key takes precedence; the Google key is the fallback.
    gemini_key = (
        os.getenv("BBM_GEMINI_ROUTER_API_KEY", "").strip()
        or os.getenv("BBM_GOOGLE_GEMINI_KEY", "").strip()
    )

    return AppConfig(
        dataset_repo_id=dataset_repo_id,
        hf_token=hf_token,
        shared_token=shared_token,
        resend_api_key=os.getenv("RESEND_API_KEY", "").strip(),
        resend_api_base=os.getenv("RESEND_API_BASE", "https://api.resend.com").strip(),
        from_email=os.getenv("FROM_EMAIL", "").strip(),
        kindle_email_default=os.getenv("KINDLE_EMAIL_DEFAULT", "").strip(),
        translation_model=os.getenv("TRANSLATION_MODEL", "gemini").strip(),
        default_language=os.getenv("DEFAULT_LANGUAGE", "zh-hans").strip(),
        batch_size=_env_int("TRANSLATION_BATCH_SIZE", 10),
        epub_accumulated_num=_env_int("EPUB_ACCUMULATED_NUM", 2200),
        epub_backmatter_skip_after_percent=_env_int(
            "EPUB_BACKMATTER_SKIP_AFTER_PERCENT", 50
        ),
        epub_backmatter_titles=os.getenv(
            "EPUB_BACKMATTER_TITLES",
            "about the author,notes about the author,notes on the author,biography,biographical note,credits,acknowledgments,acknowledgements,copyright,bibliography,index,endnotes,notes,further reading,also by the author,about this book,about this ebook",
        ).strip(),
        gemini_review_mode=os.getenv(
            "GEMINI_REVIEW_MODE",
            "suspicious_only",
        ).strip(),
        gemini_review_min_chinese_ratio=_env_float(
            "GEMINI_REVIEW_MIN_CHINESE_RATIO", 0.2
        ),
        gemini_review_length_ratio_min=_env_float(
            "GEMINI_REVIEW_LENGTH_RATIO_MIN", 0.35
        ),
        gemini_review_length_ratio_max=_env_float(
            "GEMINI_REVIEW_LENGTH_RATIO_MAX", 2.5
        ),
        api_base=os.getenv("TRANSLATION_API_BASE", "").strip(),
        ebook_convert_bin=os.getenv("EBOOK_CONVERT_BIN", "ebook-convert").strip(),
        custom_api=os.getenv("BBM_CUSTOM_API", "").strip(),
        gemini_key=gemini_key,
        claude_key=os.getenv("BBM_CLAUDE_API_KEY", "").strip(),
        deepl_key=os.getenv("BBM_DEEPL_API_KEY", "").strip(),
        caiyun_key=os.getenv("BBM_CAIYUN_API_KEY", "").strip(),
        openai_key=os.getenv("BBM_OPENAI_API_KEY", "").strip(),
        manifest_path=os.getenv("HF_MANIFEST_PATH", "jobs/index.json").strip(),
        inbox_prefix=os.getenv("HF_INBOX_PREFIX", "inbox").strip(),
        done_prefix=os.getenv("HF_DONE_PREFIX", "done").strip(),
        work_dir=work_dir,
        max_jobs_per_run=_env_int("MAX_JOBS_PER_RUN", 1),
        lock_timeout_minutes=_env_int("LOCK_TIMEOUT_MINUTES", 2400),
        auto_resume_stale_minutes=_env_int("AUTO_RESUME_STALE_MINUTES", 20),
        max_auto_resume_attempts=_env_int("MAX_AUTO_RESUME_ATTEMPTS", 30),
        epub_checkpoint_interval=_env_int("EPUB_CHECKPOINT_INTERVAL", 50),
        weekly_url_template=os.getenv(
            "WEEKLY_SOURCE_URL_TEMPLATE",
            "https://github.com/hehonghui/awesome-english-ebooks/raw/refs/heads/master/01_economist/te_{date}/TheEconomist.{date}.epub",
        ).strip(),
        weekly_filename_template=os.getenv(
            "WEEKLY_SOURCE_FILENAME_TEMPLATE",
            "TheEconomist.{date}.epub",
        ).strip(),
        fetch_user_agent=os.getenv("FETCH_USER_AGENT", "ebook-fetcher/1.0").strip(),
        fetch_timeout_seconds=_env_int("FETCH_TIMEOUT_SECONDS", 30),
        standard_ebooks_search_url=os.getenv(
            "STANDARD_EBOOKS_SEARCH_URL",
            "https://standardebooks.org/ebooks",
        ).strip(),
        project_gutenberg_search_url=os.getenv(
            "PROJECT_GUTENBERG_SEARCH_URL",
            "https://www.gutenberg.org/ebooks/search/",
        ).strip(),
        internet_archive_advancedsearch_url=os.getenv(
            "INTERNET_ARCHIVE_ADVANCEDSEARCH_URL",
            "https://archive.org/advancedsearch.php",
        ).strip(),
        internet_archive_metadata_url_template=os.getenv(
            "INTERNET_ARCHIVE_METADATA_URL_TEMPLATE",
            "https://archive.org/metadata/{identifier}",
        ).strip(),
        src_a_search_url=os.getenv("SRC_A_SEARCH_URL", "").strip(),
        src_a_base_url=os.getenv("SRC_A_BASE_URL", "").strip(),
        src_b_base_url=os.getenv("SRC_B_BASE_URL", "").strip(),
    )
|