File size: 8,908 Bytes
6c21523
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#!/usr/bin/env bash
# ─────────────────────────────────────────────────────────────────────────────
# deploy_changes.sh  —  Push local changes to GitHub + HuggingFace Space
#
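# USAGE:
#   chmod +x deploy_changes.sh          # one-time: make it executable
#   ./deploy_changes.sh "your message"  # commit + push to both remotes
#   ./deploy_changes.sh                 # uses default commit message
#   ./deploy_changes.sh "your message" --reset-db
#                                       # also delete vectorstore/ from the HF Dataset repo first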
#
# WHAT IT DOES (in order):
#   0. With --reset-db only: deletes the vectorstore/ files from the HF Dataset repo
#   1. Stages all modified tracked files  (git add -u)
#   2. Commits with your message  (skipped when nothing is staged)
#   3. Pushes to GitHub  (origin  → github.com/irajkooh/MultiModalRag)
#   4. Syncs tracked top-level data/ files and data/tables/ to the HF Dataset
#      repo irajkoohi/MultiModalRag_dataset
#   5. Pushes to HF Space via a clean orphan branch — binary data files
#      (PDF, PNG, DOCX) are excluded from the Space push because HF Space
#      does not support Git LFS; those files live in the HF Dataset repo
#      irajkoohi/MultiModalRag_dataset and are downloaded at Space startup.
#
# DATA FILES (persistent across Space restarts):
#   - Add/remove files in data/ and run:
#       python3 -c "
#       from huggingface_hub import HfApi
#       import os, sys
#       api = HfApi(token=os.environ['HF_TOKEN'])
#       api.upload_file(path_or_fileobj=sys.argv[1],
#                       path_in_repo='data/'+os.path.basename(sys.argv[1]),
#                       repo_id='irajkoohi/MultiModalRag_dataset',
#                       repo_type='dataset')
#       " data/yourfile.pdf
#
# NOTES:
#   - Untracked new files are NOT staged automatically; run `git add <file>` first
#   - If GitHub push fails with "non-fast-forward", run:
#       git pull --rebase origin main && ./deploy_changes.sh "retry"
# ─────────────────────────────────────────────────────────────────────────────
# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline.
set -euo pipefail

# Argument parsing.
#   --reset-db   (anywhere on the command line) enables the vectorstore reset.
#   first other  argument is the commit message; later ones are ignored.
# The flag is never taken as the commit message, so `./deploy_changes.sh --reset-db`
# commits with the default message instead of the literal text "--reset-db".
MSG=""
RESET_DB=false
for arg in "$@"; do
    case "$arg" in
        --reset-db)
            RESET_DB=true
            ;;
        *)
            # Keep only the first non-flag argument.
            if [[ -z "$MSG" ]]; then
                MSG="$arg"
            fi
            ;;
    esac
done
# Fall back to the default when no (or an empty) message was supplied.
MSG="${MSG:-chore: update app}"

# Optional step (--reset-db): remove every vectorstore/ file from the HF Hub
# dataset repo. A missing token or a Hub error only prints a warning.
if $RESET_DB; then
    echo "β–Ά Clearing stale vectorstore from HF Hub dataset..."
    python3 - <<'PYEOF'
import os, sys, re

# Token lookup: the MultiModalRag_Token environment variable wins; otherwise
# use the first line of _secrets/HF_TOKEN.txt that looks like an hf_... token.
token = os.environ.get("MultiModalRag_Token", "").strip()
if not token:
    try:
        with open("_secrets/HF_TOKEN.txt") as f:
            for line in f:
                line = line.strip()
                if re.match(r'^hf_[A-Za-z0-9]+$', line):
                    token = line
                    break
    except (OSError, UnicodeError):
        # Missing or unreadable secrets file: reported by the check below.
        pass
if not token:
    print("⚠  HF token not found β€” skipping DB reset")
    sys.exit(0)
from huggingface_hub import HfApi, CommitOperationDelete
api = HfApi(token=token)
repo = "irajkoohi/MultiModalRag_dataset"
try:
    files = [f for f in api.list_repo_files(repo, repo_type="dataset") if f.startswith("vectorstore/")]
    if files:
        # Delete everything in ONE commit so a failure cannot leave the
        # vectorstore half-deleted on the Hub.
        api.create_commit(
            repo_id=repo,
            repo_type="dataset",
            operations=[CommitOperationDelete(path_in_repo=f) for f in files],
            commit_message="reset vectorstore",
        )
    print(f"βœ…  Cleared {len(files)} vectorstore file(s) from HF Hub dataset")
except Exception as e:
    # Best-effort step: warn and let the rest of the deploy continue.
    print(f"⚠  DB reset failed: {e}")
PYEOF
fi

# Stage changes to files git already tracks (untracked files are left alone).
printf '%s\n' "β–Ά Staging modified files..."
git add -u

# `git diff --cached --quiet` exits 0 when the index matches HEAD, i.e. there
# is nothing staged; any other status means there is something to commit.
if ! git diff --cached --quiet; then
    printf '%s\n' "β–Ά Committing: \"$MSG\""
    git commit -m "$MSG"
else
    printf '%s\n' "βœ… Nothing to commit β€” working tree clean."
fi

# Push runs even when no new commit was created above.
printf '%s\n' "β–Ά Pushing to GitHub (origin)..."
git push origin main

# ── Upload committed binary data files to HF Hub dataset ─────────────────────
# PDFs/DOCX/PNGs are excluded from the Space rsync (no Git LFS support).
# Uploading them here ensures sync_from_hf_hub() can download them on Space startup.
# Best-effort: a missing token, a git failure or a Hub error prints a warning
# and the deploy continues.
echo "β–Ά Syncing data files to HF Hub dataset (upload new + delete removed)..."
python3 - <<'PYEOF'
import os, sys, re, subprocess
from pathlib import Path

# Token lookup: the MultiModalRag_Token environment variable wins; otherwise
# use the first line of _secrets/HF_TOKEN.txt that looks like an hf_... token.
token = os.environ.get("MultiModalRag_Token", "").strip()
if not token:
    try:
        with open("_secrets/HF_TOKEN.txt") as f:
            for line in f:
                line = line.strip()
                if re.match(r'^hf_[A-Za-z0-9]+$', line):
                    token = line
                    break
    except (OSError, UnicodeError):
        # Missing or unreadable secrets file: reported by the check below.
        pass
if not token:
    print("⚠  HF token not found β€” skipping data file sync to HF Hub")
    sys.exit(0)

from huggingface_hub import HfApi, CommitOperationAdd, CommitOperationDelete
api = HfApi(token=token)
repo = "irajkoohi/MultiModalRag_dataset"

# -z prints NUL-separated, unquoted paths. Without it git C-quotes non-ASCII
# or unusual file names, which would then fail the extension check below and
# be treated as removed.
result = subprocess.run(["git", "ls-files", "-z", "data/"], capture_output=True)
if result.returncode != 0:
    # An empty listing caused by a git failure must not be read as
    # "no local files": that would delete every data file on the Hub.
    print(f"⚠  HF Hub data sync failed: git ls-files exited with status {result.returncode}")
    sys.exit(0)
committed = [os.fsdecode(p) for p in result.stdout.split(b"\0") if p]


def is_top_level(path):
    """Return True for paths directly under data/ (no subdirs like images/ or tables/)."""
    return path.startswith("data/") and '/' not in path[len("data/"):]


# Every tracked top-level file, whatever its extension. Used for the stale
# check so that a file which still exists locally is never deleted from the Hub.
tracked_top_level = {f for f in committed if is_top_level(f)}

# Only these extensions are uploaded.
sync_exts = {'.pdf', '.png', '.jpg', '.jpeg', '.docx', '.xlsx', '.txt'}
local_files = [
    f for f in committed
    if f in tracked_top_level and Path(f).suffix.lower() in sync_exts
]

try:
    # Files present on HF Hub dataset under data/ (top-level only)
    hub_data_files = [
        f for f in api.list_repo_files(repo, repo_type="dataset")
        if is_top_level(f)
    ]
    stale_files = [f for f in hub_data_files if f not in tracked_top_level]

    upload_ops = [CommitOperationAdd(path_in_repo=f, path_or_fileobj=f) for f in local_files]
    delete_ops = [CommitOperationDelete(path_in_repo=f) for f in stale_files]

    all_ops = upload_ops + delete_ops
    if not all_ops:
        print("  Data files already in sync β€” nothing to do.")
    else:
        # Uploads and deletions go into a single commit.
        api.create_commit(
            repo_id=repo,
            repo_type="dataset",
            operations=all_ops,
            commit_message="deploy: sync data files",
        )
        if upload_ops:
            print(f"βœ…  Uploaded {len(upload_ops)} file(s): {[Path(f).name for f in local_files]}")
        if delete_ops:
            to_del = [Path(f).name for f in stale_files]
            print(f"πŸ—‘οΈ  Deleted {len(delete_ops)} stale file(s) from HF Hub: {to_del}")
except Exception as e:
    # Covers listing, building the operations and committing, so any Hub
    # problem is a warning rather than an aborted deploy.
    print(f"⚠  HF Hub data sync failed: {e}")
PYEOF

# ── Upload data/tables/ (SQLite DBs) to HF Hub dataset ───────────────────────
# Uploads the local data/tables/ folder to the "tables" path of the dataset
# repo. Skipped when no token is found or the folder is missing/empty.
echo "β–Ά Syncing data/tables/ to HF Hub dataset..."
python3 - <<'PYEOF'
import os, sys, re
from pathlib import Path


def find_token():
    """Return the HF token from the environment or the secrets file, else ""."""
    from_env = os.environ.get("MultiModalRag_Token", "").strip()
    if from_env:
        return from_env
    try:
        with open("_secrets/HF_TOKEN.txt") as secrets_file:
            for raw_line in secrets_file:
                candidate = raw_line.strip()
                # First line that looks like an hf_... token wins.
                if re.match(r'^hf_[A-Za-z0-9]+$', candidate):
                    return candidate
    except Exception:
        pass
    return ""


hf_token = find_token()
if not hf_token:
    print("⚠  HF token not found β€” skipping tables sync to HF Hub")
    sys.exit(0)

tables_path = Path("data/tables")
has_entries = tables_path.exists() and any(tables_path.iterdir())
if not has_entries:
    print("  data/tables/ is empty β€” skipping.")
    sys.exit(0)

# Imported only once we know there is something to upload.
from huggingface_hub import HfApi

client = HfApi(token=hf_token)
upload_args = {
    "folder_path": str(tables_path),
    "path_in_repo": "tables",
    "repo_id": "irajkoohi/MultiModalRag_dataset",
    "repo_type": "dataset",
    "commit_message": "deploy: sync tables",
    "ignore_patterns": ["*.lock", ".DS_Store"],
}
try:
    client.upload_folder(**upload_args)
    print("βœ…  Uploaded data/tables/ to HF Hub dataset")
except Exception as err:
    # Best-effort: report the failure and let the deploy continue.
    print(f"⚠  Tables sync failed: {err}")
PYEOF

# ── HF Space push via a temp directory (never touches working tree) ──────────
# Copies the working tree (minus excluded paths) into a temp dir, commits it
# there as a single-commit repo and force-pushes that to the Space's main.
echo "β–Ά Building clean Space deploy branch (binary files excluded)..."

# Resolve the Space remote first: as a plain assignment a failure here aborts
# the script (set -e) before any snapshot work is done.
_space_url=$(git remote get-url space)

_tmpdir=$(mktemp -d)
# Remove the snapshot on every exit path, including a failed rsync/commit/push.
trap 'rm -rf "$_tmpdir"' EXIT

# Copy entire working tree to temp dir, excluding what doesn't belong on Space
rsync -a --exclude='.git' \
         --exclude='data/*.pdf' \
         --exclude='data/*.png' \
         --exclude='data/*.jpg' \
         --exclude='data/*.jpeg' \
         --exclude='data/*.docx' \
         --exclude='data/*.xlsx' \
         --exclude='data/images/' \
         --exclude='data/tables/' \
         --exclude='vectorstore/' \
         --exclude='vectorstore_corrupted_backup/' \
         --exclude='_secrets/' \
         --exclude='.venv/' \
         --exclude='__pycache__/' \
         --exclude='*.pyc' \
         . "$_tmpdir/"

# Build an orphan git repo in the temp dir and push it
pushd "$_tmpdir" > /dev/null
git init -q
git checkout -b space-deploy
git add -A
git commit -q -m "$MSG [space deploy]"
echo "β–Ά Force-pushing to HuggingFace Space..."
git remote add space "$_space_url"
# --force: the snapshot has no shared history with the Space's main branch.
git push space space-deploy:main --force
popd > /dev/null

# Normal path: clean up now and drop the trap so it does not fire again.
rm -rf "$_tmpdir"
trap - EXIT

# Final summary: a blank line, the success banner, then the three destinations.
printf '%s\n' \
    "" \
    "βœ… Deployed successfully!" \
    "   GitHub : https://github.com/irajkooh/MultiModalRag" \
    "   Space  : https://huggingface.co/spaces/irajkoohi/MultiModalRag" \
    "   Dataset: https://huggingface.co/datasets/irajkoohi/MultiModalRag_dataset"